aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2008-11-19 15:59:28 -0500
committerChris Mason <chris.mason@oracle.com>2008-11-19 15:59:28 -0500
commitae20a6afec1cf21919d97303f2d8b737eac5acc7 (patch)
treea4ddf02d4f19bdee1119dcc8a0f54edb40fb5986
parent9bf1a2445f3c569098b8de7097ca324e65abecc2 (diff)
parent07103a3cdb24099324a11be1f35279b463cdfc31 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
-rw-r--r--fs/Kconfig19
-rw-r--r--fs/Makefile1
-rw-r--r--fs/btrfs/COPYING356
-rw-r--r--fs/btrfs/INSTALL48
-rw-r--r--fs/btrfs/Makefile25
-rw-r--r--fs/btrfs/acl.c352
-rw-r--r--fs/btrfs/async-thread.c419
-rw-r--r--fs/btrfs/async-thread.h101
-rw-r--r--fs/btrfs/btrfs_inode.h133
-rw-r--r--fs/btrfs/compat.h24
-rw-r--r--fs/btrfs/compression.c618
-rw-r--r--fs/btrfs/compression.h47
-rw-r--r--fs/btrfs/crc32c.h120
-rw-r--r--fs/btrfs/ctree.c3892
-rw-r--r--fs/btrfs/ctree.h2043
-rw-r--r--fs/btrfs/dir-item.c386
-rw-r--r--fs/btrfs/disk-io.c2234
-rw-r--r--fs/btrfs/disk-io.h89
-rw-r--r--fs/btrfs/export.c201
-rw-r--r--fs/btrfs/export.h19
-rw-r--r--fs/btrfs/extent-tree.c5918
-rw-r--r--fs/btrfs/extent_io.c3787
-rw-r--r--fs/btrfs/extent_io.h268
-rw-r--r--fs/btrfs/extent_map.c351
-rw-r--r--fs/btrfs/extent_map.h62
-rw-r--r--fs/btrfs/file-item.c586
-rw-r--r--fs/btrfs/file.c1299
-rw-r--r--fs/btrfs/free-space-cache.c489
-rw-r--r--fs/btrfs/hash.h27
-rw-r--r--fs/btrfs/inode-item.c206
-rw-r--r--fs/btrfs/inode-map.c145
-rw-r--r--fs/btrfs/inode.c5022
-rw-r--r--fs/btrfs/ioctl.c1150
-rw-r--r--fs/btrfs/ioctl.h67
-rw-r--r--fs/btrfs/locking.c87
-rw-r--r--fs/btrfs/locking.h27
-rw-r--r--fs/btrfs/ordered-data.c733
-rw-r--r--fs/btrfs/ordered-data.h156
-rw-r--r--fs/btrfs/orphan.c67
-rw-r--r--fs/btrfs/print-tree.c201
-rw-r--r--fs/btrfs/print-tree.h23
-rw-r--r--fs/btrfs/ref-cache.c230
-rw-r--r--fs/btrfs/ref-cache.h77
-rw-r--r--fs/btrfs/root-tree.c367
-rw-r--r--fs/btrfs/struct-funcs.c132
-rw-r--r--fs/btrfs/super.c713
-rw-r--r--fs/btrfs/sysfs.c268
-rw-r--r--fs/btrfs/transaction.c1102
-rw-r--r--fs/btrfs/transaction.h106
-rw-r--r--fs/btrfs/tree-defrag.c146
-rw-r--r--fs/btrfs/tree-log.c2896
-rw-r--r--fs/btrfs/tree-log.h41
-rw-r--r--fs/btrfs/version.h4
-rw-r--r--fs/btrfs/version.sh43
-rw-r--r--fs/btrfs/volumes.c3117
-rw-r--r--fs/btrfs/volumes.h158
-rw-r--r--fs/btrfs/xattr.c321
-rw-r--r--fs/btrfs/xattr.h39
-rw-r--r--fs/btrfs/zlib.c638
59 files changed, 42196 insertions, 0 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 522469a7eca3..5f62e7e0e6cb 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -268,6 +268,25 @@ config OCFS2_COMPAT_JBD
268 is backwards compatible with JBD. It is safe to say N here. 268 is backwards compatible with JBD. It is safe to say N here.
269 However, if you really want to use the original JBD, say Y here. 269 However, if you really want to use the original JBD, say Y here.
270 270
271config BTRFS_FS
272 tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
273 depends on EXPERIMENTAL
274 select LIBCRC32C
275 select ZLIB_INFLATE
276 select ZLIB_DEFLATE
277 help
278 Btrfs is a new filesystem with extents, writable snapshotting,
279 support for multiple devices and many more features.
280
281 Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
282 FINALIZED. You should say N here unless you are interested in
283 testing Btrfs with non-critical data.
284
285 To compile this file system support as a module, choose M here. The
286 module will be called btrfs.
287
288 If unsure, say N.
289
271endif # BLOCK 290endif # BLOCK
272 291
273config DNOTIFY 292config DNOTIFY
diff --git a/fs/Makefile b/fs/Makefile
index d9f8afe6f0c4..5fbb23db281c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -121,4 +121,5 @@ obj-$(CONFIG_HOSTFS) += hostfs/
121obj-$(CONFIG_HPPFS) += hppfs/ 121obj-$(CONFIG_HPPFS) += hppfs/
122obj-$(CONFIG_DEBUG_FS) += debugfs/ 122obj-$(CONFIG_DEBUG_FS) += debugfs/
123obj-$(CONFIG_OCFS2_FS) += ocfs2/ 123obj-$(CONFIG_OCFS2_FS) += ocfs2/
124obj-$(CONFIG_BTRFS_FS) += btrfs/
124obj-$(CONFIG_GFS2_FS) += gfs2/ 125obj-$(CONFIG_GFS2_FS) += gfs2/
diff --git a/fs/btrfs/COPYING b/fs/btrfs/COPYING
new file mode 100644
index 000000000000..ca442d313d86
--- /dev/null
+++ b/fs/btrfs/COPYING
@@ -0,0 +1,356 @@
1
2 NOTE! This copyright does *not* cover user programs that use kernel
3 services by normal system calls - this is merely considered normal use
4 of the kernel, and does *not* fall under the heading of "derived work".
5 Also note that the GPL below is copyrighted by the Free Software
6 Foundation, but the instance of code that it refers to (the Linux
7 kernel) is copyrighted by me and others who actually wrote it.
8
9 Also note that the only valid version of the GPL as far as the kernel
10 is concerned is _this_ particular version of the license (ie v2, not
11 v2.2 or v3.x or whatever), unless explicitly otherwise stated.
12
13 Linus Torvalds
14
15----------------------------------------
16
17 GNU GENERAL PUBLIC LICENSE
18 Version 2, June 1991
19
20 Copyright (C) 1989, 1991 Free Software Foundation, Inc.
21 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 Everyone is permitted to copy and distribute verbatim copies
23 of this license document, but changing it is not allowed.
24
25 Preamble
26
27 The licenses for most software are designed to take away your
28freedom to share and change it. By contrast, the GNU General Public
29License is intended to guarantee your freedom to share and change free
30software--to make sure the software is free for all its users. This
31General Public License applies to most of the Free Software
32Foundation's software and to any other program whose authors commit to
33using it. (Some other Free Software Foundation software is covered by
34the GNU Library General Public License instead.) You can apply it to
35your programs, too.
36
37 When we speak of free software, we are referring to freedom, not
38price. Our General Public Licenses are designed to make sure that you
39have the freedom to distribute copies of free software (and charge for
40this service if you wish), that you receive source code or can get it
41if you want it, that you can change the software or use pieces of it
42in new free programs; and that you know you can do these things.
43
44 To protect your rights, we need to make restrictions that forbid
45anyone to deny you these rights or to ask you to surrender the rights.
46These restrictions translate to certain responsibilities for you if you
47distribute copies of the software, or if you modify it.
48
49 For example, if you distribute copies of such a program, whether
50gratis or for a fee, you must give the recipients all the rights that
51you have. You must make sure that they, too, receive or can get the
52source code. And you must show them these terms so they know their
53rights.
54
55 We protect your rights with two steps: (1) copyright the software, and
56(2) offer you this license which gives you legal permission to copy,
57distribute and/or modify the software.
58
59 Also, for each author's protection and ours, we want to make certain
60that everyone understands that there is no warranty for this free
61software. If the software is modified by someone else and passed on, we
62want its recipients to know that what they have is not the original, so
63that any problems introduced by others will not reflect on the original
64authors' reputations.
65
66 Finally, any free program is threatened constantly by software
67patents. We wish to avoid the danger that redistributors of a free
68program will individually obtain patent licenses, in effect making the
69program proprietary. To prevent this, we have made it clear that any
70patent must be licensed for everyone's free use or not licensed at all.
71
72 The precise terms and conditions for copying, distribution and
73modification follow.
74
75 GNU GENERAL PUBLIC LICENSE
76 TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
77
78 0. This License applies to any program or other work which contains
79a notice placed by the copyright holder saying it may be distributed
80under the terms of this General Public License. The "Program", below,
81refers to any such program or work, and a "work based on the Program"
82means either the Program or any derivative work under copyright law:
83that is to say, a work containing the Program or a portion of it,
84either verbatim or with modifications and/or translated into another
85language. (Hereinafter, translation is included without limitation in
86the term "modification".) Each licensee is addressed as "you".
87
88Activities other than copying, distribution and modification are not
89covered by this License; they are outside its scope. The act of
90running the Program is not restricted, and the output from the Program
91is covered only if its contents constitute a work based on the
92Program (independent of having been made by running the Program).
93Whether that is true depends on what the Program does.
94
95 1. You may copy and distribute verbatim copies of the Program's
96source code as you receive it, in any medium, provided that you
97conspicuously and appropriately publish on each copy an appropriate
98copyright notice and disclaimer of warranty; keep intact all the
99notices that refer to this License and to the absence of any warranty;
100and give any other recipients of the Program a copy of this License
101along with the Program.
102
103You may charge a fee for the physical act of transferring a copy, and
104you may at your option offer warranty protection in exchange for a fee.
105
106 2. You may modify your copy or copies of the Program or any portion
107of it, thus forming a work based on the Program, and copy and
108distribute such modifications or work under the terms of Section 1
109above, provided that you also meet all of these conditions:
110
111 a) You must cause the modified files to carry prominent notices
112 stating that you changed the files and the date of any change.
113
114 b) You must cause any work that you distribute or publish, that in
115 whole or in part contains or is derived from the Program or any
116 part thereof, to be licensed as a whole at no charge to all third
117 parties under the terms of this License.
118
119 c) If the modified program normally reads commands interactively
120 when run, you must cause it, when started running for such
121 interactive use in the most ordinary way, to print or display an
122 announcement including an appropriate copyright notice and a
123 notice that there is no warranty (or else, saying that you provide
124 a warranty) and that users may redistribute the program under
125 these conditions, and telling the user how to view a copy of this
126 License. (Exception: if the Program itself is interactive but
127 does not normally print such an announcement, your work based on
128 the Program is not required to print an announcement.)
129
130These requirements apply to the modified work as a whole. If
131identifiable sections of that work are not derived from the Program,
132and can be reasonably considered independent and separate works in
133themselves, then this License, and its terms, do not apply to those
134sections when you distribute them as separate works. But when you
135distribute the same sections as part of a whole which is a work based
136on the Program, the distribution of the whole must be on the terms of
137this License, whose permissions for other licensees extend to the
138entire whole, and thus to each and every part regardless of who wrote it.
139
140Thus, it is not the intent of this section to claim rights or contest
141your rights to work written entirely by you; rather, the intent is to
142exercise the right to control the distribution of derivative or
143collective works based on the Program.
144
145In addition, mere aggregation of another work not based on the Program
146with the Program (or with a work based on the Program) on a volume of
147a storage or distribution medium does not bring the other work under
148the scope of this License.
149
150 3. You may copy and distribute the Program (or a work based on it,
151under Section 2) in object code or executable form under the terms of
152Sections 1 and 2 above provided that you also do one of the following:
153
154 a) Accompany it with the complete corresponding machine-readable
155 source code, which must be distributed under the terms of Sections
156 1 and 2 above on a medium customarily used for software interchange; or,
157
158 b) Accompany it with a written offer, valid for at least three
159 years, to give any third party, for a charge no more than your
160 cost of physically performing source distribution, a complete
161 machine-readable copy of the corresponding source code, to be
162 distributed under the terms of Sections 1 and 2 above on a medium
163 customarily used for software interchange; or,
164
165 c) Accompany it with the information you received as to the offer
166 to distribute corresponding source code. (This alternative is
167 allowed only for noncommercial distribution and only if you
168 received the program in object code or executable form with such
169 an offer, in accord with Subsection b above.)
170
171The source code for a work means the preferred form of the work for
172making modifications to it. For an executable work, complete source
173code means all the source code for all modules it contains, plus any
174associated interface definition files, plus the scripts used to
175control compilation and installation of the executable. However, as a
176special exception, the source code distributed need not include
177anything that is normally distributed (in either source or binary
178form) with the major components (compiler, kernel, and so on) of the
179operating system on which the executable runs, unless that component
180itself accompanies the executable.
181
182If distribution of executable or object code is made by offering
183access to copy from a designated place, then offering equivalent
184access to copy the source code from the same place counts as
185distribution of the source code, even though third parties are not
186compelled to copy the source along with the object code.
187
188 4. You may not copy, modify, sublicense, or distribute the Program
189except as expressly provided under this License. Any attempt
190otherwise to copy, modify, sublicense or distribute the Program is
191void, and will automatically terminate your rights under this License.
192However, parties who have received copies, or rights, from you under
193this License will not have their licenses terminated so long as such
194parties remain in full compliance.
195
196 5. You are not required to accept this License, since you have not
197signed it. However, nothing else grants you permission to modify or
198distribute the Program or its derivative works. These actions are
199prohibited by law if you do not accept this License. Therefore, by
200modifying or distributing the Program (or any work based on the
201Program), you indicate your acceptance of this License to do so, and
202all its terms and conditions for copying, distributing or modifying
203the Program or works based on it.
204
205 6. Each time you redistribute the Program (or any work based on the
206Program), the recipient automatically receives a license from the
207original licensor to copy, distribute or modify the Program subject to
208these terms and conditions. You may not impose any further
209restrictions on the recipients' exercise of the rights granted herein.
210You are not responsible for enforcing compliance by third parties to
211this License.
212
213 7. If, as a consequence of a court judgment or allegation of patent
214infringement or for any other reason (not limited to patent issues),
215conditions are imposed on you (whether by court order, agreement or
216otherwise) that contradict the conditions of this License, they do not
217excuse you from the conditions of this License. If you cannot
218distribute so as to satisfy simultaneously your obligations under this
219License and any other pertinent obligations, then as a consequence you
220may not distribute the Program at all. For example, if a patent
221license would not permit royalty-free redistribution of the Program by
222all those who receive copies directly or indirectly through you, then
223the only way you could satisfy both it and this License would be to
224refrain entirely from distribution of the Program.
225
226If any portion of this section is held invalid or unenforceable under
227any particular circumstance, the balance of the section is intended to
228apply and the section as a whole is intended to apply in other
229circumstances.
230
231It is not the purpose of this section to induce you to infringe any
232patents or other property right claims or to contest validity of any
233such claims; this section has the sole purpose of protecting the
234integrity of the free software distribution system, which is
235implemented by public license practices. Many people have made
236generous contributions to the wide range of software distributed
237through that system in reliance on consistent application of that
238system; it is up to the author/donor to decide if he or she is willing
239to distribute software through any other system and a licensee cannot
240impose that choice.
241
242This section is intended to make thoroughly clear what is believed to
243be a consequence of the rest of this License.
244
245 8. If the distribution and/or use of the Program is restricted in
246certain countries either by patents or by copyrighted interfaces, the
247original copyright holder who places the Program under this License
248may add an explicit geographical distribution limitation excluding
249those countries, so that distribution is permitted only in or among
250countries not thus excluded. In such case, this License incorporates
251the limitation as if written in the body of this License.
252
253 9. The Free Software Foundation may publish revised and/or new versions
254of the General Public License from time to time. Such new versions will
255be similar in spirit to the present version, but may differ in detail to
256address new problems or concerns.
257
258Each version is given a distinguishing version number. If the Program
259specifies a version number of this License which applies to it and "any
260later version", you have the option of following the terms and conditions
261either of that version or of any later version published by the Free
262Software Foundation. If the Program does not specify a version number of
263this License, you may choose any version ever published by the Free Software
264Foundation.
265
266 10. If you wish to incorporate parts of the Program into other free
267programs whose distribution conditions are different, write to the author
268to ask for permission. For software which is copyrighted by the Free
269Software Foundation, write to the Free Software Foundation; we sometimes
270make exceptions for this. Our decision will be guided by the two goals
271of preserving the free status of all derivatives of our free software and
272of promoting the sharing and reuse of software generally.
273
274 NO WARRANTY
275
276 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
277FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
278OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
279PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
280OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
281MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
282TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
283PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
284REPAIR OR CORRECTION.
285
286 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
287WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
288REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
289INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
290OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
291TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
292YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
293PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
294POSSIBILITY OF SUCH DAMAGES.
295
296 END OF TERMS AND CONDITIONS
297
298 How to Apply These Terms to Your New Programs
299
300 If you develop a new program, and you want it to be of the greatest
301possible use to the public, the best way to achieve this is to make it
302free software which everyone can redistribute and change under these terms.
303
304 To do so, attach the following notices to the program. It is safest
305to attach them to the start of each source file to most effectively
306convey the exclusion of warranty; and each file should have at least
307the "copyright" line and a pointer to where the full notice is found.
308
309 <one line to give the program's name and a brief idea of what it does.>
310 Copyright (C) <year> <name of author>
311
312 This program is free software; you can redistribute it and/or modify
313 it under the terms of the GNU General Public License as published by
314 the Free Software Foundation; either version 2 of the License, or
315 (at your option) any later version.
316
317 This program is distributed in the hope that it will be useful,
318 but WITHOUT ANY WARRANTY; without even the implied warranty of
319 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
320 GNU General Public License for more details.
321
322 You should have received a copy of the GNU General Public License
323 along with this program; if not, write to the Free Software
324 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
325
326
327Also add information on how to contact you by electronic and paper mail.
328
329If the program is interactive, make it output a short notice like this
330when it starts in an interactive mode:
331
332 Gnomovision version 69, Copyright (C) year name of author
333 Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
334 This is free software, and you are welcome to redistribute it
335 under certain conditions; type `show c' for details.
336
337The hypothetical commands `show w' and `show c' should show the appropriate
338parts of the General Public License. Of course, the commands you use may
339be called something other than `show w' and `show c'; they could even be
340mouse-clicks or menu items--whatever suits your program.
341
342You should also get your employer (if you work as a programmer) or your
343school, if any, to sign a "copyright disclaimer" for the program, if
344necessary. Here is a sample; alter the names:
345
346 Yoyodyne, Inc., hereby disclaims all copyright interest in the program
347 `Gnomovision' (which makes passes at compilers) written by James Hacker.
348
349 <signature of Ty Coon>, 1 April 1989
350 Ty Coon, President of Vice
351
352This General Public License does not permit incorporating your program into
353proprietary programs. If your program is a subroutine library, you may
354consider it more useful to permit linking proprietary applications with the
355library. If this is what you want to do, use the GNU Library General
356Public License instead of this License.
diff --git a/fs/btrfs/INSTALL b/fs/btrfs/INSTALL
new file mode 100644
index 000000000000..16b45a56878d
--- /dev/null
+++ b/fs/btrfs/INSTALL
@@ -0,0 +1,48 @@
1Install Instructions
2
3Btrfs puts snapshots and subvolumes into the root directory of the FS. This
4directory can only be changed by btrfsctl right now, and normal filesystem
5operations do not work on it. The default subvolume is called 'default',
6and you can create files and directories in mount_point/default
7
8Btrfs uses libcrc32c in the kernel for file and metadata checksums. You need
9to compile the kernel with:
10
11CONFIG_LIBCRC32C=m
12
13libcrc32c can be static as well. Once your kernel is set up, typing make in the
14btrfs module sources will build against the running kernel. When the build is
15complete:
16
17modprobe libcrc32c
18insmod btrfs.ko
19
20The Btrfs utility programs require libuuid to build. This can be found
21in the e2fsprogs sources, and is usually available as libuuid or
22e2fsprogs-devel from various distros.
23
24Building the utilities is just make ; make install. The programs go
25into /usr/local/bin. The commands available are:
26
27mkfs.btrfs: create a filesystem
28
29btrfsctl: control program to create snapshots and subvolumes:
30
31 mount /dev/sda2 /mnt
32 btrfsctl -s new_subvol_name /mnt
33 btrfsctl -s snapshot_of_default /mnt/default
34 btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name
35 btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol
36 ls /mnt
37 default snapshot_of_a_snapshot snapshot_of_new_subvol
38 new_subvol_name snapshot_of_default
39
40 Snapshots and subvolumes cannot be deleted right now, but you can
41 rm -rf all the files and directories inside them.
42
43btrfsck: do a limited check of the FS extent trees.
44
45debug-tree: print all of the FS metadata in text form. Example:
46
47 debug-tree /dev/sda2 >& big_output_file
48
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 000000000000..d2cf5a54a4b8
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,25 @@
1ifneq ($(KERNELRELEASE),)
2# kbuild part of makefile
3
4obj-$(CONFIG_BTRFS_FS) := btrfs.o
5btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
6 file-item.o inode-item.o inode-map.o disk-io.o \
7 transaction.o inode.o file.o tree-defrag.o \
8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
11 compression.o
12else
13
14# Normal Makefile
15
16KERNELDIR := /lib/modules/`uname -r`/build
17all:
18 $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
19
20modules_install:
21 $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
22clean:
23 $(MAKE) -C $(KERNELDIR) M=`pwd` clean
24
25endif
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
new file mode 100644
index 000000000000..867eaf1f8efb
--- /dev/null
+++ b/fs/btrfs/acl.c
@@ -0,0 +1,352 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/string.h>
21#include <linux/xattr.h>
22#include <linux/posix_acl_xattr.h>
23#include <linux/posix_acl.h>
24#include <linux/sched.h>
25
26#include "ctree.h"
27#include "btrfs_inode.h"
28#include "xattr.h"
29
30#ifdef CONFIG_FS_POSIX_ACL
31
32static void btrfs_update_cached_acl(struct inode *inode,
33 struct posix_acl **p_acl,
34 struct posix_acl *acl)
35{
36 spin_lock(&inode->i_lock);
37 if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED)
38 posix_acl_release(*p_acl);
39 *p_acl = posix_acl_dup(acl);
40 spin_unlock(&inode->i_lock);
41}
42
43static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
44{
45 int size;
46 const char *name;
47 char *value = NULL;
48 struct posix_acl *acl = NULL, **p_acl;
49
50 switch (type) {
51 case ACL_TYPE_ACCESS:
52 name = POSIX_ACL_XATTR_ACCESS;
53 p_acl = &BTRFS_I(inode)->i_acl;
54 break;
55 case ACL_TYPE_DEFAULT:
56 name = POSIX_ACL_XATTR_DEFAULT;
57 p_acl = &BTRFS_I(inode)->i_default_acl;
58 break;
59 default:
60 return ERR_PTR(-EINVAL);
61 }
62
63 spin_lock(&inode->i_lock);
64 if (*p_acl != BTRFS_ACL_NOT_CACHED)
65 acl = posix_acl_dup(*p_acl);
66 spin_unlock(&inode->i_lock);
67
68 if (acl)
69 return acl;
70
71
72 size = __btrfs_getxattr(inode, name, "", 0);
73 if (size > 0) {
74 value = kzalloc(size, GFP_NOFS);
75 if (!value)
76 return ERR_PTR(-ENOMEM);
77 size = __btrfs_getxattr(inode, name, value, size);
78 if (size > 0) {
79 acl = posix_acl_from_xattr(value, size);
80 btrfs_update_cached_acl(inode, p_acl, acl);
81 }
82 kfree(value);
83 } else if (size == -ENOENT) {
84 acl = NULL;
85 btrfs_update_cached_acl(inode, p_acl, acl);
86 }
87
88 return acl;
89}
90
91static int btrfs_xattr_get_acl(struct inode *inode, int type,
92 void *value, size_t size)
93{
94 struct posix_acl *acl;
95 int ret = 0;
96
97 acl = btrfs_get_acl(inode, type);
98
99 if (IS_ERR(acl))
100 return PTR_ERR(acl);
101 if (acl == NULL)
102 return -ENODATA;
103 ret = posix_acl_to_xattr(acl, value, size);
104 posix_acl_release(acl);
105
106 return ret;
107}
108
109/*
110 * Needs to be called with fs_mutex held
111 */
112static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
113{
114 int ret, size = 0;
115 const char *name;
116 struct posix_acl **p_acl;
117 char *value = NULL;
118 mode_t mode;
119
120 if (acl) {
121 ret = posix_acl_valid(acl);
122 if (ret < 0)
123 return ret;
124 ret = 0;
125 }
126
127 switch (type) {
128 case ACL_TYPE_ACCESS:
129 mode = inode->i_mode;
130 ret = posix_acl_equiv_mode(acl, &mode);
131 if (ret < 0)
132 return ret;
133 ret = 0;
134 inode->i_mode = mode;
135 name = POSIX_ACL_XATTR_ACCESS;
136 p_acl = &BTRFS_I(inode)->i_acl;
137 break;
138 case ACL_TYPE_DEFAULT:
139 if (!S_ISDIR(inode->i_mode))
140 return acl ? -EINVAL : 0;
141 name = POSIX_ACL_XATTR_DEFAULT;
142 p_acl = &BTRFS_I(inode)->i_default_acl;
143 break;
144 default:
145 return -EINVAL;
146 }
147
148 if (acl) {
149 size = posix_acl_xattr_size(acl->a_count);
150 value = kmalloc(size, GFP_NOFS);
151 if (!value) {
152 ret = -ENOMEM;
153 goto out;
154 }
155
156 ret = posix_acl_to_xattr(acl, value, size);
157 if (ret < 0)
158 goto out;
159 }
160
161 ret = __btrfs_setxattr(inode, name, value, size, 0);
162
163out:
164 if (value)
165 kfree(value);
166
167 if (!ret)
168 btrfs_update_cached_acl(inode, p_acl, acl);
169
170 return ret;
171}
172
173static int btrfs_xattr_set_acl(struct inode *inode, int type,
174 const void *value, size_t size)
175{
176 int ret = 0;
177 struct posix_acl *acl = NULL;
178
179 if (value) {
180 acl = posix_acl_from_xattr(value, size);
181 if (acl == NULL) {
182 value = NULL;
183 size = 0;
184 } else if (IS_ERR(acl)) {
185 return PTR_ERR(acl);
186 }
187 }
188
189 ret = btrfs_set_acl(inode, acl, type);
190
191 posix_acl_release(acl);
192
193 return ret;
194}
195
196
197static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
198 void *value, size_t size)
199{
200 return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
201}
202
203static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
204 const void *value, size_t size, int flags)
205{
206 return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
207}
208
209static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
210 void *value, size_t size)
211{
212 return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
213}
214
215static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
216 const void *value, size_t size, int flags)
217{
218 return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
219}
220
221int btrfs_check_acl(struct inode *inode, int mask)
222{
223 struct posix_acl *acl;
224 int error = -EAGAIN;
225
226 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
227
228 if (IS_ERR(acl))
229 return PTR_ERR(acl);
230 if (acl) {
231 error = posix_acl_permission(inode, acl, mask);
232 posix_acl_release(acl);
233 }
234
235 return error;
236}
237
238/*
239 * btrfs_init_acl is already generally called under fs_mutex, so the locking
240 * stuff has been fixed to work with that. If the locking stuff changes, we
241 * need to re-evaluate the acl locking stuff.
242 */
243int btrfs_init_acl(struct inode *inode, struct inode *dir)
244{
245 struct posix_acl *acl = NULL;
246 int ret = 0;
247
248 /* this happens with subvols */
249 if (!dir)
250 return 0;
251
252 if (!S_ISLNK(inode->i_mode)) {
253 if (IS_POSIXACL(dir)) {
254 acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
255 if (IS_ERR(acl))
256 return PTR_ERR(acl);
257 }
258
259 if (!acl)
260 inode->i_mode &= ~current->fs->umask;
261 }
262
263 if (IS_POSIXACL(dir) && acl) {
264 struct posix_acl *clone;
265 mode_t mode;
266
267 if (S_ISDIR(inode->i_mode)) {
268 ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
269 if (ret)
270 goto failed;
271 }
272 clone = posix_acl_clone(acl, GFP_NOFS);
273 ret = -ENOMEM;
274 if (!clone)
275 goto failed;
276
277 mode = inode->i_mode;
278 ret = posix_acl_create_masq(clone, &mode);
279 if (ret >= 0) {
280 inode->i_mode = mode;
281 if (ret > 0) {
282 /* we need an acl */
283 ret = btrfs_set_acl(inode, clone,
284 ACL_TYPE_ACCESS);
285 }
286 }
287 }
288failed:
289 posix_acl_release(acl);
290
291 return ret;
292}
293
294int btrfs_acl_chmod(struct inode *inode)
295{
296 struct posix_acl *acl, *clone;
297 int ret = 0;
298
299 if (S_ISLNK(inode->i_mode))
300 return -EOPNOTSUPP;
301
302 if (!IS_POSIXACL(inode))
303 return 0;
304
305 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
306 if (IS_ERR(acl) || !acl)
307 return PTR_ERR(acl);
308
309 clone = posix_acl_clone(acl, GFP_KERNEL);
310 posix_acl_release(acl);
311 if (!clone)
312 return -ENOMEM;
313
314 ret = posix_acl_chmod_masq(clone, inode->i_mode);
315 if (!ret)
316 ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
317
318 posix_acl_release(clone);
319
320 return ret;
321}
322
323struct xattr_handler btrfs_xattr_acl_default_handler = {
324 .prefix = POSIX_ACL_XATTR_DEFAULT,
325 .get = btrfs_xattr_acl_default_get,
326 .set = btrfs_xattr_acl_default_set,
327};
328
329struct xattr_handler btrfs_xattr_acl_access_handler = {
330 .prefix = POSIX_ACL_XATTR_ACCESS,
331 .get = btrfs_xattr_acl_access_get,
332 .set = btrfs_xattr_acl_access_set,
333};
334
335#else /* CONFIG_FS_POSIX_ACL */
336
/* Stub used when CONFIG_FS_POSIX_ACL is off: nothing to update on chmod. */
int btrfs_acl_chmod(struct inode *inode)
{
	(void)inode;

	return 0;
}
341
/* Stub used when CONFIG_FS_POSIX_ACL is off: new inodes get no ACLs. */
int btrfs_init_acl(struct inode *inode, struct inode *dir)
{
	(void)inode;
	(void)dir;

	return 0;
}
346
347int btrfs_check_acl(struct inode *inode, int mask)
348{
349 return 0;
350}
351
352#endif /* CONFIG_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 000000000000..4229450b7596
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,419 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/kthread.h>
21#include <linux/list.h>
22#include <linux/spinlock.h>
23# include <linux/freezer.h>
24#include "async-thread.h"
25
/* bit numbers used in struct btrfs_work->flags */
#define WORK_QUEUED_BIT 0	/* item is sitting on a worker's pending list */
#define WORK_DONE_BIT 1		/* ->func has run (ordered queues only) */
#define WORK_ORDER_DONE_BIT 2	/* ->ordered_func has been claimed */
29
30/*
31 * container for the kthread task pointer and the list of pending work
32 * One of these is allocated per thread.
33 */
34struct btrfs_worker_thread {
35 /* pool we belong to */
36 struct btrfs_workers *workers;
37
38 /* list of struct btrfs_work that are waiting for service */
39 struct list_head pending;
40
41 /* list of worker threads from struct btrfs_workers */
42 struct list_head worker_list;
43
44 /* kthread */
45 struct task_struct *task;
46
47 /* number of things on the pending list */
48 atomic_t num_pending;
49
50 unsigned long sequence;
51
52 /* protects the pending list. */
53 spinlock_t lock;
54
55 /* set to non-zero when this thread is already awake and kicking */
56 int working;
57
58 /* are we currently idle */
59 int idle;
60};
61
62/*
63 * helper function to move a thread onto the idle list after it
64 * has finished some requests.
65 */
66static void check_idle_worker(struct btrfs_worker_thread *worker)
67{
68 if (!worker->idle && atomic_read(&worker->num_pending) <
69 worker->workers->idle_thresh / 2) {
70 unsigned long flags;
71 spin_lock_irqsave(&worker->workers->lock, flags);
72 worker->idle = 1;
73 list_move(&worker->worker_list, &worker->workers->idle_list);
74 spin_unlock_irqrestore(&worker->workers->lock, flags);
75 }
76}
77
78/*
79 * helper function to move a thread off the idle list after new
80 * pending work is added.
81 */
82static void check_busy_worker(struct btrfs_worker_thread *worker)
83{
84 if (worker->idle && atomic_read(&worker->num_pending) >=
85 worker->workers->idle_thresh) {
86 unsigned long flags;
87 spin_lock_irqsave(&worker->workers->lock, flags);
88 worker->idle = 0;
89 list_move_tail(&worker->worker_list,
90 &worker->workers->worker_list);
91 spin_unlock_irqrestore(&worker->workers->lock, flags);
92 }
93}
94
95static noinline int run_ordered_completions(struct btrfs_workers *workers,
96 struct btrfs_work *work)
97{
98 unsigned long flags;
99
100 if (!workers->ordered)
101 return 0;
102
103 set_bit(WORK_DONE_BIT, &work->flags);
104
105 spin_lock_irqsave(&workers->lock, flags);
106
107 while(!list_empty(&workers->order_list)) {
108 work = list_entry(workers->order_list.next,
109 struct btrfs_work, order_list);
110
111 if (!test_bit(WORK_DONE_BIT, &work->flags))
112 break;
113
114 /* we are going to call the ordered done function, but
115 * we leave the work item on the list as a barrier so
116 * that later work items that are done don't have their
117 * functions called before this one returns
118 */
119 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
120 break;
121
122 spin_unlock_irqrestore(&workers->lock, flags);
123
124 work->ordered_func(work);
125
126 /* now take the lock again and call the freeing code */
127 spin_lock_irqsave(&workers->lock, flags);
128 list_del(&work->order_list);
129 work->ordered_free(work);
130 }
131
132 spin_unlock_irqrestore(&workers->lock, flags);
133 return 0;
134}
135
136/*
137 * main loop for servicing work items
138 */
139static int worker_loop(void *arg)
140{
141 struct btrfs_worker_thread *worker = arg;
142 struct list_head *cur;
143 struct btrfs_work *work;
144 do {
145 spin_lock_irq(&worker->lock);
146 while(!list_empty(&worker->pending)) {
147 cur = worker->pending.next;
148 work = list_entry(cur, struct btrfs_work, list);
149 list_del(&work->list);
150 clear_bit(WORK_QUEUED_BIT, &work->flags);
151
152 work->worker = worker;
153 spin_unlock_irq(&worker->lock);
154
155 work->func(work);
156
157 atomic_dec(&worker->num_pending);
158 /*
159 * unless this is an ordered work queue,
160 * 'work' was probably freed by func above.
161 */
162 run_ordered_completions(worker->workers, work);
163
164 spin_lock_irq(&worker->lock);
165 check_idle_worker(worker);
166
167 }
168 worker->working = 0;
169 if (freezing(current)) {
170 refrigerator();
171 } else {
172 set_current_state(TASK_INTERRUPTIBLE);
173 spin_unlock_irq(&worker->lock);
174 if (!kthread_should_stop())
175 schedule();
176 __set_current_state(TASK_RUNNING);
177 }
178 } while (!kthread_should_stop());
179 return 0;
180}
181
182/*
183 * this will wait for all the worker threads to shutdown
184 */
185int btrfs_stop_workers(struct btrfs_workers *workers)
186{
187 struct list_head *cur;
188 struct btrfs_worker_thread *worker;
189
190 list_splice_init(&workers->idle_list, &workers->worker_list);
191 while(!list_empty(&workers->worker_list)) {
192 cur = workers->worker_list.next;
193 worker = list_entry(cur, struct btrfs_worker_thread,
194 worker_list);
195 kthread_stop(worker->task);
196 list_del(&worker->worker_list);
197 kfree(worker);
198 }
199 return 0;
200}
201
202/*
203 * simple init on struct btrfs_workers
204 */
205void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
206{
207 workers->num_workers = 0;
208 INIT_LIST_HEAD(&workers->worker_list);
209 INIT_LIST_HEAD(&workers->idle_list);
210 INIT_LIST_HEAD(&workers->order_list);
211 spin_lock_init(&workers->lock);
212 workers->max_workers = max;
213 workers->idle_thresh = 32;
214 workers->name = name;
215 workers->ordered = 0;
216}
217
218/*
219 * starts new worker threads. This does not enforce the max worker
220 * count in case you need to temporarily go past it.
221 */
222int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
223{
224 struct btrfs_worker_thread *worker;
225 int ret = 0;
226 int i;
227
228 for (i = 0; i < num_workers; i++) {
229 worker = kzalloc(sizeof(*worker), GFP_NOFS);
230 if (!worker) {
231 ret = -ENOMEM;
232 goto fail;
233 }
234
235 INIT_LIST_HEAD(&worker->pending);
236 INIT_LIST_HEAD(&worker->worker_list);
237 spin_lock_init(&worker->lock);
238 atomic_set(&worker->num_pending, 0);
239 worker->task = kthread_run(worker_loop, worker,
240 "btrfs-%s-%d", workers->name,
241 workers->num_workers + i);
242 worker->workers = workers;
243 if (IS_ERR(worker->task)) {
244 kfree(worker);
245 ret = PTR_ERR(worker->task);
246 goto fail;
247 }
248
249 spin_lock_irq(&workers->lock);
250 list_add_tail(&worker->worker_list, &workers->idle_list);
251 worker->idle = 1;
252 workers->num_workers++;
253 spin_unlock_irq(&workers->lock);
254 }
255 return 0;
256fail:
257 btrfs_stop_workers(workers);
258 return ret;
259}
260
261/*
262 * run through the list and find a worker thread that doesn't have a lot
263 * to do right now. This can return null if we aren't yet at the thread
264 * count limit and all of the threads are busy.
265 */
266static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
267{
268 struct btrfs_worker_thread *worker;
269 struct list_head *next;
270 int enforce_min = workers->num_workers < workers->max_workers;
271
272 /*
273 * if we find an idle thread, don't move it to the end of the
274 * idle list. This improves the chance that the next submission
275 * will reuse the same thread, and maybe catch it while it is still
276 * working
277 */
278 if (!list_empty(&workers->idle_list)) {
279 next = workers->idle_list.next;
280 worker = list_entry(next, struct btrfs_worker_thread,
281 worker_list);
282 return worker;
283 }
284 if (enforce_min || list_empty(&workers->worker_list))
285 return NULL;
286
287 /*
288 * if we pick a busy task, move the task to the end of the list.
289 * hopefully this will keep things somewhat evenly balanced.
290 * Do the move in batches based on the sequence number. This groups
291 * requests submitted at roughly the same time onto the same worker.
292 */
293 next = workers->worker_list.next;
294 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
295 atomic_inc(&worker->num_pending);
296 worker->sequence++;
297
298 if (worker->sequence % workers->idle_thresh == 0)
299 list_move_tail(next, &workers->worker_list);
300 return worker;
301}
302
303/*
304 * selects a worker thread to take the next job. This will either find
305 * an idle worker, start a new worker up to the max count, or just return
306 * one of the existing busy workers.
307 */
308static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
309{
310 struct btrfs_worker_thread *worker;
311 unsigned long flags;
312
313again:
314 spin_lock_irqsave(&workers->lock, flags);
315 worker = next_worker(workers);
316 spin_unlock_irqrestore(&workers->lock, flags);
317
318 if (!worker) {
319 spin_lock_irqsave(&workers->lock, flags);
320 if (workers->num_workers >= workers->max_workers) {
321 struct list_head *fallback = NULL;
322 /*
323 * we have failed to find any workers, just
324 * return the force one
325 */
326 if (!list_empty(&workers->worker_list))
327 fallback = workers->worker_list.next;
328 if (!list_empty(&workers->idle_list))
329 fallback = workers->idle_list.next;
330 BUG_ON(!fallback);
331 worker = list_entry(fallback,
332 struct btrfs_worker_thread, worker_list);
333 spin_unlock_irqrestore(&workers->lock, flags);
334 } else {
335 spin_unlock_irqrestore(&workers->lock, flags);
336 /* we're below the limit, start another worker */
337 btrfs_start_workers(workers, 1);
338 goto again;
339 }
340 }
341 return worker;
342}
343
344/*
345 * btrfs_requeue_work just puts the work item back on the tail of the list
346 * it was taken from. It is intended for use with long running work functions
347 * that make some progress and want to give the cpu up for others.
348 */
349int btrfs_requeue_work(struct btrfs_work *work)
350{
351 struct btrfs_worker_thread *worker = work->worker;
352 unsigned long flags;
353
354 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
355 goto out;
356
357 spin_lock_irqsave(&worker->lock, flags);
358 atomic_inc(&worker->num_pending);
359 list_add_tail(&work->list, &worker->pending);
360
361 /* by definition we're busy, take ourselves off the idle
362 * list
363 */
364 if (worker->idle) {
365 spin_lock_irqsave(&worker->workers->lock, flags);
366 worker->idle = 0;
367 list_move_tail(&worker->worker_list,
368 &worker->workers->worker_list);
369 spin_unlock_irqrestore(&worker->workers->lock, flags);
370 }
371
372 spin_unlock_irqrestore(&worker->lock, flags);
373
374out:
375 return 0;
376}
377
378/*
379 * places a struct btrfs_work into the pending queue of one of the kthreads
380 */
381int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
382{
383 struct btrfs_worker_thread *worker;
384 unsigned long flags;
385 int wake = 0;
386
387 /* don't requeue something already on a list */
388 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
389 goto out;
390
391 worker = find_worker(workers);
392 if (workers->ordered) {
393 spin_lock_irqsave(&workers->lock, flags);
394 list_add_tail(&work->order_list, &workers->order_list);
395 spin_unlock_irqrestore(&workers->lock, flags);
396 } else {
397 INIT_LIST_HEAD(&work->order_list);
398 }
399
400 spin_lock_irqsave(&worker->lock, flags);
401 atomic_inc(&worker->num_pending);
402 check_busy_worker(worker);
403 list_add_tail(&work->list, &worker->pending);
404
405 /*
406 * avoid calling into wake_up_process if this thread has already
407 * been kicked
408 */
409 if (!worker->working)
410 wake = 1;
411 worker->working = 1;
412
413 spin_unlock_irqrestore(&worker->lock, flags);
414
415 if (wake)
416 wake_up_process(worker->task);
417out:
418 return 0;
419}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 000000000000..31be4ed8b63e
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,101 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_ASYNC_THREAD_
20#define __BTRFS_ASYNC_THREAD_
21
22struct btrfs_worker_thread;
23
24/*
25 * This is similar to a workqueue, but it is meant to spread the operations
26 * across all available cpus instead of just the CPU that was used to
27 * queue the work. There is also some batching introduced to try and
28 * cut down on context switches.
29 *
30 * By default threads are added on demand up to 2 * the number of cpus.
31 * Changing struct btrfs_workers->max_workers is one way to prevent
32 * demand creation of kthreads.
33 *
34 * the basic model of these worker threads is to embed a btrfs_work
35 * structure in your own data struct, and use container_of in a
36 * work function to get back to your data struct.
37 */
38struct btrfs_work {
39 /*
40 * func should be set to the function you want called
41 * your work struct is passed as the only arg
42 *
43 * ordered_func must be set for work sent to an ordered work queue,
44 * and it is called to complete a given work item in the same
45 * order they were sent to the queue.
46 */
47 void (*func)(struct btrfs_work *work);
48 void (*ordered_func)(struct btrfs_work *work);
49 void (*ordered_free)(struct btrfs_work *work);
50
51 /*
52 * flags should be set to zero. It is used to make sure the
53 * struct is only inserted once into the list.
54 */
55 unsigned long flags;
56
57 /* don't touch these */
58 struct btrfs_worker_thread *worker;
59 struct list_head list;
60 struct list_head order_list;
61};
62
63struct btrfs_workers {
64 /* current number of running workers */
65 int num_workers;
66
67 /* max number of workers allowed. changed by btrfs_start_workers */
68 int max_workers;
69
70 /* once a worker has this many requests or fewer, it is idle */
71 int idle_thresh;
72
73 /* force completions in the order they were queued */
74 int ordered;
75
76 /* list with all the work threads. The workers on the idle thread
77 * may be actively servicing jobs, but they haven't yet hit the
78 * idle thresh limit above.
79 */
80 struct list_head worker_list;
81 struct list_head idle_list;
82
83 /*
84 * when operating in ordered mode, this maintains the list
85 * of work items waiting for completion
86 */
87 struct list_head order_list;
88
89 /* lock for finding the next worker thread to queue on */
90 spinlock_t lock;
91
92 /* extra name for this worker, used for current->name */
93 char *name;
94};
95
/* queue a work item on one of the pool's threads */
int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
/* start num_workers more threads, without enforcing max_workers */
int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
/* stop and free all of the pool's threads */
int btrfs_stop_workers(struct btrfs_workers *workers);
/* initialise an empty pool; no threads are started */
void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
/* put a work item back on the tail of the queue of the thread that ran it */
int btrfs_requeue_work(struct btrfs_work *work);
101#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 000000000000..0b2e623cf421
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,133 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_I__
20#define __BTRFS_I__
21
22#include "extent_map.h"
23#include "extent_io.h"
24#include "ordered-data.h"
25
26/* in memory btrfs inode */
27struct btrfs_inode {
28 /* which subvolume this inode belongs to */
29 struct btrfs_root *root;
30
31 /* the block group preferred for allocations. This pointer is buggy
32 * and needs to be replaced with a bytenr instead
33 */
34 struct btrfs_block_group_cache *block_group;
35
36 /* key used to find this inode on disk. This is used by the code
37 * to read in roots of subvolumes
38 */
39 struct btrfs_key location;
40
41 /* the extent_tree has caches of all the extent mappings to disk */
42 struct extent_map_tree extent_tree;
43
44 /* the io_tree does range state (DIRTY, LOCKED etc) */
45 struct extent_io_tree io_tree;
46
47 /* special utility tree used to record which mirrors have already been
48 * tried when checksums fail for a given block
49 */
50 struct extent_io_tree io_failure_tree;
51
52 /* held while inserting checksums to avoid races */
53 struct mutex csum_mutex;
54
55 /* held while inesrting or deleting extents from files */
56 struct mutex extent_mutex;
57
58 /* held while logging the inode in tree-log.c */
59 struct mutex log_mutex;
60
61 /* used to order data wrt metadata */
62 struct btrfs_ordered_inode_tree ordered_tree;
63
64 /* standard acl pointers */
65 struct posix_acl *i_acl;
66 struct posix_acl *i_default_acl;
67
68 /* for keeping track of orphaned inodes */
69 struct list_head i_orphan;
70
71 /* list of all the delalloc inodes in the FS. There are times we need
72 * to write all the delalloc pages to disk, and this list is used
73 * to walk them all.
74 */
75 struct list_head delalloc_inodes;
76
77 /* full 64 bit generation number, struct vfs_inode doesn't have a big
78 * enough field for this.
79 */
80 u64 generation;
81
82 /*
83 * transid of the trans_handle that last modified this inode
84 */
85 u64 last_trans;
86 /*
87 * transid that last logged this inode
88 */
89 u64 logged_trans;
90
91 /*
92 * trans that last made a change that should be fully fsync'd. This
93 * gets reset to zero each time the inode is logged
94 */
95 u64 log_dirty_trans;
96
97 /* total number of bytes pending delalloc, used by stat to calc the
98 * real block usage of the file
99 */
100 u64 delalloc_bytes;
101
102 /*
103 * the size of the file stored in the metadata on disk. data=ordered
104 * means the in-memory i_size might be larger than the size on disk
105 * because not all the blocks are written yet.
106 */
107 u64 disk_i_size;
108
109 /* flags field from the on disk inode */
110 u32 flags;
111
112 /*
113 * if this is a directory then index_cnt is the counter for the index
114 * number for new files that are created
115 */
116 u64 index_cnt;
117
118 struct inode vfs_inode;
119};
120
121static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
122{
123 return container_of(inode, struct btrfs_inode, vfs_inode);
124}
125
126static inline void btrfs_i_size_write(struct inode *inode, u64 size)
127{
128 inode->i_size = size;
129 BTRFS_I(inode)->disk_i_size = size;
130}
131
132
133#endif
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
new file mode 100644
index 000000000000..cd6598b169df
--- /dev/null
+++ b/fs/btrfs/compat.h
@@ -0,0 +1,24 @@
1#ifndef _COMPAT_H_
2#define _COMPAT_H_
3
/* thin aliases for the VFS link-count helpers */
#define btrfs_drop_nlink(inode) drop_nlink(inode)
#define btrfs_inc_nlink(inode) inc_nlink(inode)
6
7#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27)
8static inline struct dentry *d_obtain_alias(struct inode *inode)
9{
10 struct dentry *d;
11
12 if (!inode)
13 return NULL;
14 if (IS_ERR(inode))
15 return ERR_CAST(inode);
16
17 d = d_alloc_anon(inode);
18 if (!d)
19 iput(inode);
20 return d;
21}
22#endif
23
24#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 000000000000..bfd1512cce0a
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,618 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h>
35#include <linux/version.h>
36#include <linux/pagevec.h>
37#include "ctree.h"
38#include "disk-io.h"
39#include "transaction.h"
40#include "btrfs_inode.h"
41#include "volumes.h"
42#include "ordered-data.h"
43#include "compat.h"
44#include "compression.h"
45#include "extent_io.h"
46#include "extent_map.h"
47
48struct compressed_bio {
49 /* number of bios pending for this compressed extent */
50 atomic_t pending_bios;
51
52 /* the pages with the compressed data on them */
53 struct page **compressed_pages;
54
55 /* inode that owns this data */
56 struct inode *inode;
57
58 /* starting offset in the inode for our pages */
59 u64 start;
60
61 /* number of bytes in the inode we're working on */
62 unsigned long len;
63
64 /* number of bytes on disk */
65 unsigned long compressed_len;
66
67 /* number of compressed pages in the array */
68 unsigned long nr_pages;
69
70 /* IO errors */
71 int errors;
72
73 /* for reads, this is the bio we are copying the data into */
74 struct bio *orig_bio;
75};
76
77static struct bio *compressed_bio_alloc(struct block_device *bdev,
78 u64 first_byte, gfp_t gfp_flags)
79{
80 struct bio *bio;
81 int nr_vecs;
82
83 nr_vecs = bio_get_nr_vecs(bdev);
84 bio = bio_alloc(gfp_flags, nr_vecs);
85
86 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
87 while (!bio && (nr_vecs /= 2))
88 bio = bio_alloc(gfp_flags, nr_vecs);
89 }
90
91 if (bio) {
92 bio->bi_size = 0;
93 bio->bi_bdev = bdev;
94 bio->bi_sector = first_byte >> 9;
95 }
96 return bio;
97}
98
99/* when we finish reading compressed pages from the disk, we
100 * decompress them and then run the bio end_io routines on the
101 * decompressed pages (in the inode address space).
102 *
103 * This allows the checksumming and other IO error handling routines
104 * to work normally
105 *
106 * The compressed pages are freed here, and it must be run
107 * in process context
108 */
109static void end_compressed_bio_read(struct bio *bio, int err)
110{
111 struct extent_io_tree *tree;
112 struct compressed_bio *cb = bio->bi_private;
113 struct inode *inode;
114 struct page *page;
115 unsigned long index;
116 int ret;
117
118 if (err)
119 cb->errors = 1;
120
121 /* if there are more bios still pending for this compressed
122 * extent, just exit
123 */
124 if (!atomic_dec_and_test(&cb->pending_bios))
125 goto out;
126
127 /* ok, we're the last bio for this extent, lets start
128 * the decompression.
129 */
130 inode = cb->inode;
131 tree = &BTRFS_I(inode)->io_tree;
132 ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
133 cb->start,
134 cb->orig_bio->bi_io_vec,
135 cb->orig_bio->bi_vcnt,
136 cb->compressed_len);
137 if (ret)
138 cb->errors = 1;
139
140 /* release the compressed pages */
141 index = 0;
142 for (index = 0; index < cb->nr_pages; index++) {
143 page = cb->compressed_pages[index];
144 page->mapping = NULL;
145 page_cache_release(page);
146 }
147
148 /* do io completion on the original bio */
149 if (cb->errors) {
150 bio_io_error(cb->orig_bio);
151 } else
152 bio_endio(cb->orig_bio, 0);
153
154 /* finally free the cb struct */
155 kfree(cb->compressed_pages);
156 kfree(cb);
157out:
158 bio_put(bio);
159}
160
161/*
162 * Clear the writeback bits on all of the file
163 * pages for a compressed write
164 */
165static noinline int end_compressed_writeback(struct inode *inode, u64 start,
166 unsigned long ram_size)
167{
168 unsigned long index = start >> PAGE_CACHE_SHIFT;
169 unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
170 struct page *pages[16];
171 unsigned long nr_pages = end_index - index + 1;
172 int i;
173 int ret;
174
175 while(nr_pages > 0) {
176 ret = find_get_pages_contig(inode->i_mapping, index,
177 min_t(unsigned long,
178 nr_pages, ARRAY_SIZE(pages)), pages);
179 if (ret == 0) {
180 nr_pages -= 1;
181 index += 1;
182 continue;
183 }
184 for (i = 0; i < ret; i++) {
185 end_page_writeback(pages[i]);
186 page_cache_release(pages[i]);
187 }
188 nr_pages -= ret;
189 index += ret;
190 }
191 /* the inode may be gone now */
192 return 0;
193}
194
195/*
196 * do the cleanup once all the compressed pages hit the disk.
197 * This will clear writeback on the file pages and free the compressed
198 * pages.
199 *
200 * This also calls the writeback end hooks for the file pages so that
201 * metadata and checksums can be updated in the file.
202 */
203static void end_compressed_bio_write(struct bio *bio, int err)
204{
205 struct extent_io_tree *tree;
206 struct compressed_bio *cb = bio->bi_private;
207 struct inode *inode;
208 struct page *page;
209 unsigned long index;
210
211 if (err)
212 cb->errors = 1;
213
214 /* if there are more bios still pending for this compressed
215 * extent, just exit
216 */
217 if (!atomic_dec_and_test(&cb->pending_bios))
218 goto out;
219
220 /* ok, we're the last bio for this extent, step one is to
221 * call back into the FS and do all the end_io operations
222 */
223 inode = cb->inode;
224 tree = &BTRFS_I(inode)->io_tree;
225 cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
226 tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
227 cb->start,
228 cb->start + cb->len - 1,
229 NULL, 1);
230 cb->compressed_pages[0]->mapping = NULL;
231
232 end_compressed_writeback(inode, cb->start, cb->len);
233 /* note, our inode could be gone now */
234
235 /*
236 * release the compressed pages, these came from alloc_page and
237 * are not attached to the inode at all
238 */
239 index = 0;
240 for (index = 0; index < cb->nr_pages; index++) {
241 page = cb->compressed_pages[index];
242 page->mapping = NULL;
243 page_cache_release(page);
244 }
245
246 /* finally free the cb struct */
247 kfree(cb->compressed_pages);
248 kfree(cb);
249out:
250 bio_put(bio);
251}
252
253/*
254 * worker function to build and submit bios for previously compressed pages.
255 * The corresponding pages in the inode should be marked for writeback
256 * and the compressed pages should have a reference on them for dropping
257 * when the IO is complete.
258 *
259 * This also checksums the file bytes and gets things ready for
260 * the end io hooks.
261 */
262int btrfs_submit_compressed_write(struct inode *inode, u64 start,
263 unsigned long len, u64 disk_start,
264 unsigned long compressed_len,
265 struct page **compressed_pages,
266 unsigned long nr_pages)
267{
268 struct bio *bio = NULL;
269 struct btrfs_root *root = BTRFS_I(inode)->root;
270 struct compressed_bio *cb;
271 unsigned long bytes_left;
272 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
273 int page_index = 0;
274 struct page *page;
275 u64 first_byte = disk_start;
276 struct block_device *bdev;
277 int ret;
278
279 WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
280 cb = kmalloc(sizeof(*cb), GFP_NOFS);
281 atomic_set(&cb->pending_bios, 0);
282 cb->errors = 0;
283 cb->inode = inode;
284 cb->start = start;
285 cb->len = len;
286 cb->compressed_pages = compressed_pages;
287 cb->compressed_len = compressed_len;
288 cb->orig_bio = NULL;
289 cb->nr_pages = nr_pages;
290
291 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
292
293 ret = btrfs_csum_file_bytes(root, inode, start, len);
294 BUG_ON(ret);
295
296 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
297 bio->bi_private = cb;
298 bio->bi_end_io = end_compressed_bio_write;
299 atomic_inc(&cb->pending_bios);
300
301 /* create and submit bios for the compressed pages */
302 bytes_left = compressed_len;
303 for (page_index = 0; page_index < cb->nr_pages; page_index++) {
304 page = compressed_pages[page_index];
305 page->mapping = inode->i_mapping;
306 if (bio->bi_size)
307 ret = io_tree->ops->merge_bio_hook(page, 0,
308 PAGE_CACHE_SIZE,
309 bio, 0);
310 else
311 ret = 0;
312
313 page->mapping = NULL;
314 if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
315 PAGE_CACHE_SIZE) {
316 bio_get(bio);
317
318 /*
319 * inc the count before we submit the bio so
320 * we know the end IO handler won't happen before
321 * we inc the count. Otherwise, the cb might get
322 * freed before we're done setting it up
323 */
324 atomic_inc(&cb->pending_bios);
325 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
326 BUG_ON(ret);
327
328 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
329 BUG_ON(ret);
330
331 bio_put(bio);
332
333 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
334 bio->bi_private = cb;
335 bio->bi_end_io = end_compressed_bio_write;
336 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
337 }
338 if (bytes_left < PAGE_CACHE_SIZE) {
339 printk("bytes left %lu compress len %lu nr %lu\n",
340 bytes_left, cb->compressed_len, cb->nr_pages);
341 }
342 bytes_left -= PAGE_CACHE_SIZE;
343 first_byte += PAGE_CACHE_SIZE;
344 cond_resched();
345 }
346 bio_get(bio);
347
348 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
349 BUG_ON(ret);
350
351 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
352 BUG_ON(ret);
353
354 bio_put(bio);
355 return 0;
356}
357
358static noinline int add_ra_bio_pages(struct inode *inode,
359 u64 compressed_end,
360 struct compressed_bio *cb)
361{
362 unsigned long end_index;
363 unsigned long page_index;
364 u64 last_offset;
365 u64 isize = i_size_read(inode);
366 int ret;
367 struct page *page;
368 unsigned long nr_pages = 0;
369 struct extent_map *em;
370 struct address_space *mapping = inode->i_mapping;
371 struct pagevec pvec;
372 struct extent_map_tree *em_tree;
373 struct extent_io_tree *tree;
374 u64 end;
375 int misses = 0;
376
377 page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
378 last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
379 em_tree = &BTRFS_I(inode)->extent_tree;
380 tree = &BTRFS_I(inode)->io_tree;
381
382 if (isize == 0)
383 return 0;
384
385 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
386
387 pagevec_init(&pvec, 0);
388 while(last_offset < compressed_end) {
389 page_index = last_offset >> PAGE_CACHE_SHIFT;
390
391 if (page_index > end_index)
392 break;
393
394 rcu_read_lock();
395 page = radix_tree_lookup(&mapping->page_tree, page_index);
396 rcu_read_unlock();
397 if (page) {
398 misses++;
399 if (misses > 4)
400 break;
401 goto next;
402 }
403
404 page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
405 if (!page)
406 break;
407
408 page->index = page_index;
409 /*
410 * what we want to do here is call add_to_page_cache_lru,
411 * but that isn't exported, so we reproduce it here
412 */
413 if (add_to_page_cache(page, mapping,
414 page->index, GFP_NOFS)) {
415 page_cache_release(page);
416 goto next;
417 }
418
419 /* open coding of lru_cache_add, also not exported */
420 page_cache_get(page);
421 if (!pagevec_add(&pvec, page))
422 __pagevec_lru_add(&pvec);
423
424 end = last_offset + PAGE_CACHE_SIZE - 1;
425 /*
426 * at this point, we have a locked page in the page cache
427 * for these bytes in the file. But, we have to make
428 * sure they map to this compressed extent on disk.
429 */
430 set_page_extent_mapped(page);
431 lock_extent(tree, last_offset, end, GFP_NOFS);
432 spin_lock(&em_tree->lock);
433 em = lookup_extent_mapping(em_tree, last_offset,
434 PAGE_CACHE_SIZE);
435 spin_unlock(&em_tree->lock);
436
437 if (!em || last_offset < em->start ||
438 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
439 (em->block_start >> 9) != cb->orig_bio->bi_sector) {
440 free_extent_map(em);
441 unlock_extent(tree, last_offset, end, GFP_NOFS);
442 unlock_page(page);
443 page_cache_release(page);
444 break;
445 }
446 free_extent_map(em);
447
448 if (page->index == end_index) {
449 char *userpage;
450 size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
451
452 if (zero_offset) {
453 int zeros;
454 zeros = PAGE_CACHE_SIZE - zero_offset;
455 userpage = kmap_atomic(page, KM_USER0);
456 memset(userpage + zero_offset, 0, zeros);
457 flush_dcache_page(page);
458 kunmap_atomic(userpage, KM_USER0);
459 }
460 }
461
462 ret = bio_add_page(cb->orig_bio, page,
463 PAGE_CACHE_SIZE, 0);
464
465 if (ret == PAGE_CACHE_SIZE) {
466 nr_pages++;
467 page_cache_release(page);
468 } else {
469 unlock_extent(tree, last_offset, end, GFP_NOFS);
470 unlock_page(page);
471 page_cache_release(page);
472 break;
473 }
474next:
475 last_offset += PAGE_CACHE_SIZE;
476 }
477 if (pagevec_count(&pvec))
478 __pagevec_lru_add(&pvec);
479 return 0;
480}
481
482/*
483 * for a compressed read, the bio we get passed has all the inode pages
484 * in it. We don't actually do IO on those pages but allocate new ones
485 * to hold the compressed pages on disk.
486 *
487 * bio->bi_sector points to the compressed extent on disk
488 * bio->bi_io_vec points to all of the inode pages
489 * bio->bi_vcnt is a count of pages
490 *
491 * After the compressed pages are read, we copy the bytes into the
492 * bio we were passed and then call the bio end_io calls
493 */
494int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
495 int mirror_num, unsigned long bio_flags)
496{
497 struct extent_io_tree *tree;
498 struct extent_map_tree *em_tree;
499 struct compressed_bio *cb;
500 struct btrfs_root *root = BTRFS_I(inode)->root;
501 unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
502 unsigned long compressed_len;
503 unsigned long nr_pages;
504 unsigned long page_index;
505 struct page *page;
506 struct block_device *bdev;
507 struct bio *comp_bio;
508 u64 cur_disk_byte = (u64)bio->bi_sector << 9;
509 u64 em_len;
510 u64 em_start;
511 struct extent_map *em;
512 int ret;
513
514 tree = &BTRFS_I(inode)->io_tree;
515 em_tree = &BTRFS_I(inode)->extent_tree;
516
517 /* we need the actual starting offset of this extent in the file */
518 spin_lock(&em_tree->lock);
519 em = lookup_extent_mapping(em_tree,
520 page_offset(bio->bi_io_vec->bv_page),
521 PAGE_CACHE_SIZE);
522 spin_unlock(&em_tree->lock);
523
524 cb = kmalloc(sizeof(*cb), GFP_NOFS);
525 atomic_set(&cb->pending_bios, 0);
526 cb->errors = 0;
527 cb->inode = inode;
528
529 cb->start = em->orig_start;
530 compressed_len = em->block_len;
531 em_len = em->len;
532 em_start = em->start;
533 free_extent_map(em);
534 em = NULL;
535
536 cb->len = uncompressed_len;
537 cb->compressed_len = compressed_len;
538 cb->orig_bio = bio;
539
540 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
541 PAGE_CACHE_SIZE;
542 cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
543 GFP_NOFS);
544 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
545
546 for (page_index = 0; page_index < nr_pages; page_index++) {
547 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
548 __GFP_HIGHMEM);
549 }
550 cb->nr_pages = nr_pages;
551
552 add_ra_bio_pages(inode, em_start + em_len, cb);
553
554 if (!btrfs_test_opt(root, NODATASUM) &&
555 !btrfs_test_flag(inode, NODATASUM)) {
556 btrfs_lookup_bio_sums(root, inode, cb->orig_bio);
557 }
558
559 /* include any pages we added in add_ra-bio_pages */
560 uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
561 cb->len = uncompressed_len;
562
563 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
564 comp_bio->bi_private = cb;
565 comp_bio->bi_end_io = end_compressed_bio_read;
566 atomic_inc(&cb->pending_bios);
567
568 for (page_index = 0; page_index < nr_pages; page_index++) {
569 page = cb->compressed_pages[page_index];
570 page->mapping = inode->i_mapping;
571 if (comp_bio->bi_size)
572 ret = tree->ops->merge_bio_hook(page, 0,
573 PAGE_CACHE_SIZE,
574 comp_bio, 0);
575 else
576 ret = 0;
577
578 page->mapping = NULL;
579 if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
580 PAGE_CACHE_SIZE) {
581 bio_get(comp_bio);
582
583 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
584 BUG_ON(ret);
585
586 /*
587 * inc the count before we submit the bio so
588 * we know the end IO handler won't happen before
589 * we inc the count. Otherwise, the cb might get
590 * freed before we're done setting it up
591 */
592 atomic_inc(&cb->pending_bios);
593
594 ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
595 BUG_ON(ret);
596
597 bio_put(comp_bio);
598
599 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
600 GFP_NOFS);
601 comp_bio->bi_private = cb;
602 comp_bio->bi_end_io = end_compressed_bio_read;
603
604 bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
605 }
606 cur_disk_byte += PAGE_CACHE_SIZE;
607 }
608 bio_get(comp_bio);
609
610 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
611 BUG_ON(ret);
612
613 ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
614 BUG_ON(ret);
615
616 bio_put(comp_bio);
617 return 0;
618}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 000000000000..421f5b4aa715
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_COMPRESSION_
20#define __BTRFS_COMPRESSION_
21
22int btrfs_zlib_decompress(unsigned char *data_in,
23 struct page *dest_page,
24 unsigned long start_byte,
25 size_t srclen, size_t destlen);
26int btrfs_zlib_compress_pages(struct address_space *mapping,
27 u64 start, unsigned long len,
28 struct page **pages,
29 unsigned long nr_dest_pages,
30 unsigned long *out_pages,
31 unsigned long *total_in,
32 unsigned long *total_out,
33 unsigned long max_out);
34int btrfs_zlib_decompress_biovec(struct page **pages_in,
35 u64 disk_start,
36 struct bio_vec *bvec,
37 int vcnt,
38 size_t srclen);
39void btrfs_zlib_exit(void);
40int btrfs_submit_compressed_write(struct inode *inode, u64 start,
41 unsigned long len, u64 disk_start,
42 unsigned long compressed_len,
43 struct page **compressed_pages,
44 unsigned long nr_pages);
45int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
46 int mirror_num, unsigned long bio_flags);
47#endif
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
new file mode 100644
index 000000000000..1eaf11d334fd
--- /dev/null
+++ b/fs/btrfs/crc32c.h
@@ -0,0 +1,120 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_CRC32C__
20#define __BTRFS_CRC32C__
21#include <asm/byteorder.h>
22#include <linux/crc32c.h>
23#include <linux/version.h>
24
25/* #define CONFIG_BTRFS_HW_SUM 1 */
26
27#ifdef CONFIG_BTRFS_HW_SUM
28#ifdef CONFIG_X86
29/*
30 * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
31 * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
32 * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
33 * http://www.intel.com/products/processor/manuals/
34 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
35 * Volume 2A: Instruction Set Reference, A-M
36 */
37
38#include <asm/cpufeature.h>
39#include <asm/processor.h>
40
41#define X86_FEATURE_XMM4_2 (4*32+20) /* Streaming SIMD Extensions-4.2 */
42#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
43
44#ifdef CONFIG_X86_64
45#define REX_PRE "0x48, "
46#define SCALE_F 8
47#else
48#define REX_PRE
49#define SCALE_F 4
50#endif
51
/*
 * Fold 'length' bytes starting at 'data' into 'crc', one byte per
 * iteration, and return the updated crc.  Used for the tail bytes that
 * btrfs_crc32c_le_hw() cannot consume as whole words.
 */
52static inline u32 btrfs_crc32c_le_hw_byte(u32 crc, unsigned char const *data,
53 size_t length)
54{
55 while (length--) {
/*
 * The instruction is emitted as raw opcode bytes rather than a mnemonic
 * (presumably so assemblers without SSE4.2 support still build this).
 * The running crc is tied to the same register for input and output
 * ("=S" / "0"); the data byte is passed via the "c" constraint.
 */
56 __asm__ __volatile__(
57 ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
58 :"=S"(crc)
59 :"0"(crc), "c"(*data)
60 );
61 data++;
62 }
63
64 return crc;
65}
66
/*
 * Fold 'len' bytes at 'p' into 'crc' using the hardware CRC32 instruction
 * and return the updated crc.  The buffer is consumed SCALE_F bytes at a
 * time (8 with CONFIG_X86_64, 4 otherwise); the remaining len % SCALE_F
 * bytes are handed to btrfs_crc32c_le_hw_byte().
 *
 * NOTE(review): 'p' is read through a u64/u32 pointer without any
 * alignment check -- confirm callers are fine with unaligned loads.
 */
67static inline u32 __pure btrfs_crc32c_le_hw(u32 crc, unsigned char const *p,
68 size_t len)
69{
70 unsigned int iquotient = len / SCALE_F;
71 unsigned int iremainder = len % SCALE_F;
72#ifdef CONFIG_X86_64
73 u64 *ptmp = (u64 *)p;
74#else
75 u32 *ptmp = (u32 *)p;
76#endif
77
78 while (iquotient--) {
/*
 * Raw opcode bytes again; REX_PRE expands to the "0x48, " prefix byte on
 * CONFIG_X86_64 and to nothing on 32-bit, selecting the operand width
 * that matches the ptmp element type.
 */
79 __asm__ __volatile__(
80 ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
81 :"=S"(crc)
82 :"0"(crc), "c"(*ptmp)
83 );
84 ptmp++;
85 }
86
/* ptmp now points at the first byte not covered by a whole word */
87 if (iremainder)
88 crc = btrfs_crc32c_le_hw_byte(crc, (unsigned char *)ptmp,
89 iremainder);
90
91 return crc;
92}
93#endif /* CONFIG_X86 */
94
95static inline u32 __btrfs_crc32c(u32 crc, unsigned char const *address,
96 size_t len)
97{
98#ifdef CONFIG_BTRFS_HW_SUM
99 if (cpu_has_xmm4_2)
100 return btrfs_crc32c_le_hw(crc, address, len);
101#endif
102 return crc32c_le(crc, address, len);
103}
104
105#else
106
107#define __btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
108
109#endif /* CONFIG_BTRFS_HW_SUM */
110
111/**
112 * The implementation of crc32c_le() changed in linux-2.6.23;
113 * as of v0.13, btrfs-progs uses the latest version.
114 * We must work around the older implementations of crc32c_le()
115 * found on older kernel versions.
116 */
116 */
117#define btrfs_crc32c(seed, data, length) \
118 __btrfs_crc32c(seed, (unsigned char const *)data, length)
119#endif
120
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
new file mode 100644
index 000000000000..71ef0a2e2daa
--- /dev/null
+++ b/fs/btrfs/ctree.c
@@ -0,0 +1,3892 @@
1/*
2 * Copyright (C) 2007,2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "transaction.h"
23#include "print-tree.h"
24#include "locking.h"
25
26static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
27 *root, struct btrfs_path *path, int level);
28static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29 *root, struct btrfs_key *ins_key,
30 struct btrfs_path *path, int data_size, int extend);
31static int push_node_left(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, struct extent_buffer *dst,
33 struct extent_buffer *src, int empty);
34static int balance_node_right(struct btrfs_trans_handle *trans,
35 struct btrfs_root *root,
36 struct extent_buffer *dst_buf,
37 struct extent_buffer *src_buf);
38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39 struct btrfs_path *path, int level, int slot);
40
41inline void btrfs_init_path(struct btrfs_path *p)
42{
43 memset(p, 0, sizeof(*p));
44}
45
46struct btrfs_path *btrfs_alloc_path(void)
47{
48 struct btrfs_path *path;
49 path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
50 if (path) {
51 btrfs_init_path(path);
52 path->reada = 1;
53 }
54 return path;
55}
56
57/* this also releases the path */
58void btrfs_free_path(struct btrfs_path *p)
59{
60 btrfs_release_path(NULL, p);
61 kmem_cache_free(btrfs_path_cachep, p);
62}
63
64/*
65 * path release drops references on the extent buffers in the path
66 * and it drops any locks held by this path
67 *
68 * It is safe to call this on paths that no locks or extent buffers held.
69 */
70void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
71{
72 int i;
73
74 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
75 p->slots[i] = 0;
76 if (!p->nodes[i])
77 continue;
78 if (p->locks[i]) {
79 btrfs_tree_unlock(p->nodes[i]);
80 p->locks[i] = 0;
81 }
82 free_extent_buffer(p->nodes[i]);
83 p->nodes[i] = NULL;
84 }
85}
86
87/*
88 * safely gets a reference on the root node of a tree. A lock
89 * is not taken, so a concurrent writer may put a different node
90 * at the root of the tree. See btrfs_lock_root_node for the
91 * looping required.
92 *
93 * The extent buffer returned by this has a reference taken, so
94 * it won't disappear. It may stop being the root of the tree
95 * at any time because there are no locks held.
96 */
97struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
98{
99 struct extent_buffer *eb;
100 spin_lock(&root->node_lock);
101 eb = root->node;
102 extent_buffer_get(eb);
103 spin_unlock(&root->node_lock);
104 return eb;
105}
106
107/* loop around taking references on and locking the root node of the
108 * tree until you end up with a lock on the root. A locked buffer
109 * is returned, with a reference held.
110 */
111struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
112{
113 struct extent_buffer *eb;
114
115 while(1) {
116 eb = btrfs_root_node(root);
117 btrfs_tree_lock(eb);
118
119 spin_lock(&root->node_lock);
120 if (eb == root->node) {
121 spin_unlock(&root->node_lock);
122 break;
123 }
124 spin_unlock(&root->node_lock);
125
126 btrfs_tree_unlock(eb);
127 free_extent_buffer(eb);
128 }
129 return eb;
130}
131
132/* cowonly root (everything not a reference counted cow subvolume), just get
133 * put onto a simple dirty list. transaction.c walks this to make sure they
134 * get properly updated on disk.
135 */
136static void add_root_to_dirty_list(struct btrfs_root *root)
137{
138 if (root->track_dirty && list_empty(&root->dirty_list)) {
139 list_add(&root->dirty_list,
140 &root->fs_info->dirty_cowonly_roots);
141 }
142}
143
144/*
145 * used by snapshot creation to make a copy of a root for a tree with
146 * a given objectid. The buffer with the new root node is returned in
147 * cow_ret, and this func returns zero on success or a negative error code.
148 */
149int btrfs_copy_root(struct btrfs_trans_handle *trans,
150 struct btrfs_root *root,
151 struct extent_buffer *buf,
152 struct extent_buffer **cow_ret, u64 new_root_objectid)
153{
154 struct extent_buffer *cow;
155 u32 nritems;
156 int ret = 0;
157 int level;
158 struct btrfs_root *new_root;
159
160 new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
161 if (!new_root)
162 return -ENOMEM;
163
164 memcpy(new_root, root, sizeof(*new_root));
165 new_root->root_key.objectid = new_root_objectid;
166
167 WARN_ON(root->ref_cows && trans->transid !=
168 root->fs_info->running_transaction->transid);
169 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
170
171 level = btrfs_header_level(buf);
172 nritems = btrfs_header_nritems(buf);
173
174 cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
175 new_root_objectid, trans->transid,
176 level, buf->start, 0);
177 if (IS_ERR(cow)) {
178 kfree(new_root);
179 return PTR_ERR(cow);
180 }
181
182 copy_extent_buffer(cow, buf, 0, 0, cow->len);
183 btrfs_set_header_bytenr(cow, cow->start);
184 btrfs_set_header_generation(cow, trans->transid);
185 btrfs_set_header_owner(cow, new_root_objectid);
186 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
187
188 write_extent_buffer(cow, root->fs_info->fsid,
189 (unsigned long)btrfs_header_fsid(cow),
190 BTRFS_FSID_SIZE);
191
192 WARN_ON(btrfs_header_generation(buf) > trans->transid);
193 ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
194 kfree(new_root);
195
196 if (ret)
197 return ret;
198
199 btrfs_mark_buffer_dirty(cow);
200 *cow_ret = cow;
201 return 0;
202}
203
204/*
205 * does the dirty work in cow of a single block. The parent block
206 * (if supplied) is updated to point to the new cow copy. The new
207 * buffer is marked dirty and returned locked. If you modify the block
208 * it needs to be marked dirty again.
209 *
210 * search_start -- an allocation hint for the new block
211 *
212 * empty_size -- a hint that you plan on doing more cow. This is the size in bytes
213 * the allocator should try to find free next to the block it returns. This is
214 * just a hint and may be ignored by the allocator.
215 *
216 * prealloc_dest -- if you have already reserved a destination for the cow,
217 * this uses that block instead of allocating a new one. btrfs_alloc_reserved_extent
218 * is used to finish the allocation.
219 */
220int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
221 struct btrfs_root *root,
222 struct extent_buffer *buf,
223 struct extent_buffer *parent, int parent_slot,
224 struct extent_buffer **cow_ret,
225 u64 search_start, u64 empty_size,
226 u64 prealloc_dest)
227{
228 u64 parent_start;
229 struct extent_buffer *cow;
230 u32 nritems;
231 int ret = 0;
232 int level;
233 int unlock_orig = 0;
234
/*
 * when the caller passes the block being cowed in through *cow_ret, the
 * original buffer is unlocked on the way out (see the end of the function)
 */
235 if (*cow_ret == buf)
236 unlock_orig = 1;
237
238 WARN_ON(!btrfs_tree_locked(buf));
239
/* a parent start of zero is used when there is no parent block */
240 if (parent)
241 parent_start = parent->start;
242 else
243 parent_start = 0;
244
245 WARN_ON(root->ref_cows && trans->transid !=
246 root->fs_info->running_transaction->transid);
247 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
248
249 level = btrfs_header_level(buf);
250 nritems = btrfs_header_nritems(buf);
251
/*
 * get the destination block: either finish the allocation of the extent
 * the caller reserved at prealloc_dest, or allocate a fresh block using
 * search_start / empty_size as hints
 */
252 if (prealloc_dest) {
253 struct btrfs_key ins;
254
255 ins.objectid = prealloc_dest;
256 ins.offset = buf->len;
257 ins.type = BTRFS_EXTENT_ITEM_KEY;
258
259 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
260 root->root_key.objectid,
261 trans->transid, level, &ins);
262 BUG_ON(ret);
263 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
264 buf->len);
265 } else {
266 cow = btrfs_alloc_free_block(trans, root, buf->len,
267 parent_start,
268 root->root_key.objectid,
269 trans->transid, level,
270 search_start, empty_size);
271 }
272 if (IS_ERR(cow))
273 return PTR_ERR(cow);
274
/* copy the whole block, then rewrite the header fields for the new copy */
275 copy_extent_buffer(cow, buf, 0, 0, cow->len);
276 btrfs_set_header_bytenr(cow, cow->start);
277 btrfs_set_header_generation(cow, trans->transid);
278 btrfs_set_header_owner(cow, root->root_key.objectid);
279 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
280
281 write_extent_buffer(cow, root->fs_info->fsid,
282 (unsigned long)btrfs_header_fsid(cow),
283 BTRFS_FSID_SIZE);
284
/*
 * reference handling depends on where buf came from:
 *  - buf's generation differs from this transaction: btrfs_inc_ref on
 *    behalf of the copy, and btrfs_cache_ref records nr_extents
 *  - buf is from this transaction and owned by a reloc tree: look up its
 *    reference count and either move the refs over (count == 1) or add
 *    new ones
 *  - otherwise: btrfs_update_ref moves the refs to the copy and the old
 *    block is cleaned
 *
 * NOTE(review): the early "return ret" paths below leave without
 * releasing 'cow' -- confirm whether callers or transaction abort are
 * expected to deal with that.
 */
285 WARN_ON(btrfs_header_generation(buf) > trans->transid);
286 if (btrfs_header_generation(buf) != trans->transid) {
287 u32 nr_extents;
288 ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
289 if (ret)
290 return ret;
291
292 ret = btrfs_cache_ref(trans, root, buf, nr_extents);
293 WARN_ON(ret);
294 } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
295 /*
296 * There are only two places that can drop reference to
297 * tree blocks owned by living reloc trees, one is here,
298 * the other place is btrfs_drop_subtree. In both places,
299 * we check reference count while tree block is locked.
300 * Furthermore, if reference count is one, it won't get
301 * increased by someone else.
302 */
303 u32 refs;
304 ret = btrfs_lookup_extent_ref(trans, root, buf->start,
305 buf->len, &refs);
306 BUG_ON(ret);
307 if (refs == 1) {
308 ret = btrfs_update_ref(trans, root, buf, cow,
309 0, nritems);
310 clean_tree_block(trans, root, buf);
311 } else {
312 ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
313 }
314 BUG_ON(ret);
315 } else {
316 ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
317 if (ret)
318 return ret;
319 clean_tree_block(trans, root, buf);
320 }
321
322 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
323 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
324 WARN_ON(ret);
325 }
326
/*
 * hook the copy into the tree: either it becomes the new root node
 * (swapped in under node_lock, with an extra reference taken for
 * root->node), or the parent's pointer and generation for parent_slot
 * are updated.  In both cases the old block's extent is freed, except
 * for a root block that is still the commit root.
 */
327 if (buf == root->node) {
328 WARN_ON(parent && parent != buf);
329
330 spin_lock(&root->node_lock);
331 root->node = cow;
332 extent_buffer_get(cow);
333 spin_unlock(&root->node_lock);
334
335 if (buf != root->commit_root) {
336 btrfs_free_extent(trans, root, buf->start,
337 buf->len, buf->start,
338 root->root_key.objectid,
339 btrfs_header_generation(buf),
340 level, 1);
341 }
/* drop the reference that was held on behalf of root->node */
342 free_extent_buffer(buf);
343 add_root_to_dirty_list(root);
344 } else {
345 btrfs_set_node_blockptr(parent, parent_slot,
346 cow->start);
347 WARN_ON(trans->transid == 0);
348 btrfs_set_node_ptr_generation(parent, parent_slot,
349 trans->transid);
350 btrfs_mark_buffer_dirty(parent);
351 WARN_ON(btrfs_header_generation(parent) != trans->transid);
352 btrfs_free_extent(trans, root, buf->start, buf->len,
353 parent_start, btrfs_header_owner(parent),
354 btrfs_header_generation(parent), level, 1);
355 }
/* release the original block and hand the dirty copy back to the caller */
356 if (unlock_orig)
357 btrfs_tree_unlock(buf);
358 free_extent_buffer(buf);
359 btrfs_mark_buffer_dirty(cow);
360 *cow_ret = cow;
361 return 0;
362}
363
364/*
365 * cows a single block, see __btrfs_cow_block for the real work.
366 * This version of it has extra checks so that a block isn't cow'd more than
367 * once per transaction, as long as it hasn't been written yet
368 */
369int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
370 struct btrfs_root *root, struct extent_buffer *buf,
371 struct extent_buffer *parent, int parent_slot,
372 struct extent_buffer **cow_ret, u64 prealloc_dest)
373{
374 u64 search_start;
375 int ret;
376
377 if (trans->transaction != root->fs_info->running_transaction) {
378 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
379 root->fs_info->running_transaction->transid);
380 WARN_ON(1);
381 }
382 if (trans->transid != root->fs_info->generation) {
383 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
384 root->fs_info->generation);
385 WARN_ON(1);
386 }
387
388 spin_lock(&root->fs_info->hash_lock);
389 if (btrfs_header_generation(buf) == trans->transid &&
390 btrfs_header_owner(buf) == root->root_key.objectid &&
391 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
392 *cow_ret = buf;
393 spin_unlock(&root->fs_info->hash_lock);
394 WARN_ON(prealloc_dest);
395 return 0;
396 }
397 spin_unlock(&root->fs_info->hash_lock);
398 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
399 ret = __btrfs_cow_block(trans, root, buf, parent,
400 parent_slot, cow_ret, search_start, 0,
401 prealloc_dest);
402 return ret;
403}
404
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by.
 *
 * Returns 1 when the gap between the end of the lower block and the
 * start of the higher one is under 32768 bytes, 0 otherwise (including
 * when both block numbers are equal).  The gap is computed in unsigned
 * 64 bit arithmetic.
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	const u64 max_gap = 32768;
	u64 gap;

	if (blocknr == other)
		return 0;

	if (blocknr < other)
		gap = other - (blocknr + blocksize);
	else
		gap = blocknr - (other + blocksize);

	return gap < max_gap ? 1 : 0;
}
417
418/*
419 * compare two keys in a memcmp fashion
420 */
421static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
422{
423 struct btrfs_key k1;
424
425 btrfs_disk_key_to_cpu(&k1, disk);
426
427 if (k1.objectid > k2->objectid)
428 return 1;
429 if (k1.objectid < k2->objectid)
430 return -1;
431 if (k1.type > k2->type)
432 return 1;
433 if (k1.type < k2->type)
434 return -1;
435 if (k1.offset > k2->offset)
436 return 1;
437 if (k1.offset < k2->offset)
438 return -1;
439 return 0;
440}
441
442/*
443 * same as comp_keys only with two btrfs_key's
444 */
445static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
446{
447 if (k1->objectid > k2->objectid)
448 return 1;
449 if (k1->objectid < k2->objectid)
450 return -1;
451 if (k1->type > k2->type)
452 return 1;
453 if (k1->type < k2->type)
454 return -1;
455 if (k1->offset > k2->offset)
456 return 1;
457 if (k1->offset < k2->offset)
458 return -1;
459 return 0;
460}
461
462/*
463 * this is used by the defrag code to go through all the
464 * leaves pointed to by a node and reallocate them so that
465 * disk order is close to key order
466 */
467int btrfs_realloc_node(struct btrfs_trans_handle *trans,
468 struct btrfs_root *root, struct extent_buffer *parent,
469 int start_slot, int cache_only, u64 *last_ret,
470 struct btrfs_key *progress)
471{
472 struct extent_buffer *cur;
473 u64 blocknr;
474 u64 gen;
475 u64 search_start = *last_ret;
476 u64 last_block = 0;
477 u64 other;
478 u32 parent_nritems;
479 int end_slot;
480 int i;
481 int err = 0;
482 int parent_level;
483 int uptodate;
484 u32 blocksize;
485 int progress_passed = 0;
486 struct btrfs_disk_key disk_key;
487
488 parent_level = btrfs_header_level(parent);
489 if (cache_only && parent_level != 1)
490 return 0;
491
492 if (trans->transaction != root->fs_info->running_transaction) {
493 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
494 root->fs_info->running_transaction->transid);
495 WARN_ON(1);
496 }
497 if (trans->transid != root->fs_info->generation) {
498 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
499 root->fs_info->generation);
500 WARN_ON(1);
501 }
502
503 parent_nritems = btrfs_header_nritems(parent);
504 blocksize = btrfs_level_size(root, parent_level - 1);
505 end_slot = parent_nritems;
506
507 if (parent_nritems == 1)
508 return 0;
509
510 for (i = start_slot; i < end_slot; i++) {
511 int close = 1;
512
513 if (!parent->map_token) {
514 map_extent_buffer(parent,
515 btrfs_node_key_ptr_offset(i),
516 sizeof(struct btrfs_key_ptr),
517 &parent->map_token, &parent->kaddr,
518 &parent->map_start, &parent->map_len,
519 KM_USER1);
520 }
521 btrfs_node_key(parent, &disk_key, i);
522 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
523 continue;
524
525 progress_passed = 1;
526 blocknr = btrfs_node_blockptr(parent, i);
527 gen = btrfs_node_ptr_generation(parent, i);
528 if (last_block == 0)
529 last_block = blocknr;
530
531 if (i > 0) {
532 other = btrfs_node_blockptr(parent, i - 1);
533 close = close_blocks(blocknr, other, blocksize);
534 }
535 if (!close && i < end_slot - 2) {
536 other = btrfs_node_blockptr(parent, i + 1);
537 close = close_blocks(blocknr, other, blocksize);
538 }
539 if (close) {
540 last_block = blocknr;
541 continue;
542 }
543 if (parent->map_token) {
544 unmap_extent_buffer(parent, parent->map_token,
545 KM_USER1);
546 parent->map_token = NULL;
547 }
548
549 cur = btrfs_find_tree_block(root, blocknr, blocksize);
550 if (cur)
551 uptodate = btrfs_buffer_uptodate(cur, gen);
552 else
553 uptodate = 0;
554 if (!cur || !uptodate) {
555 if (cache_only) {
556 free_extent_buffer(cur);
557 continue;
558 }
559 if (!cur) {
560 cur = read_tree_block(root, blocknr,
561 blocksize, gen);
562 } else if (!uptodate) {
563 btrfs_read_buffer(cur, gen);
564 }
565 }
566 if (search_start == 0)
567 search_start = last_block;
568
569 btrfs_tree_lock(cur);
570 err = __btrfs_cow_block(trans, root, cur, parent, i,
571 &cur, search_start,
572 min(16 * blocksize,
573 (end_slot - i) * blocksize), 0);
574 if (err) {
575 btrfs_tree_unlock(cur);
576 free_extent_buffer(cur);
577 break;
578 }
579 search_start = cur->start;
580 last_block = cur->start;
581 *last_ret = search_start;
582 btrfs_tree_unlock(cur);
583 free_extent_buffer(cur);
584 }
585 if (parent->map_token) {
586 unmap_extent_buffer(parent, parent->map_token,
587 KM_USER1);
588 parent->map_token = NULL;
589 }
590 return err;
591}
592
593/*
594 * The leaf data grows from end-to-front in the node.
595 * this returns the address of the start of the last item,
596 * which is the stop of the leaf data stack
597 */
598static inline unsigned int leaf_data_end(struct btrfs_root *root,
599 struct extent_buffer *leaf)
600{
601 u32 nr = btrfs_header_nritems(leaf);
602 if (nr == 0)
603 return BTRFS_LEAF_DATA_SIZE(root);
604 return btrfs_item_offset_nr(leaf, nr - 1);
605}
606
607/*
608 * extra debugging checks to make sure all the items in a key are
609 * well formed and in the proper order
610 */
611static int check_node(struct btrfs_root *root, struct btrfs_path *path,
612 int level)
613{
614 struct extent_buffer *parent = NULL;
615 struct extent_buffer *node = path->nodes[level];
616 struct btrfs_disk_key parent_key;
617 struct btrfs_disk_key node_key;
618 int parent_slot;
619 int slot;
620 struct btrfs_key cpukey;
621 u32 nritems = btrfs_header_nritems(node);
622
623 if (path->nodes[level + 1])
624 parent = path->nodes[level + 1];
625
626 slot = path->slots[level];
627 BUG_ON(nritems == 0);
628 if (parent) {
629 parent_slot = path->slots[level + 1];
630 btrfs_node_key(parent, &parent_key, parent_slot);
631 btrfs_node_key(node, &node_key, 0);
632 BUG_ON(memcmp(&parent_key, &node_key,
633 sizeof(struct btrfs_disk_key)));
634 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
635 btrfs_header_bytenr(node));
636 }
637 BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
638 if (slot != 0) {
639 btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
640 btrfs_node_key(node, &node_key, slot);
641 BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
642 }
643 if (slot < nritems - 1) {
644 btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
645 btrfs_node_key(node, &node_key, slot);
646 BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
647 }
648 return 0;
649}
650
651/*
652 * extra checking to make sure all the items in a leaf are
653 * well formed and in the proper order
654 */
655static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
656 int level)
657{
658 struct extent_buffer *leaf = path->nodes[level];
659 struct extent_buffer *parent = NULL;
660 int parent_slot;
661 struct btrfs_key cpukey;
662 struct btrfs_disk_key parent_key;
663 struct btrfs_disk_key leaf_key;
664 int slot = path->slots[0];
665
666 u32 nritems = btrfs_header_nritems(leaf);
667
668 if (path->nodes[level + 1])
669 parent = path->nodes[level + 1];
670
671 if (nritems == 0)
672 return 0;
673
674 if (parent) {
675 parent_slot = path->slots[level + 1];
676 btrfs_node_key(parent, &parent_key, parent_slot);
677 btrfs_item_key(leaf, &leaf_key, 0);
678
679 BUG_ON(memcmp(&parent_key, &leaf_key,
680 sizeof(struct btrfs_disk_key)));
681 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
682 btrfs_header_bytenr(leaf));
683 }
684#if 0
685 for (i = 0; nritems > 1 && i < nritems - 2; i++) {
686 btrfs_item_key_to_cpu(leaf, &cpukey, i + 1);
687 btrfs_item_key(leaf, &leaf_key, i);
688 if (comp_keys(&leaf_key, &cpukey) >= 0) {
689 btrfs_print_leaf(root, leaf);
690 printk("slot %d offset bad key\n", i);
691 BUG_ON(1);
692 }
693 if (btrfs_item_offset_nr(leaf, i) !=
694 btrfs_item_end_nr(leaf, i + 1)) {
695 btrfs_print_leaf(root, leaf);
696 printk("slot %d offset bad\n", i);
697 BUG_ON(1);
698 }
699 if (i == 0) {
700 if (btrfs_item_offset_nr(leaf, i) +
701 btrfs_item_size_nr(leaf, i) !=
702 BTRFS_LEAF_DATA_SIZE(root)) {
703 btrfs_print_leaf(root, leaf);
704 printk("slot %d first offset bad\n", i);
705 BUG_ON(1);
706 }
707 }
708 }
709 if (nritems > 0) {
710 if (btrfs_item_size_nr(leaf, nritems - 1) > 4096) {
711 btrfs_print_leaf(root, leaf);
712 printk("slot %d bad size \n", nritems - 1);
713 BUG_ON(1);
714 }
715 }
716#endif
717 if (slot != 0 && slot < nritems - 1) {
718 btrfs_item_key(leaf, &leaf_key, slot);
719 btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
720 if (comp_keys(&leaf_key, &cpukey) <= 0) {
721 btrfs_print_leaf(root, leaf);
722 printk("slot %d offset bad key\n", slot);
723 BUG_ON(1);
724 }
725 if (btrfs_item_offset_nr(leaf, slot - 1) !=
726 btrfs_item_end_nr(leaf, slot)) {
727 btrfs_print_leaf(root, leaf);
728 printk("slot %d offset bad\n", slot);
729 BUG_ON(1);
730 }
731 }
732 if (slot < nritems - 1) {
733 btrfs_item_key(leaf, &leaf_key, slot);
734 btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
735 BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
736 if (btrfs_item_offset_nr(leaf, slot) !=
737 btrfs_item_end_nr(leaf, slot + 1)) {
738 btrfs_print_leaf(root, leaf);
739 printk("slot %d offset bad\n", slot);
740 BUG_ON(1);
741 }
742 }
743 BUG_ON(btrfs_item_offset_nr(leaf, 0) +
744 btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
745 return 0;
746}
747
748static int noinline check_block(struct btrfs_root *root,
749 struct btrfs_path *path, int level)
750{
751 u64 found_start;
752 return 0;
753 if (btrfs_header_level(path->nodes[level]) != level)
754 printk("warning: bad level %Lu wanted %d found %d\n",
755 path->nodes[level]->start, level,
756 btrfs_header_level(path->nodes[level]));
757 found_start = btrfs_header_bytenr(path->nodes[level]);
758 if (found_start != path->nodes[level]->start) {
759 printk("warning: bad bytentr %Lu found %Lu\n",
760 path->nodes[level]->start, found_start);
761 }
762#if 0
763 struct extent_buffer *buf = path->nodes[level];
764
765 if (memcmp_extent_buffer(buf, root->fs_info->fsid,
766 (unsigned long)btrfs_header_fsid(buf),
767 BTRFS_FSID_SIZE)) {
768 printk("warning bad block %Lu\n", buf->start);
769 return 1;
770 }
771#endif
772 if (level == 0)
773 return check_leaf(root, path, level);
774 return check_node(root, path, level);
775}
776
777/*
778 * search for key in the extent_buffer. The items start at offset p,
779 * and they are item_size apart. There are 'max' items in p.
780 *
781 * the slot in the array is returned via slot, and it points to
782 * the place where you would insert key if it is not found in
783 * the array.
784 *
785 * slot may point to max if the key is bigger than all of the keys
786 */
787static noinline int generic_bin_search(struct extent_buffer *eb,
788 unsigned long p,
789 int item_size, struct btrfs_key *key,
790 int max, int *slot)
791{
792 int low = 0;
793 int high = max;
794 int mid;
795 int ret;
796 struct btrfs_disk_key *tmp = NULL;
797 struct btrfs_disk_key unaligned;
798 unsigned long offset;
799 char *map_token = NULL;
800 char *kaddr = NULL;
801 unsigned long map_start = 0;
802 unsigned long map_len = 0;
803 int err;
804
805 while(low < high) {
806 mid = (low + high) / 2;
807 offset = p + mid * item_size;
808
809 if (!map_token || offset < map_start ||
810 (offset + sizeof(struct btrfs_disk_key)) >
811 map_start + map_len) {
812 if (map_token) {
813 unmap_extent_buffer(eb, map_token, KM_USER0);
814 map_token = NULL;
815 }
816 err = map_extent_buffer(eb, offset,
817 sizeof(struct btrfs_disk_key),
818 &map_token, &kaddr,
819 &map_start, &map_len, KM_USER0);
820
821 if (!err) {
822 tmp = (struct btrfs_disk_key *)(kaddr + offset -
823 map_start);
824 } else {
825 read_extent_buffer(eb, &unaligned,
826 offset, sizeof(unaligned));
827 tmp = &unaligned;
828 }
829
830 } else {
831 tmp = (struct btrfs_disk_key *)(kaddr + offset -
832 map_start);
833 }
834 ret = comp_keys(tmp, key);
835
836 if (ret < 0)
837 low = mid + 1;
838 else if (ret > 0)
839 high = mid;
840 else {
841 *slot = mid;
842 if (map_token)
843 unmap_extent_buffer(eb, map_token, KM_USER0);
844 return 0;
845 }
846 }
847 *slot = low;
848 if (map_token)
849 unmap_extent_buffer(eb, map_token, KM_USER0);
850 return 1;
851}
852
853/*
854 * simple bin_search frontend that does the right thing for
855 * leaves vs nodes
856 */
857static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
858 int level, int *slot)
859{
860 if (level == 0) {
861 return generic_bin_search(eb,
862 offsetof(struct btrfs_leaf, items),
863 sizeof(struct btrfs_item),
864 key, btrfs_header_nritems(eb),
865 slot);
866 } else {
867 return generic_bin_search(eb,
868 offsetof(struct btrfs_node, ptrs),
869 sizeof(struct btrfs_key_ptr),
870 key, btrfs_header_nritems(eb),
871 slot);
872 }
873 return -1;
874}
875
876/* given a node and slot number, this reads the blocks it points to. The
877 * extent buffer is returned with a reference taken (but unlocked).
878 * NULL is returned on error.
879 */
880static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
881 struct extent_buffer *parent, int slot)
882{
883 int level = btrfs_header_level(parent);
884 if (slot < 0)
885 return NULL;
886 if (slot >= btrfs_header_nritems(parent))
887 return NULL;
888
889 BUG_ON(level == 0);
890
891 return read_tree_block(root, btrfs_node_blockptr(parent, slot),
892 btrfs_level_size(root, level - 1),
893 btrfs_node_ptr_generation(parent, slot));
894}
895
896/*
897 * node level balancing, used to make sure nodes are in proper order for
898 * item deletion. We balance from the top down, so we have to make sure
899 * that a deletion won't leave an node completely empty later on.
900 */
901static noinline int balance_level(struct btrfs_trans_handle *trans,
902 struct btrfs_root *root,
903 struct btrfs_path *path, int level)
904{
905 struct extent_buffer *right = NULL;
906 struct extent_buffer *mid;
907 struct extent_buffer *left = NULL;
908 struct extent_buffer *parent = NULL;
909 int ret = 0;
910 int wret;
911 int pslot;
912 int orig_slot = path->slots[level];
913 int err_on_enospc = 0;
914 u64 orig_ptr;
915
916 if (level == 0)
917 return 0;
918
919 mid = path->nodes[level];
920 WARN_ON(!path->locks[level]);
921 WARN_ON(btrfs_header_generation(mid) != trans->transid);
922
923 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
924
925 if (level < BTRFS_MAX_LEVEL - 1)
926 parent = path->nodes[level + 1];
927 pslot = path->slots[level + 1];
928
929 /*
930 * deal with the case where there is only one pointer in the root
931 * by promoting the node below to a root
932 */
933 if (!parent) {
934 struct extent_buffer *child;
935
936 if (btrfs_header_nritems(mid) != 1)
937 return 0;
938
939 /* promote the child to a root */
940 child = read_node_slot(root, mid, 0);
941 btrfs_tree_lock(child);
942 BUG_ON(!child);
943 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
944 BUG_ON(ret);
945
946 spin_lock(&root->node_lock);
947 root->node = child;
948 spin_unlock(&root->node_lock);
949
950 ret = btrfs_update_extent_ref(trans, root, child->start,
951 mid->start, child->start,
952 root->root_key.objectid,
953 trans->transid, level - 1);
954 BUG_ON(ret);
955
956 add_root_to_dirty_list(root);
957 btrfs_tree_unlock(child);
958 path->locks[level] = 0;
959 path->nodes[level] = NULL;
960 clean_tree_block(trans, root, mid);
961 btrfs_tree_unlock(mid);
962 /* once for the path */
963 free_extent_buffer(mid);
964 ret = btrfs_free_extent(trans, root, mid->start, mid->len,
965 mid->start, root->root_key.objectid,
966 btrfs_header_generation(mid),
967 level, 1);
968 /* once for the root ptr */
969 free_extent_buffer(mid);
970 return ret;
971 }
972 if (btrfs_header_nritems(mid) >
973 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
974 return 0;
975
976 if (btrfs_header_nritems(mid) < 2)
977 err_on_enospc = 1;
978
979 left = read_node_slot(root, parent, pslot - 1);
980 if (left) {
981 btrfs_tree_lock(left);
982 wret = btrfs_cow_block(trans, root, left,
983 parent, pslot - 1, &left, 0);
984 if (wret) {
985 ret = wret;
986 goto enospc;
987 }
988 }
989 right = read_node_slot(root, parent, pslot + 1);
990 if (right) {
991 btrfs_tree_lock(right);
992 wret = btrfs_cow_block(trans, root, right,
993 parent, pslot + 1, &right, 0);
994 if (wret) {
995 ret = wret;
996 goto enospc;
997 }
998 }
999
1000 /* first, try to make some room in the middle buffer */
1001 if (left) {
1002 orig_slot += btrfs_header_nritems(left);
1003 wret = push_node_left(trans, root, left, mid, 1);
1004 if (wret < 0)
1005 ret = wret;
1006 if (btrfs_header_nritems(mid) < 2)
1007 err_on_enospc = 1;
1008 }
1009
1010 /*
1011 * then try to empty the right most buffer into the middle
1012 */
1013 if (right) {
1014 wret = push_node_left(trans, root, mid, right, 1);
1015 if (wret < 0 && wret != -ENOSPC)
1016 ret = wret;
1017 if (btrfs_header_nritems(right) == 0) {
1018 u64 bytenr = right->start;
1019 u64 generation = btrfs_header_generation(parent);
1020 u32 blocksize = right->len;
1021
1022 clean_tree_block(trans, root, right);
1023 btrfs_tree_unlock(right);
1024 free_extent_buffer(right);
1025 right = NULL;
1026 wret = del_ptr(trans, root, path, level + 1, pslot +
1027 1);
1028 if (wret)
1029 ret = wret;
1030 wret = btrfs_free_extent(trans, root, bytenr,
1031 blocksize, parent->start,
1032 btrfs_header_owner(parent),
1033 generation, level, 1);
1034 if (wret)
1035 ret = wret;
1036 } else {
1037 struct btrfs_disk_key right_key;
1038 btrfs_node_key(right, &right_key, 0);
1039 btrfs_set_node_key(parent, &right_key, pslot + 1);
1040 btrfs_mark_buffer_dirty(parent);
1041 }
1042 }
1043 if (btrfs_header_nritems(mid) == 1) {
1044 /*
1045 * we're not allowed to leave a node with one item in the
1046 * tree during a delete. A deletion from lower in the tree
1047 * could try to delete the only pointer in this node.
1048 * So, pull some keys from the left.
1049 * There has to be a left pointer at this point because
1050 * otherwise we would have pulled some pointers from the
1051 * right
1052 */
1053 BUG_ON(!left);
1054 wret = balance_node_right(trans, root, mid, left);
1055 if (wret < 0) {
1056 ret = wret;
1057 goto enospc;
1058 }
1059 if (wret == 1) {
1060 wret = push_node_left(trans, root, left, mid, 1);
1061 if (wret < 0)
1062 ret = wret;
1063 }
1064 BUG_ON(wret == 1);
1065 }
1066 if (btrfs_header_nritems(mid) == 0) {
1067 /* we've managed to empty the middle node, drop it */
1068 u64 root_gen = btrfs_header_generation(parent);
1069 u64 bytenr = mid->start;
1070 u32 blocksize = mid->len;
1071
1072 clean_tree_block(trans, root, mid);
1073 btrfs_tree_unlock(mid);
1074 free_extent_buffer(mid);
1075 mid = NULL;
1076 wret = del_ptr(trans, root, path, level + 1, pslot);
1077 if (wret)
1078 ret = wret;
1079 wret = btrfs_free_extent(trans, root, bytenr, blocksize,
1080 parent->start,
1081 btrfs_header_owner(parent),
1082 root_gen, level, 1);
1083 if (wret)
1084 ret = wret;
1085 } else {
1086 /* update the parent key to reflect our changes */
1087 struct btrfs_disk_key mid_key;
1088 btrfs_node_key(mid, &mid_key, 0);
1089 btrfs_set_node_key(parent, &mid_key, pslot);
1090 btrfs_mark_buffer_dirty(parent);
1091 }
1092
1093 /* update the path */
1094 if (left) {
1095 if (btrfs_header_nritems(left) > orig_slot) {
1096 extent_buffer_get(left);
1097 /* left was locked after cow */
1098 path->nodes[level] = left;
1099 path->slots[level + 1] -= 1;
1100 path->slots[level] = orig_slot;
1101 if (mid) {
1102 btrfs_tree_unlock(mid);
1103 free_extent_buffer(mid);
1104 }
1105 } else {
1106 orig_slot -= btrfs_header_nritems(left);
1107 path->slots[level] = orig_slot;
1108 }
1109 }
1110 /* double check we haven't messed things up */
1111 check_block(root, path, level);
1112 if (orig_ptr !=
1113 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1114 BUG();
1115enospc:
1116 if (right) {
1117 btrfs_tree_unlock(right);
1118 free_extent_buffer(right);
1119 }
1120 if (left) {
1121 if (path->nodes[level] != left)
1122 btrfs_tree_unlock(left);
1123 free_extent_buffer(left);
1124 }
1125 return ret;
1126}
1127
1128/* Node balancing for insertion. Here we only split or push nodes around
1129 * when they are completely full. This is also done top down, so we
1130 * have to be pessimistic.
1131 */
1132static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
1133 struct btrfs_root *root,
1134 struct btrfs_path *path, int level)
1135{
1136 struct extent_buffer *right = NULL;
1137 struct extent_buffer *mid;
1138 struct extent_buffer *left = NULL;
1139 struct extent_buffer *parent = NULL;
1140 int ret = 0;
1141 int wret;
1142 int pslot;
1143 int orig_slot = path->slots[level];
1144 u64 orig_ptr;
1145
1146 if (level == 0)
1147 return 1;
1148
1149 mid = path->nodes[level];
1150 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1151 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1152
1153 if (level < BTRFS_MAX_LEVEL - 1)
1154 parent = path->nodes[level + 1];
1155 pslot = path->slots[level + 1];
1156
1157 if (!parent)
1158 return 1;
1159
1160 left = read_node_slot(root, parent, pslot - 1);
1161
1162 /* first, try to make some room in the middle buffer */
1163 if (left) {
1164 u32 left_nr;
1165
1166 btrfs_tree_lock(left);
1167 left_nr = btrfs_header_nritems(left);
1168 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1169 wret = 1;
1170 } else {
1171 ret = btrfs_cow_block(trans, root, left, parent,
1172 pslot - 1, &left, 0);
1173 if (ret)
1174 wret = 1;
1175 else {
1176 wret = push_node_left(trans, root,
1177 left, mid, 0);
1178 }
1179 }
1180 if (wret < 0)
1181 ret = wret;
1182 if (wret == 0) {
1183 struct btrfs_disk_key disk_key;
1184 orig_slot += left_nr;
1185 btrfs_node_key(mid, &disk_key, 0);
1186 btrfs_set_node_key(parent, &disk_key, pslot);
1187 btrfs_mark_buffer_dirty(parent);
1188 if (btrfs_header_nritems(left) > orig_slot) {
1189 path->nodes[level] = left;
1190 path->slots[level + 1] -= 1;
1191 path->slots[level] = orig_slot;
1192 btrfs_tree_unlock(mid);
1193 free_extent_buffer(mid);
1194 } else {
1195 orig_slot -=
1196 btrfs_header_nritems(left);
1197 path->slots[level] = orig_slot;
1198 btrfs_tree_unlock(left);
1199 free_extent_buffer(left);
1200 }
1201 return 0;
1202 }
1203 btrfs_tree_unlock(left);
1204 free_extent_buffer(left);
1205 }
1206 right = read_node_slot(root, parent, pslot + 1);
1207
1208 /*
1209 * then try to empty the right most buffer into the middle
1210 */
1211 if (right) {
1212 u32 right_nr;
1213 btrfs_tree_lock(right);
1214 right_nr = btrfs_header_nritems(right);
1215 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1216 wret = 1;
1217 } else {
1218 ret = btrfs_cow_block(trans, root, right,
1219 parent, pslot + 1,
1220 &right, 0);
1221 if (ret)
1222 wret = 1;
1223 else {
1224 wret = balance_node_right(trans, root,
1225 right, mid);
1226 }
1227 }
1228 if (wret < 0)
1229 ret = wret;
1230 if (wret == 0) {
1231 struct btrfs_disk_key disk_key;
1232
1233 btrfs_node_key(right, &disk_key, 0);
1234 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1235 btrfs_mark_buffer_dirty(parent);
1236
1237 if (btrfs_header_nritems(mid) <= orig_slot) {
1238 path->nodes[level] = right;
1239 path->slots[level + 1] += 1;
1240 path->slots[level] = orig_slot -
1241 btrfs_header_nritems(mid);
1242 btrfs_tree_unlock(mid);
1243 free_extent_buffer(mid);
1244 } else {
1245 btrfs_tree_unlock(right);
1246 free_extent_buffer(right);
1247 }
1248 return 0;
1249 }
1250 btrfs_tree_unlock(right);
1251 free_extent_buffer(right);
1252 }
1253 return 1;
1254}
1255
1256/*
1257 * readahead one full node of leaves, finding things that are close
1258 * to the block in 'slot', and triggering ra on them.
1259 */
1260static noinline void reada_for_search(struct btrfs_root *root,
1261 struct btrfs_path *path,
1262 int level, int slot, u64 objectid)
1263{
1264 struct extent_buffer *node;
1265 struct btrfs_disk_key disk_key;
1266 u32 nritems;
1267 u64 search;
1268 u64 lowest_read;
1269 u64 highest_read;
1270 u64 nread = 0;
1271 int direction = path->reada;
1272 struct extent_buffer *eb;
1273 u32 nr;
1274 u32 blocksize;
1275 u32 nscan = 0;
1276
1277 if (level != 1)
1278 return;
1279
1280 if (!path->nodes[level])
1281 return;
1282
1283 node = path->nodes[level];
1284
1285 search = btrfs_node_blockptr(node, slot);
1286 blocksize = btrfs_level_size(root, level - 1);
1287 eb = btrfs_find_tree_block(root, search, blocksize);
1288 if (eb) {
1289 free_extent_buffer(eb);
1290 return;
1291 }
1292
1293 highest_read = search;
1294 lowest_read = search;
1295
1296 nritems = btrfs_header_nritems(node);
1297 nr = slot;
1298 while(1) {
1299 if (direction < 0) {
1300 if (nr == 0)
1301 break;
1302 nr--;
1303 } else if (direction > 0) {
1304 nr++;
1305 if (nr >= nritems)
1306 break;
1307 }
1308 if (path->reada < 0 && objectid) {
1309 btrfs_node_key(node, &disk_key, nr);
1310 if (btrfs_disk_key_objectid(&disk_key) != objectid)
1311 break;
1312 }
1313 search = btrfs_node_blockptr(node, nr);
1314 if ((search >= lowest_read && search <= highest_read) ||
1315 (search < lowest_read && lowest_read - search <= 16384) ||
1316 (search > highest_read && search - highest_read <= 16384)) {
1317 readahead_tree_block(root, search, blocksize,
1318 btrfs_node_ptr_generation(node, nr));
1319 nread += blocksize;
1320 }
1321 nscan++;
1322 if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
1323 break;
1324 if(nread > (256 * 1024) || nscan > 128)
1325 break;
1326
1327 if (search < lowest_read)
1328 lowest_read = search;
1329 if (search > highest_read)
1330 highest_read = search;
1331 }
1332}
1333
1334/*
1335 * when we walk down the tree, it is usually safe to unlock the higher layers in
1336 * the tree. The exceptions are when our path goes through slot 0, because operations
1337 * on the tree might require changing key pointers higher up in the tree.
1338 *
1339 * callers might also have set path->keep_locks, which tells this code to
1340 * keep the lock if the path points to the last slot in the block. This is
1341 * part of walking through the tree, and selecting the next slot in the higher
1342 * block.
1343 *
1344 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.
1345 * so if lowest_unlock is 1, level 0 won't be unlocked
1346 */
1347static noinline void unlock_up(struct btrfs_path *path, int level,
1348 int lowest_unlock)
1349{
1350 int i;
1351 int skip_level = level;
1352 int no_skips = 0;
1353 struct extent_buffer *t;
1354
1355 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1356 if (!path->nodes[i])
1357 break;
1358 if (!path->locks[i])
1359 break;
1360 if (!no_skips && path->slots[i] == 0) {
1361 skip_level = i + 1;
1362 continue;
1363 }
1364 if (!no_skips && path->keep_locks) {
1365 u32 nritems;
1366 t = path->nodes[i];
1367 nritems = btrfs_header_nritems(t);
1368 if (nritems < 1 || path->slots[i] >= nritems - 1) {
1369 skip_level = i + 1;
1370 continue;
1371 }
1372 }
1373 if (skip_level < i && i >= lowest_unlock)
1374 no_skips = 1;
1375
1376 t = path->nodes[i];
1377 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1378 btrfs_tree_unlock(t);
1379 path->locks[i] = 0;
1380 }
1381 }
1382}
1383
1384/*
1385 * look for key in the tree. path is filled in with nodes along the way
1386 * if key is found, we return zero and you can find the item in the leaf
1387 * level of the path (level 0)
1388 *
1389 * If the key isn't found, the path points to the slot where it should
1390 * be inserted, and 1 is returned. If there are other errors during the
1391 * search a negative error number is returned.
1392 *
1393 * if ins_len > 0, nodes and leaves will be split as we walk down the
1394 * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
1395 * possible)
1396 */
1397int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1398 *root, struct btrfs_key *key, struct btrfs_path *p, int
1399 ins_len, int cow)
1400{
1401 struct extent_buffer *b;
1402 struct extent_buffer *tmp;
1403 int slot;
1404 int ret;
1405 int level;
1406 int should_reada = p->reada;
1407 int lowest_unlock = 1;
1408 int blocksize;
1409 u8 lowest_level = 0;
1410 u64 blocknr;
1411 u64 gen;
1412 struct btrfs_key prealloc_block;
1413
1414 lowest_level = p->lowest_level;
1415 WARN_ON(lowest_level && ins_len > 0);
1416 WARN_ON(p->nodes[0] != NULL);
1417
1418 if (ins_len < 0)
1419 lowest_unlock = 2;
1420
1421 prealloc_block.objectid = 0;
1422
1423again:
1424 if (p->skip_locking)
1425 b = btrfs_root_node(root);
1426 else
1427 b = btrfs_lock_root_node(root);
1428
1429 while (b) {
1430 level = btrfs_header_level(b);
1431
1432 /*
1433 * setup the path here so we can release it under lock
1434 * contention with the cow code
1435 */
1436 p->nodes[level] = b;
1437 if (!p->skip_locking)
1438 p->locks[level] = 1;
1439
1440 if (cow) {
1441 int wret;
1442
1443 /* is a cow on this block not required */
1444 spin_lock(&root->fs_info->hash_lock);
1445 if (btrfs_header_generation(b) == trans->transid &&
1446 btrfs_header_owner(b) == root->root_key.objectid &&
1447 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1448 spin_unlock(&root->fs_info->hash_lock);
1449 goto cow_done;
1450 }
1451 spin_unlock(&root->fs_info->hash_lock);
1452
1453 /* ok, we have to cow, is our old prealloc the right
1454 * size?
1455 */
1456 if (prealloc_block.objectid &&
1457 prealloc_block.offset != b->len) {
1458 btrfs_free_reserved_extent(root,
1459 prealloc_block.objectid,
1460 prealloc_block.offset);
1461 prealloc_block.objectid = 0;
1462 }
1463
1464 /*
1465 * for higher level blocks, try not to allocate blocks
1466 * with the block and the parent locks held.
1467 */
1468 if (level > 1 && !prealloc_block.objectid &&
1469 btrfs_path_lock_waiting(p, level)) {
1470 u32 size = b->len;
1471 u64 hint = b->start;
1472
1473 btrfs_release_path(root, p);
1474 ret = btrfs_reserve_extent(trans, root,
1475 size, size, 0,
1476 hint, (u64)-1,
1477 &prealloc_block, 0);
1478 BUG_ON(ret);
1479 goto again;
1480 }
1481
1482 wret = btrfs_cow_block(trans, root, b,
1483 p->nodes[level + 1],
1484 p->slots[level + 1],
1485 &b, prealloc_block.objectid);
1486 prealloc_block.objectid = 0;
1487 if (wret) {
1488 free_extent_buffer(b);
1489 ret = wret;
1490 goto done;
1491 }
1492 }
1493cow_done:
1494 BUG_ON(!cow && ins_len);
1495 if (level != btrfs_header_level(b))
1496 WARN_ON(1);
1497 level = btrfs_header_level(b);
1498
1499 p->nodes[level] = b;
1500 if (!p->skip_locking)
1501 p->locks[level] = 1;
1502
1503 ret = check_block(root, p, level);
1504 if (ret) {
1505 ret = -1;
1506 goto done;
1507 }
1508
1509 ret = bin_search(b, key, level, &slot);
1510 if (level != 0) {
1511 if (ret && slot > 0)
1512 slot -= 1;
1513 p->slots[level] = slot;
1514 if (ins_len > 0 && btrfs_header_nritems(b) >=
1515 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1516 int sret = split_node(trans, root, p, level);
1517 BUG_ON(sret > 0);
1518 if (sret) {
1519 ret = sret;
1520 goto done;
1521 }
1522 b = p->nodes[level];
1523 slot = p->slots[level];
1524 } else if (ins_len < 0) {
1525 int sret = balance_level(trans, root, p,
1526 level);
1527 if (sret) {
1528 ret = sret;
1529 goto done;
1530 }
1531 b = p->nodes[level];
1532 if (!b) {
1533 btrfs_release_path(NULL, p);
1534 goto again;
1535 }
1536 slot = p->slots[level];
1537 BUG_ON(btrfs_header_nritems(b) == 1);
1538 }
1539 unlock_up(p, level, lowest_unlock);
1540
1541 /* this is only true while dropping a snapshot */
1542 if (level == lowest_level) {
1543 ret = 0;
1544 goto done;
1545 }
1546
1547 blocknr = btrfs_node_blockptr(b, slot);
1548 gen = btrfs_node_ptr_generation(b, slot);
1549 blocksize = btrfs_level_size(root, level - 1);
1550
1551 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1552 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1553 b = tmp;
1554 } else {
1555 /*
1556 * reduce lock contention at high levels
1557 * of the btree by dropping locks before
1558 * we read.
1559 */
1560 if (level > 1) {
1561 btrfs_release_path(NULL, p);
1562 if (tmp)
1563 free_extent_buffer(tmp);
1564 if (should_reada)
1565 reada_for_search(root, p,
1566 level, slot,
1567 key->objectid);
1568
1569 tmp = read_tree_block(root, blocknr,
1570 blocksize, gen);
1571 if (tmp)
1572 free_extent_buffer(tmp);
1573 goto again;
1574 } else {
1575 if (tmp)
1576 free_extent_buffer(tmp);
1577 if (should_reada)
1578 reada_for_search(root, p,
1579 level, slot,
1580 key->objectid);
1581 b = read_node_slot(root, b, slot);
1582 }
1583 }
1584 if (!p->skip_locking)
1585 btrfs_tree_lock(b);
1586 } else {
1587 p->slots[level] = slot;
1588 if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
1589 sizeof(struct btrfs_item) + ins_len) {
1590 int sret = split_leaf(trans, root, key,
1591 p, ins_len, ret == 0);
1592 BUG_ON(sret > 0);
1593 if (sret) {
1594 ret = sret;
1595 goto done;
1596 }
1597 }
1598 unlock_up(p, level, lowest_unlock);
1599 goto done;
1600 }
1601 }
1602 ret = 1;
1603done:
1604 if (prealloc_block.objectid) {
1605 btrfs_free_reserved_extent(root,
1606 prealloc_block.objectid,
1607 prealloc_block.offset);
1608 }
1609
1610 return ret;
1611}
1612
1613int btrfs_merge_path(struct btrfs_trans_handle *trans,
1614 struct btrfs_root *root,
1615 struct btrfs_key *node_keys,
1616 u64 *nodes, int lowest_level)
1617{
1618 struct extent_buffer *eb;
1619 struct extent_buffer *parent;
1620 struct btrfs_key key;
1621 u64 bytenr;
1622 u64 generation;
1623 u32 blocksize;
1624 int level;
1625 int slot;
1626 int key_match;
1627 int ret;
1628
1629 eb = btrfs_lock_root_node(root);
1630 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1631 BUG_ON(ret);
1632
1633 parent = eb;
1634 while (1) {
1635 level = btrfs_header_level(parent);
1636 if (level == 0 || level <= lowest_level)
1637 break;
1638
1639 ret = bin_search(parent, &node_keys[lowest_level], level,
1640 &slot);
1641 if (ret && slot > 0)
1642 slot--;
1643
1644 bytenr = btrfs_node_blockptr(parent, slot);
1645 if (nodes[level - 1] == bytenr)
1646 break;
1647
1648 blocksize = btrfs_level_size(root, level - 1);
1649 generation = btrfs_node_ptr_generation(parent, slot);
1650 btrfs_node_key_to_cpu(eb, &key, slot);
1651 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1652
1653 if (generation == trans->transid) {
1654 eb = read_tree_block(root, bytenr, blocksize,
1655 generation);
1656 btrfs_tree_lock(eb);
1657 }
1658
1659 /*
1660 * if node keys match and node pointer hasn't been modified
1661 * in the running transaction, we can merge the path. for
1662 * blocks owened by reloc trees, the node pointer check is
1663 * skipped, this is because these blocks are fully controlled
1664 * by the space balance code, no one else can modify them.
1665 */
1666 if (!nodes[level - 1] || !key_match ||
1667 (generation == trans->transid &&
1668 btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
1669 if (level == 1 || level == lowest_level + 1) {
1670 if (generation == trans->transid) {
1671 btrfs_tree_unlock(eb);
1672 free_extent_buffer(eb);
1673 }
1674 break;
1675 }
1676
1677 if (generation != trans->transid) {
1678 eb = read_tree_block(root, bytenr, blocksize,
1679 generation);
1680 btrfs_tree_lock(eb);
1681 }
1682
1683 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1684 &eb, 0);
1685 BUG_ON(ret);
1686
1687 if (root->root_key.objectid ==
1688 BTRFS_TREE_RELOC_OBJECTID) {
1689 if (!nodes[level - 1]) {
1690 nodes[level - 1] = eb->start;
1691 memcpy(&node_keys[level - 1], &key,
1692 sizeof(node_keys[0]));
1693 } else {
1694 WARN_ON(1);
1695 }
1696 }
1697
1698 btrfs_tree_unlock(parent);
1699 free_extent_buffer(parent);
1700 parent = eb;
1701 continue;
1702 }
1703
1704 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1705 btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1706 btrfs_mark_buffer_dirty(parent);
1707
1708 ret = btrfs_inc_extent_ref(trans, root,
1709 nodes[level - 1],
1710 blocksize, parent->start,
1711 btrfs_header_owner(parent),
1712 btrfs_header_generation(parent),
1713 level - 1);
1714 BUG_ON(ret);
1715
1716 /*
1717 * If the block was created in the running transaction,
1718 * it's possible this is the last reference to it, so we
1719 * should drop the subtree.
1720 */
1721 if (generation == trans->transid) {
1722 ret = btrfs_drop_subtree(trans, root, eb, parent);
1723 BUG_ON(ret);
1724 btrfs_tree_unlock(eb);
1725 free_extent_buffer(eb);
1726 } else {
1727 ret = btrfs_free_extent(trans, root, bytenr,
1728 blocksize, parent->start,
1729 btrfs_header_owner(parent),
1730 btrfs_header_generation(parent),
1731 level - 1, 1);
1732 BUG_ON(ret);
1733 }
1734 break;
1735 }
1736 btrfs_tree_unlock(parent);
1737 free_extent_buffer(parent);
1738 return 0;
1739}
1740
1741/*
1742 * adjust the pointers going up the tree, starting at level
1743 * making sure the right key of each node is points to 'key'.
1744 * This is used after shifting pointers to the left, so it stops
1745 * fixing up pointers when a given leaf/node is not in slot 0 of the
1746 * higher levels
1747 *
1748 * If this fails to write a tree block, it returns -1, but continues
1749 * fixing up the blocks in ram so the tree is consistent.
1750 */
1751static int fixup_low_keys(struct btrfs_trans_handle *trans,
1752 struct btrfs_root *root, struct btrfs_path *path,
1753 struct btrfs_disk_key *key, int level)
1754{
1755 int i;
1756 int ret = 0;
1757 struct extent_buffer *t;
1758
1759 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1760 int tslot = path->slots[i];
1761 if (!path->nodes[i])
1762 break;
1763 t = path->nodes[i];
1764 btrfs_set_node_key(t, key, tslot);
1765 btrfs_mark_buffer_dirty(path->nodes[i]);
1766 if (tslot != 0)
1767 break;
1768 }
1769 return ret;
1770}
1771
1772/*
1773 * update item key.
1774 *
1775 * This function isn't completely safe. It's the caller's responsibility
1776 * that the new key won't break the order
1777 */
1778int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1779 struct btrfs_root *root, struct btrfs_path *path,
1780 struct btrfs_key *new_key)
1781{
1782 struct btrfs_disk_key disk_key;
1783 struct extent_buffer *eb;
1784 int slot;
1785
1786 eb = path->nodes[0];
1787 slot = path->slots[0];
1788 if (slot > 0) {
1789 btrfs_item_key(eb, &disk_key, slot - 1);
1790 if (comp_keys(&disk_key, new_key) >= 0)
1791 return -1;
1792 }
1793 if (slot < btrfs_header_nritems(eb) - 1) {
1794 btrfs_item_key(eb, &disk_key, slot + 1);
1795 if (comp_keys(&disk_key, new_key) <= 0)
1796 return -1;
1797 }
1798
1799 btrfs_cpu_key_to_disk(&disk_key, new_key);
1800 btrfs_set_item_key(eb, &disk_key, slot);
1801 btrfs_mark_buffer_dirty(eb);
1802 if (slot == 0)
1803 fixup_low_keys(trans, root, path, &disk_key, 1);
1804 return 0;
1805}
1806
1807/*
1808 * try to push data from one node into the next node left in the
1809 * tree.
1810 *
1811 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1812 * error, and > 0 if there was no room in the left hand block.
1813 */
1814static int push_node_left(struct btrfs_trans_handle *trans,
1815 struct btrfs_root *root, struct extent_buffer *dst,
1816 struct extent_buffer *src, int empty)
1817{
1818 int push_items = 0;
1819 int src_nritems;
1820 int dst_nritems;
1821 int ret = 0;
1822
1823 src_nritems = btrfs_header_nritems(src);
1824 dst_nritems = btrfs_header_nritems(dst);
1825 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1826 WARN_ON(btrfs_header_generation(src) != trans->transid);
1827 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1828
1829 if (!empty && src_nritems <= 8)
1830 return 1;
1831
1832 if (push_items <= 0) {
1833 return 1;
1834 }
1835
1836 if (empty) {
1837 push_items = min(src_nritems, push_items);
1838 if (push_items < src_nritems) {
1839 /* leave at least 8 pointers in the node if
1840 * we aren't going to empty it
1841 */
1842 if (src_nritems - push_items < 8) {
1843 if (push_items <= 8)
1844 return 1;
1845 push_items -= 8;
1846 }
1847 }
1848 } else
1849 push_items = min(src_nritems - 8, push_items);
1850
1851 copy_extent_buffer(dst, src,
1852 btrfs_node_key_ptr_offset(dst_nritems),
1853 btrfs_node_key_ptr_offset(0),
1854 push_items * sizeof(struct btrfs_key_ptr));
1855
1856 if (push_items < src_nritems) {
1857 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
1858 btrfs_node_key_ptr_offset(push_items),
1859 (src_nritems - push_items) *
1860 sizeof(struct btrfs_key_ptr));
1861 }
1862 btrfs_set_header_nritems(src, src_nritems - push_items);
1863 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1864 btrfs_mark_buffer_dirty(src);
1865 btrfs_mark_buffer_dirty(dst);
1866
1867 ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
1868 BUG_ON(ret);
1869
1870 return ret;
1871}
1872
1873/*
1874 * try to push data from one node into the next node right in the
1875 * tree.
1876 *
1877 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
1878 * error, and > 0 if there was no room in the right hand block.
1879 *
1880 * this will only push up to 1/2 the contents of the left node over
1881 */
1882static int balance_node_right(struct btrfs_trans_handle *trans,
1883 struct btrfs_root *root,
1884 struct extent_buffer *dst,
1885 struct extent_buffer *src)
1886{
1887 int push_items = 0;
1888 int max_push;
1889 int src_nritems;
1890 int dst_nritems;
1891 int ret = 0;
1892
1893 WARN_ON(btrfs_header_generation(src) != trans->transid);
1894 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1895
1896 src_nritems = btrfs_header_nritems(src);
1897 dst_nritems = btrfs_header_nritems(dst);
1898 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1899 if (push_items <= 0) {
1900 return 1;
1901 }
1902
1903 if (src_nritems < 4) {
1904 return 1;
1905 }
1906
1907 max_push = src_nritems / 2 + 1;
1908 /* don't try to empty the node */
1909 if (max_push >= src_nritems) {
1910 return 1;
1911 }
1912
1913 if (max_push < push_items)
1914 push_items = max_push;
1915
1916 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
1917 btrfs_node_key_ptr_offset(0),
1918 (dst_nritems) *
1919 sizeof(struct btrfs_key_ptr));
1920
1921 copy_extent_buffer(dst, src,
1922 btrfs_node_key_ptr_offset(0),
1923 btrfs_node_key_ptr_offset(src_nritems - push_items),
1924 push_items * sizeof(struct btrfs_key_ptr));
1925
1926 btrfs_set_header_nritems(src, src_nritems - push_items);
1927 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1928
1929 btrfs_mark_buffer_dirty(src);
1930 btrfs_mark_buffer_dirty(dst);
1931
1932 ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
1933 BUG_ON(ret);
1934
1935 return ret;
1936}
1937
1938/*
1939 * helper function to insert a new root level in the tree.
1940 * A new node is allocated, and a single item is inserted to
1941 * point to the existing root
1942 *
1943 * returns zero on success or < 0 on failure.
1944 */
1945static int noinline insert_new_root(struct btrfs_trans_handle *trans,
1946 struct btrfs_root *root,
1947 struct btrfs_path *path, int level)
1948{
1949 u64 lower_gen;
1950 struct extent_buffer *lower;
1951 struct extent_buffer *c;
1952 struct extent_buffer *old;
1953 struct btrfs_disk_key lower_key;
1954 int ret;
1955
1956 BUG_ON(path->nodes[level]);
1957 BUG_ON(path->nodes[level-1] != root->node);
1958
1959 lower = path->nodes[level-1];
1960 if (level == 1)
1961 btrfs_item_key(lower, &lower_key, 0);
1962 else
1963 btrfs_node_key(lower, &lower_key, 0);
1964
1965 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
1966 root->root_key.objectid, trans->transid,
1967 level, root->node->start, 0);
1968 if (IS_ERR(c))
1969 return PTR_ERR(c);
1970
1971 memset_extent_buffer(c, 0, 0, root->nodesize);
1972 btrfs_set_header_nritems(c, 1);
1973 btrfs_set_header_level(c, level);
1974 btrfs_set_header_bytenr(c, c->start);
1975 btrfs_set_header_generation(c, trans->transid);
1976 btrfs_set_header_owner(c, root->root_key.objectid);
1977
1978 write_extent_buffer(c, root->fs_info->fsid,
1979 (unsigned long)btrfs_header_fsid(c),
1980 BTRFS_FSID_SIZE);
1981
1982 write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
1983 (unsigned long)btrfs_header_chunk_tree_uuid(c),
1984 BTRFS_UUID_SIZE);
1985
1986 btrfs_set_node_key(c, &lower_key, 0);
1987 btrfs_set_node_blockptr(c, 0, lower->start);
1988 lower_gen = btrfs_header_generation(lower);
1989 WARN_ON(lower_gen != trans->transid);
1990
1991 btrfs_set_node_ptr_generation(c, 0, lower_gen);
1992
1993 btrfs_mark_buffer_dirty(c);
1994
1995 spin_lock(&root->node_lock);
1996 old = root->node;
1997 root->node = c;
1998 spin_unlock(&root->node_lock);
1999
2000 ret = btrfs_update_extent_ref(trans, root, lower->start,
2001 lower->start, c->start,
2002 root->root_key.objectid,
2003 trans->transid, level - 1);
2004 BUG_ON(ret);
2005
2006 /* the super has an extra ref to root->node */
2007 free_extent_buffer(old);
2008
2009 add_root_to_dirty_list(root);
2010 extent_buffer_get(c);
2011 path->nodes[level] = c;
2012 path->locks[level] = 1;
2013 path->slots[level] = 0;
2014 return 0;
2015}
2016
2017/*
2018 * worker function to insert a single pointer in a node.
2019 * the node should have enough room for the pointer already
2020 *
2021 * slot and level indicate where you want the key to go, and
2022 * blocknr is the block the key points to.
2023 *
2024 * returns zero on success and < 0 on any error
2025 */
2026static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2027 *root, struct btrfs_path *path, struct btrfs_disk_key
2028 *key, u64 bytenr, int slot, int level)
2029{
2030 struct extent_buffer *lower;
2031 int nritems;
2032
2033 BUG_ON(!path->nodes[level]);
2034 lower = path->nodes[level];
2035 nritems = btrfs_header_nritems(lower);
2036 if (slot > nritems)
2037 BUG();
2038 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
2039 BUG();
2040 if (slot != nritems) {
2041 memmove_extent_buffer(lower,
2042 btrfs_node_key_ptr_offset(slot + 1),
2043 btrfs_node_key_ptr_offset(slot),
2044 (nritems - slot) * sizeof(struct btrfs_key_ptr));
2045 }
2046 btrfs_set_node_key(lower, key, slot);
2047 btrfs_set_node_blockptr(lower, slot, bytenr);
2048 WARN_ON(trans->transid == 0);
2049 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2050 btrfs_set_header_nritems(lower, nritems + 1);
2051 btrfs_mark_buffer_dirty(lower);
2052 return 0;
2053}
2054
2055/*
2056 * split the node at the specified level in path in two.
2057 * The path is corrected to point to the appropriate node after the split
2058 *
2059 * Before splitting this tries to make some room in the node by pushing
2060 * left and right, if either one works, it returns right away.
2061 *
2062 * returns 0 on success and < 0 on failure
2063 */
2064static noinline int split_node(struct btrfs_trans_handle *trans,
2065 struct btrfs_root *root,
2066 struct btrfs_path *path, int level)
2067{
2068 struct extent_buffer *c;
2069 struct extent_buffer *split;
2070 struct btrfs_disk_key disk_key;
2071 int mid;
2072 int ret;
2073 int wret;
2074 u32 c_nritems;
2075
2076 c = path->nodes[level];
2077 WARN_ON(btrfs_header_generation(c) != trans->transid);
2078 if (c == root->node) {
2079 /* trying to split the root, lets make a new one */
2080 ret = insert_new_root(trans, root, path, level + 1);
2081 if (ret)
2082 return ret;
2083 } else {
2084 ret = push_nodes_for_insert(trans, root, path, level);
2085 c = path->nodes[level];
2086 if (!ret && btrfs_header_nritems(c) <
2087 BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2088 return 0;
2089 if (ret < 0)
2090 return ret;
2091 }
2092
2093 c_nritems = btrfs_header_nritems(c);
2094
2095 split = btrfs_alloc_free_block(trans, root, root->nodesize,
2096 path->nodes[level + 1]->start,
2097 root->root_key.objectid,
2098 trans->transid, level, c->start, 0);
2099 if (IS_ERR(split))
2100 return PTR_ERR(split);
2101
2102 btrfs_set_header_flags(split, btrfs_header_flags(c));
2103 btrfs_set_header_level(split, btrfs_header_level(c));
2104 btrfs_set_header_bytenr(split, split->start);
2105 btrfs_set_header_generation(split, trans->transid);
2106 btrfs_set_header_owner(split, root->root_key.objectid);
2107 btrfs_set_header_flags(split, 0);
2108 write_extent_buffer(split, root->fs_info->fsid,
2109 (unsigned long)btrfs_header_fsid(split),
2110 BTRFS_FSID_SIZE);
2111 write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2112 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2113 BTRFS_UUID_SIZE);
2114
2115 mid = (c_nritems + 1) / 2;
2116
2117 copy_extent_buffer(split, c,
2118 btrfs_node_key_ptr_offset(0),
2119 btrfs_node_key_ptr_offset(mid),
2120 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2121 btrfs_set_header_nritems(split, c_nritems - mid);
2122 btrfs_set_header_nritems(c, mid);
2123 ret = 0;
2124
2125 btrfs_mark_buffer_dirty(c);
2126 btrfs_mark_buffer_dirty(split);
2127
2128 btrfs_node_key(split, &disk_key, 0);
2129 wret = insert_ptr(trans, root, path, &disk_key, split->start,
2130 path->slots[level + 1] + 1,
2131 level + 1);
2132 if (wret)
2133 ret = wret;
2134
2135 ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2136 BUG_ON(ret);
2137
2138 if (path->slots[level] >= mid) {
2139 path->slots[level] -= mid;
2140 btrfs_tree_unlock(c);
2141 free_extent_buffer(c);
2142 path->nodes[level] = split;
2143 path->slots[level + 1] += 1;
2144 } else {
2145 btrfs_tree_unlock(split);
2146 free_extent_buffer(split);
2147 }
2148 return ret;
2149}
2150
2151/*
2152 * how many bytes are required to store the items in a leaf. start
2153 * and nr indicate which items in the leaf to check. This totals up the
2154 * space used both by the item structs and the item data
2155 */
2156static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2157{
2158 int data_len;
2159 int nritems = btrfs_header_nritems(l);
2160 int end = min(nritems, start + nr) - 1;
2161
2162 if (!nr)
2163 return 0;
2164 data_len = btrfs_item_end_nr(l, start);
2165 data_len = data_len - btrfs_item_offset_nr(l, end);
2166 data_len += sizeof(struct btrfs_item) * nr;
2167 WARN_ON(data_len < 0);
2168 return data_len;
2169}
2170
2171/*
2172 * The space between the end of the leaf items and
2173 * the start of the leaf data. IOW, how much room
2174 * the leaf has left for both items and data
2175 */
2176int noinline btrfs_leaf_free_space(struct btrfs_root *root,
2177 struct extent_buffer *leaf)
2178{
2179 int nritems = btrfs_header_nritems(leaf);
2180 int ret;
2181 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2182 if (ret < 0) {
2183 printk("leaf free space ret %d, leaf data size %lu, used %d nritems %d\n",
2184 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2185 leaf_space_used(leaf, 0, nritems), nritems);
2186 }
2187 return ret;
2188}
2189
2190/*
2191 * push some data in the path leaf to the right, trying to free up at
2192 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2193 *
2194 * returns 1 if the push failed because the other node didn't have enough
2195 * room, 0 if everything worked out and < 0 if there were major errors.
2196 */
2197static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2198 *root, struct btrfs_path *path, int data_size,
2199 int empty)
2200{
2201 struct extent_buffer *left = path->nodes[0];
2202 struct extent_buffer *right;
2203 struct extent_buffer *upper;
2204 struct btrfs_disk_key disk_key;
2205 int slot;
2206 u32 i;
2207 int free_space;
2208 int push_space = 0;
2209 int push_items = 0;
2210 struct btrfs_item *item;
2211 u32 left_nritems;
2212 u32 nr;
2213 u32 right_nritems;
2214 u32 data_end;
2215 u32 this_item_size;
2216 int ret;
2217
2218 slot = path->slots[1];
2219 if (!path->nodes[1]) {
2220 return 1;
2221 }
2222 upper = path->nodes[1];
2223 if (slot >= btrfs_header_nritems(upper) - 1)
2224 return 1;
2225
2226 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2227
2228 right = read_node_slot(root, upper, slot + 1);
2229 btrfs_tree_lock(right);
2230 free_space = btrfs_leaf_free_space(root, right);
2231 if (free_space < data_size + sizeof(struct btrfs_item))
2232 goto out_unlock;
2233
2234 /* cow and double check */
2235 ret = btrfs_cow_block(trans, root, right, upper,
2236 slot + 1, &right, 0);
2237 if (ret)
2238 goto out_unlock;
2239
2240 free_space = btrfs_leaf_free_space(root, right);
2241 if (free_space < data_size + sizeof(struct btrfs_item))
2242 goto out_unlock;
2243
2244 left_nritems = btrfs_header_nritems(left);
2245 if (left_nritems == 0)
2246 goto out_unlock;
2247
2248 if (empty)
2249 nr = 0;
2250 else
2251 nr = 1;
2252
2253 if (path->slots[0] >= left_nritems)
2254 push_space += data_size + sizeof(*item);
2255
2256 i = left_nritems - 1;
2257 while (i >= nr) {
2258 item = btrfs_item_nr(left, i);
2259
2260 if (!empty && push_items > 0) {
2261 if (path->slots[0] > i)
2262 break;
2263 if (path->slots[0] == i) {
2264 int space = btrfs_leaf_free_space(root, left);
2265 if (space + push_space * 2 > free_space)
2266 break;
2267 }
2268 }
2269
2270 if (path->slots[0] == i)
2271 push_space += data_size + sizeof(*item);
2272
2273 if (!left->map_token) {
2274 map_extent_buffer(left, (unsigned long)item,
2275 sizeof(struct btrfs_item),
2276 &left->map_token, &left->kaddr,
2277 &left->map_start, &left->map_len,
2278 KM_USER1);
2279 }
2280
2281 this_item_size = btrfs_item_size(left, item);
2282 if (this_item_size + sizeof(*item) + push_space > free_space)
2283 break;
2284
2285 push_items++;
2286 push_space += this_item_size + sizeof(*item);
2287 if (i == 0)
2288 break;
2289 i--;
2290 }
2291 if (left->map_token) {
2292 unmap_extent_buffer(left, left->map_token, KM_USER1);
2293 left->map_token = NULL;
2294 }
2295
2296 if (push_items == 0)
2297 goto out_unlock;
2298
2299 if (!empty && push_items == left_nritems)
2300 WARN_ON(1);
2301
2302 /* push left to right */
2303 right_nritems = btrfs_header_nritems(right);
2304
2305 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2306 push_space -= leaf_data_end(root, left);
2307
2308 /* make room in the right data area */
2309 data_end = leaf_data_end(root, right);
2310 memmove_extent_buffer(right,
2311 btrfs_leaf_data(right) + data_end - push_space,
2312 btrfs_leaf_data(right) + data_end,
2313 BTRFS_LEAF_DATA_SIZE(root) - data_end);
2314
2315 /* copy from the left data area */
2316 copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2317 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2318 btrfs_leaf_data(left) + leaf_data_end(root, left),
2319 push_space);
2320
2321 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2322 btrfs_item_nr_offset(0),
2323 right_nritems * sizeof(struct btrfs_item));
2324
2325 /* copy the items from left to right */
2326 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2327 btrfs_item_nr_offset(left_nritems - push_items),
2328 push_items * sizeof(struct btrfs_item));
2329
2330 /* update the item pointers */
2331 right_nritems += push_items;
2332 btrfs_set_header_nritems(right, right_nritems);
2333 push_space = BTRFS_LEAF_DATA_SIZE(root);
2334 for (i = 0; i < right_nritems; i++) {
2335 item = btrfs_item_nr(right, i);
2336 if (!right->map_token) {
2337 map_extent_buffer(right, (unsigned long)item,
2338 sizeof(struct btrfs_item),
2339 &right->map_token, &right->kaddr,
2340 &right->map_start, &right->map_len,
2341 KM_USER1);
2342 }
2343 push_space -= btrfs_item_size(right, item);
2344 btrfs_set_item_offset(right, item, push_space);
2345 }
2346
2347 if (right->map_token) {
2348 unmap_extent_buffer(right, right->map_token, KM_USER1);
2349 right->map_token = NULL;
2350 }
2351 left_nritems -= push_items;
2352 btrfs_set_header_nritems(left, left_nritems);
2353
2354 if (left_nritems)
2355 btrfs_mark_buffer_dirty(left);
2356 btrfs_mark_buffer_dirty(right);
2357
2358 ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2359 BUG_ON(ret);
2360
2361 btrfs_item_key(right, &disk_key, 0);
2362 btrfs_set_node_key(upper, &disk_key, slot + 1);
2363 btrfs_mark_buffer_dirty(upper);
2364
2365 /* then fixup the leaf pointer in the path */
2366 if (path->slots[0] >= left_nritems) {
2367 path->slots[0] -= left_nritems;
2368 if (btrfs_header_nritems(path->nodes[0]) == 0)
2369 clean_tree_block(trans, root, path->nodes[0]);
2370 btrfs_tree_unlock(path->nodes[0]);
2371 free_extent_buffer(path->nodes[0]);
2372 path->nodes[0] = right;
2373 path->slots[1] += 1;
2374 } else {
2375 btrfs_tree_unlock(right);
2376 free_extent_buffer(right);
2377 }
2378 return 0;
2379
2380out_unlock:
2381 btrfs_tree_unlock(right);
2382 free_extent_buffer(right);
2383 return 1;
2384}
2385
2386/*
2387 * push some data in the path leaf to the left, trying to free up at
2388 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2389 */
2390static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2391 *root, struct btrfs_path *path, int data_size,
2392 int empty)
2393{
2394 struct btrfs_disk_key disk_key;
2395 struct extent_buffer *right = path->nodes[0];
2396 struct extent_buffer *left;
2397 int slot;
2398 int i;
2399 int free_space;
2400 int push_space = 0;
2401 int push_items = 0;
2402 struct btrfs_item *item;
2403 u32 old_left_nritems;
2404 u32 right_nritems;
2405 u32 nr;
2406 int ret = 0;
2407 int wret;
2408 u32 this_item_size;
2409 u32 old_left_item_size;
2410
2411 slot = path->slots[1];
2412 if (slot == 0)
2413 return 1;
2414 if (!path->nodes[1])
2415 return 1;
2416
2417 right_nritems = btrfs_header_nritems(right);
2418 if (right_nritems == 0) {
2419 return 1;
2420 }
2421
2422 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2423
2424 left = read_node_slot(root, path->nodes[1], slot - 1);
2425 btrfs_tree_lock(left);
2426 free_space = btrfs_leaf_free_space(root, left);
2427 if (free_space < data_size + sizeof(struct btrfs_item)) {
2428 ret = 1;
2429 goto out;
2430 }
2431
2432 /* cow and double check */
2433 ret = btrfs_cow_block(trans, root, left,
2434 path->nodes[1], slot - 1, &left, 0);
2435 if (ret) {
2436 /* we hit -ENOSPC, but it isn't fatal here */
2437 ret = 1;
2438 goto out;
2439 }
2440
2441 free_space = btrfs_leaf_free_space(root, left);
2442 if (free_space < data_size + sizeof(struct btrfs_item)) {
2443 ret = 1;
2444 goto out;
2445 }
2446
2447 if (empty)
2448 nr = right_nritems;
2449 else
2450 nr = right_nritems - 1;
2451
2452 for (i = 0; i < nr; i++) {
2453 item = btrfs_item_nr(right, i);
2454 if (!right->map_token) {
2455 map_extent_buffer(right, (unsigned long)item,
2456 sizeof(struct btrfs_item),
2457 &right->map_token, &right->kaddr,
2458 &right->map_start, &right->map_len,
2459 KM_USER1);
2460 }
2461
2462 if (!empty && push_items > 0) {
2463 if (path->slots[0] < i)
2464 break;
2465 if (path->slots[0] == i) {
2466 int space = btrfs_leaf_free_space(root, right);
2467 if (space + push_space * 2 > free_space)
2468 break;
2469 }
2470 }
2471
2472 if (path->slots[0] == i)
2473 push_space += data_size + sizeof(*item);
2474
2475 this_item_size = btrfs_item_size(right, item);
2476 if (this_item_size + sizeof(*item) + push_space > free_space)
2477 break;
2478
2479 push_items++;
2480 push_space += this_item_size + sizeof(*item);
2481 }
2482
2483 if (right->map_token) {
2484 unmap_extent_buffer(right, right->map_token, KM_USER1);
2485 right->map_token = NULL;
2486 }
2487
2488 if (push_items == 0) {
2489 ret = 1;
2490 goto out;
2491 }
2492 if (!empty && push_items == btrfs_header_nritems(right))
2493 WARN_ON(1);
2494
2495 /* push data from right to left */
2496 copy_extent_buffer(left, right,
2497 btrfs_item_nr_offset(btrfs_header_nritems(left)),
2498 btrfs_item_nr_offset(0),
2499 push_items * sizeof(struct btrfs_item));
2500
2501 push_space = BTRFS_LEAF_DATA_SIZE(root) -
2502 btrfs_item_offset_nr(right, push_items -1);
2503
2504 copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2505 leaf_data_end(root, left) - push_space,
2506 btrfs_leaf_data(right) +
2507 btrfs_item_offset_nr(right, push_items - 1),
2508 push_space);
2509 old_left_nritems = btrfs_header_nritems(left);
2510 BUG_ON(old_left_nritems < 0);
2511
2512 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2513 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2514 u32 ioff;
2515
2516 item = btrfs_item_nr(left, i);
2517 if (!left->map_token) {
2518 map_extent_buffer(left, (unsigned long)item,
2519 sizeof(struct btrfs_item),
2520 &left->map_token, &left->kaddr,
2521 &left->map_start, &left->map_len,
2522 KM_USER1);
2523 }
2524
2525 ioff = btrfs_item_offset(left, item);
2526 btrfs_set_item_offset(left, item,
2527 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2528 }
2529 btrfs_set_header_nritems(left, old_left_nritems + push_items);
2530 if (left->map_token) {
2531 unmap_extent_buffer(left, left->map_token, KM_USER1);
2532 left->map_token = NULL;
2533 }
2534
2535 /* fixup right node */
2536 if (push_items > right_nritems) {
2537 printk("push items %d nr %u\n", push_items, right_nritems);
2538 WARN_ON(1);
2539 }
2540
2541 if (push_items < right_nritems) {
2542 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2543 leaf_data_end(root, right);
2544 memmove_extent_buffer(right, btrfs_leaf_data(right) +
2545 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2546 btrfs_leaf_data(right) +
2547 leaf_data_end(root, right), push_space);
2548
2549 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2550 btrfs_item_nr_offset(push_items),
2551 (btrfs_header_nritems(right) - push_items) *
2552 sizeof(struct btrfs_item));
2553 }
2554 right_nritems -= push_items;
2555 btrfs_set_header_nritems(right, right_nritems);
2556 push_space = BTRFS_LEAF_DATA_SIZE(root);
2557 for (i = 0; i < right_nritems; i++) {
2558 item = btrfs_item_nr(right, i);
2559
2560 if (!right->map_token) {
2561 map_extent_buffer(right, (unsigned long)item,
2562 sizeof(struct btrfs_item),
2563 &right->map_token, &right->kaddr,
2564 &right->map_start, &right->map_len,
2565 KM_USER1);
2566 }
2567
2568 push_space = push_space - btrfs_item_size(right, item);
2569 btrfs_set_item_offset(right, item, push_space);
2570 }
2571 if (right->map_token) {
2572 unmap_extent_buffer(right, right->map_token, KM_USER1);
2573 right->map_token = NULL;
2574 }
2575
2576 btrfs_mark_buffer_dirty(left);
2577 if (right_nritems)
2578 btrfs_mark_buffer_dirty(right);
2579
2580 ret = btrfs_update_ref(trans, root, right, left,
2581 old_left_nritems, push_items);
2582 BUG_ON(ret);
2583
2584 btrfs_item_key(right, &disk_key, 0);
2585 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2586 if (wret)
2587 ret = wret;
2588
2589 /* then fixup the leaf pointer in the path */
2590 if (path->slots[0] < push_items) {
2591 path->slots[0] += old_left_nritems;
2592 if (btrfs_header_nritems(path->nodes[0]) == 0)
2593 clean_tree_block(trans, root, path->nodes[0]);
2594 btrfs_tree_unlock(path->nodes[0]);
2595 free_extent_buffer(path->nodes[0]);
2596 path->nodes[0] = left;
2597 path->slots[1] -= 1;
2598 } else {
2599 btrfs_tree_unlock(left);
2600 free_extent_buffer(left);
2601 path->slots[0] -= push_items;
2602 }
2603 BUG_ON(path->slots[0] < 0);
2604 return ret;
2605out:
2606 btrfs_tree_unlock(left);
2607 free_extent_buffer(left);
2608 return ret;
2609}
2610
2611/*
2612 * split the path's leaf in two, making sure there is at least data_size
2613 * available for the resulting leaf level of the path.
2614 *
2615 * returns 0 if all went well and < 0 on failure.
2616 */
2617static noinline int split_leaf(struct btrfs_trans_handle *trans,
2618 struct btrfs_root *root,
2619 struct btrfs_key *ins_key,
2620 struct btrfs_path *path, int data_size,
2621 int extend)
2622{
2623 struct extent_buffer *l;
2624 u32 nritems;
2625 int mid;
2626 int slot;
2627 struct extent_buffer *right;
2628 int space_needed = data_size + sizeof(struct btrfs_item);
2629 int data_copy_size;
2630 int rt_data_off;
2631 int i;
2632 int ret = 0;
2633 int wret;
2634 int double_split;
2635 int num_doubles = 0;
2636 struct btrfs_disk_key disk_key;
2637
2638 if (extend)
2639 space_needed = data_size;
2640
2641 /* first try to make some room by pushing left and right */
2642 if (ins_key->type != BTRFS_DIR_ITEM_KEY) {
2643 wret = push_leaf_right(trans, root, path, data_size, 0);
2644 if (wret < 0) {
2645 return wret;
2646 }
2647 if (wret) {
2648 wret = push_leaf_left(trans, root, path, data_size, 0);
2649 if (wret < 0)
2650 return wret;
2651 }
2652 l = path->nodes[0];
2653
2654 /* did the pushes work? */
2655 if (btrfs_leaf_free_space(root, l) >= space_needed)
2656 return 0;
2657 }
2658
2659 if (!path->nodes[1]) {
2660 ret = insert_new_root(trans, root, path, 1);
2661 if (ret)
2662 return ret;
2663 }
2664again:
2665 double_split = 0;
2666 l = path->nodes[0];
2667 slot = path->slots[0];
2668 nritems = btrfs_header_nritems(l);
2669 mid = (nritems + 1)/ 2;
2670
2671 right = btrfs_alloc_free_block(trans, root, root->leafsize,
2672 path->nodes[1]->start,
2673 root->root_key.objectid,
2674 trans->transid, 0, l->start, 0);
2675 if (IS_ERR(right)) {
2676 BUG_ON(1);
2677 return PTR_ERR(right);
2678 }
2679
2680 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2681 btrfs_set_header_bytenr(right, right->start);
2682 btrfs_set_header_generation(right, trans->transid);
2683 btrfs_set_header_owner(right, root->root_key.objectid);
2684 btrfs_set_header_level(right, 0);
2685 write_extent_buffer(right, root->fs_info->fsid,
2686 (unsigned long)btrfs_header_fsid(right),
2687 BTRFS_FSID_SIZE);
2688
2689 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2690 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2691 BTRFS_UUID_SIZE);
2692 if (mid <= slot) {
2693 if (nritems == 1 ||
2694 leaf_space_used(l, mid, nritems - mid) + space_needed >
2695 BTRFS_LEAF_DATA_SIZE(root)) {
2696 if (slot >= nritems) {
2697 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2698 btrfs_set_header_nritems(right, 0);
2699 wret = insert_ptr(trans, root, path,
2700 &disk_key, right->start,
2701 path->slots[1] + 1, 1);
2702 if (wret)
2703 ret = wret;
2704
2705 btrfs_tree_unlock(path->nodes[0]);
2706 free_extent_buffer(path->nodes[0]);
2707 path->nodes[0] = right;
2708 path->slots[0] = 0;
2709 path->slots[1] += 1;
2710 btrfs_mark_buffer_dirty(right);
2711 return ret;
2712 }
2713 mid = slot;
2714 if (mid != nritems &&
2715 leaf_space_used(l, mid, nritems - mid) +
2716 space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
2717 double_split = 1;
2718 }
2719 }
2720 } else {
2721 if (leaf_space_used(l, 0, mid + 1) + space_needed >
2722 BTRFS_LEAF_DATA_SIZE(root)) {
2723 if (!extend && slot == 0) {
2724 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2725 btrfs_set_header_nritems(right, 0);
2726 wret = insert_ptr(trans, root, path,
2727 &disk_key,
2728 right->start,
2729 path->slots[1], 1);
2730 if (wret)
2731 ret = wret;
2732 btrfs_tree_unlock(path->nodes[0]);
2733 free_extent_buffer(path->nodes[0]);
2734 path->nodes[0] = right;
2735 path->slots[0] = 0;
2736 if (path->slots[1] == 0) {
2737 wret = fixup_low_keys(trans, root,
2738 path, &disk_key, 1);
2739 if (wret)
2740 ret = wret;
2741 }
2742 btrfs_mark_buffer_dirty(right);
2743 return ret;
2744 } else if (extend && slot == 0) {
2745 mid = 1;
2746 } else {
2747 mid = slot;
2748 if (mid != nritems &&
2749 leaf_space_used(l, mid, nritems - mid) +
2750 space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
2751 double_split = 1;
2752 }
2753 }
2754 }
2755 }
2756 nritems = nritems - mid;
2757 btrfs_set_header_nritems(right, nritems);
2758 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2759
2760 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2761 btrfs_item_nr_offset(mid),
2762 nritems * sizeof(struct btrfs_item));
2763
2764 copy_extent_buffer(right, l,
2765 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2766 data_copy_size, btrfs_leaf_data(l) +
2767 leaf_data_end(root, l), data_copy_size);
2768
2769 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2770 btrfs_item_end_nr(l, mid);
2771
2772 for (i = 0; i < nritems; i++) {
2773 struct btrfs_item *item = btrfs_item_nr(right, i);
2774 u32 ioff;
2775
2776 if (!right->map_token) {
2777 map_extent_buffer(right, (unsigned long)item,
2778 sizeof(struct btrfs_item),
2779 &right->map_token, &right->kaddr,
2780 &right->map_start, &right->map_len,
2781 KM_USER1);
2782 }
2783
2784 ioff = btrfs_item_offset(right, item);
2785 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2786 }
2787
2788 if (right->map_token) {
2789 unmap_extent_buffer(right, right->map_token, KM_USER1);
2790 right->map_token = NULL;
2791 }
2792
2793 btrfs_set_header_nritems(l, mid);
2794 ret = 0;
2795 btrfs_item_key(right, &disk_key, 0);
2796 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2797 path->slots[1] + 1, 1);
2798 if (wret)
2799 ret = wret;
2800
2801 btrfs_mark_buffer_dirty(right);
2802 btrfs_mark_buffer_dirty(l);
2803 BUG_ON(path->slots[0] != slot);
2804
2805 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2806 BUG_ON(ret);
2807
2808 if (mid <= slot) {
2809 btrfs_tree_unlock(path->nodes[0]);
2810 free_extent_buffer(path->nodes[0]);
2811 path->nodes[0] = right;
2812 path->slots[0] -= mid;
2813 path->slots[1] += 1;
2814 } else {
2815 btrfs_tree_unlock(right);
2816 free_extent_buffer(right);
2817 }
2818
2819 BUG_ON(path->slots[0] < 0);
2820
2821 if (double_split) {
2822 BUG_ON(num_doubles != 0);
2823 num_doubles++;
2824 goto again;
2825 }
2826 return ret;
2827}
2828
2829/*
2830 * make the item pointed to by the path smaller. new_size indicates
2831 * how small to make it, and from_end tells us if we just chop bytes
2832 * off the end of the item or if we shift the item to chop bytes off
2833 * the front.
2834 */
2835int btrfs_truncate_item(struct btrfs_trans_handle *trans,
2836 struct btrfs_root *root,
2837 struct btrfs_path *path,
2838 u32 new_size, int from_end)
2839{
2840 int ret = 0;
2841 int slot;
2842 int slot_orig;
2843 struct extent_buffer *leaf;
2844 struct btrfs_item *item;
2845 u32 nritems;
2846 unsigned int data_end;
2847 unsigned int old_data_start;
2848 unsigned int old_size;
2849 unsigned int size_diff;
2850 int i;
2851
2852 slot_orig = path->slots[0];
2853 leaf = path->nodes[0];
2854 slot = path->slots[0];
2855
2856 old_size = btrfs_item_size_nr(leaf, slot);
2857 if (old_size == new_size)
2858 return 0;
2859
2860 nritems = btrfs_header_nritems(leaf);
2861 data_end = leaf_data_end(root, leaf);
2862
2863 old_data_start = btrfs_item_offset_nr(leaf, slot);
2864
2865 size_diff = old_size - new_size;
2866
2867 BUG_ON(slot < 0);
2868 BUG_ON(slot >= nritems);
2869
2870 /*
2871 * item0..itemN ... dataN.offset..dataN.size .. data0.size
2872 */
2873 /* first correct the data pointers */
2874 for (i = slot; i < nritems; i++) {
2875 u32 ioff;
2876 item = btrfs_item_nr(leaf, i);
2877
2878 if (!leaf->map_token) {
2879 map_extent_buffer(leaf, (unsigned long)item,
2880 sizeof(struct btrfs_item),
2881 &leaf->map_token, &leaf->kaddr,
2882 &leaf->map_start, &leaf->map_len,
2883 KM_USER1);
2884 }
2885
2886 ioff = btrfs_item_offset(leaf, item);
2887 btrfs_set_item_offset(leaf, item, ioff + size_diff);
2888 }
2889
2890 if (leaf->map_token) {
2891 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2892 leaf->map_token = NULL;
2893 }
2894
2895 /* shift the data */
2896 if (from_end) {
2897 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2898 data_end + size_diff, btrfs_leaf_data(leaf) +
2899 data_end, old_data_start + new_size - data_end);
2900 } else {
2901 struct btrfs_disk_key disk_key;
2902 u64 offset;
2903
2904 btrfs_item_key(leaf, &disk_key, slot);
2905
2906 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
2907 unsigned long ptr;
2908 struct btrfs_file_extent_item *fi;
2909
2910 fi = btrfs_item_ptr(leaf, slot,
2911 struct btrfs_file_extent_item);
2912 fi = (struct btrfs_file_extent_item *)(
2913 (unsigned long)fi - size_diff);
2914
2915 if (btrfs_file_extent_type(leaf, fi) ==
2916 BTRFS_FILE_EXTENT_INLINE) {
2917 ptr = btrfs_item_ptr_offset(leaf, slot);
2918 memmove_extent_buffer(leaf, ptr,
2919 (unsigned long)fi,
2920 offsetof(struct btrfs_file_extent_item,
2921 disk_bytenr));
2922 }
2923 }
2924
2925 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2926 data_end + size_diff, btrfs_leaf_data(leaf) +
2927 data_end, old_data_start - data_end);
2928
2929 offset = btrfs_disk_key_offset(&disk_key);
2930 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
2931 btrfs_set_item_key(leaf, &disk_key, slot);
2932 if (slot == 0)
2933 fixup_low_keys(trans, root, path, &disk_key, 1);
2934 }
2935
2936 item = btrfs_item_nr(leaf, slot);
2937 btrfs_set_item_size(leaf, item, new_size);
2938 btrfs_mark_buffer_dirty(leaf);
2939
2940 ret = 0;
2941 if (btrfs_leaf_free_space(root, leaf) < 0) {
2942 btrfs_print_leaf(root, leaf);
2943 BUG();
2944 }
2945 return ret;
2946}
2947
2948/*
2949 * make the item pointed to by the path bigger, data_size is the new size.
2950 */
2951int btrfs_extend_item(struct btrfs_trans_handle *trans,
2952 struct btrfs_root *root, struct btrfs_path *path,
2953 u32 data_size)
2954{
2955 int ret = 0;
2956 int slot;
2957 int slot_orig;
2958 struct extent_buffer *leaf;
2959 struct btrfs_item *item;
2960 u32 nritems;
2961 unsigned int data_end;
2962 unsigned int old_data;
2963 unsigned int old_size;
2964 int i;
2965
2966 slot_orig = path->slots[0];
2967 leaf = path->nodes[0];
2968
2969 nritems = btrfs_header_nritems(leaf);
2970 data_end = leaf_data_end(root, leaf);
2971
2972 if (btrfs_leaf_free_space(root, leaf) < data_size) {
2973 btrfs_print_leaf(root, leaf);
2974 BUG();
2975 }
2976 slot = path->slots[0];
2977 old_data = btrfs_item_end_nr(leaf, slot);
2978
2979 BUG_ON(slot < 0);
2980 if (slot >= nritems) {
2981 btrfs_print_leaf(root, leaf);
2982 printk("slot %d too large, nritems %d\n", slot, nritems);
2983 BUG_ON(1);
2984 }
2985
2986 /*
2987 * item0..itemN ... dataN.offset..dataN.size .. data0.size
2988 */
2989 /* first correct the data pointers */
2990 for (i = slot; i < nritems; i++) {
2991 u32 ioff;
2992 item = btrfs_item_nr(leaf, i);
2993
2994 if (!leaf->map_token) {
2995 map_extent_buffer(leaf, (unsigned long)item,
2996 sizeof(struct btrfs_item),
2997 &leaf->map_token, &leaf->kaddr,
2998 &leaf->map_start, &leaf->map_len,
2999 KM_USER1);
3000 }
3001 ioff = btrfs_item_offset(leaf, item);
3002 btrfs_set_item_offset(leaf, item, ioff - data_size);
3003 }
3004
3005 if (leaf->map_token) {
3006 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3007 leaf->map_token = NULL;
3008 }
3009
3010 /* shift the data */
3011 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3012 data_end - data_size, btrfs_leaf_data(leaf) +
3013 data_end, old_data - data_end);
3014
3015 data_end = old_data;
3016 old_size = btrfs_item_size_nr(leaf, slot);
3017 item = btrfs_item_nr(leaf, slot);
3018 btrfs_set_item_size(leaf, item, old_size + data_size);
3019 btrfs_mark_buffer_dirty(leaf);
3020
3021 ret = 0;
3022 if (btrfs_leaf_free_space(root, leaf) < 0) {
3023 btrfs_print_leaf(root, leaf);
3024 BUG();
3025 }
3026 return ret;
3027}
3028
3029/*
3030 * Given a key and some data, insert items into the tree.
3031 * This does all the path init required, making room in the tree if needed.
3032 * Returns the number of keys that were inserted.
3033 */
3034int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3035 struct btrfs_root *root,
3036 struct btrfs_path *path,
3037 struct btrfs_key *cpu_key, u32 *data_size,
3038 int nr)
3039{
3040 struct extent_buffer *leaf;
3041 struct btrfs_item *item;
3042 int ret = 0;
3043 int slot;
3044 int i;
3045 u32 nritems;
3046 u32 total_data = 0;
3047 u32 total_size = 0;
3048 unsigned int data_end;
3049 struct btrfs_disk_key disk_key;
3050 struct btrfs_key found_key;
3051
3052 found_key.objectid = 0;
3053 nr = min_t(int, nr, BTRFS_NODEPTRS_PER_BLOCK(root));
3054
3055 for (i = 0; i < nr; i++)
3056 total_data += data_size[i];
3057
3058 total_data = min_t(u32, total_data, BTRFS_LEAF_DATA_SIZE(root));
3059 total_size = total_data + (nr * sizeof(struct btrfs_item));
3060 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3061 if (ret == 0)
3062 return -EEXIST;
3063 if (ret < 0)
3064 goto out;
3065
3066 leaf = path->nodes[0];
3067
3068 nritems = btrfs_header_nritems(leaf);
3069 data_end = leaf_data_end(root, leaf);
3070
3071 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3072 for (i = nr; i >= 0; i--) {
3073 total_data -= data_size[i];
3074 total_size -= data_size[i] + sizeof(struct btrfs_item);
3075 if (total_size < btrfs_leaf_free_space(root, leaf))
3076 break;
3077 }
3078 nr = i;
3079 }
3080
3081 slot = path->slots[0];
3082 BUG_ON(slot < 0);
3083
3084 if (slot != nritems) {
3085 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3086
3087 item = btrfs_item_nr(leaf, slot);
3088 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3089
3090 /* figure out how many keys we can insert in here */
3091 total_data = data_size[0];
3092 for (i = 1; i < nr; i++) {
3093 if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3094 break;
3095 total_data += data_size[i];
3096 }
3097 nr = i;
3098
3099 if (old_data < data_end) {
3100 btrfs_print_leaf(root, leaf);
3101 printk("slot %d old_data %d data_end %d\n",
3102 slot, old_data, data_end);
3103 BUG_ON(1);
3104 }
3105 /*
3106 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3107 */
3108 /* first correct the data pointers */
3109 WARN_ON(leaf->map_token);
3110 for (i = slot; i < nritems; i++) {
3111 u32 ioff;
3112
3113 item = btrfs_item_nr(leaf, i);
3114 if (!leaf->map_token) {
3115 map_extent_buffer(leaf, (unsigned long)item,
3116 sizeof(struct btrfs_item),
3117 &leaf->map_token, &leaf->kaddr,
3118 &leaf->map_start, &leaf->map_len,
3119 KM_USER1);
3120 }
3121
3122 ioff = btrfs_item_offset(leaf, item);
3123 btrfs_set_item_offset(leaf, item, ioff - total_data);
3124 }
3125 if (leaf->map_token) {
3126 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3127 leaf->map_token = NULL;
3128 }
3129
3130 /* shift the items */
3131 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3132 btrfs_item_nr_offset(slot),
3133 (nritems - slot) * sizeof(struct btrfs_item));
3134
3135 /* shift the data */
3136 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3137 data_end - total_data, btrfs_leaf_data(leaf) +
3138 data_end, old_data - data_end);
3139 data_end = old_data;
3140 } else {
3141 /*
3142 * this sucks but it has to be done, if we are inserting at
3143 * the end of the leaf only insert 1 of the items, since we
3144 * have no way of knowing whats on the next leaf and we'd have
3145 * to drop our current locks to figure it out
3146 */
3147 nr = 1;
3148 }
3149
3150 /* setup the item for the new data */
3151 for (i = 0; i < nr; i++) {
3152 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3153 btrfs_set_item_key(leaf, &disk_key, slot + i);
3154 item = btrfs_item_nr(leaf, slot + i);
3155 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3156 data_end -= data_size[i];
3157 btrfs_set_item_size(leaf, item, data_size[i]);
3158 }
3159 btrfs_set_header_nritems(leaf, nritems + nr);
3160 btrfs_mark_buffer_dirty(leaf);
3161
3162 ret = 0;
3163 if (slot == 0) {
3164 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3165 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3166 }
3167
3168 if (btrfs_leaf_free_space(root, leaf) < 0) {
3169 btrfs_print_leaf(root, leaf);
3170 BUG();
3171 }
3172out:
3173 if (!ret)
3174 ret = nr;
3175 return ret;
3176}
3177
3178/*
3179 * Given a key and some data, insert items into the tree.
3180 * This does all the path init required, making room in the tree if needed.
3181 */
3182int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3183 struct btrfs_root *root,
3184 struct btrfs_path *path,
3185 struct btrfs_key *cpu_key, u32 *data_size,
3186 int nr)
3187{
3188 struct extent_buffer *leaf;
3189 struct btrfs_item *item;
3190 int ret = 0;
3191 int slot;
3192 int slot_orig;
3193 int i;
3194 u32 nritems;
3195 u32 total_size = 0;
3196 u32 total_data = 0;
3197 unsigned int data_end;
3198 struct btrfs_disk_key disk_key;
3199
3200 for (i = 0; i < nr; i++) {
3201 total_data += data_size[i];
3202 }
3203
3204 total_size = total_data + (nr * sizeof(struct btrfs_item));
3205 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3206 if (ret == 0)
3207 return -EEXIST;
3208 if (ret < 0)
3209 goto out;
3210
3211 slot_orig = path->slots[0];
3212 leaf = path->nodes[0];
3213
3214 nritems = btrfs_header_nritems(leaf);
3215 data_end = leaf_data_end(root, leaf);
3216
3217 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3218 btrfs_print_leaf(root, leaf);
3219 printk("not enough freespace need %u have %d\n",
3220 total_size, btrfs_leaf_free_space(root, leaf));
3221 BUG();
3222 }
3223
3224 slot = path->slots[0];
3225 BUG_ON(slot < 0);
3226
3227 if (slot != nritems) {
3228 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3229
3230 if (old_data < data_end) {
3231 btrfs_print_leaf(root, leaf);
3232 printk("slot %d old_data %d data_end %d\n",
3233 slot, old_data, data_end);
3234 BUG_ON(1);
3235 }
3236 /*
3237 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3238 */
3239 /* first correct the data pointers */
3240 WARN_ON(leaf->map_token);
3241 for (i = slot; i < nritems; i++) {
3242 u32 ioff;
3243
3244 item = btrfs_item_nr(leaf, i);
3245 if (!leaf->map_token) {
3246 map_extent_buffer(leaf, (unsigned long)item,
3247 sizeof(struct btrfs_item),
3248 &leaf->map_token, &leaf->kaddr,
3249 &leaf->map_start, &leaf->map_len,
3250 KM_USER1);
3251 }
3252
3253 ioff = btrfs_item_offset(leaf, item);
3254 btrfs_set_item_offset(leaf, item, ioff - total_data);
3255 }
3256 if (leaf->map_token) {
3257 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3258 leaf->map_token = NULL;
3259 }
3260
3261 /* shift the items */
3262 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3263 btrfs_item_nr_offset(slot),
3264 (nritems - slot) * sizeof(struct btrfs_item));
3265
3266 /* shift the data */
3267 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3268 data_end - total_data, btrfs_leaf_data(leaf) +
3269 data_end, old_data - data_end);
3270 data_end = old_data;
3271 }
3272
3273 /* setup the item for the new data */
3274 for (i = 0; i < nr; i++) {
3275 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3276 btrfs_set_item_key(leaf, &disk_key, slot + i);
3277 item = btrfs_item_nr(leaf, slot + i);
3278 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3279 data_end -= data_size[i];
3280 btrfs_set_item_size(leaf, item, data_size[i]);
3281 }
3282 btrfs_set_header_nritems(leaf, nritems + nr);
3283 btrfs_mark_buffer_dirty(leaf);
3284
3285 ret = 0;
3286 if (slot == 0) {
3287 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3288 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3289 }
3290
3291 if (btrfs_leaf_free_space(root, leaf) < 0) {
3292 btrfs_print_leaf(root, leaf);
3293 BUG();
3294 }
3295out:
3296 return ret;
3297}
3298
3299/*
3300 * Given a key and some data, insert an item into the tree.
3301 * This does all the path init required, making room in the tree if needed.
3302 */
3303int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3304 *root, struct btrfs_key *cpu_key, void *data, u32
3305 data_size)
3306{
3307 int ret = 0;
3308 struct btrfs_path *path;
3309 struct extent_buffer *leaf;
3310 unsigned long ptr;
3311
3312 path = btrfs_alloc_path();
3313 BUG_ON(!path);
3314 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3315 if (!ret) {
3316 leaf = path->nodes[0];
3317 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3318 write_extent_buffer(leaf, data, ptr, data_size);
3319 btrfs_mark_buffer_dirty(leaf);
3320 }
3321 btrfs_free_path(path);
3322 return ret;
3323}
3324
3325/*
3326 * delete the pointer from a given node.
3327 *
3328 * the tree should have been previously balanced so the deletion does not
3329 * empty a node.
3330 */
3331static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3332 struct btrfs_path *path, int level, int slot)
3333{
3334 struct extent_buffer *parent = path->nodes[level];
3335 u32 nritems;
3336 int ret = 0;
3337 int wret;
3338
3339 nritems = btrfs_header_nritems(parent);
3340 if (slot != nritems -1) {
3341 memmove_extent_buffer(parent,
3342 btrfs_node_key_ptr_offset(slot),
3343 btrfs_node_key_ptr_offset(slot + 1),
3344 sizeof(struct btrfs_key_ptr) *
3345 (nritems - slot - 1));
3346 }
3347 nritems--;
3348 btrfs_set_header_nritems(parent, nritems);
3349 if (nritems == 0 && parent == root->node) {
3350 BUG_ON(btrfs_header_level(root->node) != 1);
3351 /* just turn the root into a leaf and break */
3352 btrfs_set_header_level(root->node, 0);
3353 } else if (slot == 0) {
3354 struct btrfs_disk_key disk_key;
3355
3356 btrfs_node_key(parent, &disk_key, 0);
3357 wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3358 if (wret)
3359 ret = wret;
3360 }
3361 btrfs_mark_buffer_dirty(parent);
3362 return ret;
3363}
3364
3365/*
3366 * a helper function to delete the leaf pointed to by path->slots[1] and
3367 * path->nodes[1]. bytenr is the node block pointer, but since the callers
3368 * already know it, it is faster to have them pass it down than to
3369 * read it out of the node again.
3370 *
3371 * This deletes the pointer in path->nodes[1] and frees the leaf
3372 * block extent. zero is returned if it all worked out, < 0 otherwise.
3373 *
3374 * The path must have already been setup for deleting the leaf, including
3375 * all the proper balancing. path->nodes[1] must be locked.
3376 */
3377noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3378 struct btrfs_root *root,
3379 struct btrfs_path *path, u64 bytenr)
3380{
3381 int ret;
3382 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3383
3384 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3385 if (ret)
3386 return ret;
3387
3388 ret = btrfs_free_extent(trans, root, bytenr,
3389 btrfs_level_size(root, 0),
3390 path->nodes[1]->start,
3391 btrfs_header_owner(path->nodes[1]),
3392 root_gen, 0, 1);
3393 return ret;
3394}
3395/*
3396 * delete the item at the leaf level in path. If that empties
3397 * the leaf, remove it from the tree
 *
 * 'nr' consecutive items starting at 'slot' are removed from the leaf in
 * path->nodes[0].  If the leaf ends up using less than a quarter of
 * BTRFS_LEAF_DATA_SIZE(root), its remaining items are pushed into the
 * neighbouring leaves when possible and the leaf is deleted if that
 * empties it.
 *
 * Returns 0, or a negative error from fixup_low_keys() or the push_leaf
 * helpers (-ENOSPC from the latter is ignored).  A failure of
 * btrfs_del_leaf() triggers BUG_ON().
3398 */
3399int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3400 struct btrfs_path *path, int slot, int nr)
3401{
3402 struct extent_buffer *leaf;
3403 struct btrfs_item *item;
3404 int last_off;
3405 int dsize = 0;
3406 int ret = 0;
3407 int wret;
3408 int i;
3409 u32 nritems;
3410
3411 leaf = path->nodes[0];
 /* data offset of the last item being deleted */
3412 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3413
 /* dsize: total number of data bytes owned by the deleted items */
3414 for (i = 0; i < nr; i++)
3415 dsize += btrfs_item_size_nr(leaf, slot + i);
3416
3417 nritems = btrfs_header_nritems(leaf);
3418
 /*
  * items follow the deleted range: move their data up by dsize to
  * close the hole, fix their offsets, then move their headers down
  * over the deleted headers
  */
3419 if (slot + nr != nritems) {
3420 int data_end = leaf_data_end(root, leaf);
3421
3422 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3423 data_end + dsize,
3424 btrfs_leaf_data(leaf) + data_end,
3425 last_off - data_end);
3426
3427 for (i = slot + nr; i < nritems; i++) {
3428 u32 ioff;
3429
3430 item = btrfs_item_nr(leaf, i);
3431 if (!leaf->map_token) {
3432 map_extent_buffer(leaf, (unsigned long)item,
3433 sizeof(struct btrfs_item),
3434 &leaf->map_token, &leaf->kaddr,
3435 &leaf->map_start, &leaf->map_len,
3436 KM_USER1);
3437 }
3438 ioff = btrfs_item_offset(leaf, item);
3439 btrfs_set_item_offset(leaf, item, ioff + dsize);
3440 }
3441
3442 if (leaf->map_token) {
3443 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3444 leaf->map_token = NULL;
3445 }
3446
3447 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3448 btrfs_item_nr_offset(slot + nr),
3449 sizeof(struct btrfs_item) *
3450 (nritems - slot - nr));
3451 }
3452 btrfs_set_header_nritems(leaf, nritems - nr);
3453 nritems -= nr;
3454
3455 /* delete the leaf if we've emptied it */
3456 if (nritems == 0) {
 /* an empty root is kept and marked as a level 0 block */
3457 if (leaf == root->node) {
3458 btrfs_set_header_level(leaf, 0);
3459 } else {
3460 ret = btrfs_del_leaf(trans, root, path, leaf->start);
3461 BUG_ON(ret);
3462 }
3463 } else {
3464 int used = leaf_space_used(leaf, 0, nritems);
 /* the first key of the leaf changed, update the parents */
3465 if (slot == 0) {
3466 struct btrfs_disk_key disk_key;
3467
3468 btrfs_item_key(leaf, &disk_key, 0);
3469 wret = fixup_low_keys(trans, root, path,
3470 &disk_key, 1);
3471 if (wret)
3472 ret = wret;
3473 }
3474
3475 /* delete the leaf if it is mostly empty */
3476 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3477 /* push_leaf_left fixes the path.
3478 * make sure the path still points to our leaf
3479 * for possible call to del_ptr below
3480 */
 /* 'slot' is reused here to remember the leaf's slot in its parent */
3481 slot = path->slots[1];
 /* extra reference, dropped by free_extent_buffer() below */
3482 extent_buffer_get(leaf);
3483
3484 wret = push_leaf_left(trans, root, path, 1, 1);
3485 if (wret < 0 && wret != -ENOSPC)
3486 ret = wret;
3487
 /* only try the right neighbour if the path is still on our leaf and it has items left */
3488 if (path->nodes[0] == leaf &&
3489 btrfs_header_nritems(leaf)) {
3490 wret = push_leaf_right(trans, root, path, 1, 1);
3491 if (wret < 0 && wret != -ENOSPC)
3492 ret = wret;
3493 }
3494
3495 if (btrfs_header_nritems(leaf) == 0) {
3496 path->slots[1] = slot;
 /*
  * NOTE(review): this assignment replaces any error stored in
  * ret by the fixup/push calls above; after BUG_ON(ret) the
  * function returns 0 on this branch.
  */
3497 ret = btrfs_del_leaf(trans, root, path, leaf->start);
3498 BUG_ON(ret);
3499 free_extent_buffer(leaf);
3500 } else {
3501 /* if we're still in the path, make sure
3502 * we're dirty. Otherwise, one of the
3503 * push_leaf functions must have already
3504 * dirtied this buffer
3505 */
3506 if (path->nodes[0] == leaf)
3507 btrfs_mark_buffer_dirty(leaf);
3508 free_extent_buffer(leaf);
3509 }
3510 } else {
3511 btrfs_mark_buffer_dirty(leaf);
3512 }
3513 }
3514 return ret;
3515}
3516
3517/*
3518 * search the tree again to find a leaf with lesser keys
3519 * returns 0 if it found something or 1 if there are no lesser leaves.
3520 * returns < 0 on io errors.
3521 *
3522 * This may release the path, and so you may lose any locks held at the
3523 * time you call it.
3524 */
3525int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3526{
3527 struct btrfs_key key;
3528 struct btrfs_disk_key found_key;
3529 int ret;
3530
3531 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3532
3533 if (key.offset > 0)
3534 key.offset--;
3535 else if (key.type > 0)
3536 key.type--;
3537 else if (key.objectid > 0)
3538 key.objectid--;
3539 else
3540 return 1;
3541
3542 btrfs_release_path(root, path);
3543 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3544 if (ret < 0)
3545 return ret;
3546 btrfs_item_key(path->nodes[0], &found_key, 0);
3547 ret = comp_keys(&found_key, &key);
3548 if (ret < 0)
3549 return 0;
3550 return 1;
3551}
3552
3553/*
3554 * A helper function to walk down the tree starting at min_key, and looking
3555 * for nodes or leaves that are either in cache or have a minimum
3556 * transaction id. This is used by the btree defrag code, and tree logging
3557 *
3558 * This does not cow, but it does stuff the starting key it finds back
3559 * into min_key, so you can call btrfs_search_slot with cow=1 on the
3560 * key and get a writable path.
3561 *
3562 * This does lock as it descends, and path->keep_locks should be set
3563 * to 1 by the caller.
3564 *
3565 * This honors path->lowest_level to prevent descent past a given level
3566 * of the tree.
3567 *
3568 * min_trans indicates the oldest transaction that you are interested
3569 * in walking through. Any nodes or leaves older than min_trans are
3570 * skipped over (without reading them).
3571 *
 * max_key, when not NULL, stops the walk once a node key at or past it
 * is reached.  Note that it is only consulted when cache_only is set.
 *
3572 * returns zero if something useful was found, < 0 on error and 1 if there
3573 * was nothing in the tree that matched the search criteria.
3574 */
3575int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3576 struct btrfs_key *max_key,
3577 struct btrfs_path *path, int cache_only,
3578 u64 min_trans)
3579{
3580 struct extent_buffer *cur;
3581 struct btrfs_key found_key;
3582 int slot;
3583 int sret;
3584 u32 nritems;
3585 int level;
3586 int ret = 1;
3587
 /* every pass starts over from a freshly locked root node */
3588again:
3589 cur = btrfs_lock_root_node(root);
3590 level = btrfs_header_level(cur);
3591 WARN_ON(path->nodes[level]);
3592 path->nodes[level] = cur;
3593 path->locks[level] = 1;
3594
 /* the root itself is older than min_trans: report nothing found */
3595 if (btrfs_header_generation(cur) < min_trans) {
3596 ret = 1;
3597 goto out;
3598 }
3599 while(1) {
3600 nritems = btrfs_header_nritems(cur);
3601 level = btrfs_header_level(cur);
3602 sret = bin_search(cur, min_key, level, &slot);
3603
3604 /* at the lowest level, we're done, setup the path and exit */
3605 if (level == path->lowest_level) {
3606 if (slot >= nritems)
3607 goto find_next_key;
3608 ret = 0;
3609 path->slots[level] = slot;
3610 btrfs_item_key_to_cpu(cur, &found_key, slot);
3611 goto out;
3612 }
 /*
  * NOTE(review): presumably a non-zero sret means min_key was not
  * matched exactly and slot is where it would be inserted, so the
  * pointer covering min_key is the previous one -- confirm against
  * bin_search().
  */
3613 if (sret && slot > 0)
3614 slot--;
3615 /*
3616 * check this node pointer against the cache_only and
3617 * min_trans parameters. If it isn't in cache or is too
3618 * old, skip to the next one.
3619 */
3620 while(slot < nritems) {
3621 u64 blockptr;
3622 u64 gen;
3623 struct extent_buffer *tmp;
3624 struct btrfs_disk_key disk_key;
3625
3626 blockptr = btrfs_node_blockptr(cur, slot);
3627 gen = btrfs_node_ptr_generation(cur, slot);
3628 if (gen < min_trans) {
3629 slot++;
3630 continue;
3631 }
 /* without cache_only, the first pointer that is new enough is taken */
3632 if (!cache_only)
3633 break;
3634
3635 if (max_key) {
3636 btrfs_node_key(cur, &disk_key, slot);
3637 if (comp_keys(&disk_key, max_key) >= 0) {
3638 ret = 1;
3639 goto out;
3640 }
3641 }
3642
 /* cache_only: the child must be found by btrfs_find_tree_block() and be uptodate for gen */
3643 tmp = btrfs_find_tree_block(root, blockptr,
3644 btrfs_level_size(root, level - 1));
3645
3646 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
3647 free_extent_buffer(tmp);
3648 break;
3649 }
3650 if (tmp)
3651 free_extent_buffer(tmp);
3652 slot++;
3653 }
3654find_next_key:
3655 /*
3656 * we didn't find a candidate key in this node, walk forward
3657 * and find another one
3658 */
3659 if (slot >= nritems) {
3660 path->slots[level] = slot;
 /*
  * on success btrfs_find_next_key() stores the next candidate in
  * min_key and the search restarts from the root with it; otherwise
  * ret is returned as it currently stands, sret is not propagated
  */
3661 sret = btrfs_find_next_key(root, path, min_key, level,
3662 cache_only, min_trans);
3663 if (sret == 0) {
3664 btrfs_release_path(root, path);
3665 goto again;
3666 } else {
3667 goto out;
3668 }
3669 }
3670 /* save our key for returning back */
3671 btrfs_node_key_to_cpu(cur, &found_key, slot);
3672 path->slots[level] = slot;
3673 if (level == path->lowest_level) {
3674 ret = 0;
3675 unlock_up(path, level, 1);
3676 goto out;
3677 }
 /* NOTE(review): the result of read_node_slot() is locked without a NULL check */
3678 cur = read_node_slot(root, cur, slot);
3679
3680 btrfs_tree_lock(cur);
3681 path->locks[level - 1] = 1;
3682 path->nodes[level - 1] = cur;
3683 unlock_up(path, level, 1);
3684 }
3685out:
 /* min_key is only overwritten with the found key when something was found */
3686 if (ret == 0)
3687 memcpy(min_key, &found_key, sizeof(found_key));
3688 return ret;
3689}
3690
3691/*
3692 * this is similar to btrfs_next_leaf, but does not try to preserve
3693 * and fixup the path. It looks for and returns the next key in the
3694 * tree based on the current path and the cache_only and min_trans
3695 * parameters.
3696 *
3697 * 0 is returned if another key is found, < 0 if there are any errors
3698 * and 1 is returned if there are no higher keys in the tree
3699 *
3700 * path->keep_locks should be set to 1 on the search made before
3701 * calling this function.
3702 */
3703int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3704 struct btrfs_key *key, int lowest_level,
3705 int cache_only, u64 min_trans)
3706{
3707 int level = lowest_level;
3708 int slot;
3709 struct extent_buffer *c;
3710
3711 while(level < BTRFS_MAX_LEVEL) {
3712 if (!path->nodes[level])
3713 return 1;
3714
3715 slot = path->slots[level] + 1;
3716 c = path->nodes[level];
3717next:
3718 if (slot >= btrfs_header_nritems(c)) {
3719 level++;
3720 if (level == BTRFS_MAX_LEVEL) {
3721 return 1;
3722 }
3723 continue;
3724 }
3725 if (level == 0)
3726 btrfs_item_key_to_cpu(c, key, slot);
3727 else {
3728 u64 blockptr = btrfs_node_blockptr(c, slot);
3729 u64 gen = btrfs_node_ptr_generation(c, slot);
3730
3731 if (cache_only) {
3732 struct extent_buffer *cur;
3733 cur = btrfs_find_tree_block(root, blockptr,
3734 btrfs_level_size(root, level - 1));
3735 if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
3736 slot++;
3737 if (cur)
3738 free_extent_buffer(cur);
3739 goto next;
3740 }
3741 free_extent_buffer(cur);
3742 }
3743 if (gen < min_trans) {
3744 slot++;
3745 goto next;
3746 }
3747 btrfs_node_key_to_cpu(c, key, slot);
3748 }
3749 return 0;
3750 }
3751 return 1;
3752}
3753
3754/*
3755 * search the tree again to find a leaf with greater keys
3756 * returns 0 if it found something or 1 if there are no greater leaves.
3757 * returns < 0 on io errors.
 *
 * The path is released and searched again for the last key of the
 * current leaf, so any locks held by the caller are dropped along the
 * way.  On a return of 0 the path either points at the next slot of the
 * re-searched leaf, or at slot 0 of the following leaf.
3758 */
3759int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3760{
3761 int slot;
3762 int level = 1;
3763 struct extent_buffer *c;
3764 struct extent_buffer *next = NULL;
3765 struct btrfs_key key;
3766 u32 nritems;
3767 int ret;
3768
3769 nritems = btrfs_header_nritems(path->nodes[0]);
3770 if (nritems == 0) {
3771 return 1;
3772 }
3773
 /* remember the last key of the leaf, it is used to find our place again */
3774 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
3775
 /* keep_locks is only set for the duration of this one search */
3776 btrfs_release_path(root, path);
3777 path->keep_locks = 1;
3778 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3779 path->keep_locks = 0;
3780
3781 if (ret < 0)
3782 return ret;
3783
3784 nritems = btrfs_header_nritems(path->nodes[0]);
3785 /*
3786 * by releasing the path above we dropped all our locks. A balance
3787 * could have added more items next to the key that used to be
3788 * at the very end of the block. So, check again here and
3789 * advance the path if there are now more items available.
3790 */
3791 if (nritems > 0 && path->slots[0] < nritems - 1) {
3792 path->slots[0]++;
3793 goto done;
3794 }
3795
 /* walk up until a node has a slot to the right of the one in the path */
3796 while(level < BTRFS_MAX_LEVEL) {
3797 if (!path->nodes[level])
3798 return 1;
3799
3800 slot = path->slots[level] + 1;
3801 c = path->nodes[level];
3802 if (slot >= btrfs_header_nritems(c)) {
3803 level++;
3804 if (level == BTRFS_MAX_LEVEL) {
3805 return 1;
3806 }
3807 continue;
3808 }
3809
 /*
  * NOTE(review): as written, next is only assigned right before the
  * break below, so it is still NULL whenever this test runs
  */
3810 if (next) {
3811 btrfs_tree_unlock(next);
3812 free_extent_buffer(next);
3813 }
3814
3815 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3816 path->reada)
3817 reada_for_search(root, path, level, slot, 0);
3818
 /* read the child to the right; it is locked unless skip_locking is set */
3819 next = read_node_slot(root, c, slot);
3820 if (!path->skip_locking) {
3821 WARN_ON(!btrfs_tree_locked(c));
3822 btrfs_tree_lock(next);
3823 }
3824 break;
3825 }
3826 path->slots[level] = slot;
 /*
  * walk back down to level 0 along slot 0 of each block, replacing the
  * old path entry at every level (unlocked if it was locked, and its
  * reference dropped) with the new block
  */
3827 while(1) {
3828 level--;
3829 c = path->nodes[level];
3830 if (path->locks[level])
3831 btrfs_tree_unlock(c);
3832 free_extent_buffer(c);
3833 path->nodes[level] = next;
3834 path->slots[level] = 0;
3835 if (!path->skip_locking)
3836 path->locks[level] = 1;
3837 if (!level)
3838 break;
3839 if (level == 1 && path->locks[1] && path->reada)
3840 reada_for_search(root, path, level, slot, 0);
3841 next = read_node_slot(root, next, 0);
3842 if (!path->skip_locking) {
3843 WARN_ON(!btrfs_tree_locked(path->nodes[level]));
3844 btrfs_tree_lock(next);
3845 }
3846 }
3847done:
3848 unlock_up(path, 0, 1);
3849 return 0;
3850}
3851
3852/*
3853 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
3854 * searching until it gets past min_objectid or finds an item of 'type'
3855 *
3856 * returns 0 if something is found, 1 if nothing was found and < 0 on error
3857 */
3858int btrfs_previous_item(struct btrfs_root *root,
3859 struct btrfs_path *path, u64 min_objectid,
3860 int type)
3861{
3862 struct btrfs_key found_key;
3863 struct extent_buffer *leaf;
3864 u32 nritems;
3865 int ret;
3866
3867 while(1) {
3868 if (path->slots[0] == 0) {
3869 ret = btrfs_prev_leaf(root, path);
3870 if (ret != 0)
3871 return ret;
3872 } else {
3873 path->slots[0]--;
3874 }
3875 leaf = path->nodes[0];
3876 nritems = btrfs_header_nritems(leaf);
3877 if (nritems == 0)
3878 return 1;
3879 if (path->slots[0] == nritems)
3880 path->slots[0]--;
3881
3882 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3883 if (found_key.type == type)
3884 return 0;
3885 if (found_key.objectid < min_objectid)
3886 break;
3887 if (found_key.objectid == min_objectid &&
3888 found_key.type < type)
3889 break;
3890 }
3891 return 1;
3892}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 000000000000..0f2a9b584fb6
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,2043 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_CTREE__
20#define __BTRFS_CTREE__
21
22#include <linux/version.h>
23#include <linux/mm.h>
24#include <linux/highmem.h>
25#include <linux/fs.h>
26#include <linux/completion.h>
27#include <linux/backing-dev.h>
28#include <linux/wait.h>
29#include <asm/kmap_types.h>
30#include "extent_io.h"
31#include "extent_map.h"
32#include "async-thread.h"
33
34struct btrfs_trans_handle;
35struct btrfs_transaction;
36extern struct kmem_cache *btrfs_trans_handle_cachep;
37extern struct kmem_cache *btrfs_transaction_cachep;
38extern struct kmem_cache *btrfs_bit_radix_cachep;
39extern struct kmem_cache *btrfs_path_cachep;
40struct btrfs_ordered_sum;
41
42#define BTRFS_MAGIC "_BFRfS_M"
43
44#define BTRFS_ACL_NOT_CACHED ((void *)-1)
45
46#ifdef CONFIG_LOCKDEP
47# define BTRFS_MAX_LEVEL 7
48#else
49# define BTRFS_MAX_LEVEL 8
50#endif
51
52/* holds pointers to all of the tree roots */
53#define BTRFS_ROOT_TREE_OBJECTID 1ULL
54
55/* stores information about which extents are in use, and reference counts */
56#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
57
58/*
59 * chunk tree stores translations from logical -> physical block numbering
60 * the super block points to the chunk tree
61 */
62#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
63
64/*
65 * stores information about which areas of a given device are in use.
66 * one per device. The tree of tree roots points to the device tree
67 */
68#define BTRFS_DEV_TREE_OBJECTID 4ULL
69
70/* one per subvolume, storing files and directories */
71#define BTRFS_FS_TREE_OBJECTID 5ULL
72
73/* directory objectid inside the root tree */
74#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
75
76/* orphan objectid for tracking unlinked/truncated files */
77#define BTRFS_ORPHAN_OBJECTID -5ULL
78
79/* does write ahead logging to speed up fsyncs */
80#define BTRFS_TREE_LOG_OBJECTID -6ULL
81#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
82
83/* for space balancing */
84#define BTRFS_TREE_RELOC_OBJECTID -8ULL
85#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
86
87/* dummy objectid represents multiple objectids */
88#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
89
90/*
91 * All files have objectids in this range.
92 */
93#define BTRFS_FIRST_FREE_OBJECTID 256ULL
94#define BTRFS_LAST_FREE_OBJECTID -256ULL
95#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
96
97
98/*
99 * the device items go into the chunk tree. The key is in the form
100 * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
101 */
102#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
103
104/*
105 * we can actually store much bigger names, but lets not confuse the rest
106 * of linux
107 */
108#define BTRFS_NAME_LEN 255
109
110/* 32 bytes in various csum fields */
111#define BTRFS_CSUM_SIZE 32
112/* four bytes for CRC32 */
113#define BTRFS_CRC32_SIZE 4
114#define BTRFS_EMPTY_DIR_SIZE 0
115
116#define BTRFS_FT_UNKNOWN 0
117#define BTRFS_FT_REG_FILE 1
118#define BTRFS_FT_DIR 2
119#define BTRFS_FT_CHRDEV 3
120#define BTRFS_FT_BLKDEV 4
121#define BTRFS_FT_FIFO 5
122#define BTRFS_FT_SOCK 6
123#define BTRFS_FT_SYMLINK 7
124#define BTRFS_FT_XATTR 8
125#define BTRFS_FT_MAX 9
126
127/*
128 * the key defines the order in the tree, and so it also defines (optimal)
129 * block layout. objectid corresponds to the inode number. The type
130 * tells us things about the object, and is a kind of stream selector.
131 * so for a given inode, keys of one type (BTRFS_INODE_ITEM_KEY) refer to
132 * the inode item, while keys of another type (BTRFS_EXTENT_DATA_KEY)
133 * describe file data.
134 *
135 * offset is the starting byte offset for this key in the stream.
136 *
137 * btrfs_disk_key is in disk byte order. struct btrfs_key is always
138 * in cpu native order. Otherwise they are identical and their sizes
139 * should be the same (ie both packed)
140 */
141struct btrfs_disk_key {
142 __le64 objectid;
143 u8 type;
144 __le64 offset;
145} __attribute__ ((__packed__));
146
/*
 * cpu byte order form of struct btrfs_disk_key: the same three fields in
 * the same packed layout, with objectid and offset held as plain u64.
 */
147struct btrfs_key {
148 u64 objectid;
149 u8 type;
150 u64 offset;
151} __attribute__ ((__packed__));
152
/*
 * thin wrapper around an extent_map_tree.  struct btrfs_fs_info embeds one
 * of these for its logical->physical extent mapping.
 */
153struct btrfs_mapping_tree {
154 struct extent_map_tree map_tree;
155};
156
157#define BTRFS_UUID_SIZE 16
/*
 * description of a single device.  Device items go into the chunk tree
 * under the key [ BTRFS_DEV_ITEMS_OBJECTID BTRFS_DEV_ITEM_KEY device_id ]
 * and struct btrfs_super_block embeds one as its dev_item member.
 *
 * The structure is packed; multi-byte fields are little endian.
 */
158struct btrfs_dev_item {
159 /* the internal btrfs device id */
160 __le64 devid;
161
162 /* size of the device */
163 __le64 total_bytes;
164
165 /* bytes used */
166 __le64 bytes_used;
167
168 /* optimal io alignment for this device */
169 __le32 io_align;
170
171 /* optimal io width for this device */
172 __le32 io_width;
173
174 /* minimal io size for this device */
175 __le32 sector_size;
176
177 /* type and info about this device */
178 __le64 type;
179
180 /* expected generation for this device */
181 __le64 generation;
182
183 /* grouping information for allocation decisions */
184 __le32 dev_group;
185
186 /* seek speed 0-100 where 100 is fastest */
187 u8 seek_speed;
188
189 /* bandwidth 0-100 where 100 is fastest */
190 u8 bandwidth;
191
192 /* btrfs generated uuid for this device */
193 u8 uuid[BTRFS_UUID_SIZE];
194
195 /* uuid of FS who owns this device */
196 u8 fsid[BTRFS_UUID_SIZE];
197} __attribute__ ((__packed__));
198
/*
 * one stripe of a chunk (see struct btrfs_chunk).  The device is named
 * both by its btrfs device id and by its uuid.
 * NOTE(review): offset is presumably the byte offset of the stripe on that
 * device -- confirm against the chunk allocation code.
 */
199struct btrfs_stripe {
200 __le64 devid;
201 __le64 offset;
202 u8 dev_uuid[BTRFS_UUID_SIZE];
203} __attribute__ ((__packed__));
204
/*
 * a chunk item: a fixed part followed by num_stripes stripe records.
 * The first stripe is embedded as the 'stripe' member and any further
 * stripes follow it directly, so the item size depends on num_stripes
 * (see btrfs_chunk_item_size).
 *
 * The structure is packed; multi-byte fields are little endian.
 */
205struct btrfs_chunk {
206 /* size of this chunk in bytes */
207 __le64 length;
208
209 /* objectid of the root referencing this chunk */
210 __le64 owner;
211
212 __le64 stripe_len;
213 __le64 type;
214
215 /* optimal io alignment for this chunk */
216 __le32 io_align;
217
218 /* optimal io width for this chunk */
219 __le32 io_width;
220
221 /* minimal io size for this chunk */
222 __le32 sector_size;
223
224 /* 2^16 stripes is quite a lot, a second limit is the size of a single
225 * item in the btree
226 */
227 __le16 num_stripes;
228
229 /* sub stripes only matter for raid10 */
230 __le16 sub_stripes;
231 struct btrfs_stripe stripe;
232 /* additional stripes go here */
233} __attribute__ ((__packed__));
234
235static inline unsigned long btrfs_chunk_item_size(int num_stripes)
236{
237 BUG_ON(num_stripes == 0);
238 return sizeof(struct btrfs_chunk) +
239 sizeof(struct btrfs_stripe) * (num_stripes - 1);
240}
241
242#define BTRFS_FSID_SIZE 16
243#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)
244
245/*
246 * every tree block (leaf or node) starts with this header.
247 *
 * struct btrfs_leaf and struct btrfs_node both begin with it.  The first
 * four fields (csum, fsid, bytenr, flags) are laid out exactly like the
 * start of struct btrfs_super_block.  Packed, little endian.
 */
248struct btrfs_header {
249 /* these first four must match the super block */
250 u8 csum[BTRFS_CSUM_SIZE];
251 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
252 __le64 bytenr; /* which block this node is supposed to live in */
253 __le64 flags;
254
255 /* allowed to be different from the super from here on down */
256 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
257 __le64 generation;
258 __le64 owner;
259 __le32 nritems;
260 u8 level;
261} __attribute__ ((__packed__));
262
/*
 * Block capacity helpers.  'r' is a pointer to a structure with nodesize
 * and leafsize members (struct btrfs_root); 'bs' is a block size in bytes.
 *
 * BTRFS_NODEPTRS_PER_BLOCK(r)   - key pointers that fit in one node
 * __BTRFS_LEAF_DATA_SIZE(bs)    - bytes left in a block after the header
 * BTRFS_LEAF_DATA_SIZE(r)       - the same, for r's leafsize
 * BTRFS_MAX_INLINE_DATA_SIZE(r) - leaf data size less one item header and
 *                                 one file extent item
 *
 * Every use of a macro argument is parenthesized so that expressions such
 * as 'p + 1' can be passed safely.
 */
#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
					sizeof(struct btrfs_header)) / \
				     sizeof(struct btrfs_key_ptr))
#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE((r)->leafsize))
#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
					sizeof(struct btrfs_item) - \
					sizeof(struct btrfs_file_extent_item))
271
272#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
273
274/*
275 * this is a very generous portion of the super block, giving us
276 * room to translate 14 chunks with 3 stripes each.
277 */
278#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
279#define BTRFS_LABEL_SIZE 256
280
281/*
282 * the super block basically lists the main trees of the FS
283 * it currently lacks any block count etc etc
284 */
285struct btrfs_super_block {
286 u8 csum[BTRFS_CSUM_SIZE];
287 /* the first 4 fields must match struct btrfs_header */
288 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
289 __le64 bytenr; /* this block number */
290 __le64 flags;
291
292 /* allowed to be different from the btrfs_header from here on down */
293 __le64 magic;
294 __le64 generation;
295 __le64 root;
296 __le64 chunk_root;
297 __le64 log_root;
298 __le64 total_bytes;
299 __le64 bytes_used;
300 __le64 root_dir_objectid;
301 __le64 num_devices;
302 __le32 sectorsize;
303 __le32 nodesize;
304 __le32 leafsize;
305 __le32 stripesize;
306 __le32 sys_chunk_array_size;
307 __le64 chunk_root_generation;
308 u8 root_level;
309 u8 chunk_root_level;
310 u8 log_root_level;
311 struct btrfs_dev_item dev_item;
312 char label[BTRFS_LABEL_SIZE];
313 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
314} __attribute__ ((__packed__));
315
316/*
317 * A leaf is full of items. offset and size tell us where to find
318 * the item in the leaf (relative to the start of the data area)
319 */
320struct btrfs_item {
321 struct btrfs_disk_key key;
322 __le32 offset;
323 __le32 size;
324} __attribute__ ((__packed__));
325
326/*
327 * leaves have an item area and a data area:
328 * [item0, item1....itemN] [free space] [dataN...data1, data0]
329 *
330 * The data is separate from the items to get the keys closer together
331 * during searches.
332 */
333struct btrfs_leaf {
334 struct btrfs_header header;
335 struct btrfs_item items[];
336} __attribute__ ((__packed__));
337
338/*
339 * all non-leaf blocks are nodes, they hold only keys and pointers to
340 * other blocks
341 */
342struct btrfs_key_ptr {
343 struct btrfs_disk_key key;
344 __le64 blockptr;
345 __le64 generation;
346} __attribute__ ((__packed__));
347
/*
 * layout of a non-leaf block: the common header followed by an array of
 * key pointers (flexible array member, no fixed count in the type).
 */
348struct btrfs_node {
349 struct btrfs_header header;
350 struct btrfs_key_ptr ptrs[];
351} __attribute__ ((__packed__));
352
353/*
354 * btrfs_paths remember the path taken from the root down to the leaf.
355 * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL - 1] will point
356 * to any other levels that are present.
357 *
358 * The slots array records the index of the item or block pointer
359 * used while walking the tree.
360 */
361struct btrfs_path {
362 struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
363 int slots[BTRFS_MAX_LEVEL];
364 /* if there is real range locking, this locks field will change */
 /* locks[level] is non-zero while the path holds the tree lock on nodes[level] */
365 int locks[BTRFS_MAX_LEVEL];
 /* when non-zero, tree walkers may start readahead (reada_for_search) */
366 int reada;
367 /* keep some upper locks as we walk down */
368 int keep_locks;
 /* when non-zero, walkers neither take tree locks nor record them in locks[] */
369 int skip_locking;
370 int lowest_level;
371};
372
373/*
374 * items in the extent btree are used to record the number of references
375 * held on an extent.  Only the count lives here; struct btrfs_extent_ref
 * carries the root/generation/objectid fields.
376 */
377struct btrfs_extent_item {
378 __le32 refs;
379} __attribute__ ((__packed__));
380
/*
 * one reference record for an extent: a root, a generation, an objectid
 * and a count (num_refs).  Packed, little endian.
 * NOTE(review): presumably root/objectid identify the tree and object that
 * hold the reference -- confirm against extent-tree.c.
 */
381struct btrfs_extent_ref {
382 __le64 root;
383 __le64 generation;
384 __le64 objectid;
385 __le32 num_refs;
386} __attribute__ ((__packed__));
387
388/* dev extents record free space on individual devices. The owner
389 * field points back to the chunk allocation mapping tree that allocated
390 * the extent. The chunk tree uuid field is a way to double check the owner
391 */
392struct btrfs_dev_extent {
393 __le64 chunk_tree;
394 __le64 chunk_objectid;
395 __le64 chunk_offset;
396 __le64 length;
397 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
398} __attribute__ ((__packed__));
399
400struct btrfs_inode_ref {
401 __le64 index;
402 __le16 name_len;
403 /* name goes here */
404} __attribute__ ((__packed__));
405
406struct btrfs_timespec {
407 __le64 sec;
408 __le32 nsec;
409} __attribute__ ((__packed__));
410
411typedef enum {
412 BTRFS_COMPRESS_NONE = 0,
413 BTRFS_COMPRESS_ZLIB = 1,
414 BTRFS_COMPRESS_LAST = 2,
415} btrfs_compression_type;
416
417/* we don't understand any encryption methods right now */
418typedef enum {
419 BTRFS_ENCRYPTION_NONE = 0,
420 BTRFS_ENCRYPTION_LAST = 1,
421} btrfs_encryption_type;
422
423struct btrfs_inode_item {
424 /* nfs style generation number */
425 __le64 generation;
426 /* transid that last touched this inode */
427 __le64 transid;
428 __le64 size;
429 __le64 nbytes;
430 __le64 block_group;
431 __le32 nlink;
432 __le32 uid;
433 __le32 gid;
434 __le32 mode;
435 __le64 rdev;
436 __le16 flags;
437 __le16 compat_flags;
438
439 struct btrfs_timespec atime;
440 struct btrfs_timespec ctime;
441 struct btrfs_timespec mtime;
442 struct btrfs_timespec otime;
443} __attribute__ ((__packed__));
444
445struct btrfs_dir_log_item {
446 __le64 end;
447} __attribute__ ((__packed__));
448
449struct btrfs_dir_item {
450 struct btrfs_disk_key location;
451 __le64 transid;
452 __le16 data_len;
453 __le16 name_len;
454 u8 type;
455} __attribute__ ((__packed__));
456
457struct btrfs_root_item {
458 struct btrfs_inode_item inode;
459 __le64 generation;
460 __le64 root_dirid;
461 __le64 bytenr;
462 __le64 byte_limit;
463 __le64 bytes_used;
464 __le64 last_snapshot;
465 __le32 flags;
466 __le32 refs;
467 struct btrfs_disk_key drop_progress;
468 u8 drop_level;
469 u8 level;
470} __attribute__ ((__packed__));
471
472/*
473 * this is used for both forward and backward root refs
474 */
475struct btrfs_root_ref {
476 __le64 dirid;
477 __le64 sequence;
478 __le16 name_len;
479} __attribute__ ((__packed__));
480
481#define BTRFS_FILE_EXTENT_INLINE 0
482#define BTRFS_FILE_EXTENT_REG 1
483#define BTRFS_FILE_EXTENT_PREALLOC 2
484
485struct btrfs_file_extent_item {
486 /*
487 * transaction id that created this extent
488 */
489 __le64 generation;
490 /*
491 * max number of bytes to hold this extent in ram
492 * when we split a compressed extent we can't know how big
493 * each of the resulting pieces will be. So, this is
494 * an upper limit on the size of the extent in ram instead of
495 * an exact limit.
496 */
497 __le64 ram_bytes;
498
499 /*
500 * 32 bits for the various ways we might encode the data,
501 * including compression and encryption. If any of these
502 * are set to something a given disk format doesn't understand
503 * it is treated like an incompat flag for reading and writing,
504 * but not for stat.
505 */
506 u8 compression;
507 u8 encryption;
508 __le16 other_encoding; /* spare for later use */
509
510 /* are we inline data or a real extent? */
511 u8 type;
512
513 /*
514 * disk space consumed by the extent, checksum blocks are included
515 * in these numbers
516 */
517 __le64 disk_bytenr;
518 __le64 disk_num_bytes;
519 /*
520 * the logical offset in file blocks (no csums)
521 * this extent record is for. This allows a file extent to point
522 * into the middle of an existing extent on disk, sharing it
523 * between two snapshots (useful if some bytes in the middle of the
524 * extent have changed)
525 */
526 __le64 offset;
527 /*
528 * the logical number of file blocks (no csums included). This
529 * always reflects the size uncompressed and without encoding.
530 */
531 __le64 num_bytes;
532
533} __attribute__ ((__packed__));
534
535struct btrfs_csum_item {
536 u8 csum;
537} __attribute__ ((__packed__));
538
539/* different types of block groups (and chunks) */
540#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
541#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
542#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
543#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
544#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
545#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
546#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
547
548struct btrfs_block_group_item {
549 __le64 used;
550 __le64 chunk_objectid;
551 __le64 flags;
552} __attribute__ ((__packed__));
553
554struct btrfs_space_info {
555 u64 flags;
556 u64 total_bytes;
557 u64 bytes_used;
558 u64 bytes_pinned;
559 u64 bytes_reserved;
560 u64 bytes_readonly;
561 int full;
562 int force_alloc;
563 struct list_head list;
564
565 /* for block groups in our same type */
566 struct list_head block_groups;
567 spinlock_t lock;
568 struct rw_semaphore groups_sem;
569};
570
571struct btrfs_free_space {
572 struct rb_node bytes_index;
573 struct rb_node offset_index;
574 u64 offset;
575 u64 bytes;
576};
577
578struct btrfs_block_group_cache {
579 struct btrfs_key key;
580 struct btrfs_block_group_item item;
581 spinlock_t lock;
582 struct mutex alloc_mutex;
583 u64 pinned;
584 u64 reserved;
585 u64 flags;
586 int cached;
587 int ro;
588 int dirty;
589
590 struct btrfs_space_info *space_info;
591
592 /* free space cache stuff */
593 struct rb_root free_space_bytes;
594 struct rb_root free_space_offset;
595
596 /* block group cache stuff */
597 struct rb_node cache_node;
598
599 /* for block groups in the same raid type */
600 struct list_head list;
601};
602
603struct btrfs_leaf_ref_tree {
604 struct rb_root root;
605 struct list_head list;
606 spinlock_t lock;
607};
608
609struct btrfs_device;
610struct btrfs_fs_devices;
611struct btrfs_fs_info {
612 u8 fsid[BTRFS_FSID_SIZE];
613 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
614 struct btrfs_root *extent_root;
615 struct btrfs_root *tree_root;
616 struct btrfs_root *chunk_root;
617 struct btrfs_root *dev_root;
618 struct btrfs_root *fs_root;
619
620 /* the log root tree is a directory of all the other log roots */
621 struct btrfs_root *log_root_tree;
622 struct radix_tree_root fs_roots_radix;
623
624 /* block group cache stuff */
625 spinlock_t block_group_cache_lock;
626 struct rb_root block_group_cache_tree;
627
628 struct extent_io_tree pinned_extents;
629 struct extent_io_tree pending_del;
630 struct extent_io_tree extent_ins;
631
632 /* logical->physical extent mapping */
633 struct btrfs_mapping_tree mapping_tree;
634
635 u64 generation;
636 u64 last_trans_committed;
637 u64 last_trans_new_blockgroup;
638 u64 open_ioctl_trans;
639 unsigned long mount_opt;
640 u64 max_extent;
641 u64 max_inline;
642 u64 alloc_start;
643 struct btrfs_transaction *running_transaction;
644 wait_queue_head_t transaction_throttle;
645 wait_queue_head_t transaction_wait;
646
647 wait_queue_head_t async_submit_wait;
648 wait_queue_head_t tree_log_wait;
649
650 struct btrfs_super_block super_copy;
651 struct btrfs_super_block super_for_commit;
652 struct block_device *__bdev;
653 struct super_block *sb;
654 struct inode *btree_inode;
655 struct backing_dev_info bdi;
656 spinlock_t hash_lock;
657 struct mutex trans_mutex;
658 struct mutex tree_log_mutex;
659 struct mutex transaction_kthread_mutex;
660 struct mutex cleaner_mutex;
661 struct mutex extent_ins_mutex;
662 struct mutex pinned_mutex;
663 struct mutex chunk_mutex;
664 struct mutex drop_mutex;
665 struct mutex volume_mutex;
666 struct mutex tree_reloc_mutex;
667 struct list_head trans_list;
668 struct list_head hashers;
669 struct list_head dead_roots;
670
671 atomic_t nr_async_submits;
672 atomic_t async_submit_draining;
673 atomic_t nr_async_bios;
674 atomic_t async_delalloc_pages;
675 atomic_t tree_log_writers;
676 atomic_t tree_log_commit;
677 unsigned long tree_log_batch;
678 u64 tree_log_transid;
679
680 /*
681 * this is used by the balancing code to wait for all the pending
682 * ordered extents
683 */
684 spinlock_t ordered_extent_lock;
685 struct list_head ordered_extents;
686 struct list_head delalloc_inodes;
687
688 /*
689 * there is a pool of worker threads for checksumming during writes
690 * and a pool for checksumming after reads. This is because readers
691 * can run with FS locks held, and the writers may be waiting for
692 * those locks. We don't want ordering in the pending list to cause
693 * deadlocks, and so the two are serviced separately.
694 *
695 * A third pool does submit_bio to avoid deadlocking with the other
696 * two
697 */
698 struct btrfs_workers workers;
699 struct btrfs_workers delalloc_workers;
700 struct btrfs_workers endio_workers;
701 struct btrfs_workers endio_write_workers;
702 struct btrfs_workers submit_workers;
703 /*
704 * fixup workers take dirty pages that didn't properly go through
705 * the cow mechanism and make them safe to write. It happens
706 * for the sys_munmap function call path
707 */
708 struct btrfs_workers fixup_workers;
709 struct task_struct *transaction_kthread;
710 struct task_struct *cleaner_kthread;
711 int thread_pool_size;
712
713 /* tree relocation related fields */
714 struct list_head dead_reloc_roots;
715 struct btrfs_leaf_ref_tree reloc_ref_tree;
716 struct btrfs_leaf_ref_tree shared_ref_tree;
717
718 struct kobject super_kobj;
719 struct completion kobj_unregister;
720 int do_barriers;
721 int closing;
722 int log_root_recovering;
723 atomic_t throttles;
724 atomic_t throttle_gen;
725
726 u64 total_pinned;
727 struct list_head dirty_cowonly_roots;
728
729 struct btrfs_fs_devices *fs_devices;
730 struct list_head space_info;
731 spinlock_t delalloc_lock;
732 spinlock_t new_trans_lock;
733 u64 delalloc_bytes;
734 u64 last_alloc;
735 u64 last_data_alloc;
736
737 spinlock_t ref_cache_lock;
738 u64 total_ref_cache_size;
739
740 u64 avail_data_alloc_bits;
741 u64 avail_metadata_alloc_bits;
742 u64 avail_system_alloc_bits;
743 u64 data_alloc_profile;
744 u64 metadata_alloc_profile;
745 u64 system_alloc_profile;
746
747 void *bdev_holder;
748};
749
750/*
751 * in ram representation of the tree. extent_root is used for all allocations
752 * and for the extent tree extent_root root.
753 */
754struct btrfs_dirty_root;
755struct btrfs_root {
756 struct extent_buffer *node;
757
758 /* the node lock is held while changing the node pointer */
759 spinlock_t node_lock;
760
761 struct extent_buffer *commit_root;
762 struct btrfs_leaf_ref_tree *ref_tree;
763 struct btrfs_leaf_ref_tree ref_tree_struct;
764 struct btrfs_dirty_root *dirty_root;
765 struct btrfs_root *log_root;
766 struct btrfs_root *reloc_root;
767
768 struct btrfs_root_item root_item;
769 struct btrfs_key root_key;
770 struct btrfs_fs_info *fs_info;
771 struct extent_io_tree dirty_log_pages;
772
773 struct kobject root_kobj;
774 struct completion kobj_unregister;
775 struct mutex objectid_mutex;
776 struct mutex log_mutex;
777
778 u64 objectid;
779 u64 last_trans;
780
781 /* data allocations are done in sectorsize units */
782 u32 sectorsize;
783
784 /* node allocations are done in nodesize units */
785 u32 nodesize;
786
787 /* leaf allocations are done in leafsize units */
788 u32 leafsize;
789
790 u32 stripesize;
791
792 u32 type;
793 u64 highest_inode;
794 u64 last_inode_alloc;
795 int ref_cows;
796 int track_dirty;
797 u64 defrag_trans_start;
798 struct btrfs_key defrag_progress;
799 struct btrfs_key defrag_max;
800 int defrag_running;
801 int defrag_level;
802 char *name;
803 int in_sysfs;
804
805 /* the dirty list is only used by non-reference counted roots */
806 struct list_head dirty_list;
807
808 spinlock_t list_lock;
809 struct list_head dead_list;
810 struct list_head orphan_list;
811
812 /*
813 * right now this just gets used so that a root has its own devid
814 * for stat. It may be used for more later
815 */
816 struct super_block anon_super;
817};
818
819/*
820
821 * inode items have the data typically returned from stat and store other
822 * info about object characteristics. There is one for every file and dir in
823 * the FS
824 */
825#define BTRFS_INODE_ITEM_KEY 1
826#define BTRFS_INODE_REF_KEY 12
827#define BTRFS_XATTR_ITEM_KEY 24
828#define BTRFS_ORPHAN_ITEM_KEY 48
829/* reserve 2-15 close to the inode for later flexibility */
830
831/*
832 * dir items are the name -> inode pointers in a directory. There is one
833 * for every name in a directory.
834 */
835#define BTRFS_DIR_LOG_ITEM_KEY 60
836#define BTRFS_DIR_LOG_INDEX_KEY 72
837#define BTRFS_DIR_ITEM_KEY 84
838#define BTRFS_DIR_INDEX_KEY 96
839/*
840 * extent data is for file data
841 */
842#define BTRFS_EXTENT_DATA_KEY 108
843/*
844 * csum items have the checksums for data in the extents
845 */
846#define BTRFS_CSUM_ITEM_KEY 120
847
848
849/* reserve 21-31 for other file/dir stuff */
850
851/*
852 * root items point to tree roots. They are typically in the root
853 * tree used by the super block to find all the other trees
854 */
855#define BTRFS_ROOT_ITEM_KEY 132
856
857/*
858 * root backrefs tie subvols and snapshots to the directory entries that
859 * reference them
860 */
861#define BTRFS_ROOT_BACKREF_KEY 144
862
863/*
864 * root refs make a fast index for listing all of the snapshots and
865 * subvolumes referenced by a given root. They point directly to the
866 * directory item in the root that references the subvol
867 */
868#define BTRFS_ROOT_REF_KEY 156
869
870/*
871 * extent items are in the extent map tree. These record which blocks
872 * are used, and how many references there are to each block
873 */
874#define BTRFS_EXTENT_ITEM_KEY 168
875#define BTRFS_EXTENT_REF_KEY 180
876
877/*
878 * block groups give us hints into the extent allocation trees. Which
879 * blocks are free etc etc
880 */
881#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
882
883#define BTRFS_DEV_EXTENT_KEY 204
884#define BTRFS_DEV_ITEM_KEY 216
885#define BTRFS_CHUNK_ITEM_KEY 228
886
887/*
888 * string items are for debugging. They just store a short string of
889 * data in the FS
890 */
891#define BTRFS_STRING_ITEM_KEY 253
892
893#define BTRFS_MOUNT_NODATASUM (1 << 0)
894#define BTRFS_MOUNT_NODATACOW (1 << 1)
895#define BTRFS_MOUNT_NOBARRIER (1 << 2)
896#define BTRFS_MOUNT_SSD (1 << 3)
897#define BTRFS_MOUNT_DEGRADED (1 << 4)
898#define BTRFS_MOUNT_COMPRESS (1 << 5)
899
900#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
901#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
902#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
903 BTRFS_MOUNT_##opt)
904/*
905 * Inode flags
906 */
907#define BTRFS_INODE_NODATASUM (1 << 0)
908#define BTRFS_INODE_NODATACOW (1 << 1)
909#define BTRFS_INODE_READONLY (1 << 2)
910#define BTRFS_INODE_NOCOMPRESS (1 << 3)
911#define BTRFS_INODE_PREALLOC (1 << 4)
912#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \
913 ~BTRFS_INODE_##flag)
914#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \
915 BTRFS_INODE_##flag)
916#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \
917 BTRFS_INODE_##flag)
918/* some macros to generate set/get funcs for the struct fields. This
919 * assumes there is a lefoo_to_cpu for every type, so lets make a simple
920 * one for u8:
921 */
922#define le8_to_cpu(v) (v)
923#define cpu_to_le8(v) (v)
924#define __le8 u8
925
/*
 * read a single struct member out of an extent buffer.
 *
 * eb:     the extent buffer
 * ptr:    position of the enclosing 'type' within eb (used as an integer
 *         offset, it is cast to unsigned long and never dereferenced)
 * type:   struct type, member: field name inside it
 * result: memory that receives sizeof(member) bytes
 *
 * Expands to a read_extent_buffer call at ptr + offsetof(type, member).
 */
926#define read_eb_member(eb, ptr, type, member, result) ( \
927 read_extent_buffer(eb, (char *)(result), \
928 ((unsigned long)(ptr)) + \
929 offsetof(type, member), \
930 sizeof(((type *)0)->member)))
931
/*
 * write a single struct member into an extent buffer.
 *
 * eb:     the extent buffer
 * ptr:    position of the enclosing 'type' within eb (used as an integer
 *         offset, it is cast to unsigned long and never dereferenced)
 * type:   struct type, member: field name inside it
 * result: memory holding the sizeof(member) bytes to store
 *
 * Expands to a write_extent_buffer call at ptr + offsetof(type, member).
 */
932#define write_eb_member(eb, ptr, type, member, result) ( \
933 write_extent_buffer(eb, (char *)(result), \
934 ((unsigned long)(ptr)) + \
935 offsetof(type, member), \
936 sizeof(((type *)0)->member)))
937
/*
 * accessor generator for a struct that lives inside an extent buffer.
 * For each (name, type, member, bits) it declares
 *     u<bits> btrfs_<name>(eb, type *s)
 *     void    btrfs_set_<name>(eb, type *s, u<bits> val)
 * This default only emits prototypes; a file that defines
 * BTRFS_SETGET_FUNCS before including this header supplies its own
 * expansion instead.
 * NOTE(review): the bodies are presumably provided by struct-funcs.c --
 * confirm.
 */
938#ifndef BTRFS_SETGET_FUNCS
939#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
940u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
941void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
942#endif
943
/*
 * accessor generator for a struct located at the very start of an extent
 * buffer.  The generated btrfs_<name>(eb) / btrfs_set_<name>(eb, val) pair
 * maps eb->first_page with kmap_atomic (KM_USER0 slot), treats the mapped
 * address as a 'type *', converts the member from/to little endian, and
 * unmaps the page again before returning.
 */
944#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
945static inline u##bits btrfs_##name(struct extent_buffer *eb) \
946{ \
947 type *p = kmap_atomic(eb->first_page, KM_USER0); \
948 u##bits res = le##bits##_to_cpu(p->member); \
949 kunmap_atomic(p, KM_USER0); \
950 return res; \
951} \
952static inline void btrfs_set_##name(struct extent_buffer *eb, \
953 u##bits val) \
954{ \
955 type *p = kmap_atomic(eb->first_page, KM_USER0); \
956 p->member = cpu_to_le##bits(val); \
957 kunmap_atomic(p, KM_USER0); \
958}
959
/*
 * accessor generator for a struct that is directly addressable in memory
 * (no extent buffer involved).  The generated btrfs_<name>(s) returns the
 * member converted from little endian to cpu order and
 * btrfs_set_<name>(s, val) stores val converted to little endian.
 */
960#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
961static inline u##bits btrfs_##name(type *s) \
962{ \
963 return le##bits##_to_cpu(s->member); \
964} \
965static inline void btrfs_set_##name(type *s, u##bits val) \
966{ \
967 s->member = cpu_to_le##bits(val); \
968}
969
/*
 * struct btrfs_dev_item accessors for items inside an extent buffer:
 * btrfs_device_<field>(eb, item) and btrfs_set_device_<field>(eb, item, val)
 */
970BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
971BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
972BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
973BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
974BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
975BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
976BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
977BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
978BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
979BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
980BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
981
/*
 * the same struct btrfs_dev_item fields, for a device item that is directly
 * addressable in memory: btrfs_stack_device_<field>(item) and
 * btrfs_set_stack_device_<field>(item, val)
 */
982BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
983BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
984 total_bytes, 64);
985BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
986 bytes_used, 64);
987BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
988 io_align, 32);
989BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
990 io_width, 32);
991BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
992 sector_size, 32);
993BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
994BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
995 dev_group, 32);
996BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
997 seek_speed, 8);
998BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
999 bandwidth, 8);
1000BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
1001 generation, 64);
1002
1003static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
1004{
1005 return (char *)d + offsetof(struct btrfs_dev_item, uuid);
1006}
1007
1008static inline char *btrfs_device_fsid(struct btrfs_dev_item *d)
1009{
1010 return (char *)d + offsetof(struct btrfs_dev_item, fsid);
1011}
1012
/*
 * struct btrfs_chunk and struct btrfs_stripe accessors for items inside an
 * extent buffer: btrfs_chunk_<field>(eb, chunk), btrfs_stripe_<field>(eb,
 * stripe) and the matching btrfs_set_* functions
 */
1013BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
1014BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
1015BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
1016BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
1017BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
1018BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
1019BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
1020BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
1021BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
1022BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
1023BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
1024
1025static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
1026{
1027 return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
1028}
1029
/*
 * the same struct btrfs_chunk / struct btrfs_stripe fields, for a chunk
 * that is directly addressable in memory: btrfs_stack_chunk_<field>(chunk),
 * btrfs_stack_stripe_<field>(stripe) and the matching btrfs_set_* functions
 */
1030BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
1031BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
1032BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
1033 stripe_len, 64);
1034BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
1035 io_align, 32);
1036BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
1037 io_width, 32);
1038BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
1039 sector_size, 32);
1040BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
1041BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
1042 num_stripes, 16);
1043BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
1044 sub_stripes, 16);
1045BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
1046BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
1047
1048static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
1049 int nr)
1050{
1051 unsigned long offset = (unsigned long)c;
1052 offset += offsetof(struct btrfs_chunk, stripe);
1053 offset += nr * sizeof(struct btrfs_stripe);
1054 return (struct btrfs_stripe *)offset;
1055}
1056
/*
 * Return the address of the dev_uuid member of stripe @nr in chunk @c.
 */
static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
{
	struct btrfs_stripe *stripe = btrfs_stripe_nr(c, nr);

	return btrfs_stripe_dev_uuid(stripe);
}
1061
1062static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
1063 struct btrfs_chunk *c, int nr)
1064{
1065 return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
1066}
1067
1068static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
1069 struct btrfs_chunk *c, int nr,
1070 u64 val)
1071{
1072 btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
1073}
1074
1075static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
1076 struct btrfs_chunk *c, int nr)
1077{
1078 return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
1079}
1080
1081static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
1082 struct btrfs_chunk *c, int nr,
1083 u64 val)
1084{
1085 btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
1086}
1087
1088/* struct btrfs_block_group_item */
1089BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
1090 used, 64);
1091BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
1092 used, 64);
1093BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
1094 struct btrfs_block_group_item, chunk_objectid, 64);
1095
1096BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
1097 struct btrfs_block_group_item, chunk_objectid, 64);
1098BTRFS_SETGET_FUNCS(disk_block_group_flags,
1099 struct btrfs_block_group_item, flags, 64);
1100BTRFS_SETGET_STACK_FUNCS(block_group_flags,
1101 struct btrfs_block_group_item, flags, 64);
1102
1103/* struct btrfs_inode_ref */
1104BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
1105BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
1106
1107/* struct btrfs_inode_item */
1108BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
1109BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
1110BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
1111BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
1112BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
1113BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
1114BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
1115BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
1116BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
1117BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
1118BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 16);
1119BTRFS_SETGET_FUNCS(inode_compat_flags, struct btrfs_inode_item,
1120 compat_flags, 16);
1121
1122static inline struct btrfs_timespec *
1123btrfs_inode_atime(struct btrfs_inode_item *inode_item)
1124{
1125 unsigned long ptr = (unsigned long)inode_item;
1126 ptr += offsetof(struct btrfs_inode_item, atime);
1127 return (struct btrfs_timespec *)ptr;
1128}
1129
1130static inline struct btrfs_timespec *
1131btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
1132{
1133 unsigned long ptr = (unsigned long)inode_item;
1134 ptr += offsetof(struct btrfs_inode_item, mtime);
1135 return (struct btrfs_timespec *)ptr;
1136}
1137
1138static inline struct btrfs_timespec *
1139btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
1140{
1141 unsigned long ptr = (unsigned long)inode_item;
1142 ptr += offsetof(struct btrfs_inode_item, ctime);
1143 return (struct btrfs_timespec *)ptr;
1144}
1145
1146static inline struct btrfs_timespec *
1147btrfs_inode_otime(struct btrfs_inode_item *inode_item)
1148{
1149 unsigned long ptr = (unsigned long)inode_item;
1150 ptr += offsetof(struct btrfs_inode_item, otime);
1151 return (struct btrfs_timespec *)ptr;
1152}
1153
1154BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
1155BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
1156
1157/* struct btrfs_dev_extent */
1158BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
1159 chunk_tree, 64);
1160BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
1161 chunk_objectid, 64);
1162BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
1163 chunk_offset, 64);
1164BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
1165
1166static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
1167{
1168 unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
1169 return (u8 *)((unsigned long)dev + ptr);
1170}
1171
1172/* struct btrfs_extent_ref */
1173BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
1174BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
1175BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
1176BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
1177
1178BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
1179BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
1180 generation, 64);
1181BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
1182 objectid, 64);
1183BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
1184 num_refs, 32);
1185
1186/* struct btrfs_extent_item */
1187BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
1188BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
1189 refs, 32);
1190
1191/* struct btrfs_node */
1192BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
1193BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
1194
1195static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
1196{
1197 unsigned long ptr;
1198 ptr = offsetof(struct btrfs_node, ptrs) +
1199 sizeof(struct btrfs_key_ptr) * nr;
1200 return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
1201}
1202
1203static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
1204 int nr, u64 val)
1205{
1206 unsigned long ptr;
1207 ptr = offsetof(struct btrfs_node, ptrs) +
1208 sizeof(struct btrfs_key_ptr) * nr;
1209 btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
1210}
1211
1212static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
1213{
1214 unsigned long ptr;
1215 ptr = offsetof(struct btrfs_node, ptrs) +
1216 sizeof(struct btrfs_key_ptr) * nr;
1217 return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
1218}
1219
1220static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
1221 int nr, u64 val)
1222{
1223 unsigned long ptr;
1224 ptr = offsetof(struct btrfs_node, ptrs) +
1225 sizeof(struct btrfs_key_ptr) * nr;
1226 btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
1227}
1228
1229static inline unsigned long btrfs_node_key_ptr_offset(int nr)
1230{
1231 return offsetof(struct btrfs_node, ptrs) +
1232 sizeof(struct btrfs_key_ptr) * nr;
1233}
1234
/*
 * Declared here, defined out of line.  Callers in this header pass an
 * uninitialized @disk_key and read it afterwards, so this is expected to
 * fill @disk_key with the key of slot @nr in node @eb.
 */
void btrfs_node_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key,
		    int nr);
1237
1238static inline void btrfs_set_node_key(struct extent_buffer *eb,
1239 struct btrfs_disk_key *disk_key, int nr)
1240{
1241 unsigned long ptr;
1242 ptr = btrfs_node_key_ptr_offset(nr);
1243 write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
1244 struct btrfs_key_ptr, key, disk_key);
1245}
1246
1247/* struct btrfs_item */
1248BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
1249BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
1250
1251static inline unsigned long btrfs_item_nr_offset(int nr)
1252{
1253 return offsetof(struct btrfs_leaf, items) +
1254 sizeof(struct btrfs_item) * nr;
1255}
1256
/*
 * Return item header @nr as a struct btrfs_item pointer.  The value is the
 * offset from btrfs_item_nr_offset() cast to a pointer; @eb is not used.
 */
static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
					       int nr)
{
	unsigned long item_off = btrfs_item_nr_offset(nr);

	return (struct btrfs_item *)item_off;
}
1262
1263static inline u32 btrfs_item_end(struct extent_buffer *eb,
1264 struct btrfs_item *item)
1265{
1266 return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
1267}
1268
1269static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
1270{
1271 return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
1272}
1273
1274static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
1275{
1276 return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
1277}
1278
1279static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
1280{
1281 return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
1282}
1283
1284static inline void btrfs_item_key(struct extent_buffer *eb,
1285 struct btrfs_disk_key *disk_key, int nr)
1286{
1287 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1288 read_eb_member(eb, item, struct btrfs_item, key, disk_key);
1289}
1290
1291static inline void btrfs_set_item_key(struct extent_buffer *eb,
1292 struct btrfs_disk_key *disk_key, int nr)
1293{
1294 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1295 write_eb_member(eb, item, struct btrfs_item, key, disk_key);
1296}
1297
1298BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
1299
1300/*
1301 * struct btrfs_root_ref
1302 */
1303BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
1304BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
1305BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
1306
1307/* struct btrfs_dir_item */
1308BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
1309BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
1310BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
1311BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
1312
1313static inline void btrfs_dir_item_key(struct extent_buffer *eb,
1314 struct btrfs_dir_item *item,
1315 struct btrfs_disk_key *key)
1316{
1317 read_eb_member(eb, item, struct btrfs_dir_item, location, key);
1318}
1319
1320static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
1321 struct btrfs_dir_item *item,
1322 struct btrfs_disk_key *key)
1323{
1324 write_eb_member(eb, item, struct btrfs_dir_item, location, key);
1325}
1326
1327/* struct btrfs_disk_key */
1328BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
1329 objectid, 64);
1330BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
1331BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
1332
1333static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
1334 struct btrfs_disk_key *disk)
1335{
1336 cpu->offset = le64_to_cpu(disk->offset);
1337 cpu->type = disk->type;
1338 cpu->objectid = le64_to_cpu(disk->objectid);
1339}
1340
1341static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
1342 struct btrfs_key *cpu)
1343{
1344 disk->offset = cpu_to_le64(cpu->offset);
1345 disk->type = cpu->type;
1346 disk->objectid = cpu_to_le64(cpu->objectid);
1347}
1348
1349static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
1350 struct btrfs_key *key, int nr)
1351{
1352 struct btrfs_disk_key disk_key;
1353 btrfs_node_key(eb, &disk_key, nr);
1354 btrfs_disk_key_to_cpu(key, &disk_key);
1355}
1356
1357static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
1358 struct btrfs_key *key, int nr)
1359{
1360 struct btrfs_disk_key disk_key;
1361 btrfs_item_key(eb, &disk_key, nr);
1362 btrfs_disk_key_to_cpu(key, &disk_key);
1363}
1364
1365static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
1366 struct btrfs_dir_item *item,
1367 struct btrfs_key *key)
1368{
1369 struct btrfs_disk_key disk_key;
1370 btrfs_dir_item_key(eb, item, &disk_key);
1371 btrfs_disk_key_to_cpu(key, &disk_key);
1372}
1373
1374
1375static inline u8 btrfs_key_type(struct btrfs_key *key)
1376{
1377 return key->type;
1378}
1379
1380static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
1381{
1382 key->type = val;
1383}
1384
1385/* struct btrfs_header */
1386BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
1387BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
1388 generation, 64);
1389BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
1390BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
1391BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
1392BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
1393
1394static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
1395{
1396 return (btrfs_header_flags(eb) & flag) == flag;
1397}
1398
1399static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
1400{
1401 u64 flags = btrfs_header_flags(eb);
1402 btrfs_set_header_flags(eb, flags | flag);
1403 return (flags & flag) == flag;
1404}
1405
1406static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
1407{
1408 u64 flags = btrfs_header_flags(eb);
1409 btrfs_set_header_flags(eb, flags & ~flag);
1410 return (flags & flag) == flag;
1411}
1412
1413static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
1414{
1415 unsigned long ptr = offsetof(struct btrfs_header, fsid);
1416 return (u8 *)ptr;
1417}
1418
1419static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
1420{
1421 unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
1422 return (u8 *)ptr;
1423}
1424
1425static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
1426{
1427 unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
1428 return (u8 *)ptr;
1429}
1430
1431static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
1432{
1433 unsigned long ptr = offsetof(struct btrfs_header, csum);
1434 return (u8 *)ptr;
1435}
1436
/*
 * Always returns NULL; @eb is ignored.
 */
static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
{
	struct btrfs_node *node = NULL;

	return node;
}
1441
/*
 * Always returns NULL; @eb is ignored.
 */
static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
{
	struct btrfs_leaf *leaf = NULL;

	return leaf;
}
1446
/*
 * Always returns NULL; @eb is ignored.
 */
static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
{
	struct btrfs_header *header = NULL;

	return header;
}
1451
/*
 * Return 1 when the header level of @eb is 0, otherwise 0.
 */
static inline int btrfs_is_leaf(struct extent_buffer *eb)
{
	return !btrfs_header_level(eb);
}
1456
1457/* struct btrfs_root_item */
1458BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
1459 generation, 64);
1460BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
1461BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
1462BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
1463
1464BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
1465 generation, 64);
1466BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
1467BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
1468BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
1469BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
1470BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 32);
1471BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
1472BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
1473BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
1474 last_snapshot, 64);
1475
1476/* struct btrfs_super_block */
1477BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
1478BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
1479BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
1480 generation, 64);
1481BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
1482BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
1483 struct btrfs_super_block, sys_chunk_array_size, 32);
1484BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
1485 struct btrfs_super_block, chunk_root_generation, 64);
1486BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
1487 root_level, 8);
1488BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
1489 chunk_root, 64);
1490BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
1491 chunk_root_level, 8);
1492BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
1493 log_root, 64);
1494BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
1495 log_root_level, 8);
1496BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
1497 total_bytes, 64);
1498BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
1499 bytes_used, 64);
1500BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
1501 sectorsize, 32);
1502BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
1503 nodesize, 32);
1504BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
1505 leafsize, 32);
1506BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
1507 stripesize, 32);
1508BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
1509 root_dir_objectid, 64);
1510BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
1511 num_devices, 64);
1512
1513static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
1514{
1515 return offsetof(struct btrfs_leaf, items);
1516}
1517
1518/* struct btrfs_file_extent_item */
1519BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
1520
1521static inline unsigned long btrfs_file_extent_inline_start(struct
1522 btrfs_file_extent_item *e)
1523{
1524 unsigned long offset = (unsigned long)e;
1525 offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
1526 return offset;
1527}
1528
1529static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
1530{
1531 return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
1532}
1533
1534BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
1535 disk_bytenr, 64);
1536BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
1537 generation, 64);
1538BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
1539 disk_num_bytes, 64);
1540BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
1541 offset, 64);
1542BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
1543 num_bytes, 64);
1544BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
1545 ram_bytes, 64);
1546BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
1547 compression, 8);
1548BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
1549 encryption, 8);
1550BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
1551 other_encoding, 16);
1552
1553/* this returns the number of file bytes represented by the inline item.
1554 * If an item is compressed, this is the uncompressed size
1555 */
1556static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
1557 struct btrfs_file_extent_item *e)
1558{
1559 return btrfs_file_extent_ram_bytes(eb, e);
1560}
1561
1562/*
1563 * this returns the number of bytes used by the item on disk, minus the
1564 * size of any extent headers. If a file is compressed on disk, this is
1565 * the compressed size
1566 */
1567static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
1568 struct btrfs_item *e)
1569{
1570 unsigned long offset;
1571 offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
1572 return btrfs_item_size(eb, e) - offset;
1573}
1574
1575static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
1576{
1577 return sb->s_fs_info;
1578}
1579
1580static inline int btrfs_set_root_name(struct btrfs_root *root,
1581 const char *name, int len)
1582{
1583 /* if we already have a name just free it */
1584 if (root->name)
1585 kfree(root->name);
1586
1587 root->name = kmalloc(len+1, GFP_KERNEL);
1588 if (!root->name)
1589 return -ENOMEM;
1590
1591 memcpy(root->name, name, len);
1592 root->name[len] ='\0';
1593
1594 return 0;
1595}
1596
1597static inline u32 btrfs_level_size(struct btrfs_root *root, int level) {
1598 if (level == 0)
1599 return root->leafsize;
1600 return root->nodesize;
1601}
1602
/*
 * btrfs_item_ptr_offset(): btrfs_leaf_data(leaf) plus the offset member of
 * item header 'slot', as an unsigned long.
 */
#define btrfs_item_ptr_offset(leaf, slot)				\
	((unsigned long)(btrfs_leaf_data(leaf) +			\
			 btrfs_item_offset_nr(leaf, slot)))

/* helper to cast the same value into a pointer to 'type' */
#define btrfs_item_ptr(leaf, slot, type)				\
	((type *)(btrfs_leaf_data(leaf) +				\
		  btrfs_item_offset_nr(leaf, slot)))
1611
1612static inline struct dentry *fdentry(struct file *file)
1613{
1614 return file->f_path.dentry;
1615}
1616
1617/* extent-tree.c */
1618int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1619int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1620 struct btrfs_root *root, u64 bytenr,
1621 u64 num_bytes, u32 *refs);
1622int btrfs_update_pinned_extents(struct btrfs_root *root,
1623 u64 bytenr, u64 num, int pin);
1624int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1625 struct btrfs_root *root, struct extent_buffer *leaf);
1626int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1627 struct btrfs_root *root, u64 bytenr);
1628int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1629 struct btrfs_root *root);
1630int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1631struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
1632 btrfs_fs_info *info,
1633 u64 bytenr);
1634struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
1635 struct btrfs_block_group_cache
1636 *hint, u64 search_start,
1637 int data, int owner);
1638struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1639 struct btrfs_root *root,
1640 u32 blocksize, u64 parent,
1641 u64 root_objectid,
1642 u64 ref_generation,
1643 int level,
1644 u64 hint,
1645 u64 empty_size);
1646struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1647 struct btrfs_root *root,
1648 u64 bytenr, u32 blocksize);
1649int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1650 struct btrfs_root *root,
1651 u64 num_bytes, u64 parent, u64 min_bytes,
1652 u64 root_objectid, u64 ref_generation,
1653 u64 owner, u64 empty_size, u64 hint_byte,
1654 u64 search_end, struct btrfs_key *ins, u64 data);
1655int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
1656 struct btrfs_root *root, u64 parent,
1657 u64 root_objectid, u64 ref_generation,
1658 u64 owner, struct btrfs_key *ins);
1659int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
1660 struct btrfs_root *root, u64 parent,
1661 u64 root_objectid, u64 ref_generation,
1662 u64 owner, struct btrfs_key *ins);
1663int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
1664 struct btrfs_root *root,
1665 u64 num_bytes, u64 min_alloc_size,
1666 u64 empty_size, u64 hint_byte,
1667 u64 search_end, struct btrfs_key *ins,
1668 u64 data);
1669int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1670 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1671 u32 *nr_extents);
1672int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1673 struct extent_buffer *buf, u32 nr_extents);
1674int btrfs_update_ref(struct btrfs_trans_handle *trans,
1675 struct btrfs_root *root, struct extent_buffer *orig_buf,
1676 struct extent_buffer *buf, int start_slot, int nr);
1677int btrfs_free_extent(struct btrfs_trans_handle *trans,
1678 struct btrfs_root *root,
1679 u64 bytenr, u64 num_bytes, u64 parent,
1680 u64 root_objectid, u64 ref_generation,
1681 u64 owner_objectid, int pin);
1682int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
1683int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1684 struct btrfs_root *root,
1685 struct extent_io_tree *unpin);
1686int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1687 struct btrfs_root *root,
1688 u64 bytenr, u64 num_bytes, u64 parent,
1689 u64 root_objectid, u64 ref_generation,
1690 u64 owner_objectid);
1691int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1692 struct btrfs_root *root, u64 bytenr,
1693 u64 orig_parent, u64 parent,
1694 u64 root_objectid, u64 ref_generation,
1695 u64 owner_objectid);
1696int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1697 struct btrfs_root *root);
1698int btrfs_free_block_groups(struct btrfs_fs_info *info);
1699int btrfs_read_block_groups(struct btrfs_root *root);
1700int btrfs_make_block_group(struct btrfs_trans_handle *trans,
1701 struct btrfs_root *root, u64 bytes_used,
1702 u64 type, u64 chunk_objectid, u64 chunk_offset,
1703 u64 size);
1704int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1705 struct btrfs_root *root, u64 group_start);
1706int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
1707int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
1708 struct btrfs_root *root);
1709int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
1710int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
1711 struct btrfs_root *root,
1712 struct extent_buffer *buf, u64 orig_start);
1713int btrfs_add_dead_reloc_root(struct btrfs_root *root);
1714int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
1715u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
1716/* ctree.c */
1717int btrfs_previous_item(struct btrfs_root *root,
1718 struct btrfs_path *path, u64 min_objectid,
1719 int type);
1720int btrfs_merge_path(struct btrfs_trans_handle *trans,
1721 struct btrfs_root *root,
1722 struct btrfs_key *node_keys,
1723 u64 *nodes, int lowest_level);
1724int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1725 struct btrfs_root *root, struct btrfs_path *path,
1726 struct btrfs_key *new_key);
1727struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
1728struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
1729int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
1730 struct btrfs_key *key, int lowest_level,
1731 int cache_only, u64 min_trans);
1732int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
1733 struct btrfs_key *max_key,
1734 struct btrfs_path *path, int cache_only,
1735 u64 min_trans);
1736int btrfs_cow_block(struct btrfs_trans_handle *trans,
1737 struct btrfs_root *root, struct extent_buffer *buf,
1738 struct extent_buffer *parent, int parent_slot,
1739 struct extent_buffer **cow_ret, u64 prealloc_dest);
1740int btrfs_copy_root(struct btrfs_trans_handle *trans,
1741 struct btrfs_root *root,
1742 struct extent_buffer *buf,
1743 struct extent_buffer **cow_ret, u64 new_root_objectid);
1744int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
1745 *root, struct btrfs_path *path, u32 data_size);
1746int btrfs_truncate_item(struct btrfs_trans_handle *trans,
1747 struct btrfs_root *root,
1748 struct btrfs_path *path,
1749 u32 new_size, int from_end);
1750int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1751 *root, struct btrfs_key *key, struct btrfs_path *p, int
1752 ins_len, int cow);
1753int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1754 struct btrfs_root *root, struct extent_buffer *parent,
1755 int start_slot, int cache_only, u64 *last_ret,
1756 struct btrfs_key *progress);
1757void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1758struct btrfs_path *btrfs_alloc_path(void);
1759void btrfs_free_path(struct btrfs_path *p);
1760void btrfs_init_path(struct btrfs_path *p);
1761int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1762 struct btrfs_path *path, int slot, int nr);
1763int btrfs_del_leaf(struct btrfs_trans_handle *trans,
1764 struct btrfs_root *root,
1765 struct btrfs_path *path, u64 bytenr);
1766static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
1767 struct btrfs_root *root,
1768 struct btrfs_path *path)
1769{
1770 return btrfs_del_items(trans, root, path, path->slots[0], 1);
1771}
1772
1773int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
1774 *root, struct btrfs_key *key, void *data, u32 data_size);
1775int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
1776 struct btrfs_root *root,
1777 struct btrfs_path *path,
1778 struct btrfs_key *cpu_key, u32 *data_size,
1779 int nr);
1780int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
1781 struct btrfs_root *root,
1782 struct btrfs_path *path,
1783 struct btrfs_key *cpu_key, u32 *data_size, int nr);
1784
1785static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
1786 struct btrfs_root *root,
1787 struct btrfs_path *path,
1788 struct btrfs_key *key,
1789 u32 data_size)
1790{
1791 return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
1792}
1793
/* leaf traversal, free space and snapshot / subtree dropping */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_leaf_free_space(struct btrfs_root *root,
			  struct extent_buffer *leaf);
int btrfs_drop_snapshot(struct btrfs_trans_handle *trans,
			struct btrfs_root *root);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       struct extent_buffer *node,
		       struct extent_buffer *parent);
/* root-tree.c */
1804int btrfs_find_root_ref(struct btrfs_root *tree_root,
1805 struct btrfs_path *path,
1806 u64 root_id, u64 ref_id);
1807int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
1808 struct btrfs_root *tree_root,
1809 u64 root_id, u8 type, u64 ref_id,
1810 u64 dirid, u64 sequence,
1811 const char *name, int name_len);
1812int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1813 struct btrfs_key *key);
1814int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
1815 *root, struct btrfs_key *key, struct btrfs_root_item
1816 *item);
1817int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
1818 *root, struct btrfs_key *key, struct btrfs_root_item
1819 *item);
1820int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
1821 btrfs_root_item *item, struct btrfs_key *key);
1822int btrfs_search_root(struct btrfs_root *root, u64 search_start,
1823 u64 *found_objectid);
1824int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
1825 struct btrfs_root *latest_root);
1826/* dir-item.c */
1827int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
1828 *root, const char *name, int name_len, u64 dir,
1829 struct btrfs_key *location, u8 type, u64 index);
1830struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
1831 struct btrfs_root *root,
1832 struct btrfs_path *path, u64 dir,
1833 const char *name, int name_len,
1834 int mod);
1835struct btrfs_dir_item *
1836btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
1837 struct btrfs_root *root,
1838 struct btrfs_path *path, u64 dir,
1839 u64 objectid, const char *name, int name_len,
1840 int mod);
1841struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
1842 struct btrfs_path *path,
1843 const char *name, int name_len);
1844int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
1845 struct btrfs_root *root,
1846 struct btrfs_path *path,
1847 struct btrfs_dir_item *di);
1848int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
1849 struct btrfs_root *root, const char *name,
1850 u16 name_len, const void *data, u16 data_len,
1851 u64 dir);
1852struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
1853 struct btrfs_root *root,
1854 struct btrfs_path *path, u64 dir,
1855 const char *name, u16 name_len,
1856 int mod);
1857
1858/* orphan.c */
1859int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
1860 struct btrfs_root *root, u64 offset);
1861int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
1862 struct btrfs_root *root, u64 offset);
1863
1864/* inode-map.c */
1865int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
1866 struct btrfs_root *fs_root,
1867 u64 dirid, u64 *objectid);
1868int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
1869
1870/* inode-item.c */
1871int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
1872 struct btrfs_root *root,
1873 const char *name, int name_len,
1874 u64 inode_objectid, u64 ref_objectid, u64 index);
1875int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
1876 struct btrfs_root *root,
1877 const char *name, int name_len,
1878 u64 inode_objectid, u64 ref_objectid, u64 *index);
1879int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
1880 struct btrfs_root *root,
1881 struct btrfs_path *path, u64 objectid);
1882int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
1883 *root, struct btrfs_path *path,
1884 struct btrfs_key *location, int mod);
1885
1886/* file-item.c */
1887int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
1888 struct bio *bio);
1889int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
1890 struct btrfs_root *root,
1891 u64 objectid, u64 pos,
1892 u64 disk_offset, u64 disk_num_bytes,
1893 u64 num_bytes, u64 offset, u64 ram_bytes,
1894 u8 compression, u8 encryption, u16 other_encoding);
1895int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
1896 struct btrfs_root *root,
1897 struct btrfs_path *path, u64 objectid,
1898 u64 bytenr, int mod);
1899int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
1900 struct btrfs_root *root, struct inode *inode,
1901 struct btrfs_ordered_sum *sums);
1902int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
1903 struct bio *bio);
1904int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
1905 u64 start, unsigned long len);
1906struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
1907 struct btrfs_root *root,
1908 struct btrfs_path *path,
1909 u64 objectid, u64 offset,
1910 int cow);
1911int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
1912 struct btrfs_root *root, struct btrfs_path *path,
1913 u64 isize);
1914/* inode.c */
1915
1916/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
1917#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
1918#define ClearPageChecked ClearPageFsMisc
1919#define SetPageChecked SetPageFsMisc
1920#define PageChecked PageFsMisc
1921#endif
1922
1923struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
1924int btrfs_set_inode_index(struct inode *dir, u64 *index);
1925int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
1926 struct btrfs_root *root,
1927 struct inode *dir, struct inode *inode,
1928 const char *name, int name_len);
1929int btrfs_add_link(struct btrfs_trans_handle *trans,
1930 struct inode *parent_inode, struct inode *inode,
1931 const char *name, int name_len, int add_backref, u64 index);
1932int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
1933 struct btrfs_root *root,
1934 struct inode *inode, u64 new_size,
1935 u32 min_type);
1936
1937int btrfs_start_delalloc_inodes(struct btrfs_root *root);
1938int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
1939int btrfs_writepages(struct address_space *mapping,
1940 struct writeback_control *wbc);
1941int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
1942 struct btrfs_trans_handle *trans, u64 new_dirid,
1943 struct btrfs_block_group_cache *block_group);
1944
1945int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1946 size_t size, struct bio *bio, unsigned long bio_flags);
1947
1948unsigned long btrfs_force_ra(struct address_space *mapping,
1949 struct file_ra_state *ra, struct file *file,
1950 pgoff_t offset, pgoff_t last_index);
1951int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
1952 int for_del);
1953int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
1954int btrfs_readpage(struct file *file, struct page *page);
1955void btrfs_delete_inode(struct inode *inode);
1956void btrfs_put_inode(struct inode *inode);
1957void btrfs_read_locked_inode(struct inode *inode);
1958int btrfs_write_inode(struct inode *inode, int wait);
1959void btrfs_dirty_inode(struct inode *inode);
1960struct inode *btrfs_alloc_inode(struct super_block *sb);
1961void btrfs_destroy_inode(struct inode *inode);
1962int btrfs_init_cachep(void);
1963void btrfs_destroy_cachep(void);
1964long btrfs_ioctl_trans_end(struct file *file);
1965struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
1966 struct btrfs_root *root, int wait);
1967struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
1968 struct btrfs_root *root);
1969struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
1970 struct btrfs_root *root, int *is_new);
1971int btrfs_commit_write(struct file *file, struct page *page,
1972 unsigned from, unsigned to);
1973struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
1974 size_t page_offset, u64 start, u64 end,
1975 int create);
1976int btrfs_update_inode(struct btrfs_trans_handle *trans,
1977 struct btrfs_root *root,
1978 struct inode *inode);
1979int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
1980int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
1981void btrfs_orphan_cleanup(struct btrfs_root *root);
1982int btrfs_cont_expand(struct inode *inode, loff_t size);
1983
1984/* ioctl.c */
1985long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
1986
1987/* file.c */
1988int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
1989int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
1990 int skip_pinned);
1991int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
1992extern struct file_operations btrfs_file_operations;
1993int btrfs_drop_extents(struct btrfs_trans_handle *trans,
1994 struct btrfs_root *root, struct inode *inode,
1995 u64 start, u64 end, u64 inline_limit, u64 *hint_block);
1996int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
1997 struct btrfs_root *root,
1998 struct inode *inode, u64 start, u64 end);
1999int btrfs_release_file(struct inode *inode, struct file *file);
2000
2001/* tree-defrag.c */
2002int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
2003 struct btrfs_root *root, int cache_only);
2004
2005/* sysfs.c */
2006int btrfs_init_sysfs(void);
2007void btrfs_exit_sysfs(void);
2008int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
2009int btrfs_sysfs_add_root(struct btrfs_root *root);
2010void btrfs_sysfs_del_root(struct btrfs_root *root);
2011void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
2012
2013/* xattr.c */
2014ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2015
2016/* super.c */
2017u64 btrfs_parse_size(char *str);
2018int btrfs_parse_options(struct btrfs_root *root, char *options);
2019int btrfs_sync_fs(struct super_block *sb, int wait);
2020
2021/* acl.c */
2022int btrfs_check_acl(struct inode *inode, int mask);
2023int btrfs_init_acl(struct inode *inode, struct inode *dir);
2024int btrfs_acl_chmod(struct inode *inode);
2025
2026/* free-space-cache.c */
2027int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
2028 u64 bytenr, u64 size);
2029int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
2030 u64 offset, u64 bytes);
2031int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2032 u64 bytenr, u64 size);
2033int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
2034 u64 offset, u64 bytes);
2035void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
2036 *block_group);
2037struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
2038 *block_group, u64 offset,
2039 u64 bytes);
2040void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2041 u64 bytes);
2042u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
2043#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
new file mode 100644
index 000000000000..5040b71f1900
--- /dev/null
+++ b/fs/btrfs/dir-item.c
@@ -0,0 +1,386 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "hash.h"
22#include "transaction.h"
23
24/*
25 * insert a name into a directory, doing overflow properly if there is a hash
26 * collision. data_size indicates how big the item inserted should be. On
27 * success a struct btrfs_dir_item pointer is returned, otherwise it is
28 * an ERR_PTR.
29 *
30 * The name is not copied into the dir item, you have to do that yourself.
31 */
32static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
33 *trans,
34 struct btrfs_root *root,
35 struct btrfs_path *path,
36 struct btrfs_key *cpu_key,
37 u32 data_size,
38 const char *name,
39 int name_len)
40{
41 int ret;
42 char *ptr;
43 struct btrfs_item *item;
44 struct extent_buffer *leaf;
45
46 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
47 if (ret == -EEXIST) {
48 struct btrfs_dir_item *di;
49 di = btrfs_match_dir_item_name(root, path, name, name_len);
50 if (di)
51 return ERR_PTR(-EEXIST);
52 ret = btrfs_extend_item(trans, root, path, data_size);
53 WARN_ON(ret > 0);
54 }
55 if (ret < 0)
56 return ERR_PTR(ret);
57 WARN_ON(ret > 0);
58 leaf = path->nodes[0];
59 item = btrfs_item_nr(leaf, path->slots[0]);
60 ptr = btrfs_item_ptr(leaf, path->slots[0], char);
61 BUG_ON(data_size > btrfs_item_size(leaf, item));
62 ptr += btrfs_item_size(leaf, item) - data_size;
63 return (struct btrfs_dir_item *)ptr;
64}
65
66/*
67 * xattrs work a lot like directories, this inserts an xattr item
68 * into the tree
69 */
70int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
71 struct btrfs_root *root, const char *name,
72 u16 name_len, const void *data, u16 data_len,
73 u64 dir)
74{
75 int ret = 0;
76 struct btrfs_path *path;
77 struct btrfs_dir_item *dir_item;
78 unsigned long name_ptr, data_ptr;
79 struct btrfs_key key, location;
80 struct btrfs_disk_key disk_key;
81 struct extent_buffer *leaf;
82 u32 data_size;
83
84 key.objectid = dir;
85 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
86 key.offset = btrfs_name_hash(name, name_len);
87 path = btrfs_alloc_path();
88 if (!path)
89 return -ENOMEM;
90 if (name_len + data_len + sizeof(struct btrfs_dir_item) >
91 BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
92 return -ENOSPC;
93
94 data_size = sizeof(*dir_item) + name_len + data_len;
95 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
96 name, name_len);
97 /*
98 * FIXME: at some point we should handle xattr's that are larger than
99 * what we can fit in our leaf. We set location to NULL b/c we arent
100 * pointing at anything else, that will change if we store the xattr
101 * data in a separate inode.
102 */
103 BUG_ON(IS_ERR(dir_item));
104 memset(&location, 0, sizeof(location));
105
106 leaf = path->nodes[0];
107 btrfs_cpu_key_to_disk(&disk_key, &location);
108 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
109 btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
110 btrfs_set_dir_name_len(leaf, dir_item, name_len);
111 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
112 btrfs_set_dir_data_len(leaf, dir_item, data_len);
113 name_ptr = (unsigned long)(dir_item + 1);
114 data_ptr = (unsigned long)((char *)name_ptr + name_len);
115
116 write_extent_buffer(leaf, name, name_ptr, name_len);
117 write_extent_buffer(leaf, data, data_ptr, data_len);
118 btrfs_mark_buffer_dirty(path->nodes[0]);
119
120 btrfs_free_path(path);
121 return ret;
122}
123
124/*
125 * insert a directory item in the tree, doing all the magic for
126 * both indexes. 'dir' indicates which objectid to insert it into,
127 * 'location' is the key to stuff into the directory item, 'type' is the
128 * type of the inode we're pointing to, and 'index' is the sequence number
129 * to use for the second index (if one is created).
130 */
131int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
132 *root, const char *name, int name_len, u64 dir,
133 struct btrfs_key *location, u8 type, u64 index)
134{
135 int ret = 0;
136 int ret2 = 0;
137 struct btrfs_path *path;
138 struct btrfs_dir_item *dir_item;
139 struct extent_buffer *leaf;
140 unsigned long name_ptr;
141 struct btrfs_key key;
142 struct btrfs_disk_key disk_key;
143 u32 data_size;
144
145 key.objectid = dir;
146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
147 key.offset = btrfs_name_hash(name, name_len);
148 path = btrfs_alloc_path();
149 data_size = sizeof(*dir_item) + name_len;
150 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
151 name, name_len);
152 if (IS_ERR(dir_item)) {
153 ret = PTR_ERR(dir_item);
154 if (ret == -EEXIST)
155 goto second_insert;
156 goto out;
157 }
158
159 leaf = path->nodes[0];
160 btrfs_cpu_key_to_disk(&disk_key, location);
161 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
162 btrfs_set_dir_type(leaf, dir_item, type);
163 btrfs_set_dir_data_len(leaf, dir_item, 0);
164 btrfs_set_dir_name_len(leaf, dir_item, name_len);
165 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
166 name_ptr = (unsigned long)(dir_item + 1);
167
168 write_extent_buffer(leaf, name, name_ptr, name_len);
169 btrfs_mark_buffer_dirty(leaf);
170
171second_insert:
172 /* FIXME, use some real flag for selecting the extra index */
173 if (root == root->fs_info->tree_root) {
174 ret = 0;
175 goto out;
176 }
177 btrfs_release_path(root, path);
178
179 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
180 key.offset = index;
181 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
182 name, name_len);
183 if (IS_ERR(dir_item)) {
184 ret2 = PTR_ERR(dir_item);
185 goto out;
186 }
187 leaf = path->nodes[0];
188 btrfs_cpu_key_to_disk(&disk_key, location);
189 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
190 btrfs_set_dir_type(leaf, dir_item, type);
191 btrfs_set_dir_data_len(leaf, dir_item, 0);
192 btrfs_set_dir_name_len(leaf, dir_item, name_len);
193 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
194 name_ptr = (unsigned long)(dir_item + 1);
195 write_extent_buffer(leaf, name, name_ptr, name_len);
196 btrfs_mark_buffer_dirty(leaf);
197out:
198 btrfs_free_path(path);
199 if (ret)
200 return ret;
201 if (ret2)
202 return ret2;
203 return 0;
204}
205
206/*
207 * lookup a directory item based on name. 'dir' is the objectid
208 * we're searching in, and 'mod' tells us if you plan on deleting the
209 * item (use mod < 0) or changing the options (use mod > 0)
210 */
211struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
212 struct btrfs_root *root,
213 struct btrfs_path *path, u64 dir,
214 const char *name, int name_len,
215 int mod)
216{
217 int ret;
218 struct btrfs_key key;
219 int ins_len = mod < 0 ? -1 : 0;
220 int cow = mod != 0;
221 struct btrfs_key found_key;
222 struct extent_buffer *leaf;
223
224 key.objectid = dir;
225 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
226
227 key.offset = btrfs_name_hash(name, name_len);
228
229 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
230 if (ret < 0)
231 return ERR_PTR(ret);
232 if (ret > 0) {
233 if (path->slots[0] == 0)
234 return NULL;
235 path->slots[0]--;
236 }
237
238 leaf = path->nodes[0];
239 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
240
241 if (found_key.objectid != dir ||
242 btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
243 found_key.offset != key.offset)
244 return NULL;
245
246 return btrfs_match_dir_item_name(root, path, name, name_len);
247}
248
249/*
250 * lookup a directory item based on index. 'dir' is the objectid
251 * we're searching in, and 'mod' tells us if you plan on deleting the
252 * item (use mod < 0) or changing the options (use mod > 0)
253 *
254 * The name is used to make sure the index really points to the name you were
255 * looking for.
256 */
257struct btrfs_dir_item *
258btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
259 struct btrfs_root *root,
260 struct btrfs_path *path, u64 dir,
261 u64 objectid, const char *name, int name_len,
262 int mod)
263{
264 int ret;
265 struct btrfs_key key;
266 int ins_len = mod < 0 ? -1 : 0;
267 int cow = mod != 0;
268
269 key.objectid = dir;
270 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
271 key.offset = objectid;
272
273 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
274 if (ret < 0)
275 return ERR_PTR(ret);
276 if (ret > 0)
277 return ERR_PTR(-ENOENT);
278 return btrfs_match_dir_item_name(root, path, name, name_len);
279}
280
281struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
282 struct btrfs_root *root,
283 struct btrfs_path *path, u64 dir,
284 const char *name, u16 name_len,
285 int mod)
286{
287 int ret;
288 struct btrfs_key key;
289 int ins_len = mod < 0 ? -1 : 0;
290 int cow = mod != 0;
291 struct btrfs_key found_key;
292 struct extent_buffer *leaf;
293
294 key.objectid = dir;
295 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
296 key.offset = btrfs_name_hash(name, name_len);
297 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
298 if (ret < 0)
299 return ERR_PTR(ret);
300 if (ret > 0) {
301 if (path->slots[0] == 0)
302 return NULL;
303 path->slots[0]--;
304 }
305
306 leaf = path->nodes[0];
307 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
308
309 if (found_key.objectid != dir ||
310 btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
311 found_key.offset != key.offset)
312 return NULL;
313
314 return btrfs_match_dir_item_name(root, path, name, name_len);
315}
316
317/*
318 * helper function to look at the directory item pointed to by 'path'
319 * this walks through all the entries in a dir item and finds one
320 * for a specific name.
321 */
322struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
323 struct btrfs_path *path,
324 const char *name, int name_len)
325{
326 struct btrfs_dir_item *dir_item;
327 unsigned long name_ptr;
328 u32 total_len;
329 u32 cur = 0;
330 u32 this_len;
331 struct extent_buffer *leaf;
332
333 leaf = path->nodes[0];
334 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
335 total_len = btrfs_item_size_nr(leaf, path->slots[0]);
336 while(cur < total_len) {
337 this_len = sizeof(*dir_item) +
338 btrfs_dir_name_len(leaf, dir_item) +
339 btrfs_dir_data_len(leaf, dir_item);
340 name_ptr = (unsigned long)(dir_item + 1);
341
342 if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
343 memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
344 return dir_item;
345
346 cur += this_len;
347 dir_item = (struct btrfs_dir_item *)((char *)dir_item +
348 this_len);
349 }
350 return NULL;
351}
352
353/*
354 * given a pointer into a directory item, delete it. This
355 * handles items that have more than one entry in them.
356 */
357int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
358 struct btrfs_root *root,
359 struct btrfs_path *path,
360 struct btrfs_dir_item *di)
361{
362
363 struct extent_buffer *leaf;
364 u32 sub_item_len;
365 u32 item_len;
366 int ret = 0;
367
368 leaf = path->nodes[0];
369 sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
370 btrfs_dir_data_len(leaf, di);
371 item_len = btrfs_item_size_nr(leaf, path->slots[0]);
372 if (sub_item_len == item_len) {
373 ret = btrfs_del_item(trans, root, path);
374 } else {
375 /* MARKER */
376 unsigned long ptr = (unsigned long)di;
377 unsigned long start;
378
379 start = btrfs_item_ptr_offset(leaf, path->slots[0]);
380 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
381 item_len - (ptr + sub_item_len - start));
382 ret = btrfs_truncate_item(trans, root, path,
383 item_len - sub_item_len, 1);
384 }
385 return 0;
386}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 000000000000..c8dcb47b6d7d
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,2234 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/fs.h>
21#include <linux/blkdev.h>
22#include <linux/scatterlist.h>
23#include <linux/swap.h>
24#include <linux/radix-tree.h>
25#include <linux/writeback.h>
26#include <linux/buffer_head.h> // for block_sync_page
27#include <linux/workqueue.h>
28#include <linux/kthread.h>
29# include <linux/freezer.h>
30#include "crc32c.h"
31#include "ctree.h"
32#include "disk-io.h"
33#include "transaction.h"
34#include "btrfs_inode.h"
35#include "volumes.h"
36#include "print-tree.h"
37#include "async-thread.h"
38#include "locking.h"
39#include "ref-cache.h"
40#include "tree-log.h"
41
#if 0
/*
 * compiled out: report (and return 1) when the block number stored in the
 * header differs from the block number of the buffer itself.
 */
static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
{
	if (extent_buffer_blocknr(buf) == btrfs_header_blocknr(buf))
		return 0;

	printk(KERN_CRIT "buf blocknr(buf) is %llu, header is %llu\n",
	       (unsigned long long)extent_buffer_blocknr(buf),
	       (unsigned long long)btrfs_header_blocknr(buf));
	return 1;
}
#endif
54
55static struct extent_io_ops btree_extent_io_ops;
56static void end_workqueue_fn(struct btrfs_work *work);
57
58/*
59 * end_io_wq structs are used to do processing in task context when an IO is
60 * complete. This is used during reads to verify checksums, and it is used
61 * by writes to insert metadata for new file extents after IO is complete.
62 */
63struct end_io_wq {
64 struct bio *bio;
65 bio_end_io_t *end_io;
66 void *private;
67 struct btrfs_fs_info *info;
68 int error;
69 int metadata;
70 struct list_head list;
71 struct btrfs_work work;
72};
73
74/*
75 * async submit bios are used to offload expensive checksumming
76 * onto the worker threads. They checksum file and metadata bios
77 * just before they are sent down the IO stack.
78 */
79struct async_submit_bio {
80 struct inode *inode;
81 struct bio *bio;
82 struct list_head list;
83 extent_submit_bio_hook_t *submit_bio_start;
84 extent_submit_bio_hook_t *submit_bio_done;
85 int rw;
86 int mirror_num;
87 unsigned long bio_flags;
88 struct btrfs_work work;
89};
90
91/*
92 * extents on the btree inode are pretty simple, there's one extent
93 * that covers the entire device
94 */
95struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
96 size_t page_offset, u64 start, u64 len,
97 int create)
98{
99 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
100 struct extent_map *em;
101 int ret;
102
103 spin_lock(&em_tree->lock);
104 em = lookup_extent_mapping(em_tree, start, len);
105 if (em) {
106 em->bdev =
107 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
108 spin_unlock(&em_tree->lock);
109 goto out;
110 }
111 spin_unlock(&em_tree->lock);
112
113 em = alloc_extent_map(GFP_NOFS);
114 if (!em) {
115 em = ERR_PTR(-ENOMEM);
116 goto out;
117 }
118 em->start = 0;
119 em->len = (u64)-1;
120 em->block_len = (u64)-1;
121 em->block_start = 0;
122 em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
123
124 spin_lock(&em_tree->lock);
125 ret = add_extent_mapping(em_tree, em);
126 if (ret == -EEXIST) {
127 u64 failed_start = em->start;
128 u64 failed_len = em->len;
129
130 printk("failed to insert %Lu %Lu -> %Lu into tree\n",
131 em->start, em->len, em->block_start);
132 free_extent_map(em);
133 em = lookup_extent_mapping(em_tree, start, len);
134 if (em) {
135 printk("after failing, found %Lu %Lu %Lu\n",
136 em->start, em->len, em->block_start);
137 ret = 0;
138 } else {
139 em = lookup_extent_mapping(em_tree, failed_start,
140 failed_len);
141 if (em) {
142 printk("double failure lookup gives us "
143 "%Lu %Lu -> %Lu\n", em->start,
144 em->len, em->block_start);
145 free_extent_map(em);
146 }
147 ret = -EIO;
148 }
149 } else if (ret) {
150 free_extent_map(em);
151 em = NULL;
152 }
153 spin_unlock(&em_tree->lock);
154
155 if (ret)
156 em = ERR_PTR(ret);
157out:
158 return em;
159}
160
161u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
162{
163 return btrfs_crc32c(seed, data, len);
164}
165
166void btrfs_csum_final(u32 crc, char *result)
167{
168 *(__le32 *)result = ~cpu_to_le32(crc);
169}
170
171/*
172 * compute the csum for a btree block, and either verify it or write it
173 * into the csum field of the block.
174 */
175static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
176 int verify)
177{
178 char result[BTRFS_CRC32_SIZE];
179 unsigned long len;
180 unsigned long cur_len;
181 unsigned long offset = BTRFS_CSUM_SIZE;
182 char *map_token = NULL;
183 char *kaddr;
184 unsigned long map_start;
185 unsigned long map_len;
186 int err;
187 u32 crc = ~(u32)0;
188
189 len = buf->len - offset;
190 while(len > 0) {
191 err = map_private_extent_buffer(buf, offset, 32,
192 &map_token, &kaddr,
193 &map_start, &map_len, KM_USER0);
194 if (err) {
195 printk("failed to map extent buffer! %lu\n",
196 offset);
197 return 1;
198 }
199 cur_len = min(len, map_len - (offset - map_start));
200 crc = btrfs_csum_data(root, kaddr + offset - map_start,
201 crc, cur_len);
202 len -= cur_len;
203 offset += cur_len;
204 unmap_extent_buffer(buf, map_token, KM_USER0);
205 }
206 btrfs_csum_final(crc, result);
207
208 if (verify) {
209 /* FIXME, this is not good */
210 if (memcmp_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE)) {
211 u32 val;
212 u32 found = 0;
213 memcpy(&found, result, BTRFS_CRC32_SIZE);
214
215 read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE);
216 printk("btrfs: %s checksum verify failed on %llu "
217 "wanted %X found %X level %d\n",
218 root->fs_info->sb->s_id,
219 buf->start, val, found, btrfs_header_level(buf));
220 return 1;
221 }
222 } else {
223 write_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE);
224 }
225 return 0;
226}
227
228/*
229 * we can't consider a given block up to date unless the transid of the
230 * block matches the transid in the parent node's pointer. This is how we
231 * detect blocks that either didn't get written at all or got written
232 * in the wrong place.
233 */
234static int verify_parent_transid(struct extent_io_tree *io_tree,
235 struct extent_buffer *eb, u64 parent_transid)
236{
237 int ret;
238
239 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
240 return 0;
241
242 lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
243 if (extent_buffer_uptodate(io_tree, eb) &&
244 btrfs_header_generation(eb) == parent_transid) {
245 ret = 0;
246 goto out;
247 }
248 printk("parent transid verify failed on %llu wanted %llu found %llu\n",
249 (unsigned long long)eb->start,
250 (unsigned long long)parent_transid,
251 (unsigned long long)btrfs_header_generation(eb));
252 ret = 1;
253 clear_extent_buffer_uptodate(io_tree, eb);
254out:
255 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
256 GFP_NOFS);
257 return ret;
258}
259
260/*
261 * helper to read a given tree block, doing retries as required when
262 * the checksums don't match and we have alternate mirrors to try.
263 */
264static int btree_read_extent_buffer_pages(struct btrfs_root *root,
265 struct extent_buffer *eb,
266 u64 start, u64 parent_transid)
267{
268 struct extent_io_tree *io_tree;
269 int ret;
270 int num_copies = 0;
271 int mirror_num = 0;
272
273 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
274 while (1) {
275 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
276 btree_get_extent, mirror_num);
277 if (!ret &&
278 !verify_parent_transid(io_tree, eb, parent_transid))
279 return ret;
280printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror_num);
281 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
282 eb->start, eb->len);
283 if (num_copies == 1)
284 return ret;
285
286 mirror_num++;
287 if (mirror_num > num_copies)
288 return ret;
289 }
290 return -EIO;
291}
292
293/*
294 * checksum a dirty tree block before IO. This has extra checks to make
295 * sure we only fill in the checksum field in the first page of a multi-page block
296 */
297int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
298{
299 struct extent_io_tree *tree;
300 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
301 u64 found_start;
302 int found_level;
303 unsigned long len;
304 struct extent_buffer *eb;
305 int ret;
306
307 tree = &BTRFS_I(page->mapping->host)->io_tree;
308
309 if (page->private == EXTENT_PAGE_PRIVATE)
310 goto out;
311 if (!page->private)
312 goto out;
313 len = page->private >> 2;
314 if (len == 0) {
315 WARN_ON(1);
316 }
317 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
318 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
319 btrfs_header_generation(eb));
320 BUG_ON(ret);
321 found_start = btrfs_header_bytenr(eb);
322 if (found_start != start) {
323 printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
324 start, found_start, len);
325 WARN_ON(1);
326 goto err;
327 }
328 if (eb->first_page != page) {
329 printk("bad first page %lu %lu\n", eb->first_page->index,
330 page->index);
331 WARN_ON(1);
332 goto err;
333 }
334 if (!PageUptodate(page)) {
335 printk("csum not up to date page %lu\n", page->index);
336 WARN_ON(1);
337 goto err;
338 }
339 found_level = btrfs_header_level(eb);
340
341 csum_tree_block(root, eb, 0);
342err:
343 free_extent_buffer(eb);
344out:
345 return 0;
346}
347
348static int check_tree_block_fsid(struct btrfs_root *root,
349 struct extent_buffer *eb)
350{
351 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
352 u8 fsid[BTRFS_UUID_SIZE];
353 int ret = 1;
354
355 read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
356 BTRFS_FSID_SIZE);
357 while (fs_devices) {
358 if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
359 ret = 0;
360 break;
361 }
362 fs_devices = fs_devices->seed;
363 }
364 return ret;
365}
366
367int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
368 struct extent_state *state)
369{
370 struct extent_io_tree *tree;
371 u64 found_start;
372 int found_level;
373 unsigned long len;
374 struct extent_buffer *eb;
375 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
376 int ret = 0;
377
378 tree = &BTRFS_I(page->mapping->host)->io_tree;
379 if (page->private == EXTENT_PAGE_PRIVATE)
380 goto out;
381 if (!page->private)
382 goto out;
383 len = page->private >> 2;
384 if (len == 0) {
385 WARN_ON(1);
386 }
387 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
388
389 found_start = btrfs_header_bytenr(eb);
390 if (found_start != start) {
391 printk("bad tree block start %llu %llu\n",
392 (unsigned long long)found_start,
393 (unsigned long long)eb->start);
394 ret = -EIO;
395 goto err;
396 }
397 if (eb->first_page != page) {
398 printk("bad first page %lu %lu\n", eb->first_page->index,
399 page->index);
400 WARN_ON(1);
401 ret = -EIO;
402 goto err;
403 }
404 if (check_tree_block_fsid(root, eb)) {
405 printk("bad fsid on block %Lu\n", eb->start);
406 ret = -EIO;
407 goto err;
408 }
409 found_level = btrfs_header_level(eb);
410
411 ret = csum_tree_block(root, eb, 1);
412 if (ret)
413 ret = -EIO;
414
415 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
416 end = eb->start + end - 1;
417err:
418 free_extent_buffer(eb);
419out:
420 return ret;
421}
422
423static void end_workqueue_bio(struct bio *bio, int err)
424{
425 struct end_io_wq *end_io_wq = bio->bi_private;
426 struct btrfs_fs_info *fs_info;
427
428 fs_info = end_io_wq->info;
429 end_io_wq->error = err;
430 end_io_wq->work.func = end_workqueue_fn;
431 end_io_wq->work.flags = 0;
432 if (bio->bi_rw & (1 << BIO_RW))
433 btrfs_queue_worker(&fs_info->endio_write_workers,
434 &end_io_wq->work);
435 else
436 btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
437}
438
439int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
440 int metadata)
441{
442 struct end_io_wq *end_io_wq;
443 end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
444 if (!end_io_wq)
445 return -ENOMEM;
446
447 end_io_wq->private = bio->bi_private;
448 end_io_wq->end_io = bio->bi_end_io;
449 end_io_wq->info = info;
450 end_io_wq->error = 0;
451 end_io_wq->bio = bio;
452 end_io_wq->metadata = metadata;
453
454 bio->bi_private = end_io_wq;
455 bio->bi_end_io = end_workqueue_bio;
456 return 0;
457}
458
459unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
460{
461 unsigned long limit = min_t(unsigned long,
462 info->workers.max_workers,
463 info->fs_devices->open_devices);
464 return 256 * limit;
465}
466
467int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
468{
469 return atomic_read(&info->nr_async_bios) >
470 btrfs_async_submit_limit(info);
471}
472
473static void run_one_async_start(struct btrfs_work *work)
474{
475 struct btrfs_fs_info *fs_info;
476 struct async_submit_bio *async;
477
478 async = container_of(work, struct async_submit_bio, work);
479 fs_info = BTRFS_I(async->inode)->root->fs_info;
480 async->submit_bio_start(async->inode, async->rw, async->bio,
481 async->mirror_num, async->bio_flags);
482}
483
484static void run_one_async_done(struct btrfs_work *work)
485{
486 struct btrfs_fs_info *fs_info;
487 struct async_submit_bio *async;
488 int limit;
489
490 async = container_of(work, struct async_submit_bio, work);
491 fs_info = BTRFS_I(async->inode)->root->fs_info;
492
493 limit = btrfs_async_submit_limit(fs_info);
494 limit = limit * 2 / 3;
495
496 atomic_dec(&fs_info->nr_async_submits);
497
498 if (atomic_read(&fs_info->nr_async_submits) < limit &&
499 waitqueue_active(&fs_info->async_submit_wait))
500 wake_up(&fs_info->async_submit_wait);
501
502 async->submit_bio_done(async->inode, async->rw, async->bio,
503 async->mirror_num, async->bio_flags);
504}
505
506static void run_one_async_free(struct btrfs_work *work)
507{
508 struct async_submit_bio *async;
509
510 async = container_of(work, struct async_submit_bio, work);
511 kfree(async);
512}
513
514int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
515 int rw, struct bio *bio, int mirror_num,
516 unsigned long bio_flags,
517 extent_submit_bio_hook_t *submit_bio_start,
518 extent_submit_bio_hook_t *submit_bio_done)
519{
520 struct async_submit_bio *async;
521
522 async = kmalloc(sizeof(*async), GFP_NOFS);
523 if (!async)
524 return -ENOMEM;
525
526 async->inode = inode;
527 async->rw = rw;
528 async->bio = bio;
529 async->mirror_num = mirror_num;
530 async->submit_bio_start = submit_bio_start;
531 async->submit_bio_done = submit_bio_done;
532
533 async->work.func = run_one_async_start;
534 async->work.ordered_func = run_one_async_done;
535 async->work.ordered_free = run_one_async_free;
536
537 async->work.flags = 0;
538 async->bio_flags = bio_flags;
539
540 atomic_inc(&fs_info->nr_async_submits);
541 btrfs_queue_worker(&fs_info->workers, &async->work);
542#if 0
543 int limit = btrfs_async_submit_limit(fs_info);
544 if (atomic_read(&fs_info->nr_async_submits) > limit) {
545 wait_event_timeout(fs_info->async_submit_wait,
546 (atomic_read(&fs_info->nr_async_submits) < limit),
547 HZ/10);
548
549 wait_event_timeout(fs_info->async_submit_wait,
550 (atomic_read(&fs_info->nr_async_bios) < limit),
551 HZ/10);
552 }
553#endif
554 while(atomic_read(&fs_info->async_submit_draining) &&
555 atomic_read(&fs_info->nr_async_submits)) {
556 wait_event(fs_info->async_submit_wait,
557 (atomic_read(&fs_info->nr_async_submits) == 0));
558 }
559
560 return 0;
561}
562
563static int btree_csum_one_bio(struct bio *bio)
564{
565 struct bio_vec *bvec = bio->bi_io_vec;
566 int bio_index = 0;
567 struct btrfs_root *root;
568
569 WARN_ON(bio->bi_vcnt <= 0);
570 while(bio_index < bio->bi_vcnt) {
571 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
572 csum_dirty_buffer(root, bvec->bv_page);
573 bio_index++;
574 bvec++;
575 }
576 return 0;
577}
578
/*
 * "start" half of an async btree write: checksum every page in the bio.
 * The bio itself is mapped later by __btree_submit_bio_done().
 * Always returns 0.
 */
static int __btree_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags)
{
	(void)btree_csum_one_bio(bio);
	return 0;
}
590
591static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
592 int mirror_num, unsigned long bio_flags)
593{
594 /*
595 * when we're called for a write, we're already in the async
596 * submission context. Just jump into btrfs_map_bio
597 */
598 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
599}
600
601static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
602 int mirror_num, unsigned long bio_flags)
603{
604 /*
605 * kthread helpers are used to submit writes so that checksumming
606 * can happen in parallel across all CPUs
607 */
608 if (!(rw & (1 << BIO_RW))) {
609 int ret;
610 /*
611 * called for a read, do the setup so that checksum validation
612 * can happen in the async kernel threads
613 */
614 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
615 bio, 1);
616 BUG_ON(ret);
617
618 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
619 mirror_num, 0);
620 }
621 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
622 inode, rw, bio, mirror_num, 0,
623 __btree_submit_bio_start,
624 __btree_submit_bio_done);
625}
626
627static int btree_writepage(struct page *page, struct writeback_control *wbc)
628{
629 struct extent_io_tree *tree;
630 tree = &BTRFS_I(page->mapping->host)->io_tree;
631
632 if (current->flags & PF_MEMALLOC) {
633 redirty_page_for_writepage(wbc, page);
634 unlock_page(page);
635 return 0;
636 }
637 return extent_write_full_page(tree, page, btree_get_extent, wbc);
638}
639
640static int btree_writepages(struct address_space *mapping,
641 struct writeback_control *wbc)
642{
643 struct extent_io_tree *tree;
644 tree = &BTRFS_I(mapping->host)->io_tree;
645 if (wbc->sync_mode == WB_SYNC_NONE) {
646 u64 num_dirty;
647 u64 start = 0;
648 unsigned long thresh = 32 * 1024 * 1024;
649
650 if (wbc->for_kupdate)
651 return 0;
652
653 num_dirty = count_range_bits(tree, &start, (u64)-1,
654 thresh, EXTENT_DIRTY);
655 if (num_dirty < thresh) {
656 return 0;
657 }
658 }
659 return extent_writepages(tree, mapping, btree_get_extent, wbc);
660}
661
662int btree_readpage(struct file *file, struct page *page)
663{
664 struct extent_io_tree *tree;
665 tree = &BTRFS_I(page->mapping->host)->io_tree;
666 return extent_read_full_page(tree, page, btree_get_extent);
667}
668
669static int btree_releasepage(struct page *page, gfp_t gfp_flags)
670{
671 struct extent_io_tree *tree;
672 struct extent_map_tree *map;
673 int ret;
674
675 if (PageWriteback(page) || PageDirty(page))
676 return 0;
677
678 tree = &BTRFS_I(page->mapping->host)->io_tree;
679 map = &BTRFS_I(page->mapping->host)->extent_tree;
680
681 ret = try_release_extent_state(map, tree, page, gfp_flags);
682 if (!ret) {
683 return 0;
684 }
685
686 ret = try_release_extent_buffer(tree, page);
687 if (ret == 1) {
688 ClearPagePrivate(page);
689 set_page_private(page, 0);
690 page_cache_release(page);
691 }
692
693 return ret;
694}
695
696static void btree_invalidatepage(struct page *page, unsigned long offset)
697{
698 struct extent_io_tree *tree;
699 tree = &BTRFS_I(page->mapping->host)->io_tree;
700 extent_invalidatepage(tree, page, offset);
701 btree_releasepage(page, GFP_NOFS);
702 if (PagePrivate(page)) {
703 printk("warning page private not zero on page %Lu\n",
704 page_offset(page));
705 ClearPagePrivate(page);
706 set_page_private(page, 0);
707 page_cache_release(page);
708 }
709}
710
/*
 * Compiled-out (#if 0) buffer_head based variant of btree_writepage().
 * It is never built; the active btree_writepage() is defined earlier in
 * this file and goes through extent_write_full_page() instead.
 */
711#if 0
712static int btree_writepage(struct page *page, struct writeback_control *wbc)
713{
714 struct buffer_head *bh;
715 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
716 struct buffer_head *head;
717 if (!page_has_buffers(page)) {
718 create_empty_buffers(page, root->fs_info->sb->s_blocksize,
719 (1 << BH_Dirty)|(1 << BH_Uptodate));
720 }
721 head = page_buffers(page);
722 bh = head;
/* walk the circular buffer list, checksumming each dirty buffer */
723 do {
724 if (buffer_dirty(bh))
725 csum_tree_block(root, bh, 0);
726 bh = bh->b_this_page;
727 } while (bh != head);
728 return block_write_full_page(page, btree_get_block, wbc);
729}
730#endif
731
732static struct address_space_operations btree_aops = {
733 .readpage = btree_readpage,
734 .writepage = btree_writepage,
735 .writepages = btree_writepages,
736 .releasepage = btree_releasepage,
737 .invalidatepage = btree_invalidatepage,
738 .sync_page = block_sync_page,
739};
740
741int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
742 u64 parent_transid)
743{
744 struct extent_buffer *buf = NULL;
745 struct inode *btree_inode = root->fs_info->btree_inode;
746 int ret = 0;
747
748 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
749 if (!buf)
750 return 0;
751 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
752 buf, 0, 0, btree_get_extent, 0);
753 free_extent_buffer(buf);
754 return ret;
755}
756
757struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
758 u64 bytenr, u32 blocksize)
759{
760 struct inode *btree_inode = root->fs_info->btree_inode;
761 struct extent_buffer *eb;
762 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
763 bytenr, blocksize, GFP_NOFS);
764 return eb;
765}
766
767struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
768 u64 bytenr, u32 blocksize)
769{
770 struct inode *btree_inode = root->fs_info->btree_inode;
771 struct extent_buffer *eb;
772
773 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
774 bytenr, blocksize, NULL, GFP_NOFS);
775 return eb;
776}
777
778
779int btrfs_write_tree_block(struct extent_buffer *buf)
780{
781 return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
782 buf->start + buf->len - 1, WB_SYNC_ALL);
783}
784
785int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
786{
787 return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
788 buf->start, buf->start + buf->len -1);
789}
790
791struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
792 u32 blocksize, u64 parent_transid)
793{
794 struct extent_buffer *buf = NULL;
795 struct inode *btree_inode = root->fs_info->btree_inode;
796 struct extent_io_tree *io_tree;
797 int ret;
798
799 io_tree = &BTRFS_I(btree_inode)->io_tree;
800
801 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
802 if (!buf)
803 return NULL;
804
805 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
806
807 if (ret == 0) {
808 buf->flags |= EXTENT_UPTODATE;
809 } else {
810 WARN_ON(1);
811 }
812 return buf;
813
814}
815
816int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
817 struct extent_buffer *buf)
818{
819 struct inode *btree_inode = root->fs_info->btree_inode;
820 if (btrfs_header_generation(buf) ==
821 root->fs_info->running_transaction->transid) {
822 WARN_ON(!btrfs_tree_locked(buf));
823 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
824 buf);
825 }
826 return 0;
827}
828
829static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
830 u32 stripesize, struct btrfs_root *root,
831 struct btrfs_fs_info *fs_info,
832 u64 objectid)
833{
834 root->node = NULL;
835 root->commit_root = NULL;
836 root->ref_tree = NULL;
837 root->sectorsize = sectorsize;
838 root->nodesize = nodesize;
839 root->leafsize = leafsize;
840 root->stripesize = stripesize;
841 root->ref_cows = 0;
842 root->track_dirty = 0;
843
844 root->fs_info = fs_info;
845 root->objectid = objectid;
846 root->last_trans = 0;
847 root->highest_inode = 0;
848 root->last_inode_alloc = 0;
849 root->name = NULL;
850 root->in_sysfs = 0;
851
852 INIT_LIST_HEAD(&root->dirty_list);
853 INIT_LIST_HEAD(&root->orphan_list);
854 INIT_LIST_HEAD(&root->dead_list);
855 spin_lock_init(&root->node_lock);
856 spin_lock_init(&root->list_lock);
857 mutex_init(&root->objectid_mutex);
858 mutex_init(&root->log_mutex);
859 extent_io_tree_init(&root->dirty_log_pages,
860 fs_info->btree_inode->i_mapping, GFP_NOFS);
861
862 btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
863 root->ref_tree = &root->ref_tree_struct;
864
865 memset(&root->root_key, 0, sizeof(root->root_key));
866 memset(&root->root_item, 0, sizeof(root->root_item));
867 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
868 memset(&root->root_kobj, 0, sizeof(root->root_kobj));
869 root->defrag_trans_start = fs_info->generation;
870 init_completion(&root->kobj_unregister);
871 root->defrag_running = 0;
872 root->defrag_level = 0;
873 root->root_key.objectid = objectid;
874 root->anon_super.s_root = NULL;
875 root->anon_super.s_dev = 0;
876 INIT_LIST_HEAD(&root->anon_super.s_list);
877 INIT_LIST_HEAD(&root->anon_super.s_instances);
878 init_rwsem(&root->anon_super.s_umount);
879
880 return 0;
881}
882
883static int find_and_setup_root(struct btrfs_root *tree_root,
884 struct btrfs_fs_info *fs_info,
885 u64 objectid,
886 struct btrfs_root *root)
887{
888 int ret;
889 u32 blocksize;
890 u64 generation;
891
892 __setup_root(tree_root->nodesize, tree_root->leafsize,
893 tree_root->sectorsize, tree_root->stripesize,
894 root, fs_info, objectid);
895 ret = btrfs_find_last_root(tree_root, objectid,
896 &root->root_item, &root->root_key);
897 BUG_ON(ret);
898
899 generation = btrfs_root_generation(&root->root_item);
900 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
901 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
902 blocksize, generation);
903 BUG_ON(!root->node);
904 return 0;
905}
906
907int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
908 struct btrfs_fs_info *fs_info)
909{
910 struct extent_buffer *eb;
911 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
912 u64 start = 0;
913 u64 end = 0;
914 int ret;
915
916 if (!log_root_tree)
917 return 0;
918
919 while(1) {
920 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
921 0, &start, &end, EXTENT_DIRTY);
922 if (ret)
923 break;
924
925 clear_extent_dirty(&log_root_tree->dirty_log_pages,
926 start, end, GFP_NOFS);
927 }
928 eb = fs_info->log_root_tree->node;
929
930 WARN_ON(btrfs_header_level(eb) != 0);
931 WARN_ON(btrfs_header_nritems(eb) != 0);
932
933 ret = btrfs_free_reserved_extent(fs_info->tree_root,
934 eb->start, eb->len);
935 BUG_ON(ret);
936
937 free_extent_buffer(eb);
938 kfree(fs_info->log_root_tree);
939 fs_info->log_root_tree = NULL;
940 return 0;
941}
942
943int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
944 struct btrfs_fs_info *fs_info)
945{
946 struct btrfs_root *root;
947 struct btrfs_root *tree_root = fs_info->tree_root;
948
949 root = kzalloc(sizeof(*root), GFP_NOFS);
950 if (!root)
951 return -ENOMEM;
952
953 __setup_root(tree_root->nodesize, tree_root->leafsize,
954 tree_root->sectorsize, tree_root->stripesize,
955 root, fs_info, BTRFS_TREE_LOG_OBJECTID);
956
957 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
958 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
959 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
960 root->ref_cows = 0;
961
962 root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
963 0, BTRFS_TREE_LOG_OBJECTID,
964 trans->transid, 0, 0, 0);
965
966 btrfs_set_header_nritems(root->node, 0);
967 btrfs_set_header_level(root->node, 0);
968 btrfs_set_header_bytenr(root->node, root->node->start);
969 btrfs_set_header_generation(root->node, trans->transid);
970 btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
971
972 write_extent_buffer(root->node, root->fs_info->fsid,
973 (unsigned long)btrfs_header_fsid(root->node),
974 BTRFS_FSID_SIZE);
975 btrfs_mark_buffer_dirty(root->node);
976 btrfs_tree_unlock(root->node);
977 fs_info->log_root_tree = root;
978 return 0;
979}
980
981struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
982 struct btrfs_key *location)
983{
984 struct btrfs_root *root;
985 struct btrfs_fs_info *fs_info = tree_root->fs_info;
986 struct btrfs_path *path;
987 struct extent_buffer *l;
988 u64 highest_inode;
989 u64 generation;
990 u32 blocksize;
991 int ret = 0;
992
993 root = kzalloc(sizeof(*root), GFP_NOFS);
994 if (!root)
995 return ERR_PTR(-ENOMEM);
996 if (location->offset == (u64)-1) {
997 ret = find_and_setup_root(tree_root, fs_info,
998 location->objectid, root);
999 if (ret) {
1000 kfree(root);
1001 return ERR_PTR(ret);
1002 }
1003 goto insert;
1004 }
1005
1006 __setup_root(tree_root->nodesize, tree_root->leafsize,
1007 tree_root->sectorsize, tree_root->stripesize,
1008 root, fs_info, location->objectid);
1009
1010 path = btrfs_alloc_path();
1011 BUG_ON(!path);
1012 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1013 if (ret != 0) {
1014 if (ret > 0)
1015 ret = -ENOENT;
1016 goto out;
1017 }
1018 l = path->nodes[0];
1019 read_extent_buffer(l, &root->root_item,
1020 btrfs_item_ptr_offset(l, path->slots[0]),
1021 sizeof(root->root_item));
1022 memcpy(&root->root_key, location, sizeof(*location));
1023 ret = 0;
1024out:
1025 btrfs_release_path(root, path);
1026 btrfs_free_path(path);
1027 if (ret) {
1028 kfree(root);
1029 return ERR_PTR(ret);
1030 }
1031 generation = btrfs_root_generation(&root->root_item);
1032 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1033 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1034 blocksize, generation);
1035 BUG_ON(!root->node);
1036insert:
1037 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1038 root->ref_cows = 1;
1039 ret = btrfs_find_highest_inode(root, &highest_inode);
1040 if (ret == 0) {
1041 root->highest_inode = highest_inode;
1042 root->last_inode_alloc = highest_inode;
1043 }
1044 }
1045 return root;
1046}
1047
1048struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1049 u64 root_objectid)
1050{
1051 struct btrfs_root *root;
1052
1053 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
1054 return fs_info->tree_root;
1055 if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
1056 return fs_info->extent_root;
1057
1058 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1059 (unsigned long)root_objectid);
1060 return root;
1061}
1062
1063struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1064 struct btrfs_key *location)
1065{
1066 struct btrfs_root *root;
1067 int ret;
1068
1069 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
1070 return fs_info->tree_root;
1071 if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
1072 return fs_info->extent_root;
1073 if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
1074 return fs_info->chunk_root;
1075 if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
1076 return fs_info->dev_root;
1077
1078 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1079 (unsigned long)location->objectid);
1080 if (root)
1081 return root;
1082
1083 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1084 if (IS_ERR(root))
1085 return root;
1086
1087 set_anon_super(&root->anon_super, NULL);
1088
1089 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1090 (unsigned long)root->root_key.objectid,
1091 root);
1092 if (ret) {
1093 free_extent_buffer(root->node);
1094 kfree(root);
1095 return ERR_PTR(ret);
1096 }
1097 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
1098 ret = btrfs_find_dead_roots(fs_info->tree_root,
1099 root->root_key.objectid, root);
1100 BUG_ON(ret);
1101 btrfs_orphan_cleanup(root);
1102 }
1103 return root;
1104}
1105
1106struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1107 struct btrfs_key *location,
1108 const char *name, int namelen)
1109{
1110 struct btrfs_root *root;
1111 int ret;
1112
1113 root = btrfs_read_fs_root_no_name(fs_info, location);
1114 if (!root)
1115 return NULL;
1116
1117 if (root->in_sysfs)
1118 return root;
1119
1120 ret = btrfs_set_root_name(root, name, namelen);
1121 if (ret) {
1122 free_extent_buffer(root->node);
1123 kfree(root);
1124 return ERR_PTR(ret);
1125 }
1126#if 0
1127 ret = btrfs_sysfs_add_root(root);
1128 if (ret) {
1129 free_extent_buffer(root->node);
1130 kfree(root->name);
1131 kfree(root);
1132 return ERR_PTR(ret);
1133 }
1134#endif
1135 root->in_sysfs = 1;
1136 return root;
1137}
/*
 * Compiled-out (#if 0) helper: allocates a btrfs_hasher for the crypto
 * hash named by @type and adds it to info->hashers under info->hash_lock.
 * Returns 0, -ENOMEM if the hasher cannot be allocated, or -EINVAL if the
 * hash transform cannot be allocated.  Never built.
 */
1138#if 0
1139static int add_hasher(struct btrfs_fs_info *info, char *type) {
1140 struct btrfs_hasher *hasher;
1141
1142 hasher = kmalloc(sizeof(*hasher), GFP_NOFS);
1143 if (!hasher)
1144 return -ENOMEM;
1145 hasher->hash_tfm = crypto_alloc_hash(type, 0, CRYPTO_ALG_ASYNC);
1146 if (!hasher->hash_tfm) {
1147 kfree(hasher);
1148 return -EINVAL;
1149 }
1150 spin_lock(&info->hash_lock);
1151 list_add(&hasher->list, &info->hashers);
1152 spin_unlock(&info->hash_lock);
1153 return 0;
1154}
1155#endif
1156
1157static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1158{
1159 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1160 int ret = 0;
1161 struct list_head *cur;
1162 struct btrfs_device *device;
1163 struct backing_dev_info *bdi;
1164#if 0
1165 if ((bdi_bits & (1 << BDI_write_congested)) &&
1166 btrfs_congested_async(info, 0))
1167 return 1;
1168#endif
1169 list_for_each(cur, &info->fs_devices->devices) {
1170 device = list_entry(cur, struct btrfs_device, dev_list);
1171 if (!device->bdev)
1172 continue;
1173 bdi = blk_get_backing_dev_info(device->bdev);
1174 if (bdi && bdi_congested(bdi, bdi_bits)) {
1175 ret = 1;
1176 break;
1177 }
1178 }
1179 return ret;
1180}
1181
1182/*
1183 * this unplugs every device on the box, and it is only used when page
1184 * is null
1185 */
1186static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1187{
1188 struct list_head *cur;
1189 struct btrfs_device *device;
1190 struct btrfs_fs_info *info;
1191
1192 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1193 list_for_each(cur, &info->fs_devices->devices) {
1194 device = list_entry(cur, struct btrfs_device, dev_list);
1195 bdi = blk_get_backing_dev_info(device->bdev);
1196 if (bdi->unplug_io_fn) {
1197 bdi->unplug_io_fn(bdi, page);
1198 }
1199 }
1200}
1201
1202void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1203{
1204 struct inode *inode;
1205 struct extent_map_tree *em_tree;
1206 struct extent_map *em;
1207 struct address_space *mapping;
1208 u64 offset;
1209
1210 /* the generic O_DIRECT read code does this */
1211 if (1 || !page) {
1212 __unplug_io_fn(bdi, page);
1213 return;
1214 }
1215
1216 /*
1217 * page->mapping may change at any time. Get a consistent copy
1218 * and use that for everything below
1219 */
1220 smp_mb();
1221 mapping = page->mapping;
1222 if (!mapping)
1223 return;
1224
1225 inode = mapping->host;
1226
1227 /*
1228 * don't do the expensive searching for a small number of
1229 * devices
1230 */
1231 if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
1232 __unplug_io_fn(bdi, page);
1233 return;
1234 }
1235
1236 offset = page_offset(page);
1237
1238 em_tree = &BTRFS_I(inode)->extent_tree;
1239 spin_lock(&em_tree->lock);
1240 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1241 spin_unlock(&em_tree->lock);
1242 if (!em) {
1243 __unplug_io_fn(bdi, page);
1244 return;
1245 }
1246
1247 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1248 free_extent_map(em);
1249 __unplug_io_fn(bdi, page);
1250 return;
1251 }
1252 offset = offset - em->start;
1253 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1254 em->block_start + offset, page);
1255 free_extent_map(em);
1256}
1257
1258static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1259{
1260 bdi_init(bdi);
1261 bdi->ra_pages = default_backing_dev_info.ra_pages;
1262 bdi->state = 0;
1263 bdi->capabilities = default_backing_dev_info.capabilities;
1264 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1265 bdi->unplug_io_data = info;
1266 bdi->congested_fn = btrfs_congested_fn;
1267 bdi->congested_data = info;
1268 return 0;
1269}
1270
1271static int bio_ready_for_csum(struct bio *bio)
1272{
1273 u64 length = 0;
1274 u64 buf_len = 0;
1275 u64 start = 0;
1276 struct page *page;
1277 struct extent_io_tree *io_tree = NULL;
1278 struct btrfs_fs_info *info = NULL;
1279 struct bio_vec *bvec;
1280 int i;
1281 int ret;
1282
1283 bio_for_each_segment(bvec, bio, i) {
1284 page = bvec->bv_page;
1285 if (page->private == EXTENT_PAGE_PRIVATE) {
1286 length += bvec->bv_len;
1287 continue;
1288 }
1289 if (!page->private) {
1290 length += bvec->bv_len;
1291 continue;
1292 }
1293 length = bvec->bv_len;
1294 buf_len = page->private >> 2;
1295 start = page_offset(page) + bvec->bv_offset;
1296 io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1297 info = BTRFS_I(page->mapping->host)->root->fs_info;
1298 }
1299 /* are we fully contained in this bio? */
1300 if (buf_len <= length)
1301 return 1;
1302
1303 ret = extent_range_uptodate(io_tree, start + length,
1304 start + buf_len - 1);
1305 if (ret == 1)
1306 return ret;
1307 return ret;
1308}
1309
1310/*
1311 * called by the kthread helper functions to finally call the bio end_io
1312 * functions. This is where read checksum verification actually happens
1313 */
1314static void end_workqueue_fn(struct btrfs_work *work)
1315{
1316 struct bio *bio;
1317 struct end_io_wq *end_io_wq;
1318 struct btrfs_fs_info *fs_info;
1319 int error;
1320
1321 end_io_wq = container_of(work, struct end_io_wq, work);
1322 bio = end_io_wq->bio;
1323 fs_info = end_io_wq->info;
1324
1325 /* metadata bios are special because the whole tree block must
1326 * be checksummed at once. This makes sure the entire block is in
1327 * ram and up to date before trying to verify things. For
1328 * blocksize <= pagesize, it is basically a noop
1329 */
1330 if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
1331 btrfs_queue_worker(&fs_info->endio_workers,
1332 &end_io_wq->work);
1333 return;
1334 }
1335 error = end_io_wq->error;
1336 bio->bi_private = end_io_wq->private;
1337 bio->bi_end_io = end_io_wq->end_io;
1338 kfree(end_io_wq);
1339 bio_endio(bio, error);
1340}
1341
/*
 * Kernel thread body that repeatedly calls btrfs_clean_old_snapshots()
 * on the root passed in @arg, holding fs_info->cleaner_mutex around each
 * call.
 *
 * The loop ends when fs_info->closing is set or kthread_should_stop()
 * returns true.  Between passes the thread sleeps in TASK_INTERRUPTIBLE
 * with no timeout, so it only runs again when something wakes it.
 * Always returns 0.
 */
1342static int cleaner_kthread(void *arg)
1343{
1344 struct btrfs_root *root = arg;
1345
1346 do {
/* barrier before sampling the closing flag */
1347 smp_mb();
1348 if (root->fs_info->closing)
1349 break;
1350
1351 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1352 mutex_lock(&root->fs_info->cleaner_mutex);
1353 btrfs_clean_old_snapshots(root);
1354 mutex_unlock(&root->fs_info->cleaner_mutex);
1355
1356 if (freezing(current)) {
1357 refrigerator();
1358 } else {
/* re-check closing before going to sleep */
1359 smp_mb();
1360 if (root->fs_info->closing)
1361 break;
1362 set_current_state(TASK_INTERRUPTIBLE);
1363 schedule();
1364 __set_current_state(TASK_RUNNING);
1365 }
1366 } while (!kthread_should_stop());
1367 return 0;
1368}
1369
/*
 * Kernel thread body that periodically commits the running transaction
 * of the root passed in @arg.
 *
 * Each pass, under fs_info->transaction_kthread_mutex:
 *  - if there is no running transaction, sleep for the default 30 * HZ;
 *  - if the running transaction started less than 30 seconds ago (or the
 *    clock appears to have gone backwards), sleep for only 5 * HZ;
 *  - otherwise start a transaction handle and commit it.
 * The cleaner kthread is woken on every pass.
 *
 * The loop ends when fs_info->closing is set or kthread_should_stop()
 * returns true.  Always returns 0.
 */
1370static int transaction_kthread(void *arg)
1371{
1372 struct btrfs_root *root = arg;
1373 struct btrfs_trans_handle *trans;
1374 struct btrfs_transaction *cur;
1375 unsigned long now;
1376 unsigned long delay;
1377 int ret;
1378
1379 do {
1380 smp_mb();
1381 if (root->fs_info->closing)
1382 break;
1383
1384 delay = HZ * 30;
1385 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1386 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1387
1388 if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
1389 printk("btrfs: total reference cache size %Lu\n",
1390 root->fs_info->total_ref_cache_size);
1391 }
1392
/* trans_mutex is held only while inspecting the running transaction */
1393 mutex_lock(&root->fs_info->trans_mutex);
1394 cur = root->fs_info->running_transaction;
1395 if (!cur) {
1396 mutex_unlock(&root->fs_info->trans_mutex);
1397 goto sleep;
1398 }
1399
1400 now = get_seconds();
1401 if (now < cur->start_time || now - cur->start_time < 30) {
1402 mutex_unlock(&root->fs_info->trans_mutex);
1403 delay = HZ * 5;
1404 goto sleep;
1405 }
1406 mutex_unlock(&root->fs_info->trans_mutex);
1407 trans = btrfs_start_transaction(root, 1);
/* NOTE(review): the commit result is stored in ret but never checked */
1408 ret = btrfs_commit_transaction(trans, root);
1409sleep:
1410 wake_up_process(root->fs_info->cleaner_kthread);
1411 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
1412
1413 if (freezing(current)) {
1414 refrigerator();
1415 } else {
1416 if (root->fs_info->closing)
1417 break;
1418 set_current_state(TASK_INTERRUPTIBLE);
1419 schedule_timeout(delay);
1420 __set_current_state(TASK_RUNNING);
1421 }
1422 } while (!kthread_should_stop());
1423 return 0;
1424}
1425
1426struct btrfs_root *open_ctree(struct super_block *sb,
1427 struct btrfs_fs_devices *fs_devices,
1428 char *options)
1429{
1430 u32 sectorsize;
1431 u32 nodesize;
1432 u32 leafsize;
1433 u32 blocksize;
1434 u32 stripesize;
1435 u64 generation;
1436 struct btrfs_key location;
1437 struct buffer_head *bh;
1438 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
1439 GFP_NOFS);
1440 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
1441 GFP_NOFS);
1442 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1443 GFP_NOFS);
1444 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1445 GFP_NOFS);
1446 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
1447 GFP_NOFS);
1448 struct btrfs_root *log_tree_root;
1449
1450 int ret;
1451 int err = -EINVAL;
1452
1453 struct btrfs_super_block *disk_super;
1454
1455 if (!extent_root || !tree_root || !fs_info ||
1456 !chunk_root || !dev_root) {
1457 err = -ENOMEM;
1458 goto fail;
1459 }
1460 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
1461 INIT_LIST_HEAD(&fs_info->trans_list);
1462 INIT_LIST_HEAD(&fs_info->dead_roots);
1463 INIT_LIST_HEAD(&fs_info->hashers);
1464 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1465 spin_lock_init(&fs_info->hash_lock);
1466 spin_lock_init(&fs_info->delalloc_lock);
1467 spin_lock_init(&fs_info->new_trans_lock);
1468 spin_lock_init(&fs_info->ref_cache_lock);
1469
1470 init_completion(&fs_info->kobj_unregister);
1471 fs_info->tree_root = tree_root;
1472 fs_info->extent_root = extent_root;
1473 fs_info->chunk_root = chunk_root;
1474 fs_info->dev_root = dev_root;
1475 fs_info->fs_devices = fs_devices;
1476 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1477 INIT_LIST_HEAD(&fs_info->space_info);
1478 btrfs_mapping_init(&fs_info->mapping_tree);
1479 atomic_set(&fs_info->nr_async_submits, 0);
1480 atomic_set(&fs_info->async_delalloc_pages, 0);
1481 atomic_set(&fs_info->async_submit_draining, 0);
1482 atomic_set(&fs_info->nr_async_bios, 0);
1483 atomic_set(&fs_info->throttles, 0);
1484 atomic_set(&fs_info->throttle_gen, 0);
1485 fs_info->sb = sb;
1486 fs_info->max_extent = (u64)-1;
1487 fs_info->max_inline = 8192 * 1024;
1488 setup_bdi(fs_info, &fs_info->bdi);
1489 fs_info->btree_inode = new_inode(sb);
1490 fs_info->btree_inode->i_ino = 1;
1491 fs_info->btree_inode->i_nlink = 1;
1492
1493 fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
1494
1495 INIT_LIST_HEAD(&fs_info->ordered_extents);
1496 spin_lock_init(&fs_info->ordered_extent_lock);
1497
1498 sb->s_blocksize = 4096;
1499 sb->s_blocksize_bits = blksize_bits(4096);
1500
1501 /*
1502 * we set the i_size on the btree inode to the max possible int.
1503 * the real end of the address space is determined by all of
1504 * the devices in the system
1505 */
1506 fs_info->btree_inode->i_size = OFFSET_MAX;
1507 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
1508 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
1509
1510 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1511 fs_info->btree_inode->i_mapping,
1512 GFP_NOFS);
1513 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
1514 GFP_NOFS);
1515
1516 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1517
1518 spin_lock_init(&fs_info->block_group_cache_lock);
1519 fs_info->block_group_cache_tree.rb_node = NULL;
1520
1521 extent_io_tree_init(&fs_info->pinned_extents,
1522 fs_info->btree_inode->i_mapping, GFP_NOFS);
1523 extent_io_tree_init(&fs_info->pending_del,
1524 fs_info->btree_inode->i_mapping, GFP_NOFS);
1525 extent_io_tree_init(&fs_info->extent_ins,
1526 fs_info->btree_inode->i_mapping, GFP_NOFS);
1527 fs_info->do_barriers = 1;
1528
1529 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
1530 btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
1531 btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
1532
1533 BTRFS_I(fs_info->btree_inode)->root = tree_root;
1534 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1535 sizeof(struct btrfs_key));
1536 insert_inode_hash(fs_info->btree_inode);
1537
1538 mutex_init(&fs_info->trans_mutex);
1539 mutex_init(&fs_info->tree_log_mutex);
1540 mutex_init(&fs_info->drop_mutex);
1541 mutex_init(&fs_info->extent_ins_mutex);
1542 mutex_init(&fs_info->pinned_mutex);
1543 mutex_init(&fs_info->chunk_mutex);
1544 mutex_init(&fs_info->transaction_kthread_mutex);
1545 mutex_init(&fs_info->cleaner_mutex);
1546 mutex_init(&fs_info->volume_mutex);
1547 mutex_init(&fs_info->tree_reloc_mutex);
1548 init_waitqueue_head(&fs_info->transaction_throttle);
1549 init_waitqueue_head(&fs_info->transaction_wait);
1550 init_waitqueue_head(&fs_info->async_submit_wait);
1551 init_waitqueue_head(&fs_info->tree_log_wait);
1552 atomic_set(&fs_info->tree_log_commit, 0);
1553 atomic_set(&fs_info->tree_log_writers, 0);
1554 fs_info->tree_log_transid = 0;
1555
1556#if 0
1557 ret = add_hasher(fs_info, "crc32c");
1558 if (ret) {
1559 printk("btrfs: failed hash setup, modprobe cryptomgr?\n");
1560 err = -ENOMEM;
1561 goto fail_iput;
1562 }
1563#endif
1564 __setup_root(4096, 4096, 4096, 4096, tree_root,
1565 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1566
1567
1568 bh = __bread(fs_devices->latest_bdev,
1569 BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
1570 if (!bh)
1571 goto fail_iput;
1572
1573 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1574 brelse(bh);
1575
1576 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
1577
1578 disk_super = &fs_info->super_copy;
1579 if (!btrfs_super_root(disk_super))
1580 goto fail_sb_buffer;
1581
1582 ret = btrfs_parse_options(tree_root, options);
1583 if (ret) {
1584 err = ret;
1585 goto fail_sb_buffer;
1586 }
1587
1588 /*
1589 * we need to start all the end_io workers up front because the
1590 * queue work function gets called at interrupt time, and so it
1591 * cannot dynamically grow.
1592 */
1593 btrfs_init_workers(&fs_info->workers, "worker",
1594 fs_info->thread_pool_size);
1595
1596 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
1597 fs_info->thread_pool_size);
1598
1599 btrfs_init_workers(&fs_info->submit_workers, "submit",
1600 min_t(u64, fs_devices->num_devices,
1601 fs_info->thread_pool_size));
1602
1603 /* a higher idle thresh on the submit workers makes it much more
1604 * likely that bios will be sent down in a sane order to the
1605 * devices
1606 */
1607 fs_info->submit_workers.idle_thresh = 64;
1608
1609 fs_info->workers.idle_thresh = 16;
1610 fs_info->workers.ordered = 1;
1611
1612 fs_info->delalloc_workers.idle_thresh = 2;
1613 fs_info->delalloc_workers.ordered = 1;
1614
1615 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
1616 btrfs_init_workers(&fs_info->endio_workers, "endio",
1617 fs_info->thread_pool_size);
1618 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1619 fs_info->thread_pool_size);
1620
1621 /*
1622 * endios are largely parallel and should have a very
1623 * low idle thresh
1624 */
1625 fs_info->endio_workers.idle_thresh = 4;
1626 fs_info->endio_write_workers.idle_thresh = 64;
1627
1628 btrfs_start_workers(&fs_info->workers, 1);
1629 btrfs_start_workers(&fs_info->submit_workers, 1);
1630 btrfs_start_workers(&fs_info->delalloc_workers, 1);
1631 btrfs_start_workers(&fs_info->fixup_workers, 1);
1632 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
1633 btrfs_start_workers(&fs_info->endio_write_workers,
1634 fs_info->thread_pool_size);
1635
1636 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1637 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
1638 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
1639
1640 nodesize = btrfs_super_nodesize(disk_super);
1641 leafsize = btrfs_super_leafsize(disk_super);
1642 sectorsize = btrfs_super_sectorsize(disk_super);
1643 stripesize = btrfs_super_stripesize(disk_super);
1644 tree_root->nodesize = nodesize;
1645 tree_root->leafsize = leafsize;
1646 tree_root->sectorsize = sectorsize;
1647 tree_root->stripesize = stripesize;
1648
1649 sb->s_blocksize = sectorsize;
1650 sb->s_blocksize_bits = blksize_bits(sectorsize);
1651
1652 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
1653 sizeof(disk_super->magic))) {
1654 printk("btrfs: valid FS not found on %s\n", sb->s_id);
1655 goto fail_sb_buffer;
1656 }
1657
1658 mutex_lock(&fs_info->chunk_mutex);
1659 ret = btrfs_read_sys_array(tree_root);
1660 mutex_unlock(&fs_info->chunk_mutex);
1661 if (ret) {
1662 printk("btrfs: failed to read the system array on %s\n",
1663 sb->s_id);
1664 goto fail_sys_array;
1665 }
1666
1667 blocksize = btrfs_level_size(tree_root,
1668 btrfs_super_chunk_root_level(disk_super));
1669 generation = btrfs_super_chunk_root_generation(disk_super);
1670
1671 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1672 chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
1673
1674 chunk_root->node = read_tree_block(chunk_root,
1675 btrfs_super_chunk_root(disk_super),
1676 blocksize, generation);
1677 BUG_ON(!chunk_root->node);
1678
1679 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
1680 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
1681 BTRFS_UUID_SIZE);
1682
1683 mutex_lock(&fs_info->chunk_mutex);
1684 ret = btrfs_read_chunk_tree(chunk_root);
1685 mutex_unlock(&fs_info->chunk_mutex);
1686 if (ret) {
1687 printk("btrfs: failed to read chunk tree on %s\n", sb->s_id);
1688 goto fail_chunk_root;
1689 }
1690
1691 btrfs_close_extra_devices(fs_devices);
1692
1693 blocksize = btrfs_level_size(tree_root,
1694 btrfs_super_root_level(disk_super));
1695 generation = btrfs_super_generation(disk_super);
1696
1697 tree_root->node = read_tree_block(tree_root,
1698 btrfs_super_root(disk_super),
1699 blocksize, generation);
1700 if (!tree_root->node)
1701 goto fail_chunk_root;
1702
1703
1704 ret = find_and_setup_root(tree_root, fs_info,
1705 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1706 if (ret)
1707 goto fail_tree_root;
1708 extent_root->track_dirty = 1;
1709
1710 ret = find_and_setup_root(tree_root, fs_info,
1711 BTRFS_DEV_TREE_OBJECTID, dev_root);
1712 dev_root->track_dirty = 1;
1713
1714 if (ret)
1715 goto fail_extent_root;
1716
1717 btrfs_read_block_groups(extent_root);
1718
1719 fs_info->generation = generation + 1;
1720 fs_info->last_trans_committed = generation;
1721 fs_info->data_alloc_profile = (u64)-1;
1722 fs_info->metadata_alloc_profile = (u64)-1;
1723 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1724 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1725 "btrfs-cleaner");
1726 if (!fs_info->cleaner_kthread)
1727 goto fail_extent_root;
1728
1729 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1730 tree_root,
1731 "btrfs-transaction");
1732 if (!fs_info->transaction_kthread)
1733 goto fail_cleaner;
1734
1735 if (btrfs_super_log_root(disk_super) != 0) {
1736 u32 blocksize;
1737 u64 bytenr = btrfs_super_log_root(disk_super);
1738
1739 if (fs_devices->rw_devices == 0) {
1740 printk("Btrfs log replay required on RO media\n");
1741 err = -EIO;
1742 goto fail_trans_kthread;
1743 }
1744 blocksize =
1745 btrfs_level_size(tree_root,
1746 btrfs_super_log_root_level(disk_super));
1747
1748 log_tree_root = kzalloc(sizeof(struct btrfs_root),
1749 GFP_NOFS);
1750
1751 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1752 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
1753
1754 log_tree_root->node = read_tree_block(tree_root, bytenr,
1755 blocksize,
1756 generation + 1);
1757 ret = btrfs_recover_log_trees(log_tree_root);
1758 BUG_ON(ret);
1759 }
1760
1761 if (!(sb->s_flags & MS_RDONLY)) {
1762 ret = btrfs_cleanup_reloc_trees(tree_root);
1763 BUG_ON(ret);
1764 }
1765
1766 location.objectid = BTRFS_FS_TREE_OBJECTID;
1767 location.type = BTRFS_ROOT_ITEM_KEY;
1768 location.offset = (u64)-1;
1769
1770 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
1771 if (!fs_info->fs_root)
1772 goto fail_trans_kthread;
1773 return tree_root;
1774
1775fail_trans_kthread:
1776 kthread_stop(fs_info->transaction_kthread);
1777fail_cleaner:
1778 kthread_stop(fs_info->cleaner_kthread);
1779
1780 /*
1781 * make sure we're done with the btree inode before we stop our
1782 * kthreads
1783 */
1784 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
1785 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1786
1787fail_extent_root:
1788 free_extent_buffer(extent_root->node);
1789fail_tree_root:
1790 free_extent_buffer(tree_root->node);
1791fail_chunk_root:
1792 free_extent_buffer(chunk_root->node);
1793fail_sys_array:
1794 free_extent_buffer(dev_root->node);
1795fail_sb_buffer:
1796 btrfs_stop_workers(&fs_info->fixup_workers);
1797 btrfs_stop_workers(&fs_info->delalloc_workers);
1798 btrfs_stop_workers(&fs_info->workers);
1799 btrfs_stop_workers(&fs_info->endio_workers);
1800 btrfs_stop_workers(&fs_info->endio_write_workers);
1801 btrfs_stop_workers(&fs_info->submit_workers);
1802fail_iput:
1803 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1804 iput(fs_info->btree_inode);
1805fail:
1806 btrfs_close_devices(fs_info->fs_devices);
1807 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1808
1809 kfree(extent_root);
1810 kfree(tree_root);
1811 bdi_destroy(&fs_info->bdi);
1812 kfree(fs_info);
1813 kfree(chunk_root);
1814 kfree(dev_root);
1815 return ERR_PTR(err);
1816}
1817
1818static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
1819{
1820 char b[BDEVNAME_SIZE];
1821
1822 if (uptodate) {
1823 set_buffer_uptodate(bh);
1824 } else {
1825 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
1826 printk(KERN_WARNING "lost page write due to "
1827 "I/O error on %s\n",
1828 bdevname(bh->b_bdev, b));
1829 }
1830 /* note, we dont' set_buffer_write_io_error because we have
1831 * our own ways of dealing with the IO errors
1832 */
1833 clear_buffer_uptodate(bh);
1834 }
1835 unlock_buffer(bh);
1836 put_bh(bh);
1837}
1838
1839int write_all_supers(struct btrfs_root *root)
1840{
1841 struct list_head *cur;
1842 struct list_head *head = &root->fs_info->fs_devices->devices;
1843 struct btrfs_device *dev;
1844 struct btrfs_super_block *sb;
1845 struct btrfs_dev_item *dev_item;
1846 struct buffer_head *bh;
1847 int ret;
1848 int do_barriers;
1849 int max_errors;
1850 int total_errors = 0;
1851 u32 crc;
1852 u64 flags;
1853
1854 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1855 do_barriers = !btrfs_test_opt(root, NOBARRIER);
1856
1857 sb = &root->fs_info->super_for_commit;
1858 dev_item = &sb->dev_item;
1859 list_for_each(cur, head) {
1860 dev = list_entry(cur, struct btrfs_device, dev_list);
1861 if (!dev->bdev) {
1862 total_errors++;
1863 continue;
1864 }
1865 if (!dev->in_fs_metadata || !dev->writeable)
1866 continue;
1867
1868 btrfs_set_stack_device_generation(dev_item, 0);
1869 btrfs_set_stack_device_type(dev_item, dev->type);
1870 btrfs_set_stack_device_id(dev_item, dev->devid);
1871 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
1872 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
1873 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
1874 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
1875 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
1876 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
1877 memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
1878 flags = btrfs_super_flags(sb);
1879 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
1880
1881
1882 crc = ~(u32)0;
1883 crc = btrfs_csum_data(root, (char *)sb + BTRFS_CSUM_SIZE, crc,
1884 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
1885 btrfs_csum_final(crc, sb->csum);
1886
1887 bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET / 4096,
1888 BTRFS_SUPER_INFO_SIZE);
1889
1890 memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
1891 dev->pending_io = bh;
1892
1893 get_bh(bh);
1894 set_buffer_uptodate(bh);
1895 lock_buffer(bh);
1896 bh->b_end_io = btrfs_end_buffer_write_sync;
1897
1898 if (do_barriers && dev->barriers) {
1899 ret = submit_bh(WRITE_BARRIER, bh);
1900 if (ret == -EOPNOTSUPP) {
1901 printk("btrfs: disabling barriers on dev %s\n",
1902 dev->name);
1903 set_buffer_uptodate(bh);
1904 dev->barriers = 0;
1905 get_bh(bh);
1906 lock_buffer(bh);
1907 ret = submit_bh(WRITE, bh);
1908 }
1909 } else {
1910 ret = submit_bh(WRITE, bh);
1911 }
1912 if (ret)
1913 total_errors++;
1914 }
1915 if (total_errors > max_errors) {
1916 printk("btrfs: %d errors while writing supers\n", total_errors);
1917 BUG();
1918 }
1919 total_errors = 0;
1920
1921 list_for_each(cur, head) {
1922 dev = list_entry(cur, struct btrfs_device, dev_list);
1923 if (!dev->bdev)
1924 continue;
1925 if (!dev->in_fs_metadata || !dev->writeable)
1926 continue;
1927
1928 BUG_ON(!dev->pending_io);
1929 bh = dev->pending_io;
1930 wait_on_buffer(bh);
1931 if (!buffer_uptodate(dev->pending_io)) {
1932 if (do_barriers && dev->barriers) {
1933 printk("btrfs: disabling barriers on dev %s\n",
1934 dev->name);
1935 set_buffer_uptodate(bh);
1936 get_bh(bh);
1937 lock_buffer(bh);
1938 dev->barriers = 0;
1939 ret = submit_bh(WRITE, bh);
1940 BUG_ON(ret);
1941 wait_on_buffer(bh);
1942 if (!buffer_uptodate(bh))
1943 total_errors++;
1944 } else {
1945 total_errors++;
1946 }
1947
1948 }
1949 dev->pending_io = NULL;
1950 brelse(bh);
1951 }
1952 if (total_errors > max_errors) {
1953 printk("btrfs: %d errors while writing supers\n", total_errors);
1954 BUG();
1955 }
1956 return 0;
1957}
1958
/*
 * Write the super block to all devices.  @trans is accepted for interface
 * compatibility but is not used; the work is done by write_all_supers(),
 * whose return value is passed through.
 */
int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root)
{
	return write_all_supers(root);
}
1967
1968int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
1969{
1970 radix_tree_delete(&fs_info->fs_roots_radix,
1971 (unsigned long)root->root_key.objectid);
1972 if (root->anon_super.s_dev) {
1973 down_write(&root->anon_super.s_umount);
1974 kill_anon_super(&root->anon_super);
1975 }
1976#if 0
1977 if (root->in_sysfs)
1978 btrfs_sysfs_del_root(root);
1979#endif
1980 if (root->node)
1981 free_extent_buffer(root->node);
1982 if (root->commit_root)
1983 free_extent_buffer(root->commit_root);
1984 if (root->name)
1985 kfree(root->name);
1986 kfree(root);
1987 return 0;
1988}
1989
1990static int del_fs_roots(struct btrfs_fs_info *fs_info)
1991{
1992 int ret;
1993 struct btrfs_root *gang[8];
1994 int i;
1995
1996 while(1) {
1997 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
1998 (void **)gang, 0,
1999 ARRAY_SIZE(gang));
2000 if (!ret)
2001 break;
2002 for (i = 0; i < ret; i++)
2003 btrfs_free_fs_root(fs_info, gang[i]);
2004 }
2005 return 0;
2006}
2007
2008int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2009{
2010 u64 root_objectid = 0;
2011 struct btrfs_root *gang[8];
2012 int i;
2013 int ret;
2014
2015 while (1) {
2016 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2017 (void **)gang, root_objectid,
2018 ARRAY_SIZE(gang));
2019 if (!ret)
2020 break;
2021 for (i = 0; i < ret; i++) {
2022 root_objectid = gang[i]->root_key.objectid;
2023 ret = btrfs_find_dead_roots(fs_info->tree_root,
2024 root_objectid, gang[i]);
2025 BUG_ON(ret);
2026 btrfs_orphan_cleanup(gang[i]);
2027 }
2028 root_objectid++;
2029 }
2030 return 0;
2031}
2032
2033int btrfs_commit_super(struct btrfs_root *root)
2034{
2035 struct btrfs_trans_handle *trans;
2036 int ret;
2037
2038 mutex_lock(&root->fs_info->cleaner_mutex);
2039 btrfs_clean_old_snapshots(root);
2040 mutex_unlock(&root->fs_info->cleaner_mutex);
2041 trans = btrfs_start_transaction(root, 1);
2042 ret = btrfs_commit_transaction(trans, root);
2043 BUG_ON(ret);
2044 /* run commit again to drop the original snapshot */
2045 trans = btrfs_start_transaction(root, 1);
2046 btrfs_commit_transaction(trans, root);
2047 ret = btrfs_write_and_wait_transaction(NULL, root);
2048 BUG_ON(ret);
2049
2050 ret = write_ctree_super(NULL, root);
2051 return ret;
2052}
2053
2054int close_ctree(struct btrfs_root *root)
2055{
2056 struct btrfs_fs_info *fs_info = root->fs_info;
2057 int ret;
2058
2059 fs_info->closing = 1;
2060 smp_mb();
2061
2062 kthread_stop(root->fs_info->transaction_kthread);
2063 kthread_stop(root->fs_info->cleaner_kthread);
2064
2065 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2066 ret = btrfs_commit_super(root);
2067 if (ret) {
2068 printk("btrfs: commit super returns %d\n", ret);
2069 }
2070 }
2071
2072 if (fs_info->delalloc_bytes) {
2073 printk("btrfs: at unmount delalloc count %Lu\n",
2074 fs_info->delalloc_bytes);
2075 }
2076 if (fs_info->total_ref_cache_size) {
2077 printk("btrfs: at umount reference cache size %Lu\n",
2078 fs_info->total_ref_cache_size);
2079 }
2080
2081 if (fs_info->extent_root->node)
2082 free_extent_buffer(fs_info->extent_root->node);
2083
2084 if (fs_info->tree_root->node)
2085 free_extent_buffer(fs_info->tree_root->node);
2086
2087 if (root->fs_info->chunk_root->node);
2088 free_extent_buffer(root->fs_info->chunk_root->node);
2089
2090 if (root->fs_info->dev_root->node);
2091 free_extent_buffer(root->fs_info->dev_root->node);
2092
2093 btrfs_free_block_groups(root->fs_info);
2094
2095 del_fs_roots(fs_info);
2096
2097 iput(fs_info->btree_inode);
2098
2099 btrfs_stop_workers(&fs_info->fixup_workers);
2100 btrfs_stop_workers(&fs_info->delalloc_workers);
2101 btrfs_stop_workers(&fs_info->workers);
2102 btrfs_stop_workers(&fs_info->endio_workers);
2103 btrfs_stop_workers(&fs_info->endio_write_workers);
2104 btrfs_stop_workers(&fs_info->submit_workers);
2105
2106#if 0
2107 while(!list_empty(&fs_info->hashers)) {
2108 struct btrfs_hasher *hasher;
2109 hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
2110 hashers);
2111 list_del(&hasher->hashers);
2112 crypto_free_hash(&fs_info->hash_tfm);
2113 kfree(hasher);
2114 }
2115#endif
2116 btrfs_close_devices(fs_info->fs_devices);
2117 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2118
2119 bdi_destroy(&fs_info->bdi);
2120
2121 kfree(fs_info->extent_root);
2122 kfree(fs_info->tree_root);
2123 kfree(fs_info->chunk_root);
2124 kfree(fs_info->dev_root);
2125 return 0;
2126}
2127
2128int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
2129{
2130 int ret;
2131 struct inode *btree_inode = buf->first_page->mapping->host;
2132
2133 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
2134 if (!ret)
2135 return ret;
2136
2137 ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
2138 parent_transid);
2139 return !ret;
2140}
2141
2142int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
2143{
2144 struct inode *btree_inode = buf->first_page->mapping->host;
2145 return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
2146 buf);
2147}
2148
2149void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2150{
2151 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2152 u64 transid = btrfs_header_generation(buf);
2153 struct inode *btree_inode = root->fs_info->btree_inode;
2154
2155 WARN_ON(!btrfs_tree_locked(buf));
2156 if (transid != root->fs_info->generation) {
2157 printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
2158 (unsigned long long)buf->start,
2159 transid, root->fs_info->generation);
2160 WARN_ON(1);
2161 }
2162 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
2163}
2164
2165void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2166{
2167 /*
2168 * looks as though older kernels can get into trouble with
2169 * this code, they end up stuck in balance_dirty_pages forever
2170 */
2171 struct extent_io_tree *tree;
2172 u64 num_dirty;
2173 u64 start = 0;
2174 unsigned long thresh = 32 * 1024 * 1024;
2175 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2176
2177 if (current_is_pdflush() || current->flags & PF_MEMALLOC)
2178 return;
2179
2180 num_dirty = count_range_bits(tree, &start, (u64)-1,
2181 thresh, EXTENT_DIRTY);
2182 if (num_dirty > thresh) {
2183 balance_dirty_pages_ratelimited_nr(
2184 root->fs_info->btree_inode->i_mapping, 1);
2185 }
2186 return;
2187}
2188
2189int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2190{
2191 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2192 int ret;
2193 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2194 if (ret == 0) {
2195 buf->flags |= EXTENT_UPTODATE;
2196 }
2197 return ret;
2198}
2199
2200int btree_lock_page_hook(struct page *page)
2201{
2202 struct inode *inode = page->mapping->host;
2203 struct btrfs_root *root = BTRFS_I(inode)->root;
2204 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2205 struct extent_buffer *eb;
2206 unsigned long len;
2207 u64 bytenr = page_offset(page);
2208
2209 if (page->private == EXTENT_PAGE_PRIVATE)
2210 goto out;
2211
2212 len = page->private >> 2;
2213 eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
2214 if (!eb)
2215 goto out;
2216
2217 btrfs_tree_lock(eb);
2218 spin_lock(&root->fs_info->hash_lock);
2219 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2220 spin_unlock(&root->fs_info->hash_lock);
2221 btrfs_tree_unlock(eb);
2222 free_extent_buffer(eb);
2223out:
2224 lock_page(page);
2225 return 0;
2226}
2227
2228static struct extent_io_ops btree_extent_io_ops = {
2229 .write_cache_pages_lock_hook = btree_lock_page_hook,
2230 .readpage_end_io_hook = btree_readpage_end_io_hook,
2231 .submit_bio_hook = btree_submit_bio_hook,
2232 /* note we're sharing with inode.c for the merge bio hook */
2233 .merge_bio_hook = btrfs_merge_bio_hook,
2234};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 000000000000..717e94811e4e
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,89 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __DISKIO__
20#define __DISKIO__
21
22#define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
23#define BTRFS_SUPER_INFO_SIZE 4096
24struct btrfs_device;
25struct btrfs_fs_devices;
26
27struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
28 u32 blocksize, u64 parent_transid);
29int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
30 u64 parent_transid);
31struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
32 u64 bytenr, u32 blocksize);
33int clean_tree_block(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, struct extent_buffer *buf);
35struct btrfs_root *open_ctree(struct super_block *sb,
36 struct btrfs_fs_devices *fs_devices,
37 char *options);
38int close_ctree(struct btrfs_root *root);
39int write_ctree_super(struct btrfs_trans_handle *trans,
40 struct btrfs_root *root);
41int btrfs_commit_super(struct btrfs_root *root);
42struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
43 u64 bytenr, u32 blocksize);
44struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
45 u64 root_objectid);
46struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
47 struct btrfs_key *location,
48 const char *name, int namelen);
49struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
50 struct btrfs_key *location);
51struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
52 struct btrfs_key *location);
53int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
54int btrfs_insert_dev_radix(struct btrfs_root *root,
55 struct block_device *bdev,
56 u64 device_id,
57 u64 block_start,
58 u64 num_blocks);
59void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
60int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
61void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
62int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
63int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
64int wait_on_tree_block_writeback(struct btrfs_root *root,
65 struct extent_buffer *buf);
66int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
67u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
68void btrfs_csum_final(u32 crc, char *result);
69int btrfs_open_device(struct btrfs_device *dev);
70int btrfs_verify_block_csum(struct btrfs_root *root,
71 struct extent_buffer *buf);
72int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
73 int metadata);
74int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
75 int rw, struct bio *bio, int mirror_num,
76 unsigned long bio_flags,
77 extent_submit_bio_hook_t *submit_bio_start,
78 extent_submit_bio_hook_t *submit_bio_done);
79
80int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
81unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
82int btrfs_write_tree_block(struct extent_buffer *buf);
83int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
84int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
85 struct btrfs_fs_info *fs_info);
86int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
87 struct btrfs_fs_info *fs_info);
88int btree_lock_page_hook(struct page *page);
89#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
new file mode 100644
index 000000000000..48b82cd7583c
--- /dev/null
+++ b/fs/btrfs/export.c
@@ -0,0 +1,201 @@
1#include <linux/fs.h>
2#include <linux/types.h>
3#include "ctree.h"
4#include "disk-io.h"
5#include "btrfs_inode.h"
6#include "print-tree.h"
7#include "export.h"
8#include "compat.h"
9
10#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, parent_objectid)/4)
11#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, parent_root_objectid)/4)
12#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid)/4)
13
14static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
15 int connectable)
16{
17 struct btrfs_fid *fid = (struct btrfs_fid *)fh;
18 struct inode *inode = dentry->d_inode;
19 int len = *max_len;
20 int type;
21
22 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
23 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
24 return 255;
25
26 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
27 type = FILEID_BTRFS_WITHOUT_PARENT;
28
29 fid->objectid = BTRFS_I(inode)->location.objectid;
30 fid->root_objectid = BTRFS_I(inode)->root->objectid;
31 fid->gen = inode->i_generation;
32
33 if (connectable && !S_ISDIR(inode->i_mode)) {
34 struct inode *parent;
35 u64 parent_root_id;
36
37 spin_lock(&dentry->d_lock);
38
39 parent = dentry->d_parent->d_inode;
40 fid->parent_objectid = BTRFS_I(parent)->location.objectid;
41 fid->parent_gen = parent->i_generation;
42 parent_root_id = BTRFS_I(parent)->root->objectid;
43
44 spin_unlock(&dentry->d_lock);
45
46 if (parent_root_id != fid->root_objectid) {
47 fid->parent_root_objectid = parent_root_id;
48 len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
49 type = FILEID_BTRFS_WITH_PARENT_ROOT;
50 } else {
51 len = BTRFS_FID_SIZE_CONNECTABLE;
52 type = FILEID_BTRFS_WITH_PARENT;
53 }
54 }
55
56 *max_len = len;
57 return type;
58}
59
60static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
61 u64 root_objectid, u32 generation)
62{
63 struct btrfs_root *root;
64 struct inode *inode;
65 struct btrfs_key key;
66
67 key.objectid = root_objectid;
68 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
69 key.offset = (u64)-1;
70
71 root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
72 if (IS_ERR(root))
73 return ERR_CAST(root);
74
75 key.objectid = objectid;
76 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
77 key.offset = 0;
78
79 inode = btrfs_iget(sb, &key, root, NULL);
80 if (IS_ERR(inode))
81 return (void *)inode;
82
83 if (generation != inode->i_generation) {
84 iput(inode);
85 return ERR_PTR(-ESTALE);
86 }
87
88 return d_obtain_alias(inode);
89}
90
91static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
92 int fh_len, int fh_type)
93{
94 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
95 u64 objectid, root_objectid;
96 u32 generation;
97
98 if (fh_type == FILEID_BTRFS_WITH_PARENT) {
99 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE)
100 return NULL;
101 root_objectid = fid->root_objectid;
102 } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
103 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
104 return NULL;
105 root_objectid = fid->parent_root_objectid;
106 } else
107 return NULL;
108
109 objectid = fid->parent_objectid;
110 generation = fid->parent_gen;
111
112 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
113}
114
115static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
116 int fh_len, int fh_type)
117{
118 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
119 u64 objectid, root_objectid;
120 u32 generation;
121
122 if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
123 fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
124 (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
125 fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
126 (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
127 fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
128 return NULL;
129
130 objectid = fid->objectid;
131 root_objectid = fid->root_objectid;
132 generation = fid->gen;
133
134 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
135}
136
137static struct dentry *btrfs_get_parent(struct dentry *child)
138{
139 struct inode *dir = child->d_inode;
140 struct btrfs_root *root = BTRFS_I(dir)->root;
141 struct btrfs_key key;
142 struct btrfs_path *path;
143 struct extent_buffer *leaf;
144 int slot;
145 u64 objectid;
146 int ret;
147
148 path = btrfs_alloc_path();
149
150 key.objectid = dir->i_ino;
151 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
152 key.offset = (u64)-1;
153
154 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
155 if (ret < 0) {
156 /* Error */
157 btrfs_free_path(path);
158 return ERR_PTR(ret);
159 }
160 leaf = path->nodes[0];
161 slot = path->slots[0];
162 if (ret) {
163 /* btrfs_search_slot() returns the slot where we'd want to
164 insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
165 The _real_ backref, telling us what the parent inode
166 _actually_ is, will be in the slot _before_ the one
167 that btrfs_search_slot() returns. */
168 if (!slot) {
169 /* Unless there is _no_ key in the tree before... */
170 btrfs_free_path(path);
171 return ERR_PTR(-EIO);
172 }
173 slot--;
174 }
175
176 btrfs_item_key_to_cpu(leaf, &key, slot);
177 btrfs_free_path(path);
178
179 if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
180 return ERR_PTR(-EINVAL);
181
182 objectid = key.offset;
183
184 /* If we are already at the root of a subvol, return the real root */
185 if (objectid == dir->i_ino)
186 return dget(dir->i_sb->s_root);
187
188 /* Build a new key for the inode item */
189 key.objectid = objectid;
190 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
191 key.offset = 0;
192
193 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
194}
195
196const struct export_operations btrfs_export_ops = {
197 .encode_fh = btrfs_encode_fh,
198 .fh_to_dentry = btrfs_fh_to_dentry,
199 .fh_to_parent = btrfs_fh_to_parent,
200 .get_parent = btrfs_get_parent,
201};
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
new file mode 100644
index 000000000000..074348a95841
--- /dev/null
+++ b/fs/btrfs/export.h
@@ -0,0 +1,19 @@
1#ifndef BTRFS_EXPORT_H
2#define BTRFS_EXPORT_H
3
4#include <linux/exportfs.h>
5
6extern const struct export_operations btrfs_export_ops;
7
/*
 * btrfs file identifier.  The struct is packed, so the members are laid
 * out back to back with no padding between the u64 and u32 fields.
 *
 * NOTE(review): presumably this is the payload btrfs_encode_fh() stores in
 * a file handle, with the parent_* members only filled in when the parent
 * is encoded as well -- confirm against export.c.
 */
8struct btrfs_fid {
9 u64 objectid;
10 u64 root_objectid;
11 u32 gen;
12
13 u64 parent_objectid;
14 u32 parent_gen;
15
16 u64 parent_root_objectid;
17} __attribute__ ((packed));
18
19#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
new file mode 100644
index 000000000000..ee73efe75423
--- /dev/null
+++ b/fs/btrfs/extent-tree.c
@@ -0,0 +1,5918 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/pagemap.h>
20#include <linux/writeback.h>
21#include <linux/blkdev.h>
22#include "hash.h"
23#include "crc32c.h"
24#include "ctree.h"
25#include "disk-io.h"
26#include "print-tree.h"
27#include "transaction.h"
28#include "volumes.h"
29#include "locking.h"
30#include "ref-cache.h"
31
32#define PENDING_EXTENT_INSERT 0
33#define PENDING_EXTENT_DELETE 1
34#define PENDING_BACKREF_UPDATE 2
35
/*
 * one queued operation against the extent tree, linked on a list until
 * it is processed by the batch helpers in this file
 */
36struct pending_extent_op {
37 int type; /* presumably one of the PENDING_* values above -- confirm */
38 u64 bytenr; /* first byte of the extent (key objectid) */
39 u64 num_bytes; /* length of the extent (extent item key offset) */
40 u64 parent; /* key offset for the new/updated backref */
41 u64 orig_parent; /* key offset of the existing backref to look up */
42 u64 generation; /* generation written into the backref */
43 u64 orig_generation; /* generation the existing backref must carry */
44 int level; /* stored in / compared with the backref objectid */
45 struct list_head list; /* link in the pending list */
46 int del; /* scratch: insert_extents() counts inserted items here,
            free_extents() keeps the pin_down_bytes() result here */
47};
48
49static int finish_current_insert(struct btrfs_trans_handle *trans, struct
50 btrfs_root *extent_root, int all);
51static int del_pending_extents(struct btrfs_trans_handle *trans, struct
52 btrfs_root *extent_root, int all);
53static struct btrfs_block_group_cache *
54__btrfs_find_block_group(struct btrfs_root *root,
55 struct btrfs_block_group_cache *hint,
56 u64 search_start, int data, int owner);
57static int pin_down_bytes(struct btrfs_trans_handle *trans,
58 struct btrfs_root *root,
59 u64 bytenr, u64 num_bytes, int is_data);
60static int update_block_group(struct btrfs_trans_handle *trans,
61 struct btrfs_root *root,
62 u64 bytenr, u64 num_bytes, int alloc,
63 int mark_free);
64
65static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
66{
67 return (cache->flags & bits) == bits;
68}
69
70/*
71 * this adds the block group to the fs_info rb tree for the block group
72 * cache
73 */
74int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
75 struct btrfs_block_group_cache *block_group)
76{
77 struct rb_node **p;
78 struct rb_node *parent = NULL;
79 struct btrfs_block_group_cache *cache;
80
81 spin_lock(&info->block_group_cache_lock);
82 p = &info->block_group_cache_tree.rb_node;
83
84 while (*p) {
85 parent = *p;
86 cache = rb_entry(parent, struct btrfs_block_group_cache,
87 cache_node);
88 if (block_group->key.objectid < cache->key.objectid) {
89 p = &(*p)->rb_left;
90 } else if (block_group->key.objectid > cache->key.objectid) {
91 p = &(*p)->rb_right;
92 } else {
93 spin_unlock(&info->block_group_cache_lock);
94 return -EEXIST;
95 }
96 }
97
98 rb_link_node(&block_group->cache_node, parent, p);
99 rb_insert_color(&block_group->cache_node,
100 &info->block_group_cache_tree);
101 spin_unlock(&info->block_group_cache_lock);
102
103 return 0;
104}
105
106/*
107 * This will return the block group at or after bytenr if contains is 0, else
108 * it will return the block group that contains the bytenr
109 */
110static struct btrfs_block_group_cache *
111block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
112 int contains)
113{
114 struct btrfs_block_group_cache *cache, *ret = NULL;
115 struct rb_node *n;
116 u64 end, start;
117
118 spin_lock(&info->block_group_cache_lock);
119 n = info->block_group_cache_tree.rb_node;
120
121 while (n) {
122 cache = rb_entry(n, struct btrfs_block_group_cache,
123 cache_node);
124 end = cache->key.objectid + cache->key.offset - 1;
125 start = cache->key.objectid;
126
127 if (bytenr < start) {
128 if (!contains && (!ret || start < ret->key.objectid))
129 ret = cache;
130 n = n->rb_left;
131 } else if (bytenr > start) {
132 if (contains && bytenr <= end) {
133 ret = cache;
134 break;
135 }
136 n = n->rb_right;
137 } else {
138 ret = cache;
139 break;
140 }
141 }
142 spin_unlock(&info->block_group_cache_lock);
143
144 return ret;
145}
146
147/*
148 * this is only called by cache_block_group, since we could have freed extents
149 * we need to check the pinned_extents for any extents that can't be used yet
150 * since their free space will be released as soon as the transaction commits.
151 */
152static int add_new_free_space(struct btrfs_block_group_cache *block_group,
153 struct btrfs_fs_info *info, u64 start, u64 end)
154{
155 u64 extent_start, extent_end, size;
156 int ret;
157
158 mutex_lock(&info->pinned_mutex);
159 while (start < end) {
160 ret = find_first_extent_bit(&info->pinned_extents, start,
161 &extent_start, &extent_end,
162 EXTENT_DIRTY);
163 if (ret)
164 break;
165
166 if (extent_start == start) {
167 start = extent_end + 1;
168 } else if (extent_start > start && extent_start < end) {
169 size = extent_start - start;
170 ret = btrfs_add_free_space_lock(block_group, start,
171 size);
172 BUG_ON(ret);
173 start = extent_end + 1;
174 } else {
175 break;
176 }
177 }
178
179 if (start < end) {
180 size = end - start;
181 ret = btrfs_add_free_space_lock(block_group, start, size);
182 BUG_ON(ret);
183 }
184 mutex_unlock(&info->pinned_mutex);
185
186 return 0;
187}
188
189static int cache_block_group(struct btrfs_root *root,
190 struct btrfs_block_group_cache *block_group)
191{
192 struct btrfs_path *path;
193 int ret = 0;
194 struct btrfs_key key;
195 struct extent_buffer *leaf;
196 int slot;
197 u64 last = 0;
198 u64 first_free;
199 int found = 0;
200
201 if (!block_group)
202 return 0;
203
204 root = root->fs_info->extent_root;
205
206 if (block_group->cached)
207 return 0;
208
209 path = btrfs_alloc_path();
210 if (!path)
211 return -ENOMEM;
212
213 path->reada = 2;
214 /*
215 * we get into deadlocks with paths held by callers of this function.
216 * since the alloc_mutex is protecting things right now, just
217 * skip the locking here
218 */
219 path->skip_locking = 1;
220 first_free = max_t(u64, block_group->key.objectid,
221 BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
222 key.objectid = block_group->key.objectid;
223 key.offset = 0;
224 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
225 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
226 if (ret < 0)
227 goto err;
228 ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
229 if (ret < 0)
230 goto err;
231 if (ret == 0) {
232 leaf = path->nodes[0];
233 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
234 if (key.objectid + key.offset > first_free)
235 first_free = key.objectid + key.offset;
236 }
237 while(1) {
238 leaf = path->nodes[0];
239 slot = path->slots[0];
240 if (slot >= btrfs_header_nritems(leaf)) {
241 ret = btrfs_next_leaf(root, path);
242 if (ret < 0)
243 goto err;
244 if (ret == 0)
245 continue;
246 else
247 break;
248 }
249 btrfs_item_key_to_cpu(leaf, &key, slot);
250 if (key.objectid < block_group->key.objectid)
251 goto next;
252
253 if (key.objectid >= block_group->key.objectid +
254 block_group->key.offset)
255 break;
256
257 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
258 if (!found) {
259 last = first_free;
260 found = 1;
261 }
262
263 add_new_free_space(block_group, root->fs_info, last,
264 key.objectid);
265
266 last = key.objectid + key.offset;
267 }
268next:
269 path->slots[0]++;
270 }
271
272 if (!found)
273 last = first_free;
274
275 add_new_free_space(block_group, root->fs_info, last,
276 block_group->key.objectid +
277 block_group->key.offset);
278
279 block_group->cached = 1;
280 ret = 0;
281err:
282 btrfs_free_path(path);
283 return ret;
284}
285
286/*
287 * return the block group that starts at or after bytenr
288 */
289struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
290 btrfs_fs_info *info,
291 u64 bytenr)
292{
293 struct btrfs_block_group_cache *cache;
294
295 cache = block_group_cache_tree_search(info, bytenr, 0);
296
297 return cache;
298}
299
300/*
301 * return the block group that contains teh given bytenr
302 */
303struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
304 btrfs_fs_info *info,
305 u64 bytenr)
306{
307 struct btrfs_block_group_cache *cache;
308
309 cache = block_group_cache_tree_search(info, bytenr, 1);
310
311 return cache;
312}
313
314static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
315 u64 flags)
316{
317 struct list_head *head = &info->space_info;
318 struct list_head *cur;
319 struct btrfs_space_info *found;
320 list_for_each(cur, head) {
321 found = list_entry(cur, struct btrfs_space_info, list);
322 if (found->flags == flags)
323 return found;
324 }
325 return NULL;
326}
327
328static u64 div_factor(u64 num, int factor)
329{
330 if (factor == 10)
331 return num;
332 num *= factor;
333 do_div(num, 10);
334 return num;
335}
336
/*
 * pick a block group to allocate from.
 *
 * A group qualifies when its flags contain every bit in 'data' and
 * used + pinned + reserved is below div_factor(group size, factor).
 * factor is 9 (i.e. 9/10 of the group) when 'data' has the METADATA bit
 * set and 10 otherwise.
 *
 * Candidates are tried in this order:
 *   1) the first group at or after search_start (when search_start != 0)
 *   2) the hint group
 *   3) every group found by walking forward from 'last'
 * When the walk runs out of groups it is repeated once from search_start,
 * and, if factor was below 10, one more time with factor raised to 10.
 *
 * Returns the chosen group, or NULL if nothing qualified.  The 'owner'
 * argument is not used in this function.
 */
337static struct btrfs_block_group_cache *
338__btrfs_find_block_group(struct btrfs_root *root,
339 struct btrfs_block_group_cache *hint,
340 u64 search_start, int data, int owner)
341{
342 struct btrfs_block_group_cache *cache;
343 struct btrfs_block_group_cache *found_group = NULL;
344 struct btrfs_fs_info *info = root->fs_info;
345 u64 used;
346 u64 last = 0;
347 u64 free_check;
348 int full_search = 0;
349 int factor = 10;
350 int wrapped = 0;
351
352 if (data & BTRFS_BLOCK_GROUP_METADATA)
353 factor = 9;
354
/* first choice: the group at or after search_start */
355 if (search_start) {
356 struct btrfs_block_group_cache *shint;
357 shint = btrfs_lookup_first_block_group(info, search_start);
358 if (shint && block_group_bits(shint, data)) {
359 spin_lock(&shint->lock);
360 used = btrfs_block_group_used(&shint->item);
361 if (used + shint->pinned + shint->reserved <
362 div_factor(shint->key.offset, factor)) {
363 spin_unlock(&shint->lock);
364 return shint;
365 }
366 spin_unlock(&shint->lock);
367 }
368 }
/* second choice: the hint; if it is too full, scan from its end */
369 if (hint && block_group_bits(hint, data)) {
370 spin_lock(&hint->lock);
371 used = btrfs_block_group_used(&hint->item);
372 if (used + hint->pinned + hint->reserved <
373 div_factor(hint->key.offset, factor)) {
374 spin_unlock(&hint->lock);
375 return hint;
376 }
377 spin_unlock(&hint->lock);
378 last = hint->key.objectid + hint->key.offset;
379 } else {
380 if (hint)
381 last = max(hint->key.objectid, search_start);
382 else
383 last = search_start;
384 }
385again:
/* walk the groups in order, starting with the one at or after 'last' */
386 while (1) {
387 cache = btrfs_lookup_first_block_group(root->fs_info, last);
388 if (!cache)
389 break;
390
391 spin_lock(&cache->lock);
392 last = cache->key.objectid + cache->key.offset;
393 used = btrfs_block_group_used(&cache->item);
394
395 if (block_group_bits(cache, data)) {
396 free_check = div_factor(cache->key.offset, factor);
397 if (used + cache->pinned + cache->reserved <
398 free_check) {
399 found_group = cache;
400 spin_unlock(&cache->lock);
401 goto found;
402 }
403 }
404 spin_unlock(&cache->lock);
405 cond_resched();
406 }
/*
 * ran off the end of the groups: retry once from search_start.
 * NOTE(review): the retry restarts at search_start rather than 0, so
 * groups below search_start are not visited here -- confirm intended.
 */
407 if (!wrapped) {
408 last = search_start;
409 wrapped = 1;
410 goto again;
411 }
/* last resort: accept groups up to completely full (factor 10) */
412 if (!full_search && factor < 10) {
413 last = search_start;
414 full_search = 1;
415 factor = 10;
416 goto again;
417 }
418found:
419 return found_group;
420}
421
422struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
423 struct btrfs_block_group_cache
424 *hint, u64 search_start,
425 int data, int owner)
426{
427
428 struct btrfs_block_group_cache *ret;
429 ret = __btrfs_find_block_group(root, hint, search_start, data, owner);
430 return ret;
431}
432
433/* simple helper to search for an existing extent at a given offset */
434int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
435{
436 int ret;
437 struct btrfs_key key;
438 struct btrfs_path *path;
439
440 path = btrfs_alloc_path();
441 BUG_ON(!path);
442 key.objectid = start;
443 key.offset = len;
444 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
445 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
446 0, 0);
447 btrfs_free_path(path);
448 return ret;
449}
450
451/*
452 * Back reference rules. Back refs have three main goals:
453 *
454 * 1) differentiate between all holders of references to an extent so that
455 * when a reference is dropped we can make sure it was a valid reference
456 * before freeing the extent.
457 *
458 * 2) Provide enough information to quickly find the holders of an extent
459 * if we notice a given block is corrupted or bad.
460 *
461 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
462 * maintenance. This is actually the same as #2, but with a slightly
463 * different use case.
464 *
465 * File extents can be referenced by:
466 *
467 * - multiple snapshots, subvolumes, or different generations in one subvol
468 * - different files inside a single subvolume
469 * - different offsets inside a file (bookend extents in file.c)
470 *
471 * The extent ref structure has fields for:
472 *
473 * - Objectid of the subvolume root
474 * - Generation number of the tree holding the reference
475 * - objectid of the file holding the reference
476 * - number of references held by the parent node (always 1 for tree blocks)
477 *
478 * Btree leaf may hold multiple references to a file extent. In most cases,
479 * these references are from same file and the corresponding offsets inside
480 * the file are close together.
481 *
482 * When a file extent is allocated the fields are filled in:
483 * (root_key.objectid, trans->transid, inode objectid, 1)
484 *
485 * When a leaf is cow'd new references are added for every file extent found
486 * in the leaf. It looks similar to the create case, but trans->transid will
487 * be different when the block is cow'd.
488 *
489 * (root_key.objectid, trans->transid, inode objectid,
490 * number of references in the leaf)
491 *
492 * When a file extent is removed either during snapshot deletion or
493 * file truncation, we find the corresponding back reference and check
494 * the following fields:
495 *
496 * (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
497 * inode objectid)
498 *
499 * Btree extents can be referenced by:
500 *
501 * - Different subvolumes
502 * - Different generations of the same subvolume
503 *
504 * When a tree block is created, back references are inserted:
505 *
506 * (root->root_key.objectid, trans->transid, level, 1)
507 *
508 * When a tree block is cow'd, new back references are added for all the
509 * blocks it points to. If the tree block isn't in reference counted root,
510 * the old back references are removed. These new back references are of
511 * the form (trans->transid will have increased since creation):
512 *
513 * (root->root_key.objectid, trans->transid, level, 1)
514 *
515 * When a backref is being deleted, the following fields are checked:
516 *
517 * if backref was for a tree root:
518 * (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
519 * else
520 * (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
521 *
522 * Back Reference Key composing:
523 *
524 * The key objectid corresponds to the first byte in the extent, the key
525 * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
526 * byte of the parent extent. If an extent is a tree root, the key offset is set
527 * to the key objectid.
528 */
529
530static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
531 struct btrfs_root *root,
532 struct btrfs_path *path,
533 u64 bytenr, u64 parent,
534 u64 ref_root, u64 ref_generation,
535 u64 owner_objectid, int del)
536{
537 struct btrfs_key key;
538 struct btrfs_extent_ref *ref;
539 struct extent_buffer *leaf;
540 u64 ref_objectid;
541 int ret;
542
543 key.objectid = bytenr;
544 key.type = BTRFS_EXTENT_REF_KEY;
545 key.offset = parent;
546
547 ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
548 if (ret < 0)
549 goto out;
550 if (ret > 0) {
551 ret = -ENOENT;
552 goto out;
553 }
554
555 leaf = path->nodes[0];
556 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
557 ref_objectid = btrfs_ref_objectid(leaf, ref);
558 if (btrfs_ref_root(leaf, ref) != ref_root ||
559 btrfs_ref_generation(leaf, ref) != ref_generation ||
560 (ref_objectid != owner_objectid &&
561 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
562 ret = -EIO;
563 WARN_ON(1);
564 goto out;
565 }
566 ret = 0;
567out:
568 return ret;
569}
570
571/*
572 * updates all the backrefs that are pending on update_list for the
573 * extent_root
 *
 * For every pending_extent_op on the list the existing ref item keyed
 * (bytenr, EXTENT_REF, orig_parent) is located, checked against the
 * extent root's objectid, op->orig_generation and op->level, re-keyed to
 * (bytenr, EXTENT_REF, parent) and given op->generation.  The op is then
 * unlinked, its byte range is unlocked in info->extent_ins and the op is
 * freed.
 *
 * After finishing an op the rest of the current leaf is scanned for a ref
 * item of the next op so that a new tree search can be avoided; if none is
 * found the path is released and a fresh search is done.
 *
 * Always returns 0: a failed search, a mismatching ref or a failed key
 * update is a BUG.  The first op is taken from the list without an
 * emptiness check -- presumably callers only pass a non-empty list;
 * verify against the caller.
574 */
575static int noinline update_backrefs(struct btrfs_trans_handle *trans,
576 struct btrfs_root *extent_root,
577 struct btrfs_path *path,
578 struct list_head *update_list)
579{
580 struct btrfs_key key;
581 struct btrfs_extent_ref *ref;
582 struct btrfs_fs_info *info = extent_root->fs_info;
583 struct pending_extent_op *op;
584 struct extent_buffer *leaf;
585 int ret = 0;
586 struct list_head *cur = update_list->next;
587 u64 ref_objectid;
588 u64 ref_root = extent_root->root_key.objectid;
589
590 op = list_entry(cur, struct pending_extent_op, list);
591
592search:
/* look up the existing backref of the current op by its old parent */
593 key.objectid = op->bytenr;
594 key.type = BTRFS_EXTENT_REF_KEY;
595 key.offset = op->orig_parent;
596
597 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
598 BUG_ON(ret);
599
600 leaf = path->nodes[0];
601
602loop:
/* path->slots[0] points at the ref item for the current op */
603 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
604
605 ref_objectid = btrfs_ref_objectid(leaf, ref);
606
607 if (btrfs_ref_root(leaf, ref) != ref_root ||
608 btrfs_ref_generation(leaf, ref) != op->orig_generation ||
609 (ref_objectid != op->level &&
610 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
611 printk(KERN_ERR "couldn't find %Lu, parent %Lu, root %Lu, "
612 "owner %u\n", op->bytenr, op->orig_parent,
613 ref_root, op->level);
614 btrfs_print_leaf(extent_root, leaf);
615 BUG();
616 }
617
/* move the ref to its new parent and stamp the new generation */
618 key.objectid = op->bytenr;
619 key.offset = op->parent;
620 key.type = BTRFS_EXTENT_REF_KEY;
621 ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
622 BUG_ON(ret);
623 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
624 btrfs_set_ref_generation(leaf, ref, op->generation);
625
/* advance the cursor before the op is unlinked and freed */
626 cur = cur->next;
627
628 list_del_init(&op->list);
629 unlock_extent(&info->extent_ins, op->bytenr,
630 op->bytenr + op->num_bytes - 1, GFP_NOFS);
631 kfree(op);
632
633 if (cur == update_list) {
634 btrfs_mark_buffer_dirty(path->nodes[0]);
635 btrfs_release_path(extent_root, path);
636 goto out;
637 }
638
639 op = list_entry(cur, struct pending_extent_op, list);
640
/* try to find the next op's ref further along in the same leaf */
641 path->slots[0]++;
642 while (path->slots[0] < btrfs_header_nritems(leaf)) {
643 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
644 if (key.objectid == op->bytenr &&
645 key.type == BTRFS_EXTENT_REF_KEY)
646 goto loop;
647 path->slots[0]++;
648 }
649
/* not in this leaf: write it back and search the tree again */
650 btrfs_mark_buffer_dirty(path->nodes[0]);
651 btrfs_release_path(extent_root, path);
652 goto search;
653
654out:
655 return 0;
656}
657
/*
 * batch-insert the extent items and back references for the nr pending
 * ops on insert_list.
 *
 * Two keys are built per op, in list order: an EXTENT_ITEM keyed
 * (bytenr, num_bytes) followed by an EXTENT_REF keyed (bytenr, parent).
 * They are handed to btrfs_insert_some_items() in as large a run as it
 * accepts; its return value is used as the number of items it inserted.
 * Each inserted extent item gets a ref count of 1, each inserted ref gets
 * the extent root's objectid, op->generation, op->level and one reference.
 *
 * op->del counts how many of an op's two items have been inserted; once it
 * reaches 2 the op's range is unlocked in info->extent_ins and the op is
 * unlinked and freed.
 *
 * Returns 0 on success or -ENOMEM if the key/size arrays can't be
 * allocated.  A failed insert is a BUG.
 */
658static int noinline insert_extents(struct btrfs_trans_handle *trans,
659 struct btrfs_root *extent_root,
660 struct btrfs_path *path,
661 struct list_head *insert_list, int nr)
662{
663 struct btrfs_key *keys;
664 u32 *data_size;
665 struct pending_extent_op *op;
666 struct extent_buffer *leaf;
667 struct list_head *cur = insert_list->next;
668 struct btrfs_fs_info *info = extent_root->fs_info;
669 u64 ref_root = extent_root->root_key.objectid;
670 int i = 0, last = 0, ret;
671 int total = nr * 2; /* two items per op: extent item + backref */
672
673 if (!nr)
674 return 0;
675
676 keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
677 if (!keys)
678 return -ENOMEM;
679
680 data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
681 if (!data_size) {
682 kfree(keys);
683 return -ENOMEM;
684 }
685
/* lay out the keys and item sizes: even index = item, odd = backref */
686 list_for_each_entry(op, insert_list, list) {
687 keys[i].objectid = op->bytenr;
688 keys[i].offset = op->num_bytes;
689 keys[i].type = BTRFS_EXTENT_ITEM_KEY;
690 data_size[i] = sizeof(struct btrfs_extent_item);
691 i++;
692
693 keys[i].objectid = op->bytenr;
694 keys[i].offset = op->parent;
695 keys[i].type = BTRFS_EXTENT_REF_KEY;
696 data_size[i] = sizeof(struct btrfs_extent_ref);
697 i++;
698 }
699
700 op = list_entry(cur, struct pending_extent_op, list);
701 i = 0;
702 while (i < total) {
703 int c;
704 ret = btrfs_insert_some_items(trans, extent_root, path,
705 keys+i, data_size+i, total-i);
706 BUG_ON(ret < 0);
707
/* in "last" mode only the single trailing key is being inserted */
708 if (last && ret > 1)
709 BUG();
710
711 leaf = path->nodes[0];
712 for (c = 0; c < ret; c++) {
713 int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
714
715 /*
716 * if the first item we inserted was a backref, then
717 * the EXTENT_ITEM will be the odd c's, else it will
718 * be the even c's
719 */
720 if ((ref_first && (c % 2)) ||
721 (!ref_first && !(c % 2))) {
722 struct btrfs_extent_item *itm;
723
724 itm = btrfs_item_ptr(leaf, path->slots[0] + c,
725 struct btrfs_extent_item);
726 btrfs_set_extent_refs(path->nodes[0], itm, 1);
727 op->del++;
728 } else {
729 struct btrfs_extent_ref *ref;
730
731 ref = btrfs_item_ptr(leaf, path->slots[0] + c,
732 struct btrfs_extent_ref);
733 btrfs_set_ref_root(leaf, ref, ref_root);
734 btrfs_set_ref_generation(leaf, ref,
735 op->generation);
736 btrfs_set_ref_objectid(leaf, ref, op->level);
737 btrfs_set_ref_num_refs(leaf, ref, 1);
738 op->del++;
739 }
740
741 /*
742 * using del to see when its ok to free up the
743 * pending_extent_op. In the case where we insert the
744 * last item on the list in order to help do batching
745 * we need to not free the extent op until we actually
746 * insert the extent_item
747 */
748 if (op->del == 2) {
749 unlock_extent(&info->extent_ins, op->bytenr,
750 op->bytenr + op->num_bytes - 1,
751 GFP_NOFS);
752 cur = cur->next;
753 list_del_init(&op->list);
754 kfree(op);
755 if (cur != insert_list)
756 op = list_entry(cur,
757 struct pending_extent_op,
758 list);
759 }
760 }
761 btrfs_mark_buffer_dirty(leaf);
762 btrfs_release_path(extent_root, path);
763
764 /*
765 * Ok backref's and items usually go right next to eachother,
766 * but if we could only insert 1 item that means that we
767 * inserted on the end of a leaf, and we have no idea what may
768 * be on the next leaf so we just play it safe. In order to
769 * try and help this case we insert the last thing on our
770 * insert list so hopefully it will end up being the last
771 * thing on the leaf and everything else will be before it,
772 * which will let us insert a whole bunch of items at the same
773 * time.
774 */
775 if (ret == 1 && !last && (i + ret < total)) {
776 /*
777 * last: where we will pick up the next time around
778 * i: our current key to insert, will be total - 1
779 * cur: the current op we are screwing with
780 * op: duh
781 */
782 last = i + ret;
783 i = total - 1;
784 cur = insert_list->prev;
785 op = list_entry(cur, struct pending_extent_op, list);
786 } else if (last) {
787 /*
788 * ok we successfully inserted the last item on the
789 * list, lets reset everything
790 *
791 * i: our current key to insert, so where we left off
792 * last time
793 * last: done with this
794 * cur: the op we are messing with
795 * op: duh
796 * total: since we inserted the last key, we need to
797 * decrement total so we dont overflow
798 */
799 i = last;
800 last = 0;
801 total--;
802 if (i < total) {
803 cur = insert_list->next;
804 op = list_entry(cur, struct pending_extent_op,
805 list);
806 }
807 } else {
808 i += ret;
809 }
810
811 cond_resched();
812 }
813 ret = 0;
814 kfree(keys);
815 kfree(data_size);
816 return ret;
817}
818
819static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
820 struct btrfs_root *root,
821 struct btrfs_path *path,
822 u64 bytenr, u64 parent,
823 u64 ref_root, u64 ref_generation,
824 u64 owner_objectid)
825{
826 struct btrfs_key key;
827 struct extent_buffer *leaf;
828 struct btrfs_extent_ref *ref;
829 u32 num_refs;
830 int ret;
831
832 key.objectid = bytenr;
833 key.type = BTRFS_EXTENT_REF_KEY;
834 key.offset = parent;
835
836 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
837 if (ret == 0) {
838 leaf = path->nodes[0];
839 ref = btrfs_item_ptr(leaf, path->slots[0],
840 struct btrfs_extent_ref);
841 btrfs_set_ref_root(leaf, ref, ref_root);
842 btrfs_set_ref_generation(leaf, ref, ref_generation);
843 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
844 btrfs_set_ref_num_refs(leaf, ref, 1);
845 } else if (ret == -EEXIST) {
846 u64 existing_owner;
847 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
848 leaf = path->nodes[0];
849 ref = btrfs_item_ptr(leaf, path->slots[0],
850 struct btrfs_extent_ref);
851 if (btrfs_ref_root(leaf, ref) != ref_root ||
852 btrfs_ref_generation(leaf, ref) != ref_generation) {
853 ret = -EIO;
854 WARN_ON(1);
855 goto out;
856 }
857
858 num_refs = btrfs_ref_num_refs(leaf, ref);
859 BUG_ON(num_refs == 0);
860 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
861
862 existing_owner = btrfs_ref_objectid(leaf, ref);
863 if (existing_owner != owner_objectid &&
864 existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
865 btrfs_set_ref_objectid(leaf, ref,
866 BTRFS_MULTIPLE_OBJECTIDS);
867 }
868 ret = 0;
869 } else {
870 goto out;
871 }
872 btrfs_mark_buffer_dirty(path->nodes[0]);
873out:
874 btrfs_release_path(root, path);
875 return ret;
876}
877
878static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
879 struct btrfs_root *root,
880 struct btrfs_path *path)
881{
882 struct extent_buffer *leaf;
883 struct btrfs_extent_ref *ref;
884 u32 num_refs;
885 int ret = 0;
886
887 leaf = path->nodes[0];
888 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
889 num_refs = btrfs_ref_num_refs(leaf, ref);
890 BUG_ON(num_refs == 0);
891 num_refs -= 1;
892 if (num_refs == 0) {
893 ret = btrfs_del_item(trans, root, path);
894 } else {
895 btrfs_set_ref_num_refs(leaf, ref, num_refs);
896 btrfs_mark_buffer_dirty(leaf);
897 }
898 btrfs_release_path(root, path);
899 return ret;
900}
901
902static int noinline free_extents(struct btrfs_trans_handle *trans,
903 struct btrfs_root *extent_root,
904 struct list_head *del_list)
905{
906 struct btrfs_fs_info *info = extent_root->fs_info;
907 struct btrfs_path *path;
908 struct btrfs_key key, found_key;
909 struct extent_buffer *leaf;
910 struct list_head *cur;
911 struct pending_extent_op *op;
912 struct btrfs_extent_item *ei;
913 int ret, num_to_del, extent_slot = 0, found_extent = 0;
914 u32 refs;
915 u64 bytes_freed = 0;
916
917 path = btrfs_alloc_path();
918 if (!path)
919 return -ENOMEM;
920 path->reada = 1;
921
922search:
923 /* search for the backref for the current ref we want to delete */
924 cur = del_list->next;
925 op = list_entry(cur, struct pending_extent_op, list);
926 ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
927 op->orig_parent,
928 extent_root->root_key.objectid,
929 op->orig_generation, op->level, 1);
930 if (ret) {
931 printk("Unable to find backref byte nr %Lu root %Lu gen %Lu "
932 "owner %u\n", op->bytenr,
933 extent_root->root_key.objectid, op->orig_generation,
934 op->level);
935 btrfs_print_leaf(extent_root, path->nodes[0]);
936 WARN_ON(1);
937 goto out;
938 }
939
940 extent_slot = path->slots[0];
941 num_to_del = 1;
942 found_extent = 0;
943
944 /*
945 * if we aren't the first item on the leaf we can move back one and see
946 * if our ref is right next to our extent item
947 */
948 if (likely(extent_slot)) {
949 extent_slot--;
950 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
951 extent_slot);
952 if (found_key.objectid == op->bytenr &&
953 found_key.type == BTRFS_EXTENT_ITEM_KEY &&
954 found_key.offset == op->num_bytes) {
955 num_to_del++;
956 found_extent = 1;
957 }
958 }
959
960 /*
961 * if we didn't find the extent we need to delete the backref and then
962 * search for the extent item key so we can update its ref count
963 */
964 if (!found_extent) {
965 key.objectid = op->bytenr;
966 key.type = BTRFS_EXTENT_ITEM_KEY;
967 key.offset = op->num_bytes;
968
969 ret = remove_extent_backref(trans, extent_root, path);
970 BUG_ON(ret);
971 btrfs_release_path(extent_root, path);
972 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
973 BUG_ON(ret);
974 extent_slot = path->slots[0];
975 }
976
977 /* this is where we update the ref count for the extent */
978 leaf = path->nodes[0];
979 ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
980 refs = btrfs_extent_refs(leaf, ei);
981 BUG_ON(refs == 0);
982 refs--;
983 btrfs_set_extent_refs(leaf, ei, refs);
984
985 btrfs_mark_buffer_dirty(leaf);
986
987 /*
988 * This extent needs deleting. The reason cur_slot is extent_slot +
989 * num_to_del is because extent_slot points to the slot where the extent
990 * is, and if the backref was not right next to the extent we will be
991 * deleting at least 1 item, and will want to start searching at the
992 * slot directly next to extent_slot. However if we did find the
993 * backref next to the extent item them we will be deleting at least 2
994 * items and will want to start searching directly after the ref slot
995 */
996 if (!refs) {
997 struct list_head *pos, *n, *end;
998 int cur_slot = extent_slot+num_to_del;
999 u64 super_used;
1000 u64 root_used;
1001
1002 path->slots[0] = extent_slot;
1003 bytes_freed = op->num_bytes;
1004
1005 mutex_lock(&info->pinned_mutex);
1006 ret = pin_down_bytes(trans, extent_root, op->bytenr,
1007 op->num_bytes, op->level >=
1008 BTRFS_FIRST_FREE_OBJECTID);
1009 mutex_unlock(&info->pinned_mutex);
1010 BUG_ON(ret < 0);
1011 op->del = ret;
1012
1013 /*
1014 * we need to see if we can delete multiple things at once, so
1015 * start looping through the list of extents we are wanting to
1016 * delete and see if their extent/backref's are right next to
1017 * eachother and the extents only have 1 ref
1018 */
1019 for (pos = cur->next; pos != del_list; pos = pos->next) {
1020 struct pending_extent_op *tmp;
1021
1022 tmp = list_entry(pos, struct pending_extent_op, list);
1023
1024 /* we only want to delete extent+ref at this stage */
1025 if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1026 break;
1027
1028 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1029 if (found_key.objectid != tmp->bytenr ||
1030 found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1031 found_key.offset != tmp->num_bytes)
1032 break;
1033
1034 /* check to make sure this extent only has one ref */
1035 ei = btrfs_item_ptr(leaf, cur_slot,
1036 struct btrfs_extent_item);
1037 if (btrfs_extent_refs(leaf, ei) != 1)
1038 break;
1039
1040 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1041 if (found_key.objectid != tmp->bytenr ||
1042 found_key.type != BTRFS_EXTENT_REF_KEY ||
1043 found_key.offset != tmp->orig_parent)
1044 break;
1045
1046 /*
1047 * the ref is right next to the extent, we can set the
1048 * ref count to 0 since we will delete them both now
1049 */
1050 btrfs_set_extent_refs(leaf, ei, 0);
1051
1052 /* pin down the bytes for this extent */
1053 mutex_lock(&info->pinned_mutex);
1054 ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1055 tmp->num_bytes, tmp->level >=
1056 BTRFS_FIRST_FREE_OBJECTID);
1057 mutex_unlock(&info->pinned_mutex);
1058 BUG_ON(ret < 0);
1059
1060 /*
1061 * use the del field to tell if we need to go ahead and
1062 * free up the extent when we delete the item or not.
1063 */
1064 tmp->del = ret;
1065 bytes_freed += tmp->num_bytes;
1066
1067 num_to_del += 2;
1068 cur_slot += 2;
1069 }
1070 end = pos;
1071
1072 /* update the free space counters */
1073 spin_lock_irq(&info->delalloc_lock);
1074 super_used = btrfs_super_bytes_used(&info->super_copy);
1075 btrfs_set_super_bytes_used(&info->super_copy,
1076 super_used - bytes_freed);
1077 spin_unlock_irq(&info->delalloc_lock);
1078
1079 root_used = btrfs_root_used(&extent_root->root_item);
1080 btrfs_set_root_used(&extent_root->root_item,
1081 root_used - bytes_freed);
1082
1083 /* delete the items */
1084 ret = btrfs_del_items(trans, extent_root, path,
1085 path->slots[0], num_to_del);
1086 BUG_ON(ret);
1087
1088 /*
1089 * loop through the extents we deleted and do the cleanup work
1090 * on them
1091 */
1092 for (pos = cur, n = pos->next; pos != end;
1093 pos = n, n = pos->next) {
1094 struct pending_extent_op *tmp;
1095#ifdef BIO_RW_DISCARD
1096 u64 map_length;
1097 struct btrfs_multi_bio *multi = NULL;
1098#endif
1099 tmp = list_entry(pos, struct pending_extent_op, list);
1100
1101 /*
1102 * remember tmp->del tells us wether or not we pinned
1103 * down the extent
1104 */
1105 ret = update_block_group(trans, extent_root,
1106 tmp->bytenr, tmp->num_bytes, 0,
1107 tmp->del);
1108 BUG_ON(ret);
1109
1110#ifdef BIO_RW_DISCARD
1111 ret = btrfs_map_block(&info->mapping_tree, READ,
1112 tmp->bytenr, &map_length, &multi,
1113 0);
1114 if (!ret) {
1115 struct btrfs_bio_stripe *stripe;
1116 int i;
1117
1118 stripe = multi->stripe;
1119
1120 if (map_length > tmp->num_bytes)
1121 map_length = tmp->num_bytes;
1122
1123 for (i = 0; i < multi->num_stripes;
1124 i++, stripe++)
1125 blkdev_issue_discard(stripe->dev->bdev,
1126 stripe->physical >> 9,
1127 map_length >> 9);
1128 kfree(multi);
1129 }
1130#endif
1131 list_del_init(&tmp->list);
1132 unlock_extent(&info->extent_ins, tmp->bytenr,
1133 tmp->bytenr + tmp->num_bytes - 1,
1134 GFP_NOFS);
1135 kfree(tmp);
1136 }
1137 } else if (refs && found_extent) {
1138 /*
1139 * the ref and extent were right next to eachother, but the
1140 * extent still has a ref, so just free the backref and keep
1141 * going
1142 */
1143 ret = remove_extent_backref(trans, extent_root, path);
1144 BUG_ON(ret);
1145
1146 list_del_init(&op->list);
1147 unlock_extent(&info->extent_ins, op->bytenr,
1148 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1149 kfree(op);
1150 } else {
1151 /*
1152 * the extent has multiple refs and the backref we were looking
1153 * for was not right next to it, so just unlock and go next,
1154 * we're good to go
1155 */
1156 list_del_init(&op->list);
1157 unlock_extent(&info->extent_ins, op->bytenr,
1158 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1159 kfree(op);
1160 }
1161
1162 btrfs_release_path(extent_root, path);
1163 if (!list_empty(del_list))
1164 goto search;
1165
1166out:
1167 btrfs_free_path(path);
1168 return ret;
1169}
1170
1171static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1172 struct btrfs_root *root, u64 bytenr,
1173 u64 orig_parent, u64 parent,
1174 u64 orig_root, u64 ref_root,
1175 u64 orig_generation, u64 ref_generation,
1176 u64 owner_objectid)
1177{
1178 int ret;
1179 struct btrfs_root *extent_root = root->fs_info->extent_root;
1180 struct btrfs_path *path;
1181
1182 if (root == root->fs_info->extent_root) {
1183 struct pending_extent_op *extent_op;
1184 u64 num_bytes;
1185
1186 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1187 num_bytes = btrfs_level_size(root, (int)owner_objectid);
1188 mutex_lock(&root->fs_info->extent_ins_mutex);
1189 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1190 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1191 u64 priv;
1192 ret = get_state_private(&root->fs_info->extent_ins,
1193 bytenr, &priv);
1194 BUG_ON(ret);
1195 extent_op = (struct pending_extent_op *)
1196 (unsigned long)priv;
1197 BUG_ON(extent_op->parent != orig_parent);
1198 BUG_ON(extent_op->generation != orig_generation);
1199
1200 extent_op->parent = parent;
1201 extent_op->generation = ref_generation;
1202 } else {
1203 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
1204 BUG_ON(!extent_op);
1205
1206 extent_op->type = PENDING_BACKREF_UPDATE;
1207 extent_op->bytenr = bytenr;
1208 extent_op->num_bytes = num_bytes;
1209 extent_op->parent = parent;
1210 extent_op->orig_parent = orig_parent;
1211 extent_op->generation = ref_generation;
1212 extent_op->orig_generation = orig_generation;
1213 extent_op->level = (int)owner_objectid;
1214 INIT_LIST_HEAD(&extent_op->list);
1215 extent_op->del = 0;
1216
1217 set_extent_bits(&root->fs_info->extent_ins,
1218 bytenr, bytenr + num_bytes - 1,
1219 EXTENT_WRITEBACK, GFP_NOFS);
1220 set_state_private(&root->fs_info->extent_ins,
1221 bytenr, (unsigned long)extent_op);
1222 }
1223 mutex_unlock(&root->fs_info->extent_ins_mutex);
1224 return 0;
1225 }
1226
1227 path = btrfs_alloc_path();
1228 if (!path)
1229 return -ENOMEM;
1230 ret = lookup_extent_backref(trans, extent_root, path,
1231 bytenr, orig_parent, orig_root,
1232 orig_generation, owner_objectid, 1);
1233 if (ret)
1234 goto out;
1235 ret = remove_extent_backref(trans, extent_root, path);
1236 if (ret)
1237 goto out;
1238 ret = insert_extent_backref(trans, extent_root, path, bytenr,
1239 parent, ref_root, ref_generation,
1240 owner_objectid);
1241 BUG_ON(ret);
1242 finish_current_insert(trans, extent_root, 0);
1243 del_pending_extents(trans, extent_root, 0);
1244out:
1245 btrfs_free_path(path);
1246 return ret;
1247}
1248
1249int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1250 struct btrfs_root *root, u64 bytenr,
1251 u64 orig_parent, u64 parent,
1252 u64 ref_root, u64 ref_generation,
1253 u64 owner_objectid)
1254{
1255 int ret;
1256 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1257 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1258 return 0;
1259 ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
1260 parent, ref_root, ref_root,
1261 ref_generation, ref_generation,
1262 owner_objectid);
1263 return ret;
1264}
1265
1266static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1267 struct btrfs_root *root, u64 bytenr,
1268 u64 orig_parent, u64 parent,
1269 u64 orig_root, u64 ref_root,
1270 u64 orig_generation, u64 ref_generation,
1271 u64 owner_objectid)
1272{
1273 struct btrfs_path *path;
1274 int ret;
1275 struct btrfs_key key;
1276 struct extent_buffer *l;
1277 struct btrfs_extent_item *item;
1278 u32 refs;
1279
1280 path = btrfs_alloc_path();
1281 if (!path)
1282 return -ENOMEM;
1283
1284 path->reada = 1;
1285 key.objectid = bytenr;
1286 key.type = BTRFS_EXTENT_ITEM_KEY;
1287 key.offset = (u64)-1;
1288
1289 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1290 0, 1);
1291 if (ret < 0)
1292 return ret;
1293 BUG_ON(ret == 0 || path->slots[0] == 0);
1294
1295 path->slots[0]--;
1296 l = path->nodes[0];
1297
1298 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1299 if (key.objectid != bytenr) {
1300 btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
1301 printk("wanted %Lu found %Lu\n", bytenr, key.objectid);
1302 BUG();
1303 }
1304 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1305
1306 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1307 refs = btrfs_extent_refs(l, item);
1308 btrfs_set_extent_refs(l, item, refs + 1);
1309 btrfs_mark_buffer_dirty(path->nodes[0]);
1310
1311 btrfs_release_path(root->fs_info->extent_root, path);
1312
1313 path->reada = 1;
1314 ret = insert_extent_backref(trans, root->fs_info->extent_root,
1315 path, bytenr, parent,
1316 ref_root, ref_generation,
1317 owner_objectid);
1318 BUG_ON(ret);
1319 finish_current_insert(trans, root->fs_info->extent_root, 0);
1320 del_pending_extents(trans, root->fs_info->extent_root, 0);
1321
1322 btrfs_free_path(path);
1323 return 0;
1324}
1325
1326int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root,
1328 u64 bytenr, u64 num_bytes, u64 parent,
1329 u64 ref_root, u64 ref_generation,
1330 u64 owner_objectid)
1331{
1332 int ret;
1333 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1334 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1335 return 0;
1336 ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
1337 0, ref_root, 0, ref_generation,
1338 owner_objectid);
1339 return ret;
1340}
1341
1342int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1343 struct btrfs_root *root)
1344{
1345 finish_current_insert(trans, root->fs_info->extent_root, 1);
1346 del_pending_extents(trans, root->fs_info->extent_root, 1);
1347 return 0;
1348}
1349
1350int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1351 struct btrfs_root *root, u64 bytenr,
1352 u64 num_bytes, u32 *refs)
1353{
1354 struct btrfs_path *path;
1355 int ret;
1356 struct btrfs_key key;
1357 struct extent_buffer *l;
1358 struct btrfs_extent_item *item;
1359
1360 WARN_ON(num_bytes < root->sectorsize);
1361 path = btrfs_alloc_path();
1362 path->reada = 1;
1363 key.objectid = bytenr;
1364 key.offset = num_bytes;
1365 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1366 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1367 0, 0);
1368 if (ret < 0)
1369 goto out;
1370 if (ret != 0) {
1371 btrfs_print_leaf(root, path->nodes[0]);
1372 printk("failed to find block number %Lu\n", bytenr);
1373 BUG();
1374 }
1375 l = path->nodes[0];
1376 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1377 *refs = btrfs_extent_refs(l, item);
1378out:
1379 btrfs_free_path(path);
1380 return 0;
1381}
1382
1383int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1384 struct btrfs_root *root, u64 bytenr)
1385{
1386 struct btrfs_root *extent_root = root->fs_info->extent_root;
1387 struct btrfs_path *path;
1388 struct extent_buffer *leaf;
1389 struct btrfs_extent_ref *ref_item;
1390 struct btrfs_key key;
1391 struct btrfs_key found_key;
1392 u64 ref_root;
1393 u64 last_snapshot;
1394 u32 nritems;
1395 int ret;
1396
1397 key.objectid = bytenr;
1398 key.offset = (u64)-1;
1399 key.type = BTRFS_EXTENT_ITEM_KEY;
1400
1401 path = btrfs_alloc_path();
1402 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1403 if (ret < 0)
1404 goto out;
1405 BUG_ON(ret == 0);
1406
1407 ret = -ENOENT;
1408 if (path->slots[0] == 0)
1409 goto out;
1410
1411 path->slots[0]--;
1412 leaf = path->nodes[0];
1413 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1414
1415 if (found_key.objectid != bytenr ||
1416 found_key.type != BTRFS_EXTENT_ITEM_KEY)
1417 goto out;
1418
1419 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1420 while (1) {
1421 leaf = path->nodes[0];
1422 nritems = btrfs_header_nritems(leaf);
1423 if (path->slots[0] >= nritems) {
1424 ret = btrfs_next_leaf(extent_root, path);
1425 if (ret < 0)
1426 goto out;
1427 if (ret == 0)
1428 continue;
1429 break;
1430 }
1431 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1432 if (found_key.objectid != bytenr)
1433 break;
1434
1435 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
1436 path->slots[0]++;
1437 continue;
1438 }
1439
1440 ref_item = btrfs_item_ptr(leaf, path->slots[0],
1441 struct btrfs_extent_ref);
1442 ref_root = btrfs_ref_root(leaf, ref_item);
1443 if (ref_root != root->root_key.objectid &&
1444 ref_root != BTRFS_TREE_LOG_OBJECTID) {
1445 ret = 1;
1446 goto out;
1447 }
1448 if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
1449 ret = 1;
1450 goto out;
1451 }
1452
1453 path->slots[0]++;
1454 }
1455 ret = 0;
1456out:
1457 btrfs_free_path(path);
1458 return ret;
1459}
1460
1461int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1462 struct extent_buffer *buf, u32 nr_extents)
1463{
1464 struct btrfs_key key;
1465 struct btrfs_file_extent_item *fi;
1466 u64 root_gen;
1467 u32 nritems;
1468 int i;
1469 int level;
1470 int ret = 0;
1471 int shared = 0;
1472
1473 if (!root->ref_cows)
1474 return 0;
1475
1476 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1477 shared = 0;
1478 root_gen = root->root_key.offset;
1479 } else {
1480 shared = 1;
1481 root_gen = trans->transid - 1;
1482 }
1483
1484 level = btrfs_header_level(buf);
1485 nritems = btrfs_header_nritems(buf);
1486
1487 if (level == 0) {
1488 struct btrfs_leaf_ref *ref;
1489 struct btrfs_extent_info *info;
1490
1491 ref = btrfs_alloc_leaf_ref(root, nr_extents);
1492 if (!ref) {
1493 ret = -ENOMEM;
1494 goto out;
1495 }
1496
1497 ref->root_gen = root_gen;
1498 ref->bytenr = buf->start;
1499 ref->owner = btrfs_header_owner(buf);
1500 ref->generation = btrfs_header_generation(buf);
1501 ref->nritems = nr_extents;
1502 info = ref->extents;
1503
1504 for (i = 0; nr_extents > 0 && i < nritems; i++) {
1505 u64 disk_bytenr;
1506 btrfs_item_key_to_cpu(buf, &key, i);
1507 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1508 continue;
1509 fi = btrfs_item_ptr(buf, i,
1510 struct btrfs_file_extent_item);
1511 if (btrfs_file_extent_type(buf, fi) ==
1512 BTRFS_FILE_EXTENT_INLINE)
1513 continue;
1514 disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1515 if (disk_bytenr == 0)
1516 continue;
1517
1518 info->bytenr = disk_bytenr;
1519 info->num_bytes =
1520 btrfs_file_extent_disk_num_bytes(buf, fi);
1521 info->objectid = key.objectid;
1522 info->offset = key.offset;
1523 info++;
1524 }
1525
1526 ret = btrfs_add_leaf_ref(root, ref, shared);
1527 if (ret == -EEXIST && shared) {
1528 struct btrfs_leaf_ref *old;
1529 old = btrfs_lookup_leaf_ref(root, ref->bytenr);
1530 BUG_ON(!old);
1531 btrfs_remove_leaf_ref(root, old);
1532 btrfs_free_leaf_ref(root, old);
1533 ret = btrfs_add_leaf_ref(root, ref, shared);
1534 }
1535 WARN_ON(ret);
1536 btrfs_free_leaf_ref(root, ref);
1537 }
1538out:
1539 return ret;
1540}
1541
1542int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1543 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1544 u32 *nr_extents)
1545{
1546 u64 bytenr;
1547 u64 ref_root;
1548 u64 orig_root;
1549 u64 ref_generation;
1550 u64 orig_generation;
1551 u32 nritems;
1552 u32 nr_file_extents = 0;
1553 struct btrfs_key key;
1554 struct btrfs_file_extent_item *fi;
1555 int i;
1556 int level;
1557 int ret = 0;
1558 int faili = 0;
1559 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1560 u64, u64, u64, u64, u64, u64, u64, u64);
1561
1562 ref_root = btrfs_header_owner(buf);
1563 ref_generation = btrfs_header_generation(buf);
1564 orig_root = btrfs_header_owner(orig_buf);
1565 orig_generation = btrfs_header_generation(orig_buf);
1566
1567 nritems = btrfs_header_nritems(buf);
1568 level = btrfs_header_level(buf);
1569
1570 if (root->ref_cows) {
1571 process_func = __btrfs_inc_extent_ref;
1572 } else {
1573 if (level == 0 &&
1574 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1575 goto out;
1576 if (level != 0 &&
1577 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1578 goto out;
1579 process_func = __btrfs_update_extent_ref;
1580 }
1581
1582 for (i = 0; i < nritems; i++) {
1583 cond_resched();
1584 if (level == 0) {
1585 btrfs_item_key_to_cpu(buf, &key, i);
1586 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1587 continue;
1588 fi = btrfs_item_ptr(buf, i,
1589 struct btrfs_file_extent_item);
1590 if (btrfs_file_extent_type(buf, fi) ==
1591 BTRFS_FILE_EXTENT_INLINE)
1592 continue;
1593 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1594 if (bytenr == 0)
1595 continue;
1596
1597 nr_file_extents++;
1598
1599 ret = process_func(trans, root, bytenr,
1600 orig_buf->start, buf->start,
1601 orig_root, ref_root,
1602 orig_generation, ref_generation,
1603 key.objectid);
1604
1605 if (ret) {
1606 faili = i;
1607 WARN_ON(1);
1608 goto fail;
1609 }
1610 } else {
1611 bytenr = btrfs_node_blockptr(buf, i);
1612 ret = process_func(trans, root, bytenr,
1613 orig_buf->start, buf->start,
1614 orig_root, ref_root,
1615 orig_generation, ref_generation,
1616 level - 1);
1617 if (ret) {
1618 faili = i;
1619 WARN_ON(1);
1620 goto fail;
1621 }
1622 }
1623 }
1624out:
1625 if (nr_extents) {
1626 if (level == 0)
1627 *nr_extents = nr_file_extents;
1628 else
1629 *nr_extents = nritems;
1630 }
1631 return 0;
1632fail:
1633 WARN_ON(1);
1634 return ret;
1635}
1636
1637int btrfs_update_ref(struct btrfs_trans_handle *trans,
1638 struct btrfs_root *root, struct extent_buffer *orig_buf,
1639 struct extent_buffer *buf, int start_slot, int nr)
1640
1641{
1642 u64 bytenr;
1643 u64 ref_root;
1644 u64 orig_root;
1645 u64 ref_generation;
1646 u64 orig_generation;
1647 struct btrfs_key key;
1648 struct btrfs_file_extent_item *fi;
1649 int i;
1650 int ret;
1651 int slot;
1652 int level;
1653
1654 BUG_ON(start_slot < 0);
1655 BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1656
1657 ref_root = btrfs_header_owner(buf);
1658 ref_generation = btrfs_header_generation(buf);
1659 orig_root = btrfs_header_owner(orig_buf);
1660 orig_generation = btrfs_header_generation(orig_buf);
1661 level = btrfs_header_level(buf);
1662
1663 if (!root->ref_cows) {
1664 if (level == 0 &&
1665 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1666 return 0;
1667 if (level != 0 &&
1668 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1669 return 0;
1670 }
1671
1672 for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1673 cond_resched();
1674 if (level == 0) {
1675 btrfs_item_key_to_cpu(buf, &key, slot);
1676 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1677 continue;
1678 fi = btrfs_item_ptr(buf, slot,
1679 struct btrfs_file_extent_item);
1680 if (btrfs_file_extent_type(buf, fi) ==
1681 BTRFS_FILE_EXTENT_INLINE)
1682 continue;
1683 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1684 if (bytenr == 0)
1685 continue;
1686 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1687 orig_buf->start, buf->start,
1688 orig_root, ref_root,
1689 orig_generation, ref_generation,
1690 key.objectid);
1691 if (ret)
1692 goto fail;
1693 } else {
1694 bytenr = btrfs_node_blockptr(buf, slot);
1695 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1696 orig_buf->start, buf->start,
1697 orig_root, ref_root,
1698 orig_generation, ref_generation,
1699 level - 1);
1700 if (ret)
1701 goto fail;
1702 }
1703 }
1704 return 0;
1705fail:
1706 WARN_ON(1);
1707 return -1;
1708}
1709
1710static int write_one_cache_group(struct btrfs_trans_handle *trans,
1711 struct btrfs_root *root,
1712 struct btrfs_path *path,
1713 struct btrfs_block_group_cache *cache)
1714{
1715 int ret;
1716 int pending_ret;
1717 struct btrfs_root *extent_root = root->fs_info->extent_root;
1718 unsigned long bi;
1719 struct extent_buffer *leaf;
1720
1721 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1722 if (ret < 0)
1723 goto fail;
1724 BUG_ON(ret);
1725
1726 leaf = path->nodes[0];
1727 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1728 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1729 btrfs_mark_buffer_dirty(leaf);
1730 btrfs_release_path(extent_root, path);
1731fail:
1732 finish_current_insert(trans, extent_root, 0);
1733 pending_ret = del_pending_extents(trans, extent_root, 0);
1734 if (ret)
1735 return ret;
1736 if (pending_ret)
1737 return pending_ret;
1738 return 0;
1739
1740}
1741
1742int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1743 struct btrfs_root *root)
1744{
1745 struct btrfs_block_group_cache *cache, *entry;
1746 struct rb_node *n;
1747 int err = 0;
1748 int werr = 0;
1749 struct btrfs_path *path;
1750 u64 last = 0;
1751
1752 path = btrfs_alloc_path();
1753 if (!path)
1754 return -ENOMEM;
1755
1756 while(1) {
1757 cache = NULL;
1758 spin_lock(&root->fs_info->block_group_cache_lock);
1759 for (n = rb_first(&root->fs_info->block_group_cache_tree);
1760 n; n = rb_next(n)) {
1761 entry = rb_entry(n, struct btrfs_block_group_cache,
1762 cache_node);
1763 if (entry->dirty) {
1764 cache = entry;
1765 break;
1766 }
1767 }
1768 spin_unlock(&root->fs_info->block_group_cache_lock);
1769
1770 if (!cache)
1771 break;
1772
1773 cache->dirty = 0;
1774 last += cache->key.offset;
1775
1776 err = write_one_cache_group(trans, root,
1777 path, cache);
1778 /*
1779 * if we fail to write the cache group, we want
1780 * to keep it marked dirty in hopes that a later
1781 * write will work
1782 */
1783 if (err) {
1784 werr = err;
1785 continue;
1786 }
1787 }
1788 btrfs_free_path(path);
1789 return werr;
1790}
1791
1792static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1793 u64 total_bytes, u64 bytes_used,
1794 struct btrfs_space_info **space_info)
1795{
1796 struct btrfs_space_info *found;
1797
1798 found = __find_space_info(info, flags);
1799 if (found) {
1800 spin_lock(&found->lock);
1801 found->total_bytes += total_bytes;
1802 found->bytes_used += bytes_used;
1803 found->full = 0;
1804 spin_unlock(&found->lock);
1805 *space_info = found;
1806 return 0;
1807 }
1808 found = kzalloc(sizeof(*found), GFP_NOFS);
1809 if (!found)
1810 return -ENOMEM;
1811
1812 list_add(&found->list, &info->space_info);
1813 INIT_LIST_HEAD(&found->block_groups);
1814 init_rwsem(&found->groups_sem);
1815 spin_lock_init(&found->lock);
1816 found->flags = flags;
1817 found->total_bytes = total_bytes;
1818 found->bytes_used = bytes_used;
1819 found->bytes_pinned = 0;
1820 found->bytes_reserved = 0;
1821 found->bytes_readonly = 0;
1822 found->full = 0;
1823 found->force_alloc = 0;
1824 *space_info = found;
1825 return 0;
1826}
1827
1828static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1829{
1830 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1831 BTRFS_BLOCK_GROUP_RAID1 |
1832 BTRFS_BLOCK_GROUP_RAID10 |
1833 BTRFS_BLOCK_GROUP_DUP);
1834 if (extra_flags) {
1835 if (flags & BTRFS_BLOCK_GROUP_DATA)
1836 fs_info->avail_data_alloc_bits |= extra_flags;
1837 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1838 fs_info->avail_metadata_alloc_bits |= extra_flags;
1839 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1840 fs_info->avail_system_alloc_bits |= extra_flags;
1841 }
1842}
1843
1844static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
1845{
1846 spin_lock(&cache->space_info->lock);
1847 spin_lock(&cache->lock);
1848 if (!cache->ro) {
1849 cache->space_info->bytes_readonly += cache->key.offset -
1850 btrfs_block_group_used(&cache->item);
1851 cache->ro = 1;
1852 }
1853 spin_unlock(&cache->lock);
1854 spin_unlock(&cache->space_info->lock);
1855}
1856
1857u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1858{
1859 u64 num_devices = root->fs_info->fs_devices->rw_devices;
1860
1861 if (num_devices == 1)
1862 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1863 if (num_devices < 4)
1864 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1865
1866 if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1867 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1868 BTRFS_BLOCK_GROUP_RAID10))) {
1869 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1870 }
1871
1872 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1873 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1874 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1875 }
1876
1877 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1878 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1879 (flags & BTRFS_BLOCK_GROUP_RAID10) |
1880 (flags & BTRFS_BLOCK_GROUP_DUP)))
1881 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1882 return flags;
1883}
1884
1885static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1886 struct btrfs_root *extent_root, u64 alloc_bytes,
1887 u64 flags, int force)
1888{
1889 struct btrfs_space_info *space_info;
1890 u64 thresh;
1891 int ret = 0;
1892
1893 mutex_lock(&extent_root->fs_info->chunk_mutex);
1894
1895 flags = btrfs_reduce_alloc_profile(extent_root, flags);
1896
1897 space_info = __find_space_info(extent_root->fs_info, flags);
1898 if (!space_info) {
1899 ret = update_space_info(extent_root->fs_info, flags,
1900 0, 0, &space_info);
1901 BUG_ON(ret);
1902 }
1903 BUG_ON(!space_info);
1904
1905 spin_lock(&space_info->lock);
1906 if (space_info->force_alloc) {
1907 force = 1;
1908 space_info->force_alloc = 0;
1909 }
1910 if (space_info->full) {
1911 spin_unlock(&space_info->lock);
1912 goto out;
1913 }
1914
1915 thresh = space_info->total_bytes - space_info->bytes_readonly;
1916 thresh = div_factor(thresh, 6);
1917 if (!force &&
1918 (space_info->bytes_used + space_info->bytes_pinned +
1919 space_info->bytes_reserved + alloc_bytes) < thresh) {
1920 spin_unlock(&space_info->lock);
1921 goto out;
1922 }
1923 spin_unlock(&space_info->lock);
1924
1925 ret = btrfs_alloc_chunk(trans, extent_root, flags);
1926 if (ret) {
1927printk("space info full %Lu\n", flags);
1928 space_info->full = 1;
1929 }
1930out:
1931 mutex_unlock(&extent_root->fs_info->chunk_mutex);
1932 return ret;
1933}
1934
1935static int update_block_group(struct btrfs_trans_handle *trans,
1936 struct btrfs_root *root,
1937 u64 bytenr, u64 num_bytes, int alloc,
1938 int mark_free)
1939{
1940 struct btrfs_block_group_cache *cache;
1941 struct btrfs_fs_info *info = root->fs_info;
1942 u64 total = num_bytes;
1943 u64 old_val;
1944 u64 byte_in_group;
1945
1946 while(total) {
1947 cache = btrfs_lookup_block_group(info, bytenr);
1948 if (!cache)
1949 return -1;
1950 byte_in_group = bytenr - cache->key.objectid;
1951 WARN_ON(byte_in_group > cache->key.offset);
1952
1953 spin_lock(&cache->space_info->lock);
1954 spin_lock(&cache->lock);
1955 cache->dirty = 1;
1956 old_val = btrfs_block_group_used(&cache->item);
1957 num_bytes = min(total, cache->key.offset - byte_in_group);
1958 if (alloc) {
1959 old_val += num_bytes;
1960 cache->space_info->bytes_used += num_bytes;
1961 if (cache->ro) {
1962 cache->space_info->bytes_readonly -= num_bytes;
1963 WARN_ON(1);
1964 }
1965 btrfs_set_block_group_used(&cache->item, old_val);
1966 spin_unlock(&cache->lock);
1967 spin_unlock(&cache->space_info->lock);
1968 } else {
1969 old_val -= num_bytes;
1970 cache->space_info->bytes_used -= num_bytes;
1971 if (cache->ro)
1972 cache->space_info->bytes_readonly += num_bytes;
1973 btrfs_set_block_group_used(&cache->item, old_val);
1974 spin_unlock(&cache->lock);
1975 spin_unlock(&cache->space_info->lock);
1976 if (mark_free) {
1977 int ret;
1978 ret = btrfs_add_free_space(cache, bytenr,
1979 num_bytes);
1980 if (ret)
1981 return -1;
1982 }
1983 }
1984 total -= num_bytes;
1985 bytenr += num_bytes;
1986 }
1987 return 0;
1988}
1989
1990static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1991{
1992 struct btrfs_block_group_cache *cache;
1993
1994 cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
1995 if (!cache)
1996 return 0;
1997
1998 return cache->key.objectid;
1999}
2000
2001int btrfs_update_pinned_extents(struct btrfs_root *root,
2002 u64 bytenr, u64 num, int pin)
2003{
2004 u64 len;
2005 struct btrfs_block_group_cache *cache;
2006 struct btrfs_fs_info *fs_info = root->fs_info;
2007
2008 WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2009 if (pin) {
2010 set_extent_dirty(&fs_info->pinned_extents,
2011 bytenr, bytenr + num - 1, GFP_NOFS);
2012 } else {
2013 clear_extent_dirty(&fs_info->pinned_extents,
2014 bytenr, bytenr + num - 1, GFP_NOFS);
2015 }
2016 while (num > 0) {
2017 cache = btrfs_lookup_block_group(fs_info, bytenr);
2018 BUG_ON(!cache);
2019 len = min(num, cache->key.offset -
2020 (bytenr - cache->key.objectid));
2021 if (pin) {
2022 spin_lock(&cache->space_info->lock);
2023 spin_lock(&cache->lock);
2024 cache->pinned += len;
2025 cache->space_info->bytes_pinned += len;
2026 spin_unlock(&cache->lock);
2027 spin_unlock(&cache->space_info->lock);
2028 fs_info->total_pinned += len;
2029 } else {
2030 spin_lock(&cache->space_info->lock);
2031 spin_lock(&cache->lock);
2032 cache->pinned -= len;
2033 cache->space_info->bytes_pinned -= len;
2034 spin_unlock(&cache->lock);
2035 spin_unlock(&cache->space_info->lock);
2036 fs_info->total_pinned -= len;
2037 if (cache->cached)
2038 btrfs_add_free_space(cache, bytenr, len);
2039 }
2040 bytenr += len;
2041 num -= len;
2042 }
2043 return 0;
2044}
2045
2046static int update_reserved_extents(struct btrfs_root *root,
2047 u64 bytenr, u64 num, int reserve)
2048{
2049 u64 len;
2050 struct btrfs_block_group_cache *cache;
2051 struct btrfs_fs_info *fs_info = root->fs_info;
2052
2053 while (num > 0) {
2054 cache = btrfs_lookup_block_group(fs_info, bytenr);
2055 BUG_ON(!cache);
2056 len = min(num, cache->key.offset -
2057 (bytenr - cache->key.objectid));
2058
2059 spin_lock(&cache->space_info->lock);
2060 spin_lock(&cache->lock);
2061 if (reserve) {
2062 cache->reserved += len;
2063 cache->space_info->bytes_reserved += len;
2064 } else {
2065 cache->reserved -= len;
2066 cache->space_info->bytes_reserved -= len;
2067 }
2068 spin_unlock(&cache->lock);
2069 spin_unlock(&cache->space_info->lock);
2070 bytenr += len;
2071 num -= len;
2072 }
2073 return 0;
2074}
2075
2076int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2077{
2078 u64 last = 0;
2079 u64 start;
2080 u64 end;
2081 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2082 int ret;
2083
2084 mutex_lock(&root->fs_info->pinned_mutex);
2085 while(1) {
2086 ret = find_first_extent_bit(pinned_extents, last,
2087 &start, &end, EXTENT_DIRTY);
2088 if (ret)
2089 break;
2090 set_extent_dirty(copy, start, end, GFP_NOFS);
2091 last = end + 1;
2092 }
2093 mutex_unlock(&root->fs_info->pinned_mutex);
2094 return 0;
2095}
2096
2097int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2098 struct btrfs_root *root,
2099 struct extent_io_tree *unpin)
2100{
2101 u64 start;
2102 u64 end;
2103 int ret;
2104
2105 mutex_lock(&root->fs_info->pinned_mutex);
2106 while(1) {
2107 ret = find_first_extent_bit(unpin, 0, &start, &end,
2108 EXTENT_DIRTY);
2109 if (ret)
2110 break;
2111 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2112 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2113 if (need_resched()) {
2114 mutex_unlock(&root->fs_info->pinned_mutex);
2115 cond_resched();
2116 mutex_lock(&root->fs_info->pinned_mutex);
2117 }
2118 }
2119 mutex_unlock(&root->fs_info->pinned_mutex);
2120 return 0;
2121}
2122
2123static int finish_current_insert(struct btrfs_trans_handle *trans,
2124 struct btrfs_root *extent_root, int all)
2125{
2126 u64 start;
2127 u64 end;
2128 u64 priv;
2129 u64 search = 0;
2130 u64 skipped = 0;
2131 struct btrfs_fs_info *info = extent_root->fs_info;
2132 struct btrfs_path *path;
2133 struct pending_extent_op *extent_op, *tmp;
2134 struct list_head insert_list, update_list;
2135 int ret;
2136 int num_inserts = 0, max_inserts;
2137
2138 path = btrfs_alloc_path();
2139 INIT_LIST_HEAD(&insert_list);
2140 INIT_LIST_HEAD(&update_list);
2141
2142 max_inserts = extent_root->leafsize /
2143 (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2144 sizeof(struct btrfs_extent_ref) +
2145 sizeof(struct btrfs_extent_item));
2146again:
2147 mutex_lock(&info->extent_ins_mutex);
2148 while (1) {
2149 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2150 &end, EXTENT_WRITEBACK);
2151 if (ret) {
2152 if (skipped && all && !num_inserts) {
2153 skipped = 0;
2154 search = 0;
2155 continue;
2156 }
2157 mutex_unlock(&info->extent_ins_mutex);
2158 break;
2159 }
2160
2161 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2162 if (!ret) {
2163 skipped = 1;
2164 search = end + 1;
2165 if (need_resched()) {
2166 mutex_unlock(&info->extent_ins_mutex);
2167 cond_resched();
2168 mutex_lock(&info->extent_ins_mutex);
2169 }
2170 continue;
2171 }
2172
2173 ret = get_state_private(&info->extent_ins, start, &priv);
2174 BUG_ON(ret);
2175 extent_op = (struct pending_extent_op *)(unsigned long) priv;
2176
2177 if (extent_op->type == PENDING_EXTENT_INSERT) {
2178 num_inserts++;
2179 list_add_tail(&extent_op->list, &insert_list);
2180 search = end + 1;
2181 if (num_inserts == max_inserts) {
2182 mutex_unlock(&info->extent_ins_mutex);
2183 break;
2184 }
2185 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2186 list_add_tail(&extent_op->list, &update_list);
2187 search = end + 1;
2188 } else {
2189 BUG();
2190 }
2191 }
2192
2193 /*
2194 * process the update list, clear the writeback bit for it, and if
2195 * somebody marked this thing for deletion then just unlock it and be
2196 * done, the free_extents will handle it
2197 */
2198 mutex_lock(&info->extent_ins_mutex);
2199 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2200 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2201 extent_op->bytenr + extent_op->num_bytes - 1,
2202 EXTENT_WRITEBACK, GFP_NOFS);
2203 if (extent_op->del) {
2204 list_del_init(&extent_op->list);
2205 unlock_extent(&info->extent_ins, extent_op->bytenr,
2206 extent_op->bytenr + extent_op->num_bytes
2207 - 1, GFP_NOFS);
2208 kfree(extent_op);
2209 }
2210 }
2211 mutex_unlock(&info->extent_ins_mutex);
2212
2213 /*
2214 * still have things left on the update list, go ahead an update
2215 * everything
2216 */
2217 if (!list_empty(&update_list)) {
2218 ret = update_backrefs(trans, extent_root, path, &update_list);
2219 BUG_ON(ret);
2220 }
2221
2222 /*
2223 * if no inserts need to be done, but we skipped some extents and we
2224 * need to make sure everything is cleaned then reset everything and
2225 * go back to the beginning
2226 */
2227 if (!num_inserts && all && skipped) {
2228 search = 0;
2229 skipped = 0;
2230 INIT_LIST_HEAD(&update_list);
2231 INIT_LIST_HEAD(&insert_list);
2232 goto again;
2233 } else if (!num_inserts) {
2234 goto out;
2235 }
2236
2237 /*
2238 * process the insert extents list. Again if we are deleting this
2239 * extent, then just unlock it, pin down the bytes if need be, and be
2240 * done with it. Saves us from having to actually insert the extent
2241 * into the tree and then subsequently come along and delete it
2242 */
2243 mutex_lock(&info->extent_ins_mutex);
2244 list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2245 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2246 extent_op->bytenr + extent_op->num_bytes - 1,
2247 EXTENT_WRITEBACK, GFP_NOFS);
2248 if (extent_op->del) {
2249 list_del_init(&extent_op->list);
2250 unlock_extent(&info->extent_ins, extent_op->bytenr,
2251 extent_op->bytenr + extent_op->num_bytes
2252 - 1, GFP_NOFS);
2253
2254 mutex_lock(&extent_root->fs_info->pinned_mutex);
2255 ret = pin_down_bytes(trans, extent_root,
2256 extent_op->bytenr,
2257 extent_op->num_bytes, 0);
2258 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2259
2260 ret = update_block_group(trans, extent_root,
2261 extent_op->bytenr,
2262 extent_op->num_bytes,
2263 0, ret > 0);
2264 BUG_ON(ret);
2265 kfree(extent_op);
2266 num_inserts--;
2267 }
2268 }
2269 mutex_unlock(&info->extent_ins_mutex);
2270
2271 ret = insert_extents(trans, extent_root, path, &insert_list,
2272 num_inserts);
2273 BUG_ON(ret);
2274
2275 /*
2276 * if we broke out of the loop in order to insert stuff because we hit
2277 * the maximum number of inserts at a time we can handle, then loop
2278 * back and pick up where we left off
2279 */
2280 if (num_inserts == max_inserts) {
2281 INIT_LIST_HEAD(&insert_list);
2282 INIT_LIST_HEAD(&update_list);
2283 num_inserts = 0;
2284 goto again;
2285 }
2286
2287 /*
2288 * again, if we need to make absolutely sure there are no more pending
2289 * extent operations left and we know that we skipped some, go back to
2290 * the beginning and do it all again
2291 */
2292 if (all && skipped) {
2293 INIT_LIST_HEAD(&insert_list);
2294 INIT_LIST_HEAD(&update_list);
2295 search = 0;
2296 skipped = 0;
2297 num_inserts = 0;
2298 goto again;
2299 }
2300out:
2301 btrfs_free_path(path);
2302 return 0;
2303}
2304
2305static int pin_down_bytes(struct btrfs_trans_handle *trans,
2306 struct btrfs_root *root,
2307 u64 bytenr, u64 num_bytes, int is_data)
2308{
2309 int err = 0;
2310 struct extent_buffer *buf;
2311
2312 if (is_data)
2313 goto pinit;
2314
2315 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
2316 if (!buf)
2317 goto pinit;
2318
2319 /* we can reuse a block if it hasn't been written
2320 * and it is from this transaction. We can't
2321 * reuse anything from the tree log root because
2322 * it has tiny sub-transactions.
2323 */
2324 if (btrfs_buffer_uptodate(buf, 0) &&
2325 btrfs_try_tree_lock(buf)) {
2326 u64 header_owner = btrfs_header_owner(buf);
2327 u64 header_transid = btrfs_header_generation(buf);
2328 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2329 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2330 header_transid == trans->transid &&
2331 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2332 clean_tree_block(NULL, root, buf);
2333 btrfs_tree_unlock(buf);
2334 free_extent_buffer(buf);
2335 return 1;
2336 }
2337 btrfs_tree_unlock(buf);
2338 }
2339 free_extent_buffer(buf);
2340pinit:
2341 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2342
2343 BUG_ON(err < 0);
2344 return 0;
2345}
2346
2347/*
2348 * remove an extent from the root, returns 0 on success
2349 */
2350static int __free_extent(struct btrfs_trans_handle *trans,
2351 struct btrfs_root *root,
2352 u64 bytenr, u64 num_bytes, u64 parent,
2353 u64 root_objectid, u64 ref_generation,
2354 u64 owner_objectid, int pin, int mark_free)
2355{
2356 struct btrfs_path *path;
2357 struct btrfs_key key;
2358 struct btrfs_fs_info *info = root->fs_info;
2359 struct btrfs_root *extent_root = info->extent_root;
2360 struct extent_buffer *leaf;
2361 int ret;
2362 int extent_slot = 0;
2363 int found_extent = 0;
2364 int num_to_del = 1;
2365 struct btrfs_extent_item *ei;
2366 u32 refs;
2367
2368 key.objectid = bytenr;
2369 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2370 key.offset = num_bytes;
2371 path = btrfs_alloc_path();
2372 if (!path)
2373 return -ENOMEM;
2374
2375 path->reada = 1;
2376 ret = lookup_extent_backref(trans, extent_root, path,
2377 bytenr, parent, root_objectid,
2378 ref_generation, owner_objectid, 1);
2379 if (ret == 0) {
2380 struct btrfs_key found_key;
2381 extent_slot = path->slots[0];
2382 while(extent_slot > 0) {
2383 extent_slot--;
2384 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2385 extent_slot);
2386 if (found_key.objectid != bytenr)
2387 break;
2388 if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
2389 found_key.offset == num_bytes) {
2390 found_extent = 1;
2391 break;
2392 }
2393 if (path->slots[0] - extent_slot > 5)
2394 break;
2395 }
2396 if (!found_extent) {
2397 ret = remove_extent_backref(trans, extent_root, path);
2398 BUG_ON(ret);
2399 btrfs_release_path(extent_root, path);
2400 ret = btrfs_search_slot(trans, extent_root,
2401 &key, path, -1, 1);
2402 if (ret) {
2403 printk(KERN_ERR "umm, got %d back from search"
2404 ", was looking for %Lu\n", ret,
2405 bytenr);
2406 btrfs_print_leaf(extent_root, path->nodes[0]);
2407 }
2408 BUG_ON(ret);
2409 extent_slot = path->slots[0];
2410 }
2411 } else {
2412 btrfs_print_leaf(extent_root, path->nodes[0]);
2413 WARN_ON(1);
2414 printk("Unable to find ref byte nr %Lu root %Lu "
2415 "gen %Lu owner %Lu\n", bytenr,
2416 root_objectid, ref_generation, owner_objectid);
2417 }
2418
2419 leaf = path->nodes[0];
2420 ei = btrfs_item_ptr(leaf, extent_slot,
2421 struct btrfs_extent_item);
2422 refs = btrfs_extent_refs(leaf, ei);
2423 BUG_ON(refs == 0);
2424 refs -= 1;
2425 btrfs_set_extent_refs(leaf, ei, refs);
2426
2427 btrfs_mark_buffer_dirty(leaf);
2428
2429 if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
2430 struct btrfs_extent_ref *ref;
2431 ref = btrfs_item_ptr(leaf, path->slots[0],
2432 struct btrfs_extent_ref);
2433 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
2434 /* if the back ref and the extent are next to each other
2435 * they get deleted below in one shot
2436 */
2437 path->slots[0] = extent_slot;
2438 num_to_del = 2;
2439 } else if (found_extent) {
2440 /* otherwise delete the extent back ref */
2441 ret = remove_extent_backref(trans, extent_root, path);
2442 BUG_ON(ret);
2443 /* if refs are 0, we need to setup the path for deletion */
2444 if (refs == 0) {
2445 btrfs_release_path(extent_root, path);
2446 ret = btrfs_search_slot(trans, extent_root, &key, path,
2447 -1, 1);
2448 BUG_ON(ret);
2449 }
2450 }
2451
2452 if (refs == 0) {
2453 u64 super_used;
2454 u64 root_used;
2455#ifdef BIO_RW_DISCARD
2456 u64 map_length = num_bytes;
2457 struct btrfs_multi_bio *multi = NULL;
2458#endif
2459
2460 if (pin) {
2461 mutex_lock(&root->fs_info->pinned_mutex);
2462 ret = pin_down_bytes(trans, root, bytenr, num_bytes,
2463 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
2464 mutex_unlock(&root->fs_info->pinned_mutex);
2465 if (ret > 0)
2466 mark_free = 1;
2467 BUG_ON(ret < 0);
2468 }
2469
2470 /* block accounting for super block */
2471 spin_lock_irq(&info->delalloc_lock);
2472 super_used = btrfs_super_bytes_used(&info->super_copy);
2473 btrfs_set_super_bytes_used(&info->super_copy,
2474 super_used - num_bytes);
2475 spin_unlock_irq(&info->delalloc_lock);
2476
2477 /* block accounting for root item */
2478 root_used = btrfs_root_used(&root->root_item);
2479 btrfs_set_root_used(&root->root_item,
2480 root_used - num_bytes);
2481 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2482 num_to_del);
2483 BUG_ON(ret);
2484 btrfs_release_path(extent_root, path);
2485 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
2486 mark_free);
2487 BUG_ON(ret);
2488
2489#ifdef BIO_RW_DISCARD
2490 /* Tell the block device(s) that the sectors can be discarded */
2491 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
2492 bytenr, &map_length, &multi, 0);
2493 if (!ret) {
2494 struct btrfs_bio_stripe *stripe = multi->stripes;
2495 int i;
2496
2497 if (map_length > num_bytes)
2498 map_length = num_bytes;
2499
2500 for (i = 0; i < multi->num_stripes; i++, stripe++) {
2501 blkdev_issue_discard(stripe->dev->bdev,
2502 stripe->physical >> 9,
2503 map_length >> 9);
2504 }
2505 kfree(multi);
2506 }
2507#endif
2508 }
2509 btrfs_free_path(path);
2510 finish_current_insert(trans, extent_root, 0);
2511 return ret;
2512}
2513
2514/*
2515 * find all the blocks marked as pending in the radix tree and remove
2516 * them from the extent map
2517 */
2518static int del_pending_extents(struct btrfs_trans_handle *trans, struct
2519 btrfs_root *extent_root, int all)
2520{
2521 int ret;
2522 int err = 0;
2523 u64 start;
2524 u64 end;
2525 u64 priv;
2526 u64 search = 0;
2527 int nr = 0, skipped = 0;
2528 struct extent_io_tree *pending_del;
2529 struct extent_io_tree *extent_ins;
2530 struct pending_extent_op *extent_op;
2531 struct btrfs_fs_info *info = extent_root->fs_info;
2532 struct list_head delete_list;
2533
2534 INIT_LIST_HEAD(&delete_list);
2535 extent_ins = &extent_root->fs_info->extent_ins;
2536 pending_del = &extent_root->fs_info->pending_del;
2537
2538again:
2539 mutex_lock(&info->extent_ins_mutex);
2540 while(1) {
2541 ret = find_first_extent_bit(pending_del, search, &start, &end,
2542 EXTENT_WRITEBACK);
2543 if (ret) {
2544 if (all && skipped && !nr) {
2545 search = 0;
2546 continue;
2547 }
2548 mutex_unlock(&info->extent_ins_mutex);
2549 break;
2550 }
2551
2552 ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2553 if (!ret) {
2554 search = end+1;
2555 skipped = 1;
2556
2557 if (need_resched()) {
2558 mutex_unlock(&info->extent_ins_mutex);
2559 cond_resched();
2560 mutex_lock(&info->extent_ins_mutex);
2561 }
2562
2563 continue;
2564 }
2565 BUG_ON(ret < 0);
2566
2567 ret = get_state_private(pending_del, start, &priv);
2568 BUG_ON(ret);
2569 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2570
2571 clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2572 GFP_NOFS);
2573 if (!test_range_bit(extent_ins, start, end,
2574 EXTENT_WRITEBACK, 0)) {
2575 list_add_tail(&extent_op->list, &delete_list);
2576 nr++;
2577 } else {
2578 kfree(extent_op);
2579
2580 ret = get_state_private(&info->extent_ins, start,
2581 &priv);
2582 BUG_ON(ret);
2583 extent_op = (struct pending_extent_op *)
2584 (unsigned long)priv;
2585
2586 clear_extent_bits(&info->extent_ins, start, end,
2587 EXTENT_WRITEBACK, GFP_NOFS);
2588
2589 if (extent_op->type == PENDING_BACKREF_UPDATE) {
2590 list_add_tail(&extent_op->list, &delete_list);
2591 search = end + 1;
2592 nr++;
2593 continue;
2594 }
2595
2596 mutex_lock(&extent_root->fs_info->pinned_mutex);
2597 ret = pin_down_bytes(trans, extent_root, start,
2598 end + 1 - start, 0);
2599 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2600
2601 ret = update_block_group(trans, extent_root, start,
2602 end + 1 - start, 0, ret > 0);
2603
2604 unlock_extent(extent_ins, start, end, GFP_NOFS);
2605 BUG_ON(ret);
2606 kfree(extent_op);
2607 }
2608 if (ret)
2609 err = ret;
2610
2611 search = end + 1;
2612
2613 if (need_resched()) {
2614 mutex_unlock(&info->extent_ins_mutex);
2615 cond_resched();
2616 mutex_lock(&info->extent_ins_mutex);
2617 }
2618 }
2619
2620 if (nr) {
2621 ret = free_extents(trans, extent_root, &delete_list);
2622 BUG_ON(ret);
2623 }
2624
2625 if (all && skipped) {
2626 INIT_LIST_HEAD(&delete_list);
2627 search = 0;
2628 nr = 0;
2629 goto again;
2630 }
2631
2632 return err;
2633}
2634
2635/*
2636 * remove an extent from the root, returns 0 on success
2637 */
2638static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2639 struct btrfs_root *root,
2640 u64 bytenr, u64 num_bytes, u64 parent,
2641 u64 root_objectid, u64 ref_generation,
2642 u64 owner_objectid, int pin)
2643{
2644 struct btrfs_root *extent_root = root->fs_info->extent_root;
2645 int pending_ret;
2646 int ret;
2647
2648 WARN_ON(num_bytes < root->sectorsize);
2649 if (root == extent_root) {
2650 struct pending_extent_op *extent_op = NULL;
2651
2652 mutex_lock(&root->fs_info->extent_ins_mutex);
2653 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
2654 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
2655 u64 priv;
2656 ret = get_state_private(&root->fs_info->extent_ins,
2657 bytenr, &priv);
2658 BUG_ON(ret);
2659 extent_op = (struct pending_extent_op *)
2660 (unsigned long)priv;
2661
2662 extent_op->del = 1;
2663 if (extent_op->type == PENDING_EXTENT_INSERT) {
2664 mutex_unlock(&root->fs_info->extent_ins_mutex);
2665 return 0;
2666 }
2667 }
2668
2669 if (extent_op) {
2670 ref_generation = extent_op->orig_generation;
2671 parent = extent_op->orig_parent;
2672 }
2673
2674 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2675 BUG_ON(!extent_op);
2676
2677 extent_op->type = PENDING_EXTENT_DELETE;
2678 extent_op->bytenr = bytenr;
2679 extent_op->num_bytes = num_bytes;
2680 extent_op->parent = parent;
2681 extent_op->orig_parent = parent;
2682 extent_op->generation = ref_generation;
2683 extent_op->orig_generation = ref_generation;
2684 extent_op->level = (int)owner_objectid;
2685 INIT_LIST_HEAD(&extent_op->list);
2686 extent_op->del = 0;
2687
2688 set_extent_bits(&root->fs_info->pending_del,
2689 bytenr, bytenr + num_bytes - 1,
2690 EXTENT_WRITEBACK, GFP_NOFS);
2691 set_state_private(&root->fs_info->pending_del,
2692 bytenr, (unsigned long)extent_op);
2693 mutex_unlock(&root->fs_info->extent_ins_mutex);
2694 return 0;
2695 }
2696 /* if metadata always pin */
2697 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2698 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2699 struct btrfs_block_group_cache *cache;
2700
2701 /* btrfs_free_reserved_extent */
2702 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2703 BUG_ON(!cache);
2704 btrfs_add_free_space(cache, bytenr, num_bytes);
2705 update_reserved_extents(root, bytenr, num_bytes, 0);
2706 return 0;
2707 }
2708 pin = 1;
2709 }
2710
2711 /* if data pin when any transaction has committed this */
2712 if (ref_generation != trans->transid)
2713 pin = 1;
2714
2715 ret = __free_extent(trans, root, bytenr, num_bytes, parent,
2716 root_objectid, ref_generation,
2717 owner_objectid, pin, pin == 0);
2718
2719 finish_current_insert(trans, root->fs_info->extent_root, 0);
2720 pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
2721 return ret ? ret : pending_ret;
2722}
2723
2724int btrfs_free_extent(struct btrfs_trans_handle *trans,
2725 struct btrfs_root *root,
2726 u64 bytenr, u64 num_bytes, u64 parent,
2727 u64 root_objectid, u64 ref_generation,
2728 u64 owner_objectid, int pin)
2729{
2730 int ret;
2731
2732 ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
2733 root_objectid, ref_generation,
2734 owner_objectid, pin);
2735 return ret;
2736}
2737
2738static u64 stripe_align(struct btrfs_root *root, u64 val)
2739{
2740 u64 mask = ((u64)root->stripesize - 1);
2741 u64 ret = (val + mask) & ~mask;
2742 return ret;
2743}
2744
2745/*
2746 * walks the btree of allocated extents and find a hole of a given size.
2747 * The key ins is changed to record the hole:
2748 * ins->objectid == block start
2749 * ins->flags = BTRFS_EXTENT_ITEM_KEY
2750 * ins->offset == number of blocks
2751 * Any available blocks before search_start are skipped.
2752 */
2753static int noinline find_free_extent(struct btrfs_trans_handle *trans,
2754 struct btrfs_root *orig_root,
2755 u64 num_bytes, u64 empty_size,
2756 u64 search_start, u64 search_end,
2757 u64 hint_byte, struct btrfs_key *ins,
2758 u64 exclude_start, u64 exclude_nr,
2759 int data)
2760{
2761 int ret = 0;
2762 struct btrfs_root * root = orig_root->fs_info->extent_root;
2763 u64 total_needed = num_bytes;
2764 u64 *last_ptr = NULL;
2765 u64 last_wanted = 0;
2766 struct btrfs_block_group_cache *block_group = NULL;
2767 int chunk_alloc_done = 0;
2768 int empty_cluster = 2 * 1024 * 1024;
2769 int allowed_chunk_alloc = 0;
2770 struct list_head *head = NULL, *cur = NULL;
2771 int loop = 0;
2772 int extra_loop = 0;
2773 struct btrfs_space_info *space_info;
2774
2775 WARN_ON(num_bytes < root->sectorsize);
2776 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2777 ins->objectid = 0;
2778 ins->offset = 0;
2779
2780 if (orig_root->ref_cows || empty_size)
2781 allowed_chunk_alloc = 1;
2782
2783 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2784 last_ptr = &root->fs_info->last_alloc;
2785 empty_cluster = 64 * 1024;
2786 }
2787
2788 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
2789 last_ptr = &root->fs_info->last_data_alloc;
2790
2791 if (last_ptr) {
2792 if (*last_ptr) {
2793 hint_byte = *last_ptr;
2794 last_wanted = *last_ptr;
2795 } else
2796 empty_size += empty_cluster;
2797 } else {
2798 empty_cluster = 0;
2799 }
2800 search_start = max(search_start, first_logical_byte(root, 0));
2801 search_start = max(search_start, hint_byte);
2802
2803 if (last_wanted && search_start != last_wanted) {
2804 last_wanted = 0;
2805 empty_size += empty_cluster;
2806 }
2807
2808 total_needed += empty_size;
2809 block_group = btrfs_lookup_block_group(root->fs_info, search_start);
2810 if (!block_group)
2811 block_group = btrfs_lookup_first_block_group(root->fs_info,
2812 search_start);
2813 space_info = __find_space_info(root->fs_info, data);
2814
2815 down_read(&space_info->groups_sem);
2816 while (1) {
2817 struct btrfs_free_space *free_space;
2818 /*
2819 * the only way this happens if our hint points to a block
2820 * group thats not of the proper type, while looping this
2821 * should never happen
2822 */
2823 if (empty_size)
2824 extra_loop = 1;
2825
2826 if (!block_group)
2827 goto new_group_no_lock;
2828
2829 mutex_lock(&block_group->alloc_mutex);
2830 if (unlikely(!block_group_bits(block_group, data)))
2831 goto new_group;
2832
2833 ret = cache_block_group(root, block_group);
2834 if (ret) {
2835 mutex_unlock(&block_group->alloc_mutex);
2836 break;
2837 }
2838
2839 if (block_group->ro)
2840 goto new_group;
2841
2842 free_space = btrfs_find_free_space(block_group, search_start,
2843 total_needed);
2844 if (free_space) {
2845 u64 start = block_group->key.objectid;
2846 u64 end = block_group->key.objectid +
2847 block_group->key.offset;
2848
2849 search_start = stripe_align(root, free_space->offset);
2850
2851 /* move on to the next group */
2852 if (search_start + num_bytes >= search_end)
2853 goto new_group;
2854
2855 /* move on to the next group */
2856 if (search_start + num_bytes > end)
2857 goto new_group;
2858
2859 if (last_wanted && search_start != last_wanted) {
2860 total_needed += empty_cluster;
2861 empty_size += empty_cluster;
2862 last_wanted = 0;
2863 /*
2864 * if search_start is still in this block group
2865 * then we just re-search this block group
2866 */
2867 if (search_start >= start &&
2868 search_start < end) {
2869 mutex_unlock(&block_group->alloc_mutex);
2870 continue;
2871 }
2872
2873 /* else we go to the next block group */
2874 goto new_group;
2875 }
2876
2877 if (exclude_nr > 0 &&
2878 (search_start + num_bytes > exclude_start &&
2879 search_start < exclude_start + exclude_nr)) {
2880 search_start = exclude_start + exclude_nr;
2881 /*
2882 * if search_start is still in this block group
2883 * then we just re-search this block group
2884 */
2885 if (search_start >= start &&
2886 search_start < end) {
2887 mutex_unlock(&block_group->alloc_mutex);
2888 last_wanted = 0;
2889 continue;
2890 }
2891
2892 /* else we go to the next block group */
2893 goto new_group;
2894 }
2895
2896 ins->objectid = search_start;
2897 ins->offset = num_bytes;
2898
2899 btrfs_remove_free_space_lock(block_group, search_start,
2900 num_bytes);
2901 /* we are all good, lets return */
2902 mutex_unlock(&block_group->alloc_mutex);
2903 break;
2904 }
2905new_group:
2906 mutex_unlock(&block_group->alloc_mutex);
2907new_group_no_lock:
2908 /* don't try to compare new allocations against the
2909 * last allocation any more
2910 */
2911 last_wanted = 0;
2912
2913 /*
2914 * Here's how this works.
2915 * loop == 0: we were searching a block group via a hint
2916 * and didn't find anything, so we start at
2917 * the head of the block groups and keep searching
2918 * loop == 1: we're searching through all of the block groups
2919 * if we hit the head again we have searched
2920 * all of the block groups for this space and we
2921 * need to try and allocate, if we cant error out.
2922 * loop == 2: we allocated more space and are looping through
2923 * all of the block groups again.
2924 */
2925 if (loop == 0) {
2926 head = &space_info->block_groups;
2927 cur = head->next;
2928 loop++;
2929 } else if (loop == 1 && cur == head) {
2930 int keep_going;
2931
2932 /* at this point we give up on the empty_size
2933 * allocations and just try to allocate the min
2934 * space.
2935 *
2936 * The extra_loop field was set if an empty_size
2937 * allocation was attempted above, and if this
2938 * is try we need to try the loop again without
2939 * the additional empty_size.
2940 */
2941 total_needed -= empty_size;
2942 empty_size = 0;
2943 keep_going = extra_loop;
2944 loop++;
2945
2946 if (allowed_chunk_alloc && !chunk_alloc_done) {
2947 up_read(&space_info->groups_sem);
2948 ret = do_chunk_alloc(trans, root, num_bytes +
2949 2 * 1024 * 1024, data, 1);
2950 down_read(&space_info->groups_sem);
2951 if (ret < 0)
2952 goto loop_check;
2953 head = &space_info->block_groups;
2954 /*
2955 * we've allocated a new chunk, keep
2956 * trying
2957 */
2958 keep_going = 1;
2959 chunk_alloc_done = 1;
2960 } else if (!allowed_chunk_alloc) {
2961 space_info->force_alloc = 1;
2962 }
2963loop_check:
2964 if (keep_going) {
2965 cur = head->next;
2966 extra_loop = 0;
2967 } else {
2968 break;
2969 }
2970 } else if (cur == head) {
2971 break;
2972 }
2973
2974 block_group = list_entry(cur, struct btrfs_block_group_cache,
2975 list);
2976 search_start = block_group->key.objectid;
2977 cur = cur->next;
2978 }
2979
2980 /* we found what we needed */
2981 if (ins->objectid) {
2982 if (!(data & BTRFS_BLOCK_GROUP_DATA))
2983 trans->block_group = block_group;
2984
2985 if (last_ptr)
2986 *last_ptr = ins->objectid + ins->offset;
2987 ret = 0;
2988 } else if (!ret) {
2989 printk(KERN_ERR "we were searching for %Lu bytes, num_bytes %Lu,"
2990 " loop %d, allowed_alloc %d\n", total_needed, num_bytes,
2991 loop, allowed_chunk_alloc);
2992 ret = -ENOSPC;
2993 }
2994
2995 up_read(&space_info->groups_sem);
2996 return ret;
2997}
2998
2999static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3000{
3001 struct btrfs_block_group_cache *cache;
3002 struct list_head *l;
3003
3004 printk(KERN_INFO "space_info has %Lu free, is %sfull\n",
3005 info->total_bytes - info->bytes_used - info->bytes_pinned -
3006 info->bytes_reserved, (info->full) ? "" : "not ");
3007
3008 down_read(&info->groups_sem);
3009 list_for_each(l, &info->block_groups) {
3010 cache = list_entry(l, struct btrfs_block_group_cache, list);
3011 spin_lock(&cache->lock);
3012 printk(KERN_INFO "block group %Lu has %Lu bytes, %Lu used "
3013 "%Lu pinned %Lu reserved\n",
3014 cache->key.objectid, cache->key.offset,
3015 btrfs_block_group_used(&cache->item),
3016 cache->pinned, cache->reserved);
3017 btrfs_dump_free_space(cache, bytes);
3018 spin_unlock(&cache->lock);
3019 }
3020 up_read(&info->groups_sem);
3021}
3022
3023static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3024 struct btrfs_root *root,
3025 u64 num_bytes, u64 min_alloc_size,
3026 u64 empty_size, u64 hint_byte,
3027 u64 search_end, struct btrfs_key *ins,
3028 u64 data)
3029{
3030 int ret;
3031 u64 search_start = 0;
3032 u64 alloc_profile;
3033 struct btrfs_fs_info *info = root->fs_info;
3034
3035 if (data) {
3036 alloc_profile = info->avail_data_alloc_bits &
3037 info->data_alloc_profile;
3038 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
3039 } else if (root == root->fs_info->chunk_root) {
3040 alloc_profile = info->avail_system_alloc_bits &
3041 info->system_alloc_profile;
3042 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
3043 } else {
3044 alloc_profile = info->avail_metadata_alloc_bits &
3045 info->metadata_alloc_profile;
3046 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
3047 }
3048again:
3049 data = btrfs_reduce_alloc_profile(root, data);
3050 /*
3051 * the only place that sets empty_size is btrfs_realloc_node, which
3052 * is not called recursively on allocations
3053 */
3054 if (empty_size || root->ref_cows) {
3055 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
3056 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3057 2 * 1024 * 1024,
3058 BTRFS_BLOCK_GROUP_METADATA |
3059 (info->metadata_alloc_profile &
3060 info->avail_metadata_alloc_bits), 0);
3061 }
3062 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3063 num_bytes + 2 * 1024 * 1024, data, 0);
3064 }
3065
3066 WARN_ON(num_bytes < root->sectorsize);
3067 ret = find_free_extent(trans, root, num_bytes, empty_size,
3068 search_start, search_end, hint_byte, ins,
3069 trans->alloc_exclude_start,
3070 trans->alloc_exclude_nr, data);
3071
3072 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
3073 num_bytes = num_bytes >> 1;
3074 num_bytes = num_bytes & ~(root->sectorsize - 1);
3075 num_bytes = max(num_bytes, min_alloc_size);
3076 do_chunk_alloc(trans, root->fs_info->extent_root,
3077 num_bytes, data, 1);
3078 goto again;
3079 }
3080 if (ret) {
3081 struct btrfs_space_info *sinfo;
3082
3083 sinfo = __find_space_info(root->fs_info, data);
3084 printk("allocation failed flags %Lu, wanted %Lu\n",
3085 data, num_bytes);
3086 dump_space_info(sinfo, num_bytes);
3087 BUG();
3088 }
3089
3090 return ret;
3091}
3092
3093int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3094{
3095 struct btrfs_block_group_cache *cache;
3096
3097 cache = btrfs_lookup_block_group(root->fs_info, start);
3098 if (!cache) {
3099 printk(KERN_ERR "Unable to find block group for %Lu\n", start);
3100 return -ENOSPC;
3101 }
3102 btrfs_add_free_space(cache, start, len);
3103 update_reserved_extents(root, start, len, 0);
3104 return 0;
3105}
3106
3107int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3108 struct btrfs_root *root,
3109 u64 num_bytes, u64 min_alloc_size,
3110 u64 empty_size, u64 hint_byte,
3111 u64 search_end, struct btrfs_key *ins,
3112 u64 data)
3113{
3114 int ret;
3115 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3116 empty_size, hint_byte, search_end, ins,
3117 data);
3118 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3119 return ret;
3120}
3121
3122static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3123 struct btrfs_root *root, u64 parent,
3124 u64 root_objectid, u64 ref_generation,
3125 u64 owner, struct btrfs_key *ins)
3126{
3127 int ret;
3128 int pending_ret;
3129 u64 super_used;
3130 u64 root_used;
3131 u64 num_bytes = ins->offset;
3132 u32 sizes[2];
3133 struct btrfs_fs_info *info = root->fs_info;
3134 struct btrfs_root *extent_root = info->extent_root;
3135 struct btrfs_extent_item *extent_item;
3136 struct btrfs_extent_ref *ref;
3137 struct btrfs_path *path;
3138 struct btrfs_key keys[2];
3139
3140 if (parent == 0)
3141 parent = ins->objectid;
3142
3143 /* block accounting for super block */
3144 spin_lock_irq(&info->delalloc_lock);
3145 super_used = btrfs_super_bytes_used(&info->super_copy);
3146 btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
3147 spin_unlock_irq(&info->delalloc_lock);
3148
3149 /* block accounting for root item */
3150 root_used = btrfs_root_used(&root->root_item);
3151 btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3152
3153 if (root == extent_root) {
3154 struct pending_extent_op *extent_op;
3155
3156 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3157 BUG_ON(!extent_op);
3158
3159 extent_op->type = PENDING_EXTENT_INSERT;
3160 extent_op->bytenr = ins->objectid;
3161 extent_op->num_bytes = ins->offset;
3162 extent_op->parent = parent;
3163 extent_op->orig_parent = 0;
3164 extent_op->generation = ref_generation;
3165 extent_op->orig_generation = 0;
3166 extent_op->level = (int)owner;
3167 INIT_LIST_HEAD(&extent_op->list);
3168 extent_op->del = 0;
3169
3170 mutex_lock(&root->fs_info->extent_ins_mutex);
3171 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3172 ins->objectid + ins->offset - 1,
3173 EXTENT_WRITEBACK, GFP_NOFS);
3174 set_state_private(&root->fs_info->extent_ins,
3175 ins->objectid, (unsigned long)extent_op);
3176 mutex_unlock(&root->fs_info->extent_ins_mutex);
3177 goto update_block;
3178 }
3179
3180 memcpy(&keys[0], ins, sizeof(*ins));
3181 keys[1].objectid = ins->objectid;
3182 keys[1].type = BTRFS_EXTENT_REF_KEY;
3183 keys[1].offset = parent;
3184 sizes[0] = sizeof(*extent_item);
3185 sizes[1] = sizeof(*ref);
3186
3187 path = btrfs_alloc_path();
3188 BUG_ON(!path);
3189
3190 ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3191 sizes, 2);
3192 BUG_ON(ret);
3193
3194 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3195 struct btrfs_extent_item);
3196 btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
3197 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3198 struct btrfs_extent_ref);
3199
3200 btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3201 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3202 btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3203 btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
3204
3205 btrfs_mark_buffer_dirty(path->nodes[0]);
3206
3207 trans->alloc_exclude_start = 0;
3208 trans->alloc_exclude_nr = 0;
3209 btrfs_free_path(path);
3210 finish_current_insert(trans, extent_root, 0);
3211 pending_ret = del_pending_extents(trans, extent_root, 0);
3212
3213 if (ret)
3214 goto out;
3215 if (pending_ret) {
3216 ret = pending_ret;
3217 goto out;
3218 }
3219
3220update_block:
3221 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
3222 if (ret) {
3223 printk("update block group failed for %Lu %Lu\n",
3224 ins->objectid, ins->offset);
3225 BUG();
3226 }
3227out:
3228 return ret;
3229}
3230
3231int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3232 struct btrfs_root *root, u64 parent,
3233 u64 root_objectid, u64 ref_generation,
3234 u64 owner, struct btrfs_key *ins)
3235{
3236 int ret;
3237
3238 if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3239 return 0;
3240 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3241 ref_generation, owner, ins);
3242 update_reserved_extents(root, ins->objectid, ins->offset, 0);
3243 return ret;
3244}
3245
3246/*
3247 * this is used by the tree logging recovery code. It records that
3248 * an extent has been allocated and makes sure to clear the free
3249 * space cache bits as well
3250 */
3251int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3252 struct btrfs_root *root, u64 parent,
3253 u64 root_objectid, u64 ref_generation,
3254 u64 owner, struct btrfs_key *ins)
3255{
3256 int ret;
3257 struct btrfs_block_group_cache *block_group;
3258
3259 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
3260 mutex_lock(&block_group->alloc_mutex);
3261 cache_block_group(root, block_group);
3262
3263 ret = btrfs_remove_free_space_lock(block_group, ins->objectid,
3264 ins->offset);
3265 mutex_unlock(&block_group->alloc_mutex);
3266 BUG_ON(ret);
3267 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3268 ref_generation, owner, ins);
3269 return ret;
3270}
3271
3272/*
3273 * finds a free extent and does all the dirty work required for allocation
3274 * returns the key for the extent through ins, and a tree buffer for
3275 * the first block of the extent through buf.
3276 *
3277 * returns 0 if everything worked, non-zero otherwise.
3278 */
3279int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3280 struct btrfs_root *root,
3281 u64 num_bytes, u64 parent, u64 min_alloc_size,
3282 u64 root_objectid, u64 ref_generation,
3283 u64 owner_objectid, u64 empty_size, u64 hint_byte,
3284 u64 search_end, struct btrfs_key *ins, u64 data)
3285{
3286 int ret;
3287
3288 ret = __btrfs_reserve_extent(trans, root, num_bytes,
3289 min_alloc_size, empty_size, hint_byte,
3290 search_end, ins, data);
3291 BUG_ON(ret);
3292 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3293 ret = __btrfs_alloc_reserved_extent(trans, root, parent,
3294 root_objectid, ref_generation,
3295 owner_objectid, ins);
3296 BUG_ON(ret);
3297
3298 } else {
3299 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3300 }
3301 return ret;
3302}
3303
3304struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3305 struct btrfs_root *root,
3306 u64 bytenr, u32 blocksize)
3307{
3308 struct extent_buffer *buf;
3309
3310 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
3311 if (!buf)
3312 return ERR_PTR(-ENOMEM);
3313 btrfs_set_header_generation(buf, trans->transid);
3314 btrfs_tree_lock(buf);
3315 clean_tree_block(trans, root, buf);
3316 btrfs_set_buffer_uptodate(buf);
3317 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3318 set_extent_dirty(&root->dirty_log_pages, buf->start,
3319 buf->start + buf->len - 1, GFP_NOFS);
3320 } else {
3321 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
3322 buf->start + buf->len - 1, GFP_NOFS);
3323 }
3324 trans->blocks_used++;
3325 return buf;
3326}
3327
3328/*
3329 * helper function to allocate a block for a given tree
3330 * returns the tree buffer or NULL.
3331 */
3332struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3333 struct btrfs_root *root,
3334 u32 blocksize, u64 parent,
3335 u64 root_objectid,
3336 u64 ref_generation,
3337 int level,
3338 u64 hint,
3339 u64 empty_size)
3340{
3341 struct btrfs_key ins;
3342 int ret;
3343 struct extent_buffer *buf;
3344
3345 ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
3346 root_objectid, ref_generation, level,
3347 empty_size, hint, (u64)-1, &ins, 0);
3348 if (ret) {
3349 BUG_ON(ret > 0);
3350 return ERR_PTR(ret);
3351 }
3352
3353 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
3354 return buf;
3355}
3356
3357int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3358 struct btrfs_root *root, struct extent_buffer *leaf)
3359{
3360 u64 leaf_owner;
3361 u64 leaf_generation;
3362 struct btrfs_key key;
3363 struct btrfs_file_extent_item *fi;
3364 int i;
3365 int nritems;
3366 int ret;
3367
3368 BUG_ON(!btrfs_is_leaf(leaf));
3369 nritems = btrfs_header_nritems(leaf);
3370 leaf_owner = btrfs_header_owner(leaf);
3371 leaf_generation = btrfs_header_generation(leaf);
3372
3373 for (i = 0; i < nritems; i++) {
3374 u64 disk_bytenr;
3375 cond_resched();
3376
3377 btrfs_item_key_to_cpu(leaf, &key, i);
3378 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3379 continue;
3380 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3381 if (btrfs_file_extent_type(leaf, fi) ==
3382 BTRFS_FILE_EXTENT_INLINE)
3383 continue;
3384 /*
3385 * FIXME make sure to insert a trans record that
3386 * repeats the snapshot del on crash
3387 */
3388 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3389 if (disk_bytenr == 0)
3390 continue;
3391
3392 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3393 btrfs_file_extent_disk_num_bytes(leaf, fi),
3394 leaf->start, leaf_owner, leaf_generation,
3395 key.objectid, 0);
3396 BUG_ON(ret);
3397
3398 atomic_inc(&root->fs_info->throttle_gen);
3399 wake_up(&root->fs_info->transaction_throttle);
3400 cond_resched();
3401 }
3402 return 0;
3403}
3404
3405static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3406 struct btrfs_root *root,
3407 struct btrfs_leaf_ref *ref)
3408{
3409 int i;
3410 int ret;
3411 struct btrfs_extent_info *info = ref->extents;
3412
3413 for (i = 0; i < ref->nritems; i++) {
3414 ret = __btrfs_free_extent(trans, root, info->bytenr,
3415 info->num_bytes, ref->bytenr,
3416 ref->owner, ref->generation,
3417 info->objectid, 0);
3418
3419 atomic_inc(&root->fs_info->throttle_gen);
3420 wake_up(&root->fs_info->transaction_throttle);
3421 cond_resched();
3422
3423 BUG_ON(ret);
3424 info++;
3425 }
3426
3427 return 0;
3428}
3429
3430int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
3431 u32 *refs)
3432{
3433 int ret;
3434
3435 ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
3436 BUG_ON(ret);
3437
3438#if 0 // some debugging code in case we see problems here
3439 /* if the refs count is one, it won't get increased again. But
3440 * if the ref count is > 1, someone may be decreasing it at
3441 * the same time we are.
3442 */
3443 if (*refs != 1) {
3444 struct extent_buffer *eb = NULL;
3445 eb = btrfs_find_create_tree_block(root, start, len);
3446 if (eb)
3447 btrfs_tree_lock(eb);
3448
3449 mutex_lock(&root->fs_info->alloc_mutex);
3450 ret = lookup_extent_ref(NULL, root, start, len, refs);
3451 BUG_ON(ret);
3452 mutex_unlock(&root->fs_info->alloc_mutex);
3453
3454 if (eb) {
3455 btrfs_tree_unlock(eb);
3456 free_extent_buffer(eb);
3457 }
3458 if (*refs == 1) {
3459 printk("block %llu went down to one during drop_snap\n",
3460 (unsigned long long)start);
3461 }
3462
3463 }
3464#endif
3465
3466 cond_resched();
3467 return ret;
3468}
3469
3470/*
3471 * helper function for drop_snapshot, this walks down the tree dropping ref
3472 * counts as it goes.
3473 */
/*
 * Descend from path->nodes[*level], dropping references on the way.
 *
 * trans, root: passed through to the extent freeing helpers.
 * path:        nodes[] / slots[] hold the current position and are updated
 *              as the walk moves to lower levels.
 * level:       in/out.  On return it is one above the level of the block
 *              whose reference was dropped after the 'out' label.
 *
 * Always returns 0; failures of the helpers hit BUG_ON().
 */
3474static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
3475 struct btrfs_root *root,
3476 struct btrfs_path *path, int *level)
3477{
3478 u64 root_owner;
3479 u64 root_gen;
3480 u64 bytenr;
3481 u64 ptr_gen;
3482 struct extent_buffer *next;
3483 struct extent_buffer *cur;
3484 struct extent_buffer *parent;
3485 struct btrfs_leaf_ref *ref;
3486 u32 blocksize;
3487 int ret;
3488 u32 refs;
3489
3490 WARN_ON(*level < 0);
3491 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3492 ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
3493 path->nodes[*level]->len, &refs);
3494 BUG_ON(ret);
/*
 * the starting block has more than one reference: do not descend, only
 * drop our reference on it at 'out'
 */
3495 if (refs > 1)
3496 goto out;
3497
3498 /*
3499 * walk down to the last node level and free all the leaves
3500 */
3501 while(*level >= 0) {
3502 WARN_ON(*level < 0);
3503 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3504 cur = path->nodes[*level];
3505
3506 if (btrfs_header_level(cur) != *level)
3507 WARN_ON(1);
3508
/* every slot of this block has been handled */
3509 if (path->slots[*level] >=
3510 btrfs_header_nritems(cur))
3511 break;
/* at a leaf: drop the references held by its file extent items, then stop */
3512 if (*level == 0) {
3513 ret = btrfs_drop_leaf_ref(trans, root, cur);
3514 BUG_ON(ret);
3515 break;
3516 }
3517 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3518 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3519 blocksize = btrfs_level_size(root, *level - 1);
3520
3521 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3522 BUG_ON(ret);
/*
 * the child's reference count is not 1: drop one reference on it and move
 * on to the next slot without descending into it
 */
3523 if (refs != 1) {
3524 parent = path->nodes[*level];
3525 root_owner = btrfs_header_owner(parent);
3526 root_gen = btrfs_header_generation(parent);
3527 path->slots[*level]++;
3528
3529 ret = __btrfs_free_extent(trans, root, bytenr,
3530 blocksize, parent->start,
3531 root_owner, root_gen,
3532 *level - 1, 1);
3533 BUG_ON(ret);
3534
3535 atomic_inc(&root->fs_info->throttle_gen);
3536 wake_up(&root->fs_info->transaction_throttle);
3537 cond_resched();
3538
3539 continue;
3540 }
3541 /*
3542 * at this point, we have a single ref, and since the
3543 * only place referencing this extent is a dead root
3544 * the reference count should never go higher.
3545 * So, we don't need to check it again
3546 */
/*
 * the child is a leaf: try the leaf ref cache first.  A cached entry whose
 * generation differs from the pointer generation is released and ignored.
 * On a hit the extents are freed from the cached entry, *level is set to 0
 * and the loop ends without reading the leaf.
 */
3547 if (*level == 1) {
3548 ref = btrfs_lookup_leaf_ref(root, bytenr);
3549 if (ref && ref->generation != ptr_gen) {
3550 btrfs_free_leaf_ref(root, ref);
3551 ref = NULL;
3552 }
3553 if (ref) {
3554 ret = cache_drop_leaf_ref(trans, root, ref);
3555 BUG_ON(ret);
3556 btrfs_remove_leaf_ref(root, ref);
3557 btrfs_free_leaf_ref(root, ref);
3558 *level = 0;
3559 break;
3560 }
3561 if (printk_ratelimit()) {
3562 printk("leaf ref miss for bytenr %llu\n",
3563 (unsigned long long)bytenr);
3564 }
3565 }
/*
 * use the in-memory buffer when it exists and is up to date for ptr_gen,
 * otherwise read the block.
 * NOTE(review): 'next' is used below without a NULL check - confirm that
 * read_tree_block() cannot return NULL here.
 */
3566 next = btrfs_find_tree_block(root, bytenr, blocksize);
3567 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3568 free_extent_buffer(next);
3569
3570 next = read_tree_block(root, bytenr, blocksize,
3571 ptr_gen);
3572 cond_resched();
3573#if 0
3574 /*
3575 * this is a debugging check and can go away
3576 * the ref should never go all the way down to 1
3577 * at this point
3578 */
3579 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3580 &refs);
3581 BUG_ON(ret);
3582 WARN_ON(refs != 1);
3583#endif
3584 }
/*
 * step down: replace the path entry one level below with the child and
 * start at its first slot
 */
3585 WARN_ON(*level <= 0);
3586 if (path->nodes[*level-1])
3587 free_extent_buffer(path->nodes[*level-1]);
3588 path->nodes[*level-1] = next;
3589 *level = btrfs_header_level(next);
3590 path->slots[*level] = 0;
3591 cond_resched();
3592 }
3593out:
3594 WARN_ON(*level < 0);
3595 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3596
/*
 * find the block to report as parent of the current one: root->node is
 * used as its own parent, any other block uses the path entry one level up
 * and the block pointer stored in that entry's current slot
 */
3597 if (path->nodes[*level] == root->node) {
3598 parent = path->nodes[*level];
3599 bytenr = path->nodes[*level]->start;
3600 } else {
3601 parent = path->nodes[*level + 1];
3602 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
3603 }
3604
3605 blocksize = btrfs_level_size(root, *level);
3606 root_owner = btrfs_header_owner(parent);
3607 root_gen = btrfs_header_generation(parent);
3608
/*
 * drop our reference on the current block, remove it from the path and
 * hand the level above it back to the caller
 */
3609 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3610 parent->start, root_owner, root_gen,
3611 *level, 1);
3612 free_extent_buffer(path->nodes[*level]);
3613 path->nodes[*level] = NULL;
3614 *level += 1;
3615 BUG_ON(ret);
3616
3617 cond_resched();
3618 return 0;
3619}
3620
3621/*
3622 * helper function for drop_subtree, this function is similar to
3623 * walk_down_tree. The main difference is that it checks reference
3624 * counts while tree blocks are locked.
3625 */
/*
 * Descend from path->nodes[*level], locking each child before its
 * reference count is looked up.
 *
 * path:  nodes[], slots[] and locks[] are updated as the walk goes down;
 *        locks[] is set to 1 for every child that was locked here.
 * level: in/out.  On return it is one above the level of the block that
 *        was freed after the 'out' label.
 *
 * Always returns 0; failures of the helpers hit BUG_ON().
 */
3626static int noinline walk_down_subtree(struct btrfs_trans_handle *trans,
3627 struct btrfs_root *root,
3628 struct btrfs_path *path, int *level)
3629{
3630 struct extent_buffer *next;
3631 struct extent_buffer *cur;
3632 struct extent_buffer *parent;
3633 u64 bytenr;
3634 u64 ptr_gen;
3635 u32 blocksize;
3636 u32 refs;
3637 int ret;
3638
3639 cur = path->nodes[*level];
3640 ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
3641 &refs);
3642 BUG_ON(ret);
/* the starting block has more than one reference: only drop ours at 'out' */
3643 if (refs > 1)
3644 goto out;
3645
3646 while (*level >= 0) {
3647 cur = path->nodes[*level];
/* at a leaf: drop the references of its file extents and clean the block */
3648 if (*level == 0) {
3649 ret = btrfs_drop_leaf_ref(trans, root, cur);
3650 BUG_ON(ret);
3651 clean_tree_block(trans, root, cur);
3652 break;
3653 }
/* every slot of this node has been handled */
3654 if (path->slots[*level] >= btrfs_header_nritems(cur)) {
3655 clean_tree_block(trans, root, cur);
3656 break;
3657 }
3658
3659 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3660 blocksize = btrfs_level_size(root, *level - 1);
3661 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3662
/*
 * the child is read and locked before its reference count is looked up.
 * NOTE(review): 'next' is locked without a NULL check - confirm that
 * read_tree_block() cannot return NULL here.
 */
3663 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3664 btrfs_tree_lock(next);
3665
3666 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3667 &refs);
3668 BUG_ON(ret);
/*
 * the child has more than one reference: drop one reference on it, unlock
 * and release it, and continue with the next slot
 */
3669 if (refs > 1) {
3670 parent = path->nodes[*level];
3671 ret = btrfs_free_extent(trans, root, bytenr,
3672 blocksize, parent->start,
3673 btrfs_header_owner(parent),
3674 btrfs_header_generation(parent),
3675 *level - 1, 1);
3676 BUG_ON(ret);
3677 path->slots[*level]++;
3678 btrfs_tree_unlock(next);
3679 free_extent_buffer(next);
3680 continue;
3681 }
3682
/* step down into the child, which stays locked (recorded in path->locks[]) */
3683 *level = btrfs_header_level(next);
3684 path->nodes[*level] = next;
3685 path->slots[*level] = 0;
3686 path->locks[*level] = 1;
3687 cond_resched();
3688 }
3689out:
/* the path entry one level up is taken as the parent of the current block */
3690 parent = path->nodes[*level + 1];
3691 bytenr = path->nodes[*level]->start;
3692 blocksize = path->nodes[*level]->len;
3693
3694 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3695 parent->start, btrfs_header_owner(parent),
3696 btrfs_header_generation(parent), *level, 1);
3697 BUG_ON(ret);
3698
/* unlock only blocks that this walk locked itself */
3699 if (path->locks[*level]) {
3700 btrfs_tree_unlock(path->nodes[*level]);
3701 path->locks[*level] = 0;
3702 }
3703 free_extent_buffer(path->nodes[*level]);
3704 path->nodes[*level] = NULL;
3705 *level += 1;
3706 cond_resched();
3707 return 0;
3708}
3709
3710/*
3711 * helper for dropping snapshots. This walks back up the tree in the path
3712 * to find the first node higher up where we haven't yet gone through
3713 * all the slots
3714 */
/*
 * path:      position to climb from; blocks that are finished are freed
 *            and removed from it.
 * level:     in/out.  Starts as the lowest level to look at and is set to
 *            the level of the node the walk should continue from.
 * max_level: the climb stops before reaching this level.
 *
 * Returns 0 when a node with unvisited slots was found (its slot has been
 * advanced and the position recorded in the root item), 1 when the loop
 * ran out of levels.
 */
3715static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
3716 struct btrfs_root *root,
3717 struct btrfs_path *path,
3718 int *level, int max_level)
3719{
3720 u64 root_owner;
3721 u64 root_gen;
3722 struct btrfs_root_item *root_item = &root->root_item;
3723 int i;
3724 int slot;
3725 int ret;
3726
3727 for (i = *level; i < max_level && path->nodes[i]; i++) {
3728 slot = path->slots[i];
/*
 * this node still has slots after the current one: advance to the next
 * slot and remember its key and level as drop_progress / drop_level in the
 * root item
 */
3729 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3730 struct extent_buffer *node;
3731 struct btrfs_disk_key disk_key;
3732 node = path->nodes[i];
3733 path->slots[i]++;
3734 *level = i;
3735 WARN_ON(*level == 0);
3736 btrfs_node_key(node, &disk_key, path->slots[i]);
3737 memcpy(&root_item->drop_progress,
3738 &disk_key, sizeof(disk_key));
3739 root_item->drop_level = i;
3740 return 0;
3741 } else {
/*
 * all slots of this node are done: free the block itself and go one level
 * up.  root->node is used as its own parent, any other block uses the path
 * entry above it.
 */
3742 struct extent_buffer *parent;
3743 if (path->nodes[*level] == root->node)
3744 parent = path->nodes[*level];
3745 else
3746 parent = path->nodes[*level + 1];
3747
3748 root_owner = btrfs_header_owner(parent);
3749 root_gen = btrfs_header_generation(parent);
3750
3751 clean_tree_block(trans, root, path->nodes[*level]);
3752 ret = btrfs_free_extent(trans, root,
3753 path->nodes[*level]->start,
3754 path->nodes[*level]->len,
3755 parent->start, root_owner,
3756 root_gen, *level, 1);
3757 BUG_ON(ret);
3758 if (path->locks[*level]) {
3759 btrfs_tree_unlock(path->nodes[*level]);
3760 path->locks[*level] = 0;
3761 }
3762 free_extent_buffer(path->nodes[*level]);
3763 path->nodes[*level] = NULL;
/* keeps *level equal to the loop index of the next iteration */
3764 *level = i + 1;
3765 }
3766 }
3767 return 1;
3768}
3769
3770/*
3771 * drop the reference count on the tree rooted at 'snap'. This traverses
3772 * the tree freeing any blocks that have a ref count of zero after being
3773 * decremented.
3774 */
/*
 * trans: running transaction; the walk stops with -EAGAIN once
 *        trans->transaction->in_commit is set.
 * root:  root of the tree being dropped.  root->root_item.drop_progress and
 *        drop_level say where an earlier, interrupted call stopped.
 *
 * Expects fs_info->drop_mutex to be held (only warns otherwise).
 *
 * Returns 0 when the walk finished, -EAGAIN when it was interrupted by a
 * transaction commit, or a negative error from btrfs_search_slot().
 */
3775int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3776 *root)
3777{
3778 int ret = 0;
3779 int wret;
3780 int level;
3781 struct btrfs_path *path;
3782 int i;
3783 int orig_level;
3784 struct btrfs_root_item *root_item = &root->root_item;
3785
3786 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3787 path = btrfs_alloc_path();
3788 BUG_ON(!path);
3789
3790 level = btrfs_header_level(root->node);
3791 orig_level = level;
/* no recorded progress: start at the first slot of the root node */
3792 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3793 path->nodes[level] = root->node;
3794 extent_buffer_get(root->node);
3795 path->slots[level] = 0;
3796 } else {
/*
 * resume an earlier drop: search down to drop_level for the key saved in
 * drop_progress and continue from the slot that was found
 */
3797 struct btrfs_key key;
3798 struct btrfs_disk_key found_key;
3799 struct extent_buffer *node;
3800
3801 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
3802 level = root_item->drop_level;
3803 path->lowest_level = level;
3804 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3805 if (wret < 0) {
3806 ret = wret;
3807 goto out;
3808 }
3809 node = path->nodes[level];
3810 btrfs_node_key(node, &found_key, path->slots[level]);
3811 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
3812 sizeof(found_key)));
3813 /*
3814 * unlock our path, this is safe because only this
3815 * function is allowed to delete this snapshot
3816 */
3817 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3818 if (path->nodes[i] && path->locks[i]) {
3819 path->locks[i] = 0;
3820 btrfs_tree_unlock(path->nodes[i]);
3821 }
3822 }
3823 }
/*
 * alternate between walking down and walking back up until walk_up_tree()
 * reports that no levels are left or the transaction starts to commit
 */
3824 while(1) {
3825 wret = walk_down_tree(trans, root, path, &level);
3826 if (wret > 0)
3827 break;
3828 if (wret < 0)
3829 ret = wret;
3830
3831 wret = walk_up_tree(trans, root, path, &level,
3832 BTRFS_MAX_LEVEL);
3833 if (wret > 0)
3834 break;
3835 if (wret < 0)
3836 ret = wret;
/* a commit is in progress: stop here and report -EAGAIN */
3837 if (trans->transaction->in_commit) {
3838 ret = -EAGAIN;
3839 break;
3840 }
3841 atomic_inc(&root->fs_info->throttle_gen);
3842 wake_up(&root->fs_info->transaction_throttle);
3843 }
/* release whatever buffers are still held in the path */
3844 for (i = 0; i <= orig_level; i++) {
3845 if (path->nodes[i]) {
3846 free_extent_buffer(path->nodes[i]);
3847 path->nodes[i] = NULL;
3848 }
3849 }
3850out:
3851 btrfs_free_path(path);
3852 return ret;
3853}
3854
3855int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3856 struct btrfs_root *root,
3857 struct extent_buffer *node,
3858 struct extent_buffer *parent)
3859{
3860 struct btrfs_path *path;
3861 int level;
3862 int parent_level;
3863 int ret = 0;
3864 int wret;
3865
3866 path = btrfs_alloc_path();
3867 BUG_ON(!path);
3868
3869 BUG_ON(!btrfs_tree_locked(parent));
3870 parent_level = btrfs_header_level(parent);
3871 extent_buffer_get(parent);
3872 path->nodes[parent_level] = parent;
3873 path->slots[parent_level] = btrfs_header_nritems(parent);
3874
3875 BUG_ON(!btrfs_tree_locked(node));
3876 level = btrfs_header_level(node);
3877 extent_buffer_get(node);
3878 path->nodes[level] = node;
3879 path->slots[level] = 0;
3880
3881 while (1) {
3882 wret = walk_down_subtree(trans, root, path, &level);
3883 if (wret < 0)
3884 ret = wret;
3885 if (wret != 0)
3886 break;
3887
3888 wret = walk_up_tree(trans, root, path, &level, parent_level);
3889 if (wret < 0)
3890 ret = wret;
3891 if (wret != 0)
3892 break;
3893 }
3894
3895 btrfs_free_path(path);
3896 return ret;
3897}
3898
/*
 * Last page index of a readahead window of 'nr' pages that begins at
 * 'start', clamped so that it never goes past 'last'.
 */
static unsigned long calc_ra(unsigned long start, unsigned long last,
			     unsigned long nr)
{
	unsigned long window_end = start + nr - 1;

	return last < window_end ? last : window_end;
}
3904
3905static int noinline relocate_inode_pages(struct inode *inode, u64 start,
3906 u64 len)
3907{
3908 u64 page_start;
3909 u64 page_end;
3910 unsigned long first_index;
3911 unsigned long last_index;
3912 unsigned long i;
3913 struct page *page;
3914 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3915 struct file_ra_state *ra;
3916 struct btrfs_ordered_extent *ordered;
3917 unsigned int total_read = 0;
3918 unsigned int total_dirty = 0;
3919 int ret = 0;
3920
3921 ra = kzalloc(sizeof(*ra), GFP_NOFS);
3922
3923 mutex_lock(&inode->i_mutex);
3924 first_index = start >> PAGE_CACHE_SHIFT;
3925 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
3926
3927 /* make sure the dirty trick played by the caller work */
3928 ret = invalidate_inode_pages2_range(inode->i_mapping,
3929 first_index, last_index);
3930 if (ret)
3931 goto out_unlock;
3932
3933 file_ra_state_init(ra, inode->i_mapping);
3934
3935 for (i = first_index ; i <= last_index; i++) {
3936 if (total_read % ra->ra_pages == 0) {
3937 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
3938 calc_ra(i, last_index, ra->ra_pages));
3939 }
3940 total_read++;
3941again:
3942 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
3943 BUG_ON(1);
3944 page = grab_cache_page(inode->i_mapping, i);
3945 if (!page) {
3946 ret = -ENOMEM;
3947 goto out_unlock;
3948 }
3949 if (!PageUptodate(page)) {
3950 btrfs_readpage(NULL, page);
3951 lock_page(page);
3952 if (!PageUptodate(page)) {
3953 unlock_page(page);
3954 page_cache_release(page);
3955 ret = -EIO;
3956 goto out_unlock;
3957 }
3958 }
3959 wait_on_page_writeback(page);
3960
3961 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
3962 page_end = page_start + PAGE_CACHE_SIZE - 1;
3963 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3964
3965 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3966 if (ordered) {
3967 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3968 unlock_page(page);
3969 page_cache_release(page);
3970 btrfs_start_ordered_extent(inode, ordered, 1);
3971 btrfs_put_ordered_extent(ordered);
3972 goto again;
3973 }
3974 set_page_extent_mapped(page);
3975
3976 btrfs_set_extent_delalloc(inode, page_start, page_end);
3977 if (i == first_index)
3978 set_extent_bits(io_tree, page_start, page_end,
3979 EXTENT_BOUNDARY, GFP_NOFS);
3980
3981 set_page_dirty(page);
3982 total_dirty++;
3983
3984 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3985 unlock_page(page);
3986 page_cache_release(page);
3987 }
3988
3989out_unlock:
3990 kfree(ra);
3991 mutex_unlock(&inode->i_mutex);
3992 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
3993 return ret;
3994}
3995
3996static int noinline relocate_data_extent(struct inode *reloc_inode,
3997 struct btrfs_key *extent_key,
3998 u64 offset)
3999{
4000 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4001 struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
4002 struct extent_map *em;
4003 u64 start = extent_key->objectid - offset;
4004 u64 end = start + extent_key->offset - 1;
4005
4006 em = alloc_extent_map(GFP_NOFS);
4007 BUG_ON(!em || IS_ERR(em));
4008
4009 em->start = start;
4010 em->len = extent_key->offset;
4011 em->block_len = extent_key->offset;
4012 em->block_start = extent_key->objectid;
4013 em->bdev = root->fs_info->fs_devices->latest_bdev;
4014 set_bit(EXTENT_FLAG_PINNED, &em->flags);
4015
4016 /* setup extent map to cheat btrfs_readpage */
4017 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4018 while (1) {
4019 int ret;
4020 spin_lock(&em_tree->lock);
4021 ret = add_extent_mapping(em_tree, em);
4022 spin_unlock(&em_tree->lock);
4023 if (ret != -EEXIST) {
4024 free_extent_map(em);
4025 break;
4026 }
4027 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
4028 }
4029 unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4030
4031 return relocate_inode_pages(reloc_inode, start, extent_key->offset);
4032}
4033
/*
 * State of one walk over the back references of an extent, as maintained
 * by __next_ref_path().
 */
4034struct btrfs_ref_path {
/* start of the extent being walked; used as the block number when the level is below 0 */
4035 u64 extent_start;
/* block number recorded for each level of the current path; 0 means unset */
4036 u64 nodes[BTRFS_MAX_LEVEL];
/* root and generation read from the back reference at which the walk stopped */
4037 u64 root_objectid;
4038 u64 root_generation;
/* objectid and reference count read from the back reference at lowest_level */
4039 u64 owner_objectid;
4040 u32 num_refs;
/* level of the extent itself; -1 until a back reference that stores a level is seen */
4041 int lowest_level;
/* level the walk is currently positioned at */
4042 int current_level;
/* lowered while walking down whenever another reference is found at a lower level */
4043 int shared_level;
4044
/* not touched by __next_ref_path(); presumably filled in by the relocation code - TODO confirm */
4045 struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
4046 u64 new_nodes[BTRFS_MAX_LEVEL];
4047};
4048
/*
 * In-memory copy of the fields of one file extent item, as collected by
 * get_new_locations().  Each member holds the value returned by the
 * btrfs_file_extent_*() accessor of the same name.
 */
4049struct disk_extent {
4050 u64 ram_bytes;
4051 u64 disk_bytenr;
4052 u64 disk_num_bytes;
4053 u64 offset;
4054 u64 num_bytes;
4055 u8 compression;
4056 u8 encryption;
4057 u16 other_encoding;
4058};
4059
4060static int is_cowonly_root(u64 root_objectid)
4061{
4062 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
4063 root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
4064 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
4065 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
4066 root_objectid == BTRFS_TREE_LOG_OBJECTID)
4067 return 1;
4068 return 0;
4069}
4070
/*
 * Walk the back references of an extent in the extent tree, one reference
 * path per call, recording the visited block numbers in ref_path->nodes[].
 *
 * first_time: non-zero resets the level bookkeeping to -1 and starts by
 *             walking up from ref_path->extent_start; zero first walks
 *             down from current_level looking for the next back reference
 *             that has not been followed yet.
 *
 * Returns 0 when the walk stopped at a back reference whose key offset
 * equals its objectid, that belongs to a cow-only root, or whose generation
 * is the running transaction; a positive value when no further path was
 * found; -ENOMEM or a negative error from the tree search helpers.
 */
4071static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
4072 struct btrfs_root *extent_root,
4073 struct btrfs_ref_path *ref_path,
4074 int first_time)
4075{
4076 struct extent_buffer *leaf;
4077 struct btrfs_path *path;
4078 struct btrfs_extent_ref *ref;
4079 struct btrfs_key key;
4080 struct btrfs_key found_key;
4081 u64 bytenr;
4082 u32 nritems;
4083 int level;
4084 int ret = 1;
4085
4086 path = btrfs_alloc_path();
4087 if (!path)
4088 return -ENOMEM;
4089
4090 if (first_time) {
4091 ref_path->lowest_level = -1;
4092 ref_path->current_level = -1;
4093 ref_path->shared_level = -1;
4094 goto walk_up;
4095 }
/*
 * walk_down: starting one level below current_level, look for a back
 * reference of the block at 'level' that sorts after the parent followed
 * last time.  The parent's entry in nodes[] is cleared on the way.
 */
4096walk_down:
4097 level = ref_path->current_level - 1;
4098 while (level >= -1) {
4099 u64 parent;
4100 if (level < ref_path->lowest_level)
4101 break;
4102
/* below level 0 the block number is the start of the extent itself */
4103 if (level >= 0) {
4104 bytenr = ref_path->nodes[level];
4105 } else {
4106 bytenr = ref_path->extent_start;
4107 }
4108 BUG_ON(bytenr == 0);
4109
4110 parent = ref_path->nodes[level + 1];
4111 ref_path->nodes[level + 1] = 0;
4112 ref_path->current_level = level;
4113 BUG_ON(parent == 0);
4114
/* search just past the key of the parent that was followed before */
4115 key.objectid = bytenr;
4116 key.offset = parent + 1;
4117 key.type = BTRFS_EXTENT_REF_KEY;
4118
4119 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4120 if (ret < 0)
4121 goto out;
4122 BUG_ON(ret == 0);
4123
4124 leaf = path->nodes[0];
4125 nritems = btrfs_header_nritems(leaf);
4126 if (path->slots[0] >= nritems) {
4127 ret = btrfs_next_leaf(extent_root, path);
4128 if (ret < 0)
4129 goto out;
4130 if (ret > 0)
4131 goto next;
4132 leaf = path->nodes[0];
4133 }
4134
/*
 * another back reference of the same block exists: continue at 'found',
 * which sits inside the walk_up loop
 */
4135 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4136 if (found_key.objectid == bytenr &&
4137 found_key.type == BTRFS_EXTENT_REF_KEY) {
4138 if (level < ref_path->shared_level)
4139 ref_path->shared_level = level;
4140 goto found;
4141 }
/* no further reference at this level: try one level lower */
4142next:
4143 level--;
4144 btrfs_release_path(extent_root, path);
4145 cond_resched();
4146 }
4147 /* reached lowest level */
4148 ret = 1;
4149 goto out;
/*
 * walk_up: starting at current_level, follow the first back reference of
 * each block upwards and record the referencing block in nodes[].
 */
4150walk_up:
4151 level = ref_path->current_level;
4152 while (level < BTRFS_MAX_LEVEL - 1) {
4153 u64 ref_objectid;
4154 if (level >= 0) {
4155 bytenr = ref_path->nodes[level];
4156 } else {
4157 bytenr = ref_path->extent_start;
4158 }
4159 BUG_ON(bytenr == 0);
4160
/* offset 0 positions the search at the first back reference of bytenr */
4161 key.objectid = bytenr;
4162 key.offset = 0;
4163 key.type = BTRFS_EXTENT_REF_KEY;
4164
4165 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4166 if (ret < 0)
4167 goto out;
4168
4169 leaf = path->nodes[0];
4170 nritems = btrfs_header_nritems(leaf);
4171 if (path->slots[0] >= nritems) {
4172 ret = btrfs_next_leaf(extent_root, path);
4173 if (ret < 0)
4174 goto out;
4175 if (ret > 0) {
4176 /* the extent was freed by someone */
4177 if (ref_path->lowest_level == level)
4178 goto out;
4179 btrfs_release_path(extent_root, path);
4180 goto walk_down;
4181 }
4182 leaf = path->nodes[0];
4183 }
4184
4185 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4186 if (found_key.objectid != bytenr ||
4187 found_key.type != BTRFS_EXTENT_REF_KEY) {
4188 /* the extent was freed by someone */
4189 if (ref_path->lowest_level == level) {
4190 ret = 1;
4191 goto out;
4192 }
4193 btrfs_release_path(extent_root, path);
4194 goto walk_down;
4195 }
4196found:
4197 ref = btrfs_item_ptr(leaf, path->slots[0],
4198 struct btrfs_extent_ref);
4199 ref_objectid = btrfs_ref_objectid(leaf, ref);
/*
 * an objectid below BTRFS_FIRST_FREE_OBJECTID is used as a tree level
 * here: on the very first reference it fixes lowest_level, later it is
 * only compared against the current level
 */
4200 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4201 if (first_time) {
4202 level = (int)ref_objectid;
4203 BUG_ON(level >= BTRFS_MAX_LEVEL);
4204 ref_path->lowest_level = level;
4205 ref_path->current_level = level;
4206 ref_path->nodes[level] = bytenr;
4207 } else {
4208 WARN_ON(ref_objectid != level);
4209 }
4210 } else {
4211 WARN_ON(level != -1);
4212 }
4213 first_time = 0;
4214
/* remember owner and reference count of the reference at the lowest level */
4215 if (ref_path->lowest_level == level) {
4216 ref_path->owner_objectid = ref_objectid;
4217 ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
4218 }
4219
4220 /*
4221 * the block is tree root or the block isn't in reference
4222 * counted tree.
4223 */
4224 if (found_key.objectid == found_key.offset ||
4225 is_cowonly_root(btrfs_ref_root(leaf, ref))) {
4226 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4227 ref_path->root_generation =
4228 btrfs_ref_generation(leaf, ref);
4229 if (level < 0) {
4230 /* special reference from the tree log */
4231 ref_path->nodes[0] = found_key.offset;
4232 ref_path->current_level = 0;
4233 }
4234 ret = 0;
4235 goto out;
4236 }
4237
/* move up: the key offset of the reference is recorded as the block one level higher */
4238 level++;
4239 BUG_ON(ref_path->nodes[level] != 0);
4240 ref_path->nodes[level] = found_key.offset;
4241 ref_path->current_level = level;
4242
4243 /*
4244 * the reference was created in the running transaction,
4245 * no need to continue walking up.
4246 */
4247 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
4248 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4249 ref_path->root_generation =
4250 btrfs_ref_generation(leaf, ref);
4251 ret = 0;
4252 goto out;
4253 }
4254
4255 btrfs_release_path(extent_root, path);
4256 cond_resched();
4257 }
4258 /* reached max tree level, but no tree root found. */
4259 BUG();
4260out:
4261 btrfs_free_path(path);
4262 return ret;
4263}
4264
4265static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
4266 struct btrfs_root *extent_root,
4267 struct btrfs_ref_path *ref_path,
4268 u64 extent_start)
4269{
4270 memset(ref_path, 0, sizeof(*ref_path));
4271 ref_path->extent_start = extent_start;
4272
4273 return __next_ref_path(trans, extent_root, ref_path, 1);
4274}
4275
/*
 * Continue a back reference walk that was started with
 * btrfs_first_ref_path() and look up the next path.
 *
 * Returns whatever __next_ref_path() returns for a follow-up call.
 */
static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
			       struct btrfs_root *extent_root,
			       struct btrfs_ref_path *ref_path)
{
	int found = __next_ref_path(trans, extent_root, ref_path, 0);

	return found;
}
4282
4283static int noinline get_new_locations(struct inode *reloc_inode,
4284 struct btrfs_key *extent_key,
4285 u64 offset, int no_fragment,
4286 struct disk_extent **extents,
4287 int *nr_extents)
4288{
4289 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4290 struct btrfs_path *path;
4291 struct btrfs_file_extent_item *fi;
4292 struct extent_buffer *leaf;
4293 struct disk_extent *exts = *extents;
4294 struct btrfs_key found_key;
4295 u64 cur_pos;
4296 u64 last_byte;
4297 u32 nritems;
4298 int nr = 0;
4299 int max = *nr_extents;
4300 int ret;
4301
4302 WARN_ON(!no_fragment && *extents);
4303 if (!exts) {
4304 max = 1;
4305 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
4306 if (!exts)
4307 return -ENOMEM;
4308 }
4309
4310 path = btrfs_alloc_path();
4311 BUG_ON(!path);
4312
4313 cur_pos = extent_key->objectid - offset;
4314 last_byte = extent_key->objectid + extent_key->offset;
4315 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
4316 cur_pos, 0);
4317 if (ret < 0)
4318 goto out;
4319 if (ret > 0) {
4320 ret = -ENOENT;
4321 goto out;
4322 }
4323
4324 while (1) {
4325 leaf = path->nodes[0];
4326 nritems = btrfs_header_nritems(leaf);
4327 if (path->slots[0] >= nritems) {
4328 ret = btrfs_next_leaf(root, path);
4329 if (ret < 0)
4330 goto out;
4331 if (ret > 0)
4332 break;
4333 leaf = path->nodes[0];
4334 }
4335
4336 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4337 if (found_key.offset != cur_pos ||
4338 found_key.type != BTRFS_EXTENT_DATA_KEY ||
4339 found_key.objectid != reloc_inode->i_ino)
4340 break;
4341
4342 fi = btrfs_item_ptr(leaf, path->slots[0],
4343 struct btrfs_file_extent_item);
4344 if (btrfs_file_extent_type(leaf, fi) !=
4345 BTRFS_FILE_EXTENT_REG ||
4346 btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4347 break;
4348
4349 if (nr == max) {
4350 struct disk_extent *old = exts;
4351 max *= 2;
4352 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
4353 memcpy(exts, old, sizeof(*exts) * nr);
4354 if (old != *extents)
4355 kfree(old);
4356 }
4357
4358 exts[nr].disk_bytenr =
4359 btrfs_file_extent_disk_bytenr(leaf, fi);
4360 exts[nr].disk_num_bytes =
4361 btrfs_file_extent_disk_num_bytes(leaf, fi);
4362 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
4363 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4364 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
4365 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
4366 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
4367 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
4368 fi);
4369 BUG_ON(exts[nr].offset > 0);
4370 BUG_ON(exts[nr].compression || exts[nr].encryption);
4371 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
4372
4373 cur_pos += exts[nr].num_bytes;
4374 nr++;
4375
4376 if (cur_pos + offset >= last_byte)
4377 break;
4378
4379 if (no_fragment) {
4380 ret = 1;
4381 goto out;
4382 }
4383 path->slots[0]++;
4384 }
4385
4386 WARN_ON(cur_pos + offset > last_byte);
4387 if (cur_pos + offset < last_byte) {
4388 ret = -ENOENT;
4389 goto out;
4390 }
4391 ret = 0;
4392out:
4393 btrfs_free_path(path);
4394 if (ret) {
4395 if (exts != *extents)
4396 kfree(exts);
4397 } else {
4398 *extents = exts;
4399 *nr_extents = nr;
4400 }
4401 return ret;
4402}
4403
4404static int noinline replace_one_extent(struct btrfs_trans_handle *trans,
4405 struct btrfs_root *root,
4406 struct btrfs_path *path,
4407 struct btrfs_key *extent_key,
4408 struct btrfs_key *leaf_key,
4409 struct btrfs_ref_path *ref_path,
4410 struct disk_extent *new_extents,
4411 int nr_extents)
4412{
4413 struct extent_buffer *leaf;
4414 struct btrfs_file_extent_item *fi;
4415 struct inode *inode = NULL;
4416 struct btrfs_key key;
4417 u64 lock_start = 0;
4418 u64 lock_end = 0;
4419 u64 num_bytes;
4420 u64 ext_offset;
4421 u64 first_pos;
4422 u32 nritems;
4423 int nr_scaned = 0;
4424 int extent_locked = 0;
4425 int extent_type;
4426 int ret;
4427
4428 memcpy(&key, leaf_key, sizeof(key));
4429 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4430 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4431 if (key.objectid < ref_path->owner_objectid ||
4432 (key.objectid == ref_path->owner_objectid &&
4433 key.type < BTRFS_EXTENT_DATA_KEY)) {
4434 key.objectid = ref_path->owner_objectid;
4435 key.type = BTRFS_EXTENT_DATA_KEY;
4436 key.offset = 0;
4437 }
4438 }
4439
4440 while (1) {
4441 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4442 if (ret < 0)
4443 goto out;
4444
4445 leaf = path->nodes[0];
4446 nritems = btrfs_header_nritems(leaf);
4447next:
4448 if (extent_locked && ret > 0) {
4449 /*
4450 * the file extent item was modified by someone
4451 * before the extent got locked.
4452 */
4453 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4454 lock_end, GFP_NOFS);
4455 extent_locked = 0;
4456 }
4457
4458 if (path->slots[0] >= nritems) {
4459 if (++nr_scaned > 2)
4460 break;
4461
4462 BUG_ON(extent_locked);
4463 ret = btrfs_next_leaf(root, path);
4464 if (ret < 0)
4465 goto out;
4466 if (ret > 0)
4467 break;
4468 leaf = path->nodes[0];
4469 nritems = btrfs_header_nritems(leaf);
4470 }
4471
4472 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4473
4474 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4475 if ((key.objectid > ref_path->owner_objectid) ||
4476 (key.objectid == ref_path->owner_objectid &&
4477 key.type > BTRFS_EXTENT_DATA_KEY) ||
4478 (key.offset >= first_pos + extent_key->offset))
4479 break;
4480 }
4481
4482 if (inode && key.objectid != inode->i_ino) {
4483 BUG_ON(extent_locked);
4484 btrfs_release_path(root, path);
4485 mutex_unlock(&inode->i_mutex);
4486 iput(inode);
4487 inode = NULL;
4488 continue;
4489 }
4490
4491 if (key.type != BTRFS_EXTENT_DATA_KEY) {
4492 path->slots[0]++;
4493 ret = 1;
4494 goto next;
4495 }
4496 fi = btrfs_item_ptr(leaf, path->slots[0],
4497 struct btrfs_file_extent_item);
4498 extent_type = btrfs_file_extent_type(leaf, fi);
4499 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
4500 extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
4501 (btrfs_file_extent_disk_bytenr(leaf, fi) !=
4502 extent_key->objectid)) {
4503 path->slots[0]++;
4504 ret = 1;
4505 goto next;
4506 }
4507
4508 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4509 ext_offset = btrfs_file_extent_offset(leaf, fi);
4510
4511 if (first_pos > key.offset - ext_offset)
4512 first_pos = key.offset - ext_offset;
4513
4514 if (!extent_locked) {
4515 lock_start = key.offset;
4516 lock_end = lock_start + num_bytes - 1;
4517 } else {
4518 if (lock_start > key.offset ||
4519 lock_end + 1 < key.offset + num_bytes) {
4520 unlock_extent(&BTRFS_I(inode)->io_tree,
4521 lock_start, lock_end, GFP_NOFS);
4522 extent_locked = 0;
4523 }
4524 }
4525
4526 if (!inode) {
4527 btrfs_release_path(root, path);
4528
4529 inode = btrfs_iget_locked(root->fs_info->sb,
4530 key.objectid, root);
4531 if (inode->i_state & I_NEW) {
4532 BTRFS_I(inode)->root = root;
4533 BTRFS_I(inode)->location.objectid =
4534 key.objectid;
4535 BTRFS_I(inode)->location.type =
4536 BTRFS_INODE_ITEM_KEY;
4537 BTRFS_I(inode)->location.offset = 0;
4538 btrfs_read_locked_inode(inode);
4539 unlock_new_inode(inode);
4540 }
4541 /*
4542 * some code call btrfs_commit_transaction while
4543 * holding the i_mutex, so we can't use mutex_lock
4544 * here.
4545 */
4546 if (is_bad_inode(inode) ||
4547 !mutex_trylock(&inode->i_mutex)) {
4548 iput(inode);
4549 inode = NULL;
4550 key.offset = (u64)-1;
4551 goto skip;
4552 }
4553 }
4554
4555 if (!extent_locked) {
4556 struct btrfs_ordered_extent *ordered;
4557
4558 btrfs_release_path(root, path);
4559
4560 lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4561 lock_end, GFP_NOFS);
4562 ordered = btrfs_lookup_first_ordered_extent(inode,
4563 lock_end);
4564 if (ordered &&
4565 ordered->file_offset <= lock_end &&
4566 ordered->file_offset + ordered->len > lock_start) {
4567 unlock_extent(&BTRFS_I(inode)->io_tree,
4568 lock_start, lock_end, GFP_NOFS);
4569 btrfs_start_ordered_extent(inode, ordered, 1);
4570 btrfs_put_ordered_extent(ordered);
4571 key.offset += num_bytes;
4572 goto skip;
4573 }
4574 if (ordered)
4575 btrfs_put_ordered_extent(ordered);
4576
4577 extent_locked = 1;
4578 continue;
4579 }
4580
4581 if (nr_extents == 1) {
4582 /* update extent pointer in place */
4583 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4584 new_extents[0].disk_bytenr);
4585 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4586 new_extents[0].disk_num_bytes);
4587 btrfs_mark_buffer_dirty(leaf);
4588
4589 btrfs_drop_extent_cache(inode, key.offset,
4590 key.offset + num_bytes - 1, 0);
4591
4592 ret = btrfs_inc_extent_ref(trans, root,
4593 new_extents[0].disk_bytenr,
4594 new_extents[0].disk_num_bytes,
4595 leaf->start,
4596 root->root_key.objectid,
4597 trans->transid,
4598 key.objectid);
4599 BUG_ON(ret);
4600
4601 ret = btrfs_free_extent(trans, root,
4602 extent_key->objectid,
4603 extent_key->offset,
4604 leaf->start,
4605 btrfs_header_owner(leaf),
4606 btrfs_header_generation(leaf),
4607 key.objectid, 0);
4608 BUG_ON(ret);
4609
4610 btrfs_release_path(root, path);
4611 key.offset += num_bytes;
4612 } else {
4613 BUG_ON(1);
4614#if 0
4615 u64 alloc_hint;
4616 u64 extent_len;
4617 int i;
4618 /*
4619 * drop old extent pointer at first, then insert the
4620 * new pointers one bye one
4621 */
4622 btrfs_release_path(root, path);
4623 ret = btrfs_drop_extents(trans, root, inode, key.offset,
4624 key.offset + num_bytes,
4625 key.offset, &alloc_hint);
4626 BUG_ON(ret);
4627
4628 for (i = 0; i < nr_extents; i++) {
4629 if (ext_offset >= new_extents[i].num_bytes) {
4630 ext_offset -= new_extents[i].num_bytes;
4631 continue;
4632 }
4633 extent_len = min(new_extents[i].num_bytes -
4634 ext_offset, num_bytes);
4635
4636 ret = btrfs_insert_empty_item(trans, root,
4637 path, &key,
4638 sizeof(*fi));
4639 BUG_ON(ret);
4640
4641 leaf = path->nodes[0];
4642 fi = btrfs_item_ptr(leaf, path->slots[0],
4643 struct btrfs_file_extent_item);
4644 btrfs_set_file_extent_generation(leaf, fi,
4645 trans->transid);
4646 btrfs_set_file_extent_type(leaf, fi,
4647 BTRFS_FILE_EXTENT_REG);
4648 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4649 new_extents[i].disk_bytenr);
4650 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4651 new_extents[i].disk_num_bytes);
4652 btrfs_set_file_extent_ram_bytes(leaf, fi,
4653 new_extents[i].ram_bytes);
4654
4655 btrfs_set_file_extent_compression(leaf, fi,
4656 new_extents[i].compression);
4657 btrfs_set_file_extent_encryption(leaf, fi,
4658 new_extents[i].encryption);
4659 btrfs_set_file_extent_other_encoding(leaf, fi,
4660 new_extents[i].other_encoding);
4661
4662 btrfs_set_file_extent_num_bytes(leaf, fi,
4663 extent_len);
4664 ext_offset += new_extents[i].offset;
4665 btrfs_set_file_extent_offset(leaf, fi,
4666 ext_offset);
4667 btrfs_mark_buffer_dirty(leaf);
4668
4669 btrfs_drop_extent_cache(inode, key.offset,
4670 key.offset + extent_len - 1, 0);
4671
4672 ret = btrfs_inc_extent_ref(trans, root,
4673 new_extents[i].disk_bytenr,
4674 new_extents[i].disk_num_bytes,
4675 leaf->start,
4676 root->root_key.objectid,
4677 trans->transid, key.objectid);
4678 BUG_ON(ret);
4679 btrfs_release_path(root, path);
4680
4681 inode_add_bytes(inode, extent_len);
4682
4683 ext_offset = 0;
4684 num_bytes -= extent_len;
4685 key.offset += extent_len;
4686
4687 if (num_bytes == 0)
4688 break;
4689 }
4690 BUG_ON(i >= nr_extents);
4691#endif
4692 }
4693
4694 if (extent_locked) {
4695 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4696 lock_end, GFP_NOFS);
4697 extent_locked = 0;
4698 }
4699skip:
4700 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4701 key.offset >= first_pos + extent_key->offset)
4702 break;
4703
4704 cond_resched();
4705 }
4706 ret = 0;
4707out:
4708 btrfs_release_path(root, path);
4709 if (inode) {
4710 mutex_unlock(&inode->i_mutex);
4711 if (extent_locked) {
4712 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4713 lock_end, GFP_NOFS);
4714 }
4715 iput(inode);
4716 }
4717 return ret;
4718}
4719
4720int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4721 struct btrfs_root *root,
4722 struct extent_buffer *buf, u64 orig_start)
4723{
4724 int level;
4725 int ret;
4726
4727 BUG_ON(btrfs_header_generation(buf) != trans->transid);
4728 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
4729
4730 level = btrfs_header_level(buf);
4731 if (level == 0) {
4732 struct btrfs_leaf_ref *ref;
4733 struct btrfs_leaf_ref *orig_ref;
4734
4735 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
4736 if (!orig_ref)
4737 return -ENOENT;
4738
4739 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
4740 if (!ref) {
4741 btrfs_free_leaf_ref(root, orig_ref);
4742 return -ENOMEM;
4743 }
4744
4745 ref->nritems = orig_ref->nritems;
4746 memcpy(ref->extents, orig_ref->extents,
4747 sizeof(ref->extents[0]) * ref->nritems);
4748
4749 btrfs_free_leaf_ref(root, orig_ref);
4750
4751 ref->root_gen = trans->transid;
4752 ref->bytenr = buf->start;
4753 ref->owner = btrfs_header_owner(buf);
4754 ref->generation = btrfs_header_generation(buf);
4755 ret = btrfs_add_leaf_ref(root, ref, 0);
4756 WARN_ON(ret);
4757 btrfs_free_leaf_ref(root, ref);
4758 }
4759 return 0;
4760}
4761
4762static int noinline invalidate_extent_cache(struct btrfs_root *root,
4763 struct extent_buffer *leaf,
4764 struct btrfs_block_group_cache *group,
4765 struct btrfs_root *target_root)
4766{
4767 struct btrfs_key key;
4768 struct inode *inode = NULL;
4769 struct btrfs_file_extent_item *fi;
4770 u64 num_bytes;
4771 u64 skip_objectid = 0;
4772 u32 nritems;
4773 u32 i;
4774
4775 nritems = btrfs_header_nritems(leaf);
4776 for (i = 0; i < nritems; i++) {
4777 btrfs_item_key_to_cpu(leaf, &key, i);
4778 if (key.objectid == skip_objectid ||
4779 key.type != BTRFS_EXTENT_DATA_KEY)
4780 continue;
4781 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4782 if (btrfs_file_extent_type(leaf, fi) ==
4783 BTRFS_FILE_EXTENT_INLINE)
4784 continue;
4785 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4786 continue;
4787 if (!inode || inode->i_ino != key.objectid) {
4788 iput(inode);
4789 inode = btrfs_ilookup(target_root->fs_info->sb,
4790 key.objectid, target_root, 1);
4791 }
4792 if (!inode) {
4793 skip_objectid = key.objectid;
4794 continue;
4795 }
4796 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4797
4798 lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4799 key.offset + num_bytes - 1, GFP_NOFS);
4800 btrfs_drop_extent_cache(inode, key.offset,
4801 key.offset + num_bytes - 1, 1);
4802 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4803 key.offset + num_bytes - 1, GFP_NOFS);
4804 cond_resched();
4805 }
4806 iput(inode);
4807 return 0;
4808}
4809
4810static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
4811 struct btrfs_root *root,
4812 struct extent_buffer *leaf,
4813 struct btrfs_block_group_cache *group,
4814 struct inode *reloc_inode)
4815{
4816 struct btrfs_key key;
4817 struct btrfs_key extent_key;
4818 struct btrfs_file_extent_item *fi;
4819 struct btrfs_leaf_ref *ref;
4820 struct disk_extent *new_extent;
4821 u64 bytenr;
4822 u64 num_bytes;
4823 u32 nritems;
4824 u32 i;
4825 int ext_index;
4826 int nr_extent;
4827 int ret;
4828
4829 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
4830 BUG_ON(!new_extent);
4831
4832 ref = btrfs_lookup_leaf_ref(root, leaf->start);
4833 BUG_ON(!ref);
4834
4835 ext_index = -1;
4836 nritems = btrfs_header_nritems(leaf);
4837 for (i = 0; i < nritems; i++) {
4838 btrfs_item_key_to_cpu(leaf, &key, i);
4839 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4840 continue;
4841 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4842 if (btrfs_file_extent_type(leaf, fi) ==
4843 BTRFS_FILE_EXTENT_INLINE)
4844 continue;
4845 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4846 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4847 if (bytenr == 0)
4848 continue;
4849
4850 ext_index++;
4851 if (bytenr >= group->key.objectid + group->key.offset ||
4852 bytenr + num_bytes <= group->key.objectid)
4853 continue;
4854
4855 extent_key.objectid = bytenr;
4856 extent_key.offset = num_bytes;
4857 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4858 nr_extent = 1;
4859 ret = get_new_locations(reloc_inode, &extent_key,
4860 group->key.objectid, 1,
4861 &new_extent, &nr_extent);
4862 if (ret > 0)
4863 continue;
4864 BUG_ON(ret < 0);
4865
4866 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
4867 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
4868 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
4869 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
4870
4871 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4872 new_extent->disk_bytenr);
4873 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4874 new_extent->disk_num_bytes);
4875 btrfs_mark_buffer_dirty(leaf);
4876
4877 ret = btrfs_inc_extent_ref(trans, root,
4878 new_extent->disk_bytenr,
4879 new_extent->disk_num_bytes,
4880 leaf->start,
4881 root->root_key.objectid,
4882 trans->transid, key.objectid);
4883 BUG_ON(ret);
4884 ret = btrfs_free_extent(trans, root,
4885 bytenr, num_bytes, leaf->start,
4886 btrfs_header_owner(leaf),
4887 btrfs_header_generation(leaf),
4888 key.objectid, 0);
4889 BUG_ON(ret);
4890 cond_resched();
4891 }
4892 kfree(new_extent);
4893 BUG_ON(ext_index + 1 != ref->nritems);
4894 btrfs_free_leaf_ref(root, ref);
4895 return 0;
4896}
4897
4898int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
4899 struct btrfs_root *root)
4900{
4901 struct btrfs_root *reloc_root;
4902 int ret;
4903
4904 if (root->reloc_root) {
4905 reloc_root = root->reloc_root;
4906 root->reloc_root = NULL;
4907 list_add(&reloc_root->dead_list,
4908 &root->fs_info->dead_reloc_roots);
4909
4910 btrfs_set_root_bytenr(&reloc_root->root_item,
4911 reloc_root->node->start);
4912 btrfs_set_root_level(&root->root_item,
4913 btrfs_header_level(reloc_root->node));
4914 memset(&reloc_root->root_item.drop_progress, 0,
4915 sizeof(struct btrfs_disk_key));
4916 reloc_root->root_item.drop_level = 0;
4917
4918 ret = btrfs_update_root(trans, root->fs_info->tree_root,
4919 &reloc_root->root_key,
4920 &reloc_root->root_item);
4921 BUG_ON(ret);
4922 }
4923 return 0;
4924}
4925
4926int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
4927{
4928 struct btrfs_trans_handle *trans;
4929 struct btrfs_root *reloc_root;
4930 struct btrfs_root *prev_root = NULL;
4931 struct list_head dead_roots;
4932 int ret;
4933 unsigned long nr;
4934
4935 INIT_LIST_HEAD(&dead_roots);
4936 list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
4937
4938 while (!list_empty(&dead_roots)) {
4939 reloc_root = list_entry(dead_roots.prev,
4940 struct btrfs_root, dead_list);
4941 list_del_init(&reloc_root->dead_list);
4942
4943 BUG_ON(reloc_root->commit_root != NULL);
4944 while (1) {
4945 trans = btrfs_join_transaction(root, 1);
4946 BUG_ON(!trans);
4947
4948 mutex_lock(&root->fs_info->drop_mutex);
4949 ret = btrfs_drop_snapshot(trans, reloc_root);
4950 if (ret != -EAGAIN)
4951 break;
4952 mutex_unlock(&root->fs_info->drop_mutex);
4953
4954 nr = trans->blocks_used;
4955 ret = btrfs_end_transaction(trans, root);
4956 BUG_ON(ret);
4957 btrfs_btree_balance_dirty(root, nr);
4958 }
4959
4960 free_extent_buffer(reloc_root->node);
4961
4962 ret = btrfs_del_root(trans, root->fs_info->tree_root,
4963 &reloc_root->root_key);
4964 BUG_ON(ret);
4965 mutex_unlock(&root->fs_info->drop_mutex);
4966
4967 nr = trans->blocks_used;
4968 ret = btrfs_end_transaction(trans, root);
4969 BUG_ON(ret);
4970 btrfs_btree_balance_dirty(root, nr);
4971
4972 kfree(prev_root);
4973 prev_root = reloc_root;
4974 }
4975 if (prev_root) {
4976 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
4977 kfree(prev_root);
4978 }
4979 return 0;
4980}
4981
4982int btrfs_add_dead_reloc_root(struct btrfs_root *root)
4983{
4984 list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
4985 return 0;
4986}
4987
4988int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
4989{
4990 struct btrfs_root *reloc_root;
4991 struct btrfs_trans_handle *trans;
4992 struct btrfs_key location;
4993 int found;
4994 int ret;
4995
4996 mutex_lock(&root->fs_info->tree_reloc_mutex);
4997 ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
4998 BUG_ON(ret);
4999 found = !list_empty(&root->fs_info->dead_reloc_roots);
5000 mutex_unlock(&root->fs_info->tree_reloc_mutex);
5001
5002 if (found) {
5003 trans = btrfs_start_transaction(root, 1);
5004 BUG_ON(!trans);
5005 ret = btrfs_commit_transaction(trans, root);
5006 BUG_ON(ret);
5007 }
5008
5009 location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5010 location.offset = (u64)-1;
5011 location.type = BTRFS_ROOT_ITEM_KEY;
5012
5013 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
5014 BUG_ON(!reloc_root);
5015 btrfs_orphan_cleanup(reloc_root);
5016 return 0;
5017}
5018
5019static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
5020 struct btrfs_root *root)
5021{
5022 struct btrfs_root *reloc_root;
5023 struct extent_buffer *eb;
5024 struct btrfs_root_item *root_item;
5025 struct btrfs_key root_key;
5026 int ret;
5027
5028 BUG_ON(!root->ref_cows);
5029 if (root->reloc_root)
5030 return 0;
5031
5032 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
5033 BUG_ON(!root_item);
5034
5035 ret = btrfs_copy_root(trans, root, root->commit_root,
5036 &eb, BTRFS_TREE_RELOC_OBJECTID);
5037 BUG_ON(ret);
5038
5039 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
5040 root_key.offset = root->root_key.objectid;
5041 root_key.type = BTRFS_ROOT_ITEM_KEY;
5042
5043 memcpy(root_item, &root->root_item, sizeof(root_item));
5044 btrfs_set_root_refs(root_item, 0);
5045 btrfs_set_root_bytenr(root_item, eb->start);
5046 btrfs_set_root_level(root_item, btrfs_header_level(eb));
5047 btrfs_set_root_generation(root_item, trans->transid);
5048
5049 btrfs_tree_unlock(eb);
5050 free_extent_buffer(eb);
5051
5052 ret = btrfs_insert_root(trans, root->fs_info->tree_root,
5053 &root_key, root_item);
5054 BUG_ON(ret);
5055 kfree(root_item);
5056
5057 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
5058 &root_key);
5059 BUG_ON(!reloc_root);
5060 reloc_root->last_trans = trans->transid;
5061 reloc_root->commit_root = NULL;
5062 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
5063
5064 root->reloc_root = reloc_root;
5065 return 0;
5066}
5067
/*
 * Core function of space balance.
 *
 * The idea is using reloc trees to relocate tree blocks in reference
 * counted roots. There is one reloc tree for each subvol, and all
 * reloc trees share same root key objectid. Reloc trees are snapshots
 * of the latest committed roots of subvols (root->commit_root).
 *
 * To relocate a tree block referenced by a subvol, there are two steps.
 * COW the block through subvol's reloc tree, then update block pointer
 * in the subvol to point to the new block. Since all reloc trees share
 * same root key objectid, doing special handling for tree blocks owned
 * by them is easy. Once a tree block has been COWed in one reloc tree,
 * we can use the resulting new block directly when the same block is
 * required to COW again through other reloc trees. In this way, relocated
 * tree blocks are shared between reloc trees, so they are also shared
 * between subvols.
 *
 * For roots that are not reference counted (!root->ref_cows) no reloc
 * tree is used: the block is simply COWed by a search with cow enabled.
 *
 * @first_key is the key the searches are keyed on, @ref_path carries the
 * per-level node_keys/new_nodes arrays that record the relocated blocks.
 * @group and @reloc_inode are only used when the reference belongs to
 * file data (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID).
 *
 * Always returns 0; failures are fatal (BUG_ON).
 */
static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_key *first_key,
				      struct btrfs_ref_path *ref_path,
				      struct btrfs_block_group_cache *group,
				      struct inode *reloc_inode)
{
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb = NULL;
	struct btrfs_key *keys;
	u64 *nodes;
	int level;
	int shared_level;
	int lowest_level = 0;
	int ret;

	/* for tree block references owner_objectid holds the block level */
	if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
		lowest_level = ref_path->owner_objectid;

	if (!root->ref_cows) {
		/* searching with cow set is all that is done for this root */
		path->lowest_level = lowest_level;
		ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
		BUG_ON(ret < 0);
		path->lowest_level = 0;
		btrfs_release_path(root, path);
		return 0;
	}

	/* held until the end of the function */
	mutex_lock(&root->fs_info->tree_reloc_mutex);
	ret = init_reloc_tree(trans, root);
	BUG_ON(ret);
	reloc_root = root->reloc_root;

	shared_level = ref_path->shared_level;
	ref_path->shared_level = BTRFS_MAX_LEVEL - 1;

	/* forget what was recorded for the levels above shared_level */
	keys = ref_path->node_keys;
	nodes = ref_path->new_nodes;
	memset(&keys[shared_level + 1], 0,
	       sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
	memset(&nodes[shared_level + 1], 0,
	       sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));

	if (nodes[lowest_level] == 0) {
		/*
		 * nothing recorded at this level yet: COW the path through
		 * the reloc tree and record the blocks and their first keys
		 */
		path->lowest_level = lowest_level;
		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
					0, 1);
		BUG_ON(ret);
		for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
			eb = path->nodes[level];
			if (!eb || eb == reloc_root->node)
				break;
			nodes[level] = eb->start;
			if (level == 0)
				btrfs_item_key_to_cpu(eb, &keys[level], 0);
			else
				btrfs_node_key_to_cpu(eb, &keys[level], 0);
		}
		if (nodes[0] &&
		    ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
			/* data reference: fix up the extent pointers in the new leaf */
			eb = path->nodes[0];
			ret = replace_extents_in_leaf(trans, reloc_root, eb,
						      group, reloc_inode);
			BUG_ON(ret);
		}
		btrfs_release_path(reloc_root, path);
	} else {
		/* blocks already recorded: merge them into this reloc tree */
		ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
				       lowest_level);
		BUG_ON(ret);
	}

	/*
	 * replace tree blocks in the fs tree with tree blocks in
	 * the reloc tree.
	 */
	ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
	BUG_ON(ret < 0);

	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
					0, 0);
		BUG_ON(ret);
		/* keep the leaf alive after the path is released */
		extent_buffer_get(path->nodes[0]);
		eb = path->nodes[0];
		btrfs_release_path(reloc_root, path);
		ret = invalidate_extent_cache(reloc_root, eb, group, root);
		BUG_ON(ret);
		free_extent_buffer(eb);
	}

	mutex_unlock(&root->fs_info->tree_reloc_mutex);
	path->lowest_level = 0;
	return 0;
}
5182
5183static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
5184 struct btrfs_root *root,
5185 struct btrfs_path *path,
5186 struct btrfs_key *first_key,
5187 struct btrfs_ref_path *ref_path)
5188{
5189 int ret;
5190
5191 ret = relocate_one_path(trans, root, path, first_key,
5192 ref_path, NULL, NULL);
5193 BUG_ON(ret);
5194
5195 if (root == root->fs_info->extent_root)
5196 btrfs_extent_post_op(trans, root);
5197
5198 return 0;
5199}
5200
5201static int noinline del_extent_zero(struct btrfs_trans_handle *trans,
5202 struct btrfs_root *extent_root,
5203 struct btrfs_path *path,
5204 struct btrfs_key *extent_key)
5205{
5206 int ret;
5207
5208 ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
5209 if (ret)
5210 goto out;
5211 ret = btrfs_del_item(trans, extent_root, path);
5212out:
5213 btrfs_release_path(extent_root, path);
5214 return ret;
5215}
5216
5217static struct btrfs_root noinline *read_ref_root(struct btrfs_fs_info *fs_info,
5218 struct btrfs_ref_path *ref_path)
5219{
5220 struct btrfs_key root_key;
5221
5222 root_key.objectid = ref_path->root_objectid;
5223 root_key.type = BTRFS_ROOT_ITEM_KEY;
5224 if (is_cowonly_root(ref_path->root_objectid))
5225 root_key.offset = 0;
5226 else
5227 root_key.offset = (u64)-1;
5228
5229 return btrfs_read_fs_root_no_name(fs_info, &root_key);
5230}
5231
5232static int noinline relocate_one_extent(struct btrfs_root *extent_root,
5233 struct btrfs_path *path,
5234 struct btrfs_key *extent_key,
5235 struct btrfs_block_group_cache *group,
5236 struct inode *reloc_inode, int pass)
5237{
5238 struct btrfs_trans_handle *trans;
5239 struct btrfs_root *found_root;
5240 struct btrfs_ref_path *ref_path = NULL;
5241 struct disk_extent *new_extents = NULL;
5242 int nr_extents = 0;
5243 int loops;
5244 int ret;
5245 int level;
5246 struct btrfs_key first_key;
5247 u64 prev_block = 0;
5248
5249
5250 trans = btrfs_start_transaction(extent_root, 1);
5251 BUG_ON(!trans);
5252
5253 if (extent_key->objectid == 0) {
5254 ret = del_extent_zero(trans, extent_root, path, extent_key);
5255 goto out;
5256 }
5257
5258 ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
5259 if (!ref_path) {
5260 ret = -ENOMEM;
5261 goto out;
5262 }
5263
5264 for (loops = 0; ; loops++) {
5265 if (loops == 0) {
5266 ret = btrfs_first_ref_path(trans, extent_root, ref_path,
5267 extent_key->objectid);
5268 } else {
5269 ret = btrfs_next_ref_path(trans, extent_root, ref_path);
5270 }
5271 if (ret < 0)
5272 goto out;
5273 if (ret > 0)
5274 break;
5275
5276 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5277 ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
5278 continue;
5279
5280 found_root = read_ref_root(extent_root->fs_info, ref_path);
5281 BUG_ON(!found_root);
5282 /*
5283 * for reference counted tree, only process reference paths
5284 * rooted at the latest committed root.
5285 */
5286 if (found_root->ref_cows &&
5287 ref_path->root_generation != found_root->root_key.offset)
5288 continue;
5289
5290 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5291 if (pass == 0) {
5292 /*
5293 * copy data extents to new locations
5294 */
5295 u64 group_start = group->key.objectid;
5296 ret = relocate_data_extent(reloc_inode,
5297 extent_key,
5298 group_start);
5299 if (ret < 0)
5300 goto out;
5301 break;
5302 }
5303 level = 0;
5304 } else {
5305 level = ref_path->owner_objectid;
5306 }
5307
5308 if (prev_block != ref_path->nodes[level]) {
5309 struct extent_buffer *eb;
5310 u64 block_start = ref_path->nodes[level];
5311 u64 block_size = btrfs_level_size(found_root, level);
5312
5313 eb = read_tree_block(found_root, block_start,
5314 block_size, 0);
5315 btrfs_tree_lock(eb);
5316 BUG_ON(level != btrfs_header_level(eb));
5317
5318 if (level == 0)
5319 btrfs_item_key_to_cpu(eb, &first_key, 0);
5320 else
5321 btrfs_node_key_to_cpu(eb, &first_key, 0);
5322
5323 btrfs_tree_unlock(eb);
5324 free_extent_buffer(eb);
5325 prev_block = block_start;
5326 }
5327
5328 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
5329 pass >= 2) {
5330 /*
5331 * use fallback method to process the remaining
5332 * references.
5333 */
5334 if (!new_extents) {
5335 u64 group_start = group->key.objectid;
5336 new_extents = kmalloc(sizeof(*new_extents),
5337 GFP_NOFS);
5338 nr_extents = 1;
5339 ret = get_new_locations(reloc_inode,
5340 extent_key,
5341 group_start, 1,
5342 &new_extents,
5343 &nr_extents);
5344 if (ret)
5345 goto out;
5346 }
5347 btrfs_record_root_in_trans(found_root);
5348 ret = replace_one_extent(trans, found_root,
5349 path, extent_key,
5350 &first_key, ref_path,
5351 new_extents, nr_extents);
5352 if (ret < 0)
5353 goto out;
5354 continue;
5355 }
5356
5357 btrfs_record_root_in_trans(found_root);
5358 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
5359 ret = relocate_tree_block(trans, found_root, path,
5360 &first_key, ref_path);
5361 } else {
5362 /*
5363 * try to update data extent references while
5364 * keeping metadata shared between snapshots.
5365 */
5366 ret = relocate_one_path(trans, found_root, path,
5367 &first_key, ref_path,
5368 group, reloc_inode);
5369 }
5370 if (ret < 0)
5371 goto out;
5372 }
5373 ret = 0;
5374out:
5375 btrfs_end_transaction(trans, extent_root);
5376 kfree(new_extents);
5377 kfree(ref_path);
5378 return ret;
5379}
5380
5381static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5382{
5383 u64 num_devices;
5384 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
5385 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
5386
5387 num_devices = root->fs_info->fs_devices->rw_devices;
5388 if (num_devices == 1) {
5389 stripped |= BTRFS_BLOCK_GROUP_DUP;
5390 stripped = flags & ~stripped;
5391
5392 /* turn raid0 into single device chunks */
5393 if (flags & BTRFS_BLOCK_GROUP_RAID0)
5394 return stripped;
5395
5396 /* turn mirroring into duplication */
5397 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
5398 BTRFS_BLOCK_GROUP_RAID10))
5399 return stripped | BTRFS_BLOCK_GROUP_DUP;
5400 return flags;
5401 } else {
5402 /* they already had raid on here, just return */
5403 if (flags & stripped)
5404 return flags;
5405
5406 stripped |= BTRFS_BLOCK_GROUP_DUP;
5407 stripped = flags & ~stripped;
5408
5409 /* switch duplicated blocks with raid1 */
5410 if (flags & BTRFS_BLOCK_GROUP_DUP)
5411 return stripped | BTRFS_BLOCK_GROUP_RAID1;
5412
5413 /* turn single device chunks into raid0 */
5414 return stripped | BTRFS_BLOCK_GROUP_RAID0;
5415 }
5416 return flags;
5417}
5418
5419int __alloc_chunk_for_shrink(struct btrfs_root *root,
5420 struct btrfs_block_group_cache *shrink_block_group,
5421 int force)
5422{
5423 struct btrfs_trans_handle *trans;
5424 u64 new_alloc_flags;
5425 u64 calc;
5426
5427 spin_lock(&shrink_block_group->lock);
5428 if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
5429 spin_unlock(&shrink_block_group->lock);
5430
5431 trans = btrfs_start_transaction(root, 1);
5432 spin_lock(&shrink_block_group->lock);
5433
5434 new_alloc_flags = update_block_group_flags(root,
5435 shrink_block_group->flags);
5436 if (new_alloc_flags != shrink_block_group->flags) {
5437 calc =
5438 btrfs_block_group_used(&shrink_block_group->item);
5439 } else {
5440 calc = shrink_block_group->key.offset;
5441 }
5442 spin_unlock(&shrink_block_group->lock);
5443
5444 do_chunk_alloc(trans, root->fs_info->extent_root,
5445 calc + 2 * 1024 * 1024, new_alloc_flags, force);
5446
5447 btrfs_end_transaction(trans, root);
5448 } else
5449 spin_unlock(&shrink_block_group->lock);
5450 return 0;
5451}
5452
5453static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5454 struct btrfs_root *root,
5455 u64 objectid, u64 size)
5456{
5457 struct btrfs_path *path;
5458 struct btrfs_inode_item *item;
5459 struct extent_buffer *leaf;
5460 int ret;
5461
5462 path = btrfs_alloc_path();
5463 if (!path)
5464 return -ENOMEM;
5465
5466 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
5467 if (ret)
5468 goto out;
5469
5470 leaf = path->nodes[0];
5471 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
5472 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
5473 btrfs_set_inode_generation(leaf, item, 1);
5474 btrfs_set_inode_size(leaf, item, size);
5475 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
5476 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NODATASUM |
5477 BTRFS_INODE_NOCOMPRESS);
5478 btrfs_mark_buffer_dirty(leaf);
5479 btrfs_release_path(root, path);
5480out:
5481 btrfs_free_path(path);
5482 return ret;
5483}
5484
5485static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
5486 struct btrfs_block_group_cache *group)
5487{
5488 struct inode *inode = NULL;
5489 struct btrfs_trans_handle *trans;
5490 struct btrfs_root *root;
5491 struct btrfs_key root_key;
5492 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
5493 int err = 0;
5494
5495 root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5496 root_key.type = BTRFS_ROOT_ITEM_KEY;
5497 root_key.offset = (u64)-1;
5498 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
5499 if (IS_ERR(root))
5500 return ERR_CAST(root);
5501
5502 trans = btrfs_start_transaction(root, 1);
5503 BUG_ON(!trans);
5504
5505 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
5506 if (err)
5507 goto out;
5508
5509 err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
5510 BUG_ON(err);
5511
5512 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
5513 group->key.offset, 0, group->key.offset,
5514 0, 0, 0);
5515 BUG_ON(err);
5516
5517 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
5518 if (inode->i_state & I_NEW) {
5519 BTRFS_I(inode)->root = root;
5520 BTRFS_I(inode)->location.objectid = objectid;
5521 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5522 BTRFS_I(inode)->location.offset = 0;
5523 btrfs_read_locked_inode(inode);
5524 unlock_new_inode(inode);
5525 BUG_ON(is_bad_inode(inode));
5526 } else {
5527 BUG_ON(1);
5528 }
5529
5530 err = btrfs_orphan_add(trans, inode);
5531out:
5532 btrfs_end_transaction(trans, root);
5533 if (err) {
5534 if (inode)
5535 iput(inode);
5536 inode = ERR_PTR(err);
5537 }
5538 return inode;
5539}
5540
5541int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
5542{
5543 struct btrfs_trans_handle *trans;
5544 struct btrfs_path *path;
5545 struct btrfs_fs_info *info = root->fs_info;
5546 struct extent_buffer *leaf;
5547 struct inode *reloc_inode;
5548 struct btrfs_block_group_cache *block_group;
5549 struct btrfs_key key;
5550 u64 skipped;
5551 u64 cur_byte;
5552 u64 total_found;
5553 u32 nritems;
5554 int ret;
5555 int progress;
5556 int pass = 0;
5557
5558 root = root->fs_info->extent_root;
5559
5560 block_group = btrfs_lookup_block_group(info, group_start);
5561 BUG_ON(!block_group);
5562
5563 printk("btrfs relocating block group %llu flags %llu\n",
5564 (unsigned long long)block_group->key.objectid,
5565 (unsigned long long)block_group->flags);
5566
5567 path = btrfs_alloc_path();
5568 BUG_ON(!path);
5569
5570 reloc_inode = create_reloc_inode(info, block_group);
5571 BUG_ON(IS_ERR(reloc_inode));
5572
5573 __alloc_chunk_for_shrink(root, block_group, 1);
5574 set_block_group_readonly(block_group);
5575
5576 btrfs_start_delalloc_inodes(info->tree_root);
5577 btrfs_wait_ordered_extents(info->tree_root, 0);
5578again:
5579 skipped = 0;
5580 total_found = 0;
5581 progress = 0;
5582 key.objectid = block_group->key.objectid;
5583 key.offset = 0;
5584 key.type = 0;
5585 cur_byte = key.objectid;
5586
5587 trans = btrfs_start_transaction(info->tree_root, 1);
5588 btrfs_commit_transaction(trans, info->tree_root);
5589
5590 mutex_lock(&root->fs_info->cleaner_mutex);
5591 btrfs_clean_old_snapshots(info->tree_root);
5592 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
5593 mutex_unlock(&root->fs_info->cleaner_mutex);
5594
5595 while(1) {
5596 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5597 if (ret < 0)
5598 goto out;
5599next:
5600 leaf = path->nodes[0];
5601 nritems = btrfs_header_nritems(leaf);
5602 if (path->slots[0] >= nritems) {
5603 ret = btrfs_next_leaf(root, path);
5604 if (ret < 0)
5605 goto out;
5606 if (ret == 1) {
5607 ret = 0;
5608 break;
5609 }
5610 leaf = path->nodes[0];
5611 nritems = btrfs_header_nritems(leaf);
5612 }
5613
5614 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5615
5616 if (key.objectid >= block_group->key.objectid +
5617 block_group->key.offset)
5618 break;
5619
5620 if (progress && need_resched()) {
5621 btrfs_release_path(root, path);
5622 cond_resched();
5623 progress = 0;
5624 continue;
5625 }
5626 progress = 1;
5627
5628 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
5629 key.objectid + key.offset <= cur_byte) {
5630 path->slots[0]++;
5631 goto next;
5632 }
5633
5634 total_found++;
5635 cur_byte = key.objectid + key.offset;
5636 btrfs_release_path(root, path);
5637
5638 __alloc_chunk_for_shrink(root, block_group, 0);
5639 ret = relocate_one_extent(root, path, &key, block_group,
5640 reloc_inode, pass);
5641 BUG_ON(ret < 0);
5642 if (ret > 0)
5643 skipped++;
5644
5645 key.objectid = cur_byte;
5646 key.type = 0;
5647 key.offset = 0;
5648 }
5649
5650 btrfs_release_path(root, path);
5651
5652 if (pass == 0) {
5653 btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
5654 invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
5655 WARN_ON(reloc_inode->i_mapping->nrpages);
5656 }
5657
5658 if (total_found > 0) {
5659 printk("btrfs found %llu extents in pass %d\n",
5660 (unsigned long long)total_found, pass);
5661 pass++;
5662 if (total_found == skipped && pass > 2) {
5663 iput(reloc_inode);
5664 reloc_inode = create_reloc_inode(info, block_group);
5665 pass = 0;
5666 }
5667 goto again;
5668 }
5669
5670 /* delete reloc_inode */
5671 iput(reloc_inode);
5672
5673 /* unpin extents in this range */
5674 trans = btrfs_start_transaction(info->tree_root, 1);
5675 btrfs_commit_transaction(trans, info->tree_root);
5676
5677 spin_lock(&block_group->lock);
5678 WARN_ON(block_group->pinned > 0);
5679 WARN_ON(block_group->reserved > 0);
5680 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5681 spin_unlock(&block_group->lock);
5682 ret = 0;
5683out:
5684 btrfs_free_path(path);
5685 return ret;
5686}
5687
5688int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
5689 struct btrfs_key *key)
5690{
5691 int ret = 0;
5692 struct btrfs_key found_key;
5693 struct extent_buffer *leaf;
5694 int slot;
5695
5696 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
5697 if (ret < 0)
5698 goto out;
5699
5700 while(1) {
5701 slot = path->slots[0];
5702 leaf = path->nodes[0];
5703 if (slot >= btrfs_header_nritems(leaf)) {
5704 ret = btrfs_next_leaf(root, path);
5705 if (ret == 0)
5706 continue;
5707 if (ret < 0)
5708 goto out;
5709 break;
5710 }
5711 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5712
5713 if (found_key.objectid >= key->objectid &&
5714 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
5715 ret = 0;
5716 goto out;
5717 }
5718 path->slots[0]++;
5719 }
5720 ret = -ENOENT;
5721out:
5722 return ret;
5723}
5724
5725int btrfs_free_block_groups(struct btrfs_fs_info *info)
5726{
5727 struct btrfs_block_group_cache *block_group;
5728 struct rb_node *n;
5729
5730 spin_lock(&info->block_group_cache_lock);
5731 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
5732 block_group = rb_entry(n, struct btrfs_block_group_cache,
5733 cache_node);
5734 rb_erase(&block_group->cache_node,
5735 &info->block_group_cache_tree);
5736 spin_unlock(&info->block_group_cache_lock);
5737
5738 btrfs_remove_free_space_cache(block_group);
5739 down_write(&block_group->space_info->groups_sem);
5740 list_del(&block_group->list);
5741 up_write(&block_group->space_info->groups_sem);
5742 kfree(block_group);
5743
5744 spin_lock(&info->block_group_cache_lock);
5745 }
5746 spin_unlock(&info->block_group_cache_lock);
5747 return 0;
5748}
5749
5750int btrfs_read_block_groups(struct btrfs_root *root)
5751{
5752 struct btrfs_path *path;
5753 int ret;
5754 struct btrfs_block_group_cache *cache;
5755 struct btrfs_fs_info *info = root->fs_info;
5756 struct btrfs_space_info *space_info;
5757 struct btrfs_key key;
5758 struct btrfs_key found_key;
5759 struct extent_buffer *leaf;
5760
5761 root = info->extent_root;
5762 key.objectid = 0;
5763 key.offset = 0;
5764 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5765 path = btrfs_alloc_path();
5766 if (!path)
5767 return -ENOMEM;
5768
5769 while(1) {
5770 ret = find_first_block_group(root, path, &key);
5771 if (ret > 0) {
5772 ret = 0;
5773 goto error;
5774 }
5775 if (ret != 0)
5776 goto error;
5777
5778 leaf = path->nodes[0];
5779 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5780 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5781 if (!cache) {
5782 ret = -ENOMEM;
5783 break;
5784 }
5785
5786 spin_lock_init(&cache->lock);
5787 mutex_init(&cache->alloc_mutex);
5788 INIT_LIST_HEAD(&cache->list);
5789 read_extent_buffer(leaf, &cache->item,
5790 btrfs_item_ptr_offset(leaf, path->slots[0]),
5791 sizeof(cache->item));
5792 memcpy(&cache->key, &found_key, sizeof(found_key));
5793
5794 key.objectid = found_key.objectid + found_key.offset;
5795 btrfs_release_path(root, path);
5796 cache->flags = btrfs_block_group_flags(&cache->item);
5797
5798 ret = update_space_info(info, cache->flags, found_key.offset,
5799 btrfs_block_group_used(&cache->item),
5800 &space_info);
5801 BUG_ON(ret);
5802 cache->space_info = space_info;
5803 down_write(&space_info->groups_sem);
5804 list_add_tail(&cache->list, &space_info->block_groups);
5805 up_write(&space_info->groups_sem);
5806
5807 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5808 BUG_ON(ret);
5809
5810 set_avail_alloc_bits(root->fs_info, cache->flags);
5811 if (btrfs_chunk_readonly(root, cache->key.objectid))
5812 set_block_group_readonly(cache);
5813 }
5814 ret = 0;
5815error:
5816 btrfs_free_path(path);
5817 return ret;
5818}
5819
5820int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5821 struct btrfs_root *root, u64 bytes_used,
5822 u64 type, u64 chunk_objectid, u64 chunk_offset,
5823 u64 size)
5824{
5825 int ret;
5826 struct btrfs_root *extent_root;
5827 struct btrfs_block_group_cache *cache;
5828
5829 extent_root = root->fs_info->extent_root;
5830
5831 root->fs_info->last_trans_new_blockgroup = trans->transid;
5832
5833 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5834 if (!cache)
5835 return -ENOMEM;
5836
5837 cache->key.objectid = chunk_offset;
5838 cache->key.offset = size;
5839 spin_lock_init(&cache->lock);
5840 mutex_init(&cache->alloc_mutex);
5841 INIT_LIST_HEAD(&cache->list);
5842 btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5843
5844 btrfs_set_block_group_used(&cache->item, bytes_used);
5845 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
5846 cache->flags = type;
5847 btrfs_set_block_group_flags(&cache->item, type);
5848
5849 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
5850 &cache->space_info);
5851 BUG_ON(ret);
5852 down_write(&cache->space_info->groups_sem);
5853 list_add_tail(&cache->list, &cache->space_info->block_groups);
5854 up_write(&cache->space_info->groups_sem);
5855
5856 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5857 BUG_ON(ret);
5858
5859 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
5860 sizeof(cache->item));
5861 BUG_ON(ret);
5862
5863 finish_current_insert(trans, extent_root, 0);
5864 ret = del_pending_extents(trans, extent_root, 0);
5865 BUG_ON(ret);
5866 set_avail_alloc_bits(extent_root->fs_info, type);
5867
5868 return 0;
5869}
5870
5871int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5872 struct btrfs_root *root, u64 group_start)
5873{
5874 struct btrfs_path *path;
5875 struct btrfs_block_group_cache *block_group;
5876 struct btrfs_key key;
5877 int ret;
5878
5879 root = root->fs_info->extent_root;
5880
5881 block_group = btrfs_lookup_block_group(root->fs_info, group_start);
5882 BUG_ON(!block_group);
5883 BUG_ON(!block_group->ro);
5884
5885 memcpy(&key, &block_group->key, sizeof(key));
5886
5887 path = btrfs_alloc_path();
5888 BUG_ON(!path);
5889
5890 btrfs_remove_free_space_cache(block_group);
5891 rb_erase(&block_group->cache_node,
5892 &root->fs_info->block_group_cache_tree);
5893 down_write(&block_group->space_info->groups_sem);
5894 list_del(&block_group->list);
5895 up_write(&block_group->space_info->groups_sem);
5896
5897 spin_lock(&block_group->space_info->lock);
5898 block_group->space_info->total_bytes -= block_group->key.offset;
5899 block_group->space_info->bytes_readonly -= block_group->key.offset;
5900 spin_unlock(&block_group->space_info->lock);
5901 block_group->space_info->full = 0;
5902
5903 /*
5904 memset(shrink_block_group, 0, sizeof(*shrink_block_group));
5905 kfree(shrink_block_group);
5906 */
5907
5908 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5909 if (ret > 0)
5910 ret = -EIO;
5911 if (ret < 0)
5912 goto out;
5913
5914 ret = btrfs_del_item(trans, root, path);
5915out:
5916 btrfs_free_path(path);
5917 return ret;
5918}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 000000000000..a0f3804efe4f
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,3787 @@
1#include <linux/bitops.h>
2#include <linux/slab.h>
3#include <linux/bio.h>
4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h>
7#include <linux/page-flags.h>
8#include <linux/module.h>
9#include <linux/spinlock.h>
10#include <linux/blkdev.h>
11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h>
14#include <linux/pagevec.h>
15#include "extent_io.h"
16#include "extent_map.h"
17#include "compat.h"
18#include "ctree.h"
19#include "btrfs_inode.h"
20
21/* temporary define until extent_map moves out of btrfs */
22struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
23 unsigned long extra_flags,
24 void (*ctor)(void *, struct kmem_cache *,
25 unsigned long));
26
/* slab caches for the two object types allocated in this file; created in extent_io_init() */
27static struct kmem_cache *extent_state_cache;
28static struct kmem_cache *extent_buffer_cache;
29
/*
 * Leak-tracking lists walked by extent_io_exit(): 'states' holds every live
 * extent_state (linked in alloc_extent_state()); 'buffers' presumably holds
 * every live extent_buffer — the code adding to it is outside this view.
 */
30static LIST_HEAD(buffers);
31static LIST_HEAD(states);
32
/*
 * NOTE(review): LEAK_DEBUG is defined (to 0) and tested with #ifdef, so the
 * leak-tracking code is compiled in regardless of the value; only removing
 * the #define disables it.
 */
33#define LEAK_DEBUG 0
34#ifdef LEAK_DEBUG
/* taken around updates of the leak list in alloc/free_extent_state() */
35static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
36#endif
37
/* not referenced in the visible part of this file */
38#define BUFFER_LRU_MAX 64
39
/*
 * Generic view of a node stored in the range tree: tree_insert() and
 * __etree_search() order nodes through this layout, treating
 * [start, end] as an inclusive range.
 * NOTE(review): struct extent_state nodes are inserted through these
 * helpers, so it presumably begins with the same three members — confirm
 * in extent_io.h.
 */
40struct tree_entry {
41 u64 start;
42 u64 end;
43 struct rb_node rb_node;
44};
45
/*
 * Bundle of per-call state: a bio, the extent_io_tree being operated on
 * and a get_extent callback.
 * NOTE(review): the users of this structure are outside the visible part
 * of the file; presumably it is passed to the writepage helpers — confirm.
 */
46struct extent_page_data {
47 struct bio *bio;
48 struct extent_io_tree *tree;
49 get_extent_t *get_extent;
50
51 /* tells writepage not to lock the state bits for this range
52 * it still does the unlocking
53 */
54 int extent_locked;
55};
56
57int __init extent_io_init(void)
58{
59 extent_state_cache = btrfs_cache_create("extent_state",
60 sizeof(struct extent_state), 0,
61 NULL);
62 if (!extent_state_cache)
63 return -ENOMEM;
64
65 extent_buffer_cache = btrfs_cache_create("extent_buffers",
66 sizeof(struct extent_buffer), 0,
67 NULL);
68 if (!extent_buffer_cache)
69 goto free_state_cache;
70 return 0;
71
72free_state_cache:
73 kmem_cache_destroy(extent_state_cache);
74 return -ENOMEM;
75}
76
77void extent_io_exit(void)
78{
79 struct extent_state *state;
80 struct extent_buffer *eb;
81
82 while (!list_empty(&states)) {
83 state = list_entry(states.next, struct extent_state, leak_list);
84 printk("state leak: start %Lu end %Lu state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs));
85 list_del(&state->leak_list);
86 kmem_cache_free(extent_state_cache, state);
87
88 }
89
90 while (!list_empty(&buffers)) {
91 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
92 printk("buffer leak start %Lu len %lu refs %d\n", eb->start, eb->len, atomic_read(&eb->refs));
93 list_del(&eb->leak_list);
94 kmem_cache_free(extent_buffer_cache, eb);
95 }
96 if (extent_state_cache)
97 kmem_cache_destroy(extent_state_cache);
98 if (extent_buffer_cache)
99 kmem_cache_destroy(extent_buffer_cache);
100}
101
102void extent_io_tree_init(struct extent_io_tree *tree,
103 struct address_space *mapping, gfp_t mask)
104{
105 tree->state.rb_node = NULL;
106 tree->buffer.rb_node = NULL;
107 tree->ops = NULL;
108 tree->dirty_bytes = 0;
109 spin_lock_init(&tree->lock);
110 spin_lock_init(&tree->buffer_lock);
111 tree->mapping = mapping;
112}
113EXPORT_SYMBOL(extent_io_tree_init);
114
115struct extent_state *alloc_extent_state(gfp_t mask)
116{
117 struct extent_state *state;
118#ifdef LEAK_DEBUG
119 unsigned long flags;
120#endif
121
122 state = kmem_cache_alloc(extent_state_cache, mask);
123 if (!state)
124 return state;
125 state->state = 0;
126 state->private = 0;
127 state->tree = NULL;
128#ifdef LEAK_DEBUG
129 spin_lock_irqsave(&leak_lock, flags);
130 list_add(&state->leak_list, &states);
131 spin_unlock_irqrestore(&leak_lock, flags);
132#endif
133 atomic_set(&state->refs, 1);
134 init_waitqueue_head(&state->wq);
135 return state;
136}
137EXPORT_SYMBOL(alloc_extent_state);
138
139void free_extent_state(struct extent_state *state)
140{
141 if (!state)
142 return;
143 if (atomic_dec_and_test(&state->refs)) {
144#ifdef LEAK_DEBUG
145 unsigned long flags;
146#endif
147 WARN_ON(state->tree);
148#ifdef LEAK_DEBUG
149 spin_lock_irqsave(&leak_lock, flags);
150 list_del(&state->leak_list);
151 spin_unlock_irqrestore(&leak_lock, flags);
152#endif
153 kmem_cache_free(extent_state_cache, state);
154 }
155}
156EXPORT_SYMBOL(free_extent_state);
157
158static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
159 struct rb_node *node)
160{
161 struct rb_node ** p = &root->rb_node;
162 struct rb_node * parent = NULL;
163 struct tree_entry *entry;
164
165 while(*p) {
166 parent = *p;
167 entry = rb_entry(parent, struct tree_entry, rb_node);
168
169 if (offset < entry->start)
170 p = &(*p)->rb_left;
171 else if (offset > entry->end)
172 p = &(*p)->rb_right;
173 else
174 return parent;
175 }
176
177 entry = rb_entry(node, struct tree_entry, rb_node);
178 rb_link_node(node, parent, p);
179 rb_insert_color(node, root);
180 return NULL;
181}
182
183static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
184 struct rb_node **prev_ret,
185 struct rb_node **next_ret)
186{
187 struct rb_root *root = &tree->state;
188 struct rb_node * n = root->rb_node;
189 struct rb_node *prev = NULL;
190 struct rb_node *orig_prev = NULL;
191 struct tree_entry *entry;
192 struct tree_entry *prev_entry = NULL;
193
194 while(n) {
195 entry = rb_entry(n, struct tree_entry, rb_node);
196 prev = n;
197 prev_entry = entry;
198
199 if (offset < entry->start)
200 n = n->rb_left;
201 else if (offset > entry->end)
202 n = n->rb_right;
203 else {
204 return n;
205 }
206 }
207
208 if (prev_ret) {
209 orig_prev = prev;
210 while(prev && offset > prev_entry->end) {
211 prev = rb_next(prev);
212 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
213 }
214 *prev_ret = prev;
215 prev = orig_prev;
216 }
217
218 if (next_ret) {
219 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
220 while(prev && offset < prev_entry->start) {
221 prev = rb_prev(prev);
222 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
223 }
224 *next_ret = prev;
225 }
226 return NULL;
227}
228
229static inline struct rb_node *tree_search(struct extent_io_tree *tree,
230 u64 offset)
231{
232 struct rb_node *prev = NULL;
233 struct rb_node *ret;
234
235 ret = __etree_search(tree, offset, &prev, NULL);
236 if (!ret) {
237 return prev;
238 }
239 return ret;
240}
241
242static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
243 u64 offset, struct rb_node *node)
244{
245 struct rb_root *root = &tree->buffer;
246 struct rb_node ** p = &root->rb_node;
247 struct rb_node * parent = NULL;
248 struct extent_buffer *eb;
249
250 while(*p) {
251 parent = *p;
252 eb = rb_entry(parent, struct extent_buffer, rb_node);
253
254 if (offset < eb->start)
255 p = &(*p)->rb_left;
256 else if (offset > eb->start)
257 p = &(*p)->rb_right;
258 else
259 return eb;
260 }
261
262 rb_link_node(node, parent, p);
263 rb_insert_color(node, root);
264 return NULL;
265}
266
267static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
268 u64 offset)
269{
270 struct rb_root *root = &tree->buffer;
271 struct rb_node * n = root->rb_node;
272 struct extent_buffer *eb;
273
274 while(n) {
275 eb = rb_entry(n, struct extent_buffer, rb_node);
276 if (offset < eb->start)
277 n = n->rb_left;
278 else if (offset > eb->start)
279 n = n->rb_right;
280 else
281 return eb;
282 }
283 return NULL;
284}
285
286/*
287 * utility function to look for merge candidates inside a given range.
288 * Any extents with matching state are merged together into a single
289 * extent in the tree. Extents with EXTENT_IO in their state field
290 * are not merged because the end_io handlers need to be able to do
291 * operations on them without sleeping (or doing allocations/splits).
292 *
293 * This should be called with the tree lock held.
294 */
295static int merge_state(struct extent_io_tree *tree,
296 struct extent_state *state)
297{
298 struct extent_state *other;
299 struct rb_node *other_node;
300
301 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
302 return 0;
303
304 other_node = rb_prev(&state->rb_node);
305 if (other_node) {
306 other = rb_entry(other_node, struct extent_state, rb_node);
307 if (other->end == state->start - 1 &&
308 other->state == state->state) {
309 state->start = other->start;
310 other->tree = NULL;
311 rb_erase(&other->rb_node, &tree->state);
312 free_extent_state(other);
313 }
314 }
315 other_node = rb_next(&state->rb_node);
316 if (other_node) {
317 other = rb_entry(other_node, struct extent_state, rb_node);
318 if (other->start == state->end + 1 &&
319 other->state == state->state) {
320 other->start = state->start;
321 state->tree = NULL;
322 rb_erase(&state->rb_node, &tree->state);
323 free_extent_state(state);
324 }
325 }
326 return 0;
327}
328
329static void set_state_cb(struct extent_io_tree *tree,
330 struct extent_state *state,
331 unsigned long bits)
332{
333 if (tree->ops && tree->ops->set_bit_hook) {
334 tree->ops->set_bit_hook(tree->mapping->host, state->start,
335 state->end, state->state, bits);
336 }
337}
338
339static void clear_state_cb(struct extent_io_tree *tree,
340 struct extent_state *state,
341 unsigned long bits)
342{
343 if (tree->ops && tree->ops->set_bit_hook) {
344 tree->ops->clear_bit_hook(tree->mapping->host, state->start,
345 state->end, state->state, bits);
346 }
347}
348
349/*
350 * insert an extent_state struct into the tree. 'bits' are set on the
351 * struct before it is inserted.
352 *
353 * This may return -EEXIST if the extent is already there, in which case the
354 * state struct is freed.
355 *
356 * The tree lock is not taken internally. This is a utility function and
357 * probably isn't what you want to call (see set/clear_extent_bit).
358 */
359static int insert_state(struct extent_io_tree *tree,
360 struct extent_state *state, u64 start, u64 end,
361 int bits)
362{
363 struct rb_node *node;
364
365 if (end < start) {
366 printk("end < start %Lu %Lu\n", end, start);
367 WARN_ON(1);
368 }
369 if (bits & EXTENT_DIRTY)
370 tree->dirty_bytes += end - start + 1;
371 set_state_cb(tree, state, bits);
372 state->state |= bits;
373 state->start = start;
374 state->end = end;
375 node = tree_insert(&tree->state, end, &state->rb_node);
376 if (node) {
377 struct extent_state *found;
378 found = rb_entry(node, struct extent_state, rb_node);
379 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
380 free_extent_state(state);
381 return -EEXIST;
382 }
383 state->tree = tree;
384 merge_state(tree, state);
385 return 0;
386}
387
388/*
389 * split a given extent state struct in two, inserting the preallocated
390 * struct 'prealloc' as the newly created second half. 'split' indicates an
391 * offset inside 'orig' where it should be split.
392 *
393 * Before calling,
394 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
395 * are two extent state structs in the tree:
396 * prealloc: [orig->start, split - 1]
397 * orig: [ split, orig->end ]
398 *
399 * The tree locks are not taken by this function. They need to be held
400 * by the caller.
401 */
402static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
403 struct extent_state *prealloc, u64 split)
404{
405 struct rb_node *node;
406 prealloc->start = orig->start;
407 prealloc->end = split - 1;
408 prealloc->state = orig->state;
409 orig->start = split;
410
411 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
412 if (node) {
413 struct extent_state *found;
414 found = rb_entry(node, struct extent_state, rb_node);
415 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
416 free_extent_state(prealloc);
417 return -EEXIST;
418 }
419 prealloc->tree = tree;
420 return 0;
421}
422
423/*
424 * utility function to clear some bits in an extent state struct.
425 * it will optionally wake up any one waiting on this state (wake == 1), or
426 * forcibly remove the state from the tree (delete == 1).
427 *
428 * If no bits are set on the state struct after clearing things, the
429 * struct is freed and removed from the tree
430 */
431static int clear_state_bit(struct extent_io_tree *tree,
432 struct extent_state *state, int bits, int wake,
433 int delete)
434{
435 int ret = state->state & bits;
436
437 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
438 u64 range = state->end - state->start + 1;
439 WARN_ON(range > tree->dirty_bytes);
440 tree->dirty_bytes -= range;
441 }
442 clear_state_cb(tree, state, bits);
443 state->state &= ~bits;
444 if (wake)
445 wake_up(&state->wq);
446 if (delete || state->state == 0) {
447 if (state->tree) {
448 clear_state_cb(tree, state, state->state);
449 rb_erase(&state->rb_node, &tree->state);
450 state->tree = NULL;
451 free_extent_state(state);
452 } else {
453 WARN_ON(1);
454 }
455 } else {
456 merge_state(tree, state);
457 }
458 return ret;
459}
460
461/*
462 * clear some bits on a range in the tree. This may require splitting
463 * or inserting elements in the tree, so the gfp mask is used to
464 * indicate which allocations or sleeping are allowed.
465 *
466 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
467 * the given range from the tree regardless of state (ie for truncate).
468 *
469 * the range [start, end] is inclusive.
470 *
471 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
472 * bits were already set, or zero if none of the bits were already set.
473 */
474int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
475 int bits, int wake, int delete, gfp_t mask)
476{
477 struct extent_state *state;
478 struct extent_state *prealloc = NULL;
479 struct rb_node *node;
480 unsigned long flags;
481 int err;
482 int set = 0;
483
484again:
485 if (!prealloc && (mask & __GFP_WAIT)) {
486 prealloc = alloc_extent_state(mask);
487 if (!prealloc)
488 return -ENOMEM;
489 }
490
491 spin_lock_irqsave(&tree->lock, flags);
492 /*
493 * this search will find the extents that end after
494 * our range starts
495 */
496 node = tree_search(tree, start);
497 if (!node)
498 goto out;
499 state = rb_entry(node, struct extent_state, rb_node);
500 if (state->start > end)
501 goto out;
502 WARN_ON(state->end < start);
503
504 /*
505 * | ---- desired range ---- |
506 * | state | or
507 * | ------------- state -------------- |
508 *
509 * We need to split the extent we found, and may flip
510 * bits on second half.
511 *
512 * If the extent we found extends past our range, we
513 * just split and search again. It'll get split again
514 * the next time though.
515 *
516 * If the extent we found is inside our range, we clear
517 * the desired bit on it.
518 */
519
520 if (state->start < start) {
521 if (!prealloc)
522 prealloc = alloc_extent_state(GFP_ATOMIC);
523 err = split_state(tree, state, prealloc, start);
524 BUG_ON(err == -EEXIST);
525 prealloc = NULL;
526 if (err)
527 goto out;
528 if (state->end <= end) {
529 start = state->end + 1;
530 set |= clear_state_bit(tree, state, bits,
531 wake, delete);
532 } else {
533 start = state->start;
534 }
535 goto search_again;
536 }
537 /*
538 * | ---- desired range ---- |
539 * | state |
540 * We need to split the extent, and clear the bit
541 * on the first half
542 */
543 if (state->start <= end && state->end > end) {
544 if (!prealloc)
545 prealloc = alloc_extent_state(GFP_ATOMIC);
546 err = split_state(tree, state, prealloc, end + 1);
547 BUG_ON(err == -EEXIST);
548
549 if (wake)
550 wake_up(&state->wq);
551 set |= clear_state_bit(tree, prealloc, bits,
552 wake, delete);
553 prealloc = NULL;
554 goto out;
555 }
556
557 start = state->end + 1;
558 set |= clear_state_bit(tree, state, bits, wake, delete);
559 goto search_again;
560
561out:
562 spin_unlock_irqrestore(&tree->lock, flags);
563 if (prealloc)
564 free_extent_state(prealloc);
565
566 return set;
567
568search_again:
569 if (start > end)
570 goto out;
571 spin_unlock_irqrestore(&tree->lock, flags);
572 if (mask & __GFP_WAIT)
573 cond_resched();
574 goto again;
575}
576EXPORT_SYMBOL(clear_extent_bit);
577
578static int wait_on_state(struct extent_io_tree *tree,
579 struct extent_state *state)
580{
581 DEFINE_WAIT(wait);
582 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
583 spin_unlock_irq(&tree->lock);
584 schedule();
585 spin_lock_irq(&tree->lock);
586 finish_wait(&state->wq, &wait);
587 return 0;
588}
589
590/*
591 * waits for one or more bits to clear on a range in the state tree.
592 * The range [start, end] is inclusive.
593 * The tree lock is taken by this function
594 */
595int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
596{
597 struct extent_state *state;
598 struct rb_node *node;
599
600 spin_lock_irq(&tree->lock);
601again:
602 while (1) {
603 /*
604 * this search will find all the extents that end after
605 * our range starts
606 */
607 node = tree_search(tree, start);
608 if (!node)
609 break;
610
611 state = rb_entry(node, struct extent_state, rb_node);
612
613 if (state->start > end)
614 goto out;
615
616 if (state->state & bits) {
617 start = state->start;
618 atomic_inc(&state->refs);
619 wait_on_state(tree, state);
620 free_extent_state(state);
621 goto again;
622 }
623 start = state->end + 1;
624
625 if (start > end)
626 break;
627
628 if (need_resched()) {
629 spin_unlock_irq(&tree->lock);
630 cond_resched();
631 spin_lock_irq(&tree->lock);
632 }
633 }
634out:
635 spin_unlock_irq(&tree->lock);
636 return 0;
637}
638EXPORT_SYMBOL(wait_extent_bit);
639
640static void set_state_bits(struct extent_io_tree *tree,
641 struct extent_state *state,
642 int bits)
643{
644 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
645 u64 range = state->end - state->start + 1;
646 tree->dirty_bytes += range;
647 }
648 set_state_cb(tree, state, bits);
649 state->state |= bits;
650}
651
652/*
653 * set some bits on a range in the tree. This may require allocations
654 * or sleeping, so the gfp mask is used to indicate what is allowed.
655 *
656 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
657 * range already has the desired bits set. The start of the existing
658 * range is returned in failed_start in this case.
659 *
660 * [start, end] is inclusive
661 * This takes the tree lock.
662 */
663int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
664 int exclusive, u64 *failed_start, gfp_t mask)
665{
666 struct extent_state *state;
667 struct extent_state *prealloc = NULL;
668 struct rb_node *node;
669 unsigned long flags;
670 int err = 0;
671 int set;
672 u64 last_start;
673 u64 last_end;
674again:
675 if (!prealloc && (mask & __GFP_WAIT)) {
676 prealloc = alloc_extent_state(mask);
677 if (!prealloc)
678 return -ENOMEM;
679 }
680
681 spin_lock_irqsave(&tree->lock, flags);
682 /*
683 * this search will find all the extents that end after
684 * our range starts.
685 */
686 node = tree_search(tree, start);
687 if (!node) {
688 err = insert_state(tree, prealloc, start, end, bits);
689 prealloc = NULL;
690 BUG_ON(err == -EEXIST);
691 goto out;
692 }
693
694 state = rb_entry(node, struct extent_state, rb_node);
695 last_start = state->start;
696 last_end = state->end;
697
698 /*
699 * | ---- desired range ---- |
700 * | state |
701 *
702 * Just lock what we found and keep going
703 */
704 if (state->start == start && state->end <= end) {
705 set = state->state & bits;
706 if (set && exclusive) {
707 *failed_start = state->start;
708 err = -EEXIST;
709 goto out;
710 }
711 set_state_bits(tree, state, bits);
712 start = state->end + 1;
713 merge_state(tree, state);
714 goto search_again;
715 }
716
717 /*
718 * | ---- desired range ---- |
719 * | state |
720 * or
721 * | ------------- state -------------- |
722 *
723 * We need to split the extent we found, and may flip bits on
724 * second half.
725 *
726 * If the extent we found extends past our
727 * range, we just split and search again. It'll get split
728 * again the next time though.
729 *
730 * If the extent we found is inside our range, we set the
731 * desired bit on it.
732 */
733 if (state->start < start) {
734 set = state->state & bits;
735 if (exclusive && set) {
736 *failed_start = start;
737 err = -EEXIST;
738 goto out;
739 }
740 err = split_state(tree, state, prealloc, start);
741 BUG_ON(err == -EEXIST);
742 prealloc = NULL;
743 if (err)
744 goto out;
745 if (state->end <= end) {
746 set_state_bits(tree, state, bits);
747 start = state->end + 1;
748 merge_state(tree, state);
749 } else {
750 start = state->start;
751 }
752 goto search_again;
753 }
754 /*
755 * | ---- desired range ---- |
756 * | state | or | state |
757 *
758 * There's a hole, we need to insert something in it and
759 * ignore the extent we found.
760 */
761 if (state->start > start) {
762 u64 this_end;
763 if (end < last_start)
764 this_end = end;
765 else
766 this_end = last_start -1;
767 err = insert_state(tree, prealloc, start, this_end,
768 bits);
769 prealloc = NULL;
770 BUG_ON(err == -EEXIST);
771 if (err)
772 goto out;
773 start = this_end + 1;
774 goto search_again;
775 }
776 /*
777 * | ---- desired range ---- |
778 * | state |
779 * We need to split the extent, and set the bit
780 * on the first half
781 */
782 if (state->start <= end && state->end > end) {
783 set = state->state & bits;
784 if (exclusive && set) {
785 *failed_start = start;
786 err = -EEXIST;
787 goto out;
788 }
789 err = split_state(tree, state, prealloc, end + 1);
790 BUG_ON(err == -EEXIST);
791
792 set_state_bits(tree, prealloc, bits);
793 merge_state(tree, prealloc);
794 prealloc = NULL;
795 goto out;
796 }
797
798 goto search_again;
799
800out:
801 spin_unlock_irqrestore(&tree->lock, flags);
802 if (prealloc)
803 free_extent_state(prealloc);
804
805 return err;
806
807search_again:
808 if (start > end)
809 goto out;
810 spin_unlock_irqrestore(&tree->lock, flags);
811 if (mask & __GFP_WAIT)
812 cond_resched();
813 goto again;
814}
815EXPORT_SYMBOL(set_extent_bit);
816
817/* wrappers around set/clear extent bit */
818int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
819 gfp_t mask)
820{
821 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
822 mask);
823}
824EXPORT_SYMBOL(set_extent_dirty);
825
826int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
827 gfp_t mask)
828{
829 return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
830}
831EXPORT_SYMBOL(set_extent_ordered);
832
833int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
834 int bits, gfp_t mask)
835{
836 return set_extent_bit(tree, start, end, bits, 0, NULL,
837 mask);
838}
839EXPORT_SYMBOL(set_extent_bits);
840
841int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
842 int bits, gfp_t mask)
843{
844 return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
845}
846EXPORT_SYMBOL(clear_extent_bits);
847
848int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
849 gfp_t mask)
850{
851 return set_extent_bit(tree, start, end,
852 EXTENT_DELALLOC | EXTENT_DIRTY,
853 0, NULL, mask);
854}
855EXPORT_SYMBOL(set_extent_delalloc);
856
857int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
858 gfp_t mask)
859{
860 return clear_extent_bit(tree, start, end,
861 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
862}
863EXPORT_SYMBOL(clear_extent_dirty);
864
865int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
866 gfp_t mask)
867{
868 return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
869}
870EXPORT_SYMBOL(clear_extent_ordered);
871
872int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
873 gfp_t mask)
874{
875 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
876 mask);
877}
878EXPORT_SYMBOL(set_extent_new);
879
880int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
881 gfp_t mask)
882{
883 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
884}
885EXPORT_SYMBOL(clear_extent_new);
886
887int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
888 gfp_t mask)
889{
890 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
891 mask);
892}
893EXPORT_SYMBOL(set_extent_uptodate);
894
895int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
896 gfp_t mask)
897{
898 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
899}
900EXPORT_SYMBOL(clear_extent_uptodate);
901
902int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
903 gfp_t mask)
904{
905 return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
906 0, NULL, mask);
907}
908EXPORT_SYMBOL(set_extent_writeback);
909
910int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
911 gfp_t mask)
912{
913 return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
914}
915EXPORT_SYMBOL(clear_extent_writeback);
916
917int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
918{
919 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
920}
921EXPORT_SYMBOL(wait_on_extent_writeback);
922
923/*
924 * either insert or lock state struct between start and end use mask to tell
925 * us if waiting is desired.
926 */
927int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
928{
929 int err;
930 u64 failed_start;
931 while (1) {
932 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
933 &failed_start, mask);
934 if (err == -EEXIST && (mask & __GFP_WAIT)) {
935 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
936 start = failed_start;
937 } else {
938 break;
939 }
940 WARN_ON(start > end);
941 }
942 return err;
943}
944EXPORT_SYMBOL(lock_extent);
945
946int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
947 gfp_t mask)
948{
949 int err;
950 u64 failed_start;
951
952 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
953 &failed_start, mask);
954 if (err == -EEXIST) {
955 if (failed_start > start)
956 clear_extent_bit(tree, start, failed_start - 1,
957 EXTENT_LOCKED, 1, 0, mask);
958 return 0;
959 }
960 return 1;
961}
962EXPORT_SYMBOL(try_lock_extent);
963
964int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
965 gfp_t mask)
966{
967 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
968}
969EXPORT_SYMBOL(unlock_extent);
970
971/*
972 * helper function to set pages and extents in the tree dirty
973 */
974int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
975{
976 unsigned long index = start >> PAGE_CACHE_SHIFT;
977 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
978 struct page *page;
979
980 while (index <= end_index) {
981 page = find_get_page(tree->mapping, index);
982 BUG_ON(!page);
983 __set_page_dirty_nobuffers(page);
984 page_cache_release(page);
985 index++;
986 }
987 set_extent_dirty(tree, start, end, GFP_NOFS);
988 return 0;
989}
990EXPORT_SYMBOL(set_range_dirty);
991
992/*
993 * helper function to set both pages and extents in the tree writeback
994 */
995int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
996{
997 unsigned long index = start >> PAGE_CACHE_SHIFT;
998 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
999 struct page *page;
1000
1001 while (index <= end_index) {
1002 page = find_get_page(tree->mapping, index);
1003 BUG_ON(!page);
1004 set_page_writeback(page);
1005 page_cache_release(page);
1006 index++;
1007 }
1008 set_extent_writeback(tree, start, end, GFP_NOFS);
1009 return 0;
1010}
1011EXPORT_SYMBOL(set_range_writeback);
1012
1013/*
1014 * find the first offset in the io tree with 'bits' set. zero is
1015 * returned if we find something, and *start_ret and *end_ret are
1016 * set to reflect the state struct that was found.
1017 *
1018 * If nothing was found, 1 is returned, < 0 on error
1019 */
1020int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1021 u64 *start_ret, u64 *end_ret, int bits)
1022{
1023 struct rb_node *node;
1024 struct extent_state *state;
1025 int ret = 1;
1026
1027 spin_lock_irq(&tree->lock);
1028 /*
1029 * this search will find all the extents that end after
1030 * our range starts.
1031 */
1032 node = tree_search(tree, start);
1033 if (!node) {
1034 goto out;
1035 }
1036
1037 while(1) {
1038 state = rb_entry(node, struct extent_state, rb_node);
1039 if (state->end >= start && (state->state & bits)) {
1040 *start_ret = state->start;
1041 *end_ret = state->end;
1042 ret = 0;
1043 break;
1044 }
1045 node = rb_next(node);
1046 if (!node)
1047 break;
1048 }
1049out:
1050 spin_unlock_irq(&tree->lock);
1051 return ret;
1052}
1053EXPORT_SYMBOL(find_first_extent_bit);
1054
1055/* find the first state struct with 'bits' set after 'start', and
1056 * return it. tree->lock must be held. NULL will returned if
1057 * nothing was found after 'start'
1058 */
1059struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1060 u64 start, int bits)
1061{
1062 struct rb_node *node;
1063 struct extent_state *state;
1064
1065 /*
1066 * this search will find all the extents that end after
1067 * our range starts.
1068 */
1069 node = tree_search(tree, start);
1070 if (!node) {
1071 goto out;
1072 }
1073
1074 while(1) {
1075 state = rb_entry(node, struct extent_state, rb_node);
1076 if (state->end >= start && (state->state & bits)) {
1077 return state;
1078 }
1079 node = rb_next(node);
1080 if (!node)
1081 break;
1082 }
1083out:
1084 return NULL;
1085}
1086EXPORT_SYMBOL(find_first_extent_bit_state);
1087
1088/*
1089 * find a contiguous range of bytes in the file marked as delalloc, not
1090 * more than 'max_bytes'. start and end are used to return the range,
1091 *
1092 * 1 is returned if we find something, 0 if nothing was in the tree
1093 */
1094static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1095 u64 *start, u64 *end, u64 max_bytes)
1096{
1097 struct rb_node *node;
1098 struct extent_state *state;
1099 u64 cur_start = *start;
1100 u64 found = 0;
1101 u64 total_bytes = 0;
1102
1103 spin_lock_irq(&tree->lock);
1104
1105 /*
1106 * this search will find all the extents that end after
1107 * our range starts.
1108 */
1109 node = tree_search(tree, cur_start);
1110 if (!node) {
1111 if (!found)
1112 *end = (u64)-1;
1113 goto out;
1114 }
1115
1116 while(1) {
1117 state = rb_entry(node, struct extent_state, rb_node);
1118 if (found && (state->start != cur_start ||
1119 (state->state & EXTENT_BOUNDARY))) {
1120 goto out;
1121 }
1122 if (!(state->state & EXTENT_DELALLOC)) {
1123 if (!found)
1124 *end = state->end;
1125 goto out;
1126 }
1127 if (!found)
1128 *start = state->start;
1129 found++;
1130 *end = state->end;
1131 cur_start = state->end + 1;
1132 node = rb_next(node);
1133 if (!node)
1134 break;
1135 total_bytes += state->end - state->start + 1;
1136 if (total_bytes >= max_bytes)
1137 break;
1138 }
1139out:
1140 spin_unlock_irq(&tree->lock);
1141 return found;
1142}
1143
1144static noinline int __unlock_for_delalloc(struct inode *inode,
1145 struct page *locked_page,
1146 u64 start, u64 end)
1147{
1148 int ret;
1149 struct page *pages[16];
1150 unsigned long index = start >> PAGE_CACHE_SHIFT;
1151 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1152 unsigned long nr_pages = end_index - index + 1;
1153 int i;
1154
1155 if (index == locked_page->index && end_index == index)
1156 return 0;
1157
1158 while(nr_pages > 0) {
1159 ret = find_get_pages_contig(inode->i_mapping, index,
1160 min_t(unsigned long, nr_pages,
1161 ARRAY_SIZE(pages)), pages);
1162 for (i = 0; i < ret; i++) {
1163 if (pages[i] != locked_page)
1164 unlock_page(pages[i]);
1165 page_cache_release(pages[i]);
1166 }
1167 nr_pages -= ret;
1168 index += ret;
1169 cond_resched();
1170 }
1171 return 0;
1172}
1173
1174static noinline int lock_delalloc_pages(struct inode *inode,
1175 struct page *locked_page,
1176 u64 delalloc_start,
1177 u64 delalloc_end)
1178{
1179 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1180 unsigned long start_index = index;
1181 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1182 unsigned long pages_locked = 0;
1183 struct page *pages[16];
1184 unsigned long nrpages;
1185 int ret;
1186 int i;
1187
1188 /* the caller is responsible for locking the start index */
1189 if (index == locked_page->index && index == end_index)
1190 return 0;
1191
1192 /* skip the page at the start index */
1193 nrpages = end_index - index + 1;
1194 while(nrpages > 0) {
1195 ret = find_get_pages_contig(inode->i_mapping, index,
1196 min_t(unsigned long,
1197 nrpages, ARRAY_SIZE(pages)), pages);
1198 if (ret == 0) {
1199 ret = -EAGAIN;
1200 goto done;
1201 }
1202 /* now we have an array of pages, lock them all */
1203 for (i = 0; i < ret; i++) {
1204 /*
1205 * the caller is taking responsibility for
1206 * locked_page
1207 */
1208 if (pages[i] != locked_page) {
1209 lock_page(pages[i]);
1210 if (!PageDirty(pages[i]) ||
1211 pages[i]->mapping != inode->i_mapping) {
1212 ret = -EAGAIN;
1213 unlock_page(pages[i]);
1214 page_cache_release(pages[i]);
1215 goto done;
1216 }
1217 }
1218 page_cache_release(pages[i]);
1219 pages_locked++;
1220 }
1221 nrpages -= ret;
1222 index += ret;
1223 cond_resched();
1224 }
1225 ret = 0;
1226done:
1227 if (ret && pages_locked) {
1228 __unlock_for_delalloc(inode, locked_page,
1229 delalloc_start,
1230 ((u64)(start_index + pages_locked - 1)) <<
1231 PAGE_CACHE_SHIFT);
1232 }
1233 return ret;
1234}
1235
1236/*
1237 * find a contiguous range of bytes in the file marked as delalloc, not
1238 * more than 'max_bytes'. start and end are used to return the range,
1239 *
1240 * 1 is returned if we find something, 0 if nothing was in the tree
1241 */
1242static noinline u64 find_lock_delalloc_range(struct inode *inode,
1243 struct extent_io_tree *tree,
1244 struct page *locked_page,
1245 u64 *start, u64 *end,
1246 u64 max_bytes)
1247{
1248 u64 delalloc_start;
1249 u64 delalloc_end;
1250 u64 found;
1251 int ret;
1252 int loops = 0;
1253
1254again:
1255 /* step one, find a bunch of delalloc bytes starting at start */
1256 delalloc_start = *start;
1257 delalloc_end = 0;
1258 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1259 max_bytes);
1260 if (!found || delalloc_end <= *start) {
1261 *start = delalloc_start;
1262 *end = delalloc_end;
1263 return found;
1264 }
1265
1266 /*
1267 * start comes from the offset of locked_page. We have to lock
1268 * pages in order, so we can't process delalloc bytes before
1269 * locked_page
1270 */
1271 if (delalloc_start < *start) {
1272 delalloc_start = *start;
1273 }
1274
1275 /*
1276 * make sure to limit the number of pages we try to lock down
1277 * if we're looping.
1278 */
1279 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
1280 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1281 }
1282 /* step two, lock all the pages after the page that has start */
1283 ret = lock_delalloc_pages(inode, locked_page,
1284 delalloc_start, delalloc_end);
1285 if (ret == -EAGAIN) {
1286 /* some of the pages are gone, lets avoid looping by
1287 * shortening the size of the delalloc range we're searching
1288 */
1289 if (!loops) {
1290 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1291 max_bytes = PAGE_CACHE_SIZE - offset;
1292 loops = 1;
1293 goto again;
1294 } else {
1295 found = 0;
1296 goto out_failed;
1297 }
1298 }
1299 BUG_ON(ret);
1300
1301 /* step three, lock the state bits for the whole range */
1302 lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1303
1304 /* then test to make sure it is all still delalloc */
1305 ret = test_range_bit(tree, delalloc_start, delalloc_end,
1306 EXTENT_DELALLOC, 1);
1307 if (!ret) {
1308 unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1309 __unlock_for_delalloc(inode, locked_page,
1310 delalloc_start, delalloc_end);
1311 cond_resched();
1312 goto again;
1313 }
1314 *start = delalloc_start;
1315 *end = delalloc_end;
1316out_failed:
1317 return found;
1318}
1319
1320int extent_clear_unlock_delalloc(struct inode *inode,
1321 struct extent_io_tree *tree,
1322 u64 start, u64 end, struct page *locked_page,
1323 int unlock_pages,
1324 int clear_unlock,
1325 int clear_delalloc, int clear_dirty,
1326 int set_writeback,
1327 int end_writeback)
1328{
1329 int ret;
1330 struct page *pages[16];
1331 unsigned long index = start >> PAGE_CACHE_SHIFT;
1332 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1333 unsigned long nr_pages = end_index - index + 1;
1334 int i;
1335 int clear_bits = 0;
1336
1337 if (clear_unlock)
1338 clear_bits |= EXTENT_LOCKED;
1339 if (clear_dirty)
1340 clear_bits |= EXTENT_DIRTY;
1341
1342 if (clear_delalloc)
1343 clear_bits |= EXTENT_DELALLOC;
1344
1345 clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
1346 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
1347 return 0;
1348
1349 while(nr_pages > 0) {
1350 ret = find_get_pages_contig(inode->i_mapping, index,
1351 min_t(unsigned long,
1352 nr_pages, ARRAY_SIZE(pages)), pages);
1353 for (i = 0; i < ret; i++) {
1354 if (pages[i] == locked_page) {
1355 page_cache_release(pages[i]);
1356 continue;
1357 }
1358 if (clear_dirty)
1359 clear_page_dirty_for_io(pages[i]);
1360 if (set_writeback)
1361 set_page_writeback(pages[i]);
1362 if (end_writeback)
1363 end_page_writeback(pages[i]);
1364 if (unlock_pages)
1365 unlock_page(pages[i]);
1366 page_cache_release(pages[i]);
1367 }
1368 nr_pages -= ret;
1369 index += ret;
1370 cond_resched();
1371 }
1372 return 0;
1373}
1374EXPORT_SYMBOL(extent_clear_unlock_delalloc);
1375
1376/*
1377 * count the number of bytes in the tree that have a given bit(s)
1378 * set. This can be fairly slow, except for EXTENT_DIRTY which is
1379 * cached. The total number found is returned.
1380 */
1381u64 count_range_bits(struct extent_io_tree *tree,
1382 u64 *start, u64 search_end, u64 max_bytes,
1383 unsigned long bits)
1384{
1385 struct rb_node *node;
1386 struct extent_state *state;
1387 u64 cur_start = *start;
1388 u64 total_bytes = 0;
1389 int found = 0;
1390
1391 if (search_end <= cur_start) {
1392 printk("search_end %Lu start %Lu\n", search_end, cur_start);
1393 WARN_ON(1);
1394 return 0;
1395 }
1396
1397 spin_lock_irq(&tree->lock);
1398 if (cur_start == 0 && bits == EXTENT_DIRTY) {
1399 total_bytes = tree->dirty_bytes;
1400 goto out;
1401 }
1402 /*
1403 * this search will find all the extents that end after
1404 * our range starts.
1405 */
1406 node = tree_search(tree, cur_start);
1407 if (!node) {
1408 goto out;
1409 }
1410
1411 while(1) {
1412 state = rb_entry(node, struct extent_state, rb_node);
1413 if (state->start > search_end)
1414 break;
1415 if (state->end >= cur_start && (state->state & bits)) {
1416 total_bytes += min(search_end, state->end) + 1 -
1417 max(cur_start, state->start);
1418 if (total_bytes >= max_bytes)
1419 break;
1420 if (!found) {
1421 *start = state->start;
1422 found = 1;
1423 }
1424 }
1425 node = rb_next(node);
1426 if (!node)
1427 break;
1428 }
1429out:
1430 spin_unlock_irq(&tree->lock);
1431 return total_bytes;
1432}
1433/*
1434 * helper function to lock both pages and extents in the tree.
1435 * pages must be locked first.
1436 */
1437int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1438{
1439 unsigned long index = start >> PAGE_CACHE_SHIFT;
1440 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1441 struct page *page;
1442 int err;
1443
1444 while (index <= end_index) {
1445 page = grab_cache_page(tree->mapping, index);
1446 if (!page) {
1447 err = -ENOMEM;
1448 goto failed;
1449 }
1450 if (IS_ERR(page)) {
1451 err = PTR_ERR(page);
1452 goto failed;
1453 }
1454 index++;
1455 }
1456 lock_extent(tree, start, end, GFP_NOFS);
1457 return 0;
1458
1459failed:
1460 /*
1461 * we failed above in getting the page at 'index', so we undo here
1462 * up to but not including the page at 'index'
1463 */
1464 end_index = index;
1465 index = start >> PAGE_CACHE_SHIFT;
1466 while (index < end_index) {
1467 page = find_get_page(tree->mapping, index);
1468 unlock_page(page);
1469 page_cache_release(page);
1470 index++;
1471 }
1472 return err;
1473}
1474EXPORT_SYMBOL(lock_range);
1475
1476/*
1477 * helper function to unlock both pages and extents in the tree.
1478 */
1479int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1480{
1481 unsigned long index = start >> PAGE_CACHE_SHIFT;
1482 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1483 struct page *page;
1484
1485 while (index <= end_index) {
1486 page = find_get_page(tree->mapping, index);
1487 unlock_page(page);
1488 page_cache_release(page);
1489 index++;
1490 }
1491 unlock_extent(tree, start, end, GFP_NOFS);
1492 return 0;
1493}
1494EXPORT_SYMBOL(unlock_range);
1495
1496/*
1497 * set the private field for a given byte offset in the tree. If there isn't
1498 * an extent_state there already, this does nothing.
1499 */
1500int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1501{
1502 struct rb_node *node;
1503 struct extent_state *state;
1504 int ret = 0;
1505
1506 spin_lock_irq(&tree->lock);
1507 /*
1508 * this search will find all the extents that end after
1509 * our range starts.
1510 */
1511 node = tree_search(tree, start);
1512 if (!node) {
1513 ret = -ENOENT;
1514 goto out;
1515 }
1516 state = rb_entry(node, struct extent_state, rb_node);
1517 if (state->start != start) {
1518 ret = -ENOENT;
1519 goto out;
1520 }
1521 state->private = private;
1522out:
1523 spin_unlock_irq(&tree->lock);
1524 return ret;
1525}
1526
1527int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1528{
1529 struct rb_node *node;
1530 struct extent_state *state;
1531 int ret = 0;
1532
1533 spin_lock_irq(&tree->lock);
1534 /*
1535 * this search will find all the extents that end after
1536 * our range starts.
1537 */
1538 node = tree_search(tree, start);
1539 if (!node) {
1540 ret = -ENOENT;
1541 goto out;
1542 }
1543 state = rb_entry(node, struct extent_state, rb_node);
1544 if (state->start != start) {
1545 ret = -ENOENT;
1546 goto out;
1547 }
1548 *private = state->private;
1549out:
1550 spin_unlock_irq(&tree->lock);
1551 return ret;
1552}
1553
1554/*
1555 * searches a range in the state tree for a given mask.
1556 * If 'filled' == 1, this returns 1 only if every extent in the tree
1557 * has the bits set. Otherwise, 1 is returned if any bit in the
1558 * range is found set.
1559 */
1560int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1561 int bits, int filled)
1562{
1563 struct extent_state *state = NULL;
1564 struct rb_node *node;
1565 int bitset = 0;
1566 unsigned long flags;
1567
1568 spin_lock_irqsave(&tree->lock, flags);
1569 node = tree_search(tree, start);
1570 while (node && start <= end) {
1571 state = rb_entry(node, struct extent_state, rb_node);
1572
1573 if (filled && state->start > start) {
1574 bitset = 0;
1575 break;
1576 }
1577
1578 if (state->start > end)
1579 break;
1580
1581 if (state->state & bits) {
1582 bitset = 1;
1583 if (!filled)
1584 break;
1585 } else if (filled) {
1586 bitset = 0;
1587 break;
1588 }
1589 start = state->end + 1;
1590 if (start > end)
1591 break;
1592 node = rb_next(node);
1593 if (!node) {
1594 if (filled)
1595 bitset = 0;
1596 break;
1597 }
1598 }
1599 spin_unlock_irqrestore(&tree->lock, flags);
1600 return bitset;
1601}
1602EXPORT_SYMBOL(test_range_bit);
1603
1604/*
1605 * helper function to set a given page up to date if all the
1606 * extents in the tree for that page are up to date
1607 */
1608static int check_page_uptodate(struct extent_io_tree *tree,
1609 struct page *page)
1610{
1611 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1612 u64 end = start + PAGE_CACHE_SIZE - 1;
1613 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1614 SetPageUptodate(page);
1615 return 0;
1616}
1617
1618/*
1619 * helper function to unlock a page if all the extents in the tree
1620 * for that page are unlocked
1621 */
1622static int check_page_locked(struct extent_io_tree *tree,
1623 struct page *page)
1624{
1625 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1626 u64 end = start + PAGE_CACHE_SIZE - 1;
1627 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1628 unlock_page(page);
1629 return 0;
1630}
1631
1632/*
1633 * helper function to end page writeback if all the extents
1634 * in the tree for that page are done with writeback
1635 */
1636static int check_page_writeback(struct extent_io_tree *tree,
1637 struct page *page)
1638{
1639 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1640 u64 end = start + PAGE_CACHE_SIZE - 1;
1641 if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1642 end_page_writeback(page);
1643 return 0;
1644}
1645
1646/* lots and lots of room for performance fixes in the end_bio funcs */
1647
1648/*
1649 * after a writepage IO is done, we need to:
1650 * clear the uptodate bits on error
1651 * clear the writeback bits in the extent tree for this IO
1652 * end_page_writeback if the page has no more pending IO
1653 *
1654 * Scheduling is not allowed, so the extent state tree is expected
1655 * to have one and only one object corresponding to this IO.
1656 */
1657static void end_bio_extent_writepage(struct bio *bio, int err)
1658{
1659 int uptodate = err == 0;
1660 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1661 struct extent_io_tree *tree;
1662 u64 start;
1663 u64 end;
1664 int whole_page;
1665 int ret;
1666
1667 do {
1668 struct page *page = bvec->bv_page;
1669 tree = &BTRFS_I(page->mapping->host)->io_tree;
1670
1671 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1672 bvec->bv_offset;
1673 end = start + bvec->bv_len - 1;
1674
1675 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1676 whole_page = 1;
1677 else
1678 whole_page = 0;
1679
1680 if (--bvec >= bio->bi_io_vec)
1681 prefetchw(&bvec->bv_page->flags);
1682 if (tree->ops && tree->ops->writepage_end_io_hook) {
1683 ret = tree->ops->writepage_end_io_hook(page, start,
1684 end, NULL, uptodate);
1685 if (ret)
1686 uptodate = 0;
1687 }
1688
1689 if (!uptodate && tree->ops &&
1690 tree->ops->writepage_io_failed_hook) {
1691 ret = tree->ops->writepage_io_failed_hook(bio, page,
1692 start, end, NULL);
1693 if (ret == 0) {
1694 uptodate = (err == 0);
1695 continue;
1696 }
1697 }
1698
1699 if (!uptodate) {
1700 clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1701 ClearPageUptodate(page);
1702 SetPageError(page);
1703 }
1704
1705 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1706
1707 if (whole_page)
1708 end_page_writeback(page);
1709 else
1710 check_page_writeback(tree, page);
1711 } while (bvec >= bio->bi_io_vec);
1712
1713 bio_put(bio);
1714}
1715
1716/*
1717 * after a readpage IO is done, we need to:
1718 * clear the uptodate bits on error
1719 * set the uptodate bits if things worked
1720 * set the page up to date if all extents in the tree are uptodate
1721 * clear the lock bit in the extent tree
1722 * unlock the page if there are no other extents locked for it
1723 *
1724 * Scheduling is not allowed, so the extent state tree is expected
1725 * to have one and only one object corresponding to this IO.
1726 */
1727static void end_bio_extent_readpage(struct bio *bio, int err)
1728{
1729 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1730 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1731 struct extent_io_tree *tree;
1732 u64 start;
1733 u64 end;
1734 int whole_page;
1735 int ret;
1736
1737 do {
1738 struct page *page = bvec->bv_page;
1739 tree = &BTRFS_I(page->mapping->host)->io_tree;
1740
1741 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1742 bvec->bv_offset;
1743 end = start + bvec->bv_len - 1;
1744
1745 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1746 whole_page = 1;
1747 else
1748 whole_page = 0;
1749
1750 if (--bvec >= bio->bi_io_vec)
1751 prefetchw(&bvec->bv_page->flags);
1752
1753 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1754 ret = tree->ops->readpage_end_io_hook(page, start, end,
1755 NULL);
1756 if (ret)
1757 uptodate = 0;
1758 }
1759 if (!uptodate && tree->ops &&
1760 tree->ops->readpage_io_failed_hook) {
1761 ret = tree->ops->readpage_io_failed_hook(bio, page,
1762 start, end, NULL);
1763 if (ret == 0) {
1764 uptodate =
1765 test_bit(BIO_UPTODATE, &bio->bi_flags);
1766 continue;
1767 }
1768 }
1769
1770 if (uptodate) {
1771 set_extent_uptodate(tree, start, end,
1772 GFP_ATOMIC);
1773 }
1774 unlock_extent(tree, start, end, GFP_ATOMIC);
1775
1776 if (whole_page) {
1777 if (uptodate) {
1778 SetPageUptodate(page);
1779 } else {
1780 ClearPageUptodate(page);
1781 SetPageError(page);
1782 }
1783 unlock_page(page);
1784 } else {
1785 if (uptodate) {
1786 check_page_uptodate(tree, page);
1787 } else {
1788 ClearPageUptodate(page);
1789 SetPageError(page);
1790 }
1791 check_page_locked(tree, page);
1792 }
1793 } while (bvec >= bio->bi_io_vec);
1794
1795 bio_put(bio);
1796}
1797
1798/*
1799 * IO done from prepare_write is pretty simple, we just unlock
1800 * the structs in the extent tree when done, and set the uptodate bits
1801 * as appropriate.
1802 */
1803static void end_bio_extent_preparewrite(struct bio *bio, int err)
1804{
1805 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1806 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1807 struct extent_io_tree *tree;
1808 u64 start;
1809 u64 end;
1810
1811 do {
1812 struct page *page = bvec->bv_page;
1813 tree = &BTRFS_I(page->mapping->host)->io_tree;
1814
1815 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1816 bvec->bv_offset;
1817 end = start + bvec->bv_len - 1;
1818
1819 if (--bvec >= bio->bi_io_vec)
1820 prefetchw(&bvec->bv_page->flags);
1821
1822 if (uptodate) {
1823 set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1824 } else {
1825 ClearPageUptodate(page);
1826 SetPageError(page);
1827 }
1828
1829 unlock_extent(tree, start, end, GFP_ATOMIC);
1830
1831 } while (bvec >= bio->bi_io_vec);
1832
1833 bio_put(bio);
1834}
1835
1836static struct bio *
1837extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1838 gfp_t gfp_flags)
1839{
1840 struct bio *bio;
1841
1842 bio = bio_alloc(gfp_flags, nr_vecs);
1843
1844 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1845 while (!bio && (nr_vecs /= 2))
1846 bio = bio_alloc(gfp_flags, nr_vecs);
1847 }
1848
1849 if (bio) {
1850 bio->bi_size = 0;
1851 bio->bi_bdev = bdev;
1852 bio->bi_sector = first_sector;
1853 }
1854 return bio;
1855}
1856
1857static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1858 unsigned long bio_flags)
1859{
1860 int ret = 0;
1861 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1862 struct page *page = bvec->bv_page;
1863 struct extent_io_tree *tree = bio->bi_private;
1864 u64 start;
1865 u64 end;
1866
1867 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1868 end = start + bvec->bv_len - 1;
1869
1870 bio->bi_private = NULL;
1871
1872 bio_get(bio);
1873
1874 if (tree->ops && tree->ops->submit_bio_hook)
1875 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1876 mirror_num, bio_flags);
1877 else
1878 submit_bio(rw, bio);
1879 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1880 ret = -EOPNOTSUPP;
1881 bio_put(bio);
1882 return ret;
1883}
1884
1885static int submit_extent_page(int rw, struct extent_io_tree *tree,
1886 struct page *page, sector_t sector,
1887 size_t size, unsigned long offset,
1888 struct block_device *bdev,
1889 struct bio **bio_ret,
1890 unsigned long max_pages,
1891 bio_end_io_t end_io_func,
1892 int mirror_num,
1893 unsigned long prev_bio_flags,
1894 unsigned long bio_flags)
1895{
1896 int ret = 0;
1897 struct bio *bio;
1898 int nr;
1899 int contig = 0;
1900 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1901 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1902 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1903
1904 if (bio_ret && *bio_ret) {
1905 bio = *bio_ret;
1906 if (old_compressed)
1907 contig = bio->bi_sector == sector;
1908 else
1909 contig = bio->bi_sector + (bio->bi_size >> 9) ==
1910 sector;
1911
1912 if (prev_bio_flags != bio_flags || !contig ||
1913 (tree->ops && tree->ops->merge_bio_hook &&
1914 tree->ops->merge_bio_hook(page, offset, page_size, bio,
1915 bio_flags)) ||
1916 bio_add_page(bio, page, page_size, offset) < page_size) {
1917 ret = submit_one_bio(rw, bio, mirror_num,
1918 prev_bio_flags);
1919 bio = NULL;
1920 } else {
1921 return 0;
1922 }
1923 }
1924 if (this_compressed)
1925 nr = BIO_MAX_PAGES;
1926 else
1927 nr = bio_get_nr_vecs(bdev);
1928
1929 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1930 if (!bio) {
1931 printk("failed to allocate bio nr %d\n", nr);
1932 }
1933
1934 bio_add_page(bio, page, page_size, offset);
1935 bio->bi_end_io = end_io_func;
1936 bio->bi_private = tree;
1937
1938 if (bio_ret) {
1939 *bio_ret = bio;
1940 } else {
1941 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1942 }
1943
1944 return ret;
1945}
1946
1947void set_page_extent_mapped(struct page *page)
1948{
1949 if (!PagePrivate(page)) {
1950 SetPagePrivate(page);
1951 page_cache_get(page);
1952 set_page_private(page, EXTENT_PAGE_PRIVATE);
1953 }
1954}
1955EXPORT_SYMBOL(set_page_extent_mapped);
1956
1957void set_page_extent_head(struct page *page, unsigned long len)
1958{
1959 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1960}
1961
1962/*
1963 * basic readpage implementation. Locked extent state structs are inserted
1964 * into the tree that are removed when the IO is done (by the end_io
1965 * handlers)
1966 */
/*
 * Read one page, walking it extent by extent.
 *
 * @tree:       io tree whose range [page start, page end] is locked here
 * @page:       page to fill
 * @get_extent: callback used to look up the extent map for each piece
 * @bio:        in/out bio being accumulated across calls; pieces are added
 *              through submit_extent_page()
 * @mirror_num: passed through to submit_extent_page()
 * @bio_flags:  in/out flags describing *bio; updated to the flags of the
 *              last piece that was handed to submit_extent_page()
 *
 * Pieces that need no IO (past i_size, holes, preallocated extents, ranges
 * already uptodate) are unlocked here.  When no IO was queued at all
 * (nr == 0) the page itself is marked uptodate (unless PageError is set)
 * and unlocked before returning.
 *
 * Always returns 0; failures are recorded with SetPageError().
 */
1967static int __extent_read_full_page(struct extent_io_tree *tree,
1968 struct page *page,
1969 get_extent_t *get_extent,
1970 struct bio **bio, int mirror_num,
1971 unsigned long *bio_flags)
1972{
1973 struct inode *inode = page->mapping->host;
1974 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1975 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1976 u64 end;
1977 u64 cur = start;
1978 u64 extent_offset;
1979 u64 last_byte = i_size_read(inode);
1980 u64 block_start;
1981 u64 cur_end;
1982 sector_t sector;
1983 struct extent_map *em;
1984 struct block_device *bdev;
1985 int ret;
1986 int nr = 0;
1987 size_t page_offset = 0;
1988 size_t iosize;
1989 size_t disk_io_size;
1990 size_t blocksize = inode->i_sb->s_blocksize;
/*
 * NOTE(review): this_bio_flag is only ever set inside the loop below, never
 * cleared, so once a compressed extent is seen every later piece of this
 * page is treated as compressed too -- confirm this is intended.
 */
1991 unsigned long this_bio_flag = 0;
1992
1993 set_page_extent_mapped(page);
1994
1995 end = page_end;
1996 lock_extent(tree, start, end, GFP_NOFS);
1997
/* page contains i_size: zero everything in the page beyond EOF up front */
1998 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1999 char *userpage;
2000 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2001
2002 if (zero_offset) {
2003 iosize = PAGE_CACHE_SIZE - zero_offset;
2004 userpage = kmap_atomic(page, KM_USER0);
2005 memset(userpage + zero_offset, 0, iosize);
2006 flush_dcache_page(page);
2007 kunmap_atomic(userpage, KM_USER0);
2008 }
2009 }
2010 while (cur <= end) {
/* at or past i_size: zero the rest of the page, mark it uptodate, stop */
2011 if (cur >= last_byte) {
2012 char *userpage;
2013 iosize = PAGE_CACHE_SIZE - page_offset;
2014 userpage = kmap_atomic(page, KM_USER0);
2015 memset(userpage + page_offset, 0, iosize);
2016 flush_dcache_page(page);
2017 kunmap_atomic(userpage, KM_USER0);
2018 set_extent_uptodate(tree, cur, cur + iosize - 1,
2019 GFP_NOFS);
2020 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2021 break;
2022 }
2023 em = get_extent(inode, page, page_offset, cur,
2024 end - cur + 1, 0);
/* no mapping: flag the page, drop the lock on what is left and give up */
2025 if (IS_ERR(em) || !em) {
2026 SetPageError(page);
2027 unlock_extent(tree, cur, end, GFP_NOFS);
2028 break;
2029 }
2030 extent_offset = cur - em->start;
/* debugging output printed just before the BUG_ON()s fire */
2031 if (extent_map_end(em) <= cur) {
2032printk("bad mapping em [%Lu %Lu] cur %Lu\n", em->start, extent_map_end(em), cur);
2033 }
2034 BUG_ON(extent_map_end(em) <= cur);
2035 if (end < cur) {
2036printk("2bad mapping end %Lu cur %Lu\n", end, cur);
2037 }
2038 BUG_ON(end < cur);
2039
2040 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2041 this_bio_flag = EXTENT_BIO_COMPRESSED;
2042
/*
 * iosize: bytes of this page covered by the extent, rounded up to the
 * block size.  cur_end: last byte of that range, not rounded.
 */
2043 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2044 cur_end = min(extent_map_end(em) - 1, end);
2045 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
/*
 * compressed: IO covers the whole on-disk extent from its first sector;
 * otherwise IO starts at the matching offset inside the extent.
 */
2046 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2047 disk_io_size = em->block_len;
2048 sector = em->block_start >> 9;
2049 } else {
2050 sector = (em->block_start + extent_offset) >> 9;
2051 disk_io_size = iosize;
2052 }
2053 bdev = em->bdev;
2054 block_start = em->block_start;
/* preallocated extents are handled exactly like holes below */
2055 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2056 block_start = EXTENT_MAP_HOLE;
/* everything needed from em has been copied out; drop our reference */
2057 free_extent_map(em);
2058 em = NULL;
2059
2060 /* we've found a hole, just zero and go on */
2061 if (block_start == EXTENT_MAP_HOLE) {
2062 char *userpage;
2063 userpage = kmap_atomic(page, KM_USER0);
2064 memset(userpage + page_offset, 0, iosize);
2065 flush_dcache_page(page);
2066 kunmap_atomic(userpage, KM_USER0);
2067
2068 set_extent_uptodate(tree, cur, cur + iosize - 1,
2069 GFP_NOFS);
2070 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2071 cur = cur + iosize;
2072 page_offset += iosize;
2073 continue;
2074 }
2075 /* the get_extent function already copied into the page */
2076 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
2077 check_page_uptodate(tree, page);
2078 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2079 cur = cur + iosize;
2080 page_offset += iosize;
2081 continue;
2082 }
2083 /* we have an inline extent but it didn't get marked up
2084 * to date. Error out
2085 */
2086 if (block_start == EXTENT_MAP_INLINE) {
2087 SetPageError(page);
2088 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2089 cur = cur + iosize;
2090 page_offset += iosize;
2091 continue;
2092 }
2093
/* real IO is needed: let the hook veto it, then queue the piece */
2094 ret = 0;
2095 if (tree->ops && tree->ops->readpage_io_hook) {
2096 ret = tree->ops->readpage_io_hook(page, cur,
2097 cur + iosize - 1);
2098 }
2099 if (!ret) {
/* pnr: number of pages from this one through the page holding i_size */
2100 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2101 pnr -= page->index;
2102 ret = submit_extent_page(READ, tree, page,
2103 sector, disk_io_size, page_offset,
2104 bdev, bio, pnr,
2105 end_bio_extent_readpage, mirror_num,
2106 *bio_flags,
2107 this_bio_flag);
2108 nr++;
2109 *bio_flags = this_bio_flag;
2110 }
2111 if (ret)
2112 SetPageError(page);
2113 cur = cur + iosize;
2114 page_offset += iosize;
2115 }
/* nothing was queued, so no end_io handler will finish this page for us */
2116 if (!nr) {
2117 if (!PageError(page))
2118 SetPageUptodate(page);
2119 unlock_page(page);
2120 }
2121 return 0;
2122}
2123
2124int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2125 get_extent_t *get_extent)
2126{
2127 struct bio *bio = NULL;
2128 unsigned long bio_flags = 0;
2129 int ret;
2130
2131 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2132 &bio_flags);
2133 if (bio)
2134 submit_one_bio(READ, bio, 0, bio_flags);
2135 return ret;
2136}
2137EXPORT_SYMBOL(extent_read_full_page);
2138
2139/*
2140 * the writepage semantics are similar to regular writepage. extent
2141 * records are inserted to lock ranges in the tree, and as dirty areas
2142 * are found, they are marked writeback. Then the lock bits are removed
2143 * and the end_io handler clears the writeback ranges
2144 */
/*
 * Write one locked page.
 *
 * @page: page to write; a WARN_ON fires if it is not locked
 * @wbc:  writeback control; nr_to_write is reduced by nr_written on exit
 *        and the mapping's writeback_index may be advanced
 * @data: struct extent_page_data carrying the io tree, the get_extent
 *        callback, the bio being accumulated and the extent_locked flag
 *
 * Always returns 0.
 * NOTE(review): values stored in the local 'ret' (from the hooks and from
 * submit_extent_page()) are only used to set PageError; they are never
 * returned to the caller.
 */
2145static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2146 void *data)
2147{
2148 struct inode *inode = page->mapping->host;
2149 struct extent_page_data *epd = data;
2150 struct extent_io_tree *tree = epd->tree;
2151 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2152 u64 delalloc_start;
2153 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2154 u64 end;
2155 u64 cur = start;
2156 u64 extent_offset;
2157 u64 last_byte = i_size_read(inode);
2158 u64 block_start;
2159 u64 iosize;
2160 u64 unlock_start;
2161 sector_t sector;
2162 struct extent_map *em;
2163 struct block_device *bdev;
2164 int ret;
2165 int nr = 0;
2166 size_t pg_offset = 0;
2167 size_t blocksize;
2168 loff_t i_size = i_size_read(inode);
2169 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2170 u64 nr_delalloc;
2171 u64 delalloc_end;
2172 int page_started;
2173 int compressed;
2174 unsigned long nr_written = 0;
2175
2176 WARN_ON(!PageLocked(page));
/* page lies entirely at or beyond i_size: invalidate it instead of writing */
2177 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2178 if (page->index > end_index ||
2179 (page->index == end_index && !pg_offset)) {
2180 page->mapping->a_ops->invalidatepage(page, 0);
2181 unlock_page(page);
2182 return 0;
2183 }
2184
/* page contains i_size: zero the part of the page beyond EOF */
2185 if (page->index == end_index) {
2186 char *userpage;
2187
2188 userpage = kmap_atomic(page, KM_USER0);
2189 memset(userpage + pg_offset, 0,
2190 PAGE_CACHE_SIZE - pg_offset);
2191 kunmap_atomic(userpage, KM_USER0);
2192 flush_dcache_page(page);
2193 }
/* from here on pg_offset tracks the offset of 'cur' within the page */
2194 pg_offset = 0;
2195
2196 set_page_extent_mapped(page);
2197
2198 delalloc_start = start;
2199 delalloc_end = 0;
2200 page_started = 0;
/*
 * Unless the caller says the extent is already locked, find every delalloc
 * range that starts in this page and hand it to the fill_delalloc hook.
 */
2201 if (!epd->extent_locked) {
2202 while(delalloc_end < page_end) {
2203 nr_delalloc = find_lock_delalloc_range(inode, tree,
2204 page,
2205 &delalloc_start,
2206 &delalloc_end,
2207 128 * 1024 * 1024);
2208 if (nr_delalloc == 0) {
2209 delalloc_start = delalloc_end + 1;
2210 continue;
2211 }
2212 tree->ops->fill_delalloc(inode, page, delalloc_start,
2213 delalloc_end, &page_started,
2214 &nr_written);
2215 delalloc_start = delalloc_end + 1;
2216 }
2217
2218 /* did the fill delalloc function already unlock and start
2219 * the IO?
2220 */
2221 if (page_started) {
2222 ret = 0;
2223 goto update_nr_written;
2224 }
2225 }
2226 lock_extent(tree, start, page_end, GFP_NOFS);
2227
/* unlock_start: first byte of the page range that still holds the lock */
2228 unlock_start = start;
2229
/* the hook may ask for a retry: redirty the page and leave it for later */
2230 if (tree->ops && tree->ops->writepage_start_hook) {
2231 ret = tree->ops->writepage_start_hook(page, start,
2232 page_end);
2233 if (ret == -EAGAIN) {
2234 unlock_extent(tree, start, page_end, GFP_NOFS);
2235 redirty_page_for_writepage(wbc, page);
2236 unlock_page(page);
2237 ret = 0;
2238 goto update_nr_written;
2239 }
2240 }
2241
2242 nr_written++;
2243
2244 end = page_end;
2245 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
2246 printk("found delalloc bits after lock_extent\n");
2247 }
2248
/* whole page is at or past i_size: nothing to write, just clean up state */
2249 if (last_byte <= start) {
2250 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
2251 unlock_extent(tree, start, page_end, GFP_NOFS);
2252 if (tree->ops && tree->ops->writepage_end_io_hook)
2253 tree->ops->writepage_end_io_hook(page, start,
2254 page_end, NULL, 1);
2255 unlock_start = page_end + 1;
2256 goto done;
2257 }
2258
2259 set_extent_uptodate(tree, start, page_end, GFP_NOFS);
2260 blocksize = inode->i_sb->s_blocksize;
2261
2262 while (cur <= end) {
/* rest of the page is past i_size: clean, unlock, notify and stop */
2263 if (cur >= last_byte) {
2264 clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
2265 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2266 if (tree->ops && tree->ops->writepage_end_io_hook)
2267 tree->ops->writepage_end_io_hook(page, cur,
2268 page_end, NULL, 1);
2269 unlock_start = page_end + 1;
2270 break;
2271 }
2272 em = epd->get_extent(inode, page, pg_offset, cur,
2273 end - cur + 1, 1);
2274 if (IS_ERR(em) || !em) {
2275 SetPageError(page);
2276 break;
2277 }
2278
2279 extent_offset = cur - em->start;
2280 BUG_ON(extent_map_end(em) <= cur);
2281 BUG_ON(end < cur);
/* bytes of this page covered by the extent, rounded up to the block size */
2282 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2283 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2284 sector = (em->block_start + extent_offset) >> 9;
2285 bdev = em->bdev;
2286 block_start = em->block_start;
2287 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2288 free_extent_map(em);
2289 em = NULL;
2290
2291 /*
2292 * compressed and inline extents are written through other
2293 * paths in the FS
2294 */
2295 if (compressed || block_start == EXTENT_MAP_HOLE ||
2296 block_start == EXTENT_MAP_INLINE) {
2297 clear_extent_dirty(tree, cur,
2298 cur + iosize - 1, GFP_NOFS);
2299
2300 unlock_extent(tree, unlock_start, cur + iosize -1,
2301 GFP_NOFS);
2302
2303 /*
2304 * end_io notification does not happen here for
2305 * compressed extents
2306 */
2307 if (!compressed && tree->ops &&
2308 tree->ops->writepage_end_io_hook)
2309 tree->ops->writepage_end_io_hook(page, cur,
2310 cur + iosize - 1,
2311 NULL, 1);
2312 else if (compressed) {
2313 /* we don't want to end_page_writeback on
2314 * a compressed extent. this happens
2315 * elsewhere
2316 */
2317 nr++;
2318 }
2319
2320 cur += iosize;
2321 pg_offset += iosize;
2322 unlock_start = cur;
2323 continue;
2324 }
2325 /* leave this out until we have a page_mkwrite call */
/* the leading '0 &&' makes this branch unreachable */
2326 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2327 EXTENT_DIRTY, 0)) {
2328 cur = cur + iosize;
2329 pg_offset += iosize;
2330 continue;
2331 }
2332
2333 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2334 if (tree->ops && tree->ops->writepage_io_hook) {
2335 ret = tree->ops->writepage_io_hook(page, cur,
2336 cur + iosize - 1);
2337 } else {
2338 ret = 0;
2339 }
2340 if (ret) {
2341 SetPageError(page);
2342 } else {
2343 unsigned long max_nr = end_index + 1;
2344
2345 set_range_writeback(tree, cur, cur + iosize - 1);
2346 if (!PageWriteback(page)) {
2347 printk("warning page %lu not writeback, "
2348 "cur %llu end %llu\n", page->index,
2349 (unsigned long long)cur,
2350 (unsigned long long)end);
2351 }
2352
/* queue the range on the shared write bio kept in epd->bio */
2353 ret = submit_extent_page(WRITE, tree, page, sector,
2354 iosize, pg_offset, bdev,
2355 &epd->bio, max_nr,
2356 end_bio_extent_writepage,
2357 0, 0, 0);
2358 if (ret)
2359 SetPageError(page);
2360 }
2361 cur = cur + iosize;
2362 pg_offset += iosize;
2363 nr++;
2364 }
2365done:
2366 if (nr == 0) {
2367 /* make sure the mapping tag for page dirty gets cleared */
2368 set_page_writeback(page);
2369 end_page_writeback(page);
2370 }
/* drop whatever part of the page range is still locked */
2371 if (unlock_start <= page_end)
2372 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2373 unlock_page(page);
2374
2375update_nr_written:
2376 wbc->nr_to_write -= nr_written;
2377 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2378 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2379 page->mapping->writeback_index = page->index + nr_written;
2380 return 0;
2381}
2382
2383/**
2384 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2385 * @mapping: address space structure to write
2386 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2387 * @writepage: function called for each page
2388 * @data: data passed to writepage function
2389 *
2390 * If a page is already under I/O, write_cache_pages() skips it, even
2391 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2392 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2393 * and msync() need to guarantee that all the data which was dirty at the time
2394 * the call was made get new I/O started against them. If wbc->sync_mode is
2395 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2396 * existing IO to complete.
2397 */
/*
 * Arguments specific to this variant:
 *
 * @tree:     io tree; when tree->ops->write_cache_pages_lock_hook is set it
 *            is called in place of lock_page() for every candidate page
 * @flush_fn: called with @data just before waiting on page writeback when
 *            wbc->sync_mode is not WB_SYNC_NONE
 *
 * Returns 0 when the scan is skipped because of congestion, otherwise the
 * value left in 'ret' by the last @writepage call (AOP_WRITEPAGE_ACTIVATE
 * is turned into 0).
 */
2398int extent_write_cache_pages(struct extent_io_tree *tree,
2399 struct address_space *mapping,
2400 struct writeback_control *wbc,
2401 writepage_t writepage, void *data,
2402 void (*flush_fn)(void *))
2403{
2404 struct backing_dev_info *bdi = mapping->backing_dev_info;
2405 int ret = 0;
2406 int done = 0;
2407 struct pagevec pvec;
2408 int nr_pages;
2409 pgoff_t index;
2410 pgoff_t end; /* Inclusive */
2411 int scanned = 0;
2412 int range_whole = 0;
2413
/* non-blocking writeback backs off immediately on a congested device */
2414 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2415 wbc->encountered_congestion = 1;
2416 return 0;
2417 }
2418
2419 pagevec_init(&pvec, 0);
/*
 * cyclic: resume where the previous pass stopped and allow one wrap-around
 * (scanned stays 0); otherwise scan exactly the requested byte range.
 */
2420 if (wbc->range_cyclic) {
2421 index = mapping->writeback_index; /* Start from prev offset */
2422 end = -1;
2423 } else {
2424 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2425 end = wbc->range_end >> PAGE_CACHE_SHIFT;
/* NOTE(review): range_whole is set here but not read again in this function */
2426 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2427 range_whole = 1;
2428 scanned = 1;
2429 }
2430retry:
2431 while (!done && (index <= end) &&
2432 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2433 PAGECACHE_TAG_DIRTY,
2434 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2435 unsigned i;
2436
2437 scanned = 1;
/*
 * NOTE(review): setting 'done' inside this loop does not leave it; the
 * remaining pages of the current pagevec are still processed.
 */
2438 for (i = 0; i < nr_pages; i++) {
2439 struct page *page = pvec.pages[i];
2440
2441 /*
2442 * At this point we hold neither mapping->tree_lock nor
2443 * lock on the page itself: the page may be truncated or
2444 * invalidated (changing page->mapping to NULL), or even
2445 * swizzled back from swapper_space to tmpfs file
2446 * mapping
2447 */
2448 if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2449 tree->ops->write_cache_pages_lock_hook(page);
2450 else
2451 lock_page(page);
2452
/* page no longer belongs to this mapping: skip it */
2453 if (unlikely(page->mapping != mapping)) {
2454 unlock_page(page);
2455 continue;
2456 }
2457
2458 if (!wbc->range_cyclic && page->index > end) {
2459 done = 1;
2460 unlock_page(page);
2461 continue;
2462 }
2463
/* data integrity mode: flush our pending bio, then wait for the page */
2464 if (wbc->sync_mode != WB_SYNC_NONE) {
2465 flush_fn(data);
2466 wait_on_page_writeback(page);
2467 }
2468
/* skip pages still under writeback or no longer dirty */
2469 if (PageWriteback(page) ||
2470 !clear_page_dirty_for_io(page)) {
2471 unlock_page(page);
2472 continue;
2473 }
2474
2475 ret = (*writepage)(page, wbc, data);
2476
2477 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2478 unlock_page(page);
2479 ret = 0;
2480 }
2481 if (ret || wbc->nr_to_write <= 0)
2482 done = 1;
2483 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2484 wbc->encountered_congestion = 1;
2485 done = 1;
2486 }
2487 }
2488 pagevec_release(&pvec);
2489 cond_resched();
2490 }
2491 if (!scanned && !done) {
2492 /*
2493 * We hit the last page and there is more work to be done: wrap
2494 * back to the start of the file
2495 */
2496 scanned = 1;
2497 index = 0;
2498 goto retry;
2499 }
2500 return ret;
2501}
2502EXPORT_SYMBOL(extent_write_cache_pages);
2503
2504static noinline void flush_write_bio(void *data)
2505{
2506 struct extent_page_data *epd = data;
2507 if (epd->bio) {
2508 submit_one_bio(WRITE, epd->bio, 0, 0);
2509 epd->bio = NULL;
2510 }
2511}
2512
2513int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2514 get_extent_t *get_extent,
2515 struct writeback_control *wbc)
2516{
2517 int ret;
2518 struct address_space *mapping = page->mapping;
2519 struct extent_page_data epd = {
2520 .bio = NULL,
2521 .tree = tree,
2522 .get_extent = get_extent,
2523 .extent_locked = 0,
2524 };
2525 struct writeback_control wbc_writepages = {
2526 .bdi = wbc->bdi,
2527 .sync_mode = WB_SYNC_NONE,
2528 .older_than_this = NULL,
2529 .nr_to_write = 64,
2530 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2531 .range_end = (loff_t)-1,
2532 };
2533
2534
2535 ret = __extent_writepage(page, wbc, &epd);
2536
2537 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2538 __extent_writepage, &epd, flush_write_bio);
2539 if (epd.bio) {
2540 submit_one_bio(WRITE, epd.bio, 0, 0);
2541 }
2542 return ret;
2543}
2544EXPORT_SYMBOL(extent_write_full_page);
2545
2546int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2547 u64 start, u64 end, get_extent_t *get_extent,
2548 int mode)
2549{
2550 int ret = 0;
2551 struct address_space *mapping = inode->i_mapping;
2552 struct page *page;
2553 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2554 PAGE_CACHE_SHIFT;
2555
2556 struct extent_page_data epd = {
2557 .bio = NULL,
2558 .tree = tree,
2559 .get_extent = get_extent,
2560 .extent_locked = 1,
2561 };
2562 struct writeback_control wbc_writepages = {
2563 .bdi = inode->i_mapping->backing_dev_info,
2564 .sync_mode = mode,
2565 .older_than_this = NULL,
2566 .nr_to_write = nr_pages * 2,
2567 .range_start = start,
2568 .range_end = end + 1,
2569 };
2570
2571 while(start <= end) {
2572 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2573 if (clear_page_dirty_for_io(page))
2574 ret = __extent_writepage(page, &wbc_writepages, &epd);
2575 else {
2576 if (tree->ops && tree->ops->writepage_end_io_hook)
2577 tree->ops->writepage_end_io_hook(page, start,
2578 start + PAGE_CACHE_SIZE - 1,
2579 NULL, 1);
2580 unlock_page(page);
2581 }
2582 page_cache_release(page);
2583 start += PAGE_CACHE_SIZE;
2584 }
2585
2586 if (epd.bio)
2587 submit_one_bio(WRITE, epd.bio, 0, 0);
2588 return ret;
2589}
2590EXPORT_SYMBOL(extent_write_locked_range);
2591
2592
2593int extent_writepages(struct extent_io_tree *tree,
2594 struct address_space *mapping,
2595 get_extent_t *get_extent,
2596 struct writeback_control *wbc)
2597{
2598 int ret = 0;
2599 struct extent_page_data epd = {
2600 .bio = NULL,
2601 .tree = tree,
2602 .get_extent = get_extent,
2603 .extent_locked = 0,
2604 };
2605
2606 ret = extent_write_cache_pages(tree, mapping, wbc,
2607 __extent_writepage, &epd,
2608 flush_write_bio);
2609 if (epd.bio) {
2610 submit_one_bio(WRITE, epd.bio, 0, 0);
2611 }
2612 return ret;
2613}
2614EXPORT_SYMBOL(extent_writepages);
2615
2616int extent_readpages(struct extent_io_tree *tree,
2617 struct address_space *mapping,
2618 struct list_head *pages, unsigned nr_pages,
2619 get_extent_t get_extent)
2620{
2621 struct bio *bio = NULL;
2622 unsigned page_idx;
2623 struct pagevec pvec;
2624 unsigned long bio_flags = 0;
2625
2626 pagevec_init(&pvec, 0);
2627 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2628 struct page *page = list_entry(pages->prev, struct page, lru);
2629
2630 prefetchw(&page->flags);
2631 list_del(&page->lru);
2632 /*
2633 * what we want to do here is call add_to_page_cache_lru,
2634 * but that isn't exported, so we reproduce it here
2635 */
2636 if (!add_to_page_cache(page, mapping,
2637 page->index, GFP_KERNEL)) {
2638
2639 /* open coding of lru_cache_add, also not exported */
2640 page_cache_get(page);
2641 if (!pagevec_add(&pvec, page))
2642 __pagevec_lru_add(&pvec);
2643 __extent_read_full_page(tree, page, get_extent,
2644 &bio, 0, &bio_flags);
2645 }
2646 page_cache_release(page);
2647 }
2648 if (pagevec_count(&pvec))
2649 __pagevec_lru_add(&pvec);
2650 BUG_ON(!list_empty(pages));
2651 if (bio)
2652 submit_one_bio(READ, bio, 0, bio_flags);
2653 return 0;
2654}
2655EXPORT_SYMBOL(extent_readpages);
2656
2657/*
2658 * basic invalidatepage code, this waits on any locked or writeback
2659 * ranges corresponding to the page, and then deletes any extent state
2660 * records from the tree
2661 */
2662int extent_invalidatepage(struct extent_io_tree *tree,
2663 struct page *page, unsigned long offset)
2664{
2665 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2666 u64 end = start + PAGE_CACHE_SIZE - 1;
2667 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2668
2669 start += (offset + blocksize -1) & ~(blocksize - 1);
2670 if (start > end)
2671 return 0;
2672
2673 lock_extent(tree, start, end, GFP_NOFS);
2674 wait_on_extent_writeback(tree, start, end);
2675 clear_extent_bit(tree, start, end,
2676 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2677 1, 1, GFP_NOFS);
2678 return 0;
2679}
2680EXPORT_SYMBOL(extent_invalidatepage);
2681
2682/*
2683 * simple commit_write call, set_range_dirty is used to mark both
2684 * the pages and the extent records as dirty
2685 */
2686int extent_commit_write(struct extent_io_tree *tree,
2687 struct inode *inode, struct page *page,
2688 unsigned from, unsigned to)
2689{
2690 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2691
2692 set_page_extent_mapped(page);
2693 set_page_dirty(page);
2694
2695 if (pos > inode->i_size) {
2696 i_size_write(inode, pos);
2697 mark_inode_dirty(inode);
2698 }
2699 return 0;
2700}
2701EXPORT_SYMBOL(extent_commit_write);
2702
/*
 * Prepare a page for a write covering bytes [@from, @to) of the page.
 *
 * The whole page range is locked in @tree, then the block-aligned range
 * around [@from, @to) is walked one extent at a time:
 *  - for ranges flagged new (clear_extent_new() returned non-zero) on a
 *    page that is not uptodate, the parts of the block outside
 *    [@from, @to) are zeroed;
 *  - mapped, not-new, not-uptodate ranges that are only partially
 *    overwritten are read from disk;
 *  - everything else is marked uptodate and unlocked right away.
 * When reads were issued, the function waits for EXTENT_LOCKED to clear on
 * the range before returning.
 *
 * NOTE(review): 'err' is never assigned after initialisation, so this
 * function always returns 0.  When get_extent() fails the code jumps to
 * 'err:' without unlocking the remaining range and without waiting for
 * reads already issued.  The result of submit_extent_page() stored in 'ret'
 * is not checked.
 */
2703int extent_prepare_write(struct extent_io_tree *tree,
2704 struct inode *inode, struct page *page,
2705 unsigned from, unsigned to, get_extent_t *get_extent)
2706{
2707 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2708 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2709 u64 block_start;
2710 u64 orig_block_start;
2711 u64 block_end;
2712 u64 cur_end;
2713 struct extent_map *em;
2714 unsigned blocksize = 1 << inode->i_blkbits;
2715 size_t page_offset = 0;
2716 size_t block_off_start;
2717 size_t block_off_end;
2718 int err = 0;
2719 int iocount = 0;
2720 int ret = 0;
2721 int isnew;
2722
2723 set_page_extent_mapped(page);
2724
/* round the written range out to block boundaries (file offsets) */
2725 block_start = (page_start + from) & ~((u64)blocksize - 1);
2726 block_end = (page_start + to - 1) | (blocksize - 1);
2727 orig_block_start = block_start;
2728
2729 lock_extent(tree, page_start, page_end, GFP_NOFS);
2730 while(block_start <= block_end) {
2731 em = get_extent(inode, page, page_offset, block_start,
2732 block_end - block_start + 1, 1);
2733 if (IS_ERR(em) || !em) {
2734 goto err;
2735 }
2736 cur_end = min(block_end, extent_map_end(em) - 1);
/* offsets within the page of the first block of this step */
2737 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2738 block_off_end = block_off_start + blocksize;
2739 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2740
/* new block, partially written: zero the parts the write will not cover */
2741 if (!PageUptodate(page) && isnew &&
2742 (block_off_end > to || block_off_start < from)) {
2743 void *kaddr;
2744
2745 kaddr = kmap_atomic(page, KM_USER0);
2746 if (block_off_end > to)
2747 memset(kaddr + to, 0, block_off_end - to);
2748 if (block_off_start < from)
2749 memset(kaddr + block_off_start, 0,
2750 from - block_off_start);
2751 flush_dcache_page(page);
2752 kunmap_atomic(kaddr, KM_USER0);
2753 }
/* existing on-disk data that is only partly overwritten must be read in */
2754 if ((em->block_start != EXTENT_MAP_HOLE &&
2755 em->block_start != EXTENT_MAP_INLINE) &&
2756 !isnew && !PageUptodate(page) &&
2757 (block_off_end > to || block_off_start < from) &&
2758 !test_range_bit(tree, block_start, cur_end,
2759 EXTENT_UPTODATE, 1)) {
2760 u64 sector;
2761 u64 extent_offset = block_start - em->start;
2762 size_t iosize;
2763 sector = (em->block_start + extent_offset) >> 9;
2764 iosize = (cur_end - block_start + blocksize) &
2765 ~((u64)blocksize - 1);
2766 /*
2767 * we've already got the extent locked, but we
2768 * need to split the state such that our end_bio
2769 * handler can clear the lock.
2770 */
2771 set_extent_bit(tree, block_start,
2772 block_start + iosize - 1,
2773 EXTENT_LOCKED, 0, NULL, GFP_NOFS);
/* bio_ret is NULL here, so the read is submitted immediately */
2774 ret = submit_extent_page(READ, tree, page,
2775 sector, iosize, page_offset, em->bdev,
2776 NULL, 1,
2777 end_bio_extent_preparewrite, 0,
2778 0, 0);
2779 iocount++;
2780 block_start = block_start + iosize;
2781 } else {
2782 set_extent_uptodate(tree, block_start, cur_end,
2783 GFP_NOFS);
2784 unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2785 block_start = cur_end + 1;
2786 }
2787 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2788 free_extent_map(em);
2789 }
/* wait until every read issued above has dropped EXTENT_LOCKED */
2790 if (iocount) {
2791 wait_extent_bit(tree, orig_block_start,
2792 block_end, EXTENT_LOCKED);
2793 }
2794 check_page_uptodate(tree, page);
2795err:
2796 /* FIXME, zero out newly allocated blocks on error */
2797 return err;
2798}
2799EXPORT_SYMBOL(extent_prepare_write);
2800
2801/*
2802 * a helper for releasepage, this tests for areas of the page that
2803 * are locked or under IO and drops the related state bits if it is safe
2804 * to drop the page.
2805 */
2806int try_release_extent_state(struct extent_map_tree *map,
2807 struct extent_io_tree *tree, struct page *page,
2808 gfp_t mask)
2809{
2810 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2811 u64 end = start + PAGE_CACHE_SIZE - 1;
2812 int ret = 1;
2813
2814 if (test_range_bit(tree, start, end,
2815 EXTENT_IOBITS | EXTENT_ORDERED, 0))
2816 ret = 0;
2817 else {
2818 if ((mask & GFP_NOFS) == GFP_NOFS)
2819 mask = GFP_NOFS;
2820 clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2821 1, 1, mask);
2822 }
2823 return ret;
2824}
2825EXPORT_SYMBOL(try_release_extent_state);
2826
2827/*
2828 * a helper for releasepage. As long as there are no locked extents
2829 * in the range corresponding to the page, both state records and extent
2830 * map records are removed
2831 */
2832int try_release_extent_mapping(struct extent_map_tree *map,
2833 struct extent_io_tree *tree, struct page *page,
2834 gfp_t mask)
2835{
2836 struct extent_map *em;
2837 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2838 u64 end = start + PAGE_CACHE_SIZE - 1;
2839
2840 if ((mask & __GFP_WAIT) &&
2841 page->mapping->host->i_size > 16 * 1024 * 1024) {
2842 u64 len;
2843 while (start <= end) {
2844 len = end - start + 1;
2845 spin_lock(&map->lock);
2846 em = lookup_extent_mapping(map, start, len);
2847 if (!em || IS_ERR(em)) {
2848 spin_unlock(&map->lock);
2849 break;
2850 }
2851 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2852 em->start != start) {
2853 spin_unlock(&map->lock);
2854 free_extent_map(em);
2855 break;
2856 }
2857 if (!test_range_bit(tree, em->start,
2858 extent_map_end(em) - 1,
2859 EXTENT_LOCKED | EXTENT_WRITEBACK |
2860 EXTENT_ORDERED,
2861 0)) {
2862 remove_extent_mapping(map, em);
2863 /* once for the rb tree */
2864 free_extent_map(em);
2865 }
2866 start = extent_map_end(em);
2867 spin_unlock(&map->lock);
2868
2869 /* once for us */
2870 free_extent_map(em);
2871 }
2872 }
2873 return try_release_extent_state(map, tree, page, mask);
2874}
2875EXPORT_SYMBOL(try_release_extent_mapping);
2876
2877sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2878 get_extent_t *get_extent)
2879{
2880 struct inode *inode = mapping->host;
2881 u64 start = iblock << inode->i_blkbits;
2882 sector_t sector = 0;
2883 size_t blksize = (1 << inode->i_blkbits);
2884 struct extent_map *em;
2885
2886 lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2887 GFP_NOFS);
2888 em = get_extent(inode, NULL, 0, start, blksize, 0);
2889 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2890 GFP_NOFS);
2891 if (!em || IS_ERR(em))
2892 return 0;
2893
2894 if (em->block_start > EXTENT_MAP_LAST_BYTE)
2895 goto out;
2896
2897 sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2898out:
2899 free_extent_map(em);
2900 return sector;
2901}
2902
2903static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2904 unsigned long i)
2905{
2906 struct page *p;
2907 struct address_space *mapping;
2908
2909 if (i == 0)
2910 return eb->first_page;
2911 i += eb->start >> PAGE_CACHE_SHIFT;
2912 mapping = eb->first_page->mapping;
2913 if (!mapping)
2914 return NULL;
2915
2916 /*
2917 * extent_buffer_page is only called after pinning the page
2918 * by increasing the reference count. So we know the page must
2919 * be in the radix tree.
2920 */
2921 rcu_read_lock();
2922 p = radix_tree_lookup(&mapping->page_tree, i);
2923 rcu_read_unlock();
2924
2925 return p;
2926}
2927
2928static inline unsigned long num_extent_pages(u64 start, u64 len)
2929{
2930 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2931 (start >> PAGE_CACHE_SHIFT);
2932}
2933
2934static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2935 u64 start,
2936 unsigned long len,
2937 gfp_t mask)
2938{
2939 struct extent_buffer *eb = NULL;
2940#ifdef LEAK_DEBUG
2941 unsigned long flags;
2942#endif
2943
2944 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2945 eb->start = start;
2946 eb->len = len;
2947 mutex_init(&eb->mutex);
2948#ifdef LEAK_DEBUG
2949 spin_lock_irqsave(&leak_lock, flags);
2950 list_add(&eb->leak_list, &buffers);
2951 spin_unlock_irqrestore(&leak_lock, flags);
2952#endif
2953 atomic_set(&eb->refs, 1);
2954
2955 return eb;
2956}
2957
2958static void __free_extent_buffer(struct extent_buffer *eb)
2959{
2960#ifdef LEAK_DEBUG
2961 unsigned long flags;
2962 spin_lock_irqsave(&leak_lock, flags);
2963 list_del(&eb->leak_list);
2964 spin_unlock_irqrestore(&leak_lock, flags);
2965#endif
2966 kmem_cache_free(extent_buffer_cache, eb);
2967}
2968
2969struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2970 u64 start, unsigned long len,
2971 struct page *page0,
2972 gfp_t mask)
2973{
2974 unsigned long num_pages = num_extent_pages(start, len);
2975 unsigned long i;
2976 unsigned long index = start >> PAGE_CACHE_SHIFT;
2977 struct extent_buffer *eb;
2978 struct extent_buffer *exists = NULL;
2979 struct page *p;
2980 struct address_space *mapping = tree->mapping;
2981 int uptodate = 1;
2982
2983 spin_lock(&tree->buffer_lock);
2984 eb = buffer_search(tree, start);
2985 if (eb) {
2986 atomic_inc(&eb->refs);
2987 spin_unlock(&tree->buffer_lock);
2988 mark_page_accessed(eb->first_page);
2989 return eb;
2990 }
2991 spin_unlock(&tree->buffer_lock);
2992
2993 eb = __alloc_extent_buffer(tree, start, len, mask);
2994 if (!eb)
2995 return NULL;
2996
2997 if (page0) {
2998 eb->first_page = page0;
2999 i = 1;
3000 index++;
3001 page_cache_get(page0);
3002 mark_page_accessed(page0);
3003 set_page_extent_mapped(page0);
3004 set_page_extent_head(page0, len);
3005 uptodate = PageUptodate(page0);
3006 } else {
3007 i = 0;
3008 }
3009 for (; i < num_pages; i++, index++) {
3010 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
3011 if (!p) {
3012 WARN_ON(1);
3013 goto free_eb;
3014 }
3015 set_page_extent_mapped(p);
3016 mark_page_accessed(p);
3017 if (i == 0) {
3018 eb->first_page = p;
3019 set_page_extent_head(p, len);
3020 } else {
3021 set_page_private(p, EXTENT_PAGE_PRIVATE);
3022 }
3023 if (!PageUptodate(p))
3024 uptodate = 0;
3025 unlock_page(p);
3026 }
3027 if (uptodate)
3028 eb->flags |= EXTENT_UPTODATE;
3029 eb->flags |= EXTENT_BUFFER_FILLED;
3030
3031 spin_lock(&tree->buffer_lock);
3032 exists = buffer_tree_insert(tree, start, &eb->rb_node);
3033 if (exists) {
3034 /* add one reference for the caller */
3035 atomic_inc(&exists->refs);
3036 spin_unlock(&tree->buffer_lock);
3037 goto free_eb;
3038 }
3039 spin_unlock(&tree->buffer_lock);
3040
3041 /* add one reference for the tree */
3042 atomic_inc(&eb->refs);
3043 return eb;
3044
3045free_eb:
3046 if (!atomic_dec_and_test(&eb->refs))
3047 return exists;
3048 for (index = 1; index < i; index++)
3049 page_cache_release(extent_buffer_page(eb, index));
3050 page_cache_release(extent_buffer_page(eb, 0));
3051 __free_extent_buffer(eb);
3052 return exists;
3053}
3054EXPORT_SYMBOL(alloc_extent_buffer);
3055
3056struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3057 u64 start, unsigned long len,
3058 gfp_t mask)
3059{
3060 struct extent_buffer *eb;
3061
3062 spin_lock(&tree->buffer_lock);
3063 eb = buffer_search(tree, start);
3064 if (eb)
3065 atomic_inc(&eb->refs);
3066 spin_unlock(&tree->buffer_lock);
3067
3068 if (eb)
3069 mark_page_accessed(eb->first_page);
3070
3071 return eb;
3072}
3073EXPORT_SYMBOL(find_extent_buffer);
3074
3075void free_extent_buffer(struct extent_buffer *eb)
3076{
3077 if (!eb)
3078 return;
3079
3080 if (!atomic_dec_and_test(&eb->refs))
3081 return;
3082
3083 WARN_ON(1);
3084}
3085EXPORT_SYMBOL(free_extent_buffer);
3086
3087int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3088 struct extent_buffer *eb)
3089{
3090 int set;
3091 unsigned long i;
3092 unsigned long num_pages;
3093 struct page *page;
3094
3095 u64 start = eb->start;
3096 u64 end = start + eb->len - 1;
3097
3098 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
3099 num_pages = num_extent_pages(eb->start, eb->len);
3100
3101 for (i = 0; i < num_pages; i++) {
3102 page = extent_buffer_page(eb, i);
3103 if (!set && !PageDirty(page))
3104 continue;
3105
3106 lock_page(page);
3107 if (i == 0)
3108 set_page_extent_head(page, eb->len);
3109 else
3110 set_page_private(page, EXTENT_PAGE_PRIVATE);
3111
3112 /*
3113 * if we're on the last page or the first page and the
3114 * block isn't aligned on a page boundary, do extra checks
3115 * to make sure we don't clean page that is partially dirty
3116 */
3117 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3118 ((i == num_pages - 1) &&
3119 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3120 start = (u64)page->index << PAGE_CACHE_SHIFT;
3121 end = start + PAGE_CACHE_SIZE - 1;
3122 if (test_range_bit(tree, start, end,
3123 EXTENT_DIRTY, 0)) {
3124 unlock_page(page);
3125 continue;
3126 }
3127 }
3128 clear_page_dirty_for_io(page);
3129 spin_lock_irq(&page->mapping->tree_lock);
3130 if (!PageDirty(page)) {
3131 radix_tree_tag_clear(&page->mapping->page_tree,
3132 page_index(page),
3133 PAGECACHE_TAG_DIRTY);
3134 }
3135 spin_unlock_irq(&page->mapping->tree_lock);
3136 unlock_page(page);
3137 }
3138 return 0;
3139}
3140EXPORT_SYMBOL(clear_extent_buffer_dirty);
3141
3142int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3143 struct extent_buffer *eb)
3144{
3145 return wait_on_extent_writeback(tree, eb->start,
3146 eb->start + eb->len - 1);
3147}
3148EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
3149
3150int set_extent_buffer_dirty(struct extent_io_tree *tree,
3151 struct extent_buffer *eb)
3152{
3153 unsigned long i;
3154 unsigned long num_pages;
3155
3156 num_pages = num_extent_pages(eb->start, eb->len);
3157 for (i = 0; i < num_pages; i++) {
3158 struct page *page = extent_buffer_page(eb, i);
3159 /* writepage may need to do something special for the
3160 * first page, we have to make sure page->private is
3161 * properly set. releasepage may drop page->private
3162 * on us if the page isn't already dirty.
3163 */
3164 lock_page(page);
3165 if (i == 0) {
3166 set_page_extent_head(page, eb->len);
3167 } else if (PagePrivate(page) &&
3168 page->private != EXTENT_PAGE_PRIVATE) {
3169 set_page_extent_mapped(page);
3170 }
3171 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3172 set_extent_dirty(tree, page_offset(page),
3173 page_offset(page) + PAGE_CACHE_SIZE -1,
3174 GFP_NOFS);
3175 unlock_page(page);
3176 }
3177 return 0;
3178}
3179EXPORT_SYMBOL(set_extent_buffer_dirty);
3180
3181int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3182 struct extent_buffer *eb)
3183{
3184 unsigned long i;
3185 struct page *page;
3186 unsigned long num_pages;
3187
3188 num_pages = num_extent_pages(eb->start, eb->len);
3189 eb->flags &= ~EXTENT_UPTODATE;
3190
3191 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3192 GFP_NOFS);
3193 for (i = 0; i < num_pages; i++) {
3194 page = extent_buffer_page(eb, i);
3195 if (page)
3196 ClearPageUptodate(page);
3197 }
3198 return 0;
3199}
3200
3201int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3202 struct extent_buffer *eb)
3203{
3204 unsigned long i;
3205 struct page *page;
3206 unsigned long num_pages;
3207
3208 num_pages = num_extent_pages(eb->start, eb->len);
3209
3210 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3211 GFP_NOFS);
3212 for (i = 0; i < num_pages; i++) {
3213 page = extent_buffer_page(eb, i);
3214 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3215 ((i == num_pages - 1) &&
3216 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3217 check_page_uptodate(tree, page);
3218 continue;
3219 }
3220 SetPageUptodate(page);
3221 }
3222 return 0;
3223}
3224EXPORT_SYMBOL(set_extent_buffer_uptodate);
3225
3226int extent_range_uptodate(struct extent_io_tree *tree,
3227 u64 start, u64 end)
3228{
3229 struct page *page;
3230 int ret;
3231 int pg_uptodate = 1;
3232 int uptodate;
3233 unsigned long index;
3234
3235 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
3236 if (ret)
3237 return 1;
3238 while(start <= end) {
3239 index = start >> PAGE_CACHE_SHIFT;
3240 page = find_get_page(tree->mapping, index);
3241 uptodate = PageUptodate(page);
3242 page_cache_release(page);
3243 if (!uptodate) {
3244 pg_uptodate = 0;
3245 break;
3246 }
3247 start += PAGE_CACHE_SIZE;
3248 }
3249 return pg_uptodate;
3250}
3251
3252int extent_buffer_uptodate(struct extent_io_tree *tree,
3253 struct extent_buffer *eb)
3254{
3255 int ret = 0;
3256 unsigned long num_pages;
3257 unsigned long i;
3258 struct page *page;
3259 int pg_uptodate = 1;
3260
3261 if (eb->flags & EXTENT_UPTODATE)
3262 return 1;
3263
3264 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3265 EXTENT_UPTODATE, 1);
3266 if (ret)
3267 return ret;
3268
3269 num_pages = num_extent_pages(eb->start, eb->len);
3270 for (i = 0; i < num_pages; i++) {
3271 page = extent_buffer_page(eb, i);
3272 if (!PageUptodate(page)) {
3273 pg_uptodate = 0;
3274 break;
3275 }
3276 }
3277 return pg_uptodate;
3278}
3279EXPORT_SYMBOL(extent_buffer_uptodate);
3280
3281int read_extent_buffer_pages(struct extent_io_tree *tree,
3282 struct extent_buffer *eb,
3283 u64 start, int wait,
3284 get_extent_t *get_extent, int mirror_num)
3285{
3286 unsigned long i;
3287 unsigned long start_i;
3288 struct page *page;
3289 int err;
3290 int ret = 0;
3291 int locked_pages = 0;
3292 int all_uptodate = 1;
3293 int inc_all_pages = 0;
3294 unsigned long num_pages;
3295 struct bio *bio = NULL;
3296 unsigned long bio_flags = 0;
3297
3298 if (eb->flags & EXTENT_UPTODATE)
3299 return 0;
3300
3301 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3302 EXTENT_UPTODATE, 1)) {
3303 return 0;
3304 }
3305
3306 if (start) {
3307 WARN_ON(start < eb->start);
3308 start_i = (start >> PAGE_CACHE_SHIFT) -
3309 (eb->start >> PAGE_CACHE_SHIFT);
3310 } else {
3311 start_i = 0;
3312 }
3313
3314 num_pages = num_extent_pages(eb->start, eb->len);
3315 for (i = start_i; i < num_pages; i++) {
3316 page = extent_buffer_page(eb, i);
3317 if (!wait) {
3318 if (!trylock_page(page))
3319 goto unlock_exit;
3320 } else {
3321 lock_page(page);
3322 }
3323 locked_pages++;
3324 if (!PageUptodate(page)) {
3325 all_uptodate = 0;
3326 }
3327 }
3328 if (all_uptodate) {
3329 if (start_i == 0)
3330 eb->flags |= EXTENT_UPTODATE;
3331 if (ret) {
3332 printk("all up to date but ret is %d\n", ret);
3333 }
3334 goto unlock_exit;
3335 }
3336
3337 for (i = start_i; i < num_pages; i++) {
3338 page = extent_buffer_page(eb, i);
3339 if (inc_all_pages)
3340 page_cache_get(page);
3341 if (!PageUptodate(page)) {
3342 if (start_i == 0)
3343 inc_all_pages = 1;
3344 ClearPageError(page);
3345 err = __extent_read_full_page(tree, page,
3346 get_extent, &bio,
3347 mirror_num, &bio_flags);
3348 if (err) {
3349 ret = err;
3350 printk("err %d from __extent_read_full_page\n", ret);
3351 }
3352 } else {
3353 unlock_page(page);
3354 }
3355 }
3356
3357 if (bio)
3358 submit_one_bio(READ, bio, mirror_num, bio_flags);
3359
3360 if (ret || !wait) {
3361 if (ret)
3362 printk("ret %d wait %d returning\n", ret, wait);
3363 return ret;
3364 }
3365 for (i = start_i; i < num_pages; i++) {
3366 page = extent_buffer_page(eb, i);
3367 wait_on_page_locked(page);
3368 if (!PageUptodate(page)) {
3369 printk("page not uptodate after wait_on_page_locked\n");
3370 ret = -EIO;
3371 }
3372 }
3373 if (!ret)
3374 eb->flags |= EXTENT_UPTODATE;
3375 return ret;
3376
3377unlock_exit:
3378 i = start_i;
3379 while(locked_pages > 0) {
3380 page = extent_buffer_page(eb, i);
3381 i++;
3382 unlock_page(page);
3383 locked_pages--;
3384 }
3385 return ret;
3386}
3387EXPORT_SYMBOL(read_extent_buffer_pages);
3388
3389void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3390 unsigned long start,
3391 unsigned long len)
3392{
3393 size_t cur;
3394 size_t offset;
3395 struct page *page;
3396 char *kaddr;
3397 char *dst = (char *)dstv;
3398 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3399 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3400
3401 WARN_ON(start > eb->len);
3402 WARN_ON(start + len > eb->start + eb->len);
3403
3404 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3405
3406 while(len > 0) {
3407 page = extent_buffer_page(eb, i);
3408
3409 cur = min(len, (PAGE_CACHE_SIZE - offset));
3410 kaddr = kmap_atomic(page, KM_USER1);
3411 memcpy(dst, kaddr + offset, cur);
3412 kunmap_atomic(kaddr, KM_USER1);
3413
3414 dst += cur;
3415 len -= cur;
3416 offset = 0;
3417 i++;
3418 }
3419}
3420EXPORT_SYMBOL(read_extent_buffer);
3421
3422int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3423 unsigned long min_len, char **token, char **map,
3424 unsigned long *map_start,
3425 unsigned long *map_len, int km)
3426{
3427 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3428 char *kaddr;
3429 struct page *p;
3430 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3431 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3432 unsigned long end_i = (start_offset + start + min_len - 1) >>
3433 PAGE_CACHE_SHIFT;
3434
3435 if (i != end_i)
3436 return -EINVAL;
3437
3438 if (i == 0) {
3439 offset = start_offset;
3440 *map_start = 0;
3441 } else {
3442 offset = 0;
3443 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3444 }
3445 if (start + min_len > eb->len) {
3446printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
3447 WARN_ON(1);
3448 }
3449
3450 p = extent_buffer_page(eb, i);
3451 kaddr = kmap_atomic(p, km);
3452 *token = kaddr;
3453 *map = kaddr + offset;
3454 *map_len = PAGE_CACHE_SIZE - offset;
3455 return 0;
3456}
3457EXPORT_SYMBOL(map_private_extent_buffer);
3458
3459int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3460 unsigned long min_len,
3461 char **token, char **map,
3462 unsigned long *map_start,
3463 unsigned long *map_len, int km)
3464{
3465 int err;
3466 int save = 0;
3467 if (eb->map_token) {
3468 unmap_extent_buffer(eb, eb->map_token, km);
3469 eb->map_token = NULL;
3470 save = 1;
3471 }
3472 err = map_private_extent_buffer(eb, start, min_len, token, map,
3473 map_start, map_len, km);
3474 if (!err && save) {
3475 eb->map_token = *token;
3476 eb->kaddr = *map;
3477 eb->map_start = *map_start;
3478 eb->map_len = *map_len;
3479 }
3480 return err;
3481}
3482EXPORT_SYMBOL(map_extent_buffer);
3483
/*
 * Undo a mapping made by map_extent_buffer() or
 * map_private_extent_buffer().  @token is the value those functions
 * stored through their token argument and @km is the same kmap slot.
 * @eb itself is not used.
 */
void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
{
	kunmap_atomic(token, km);
}
EXPORT_SYMBOL(unmap_extent_buffer);
3489
3490int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3491 unsigned long start,
3492 unsigned long len)
3493{
3494 size_t cur;
3495 size_t offset;
3496 struct page *page;
3497 char *kaddr;
3498 char *ptr = (char *)ptrv;
3499 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3500 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3501 int ret = 0;
3502
3503 WARN_ON(start > eb->len);
3504 WARN_ON(start + len > eb->start + eb->len);
3505
3506 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3507
3508 while(len > 0) {
3509 page = extent_buffer_page(eb, i);
3510
3511 cur = min(len, (PAGE_CACHE_SIZE - offset));
3512
3513 kaddr = kmap_atomic(page, KM_USER0);
3514 ret = memcmp(ptr, kaddr + offset, cur);
3515 kunmap_atomic(kaddr, KM_USER0);
3516 if (ret)
3517 break;
3518
3519 ptr += cur;
3520 len -= cur;
3521 offset = 0;
3522 i++;
3523 }
3524 return ret;
3525}
3526EXPORT_SYMBOL(memcmp_extent_buffer);
3527
3528void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3529 unsigned long start, unsigned long len)
3530{
3531 size_t cur;
3532 size_t offset;
3533 struct page *page;
3534 char *kaddr;
3535 char *src = (char *)srcv;
3536 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3537 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3538
3539 WARN_ON(start > eb->len);
3540 WARN_ON(start + len > eb->start + eb->len);
3541
3542 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3543
3544 while(len > 0) {
3545 page = extent_buffer_page(eb, i);
3546 WARN_ON(!PageUptodate(page));
3547
3548 cur = min(len, PAGE_CACHE_SIZE - offset);
3549 kaddr = kmap_atomic(page, KM_USER1);
3550 memcpy(kaddr + offset, src, cur);
3551 kunmap_atomic(kaddr, KM_USER1);
3552
3553 src += cur;
3554 len -= cur;
3555 offset = 0;
3556 i++;
3557 }
3558}
3559EXPORT_SYMBOL(write_extent_buffer);
3560
3561void memset_extent_buffer(struct extent_buffer *eb, char c,
3562 unsigned long start, unsigned long len)
3563{
3564 size_t cur;
3565 size_t offset;
3566 struct page *page;
3567 char *kaddr;
3568 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3569 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3570
3571 WARN_ON(start > eb->len);
3572 WARN_ON(start + len > eb->start + eb->len);
3573
3574 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3575
3576 while(len > 0) {
3577 page = extent_buffer_page(eb, i);
3578 WARN_ON(!PageUptodate(page));
3579
3580 cur = min(len, PAGE_CACHE_SIZE - offset);
3581 kaddr = kmap_atomic(page, KM_USER0);
3582 memset(kaddr + offset, c, cur);
3583 kunmap_atomic(kaddr, KM_USER0);
3584
3585 len -= cur;
3586 offset = 0;
3587 i++;
3588 }
3589}
3590EXPORT_SYMBOL(memset_extent_buffer);
3591
3592void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3593 unsigned long dst_offset, unsigned long src_offset,
3594 unsigned long len)
3595{
3596 u64 dst_len = dst->len;
3597 size_t cur;
3598 size_t offset;
3599 struct page *page;
3600 char *kaddr;
3601 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3602 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3603
3604 WARN_ON(src->len != dst_len);
3605
3606 offset = (start_offset + dst_offset) &
3607 ((unsigned long)PAGE_CACHE_SIZE - 1);
3608
3609 while(len > 0) {
3610 page = extent_buffer_page(dst, i);
3611 WARN_ON(!PageUptodate(page));
3612
3613 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3614
3615 kaddr = kmap_atomic(page, KM_USER0);
3616 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3617 kunmap_atomic(kaddr, KM_USER0);
3618
3619 src_offset += cur;
3620 len -= cur;
3621 offset = 0;
3622 i++;
3623 }
3624}
3625EXPORT_SYMBOL(copy_extent_buffer);
3626
3627static void move_pages(struct page *dst_page, struct page *src_page,
3628 unsigned long dst_off, unsigned long src_off,
3629 unsigned long len)
3630{
3631 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3632 if (dst_page == src_page) {
3633 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3634 } else {
3635 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3636 char *p = dst_kaddr + dst_off + len;
3637 char *s = src_kaddr + src_off + len;
3638
3639 while (len--)
3640 *--p = *--s;
3641
3642 kunmap_atomic(src_kaddr, KM_USER1);
3643 }
3644 kunmap_atomic(dst_kaddr, KM_USER0);
3645}
3646
3647static void copy_pages(struct page *dst_page, struct page *src_page,
3648 unsigned long dst_off, unsigned long src_off,
3649 unsigned long len)
3650{
3651 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3652 char *src_kaddr;
3653
3654 if (dst_page != src_page)
3655 src_kaddr = kmap_atomic(src_page, KM_USER1);
3656 else
3657 src_kaddr = dst_kaddr;
3658
3659 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3660 kunmap_atomic(dst_kaddr, KM_USER0);
3661 if (dst_page != src_page)
3662 kunmap_atomic(src_kaddr, KM_USER1);
3663}
3664
3665void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3666 unsigned long src_offset, unsigned long len)
3667{
3668 size_t cur;
3669 size_t dst_off_in_page;
3670 size_t src_off_in_page;
3671 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3672 unsigned long dst_i;
3673 unsigned long src_i;
3674
3675 if (src_offset + len > dst->len) {
3676 printk("memmove bogus src_offset %lu move len %lu len %lu\n",
3677 src_offset, len, dst->len);
3678 BUG_ON(1);
3679 }
3680 if (dst_offset + len > dst->len) {
3681 printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
3682 dst_offset, len, dst->len);
3683 BUG_ON(1);
3684 }
3685
3686 while(len > 0) {
3687 dst_off_in_page = (start_offset + dst_offset) &
3688 ((unsigned long)PAGE_CACHE_SIZE - 1);
3689 src_off_in_page = (start_offset + src_offset) &
3690 ((unsigned long)PAGE_CACHE_SIZE - 1);
3691
3692 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3693 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3694
3695 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3696 src_off_in_page));
3697 cur = min_t(unsigned long, cur,
3698 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3699
3700 copy_pages(extent_buffer_page(dst, dst_i),
3701 extent_buffer_page(dst, src_i),
3702 dst_off_in_page, src_off_in_page, cur);
3703
3704 src_offset += cur;
3705 dst_offset += cur;
3706 len -= cur;
3707 }
3708}
3709EXPORT_SYMBOL(memcpy_extent_buffer);
3710
3711void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3712 unsigned long src_offset, unsigned long len)
3713{
3714 size_t cur;
3715 size_t dst_off_in_page;
3716 size_t src_off_in_page;
3717 unsigned long dst_end = dst_offset + len - 1;
3718 unsigned long src_end = src_offset + len - 1;
3719 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3720 unsigned long dst_i;
3721 unsigned long src_i;
3722
3723 if (src_offset + len > dst->len) {
3724 printk("memmove bogus src_offset %lu move len %lu len %lu\n",
3725 src_offset, len, dst->len);
3726 BUG_ON(1);
3727 }
3728 if (dst_offset + len > dst->len) {
3729 printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
3730 dst_offset, len, dst->len);
3731 BUG_ON(1);
3732 }
3733 if (dst_offset < src_offset) {
3734 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3735 return;
3736 }
3737 while(len > 0) {
3738 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3739 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3740
3741 dst_off_in_page = (start_offset + dst_end) &
3742 ((unsigned long)PAGE_CACHE_SIZE - 1);
3743 src_off_in_page = (start_offset + src_end) &
3744 ((unsigned long)PAGE_CACHE_SIZE - 1);
3745
3746 cur = min_t(unsigned long, len, src_off_in_page + 1);
3747 cur = min(cur, dst_off_in_page + 1);
3748 move_pages(extent_buffer_page(dst, dst_i),
3749 extent_buffer_page(dst, src_i),
3750 dst_off_in_page - cur + 1,
3751 src_off_in_page - cur + 1, cur);
3752
3753 dst_end -= cur;
3754 src_end -= cur;
3755 len -= cur;
3756 }
3757}
3758EXPORT_SYMBOL(memmove_extent_buffer);
3759
3760int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3761{
3762 u64 start = page_offset(page);
3763 struct extent_buffer *eb;
3764 int ret = 1;
3765 unsigned long i;
3766 unsigned long num_pages;
3767
3768 spin_lock(&tree->buffer_lock);
3769 eb = buffer_search(tree, start);
3770 if (!eb)
3771 goto out;
3772
3773 if (atomic_read(&eb->refs) > 1) {
3774 ret = 0;
3775 goto out;
3776 }
3777 /* at this point we can safely release the extent buffer */
3778 num_pages = num_extent_pages(eb->start, eb->len);
3779 for (i = 0; i < num_pages; i++)
3780 page_cache_release(extent_buffer_page(eb, i));
3781 rb_erase(&eb->rb_node, &tree->buffer);
3782 __free_extent_buffer(eb);
3783out:
3784 spin_unlock(&tree->buffer_lock);
3785 return ret;
3786}
3787EXPORT_SYMBOL(try_release_extent_buffer);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 000000000000..2d5f67065b69
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,268 @@
1#ifndef __EXTENTIO__
2#define __EXTENTIO__
3
4#include <linux/rbtree.h>
5
/*
 * Each value below is a separate bit so that several can be combined in
 * one mask.  EXTENT_UPTODATE and EXTENT_BUFFER_FILLED are additionally
 * OR'ed into the flags field of struct extent_buffer by the extent
 * buffer code in extent_io.c.
 */
6/* bits for the extent state */
7#define EXTENT_DIRTY 1
8#define EXTENT_WRITEBACK (1 << 1)
9#define EXTENT_UPTODATE (1 << 2)
10#define EXTENT_LOCKED (1 << 3)
11#define EXTENT_NEW (1 << 4)
12#define EXTENT_DELALLOC (1 << 5)
13#define EXTENT_DEFRAG (1 << 6)
14#define EXTENT_DEFRAG_DONE (1 << 7)
15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_ORDERED (1 << 9)
17#define EXTENT_ORDERED_METADATA (1 << 10)
18#define EXTENT_BOUNDARY (1 << 11)
/* convenience mask: the range is locked or under writeback */
19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
20
21/* flags for bio submission */
/* NOTE(review): presumably passed in the bio_flags arguments of the hooks
 * declared further down in this header -- confirm against the callers */
22#define EXTENT_BIO_COMPRESSED 1
23
24/*
25 * page->private values. Every page that is controlled by the extent
26 * map has page->private set to one.
27 */
/*
 * EXTENT_PAGE_PRIVATE is the value the extent buffer code stores in
 * page->private for the non-first pages of a buffer.
 * EXTENT_PAGE_PRIVATE_FIRST_PAGE (3) has the EXTENT_PAGE_PRIVATE bit set
 * plus one more bit.  NOTE(review): presumably it is what
 * set_page_extent_head() stores for a buffer's first page -- confirm.
 */
28#define EXTENT_PAGE_PRIVATE 1
29#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
30
/* forward declaration; the full definition follows later in this header */
31struct extent_state;
32
/*
 * Function type of the bio submission hook stored in
 * extent_io_ops.submit_bio_hook.  Note that this is a function type, not
 * a pointer type; users declare "extent_submit_bio_hook_t *".
 */
33typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
34 struct bio *bio, int mirror_num,
35 unsigned long bio_flags);
/*
 * Table of callbacks attached to an extent_io_tree through its ops
 * pointer.  Every member is a function pointer returning int.
 * NOTE(review): which hooks are optional and the meaning of their return
 * values are not visible from this header -- check the call sites in
 * extent_io.c before relying on either.
 */
36struct extent_io_ops {
37 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
38 u64 start, u64 end, int *page_started,
39 unsigned long *nr_written);
40 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
41 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
42 extent_submit_bio_hook_t *submit_bio_hook;
43 int (*merge_bio_hook)(struct page *page, unsigned long offset,
44 size_t size, struct bio *bio,
45 unsigned long bio_flags);
46 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
47 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
48 u64 start, u64 end,
49 struct extent_state *state);
50 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
51 u64 start, u64 end,
52 struct extent_state *state);
53 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
54 struct extent_state *state);
55 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
56 struct extent_state *state, int uptodate);
57 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
58 unsigned long old, unsigned long bits);
59 int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
60 unsigned long old, unsigned long bits);
61 int (*write_cache_pages_lock_hook)(struct page *page);
62};
63
/*
 * Per-mapping container for extent state and extent buffers.
 *
 * buffer:      rbtree of extent_buffers; the extent buffer code searches,
 *              inserts into and erases from it with buffer_lock held
 * mapping:     address_space the extent buffer pages are looked up in
 * ops:         callback table, see struct extent_io_ops
 * state, lock: NOTE(review): presumably the rbtree of struct extent_state
 *              records and the spinlock protecting it -- confirm
 * dirty_bytes: NOTE(review): presumably a running count of bytes marked
 *              EXTENT_DIRTY -- confirm
 */
64struct extent_io_tree {
65 struct rb_root state;
66 struct rb_root buffer;
67 struct address_space *mapping;
68 u64 dirty_bytes;
69 spinlock_t lock;
70 spinlock_t buffer_lock;
71 struct extent_io_ops *ops;
72};
73
/*
 * State record for one byte range [start, end], with end inclusive.
 *
 * rb_node links the record into an rbtree; extent_state_next() walks to
 * the following record through it.  refs is an atomic reference count.
 * NOTE(review): "state" presumably holds a mask of the EXTENT_* bits
 * defined at the top of this header, and leak_list presumably mirrors
 * the LEAK_DEBUG list used for extent buffers -- confirm both.
 */
74struct extent_state {
75 u64 start;
76 u64 end; /* inclusive */
77 struct rb_node rb_node;
78 struct extent_io_tree *tree;
79 wait_queue_head_t wq;
80 atomic_t refs;
81 unsigned long state;
82
83 /* for use by the FS */
84 u64 private;
85
86 struct list_head leak_list;
87};
88
/*
 * A block of @len bytes starting at byte offset @start, backed by pages
 * of the owning tree's mapping.
 *
 * start, len:  byte offset and length of the buffer
 * map_token, kaddr, map_start, map_len:
 *              cached atomic mapping, maintained by map_extent_buffer()
 * first_page:  page holding the first byte of the buffer
 * refs:        reference count; alloc_extent_buffer() returns a new
 *              buffer holding one reference for the caller and one for
 *              the tree
 * flags:       EXTENT_UPTODATE and EXTENT_BUFFER_FILLED bits
 * leak_list:   list linkage used only when LEAK_DEBUG is defined
 * rb_node:     linkage into extent_io_tree.buffer
 * mutex:       initialized at allocation; NOTE(review): its users are
 *              outside this file section -- confirm what it protects
 */
89struct extent_buffer {
90 u64 start;
91 unsigned long len;
92 char *map_token;
93 char *kaddr;
94 unsigned long map_start;
95 unsigned long map_len;
96 struct page *first_page;
97 atomic_t refs;
98 int flags;
99 struct list_head leak_list;
100 struct rb_node rb_node;
101 struct mutex mutex;
102};
103
104struct extent_map_tree;
105
106static inline struct extent_state *extent_state_next(struct extent_state *state)
107{
108 struct rb_node *node;
109 node = rb_next(&state->rb_node);
110 if (!node)
111 return NULL;
112 return rb_entry(node, struct extent_state, rb_node);
113}
114
/*
 * Function type of the extent lookup callback that the read and write
 * entry points below take.  This is a function type, not a pointer type;
 * most users declare "get_extent_t *".
 * NOTE(review): the exact meaning of page_offset, start, len and create
 * is defined by the implementations, which are outside this header.
 */
115typedef struct extent_map *(get_extent_t)(struct inode *inode,
116 struct page *page,
117 size_t page_offset,
118 u64 start, u64 len,
119 int create);
120
/*
 * Declarations only: tree setup, range locking, state-bit manipulation
 * and page read/write entry points.  Except for
 * try_release_extent_buffer() and the extent buffer functions at the end
 * of this group, their definitions are not part of this header section.
 */
121void extent_io_tree_init(struct extent_io_tree *tree,
122 struct address_space *mapping, gfp_t mask);
123int try_release_extent_mapping(struct extent_map_tree *map,
124 struct extent_io_tree *tree, struct page *page,
125 gfp_t mask);
/* returns 1 if no buffer starts at the page or it was released, 0 if the
 * buffer is still referenced */
126int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
127int try_release_extent_state(struct extent_map_tree *map,
128 struct extent_io_tree *tree, struct page *page,
129 gfp_t mask);
130int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
131int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
132int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
133 gfp_t mask);
134int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
135 get_extent_t *get_extent);
136int __init extent_io_init(void);
137void extent_io_exit(void);
138
139u64 count_range_bits(struct extent_io_tree *tree,
140 u64 *start, u64 search_end,
141 u64 max_bytes, unsigned long bits);
142
143int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
144 int bits, int filled);
145int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
146 int bits, gfp_t mask);
147int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
148 int bits, int wake, int delete, gfp_t mask);
149int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
150 int bits, gfp_t mask);
151int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
152 gfp_t mask);
153int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
154 gfp_t mask);
155int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
156 gfp_t mask);
157int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
158 gfp_t mask);
159int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
160 gfp_t mask);
161int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
162 u64 end, gfp_t mask);
163int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
164 gfp_t mask);
165int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
166 gfp_t mask);
167int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
168 u64 *start_ret, u64 *end_ret, int bits);
169struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
170 u64 start, int bits);
171int extent_invalidatepage(struct extent_io_tree *tree,
172 struct page *page, unsigned long offset);
173int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
174 get_extent_t *get_extent,
175 struct writeback_control *wbc);
176int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
177 u64 start, u64 end, get_extent_t *get_extent,
178 int mode);
179int extent_writepages(struct extent_io_tree *tree,
180 struct address_space *mapping,
181 get_extent_t *get_extent,
182 struct writeback_control *wbc);
/*
 * Unlike its siblings this prototype spells the callback parameter as
 * "get_extent_t get_extent"; a parameter of function type is adjusted to
 * a pointer to function, so callers pass it the same way.
 */
183int extent_readpages(struct extent_io_tree *tree,
184 struct address_space *mapping,
185 struct list_head *pages, unsigned nr_pages,
186 get_extent_t get_extent);
187int extent_prepare_write(struct extent_io_tree *tree,
188 struct inode *inode, struct page *page,
189 unsigned from, unsigned to, get_extent_t *get_extent);
190int extent_commit_write(struct extent_io_tree *tree,
191 struct inode *inode, struct page *page,
192 unsigned from, unsigned to);
193sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
194 get_extent_t *get_extent);
195int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
196int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
197int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
198void set_page_extent_mapped(struct page *page);
199
/*
 * Extent buffer lifetime.  alloc_extent_buffer() and find_extent_buffer()
 * return a buffer with a reference held for the caller;
 * free_extent_buffer() drops one reference and accepts NULL.
 */
200struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
201 u64 start, unsigned long len,
202 struct page *page0,
203 gfp_t mask);
204struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
205 u64 start, unsigned long len,
206 gfp_t mask);
207void free_extent_buffer(struct extent_buffer *eb);
208int read_extent_buffer_pages(struct extent_io_tree *tree,
209 struct extent_buffer *eb, u64 start, int wait,
210 get_extent_t *get_extent, int mirror_num);
211
212static inline void extent_buffer_get(struct extent_buffer *eb)
213{
214 atomic_inc(&eb->refs);
215}
216
/*
 * Byte-level access to the contents of an extent buffer.  In all of the
 * functions below the start/offset arguments are relative to the
 * beginning of the buffer, not absolute disk offsets.
 */
217int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
218 unsigned long start,
219 unsigned long len);
220void read_extent_buffer(struct extent_buffer *eb, void *dst,
221 unsigned long start,
222 unsigned long len);
223void write_extent_buffer(struct extent_buffer *eb, const void *src,
224 unsigned long start, unsigned long len);
225void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
226 unsigned long dst_offset, unsigned long src_offset,
227 unsigned long len);
/* memcpy copies front to back; memmove also handles overlapping ranges */
228void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
229 unsigned long src_offset, unsigned long len);
230void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
231 unsigned long src_offset, unsigned long len);
232void memset_extent_buffer(struct extent_buffer *eb, char c,
233 unsigned long start, unsigned long len);
/* state helpers operating on the whole range of an extent buffer */
234int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
235 struct extent_buffer *eb);
236int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
237int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
238int clear_extent_buffer_dirty(struct extent_io_tree *tree,
239 struct extent_buffer *eb);
240int set_extent_buffer_dirty(struct extent_io_tree *tree,
241 struct extent_buffer *eb);
242int set_extent_buffer_uptodate(struct extent_io_tree *tree,
243 struct extent_buffer *eb);
244int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
245 struct extent_buffer *eb);
246int extent_buffer_uptodate(struct extent_io_tree *tree,
247 struct extent_buffer *eb);
/*
 * Atomic mapping of a single buffer page.  Both map functions return
 * -EINVAL when the requested bytes span more than one page; the token
 * they hand back is what unmap_extent_buffer() expects.
 */
248int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
249 unsigned long min_len, char **token, char **map,
250 unsigned long *map_start,
251 unsigned long *map_len, int km);
252int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
253 unsigned long min_len, char **token, char **map,
254 unsigned long *map_start,
255 unsigned long *map_len, int km);
256void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
/* NOTE(review): no definition of this function is visible in the part of
 * extent_io.c shown with this header -- confirm it still exists */
257int release_extent_buffer_tail_pages(struct extent_buffer *eb);
258int extent_range_uptodate(struct extent_io_tree *tree,
259 u64 start, u64 end);
260int extent_clear_unlock_delalloc(struct inode *inode,
261 struct extent_io_tree *tree,
262 u64 start, u64 end, struct page *locked_page,
263 int unlock_page,
264 int clear_unlock,
265 int clear_delalloc, int clear_dirty,
266 int set_writeback,
267 int end_writeback);
268#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 000000000000..fd3ebfb8c3c5
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,351 @@
1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h>
4#include <linux/module.h>
5#include <linux/spinlock.h>
6#include <linux/version.h>
7#include <linux/hardirq.h>
8#include "extent_map.h"
9
10/* temporary define until extent_map moves out of btrfs */
11struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
12 unsigned long extra_flags,
13 void (*ctor)(void *, struct kmem_cache *,
14 unsigned long));
15
16static struct kmem_cache *extent_map_cache;
17
18int __init extent_map_init(void)
19{
20 extent_map_cache = btrfs_cache_create("extent_map",
21 sizeof(struct extent_map), 0,
22 NULL);
23 if (!extent_map_cache)
24 return -ENOMEM;
25 return 0;
26}
27
28void extent_map_exit(void)
29{
30 if (extent_map_cache)
31 kmem_cache_destroy(extent_map_cache);
32}
33
34/**
35 * extent_map_tree_init - initialize extent map tree
36 * @tree: tree to initialize
37 * @mask: flags for memory allocations during tree operations
38 *
39 * Initialize the extent tree @tree. Should be called for each new inode
40 * or other user of the extent_map interface.
41 */
42void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
43{
44 tree->map.rb_node = NULL;
45 spin_lock_init(&tree->lock);
46}
47EXPORT_SYMBOL(extent_map_tree_init);
48
49/**
50 * alloc_extent_map - allocate new extent map structure
51 * @mask: memory allocation flags
52 *
53 * Allocate a new extent_map structure. The new structure is
54 * returned with a reference count of one and needs to be
55 * freed using free_extent_map()
56 */
57struct extent_map *alloc_extent_map(gfp_t mask)
58{
59 struct extent_map *em;
60 em = kmem_cache_alloc(extent_map_cache, mask);
61 if (!em || IS_ERR(em))
62 return em;
63 em->in_tree = 0;
64 em->flags = 0;
65 atomic_set(&em->refs, 1);
66 return em;
67}
68EXPORT_SYMBOL(alloc_extent_map);
69
70/**
71 * free_extent_map - drop reference count of an extent_map
72 * @em: extent map beeing releasead
73 *
74 * Drops the reference out on @em by one and free the structure
75 * if the reference count hits zero.
76 */
77void free_extent_map(struct extent_map *em)
78{
79 if (!em)
80 return;
81 WARN_ON(atomic_read(&em->refs) == 0);
82 if (atomic_dec_and_test(&em->refs)) {
83 WARN_ON(em->in_tree);
84 kmem_cache_free(extent_map_cache, em);
85 }
86}
87EXPORT_SYMBOL(free_extent_map);
88
89static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
90 struct rb_node *node)
91{
92 struct rb_node ** p = &root->rb_node;
93 struct rb_node * parent = NULL;
94 struct extent_map *entry;
95
96 while(*p) {
97 parent = *p;
98 entry = rb_entry(parent, struct extent_map, rb_node);
99
100 WARN_ON(!entry->in_tree);
101
102 if (offset < entry->start)
103 p = &(*p)->rb_left;
104 else if (offset >= extent_map_end(entry))
105 p = &(*p)->rb_right;
106 else
107 return parent;
108 }
109
110 entry = rb_entry(node, struct extent_map, rb_node);
111 entry->in_tree = 1;
112 rb_link_node(node, parent, p);
113 rb_insert_color(node, root);
114 return NULL;
115}
116
117/*
118 * search through the tree for an extent_map with a given offset. If
119 * it can't be found, try to find some neighboring extents
120 */
121static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
122 struct rb_node **prev_ret,
123 struct rb_node **next_ret)
124{
125 struct rb_node * n = root->rb_node;
126 struct rb_node *prev = NULL;
127 struct rb_node *orig_prev = NULL;
128 struct extent_map *entry;
129 struct extent_map *prev_entry = NULL;
130
131 while(n) {
132 entry = rb_entry(n, struct extent_map, rb_node);
133 prev = n;
134 prev_entry = entry;
135
136 WARN_ON(!entry->in_tree);
137
138 if (offset < entry->start)
139 n = n->rb_left;
140 else if (offset >= extent_map_end(entry))
141 n = n->rb_right;
142 else
143 return n;
144 }
145
146 if (prev_ret) {
147 orig_prev = prev;
148 while(prev && offset >= extent_map_end(prev_entry)) {
149 prev = rb_next(prev);
150 prev_entry = rb_entry(prev, struct extent_map, rb_node);
151 }
152 *prev_ret = prev;
153 prev = orig_prev;
154 }
155
156 if (next_ret) {
157 prev_entry = rb_entry(prev, struct extent_map, rb_node);
158 while(prev && offset < prev_entry->start) {
159 prev = rb_prev(prev);
160 prev_entry = rb_entry(prev, struct extent_map, rb_node);
161 }
162 *next_ret = prev;
163 }
164 return NULL;
165}
166
167/*
168 * look for an offset in the tree, and if it can't be found, return
169 * the first offset we can find smaller than 'offset'.
170 */
171static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
172{
173 struct rb_node *prev;
174 struct rb_node *ret;
175 ret = __tree_search(root, offset, &prev, NULL);
176 if (!ret)
177 return prev;
178 return ret;
179}
180
181/* check to see if two extent_map structs are adjacent and safe to merge */
182static int mergable_maps(struct extent_map *prev, struct extent_map *next)
183{
184 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
185 return 0;
186
187 /*
188 * don't merge compressed extents, we need to know their
189 * actual size
190 */
191 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
192 return 0;
193
194 if (extent_map_end(prev) == next->start &&
195 prev->flags == next->flags &&
196 prev->bdev == next->bdev &&
197 ((next->block_start == EXTENT_MAP_HOLE &&
198 prev->block_start == EXTENT_MAP_HOLE) ||
199 (next->block_start == EXTENT_MAP_INLINE &&
200 prev->block_start == EXTENT_MAP_INLINE) ||
201 (next->block_start == EXTENT_MAP_DELALLOC &&
202 prev->block_start == EXTENT_MAP_DELALLOC) ||
203 (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
204 next->block_start == extent_map_block_end(prev)))) {
205 return 1;
206 }
207 return 0;
208}
209
210/**
211 * add_extent_mapping - add new extent map to the extent tree
212 * @tree: tree to insert new map in
213 * @em: map to insert
214 *
215 * Insert @em into @tree or perform a simple forward/backward merge with
216 * existing mappings. The extent_map struct passed in will be inserted
217 * into the tree directly, with an additional reference taken, or a
218 * reference dropped if the merge attempt was sucessfull.
219 */
220int add_extent_mapping(struct extent_map_tree *tree,
221 struct extent_map *em)
222{
223 int ret = 0;
224 struct extent_map *merge = NULL;
225 struct rb_node *rb;
226 struct extent_map *exist;
227
228 exist = lookup_extent_mapping(tree, em->start, em->len);
229 if (exist) {
230 free_extent_map(exist);
231 ret = -EEXIST;
232 goto out;
233 }
234 assert_spin_locked(&tree->lock);
235 rb = tree_insert(&tree->map, em->start, &em->rb_node);
236 if (rb) {
237 ret = -EEXIST;
238 free_extent_map(merge);
239 goto out;
240 }
241 atomic_inc(&em->refs);
242 if (em->start != 0) {
243 rb = rb_prev(&em->rb_node);
244 if (rb)
245 merge = rb_entry(rb, struct extent_map, rb_node);
246 if (rb && mergable_maps(merge, em)) {
247 em->start = merge->start;
248 em->len += merge->len;
249 em->block_len += merge->block_len;
250 em->block_start = merge->block_start;
251 merge->in_tree = 0;
252 rb_erase(&merge->rb_node, &tree->map);
253 free_extent_map(merge);
254 }
255 }
256 rb = rb_next(&em->rb_node);
257 if (rb)
258 merge = rb_entry(rb, struct extent_map, rb_node);
259 if (rb && mergable_maps(em, merge)) {
260 em->len += merge->len;
261 em->block_len += merge->len;
262 rb_erase(&merge->rb_node, &tree->map);
263 merge->in_tree = 0;
264 free_extent_map(merge);
265 }
266out:
267 return ret;
268}
269EXPORT_SYMBOL(add_extent_mapping);
270
271/* simple helper to do math around the end of an extent, handling wrap */
272static u64 range_end(u64 start, u64 len)
273{
274 if (start + len < start)
275 return (u64)-1;
276 return start + len;
277}
278
279/**
280 * lookup_extent_mapping - lookup extent_map
281 * @tree: tree to lookup in
282 * @start: byte offset to start the search
283 * @len: length of the lookup range
284 *
285 * Find and return the first extent_map struct in @tree that intersects the
286 * [start, len] range. There may be additional objects in the tree that
287 * intersect, so check the object returned carefully to make sure that no
288 * additional lookups are needed.
289 */
290struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
291 u64 start, u64 len)
292{
293 struct extent_map *em;
294 struct rb_node *rb_node;
295 struct rb_node *prev = NULL;
296 struct rb_node *next = NULL;
297 u64 end = range_end(start, len);
298
299 assert_spin_locked(&tree->lock);
300 rb_node = __tree_search(&tree->map, start, &prev, &next);
301 if (!rb_node && prev) {
302 em = rb_entry(prev, struct extent_map, rb_node);
303 if (end > em->start && start < extent_map_end(em))
304 goto found;
305 }
306 if (!rb_node && next) {
307 em = rb_entry(next, struct extent_map, rb_node);
308 if (end > em->start && start < extent_map_end(em))
309 goto found;
310 }
311 if (!rb_node) {
312 em = NULL;
313 goto out;
314 }
315 if (IS_ERR(rb_node)) {
316 em = ERR_PTR(PTR_ERR(rb_node));
317 goto out;
318 }
319 em = rb_entry(rb_node, struct extent_map, rb_node);
320 if (end > em->start && start < extent_map_end(em))
321 goto found;
322
323 em = NULL;
324 goto out;
325
326found:
327 atomic_inc(&em->refs);
328out:
329 return em;
330}
331EXPORT_SYMBOL(lookup_extent_mapping);
332
333/**
334 * remove_extent_mapping - removes an extent_map from the extent tree
335 * @tree: extent tree to remove from
336 * @em: extent map beeing removed
337 *
338 * Removes @em from @tree. No reference counts are dropped, and no checks
339 * are done to see if the range is in use
340 */
341int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
342{
343 int ret = 0;
344
345 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
346 assert_spin_locked(&tree->lock);
347 rb_erase(&em->rb_node, &tree->map);
348 em->in_tree = 0;
349 return ret;
350}
351EXPORT_SYMBOL(remove_extent_mapping);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 000000000000..fb6eeef06bb0
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,62 @@
1#ifndef __EXTENTMAP__
2#define __EXTENTMAP__
3
4#include <linux/rbtree.h>
5
/*
 * Special block_start values.  Each expansion is fully parenthesized so
 * the constants can be used safely inside larger expressions.
 */
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)
#define EXTENT_MAP_DELALLOC ((u64)-1)
10
11/* bits for the flags field */
12#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
13#define EXTENT_FLAG_COMPRESSED 1
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16
/* One mapping from a range of file bytes to a range of blocks. */
17struct extent_map {
18 struct rb_node rb_node; /* links this map into extent_map_tree.map */
19
20 /* all of these are in bytes */
21 u64 start; /* first byte covered; the tree is ordered by this */
22 u64 len; /* number of bytes covered, see extent_map_end() */
23 u64 orig_start;
24 u64 block_start; /* may hold one of the EXTENT_MAP_* special values */
25 u64 block_len; /* see extent_map_block_end() */
26 unsigned long flags; /* bit numbers are the EXTENT_FLAG_* defines */
27 struct block_device *bdev;
28 atomic_t refs; /* reference count, dropped by free_extent_map() */
29 int in_tree; /* non-zero while linked into a tree */
30};
31
/* A set of extent_map structs, set up by extent_map_tree_init(). */
32struct extent_map_tree {
33 struct rb_root map; /* rb-tree of struct extent_map */
34 spinlock_t lock; /* add/lookup/remove assert that this is held */
35};
36
37static inline u64 extent_map_end(struct extent_map *em)
38{
39 if (em->start + em->len < em->start)
40 return (u64)-1;
41 return em->start + em->len;
42}
43
44static inline u64 extent_map_block_end(struct extent_map *em)
45{
46 if (em->block_start + em->block_len < em->block_start)
47 return (u64)-1;
48 return em->block_start + em->block_len;
49}
50
51void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
52struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
53 u64 start, u64 len);
54int add_extent_mapping(struct extent_map_tree *tree,
55 struct extent_map *em);
56int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
57
58struct extent_map *alloc_extent_map(gfp_t mask);
59void free_extent_map(struct extent_map *em);
60int __init extent_map_init(void);
61void extent_map_exit(void);
62#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
new file mode 100644
index 000000000000..f76378831407
--- /dev/null
+++ b/fs/btrfs/file-item.c
@@ -0,0 +1,586 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/bio.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include "ctree.h"
23#include "disk-io.h"
24#include "transaction.h"
25#include "print-tree.h"
26
/*
 * Upper bound on the checksums stored in one csum item: the leaf data
 * size of @r less two item headers, divided by the size of a single
 * checksum, minus one.
 */
#define MAX_CSUM_ITEMS(r) \
	((BTRFS_LEAF_DATA_SIZE(r) - sizeof(struct btrfs_item) * 2) / \
	 BTRFS_CRC32_SIZE - 1)
30int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root,
32 u64 objectid, u64 pos,
33 u64 disk_offset, u64 disk_num_bytes,
34 u64 num_bytes, u64 offset, u64 ram_bytes,
35 u8 compression, u8 encryption, u16 other_encoding)
36{
37 int ret = 0;
38 struct btrfs_file_extent_item *item;
39 struct btrfs_key file_key;
40 struct btrfs_path *path;
41 struct extent_buffer *leaf;
42
43 path = btrfs_alloc_path();
44 BUG_ON(!path);
45 file_key.objectid = objectid;
46 file_key.offset = pos;
47 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
48
49 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
50 sizeof(*item));
51 if (ret < 0)
52 goto out;
53 BUG_ON(ret);
54 leaf = path->nodes[0];
55 item = btrfs_item_ptr(leaf, path->slots[0],
56 struct btrfs_file_extent_item);
57 btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
58 btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
59 btrfs_set_file_extent_offset(leaf, item, offset);
60 btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
61 btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
62 btrfs_set_file_extent_generation(leaf, item, trans->transid);
63 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
64 btrfs_set_file_extent_compression(leaf, item, compression);
65 btrfs_set_file_extent_encryption(leaf, item, encryption);
66 btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
67
68 btrfs_mark_buffer_dirty(leaf);
69out:
70 btrfs_free_path(path);
71 return ret;
72}
73
74struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
75 struct btrfs_root *root,
76 struct btrfs_path *path,
77 u64 objectid, u64 offset,
78 int cow)
79{
80 int ret;
81 struct btrfs_key file_key;
82 struct btrfs_key found_key;
83 struct btrfs_csum_item *item;
84 struct extent_buffer *leaf;
85 u64 csum_offset = 0;
86 int csums_in_item;
87
88 file_key.objectid = objectid;
89 file_key.offset = offset;
90 btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
91 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
92 if (ret < 0)
93 goto fail;
94 leaf = path->nodes[0];
95 if (ret > 0) {
96 ret = 1;
97 if (path->slots[0] == 0)
98 goto fail;
99 path->slots[0]--;
100 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
101 if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
102 found_key.objectid != objectid) {
103 goto fail;
104 }
105 csum_offset = (offset - found_key.offset) >>
106 root->fs_info->sb->s_blocksize_bits;
107 csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
108 csums_in_item /= BTRFS_CRC32_SIZE;
109
110 if (csum_offset >= csums_in_item) {
111 ret = -EFBIG;
112 goto fail;
113 }
114 }
115 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
116 item = (struct btrfs_csum_item *)((unsigned char *)item +
117 csum_offset * BTRFS_CRC32_SIZE);
118 return item;
119fail:
120 if (ret > 0)
121 ret = -ENOENT;
122 return ERR_PTR(ret);
123}
124
125
126int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
127 struct btrfs_root *root,
128 struct btrfs_path *path, u64 objectid,
129 u64 offset, int mod)
130{
131 int ret;
132 struct btrfs_key file_key;
133 int ins_len = mod < 0 ? -1 : 0;
134 int cow = mod != 0;
135
136 file_key.objectid = objectid;
137 file_key.offset = offset;
138 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
139 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
140 return ret;
141}
142
143int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
144 struct bio *bio)
145{
146 u32 sum;
147 struct bio_vec *bvec = bio->bi_io_vec;
148 int bio_index = 0;
149 u64 offset;
150 u64 item_start_offset = 0;
151 u64 item_last_offset = 0;
152 u32 diff;
153 int ret;
154 struct btrfs_path *path;
155 struct btrfs_csum_item *item = NULL;
156 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
157
158 path = btrfs_alloc_path();
159 if (bio->bi_size > PAGE_CACHE_SIZE * 8)
160 path->reada = 2;
161
162 WARN_ON(bio->bi_vcnt <= 0);
163
164 while(bio_index < bio->bi_vcnt) {
165 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
166 ret = btrfs_find_ordered_sum(inode, offset, &sum);
167 if (ret == 0)
168 goto found;
169
170 if (!item || offset < item_start_offset ||
171 offset >= item_last_offset) {
172 struct btrfs_key found_key;
173 u32 item_size;
174
175 if (item)
176 btrfs_release_path(root, path);
177 item = btrfs_lookup_csum(NULL, root, path,
178 inode->i_ino, offset, 0);
179 if (IS_ERR(item)) {
180 ret = PTR_ERR(item);
181 if (ret == -ENOENT || ret == -EFBIG)
182 ret = 0;
183 sum = 0;
184 printk("no csum found for inode %lu start "
185 "%llu\n", inode->i_ino,
186 (unsigned long long)offset);
187 item = NULL;
188 btrfs_release_path(root, path);
189 goto found;
190 }
191 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
192 path->slots[0]);
193
194 item_start_offset = found_key.offset;
195 item_size = btrfs_item_size_nr(path->nodes[0],
196 path->slots[0]);
197 item_last_offset = item_start_offset +
198 (item_size / BTRFS_CRC32_SIZE) *
199 root->sectorsize;
200 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
201 struct btrfs_csum_item);
202 }
203 /*
204 * this byte range must be able to fit inside
205 * a single leaf so it will also fit inside a u32
206 */
207 diff = offset - item_start_offset;
208 diff = diff / root->sectorsize;
209 diff = diff * BTRFS_CRC32_SIZE;
210
211 read_extent_buffer(path->nodes[0], &sum,
212 ((unsigned long)item) + diff,
213 BTRFS_CRC32_SIZE);
214found:
215 set_state_private(io_tree, offset, sum);
216 bio_index++;
217 bvec++;
218 }
219 btrfs_free_path(path);
220 return 0;
221}
222
223int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
224 u64 start, unsigned long len)
225{
226 struct btrfs_ordered_sum *sums;
227 struct btrfs_sector_sum *sector_sum;
228 struct btrfs_ordered_extent *ordered;
229 char *data;
230 struct page *page;
231 unsigned long total_bytes = 0;
232 unsigned long this_sum_bytes = 0;
233
234 sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
235 if (!sums)
236 return -ENOMEM;
237
238 sector_sum = sums->sums;
239 sums->file_offset = start;
240 sums->len = len;
241 INIT_LIST_HEAD(&sums->list);
242 ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
243 BUG_ON(!ordered);
244
245 while(len > 0) {
246 if (start >= ordered->file_offset + ordered->len ||
247 start < ordered->file_offset) {
248 sums->len = this_sum_bytes;
249 this_sum_bytes = 0;
250 btrfs_add_ordered_sum(inode, ordered, sums);
251 btrfs_put_ordered_extent(ordered);
252
253 sums = kzalloc(btrfs_ordered_sum_size(root, len),
254 GFP_NOFS);
255 BUG_ON(!sums);
256 sector_sum = sums->sums;
257 sums->len = len;
258 sums->file_offset = start;
259 ordered = btrfs_lookup_ordered_extent(inode,
260 sums->file_offset);
261 BUG_ON(!ordered);
262 }
263
264 page = find_get_page(inode->i_mapping,
265 start >> PAGE_CACHE_SHIFT);
266
267 data = kmap_atomic(page, KM_USER0);
268 sector_sum->sum = ~(u32)0;
269 sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum,
270 PAGE_CACHE_SIZE);
271 kunmap_atomic(data, KM_USER0);
272 btrfs_csum_final(sector_sum->sum,
273 (char *)&sector_sum->sum);
274 sector_sum->offset = page_offset(page);
275 page_cache_release(page);
276
277 sector_sum++;
278 total_bytes += PAGE_CACHE_SIZE;
279 this_sum_bytes += PAGE_CACHE_SIZE;
280 start += PAGE_CACHE_SIZE;
281
282 WARN_ON(len < PAGE_CACHE_SIZE);
283 len -= PAGE_CACHE_SIZE;
284 }
285 btrfs_add_ordered_sum(inode, ordered, sums);
286 btrfs_put_ordered_extent(ordered);
287 return 0;
288}
289
290int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
291 struct bio *bio)
292{
293 struct btrfs_ordered_sum *sums;
294 struct btrfs_sector_sum *sector_sum;
295 struct btrfs_ordered_extent *ordered;
296 char *data;
297 struct bio_vec *bvec = bio->bi_io_vec;
298 int bio_index = 0;
299 unsigned long total_bytes = 0;
300 unsigned long this_sum_bytes = 0;
301 u64 offset;
302
303 WARN_ON(bio->bi_vcnt <= 0);
304 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
305 if (!sums)
306 return -ENOMEM;
307
308 sector_sum = sums->sums;
309 sums->file_offset = page_offset(bvec->bv_page) + bvec->bv_offset;
310 sums->len = bio->bi_size;
311 INIT_LIST_HEAD(&sums->list);
312 ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
313 BUG_ON(!ordered);
314
315 while(bio_index < bio->bi_vcnt) {
316 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
317 if (offset >= ordered->file_offset + ordered->len ||
318 offset < ordered->file_offset) {
319 unsigned long bytes_left;
320 sums->len = this_sum_bytes;
321 this_sum_bytes = 0;
322 btrfs_add_ordered_sum(inode, ordered, sums);
323 btrfs_put_ordered_extent(ordered);
324
325 bytes_left = bio->bi_size - total_bytes;
326
327 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
328 GFP_NOFS);
329 BUG_ON(!sums);
330 sector_sum = sums->sums;
331 sums->len = bytes_left;
332 sums->file_offset = offset;
333 ordered = btrfs_lookup_ordered_extent(inode,
334 sums->file_offset);
335 BUG_ON(!ordered);
336 }
337
338 data = kmap_atomic(bvec->bv_page, KM_USER0);
339 sector_sum->sum = ~(u32)0;
340 sector_sum->sum = btrfs_csum_data(root,
341 data + bvec->bv_offset,
342 sector_sum->sum,
343 bvec->bv_len);
344 kunmap_atomic(data, KM_USER0);
345 btrfs_csum_final(sector_sum->sum,
346 (char *)&sector_sum->sum);
347 sector_sum->offset = page_offset(bvec->bv_page) +
348 bvec->bv_offset;
349
350 sector_sum++;
351 bio_index++;
352 total_bytes += bvec->bv_len;
353 this_sum_bytes += bvec->bv_len;
354 bvec++;
355 }
356 this_sum_bytes = 0;
357 btrfs_add_ordered_sum(inode, ordered, sums);
358 btrfs_put_ordered_extent(ordered);
359 return 0;
360}
361
/*
 * Store the checksums carried by @sums in the csum items of @inode.
 *
 * Each pass starting at 'again' finds, grows or inserts a csum item for
 * the current sector sum while holding the inode's csum_mutex, then
 * copies checksums into that item for as long as the sector sums stay
 * contiguous and the item has room.  Passes repeat until sums->len
 * bytes have been accounted for.
 *
 * Returns the value left in ret by the last search/insert step; the
 * fail_unlock path drops csum_mutex before returning it.
 */
362int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
363 struct btrfs_root *root, struct inode *inode,
364 struct btrfs_ordered_sum *sums)
365{
366 u64 objectid = inode->i_ino;
367 u64 offset;
368 int ret;
369 struct btrfs_key file_key;
370 struct btrfs_key found_key;
371 u64 next_offset;
372 u64 total_bytes = 0;
373 int found_next;
374 struct btrfs_path *path;
375 struct btrfs_csum_item *item;
376 struct btrfs_csum_item *item_end;
377 struct extent_buffer *leaf = NULL;
378 u64 csum_offset;
379 struct btrfs_sector_sum *sector_sum;
380 u32 nritems;
381 u32 ins_size;
382 char *eb_map;
383 char *eb_token;
384 unsigned long map_len;
385 unsigned long map_start;
386
387 path = btrfs_alloc_path();
388 BUG_ON(!path);
389 sector_sum = sums->sums;
 /* one pass per csum item; re-entered until all of sums is stored */
390again:
391 next_offset = (u64)-1;
392 found_next = 0;
393 offset = sector_sum->offset;
394 file_key.objectid = objectid;
395 file_key.offset = offset;
396 btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
397
398 mutex_lock(&BTRFS_I(inode)->csum_mutex);
399 item = btrfs_lookup_csum(trans, root, path, objectid, offset, 1);
400 if (!IS_ERR(item)) {
401 leaf = path->nodes[0];
402 ret = 0;
403 goto found;
404 }
405 ret = PTR_ERR(item);
406 if (ret == -EFBIG) {
407 u32 item_size;
408 /* we found one, but it isn't big enough yet */
409 leaf = path->nodes[0];
410 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
411 if ((item_size / BTRFS_CRC32_SIZE) >= MAX_CSUM_ITEMS(root)) {
412 /* already at max size, make a new one */
413 goto insert;
414 }
415 } else {
416 int slot = path->slots[0] + 1;
417 /* we didn't find a csum item, insert one */
418 nritems = btrfs_header_nritems(path->nodes[0]);
419 if (path->slots[0] >= nritems - 1) {
420 ret = btrfs_next_leaf(root, path);
421 if (ret == 1)
422 found_next = 1;
423 if (ret != 0)
424 goto insert;
425 slot = 0;
426 }
 /* next_offset is only set when the following key is a csum item of this inode */
427 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
428 if (found_key.objectid != objectid ||
429 found_key.type != BTRFS_CSUM_ITEM_KEY) {
430 found_next = 1;
431 goto insert;
432 }
433 next_offset = found_key.offset;
434 found_next = 1;
435 goto insert;
436 }
437
438 /*
439 * at this point, we know the tree has an item, but it isn't big
440 * enough yet to put our csum in. Grow it
441 */
442 btrfs_release_path(root, path);
443 ret = btrfs_search_slot(trans, root, &file_key, path,
444 BTRFS_CRC32_SIZE, 1);
445 if (ret < 0)
446 goto fail_unlock;
447 if (ret == 0) {
448 BUG();
449 }
450 if (path->slots[0] == 0) {
451 goto insert;
452 }
453 path->slots[0]--;
454 leaf = path->nodes[0];
455 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
456 csum_offset = (offset - found_key.offset) >>
457 root->fs_info->sb->s_blocksize_bits;
458 if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
459 found_key.objectid != objectid ||
460 csum_offset >= MAX_CSUM_ITEMS(root)) {
461 goto insert;
462 }
 /* only extend when exactly one more checksum is needed at the end */
463 if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
464 BTRFS_CRC32_SIZE) {
465 u32 diff = (csum_offset + 1) * BTRFS_CRC32_SIZE;
466 diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
467 if (diff != BTRFS_CRC32_SIZE)
468 goto insert;
469 ret = btrfs_extend_item(trans, root, path, diff);
470 BUG_ON(ret);
471 goto csum;
472 }
473
 /*
  * insert a new csum item at file_key; when found_next is set it is
  * sized from min(i_size, next_offset), clamped between one checksum
  * and MAX_CSUM_ITEMS, otherwise it holds a single checksum
  */
474insert:
475 btrfs_release_path(root, path);
476 csum_offset = 0;
477 if (found_next) {
478 u64 tmp = min((u64)i_size_read(inode), next_offset);
479 tmp -= offset & ~((u64)root->sectorsize -1);
480 tmp >>= root->fs_info->sb->s_blocksize_bits;
481 tmp = max((u64)1, tmp);
482 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root));
483 ins_size = BTRFS_CRC32_SIZE * tmp;
484 } else {
485 ins_size = BTRFS_CRC32_SIZE;
486 }
487 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
488 ins_size);
489 if (ret < 0)
490 goto fail_unlock;
491 if (ret != 0) {
492 WARN_ON(1);
493 goto fail_unlock;
494 }
 /* point item at slot csum_offset of the item path refers to */
495csum:
496 leaf = path->nodes[0];
497 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
498 ret = 0;
499 item = (struct btrfs_csum_item *)((unsigned char *)item +
500 csum_offset * BTRFS_CRC32_SIZE);
 /* item is the first slot to fill; item_end is the end of the csum item */
501found:
502 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
503 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
504 btrfs_item_size_nr(leaf, path->slots[0]));
505 eb_token = NULL;
506 mutex_unlock(&BTRFS_I(inode)->csum_mutex);
507 cond_resched();
 /* copy one checksum per trip, remapping the buffer when item leaves the mapped window */
508next_sector:
509
510 if (!eb_token ||
511 (unsigned long)item + BTRFS_CRC32_SIZE >= map_start + map_len) {
512 int err;
513
514 if (eb_token)
515 unmap_extent_buffer(leaf, eb_token, KM_USER1);
516 eb_token = NULL;
517 err = map_private_extent_buffer(leaf, (unsigned long)item,
518 BTRFS_CRC32_SIZE,
519 &eb_token, &eb_map,
520 &map_start, &map_len, KM_USER1);
521 if (err)
522 eb_token = NULL;
523 }
 /* write through the mapping if we have one, else via write_extent_buffer() */
524 if (eb_token) {
525 memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
526 &sector_sum->sum, BTRFS_CRC32_SIZE);
527 } else {
528 write_extent_buffer(leaf, &sector_sum->sum,
529 (unsigned long)item, BTRFS_CRC32_SIZE);
530 }
531
532 total_bytes += root->sectorsize;
533 sector_sum++;
 /* stay in this item while it has room and the next sum is exactly one page further on */
534 if (total_bytes < sums->len) {
535 item = (struct btrfs_csum_item *)((char *)item +
536 BTRFS_CRC32_SIZE);
537 if (item < item_end && offset + PAGE_CACHE_SIZE ==
538 sector_sum->offset) {
539 offset = sector_sum->offset;
540 goto next_sector;
541 }
542 }
543 if (eb_token) {
544 unmap_extent_buffer(leaf, eb_token, KM_USER1);
545 eb_token = NULL;
546 }
547 btrfs_mark_buffer_dirty(path->nodes[0]);
548 cond_resched();
 /* sums left over: release the path and start another pass */
549 if (total_bytes < sums->len) {
550 btrfs_release_path(root, path);
551 goto again;
552 }
553out:
554 btrfs_free_path(path);
555 return ret;
556
557fail_unlock:
558 mutex_unlock(&BTRFS_I(inode)->csum_mutex);
559 goto out;
560}
561
562int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
563 struct btrfs_root *root, struct btrfs_path *path,
564 u64 isize)
565{
566 struct btrfs_key key;
567 struct extent_buffer *leaf = path->nodes[0];
568 int slot = path->slots[0];
569 int ret;
570 u32 new_item_size;
571 u64 new_item_span;
572 u64 blocks;
573
574 btrfs_item_key_to_cpu(leaf, &key, slot);
575 if (isize <= key.offset)
576 return 0;
577 new_item_span = isize - key.offset;
578 blocks = (new_item_span + root->sectorsize - 1) >>
579 root->fs_info->sb->s_blocksize_bits;
580 new_item_size = blocks * BTRFS_CRC32_SIZE;
581 if (new_item_size >= btrfs_item_size_nr(leaf, slot))
582 return 0;
583 ret = btrfs_truncate_item(trans, root, path, new_item_size, 1);
584 BUG_ON(ret);
585 return ret;
586}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
new file mode 100644
index 000000000000..1e8c024c69c3
--- /dev/null
+++ b/fs/btrfs/file.c
@@ -0,0 +1,1299 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/string.h>
25#include <linux/smp_lock.h>
26#include <linux/backing-dev.h>
27#include <linux/mpage.h>
28#include <linux/swap.h>
29#include <linux/writeback.h>
30#include <linux/statfs.h>
31#include <linux/compat.h>
32#include <linux/version.h>
33#include "ctree.h"
34#include "disk-io.h"
35#include "transaction.h"
36#include "btrfs_inode.h"
37#include "ioctl.h"
38#include "print-tree.h"
39#include "tree-log.h"
40#include "locking.h"
41#include "compat.h"
42
43
44/* simple helper to fault in pages and copy. This should go away
45 * and be replaced with calls into generic code.
46 */
47static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
48 int write_bytes,
49 struct page **prepared_pages,
50 const char __user * buf)
51{
52 long page_fault = 0;
53 int i;
54 int offset = pos & (PAGE_CACHE_SIZE - 1);
55
56 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
57 size_t count = min_t(size_t,
58 PAGE_CACHE_SIZE - offset, write_bytes);
59 struct page *page = prepared_pages[i];
60 fault_in_pages_readable(buf, count);
61
62 /* Copy data from userspace to the current page */
63 kmap(page);
64 page_fault = __copy_from_user(page_address(page) + offset,
65 buf, count);
66 /* Flush processor's dcache for this page */
67 flush_dcache_page(page);
68 kunmap(page);
69 buf += count;
70 write_bytes -= count;
71
72 if (page_fault)
73 break;
74 }
75 return page_fault ? -EFAULT : 0;
76}
77
78/*
79 * unlocks pages after btrfs_file_write is done with them
80 */
81static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
82{
83 size_t i;
84 for (i = 0; i < num_pages; i++) {
85 if (!pages[i])
86 break;
87 /* page checked is some magic around finding pages that
88 * have been modified without going through btrfs_set_page_dirty
89 * clear it here
90 */
91 ClearPageChecked(pages[i]);
92 unlock_page(pages[i]);
93 mark_page_accessed(pages[i]);
94 page_cache_release(pages[i]);
95 }
96}
97
98/*
99 * after copy_from_user, pages need to be dirtied and we need to make
100 * sure holes are created between the current EOF and the start of
101 * any next extents (if required).
102 *
103 * this also makes the decision about creating an inline extent vs
104 * doing real data extents, marking pages dirty and delalloc as required.
105 */
106static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
107 struct btrfs_root *root,
108 struct file *file,
109 struct page **pages,
110 size_t num_pages,
111 loff_t pos,
112 size_t write_bytes)
113{
114 int err = 0;
115 int i;
116 struct inode *inode = fdentry(file)->d_inode;
117 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
118 u64 hint_byte;
119 u64 num_bytes;
120 u64 start_pos;
121 u64 end_of_last_block;
122 u64 end_pos = pos + write_bytes;
123 loff_t isize = i_size_read(inode);
124
125 start_pos = pos & ~((u64)root->sectorsize - 1);
126 num_bytes = (write_bytes + pos - start_pos +
127 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
128
129 end_of_last_block = start_pos + num_bytes - 1;
130
131 lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
132 trans = btrfs_join_transaction(root, 1);
133 if (!trans) {
134 err = -ENOMEM;
135 goto out_unlock;
136 }
137 btrfs_set_trans_block_group(trans, inode);
138 hint_byte = 0;
139
140 if ((end_of_last_block & 4095) == 0) {
141 printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block);
142 }
143 set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
144
145 /* check for reserved extents on each page, we don't want
146 * to reset the delalloc bit on things that already have
147 * extents reserved.
148 */
149 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
150 for (i = 0; i < num_pages; i++) {
151 struct page *p = pages[i];
152 SetPageUptodate(p);
153 ClearPageChecked(p);
154 set_page_dirty(p);
155 }
156 if (end_pos > isize) {
157 i_size_write(inode, end_pos);
158 btrfs_update_inode(trans, root, inode);
159 }
160 err = btrfs_end_transaction(trans, root);
161out_unlock:
162 unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
163 return err;
164}
165
166/*
167 * this drops all the extents in the cache that intersect the range
168 * [start, end]. Existing extents are split as required.
169 */
170int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
171 int skip_pinned)
172{
173 struct extent_map *em;
174 struct extent_map *split = NULL;
175 struct extent_map *split2 = NULL;
176 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
177 u64 len = end - start + 1;
178 int ret;
179 int testend = 1;
180 unsigned long flags;
181 int compressed = 0;
182
183 WARN_ON(end < start);
184 if (end == (u64)-1) {
185 len = (u64)-1;
186 testend = 0;
187 }
188 while(1) {
189 if (!split)
190 split = alloc_extent_map(GFP_NOFS);
191 if (!split2)
192 split2 = alloc_extent_map(GFP_NOFS);
193
194 spin_lock(&em_tree->lock);
195 em = lookup_extent_mapping(em_tree, start, len);
196 if (!em) {
197 spin_unlock(&em_tree->lock);
198 break;
199 }
200 flags = em->flags;
201 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
202 spin_unlock(&em_tree->lock);
203 if (em->start <= start &&
204 (!testend || em->start + em->len >= start + len)) {
205 free_extent_map(em);
206 break;
207 }
208 if (start < em->start) {
209 len = em->start - start;
210 } else {
211 len = start + len - (em->start + em->len);
212 start = em->start + em->len;
213 }
214 free_extent_map(em);
215 continue;
216 }
217 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
218 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
219 remove_extent_mapping(em_tree, em);
220
221 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
222 em->start < start) {
223 split->start = em->start;
224 split->len = start - em->start;
225 split->orig_start = em->orig_start;
226 split->block_start = em->block_start;
227
228 if (compressed)
229 split->block_len = em->block_len;
230 else
231 split->block_len = split->len;
232
233 split->bdev = em->bdev;
234 split->flags = flags;
235 ret = add_extent_mapping(em_tree, split);
236 BUG_ON(ret);
237 free_extent_map(split);
238 split = split2;
239 split2 = NULL;
240 }
241 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
242 testend && em->start + em->len > start + len) {
243 u64 diff = start + len - em->start;
244
245 split->start = start + len;
246 split->len = em->start + em->len - (start + len);
247 split->bdev = em->bdev;
248 split->flags = flags;
249
250 if (compressed) {
251 split->block_len = em->block_len;
252 split->block_start = em->block_start;
253 split->orig_start = em->orig_start;
254 } else {
255 split->block_len = split->len;
256 split->block_start = em->block_start + diff;
257 split->orig_start = split->start;
258 }
259
260 ret = add_extent_mapping(em_tree, split);
261 BUG_ON(ret);
262 free_extent_map(split);
263 split = NULL;
264 }
265 spin_unlock(&em_tree->lock);
266
267 /* once for us */
268 free_extent_map(em);
269 /* once for the tree*/
270 free_extent_map(em);
271 }
272 if (split)
273 free_extent_map(split);
274 if (split2)
275 free_extent_map(split2);
276 return 0;
277}
278
/*
 * Consistency check of an inode's file extent items -- currently a stub.
 *
 * The function reports success unconditionally and looks at neither root
 * nor inode.  An extent-walking implementation used to sit here under
 * "#if 0", placed after the return statement so it could never run; it
 * has been dropped as dead code and can be recovered from version control
 * if the check is ever revived.
 *
 * Returns 0.
 */
int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
{
	return 0;
}
352
353/*
354 * this is very complex, but the basic idea is to drop all extents
355 * in the range start - end. hint_byte is filled in with a byte number
356 * that would be a good hint to the block allocator for this file.
357 *
358 * If an extent intersects the range but is not entirely inside the range
359 * it is either truncated or split. Anything entirely inside the range
360 * is deleted from the tree.
361 *
362 * inline_limit is used to tell this code which offsets in the file to keep
363 * if they contain inline extents.
 *
 * Notes from reading the code:
 *  - end is treated as exclusive: the extent cache is dropped for
 *    [start, end - 1] and an item whose key offset is >= end stops the walk.
 *  - inline_limit is overwritten with 0 on entry, so the value passed by the
 *    caller currently has no effect.
 *  - when an extent sticks out past end (a "bookend"), the io_tree range up
 *    to the end of that extent is locked as well; locked_end tracks how far
 *    and the extra range is unlocked again before returning.
 *
 * Returns -ENOMEM if no path can be allocated, a negative value handed back
 * by btrfs_lookup_file_extent(), and 0 otherwise.  Failures of the tree
 * modifications themselves trip BUG_ON.
364 */
365int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
366 struct btrfs_root *root, struct inode *inode,
367 u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
368{
369 u64 extent_end = 0;
370 u64 locked_end = end;
371 u64 search_start = start;
372 u64 leaf_start;
373 u64 ram_bytes = 0;
374 u64 orig_parent = 0;
375 u64 disk_bytenr = 0;
376 u8 compression;
377 u8 encryption;
378 u16 other_encoding = 0;
379 u64 root_gen;
380 u64 root_owner;
381 struct extent_buffer *leaf;
382 struct btrfs_file_extent_item *extent;
383 struct btrfs_path *path;
384 struct btrfs_key key;
385 struct btrfs_file_extent_item old;
386 int keep;
387 int slot;
388 int bookend;
389 int found_type = 0;
390 int found_extent;
391 int found_inline;
392 int recow;
393 int ret;
394
 /* the caller's inline_limit is discarded; with 0 here the
  * "key.offset < inline_limit" branch further down can never be taken */
395 inline_limit = 0;
396 btrfs_drop_extent_cache(inode, start, end - 1, 0);
397
398 path = btrfs_alloc_path();
399 if (!path)
400 return -ENOMEM;
 /* each pass of this loop does a fresh tree search from search_start */
401 while(1) {
402 recow = 0;
403 btrfs_release_path(root, path);
404 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
405 search_start, -1);
406 if (ret < 0)
407 goto out;
 /* ret > 0: presumably no item with exactly this key; step back to
  * the previous slot, or give up if there is none in this leaf */
408 if (ret > 0) {
409 if (path->slots[0] == 0) {
410 ret = 0;
411 goto out;
412 }
413 path->slots[0]--;
414 }
415next_slot:
416 keep = 0;
417 bookend = 0;
418 found_extent = 0;
419 found_inline = 0;
420 leaf_start = 0;
421 root_gen = 0;
422 root_owner = 0;
423 compression = 0;
424 encryption = 0;
425 extent = NULL;
426 leaf = path->nodes[0];
427 slot = path->slots[0];
428 ret = 0;
429 btrfs_item_key_to_cpu(leaf, &key, slot);
430 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
431 key.offset >= end) {
432 goto out;
433 }
434 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
435 key.objectid != inode->i_ino) {
436 goto out;
437 }
 /* recow is set below after btrfs_next_leaf() moved to another leaf;
  * in that case restart with a new search instead of using this path */
438 if (recow) {
439 search_start = max(key.offset, start);
440 continue;
441 }
442 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
443 extent = btrfs_item_ptr(leaf, slot,
444 struct btrfs_file_extent_item);
445 found_type = btrfs_file_extent_type(leaf, extent);
446 compression = btrfs_file_extent_compression(leaf,
447 extent);
448 encryption = btrfs_file_extent_encryption(leaf,
449 extent);
450 other_encoding = btrfs_file_extent_other_encoding(leaf,
451 extent);
452 if (found_type == BTRFS_FILE_EXTENT_REG ||
453 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 /* extent_end is borrowed for a moment to carry the disk bytenr
  * into *hint_byte; it gets its real value right after */
454 extent_end =
455 btrfs_file_extent_disk_bytenr(leaf,
456 extent);
457 if (extent_end)
458 *hint_byte = extent_end;
459
460 extent_end = key.offset +
461 btrfs_file_extent_num_bytes(leaf, extent);
462 ram_bytes = btrfs_file_extent_ram_bytes(leaf,
463 extent);
464 found_extent = 1;
465 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
466 found_inline = 1;
467 extent_end = key.offset +
468 btrfs_file_extent_inline_len(leaf, extent);
469 }
470 } else {
471 extent_end = search_start;
472 }
473
474 /* we found nothing we can drop */
475 if ((!found_extent && !found_inline) ||
476 search_start >= extent_end) {
477 int nextret;
478 u32 nritems;
479 nritems = btrfs_header_nritems(leaf);
480 if (slot >= nritems - 1) {
481 nextret = btrfs_next_leaf(root, path);
482 if (nextret)
483 goto out;
484 recow = 1;
485 } else {
486 path->slots[0]++;
487 }
488 goto next_slot;
489 }
490
491 if (end <= extent_end && start >= key.offset && found_inline)
492 *hint_byte = EXTENT_MAP_INLINE;
493
 /* keep a copy of the on-disk item plus the leaf's identity; both are
  * used after the item has been deleted or the path released */
494 if (found_extent) {
495 read_extent_buffer(leaf, &old, (unsigned long)extent,
496 sizeof(old));
497 root_gen = btrfs_header_generation(leaf);
498 root_owner = btrfs_header_owner(leaf);
499 leaf_start = leaf->start;
500 }
501
 /* bookend: the item starts at or before end but reaches past it, so
  * its tail has to survive */
502 if (end < extent_end && end >= key.offset) {
503 bookend = 1;
504 if (found_inline && start <= key.offset)
505 keep = 1;
506 }
507
508 if (bookend && found_extent) {
 /* the surviving tail lies outside the caller's range: lock it too.
  * If the lock cannot be taken right away, drop the path, wait for
  * the lock and start the loop over. */
509 if (locked_end < extent_end) {
510 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
511 locked_end, extent_end - 1,
512 GFP_NOFS);
513 if (!ret) {
514 btrfs_release_path(root, path);
515 lock_extent(&BTRFS_I(inode)->io_tree,
516 locked_end, extent_end - 1,
517 GFP_NOFS);
518 locked_end = extent_end;
519 continue;
520 }
521 locked_end = extent_end;
522 }
 /* take an extra reference on the disk extent before a second item
  * pointing at it is created further down */
523 orig_parent = path->nodes[0]->start;
524 disk_bytenr = le64_to_cpu(old.disk_bytenr);
525 if (disk_bytenr != 0) {
526 ret = btrfs_inc_extent_ref(trans, root,
527 disk_bytenr,
528 le64_to_cpu(old.disk_num_bytes),
529 orig_parent, root->root_key.objectid,
530 trans->transid, inode->i_ino);
531 BUG_ON(ret);
532 }
533 }
534
 /* where the next search continues: after this item, rounded up to a
  * sector boundary for inline extents */
535 if (found_inline) {
536 u64 mask = root->sectorsize - 1;
537 search_start = (extent_end + mask) & ~mask;
538 } else
539 search_start = extent_end;
540
541 /* truncate existing extent */
542 if (start > key.offset) {
543 u64 new_num;
544 u64 old_num;
545 keep = 1;
546 WARN_ON(start & (root->sectorsize - 1));
547 if (found_extent) {
548 new_num = start - key.offset;
549 old_num = btrfs_file_extent_num_bytes(leaf,
550 extent);
551 *hint_byte =
552 btrfs_file_extent_disk_bytenr(leaf,
553 extent);
554 if (btrfs_file_extent_disk_bytenr(leaf,
555 extent)) {
556 inode_sub_bytes(inode, old_num -
557 new_num);
558 }
559 if (!compression && !encryption) {
560 btrfs_set_file_extent_ram_bytes(leaf,
561 extent, new_num);
562 }
563 btrfs_set_file_extent_num_bytes(leaf,
564 extent, new_num);
565 btrfs_mark_buffer_dirty(leaf);
 /* unreachable while inline_limit is forced to 0 at the top */
566 } else if (key.offset < inline_limit &&
567 (end > extent_end) &&
568 (inline_limit < extent_end)) {
569 u32 new_size;
570 new_size = btrfs_file_extent_calc_inline_size(
571 inline_limit - key.offset);
572 inode_sub_bytes(inode, extent_end -
573 inline_limit);
574 btrfs_set_file_extent_ram_bytes(leaf, extent,
575 new_size);
576 if (!compression && !encryption) {
577 btrfs_truncate_item(trans, root, path,
578 new_size, 1);
579 }
580 }
581 }
582 /* delete the entire extent */
583 if (!keep) {
584 if (found_inline)
585 inode_sub_bytes(inode, extent_end -
586 key.offset);
587 ret = btrfs_del_item(trans, root, path);
588 /* TODO update progress marker and return */
589 BUG_ON(ret);
590 extent = NULL;
591 btrfs_release_path(root, path);
592 /* the extent will be freed later */
593 }
 /* inline extent that starts inside the range but ends after it:
  * shrink the item so only the part past end remains */
594 if (bookend && found_inline && start <= key.offset) {
595 u32 new_size;
596 new_size = btrfs_file_extent_calc_inline_size(
597 extent_end - end);
598 inode_sub_bytes(inode, end - key.offset);
599 btrfs_set_file_extent_ram_bytes(leaf, extent,
600 new_size);
601 if (!compression && !encryption)
602 ret = btrfs_truncate_item(trans, root, path,
603 new_size, 0);
604 BUG_ON(ret);
605 }
606 /* create bookend, splitting the extent in two */
607 if (bookend && found_extent) {
608 struct btrfs_key ins;
609 ins.objectid = inode->i_ino;
610 ins.offset = end;
611 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
612
613 btrfs_release_path(root, path);
614 ret = btrfs_insert_empty_item(trans, root, path, &ins,
615 sizeof(*extent));
616 BUG_ON(ret);
617
 /* start the new item as a copy of the old one, then fix up the
  * fields that differ for the tail piece */
618 leaf = path->nodes[0];
619 extent = btrfs_item_ptr(leaf, path->slots[0],
620 struct btrfs_file_extent_item);
621 write_extent_buffer(leaf, &old,
622 (unsigned long)extent, sizeof(old));
623
624 btrfs_set_file_extent_compression(leaf, extent,
625 compression);
626 btrfs_set_file_extent_encryption(leaf, extent,
627 encryption);
628 btrfs_set_file_extent_other_encoding(leaf, extent,
629 other_encoding);
630 btrfs_set_file_extent_offset(leaf, extent,
631 le64_to_cpu(old.offset) + end - key.offset);
632 WARN_ON(le64_to_cpu(old.num_bytes) <
633 (extent_end - end));
634 btrfs_set_file_extent_num_bytes(leaf, extent,
635 extent_end - end);
636
637 /*
638 * set the ram bytes to the size of the full extent
639 * before splitting. This is a worst case flag,
640 * but its the best we can do because we don't know
641 * how splitting affects compression
642 */
643 btrfs_set_file_extent_ram_bytes(leaf, extent,
644 ram_bytes);
645 btrfs_set_file_extent_type(leaf, extent, found_type);
646
647 btrfs_mark_buffer_dirty(path->nodes[0]);
648
 /* the reference taken above was recorded against the old leaf
  * (orig_parent); re-point it at the leaf that now holds the item */
649 if (disk_bytenr != 0) {
650 ret = btrfs_update_extent_ref(trans, root,
651 disk_bytenr, orig_parent,
652 leaf->start,
653 root->root_key.objectid,
654 trans->transid, ins.objectid);
655
656 BUG_ON(ret);
657 }
658 btrfs_release_path(root, path);
659 if (disk_bytenr != 0) {
660 inode_add_bytes(inode, extent_end - end);
661 }
662 }
663
 /* the item was deleted above: now give back its disk extent, using
  * the copy saved in "old" and the saved leaf identity */
664 if (found_extent && !keep) {
 /* NOTE: this declaration shadows the function-scope disk_bytenr */
665 u64 disk_bytenr = le64_to_cpu(old.disk_bytenr);
666
667 if (disk_bytenr != 0) {
668 inode_sub_bytes(inode,
669 le64_to_cpu(old.num_bytes));
670 ret = btrfs_free_extent(trans, root,
671 disk_bytenr,
672 le64_to_cpu(old.disk_num_bytes),
673 leaf_start, root_owner,
674 root_gen, key.objectid, 0);
675 BUG_ON(ret);
676 *hint_byte = disk_bytenr;
677 }
678 }
679
680 if (search_start >= end) {
681 ret = 0;
682 goto out;
683 }
684 }
685out:
686 btrfs_free_path(path);
 /* undo the extra locking done for bookend extents */
687 if (locked_end > end) {
688 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
689 GFP_NOFS);
690 }
691 btrfs_check_file(root, inode);
692 return ret;
693}
694
695static int extent_mergeable(struct extent_buffer *leaf, int slot,
696 u64 objectid, u64 bytenr, u64 *start, u64 *end)
697{
698 struct btrfs_file_extent_item *fi;
699 struct btrfs_key key;
700 u64 extent_end;
701
702 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
703 return 0;
704
705 btrfs_item_key_to_cpu(leaf, &key, slot);
706 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
707 return 0;
708
709 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
710 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
711 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
712 btrfs_file_extent_compression(leaf, fi) ||
713 btrfs_file_extent_encryption(leaf, fi) ||
714 btrfs_file_extent_other_encoding(leaf, fi))
715 return 0;
716
717 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
718 if ((*start && *start != key.offset) || (*end && *end != extent_end))
719 return 0;
720
721 *start = key.offset;
722 *end = extent_end;
723 return 1;
724}
725
726/*
727 * Mark extent in the range start - end as written.
728 *
729 * This changes extent type from 'pre-allocated' to 'regular'. If only
730 * part of extent is marked as written, the extent will be split into
731 * two or three.
 *
 * Notes from reading the code:
 *  - end is treated as exclusive (the extent cache is dropped for
 *    [start, end - 1]).
 *  - the range must lie inside a single BTRFS_FILE_EXTENT_PREALLOC item of
 *    this inode; anything else trips a BUG_ON.
 *  - "split" is the file offset currently being cut at: first start, then
 *    end.  The body after the "again" label runs once per cut, and once
 *    more whenever it has to wait for an extent lock.
 *  - if the item reaches past end, the io_tree range up to the end of the
 *    item is locked as well and unlocked again before returning.
 *
 * Always returns 0; a failed path allocation or tree operation trips BUG_ON.
 * NOTE(review): a negative return from btrfs_search_slot() is not checked.
732 */
733int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
734 struct btrfs_root *root,
735 struct inode *inode, u64 start, u64 end)
736{
737 struct extent_buffer *leaf;
738 struct btrfs_path *path;
739 struct btrfs_file_extent_item *fi;
740 struct btrfs_key key;
741 u64 bytenr;
742 u64 num_bytes;
743 u64 extent_end;
744 u64 extent_offset;
745 u64 other_start;
746 u64 other_end;
747 u64 split = start;
748 u64 locked_end = end;
749 u64 orig_parent;
750 int extent_type;
751 int split_end = 1;
752 int ret;
753
754 btrfs_drop_extent_cache(inode, start, end - 1, 0);
755
756 path = btrfs_alloc_path();
757 BUG_ON(!path);
758again:
759 key.objectid = inode->i_ino;
760 key.type = BTRFS_EXTENT_DATA_KEY;
 /* for the cut at end, search for the byte just before it -- presumably
  * so the search lands on the item that covers the written range */
761 if (split == start)
762 key.offset = split;
763 else
764 key.offset = split - 1;
765
766 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
767 if (ret > 0 && path->slots[0] > 0)
768 path->slots[0]--;
769
770 leaf = path->nodes[0];
771 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
772 BUG_ON(key.objectid != inode->i_ino ||
773 key.type != BTRFS_EXTENT_DATA_KEY);
774 fi = btrfs_item_ptr(leaf, path->slots[0],
775 struct btrfs_file_extent_item);
776 extent_type = btrfs_file_extent_type(leaf, fi);
777 BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
778 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
779 BUG_ON(key.offset > start || extent_end < end);
780
781 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
782 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
783 extent_offset = btrfs_file_extent_offset(leaf, fi);
784
 /* the item already begins at start: no cut needed there, go straight
  * to the cut at end */
785 if (key.offset == start)
786 split = end;
787
 /* case 1: the item covers exactly [start, end).  Convert it in place
  * and fold it into the following and/or preceding item when
  * extent_mergeable() accepts them; each merge drops one reference on
  * the disk extent and one item. */
788 if (key.offset == start && extent_end == end) {
789 int del_nr = 0;
790 int del_slot = 0;
791 u64 leaf_owner = btrfs_header_owner(leaf);
792 u64 leaf_gen = btrfs_header_generation(leaf);
793 other_start = end;
794 other_end = 0;
795 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
796 bytenr, &other_start, &other_end)) {
797 extent_end = other_end;
798 del_slot = path->slots[0] + 1;
799 del_nr++;
800 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
801 leaf->start, leaf_owner,
802 leaf_gen, inode->i_ino, 0);
803 BUG_ON(ret);
804 }
805 other_start = 0;
806 other_end = start;
807 if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
808 bytenr, &other_start, &other_end)) {
809 key.offset = other_start;
810 del_slot = path->slots[0];
811 del_nr++;
812 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
813 leaf->start, leaf_owner,
814 leaf_gen, inode->i_ino, 0);
815 BUG_ON(ret);
816 }
817 split_end = 0;
818 if (del_nr == 0) {
819 btrfs_set_file_extent_type(leaf, fi,
820 BTRFS_FILE_EXTENT_REG);
821 goto done;
822 }
823
 /* the item just before the deleted ones survives and is stretched
  * over the merged range [key.offset, extent_end) */
824 fi = btrfs_item_ptr(leaf, del_slot - 1,
825 struct btrfs_file_extent_item);
826 btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
827 btrfs_set_file_extent_num_bytes(leaf, fi,
828 extent_end - key.offset);
829 btrfs_mark_buffer_dirty(leaf);
830
831 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
832 BUG_ON(ret);
833 goto done;
 /* case 2: the item begins before start.  Lock the part of it that lies
  * past end (restarting if the lock has to be waited for), then shorten
  * the item so it ends at start; a new item for the rest is inserted
  * further down. */
834 } else if (split == start) {
835 if (locked_end < extent_end) {
836 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
837 locked_end, extent_end - 1, GFP_NOFS);
838 if (!ret) {
839 btrfs_release_path(root, path);
840 lock_extent(&BTRFS_I(inode)->io_tree,
841 locked_end, extent_end - 1, GFP_NOFS);
842 locked_end = extent_end;
843 goto again;
844 }
845 locked_end = extent_end;
846 }
847 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
848 extent_offset += split - key.offset;
 /* case 3: the item begins at start and reaches past end.  Move its
  * beginning up to end; a new item for [start, end) is inserted further
  * down. */
849 } else {
850 BUG_ON(key.offset != start);
851 btrfs_set_file_extent_offset(leaf, fi, extent_offset +
852 split - key.offset);
853 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
854 key.offset = split;
855 btrfs_set_item_key_safe(trans, root, path, &key);
856 extent_end = split;
857 }
858
 /* the piece about to be created ends exactly at end, so it is the
  * written part: it becomes a regular extent and no further cut follows */
859 if (extent_end == end) {
860 split_end = 0;
861 extent_type = BTRFS_FILE_EXTENT_REG;
862 }
 /* instead of inserting a new item, try to hand the written piece to a
  * mergeable neighbour: the following item ... */
863 if (extent_end == end && split == start) {
864 other_start = end;
865 other_end = 0;
866 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
867 bytenr, &other_start, &other_end)) {
868 path->slots[0]++;
869 fi = btrfs_item_ptr(leaf, path->slots[0],
870 struct btrfs_file_extent_item);
871 key.offset = split;
872 btrfs_set_item_key_safe(trans, root, path, &key);
873 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
874 btrfs_set_file_extent_num_bytes(leaf, fi,
875 other_end - split);
876 goto done;
877 }
878 }
 /* ... or the preceding item */
879 if (extent_end == end && split == end) {
880 other_start = 0;
881 other_end = start;
882 if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
883 bytenr, &other_start, &other_end)) {
884 path->slots[0]--;
885 fi = btrfs_item_ptr(leaf, path->slots[0],
886 struct btrfs_file_extent_item);
887 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
888 other_start);
889 goto done;
890 }
891 }
892
893 btrfs_mark_buffer_dirty(leaf);
894
 /* no merge possible: a second item pointing at the same disk extent is
  * inserted below, so take another reference on it first */
895 orig_parent = leaf->start;
896 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
897 orig_parent, root->root_key.objectid,
898 trans->transid, inode->i_ino);
899 BUG_ON(ret);
900 btrfs_release_path(root, path);
901
902 key.offset = start;
903 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
904 BUG_ON(ret);
905
906 leaf = path->nodes[0];
907 fi = btrfs_item_ptr(leaf, path->slots[0],
908 struct btrfs_file_extent_item);
909 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
910 btrfs_set_file_extent_type(leaf, fi, extent_type);
911 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
912 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
913 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
914 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
915 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
916 btrfs_set_file_extent_compression(leaf, fi, 0);
917 btrfs_set_file_extent_encryption(leaf, fi, 0);
918 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
919
 /* the new item ended up in a different leaf than the one the
  * reference was recorded against: re-point the reference */
920 if (orig_parent != leaf->start) {
921 ret = btrfs_update_extent_ref(trans, root, bytenr,
922 orig_parent, leaf->start,
923 root->root_key.objectid,
924 trans->transid, inode->i_ino);
925 BUG_ON(ret);
926 }
927done:
928 btrfs_mark_buffer_dirty(leaf);
929 btrfs_release_path(root, path);
 /* the cut at start is done but the piece still reaches past end: go
  * round again for the cut at end */
930 if (split_end && split == start) {
931 split = end;
932 goto again;
933 }
934 if (locked_end > end) {
935 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
936 GFP_NOFS);
937 }
938 btrfs_free_path(path);
939 return 0;
940}
941
942/*
943 * this gets pages into the page cache and locks them down, it also properly
944 * waits for data=ordered extents to finish before allowing the pages to be
945 * modified.
946 */
947static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
948 struct page **pages, size_t num_pages,
949 loff_t pos, unsigned long first_index,
950 unsigned long last_index, size_t write_bytes)
951{
952 int i;
953 unsigned long index = pos >> PAGE_CACHE_SHIFT;
954 struct inode *inode = fdentry(file)->d_inode;
955 int err = 0;
956 u64 start_pos;
957 u64 last_pos;
958
959 start_pos = pos & ~((u64)root->sectorsize - 1);
960 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
961
962 if (start_pos > inode->i_size) {
963 err = btrfs_cont_expand(inode, start_pos);
964 if (err)
965 return err;
966 }
967
968 memset(pages, 0, num_pages * sizeof(struct page *));
969again:
970 for (i = 0; i < num_pages; i++) {
971 pages[i] = grab_cache_page(inode->i_mapping, index + i);
972 if (!pages[i]) {
973 err = -ENOMEM;
974 BUG_ON(1);
975 }
976 wait_on_page_writeback(pages[i]);
977 }
978 if (start_pos < inode->i_size) {
979 struct btrfs_ordered_extent *ordered;
980 lock_extent(&BTRFS_I(inode)->io_tree,
981 start_pos, last_pos - 1, GFP_NOFS);
982 ordered = btrfs_lookup_first_ordered_extent(inode, last_pos -1);
983 if (ordered &&
984 ordered->file_offset + ordered->len > start_pos &&
985 ordered->file_offset < last_pos) {
986 btrfs_put_ordered_extent(ordered);
987 unlock_extent(&BTRFS_I(inode)->io_tree,
988 start_pos, last_pos - 1, GFP_NOFS);
989 for (i = 0; i < num_pages; i++) {
990 unlock_page(pages[i]);
991 page_cache_release(pages[i]);
992 }
993 btrfs_wait_ordered_range(inode, start_pos,
994 last_pos - start_pos);
995 goto again;
996 }
997 if (ordered)
998 btrfs_put_ordered_extent(ordered);
999
1000 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
1001 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
1002 GFP_NOFS);
1003 unlock_extent(&BTRFS_I(inode)->io_tree,
1004 start_pos, last_pos - 1, GFP_NOFS);
1005 }
1006 for (i = 0; i < num_pages; i++) {
1007 clear_page_dirty_for_io(pages[i]);
1008 set_page_extent_mapped(pages[i]);
1009 WARN_ON(!PageLocked(pages[i]));
1010 }
1011 return 0;
1012}
1013
1014static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1015 size_t count, loff_t *ppos)
1016{
1017 loff_t pos;
1018 loff_t start_pos;
1019 ssize_t num_written = 0;
1020 ssize_t err = 0;
1021 int ret = 0;
1022 struct inode *inode = fdentry(file)->d_inode;
1023 struct btrfs_root *root = BTRFS_I(inode)->root;
1024 struct page **pages = NULL;
1025 int nrptrs;
1026 struct page *pinned[2];
1027 unsigned long first_index;
1028 unsigned long last_index;
1029 int will_write;
1030
1031 will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
1032 (file->f_flags & O_DIRECT));
1033
1034 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
1035 PAGE_CACHE_SIZE / (sizeof(struct page *)));
1036 pinned[0] = NULL;
1037 pinned[1] = NULL;
1038
1039 pos = *ppos;
1040 start_pos = pos;
1041
1042 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1043 current->backing_dev_info = inode->i_mapping->backing_dev_info;
1044 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1045 if (err)
1046 goto out_nolock;
1047 if (count == 0)
1048 goto out_nolock;
1049
1050 err = file_remove_suid(file);
1051 if (err)
1052 goto out_nolock;
1053 file_update_time(file);
1054
1055 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1056
1057 mutex_lock(&inode->i_mutex);
1058 first_index = pos >> PAGE_CACHE_SHIFT;
1059 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
1060
1061 /*
1062 * if this is a nodatasum mount, force summing off for the inode
1063 * all the time. That way a later mount with summing on won't
1064 * get confused
1065 */
1066 if (btrfs_test_opt(root, NODATASUM))
1067 btrfs_set_flag(inode, NODATASUM);
1068
1069 /*
1070 * there are lots of better ways to do this, but this code
1071 * makes sure the first and last page in the file range are
1072 * up to date and ready for cow
1073 */
1074 if ((pos & (PAGE_CACHE_SIZE - 1))) {
1075 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
1076 if (!PageUptodate(pinned[0])) {
1077 ret = btrfs_readpage(NULL, pinned[0]);
1078 BUG_ON(ret);
1079 wait_on_page_locked(pinned[0]);
1080 } else {
1081 unlock_page(pinned[0]);
1082 }
1083 }
1084 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
1085 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
1086 if (!PageUptodate(pinned[1])) {
1087 ret = btrfs_readpage(NULL, pinned[1]);
1088 BUG_ON(ret);
1089 wait_on_page_locked(pinned[1]);
1090 } else {
1091 unlock_page(pinned[1]);
1092 }
1093 }
1094
1095 while(count > 0) {
1096 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1097 size_t write_bytes = min(count, nrptrs *
1098 (size_t)PAGE_CACHE_SIZE -
1099 offset);
1100 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
1101 PAGE_CACHE_SHIFT;
1102
1103 WARN_ON(num_pages > nrptrs);
1104 memset(pages, 0, sizeof(pages));
1105
1106 ret = btrfs_check_free_space(root, write_bytes, 0);
1107 if (ret)
1108 goto out;
1109
1110 ret = prepare_pages(root, file, pages, num_pages,
1111 pos, first_index, last_index,
1112 write_bytes);
1113 if (ret)
1114 goto out;
1115
1116 ret = btrfs_copy_from_user(pos, num_pages,
1117 write_bytes, pages, buf);
1118 if (ret) {
1119 btrfs_drop_pages(pages, num_pages);
1120 goto out;
1121 }
1122
1123 ret = dirty_and_release_pages(NULL, root, file, pages,
1124 num_pages, pos, write_bytes);
1125 btrfs_drop_pages(pages, num_pages);
1126 if (ret)
1127 goto out;
1128
1129 if (will_write) {
1130 btrfs_fdatawrite_range(inode->i_mapping, pos,
1131 pos + write_bytes - 1,
1132 WB_SYNC_NONE);
1133 } else {
1134 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1135 num_pages);
1136 if (num_pages <
1137 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1138 btrfs_btree_balance_dirty(root, 1);
1139 btrfs_throttle(root);
1140 }
1141
1142 buf += write_bytes;
1143 count -= write_bytes;
1144 pos += write_bytes;
1145 num_written += write_bytes;
1146
1147 cond_resched();
1148 }
1149out:
1150 mutex_unlock(&inode->i_mutex);
1151
1152out_nolock:
1153 kfree(pages);
1154 if (pinned[0])
1155 page_cache_release(pinned[0]);
1156 if (pinned[1])
1157 page_cache_release(pinned[1]);
1158 *ppos = pos;
1159
1160 if (num_written > 0 && will_write) {
1161 struct btrfs_trans_handle *trans;
1162
1163 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1164 if (err)
1165 num_written = err;
1166
1167 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
1168 trans = btrfs_start_transaction(root, 1);
1169 ret = btrfs_log_dentry_safe(trans, root,
1170 file->f_dentry);
1171 if (ret == 0) {
1172 btrfs_sync_log(trans, root);
1173 btrfs_end_transaction(trans, root);
1174 } else {
1175 btrfs_commit_transaction(trans, root);
1176 }
1177 }
1178 if (file->f_flags & O_DIRECT) {
1179 invalidate_mapping_pages(inode->i_mapping,
1180 start_pos >> PAGE_CACHE_SHIFT,
1181 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1182 }
1183 }
1184 current->backing_dev_info = NULL;
1185 return num_written ? num_written : err;
1186}
1187
1188int btrfs_release_file(struct inode * inode, struct file * filp)
1189{
1190 if (filp->private_data)
1191 btrfs_ioctl_trans_end(filp);
1192 return 0;
1193}
1194
1195/*
1196 * fsync call for both files and directories. This logs the inode into
1197 * the tree log instead of forcing full commits whenever possible.
1198 *
1199 * It needs to call filemap_fdatawait so that all ordered extent updates are
1200 * in the metadata btree are up to date for copying to the log.
1201 *
1202 * It drops the inode mutex before doing the tree log commit. This is an
1203 * important optimization for directories because holding the mutex prevents
1204 * new operations on the dir while we write to disk.
1205 */
1206int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1207{
1208 struct inode *inode = dentry->d_inode;
1209 struct btrfs_root *root = BTRFS_I(inode)->root;
1210 int ret = 0;
1211 struct btrfs_trans_handle *trans;
1212
1213 /*
1214 * check the transaction that last modified this inode
1215 * and see if its already been committed
1216 */
1217 if (!BTRFS_I(inode)->last_trans)
1218 goto out;
1219
1220 mutex_lock(&root->fs_info->trans_mutex);
1221 if (BTRFS_I(inode)->last_trans <=
1222 root->fs_info->last_trans_committed) {
1223 BTRFS_I(inode)->last_trans = 0;
1224 mutex_unlock(&root->fs_info->trans_mutex);
1225 goto out;
1226 }
1227 mutex_unlock(&root->fs_info->trans_mutex);
1228
1229 root->fs_info->tree_log_batch++;
1230 filemap_fdatawait(inode->i_mapping);
1231 root->fs_info->tree_log_batch++;
1232
1233 /*
1234 * ok we haven't committed the transaction yet, lets do a commit
1235 */
1236 if (file->private_data)
1237 btrfs_ioctl_trans_end(file);
1238
1239 trans = btrfs_start_transaction(root, 1);
1240 if (!trans) {
1241 ret = -ENOMEM;
1242 goto out;
1243 }
1244
1245 ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
1246 if (ret < 0) {
1247 goto out;
1248 }
1249
1250 /* we've logged all the items and now have a consistent
1251 * version of the file in the log. It is possible that
1252 * someone will come in and modify the file, but that's
1253 * fine because the log is consistent on disk, and we
1254 * have references to all of the file's extents
1255 *
1256 * It is possible that someone will come in and log the
1257 * file again, but that will end up using the synchronization
1258 * inside btrfs_sync_log to keep things safe.
1259 */
1260 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
1261
1262 if (ret > 0) {
1263 ret = btrfs_commit_transaction(trans, root);
1264 } else {
1265 btrfs_sync_log(trans, root);
1266 ret = btrfs_end_transaction(trans, root);
1267 }
1268 mutex_lock(&file->f_dentry->d_inode->i_mutex);
1269out:
1270 return ret > 0 ? EIO : ret;
1271}
1272
1273static struct vm_operations_struct btrfs_file_vm_ops = {
1274 .fault = filemap_fault,
1275 .page_mkwrite = btrfs_page_mkwrite,
1276};
1277
1278static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1279{
1280 vma->vm_ops = &btrfs_file_vm_ops;
1281 file_accessed(filp);
1282 return 0;
1283}
1284
1285struct file_operations btrfs_file_operations = {
1286 .llseek = generic_file_llseek,
1287 .read = do_sync_read,
1288 .aio_read = generic_file_aio_read,
1289 .splice_read = generic_file_splice_read,
1290 .write = btrfs_file_write,
1291 .mmap = btrfs_file_mmap,
1292 .open = generic_file_open,
1293 .release = btrfs_release_file,
1294 .fsync = btrfs_sync_file,
1295 .unlocked_ioctl = btrfs_ioctl,
1296#ifdef CONFIG_COMPAT
1297 .compat_ioctl = btrfs_ioctl,
1298#endif
1299};
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 000000000000..f4926c0f3c8c
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,489 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21
22static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node)
24{
25 struct rb_node **p = &root->rb_node;
26 struct rb_node *parent = NULL;
27 struct btrfs_free_space *info;
28
29 while (*p) {
30 parent = *p;
31 info = rb_entry(parent, struct btrfs_free_space, offset_index);
32
33 if (offset < info->offset)
34 p = &(*p)->rb_left;
35 else if (offset > info->offset)
36 p = &(*p)->rb_right;
37 else
38 return -EEXIST;
39 }
40
41 rb_link_node(node, parent, p);
42 rb_insert_color(node, root);
43
44 return 0;
45}
46
47static int tree_insert_bytes(struct rb_root *root, u64 bytes,
48 struct rb_node *node)
49{
50 struct rb_node **p = &root->rb_node;
51 struct rb_node *parent = NULL;
52 struct btrfs_free_space *info;
53
54 while (*p) {
55 parent = *p;
56 info = rb_entry(parent, struct btrfs_free_space, bytes_index);
57
58 if (bytes < info->bytes)
59 p = &(*p)->rb_left;
60 else
61 p = &(*p)->rb_right;
62 }
63
64 rb_link_node(node, parent, p);
65 rb_insert_color(node, root);
66
67 return 0;
68}
69
70/*
71 * searches the tree for the given offset. If contains is set we will return
72 * the free space that contains the given offset. If contains is not set we
73 * will return the free space that starts at or after the given offset and is
74 * at least bytes long.
75 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes,
78 int contains)
79{
80 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL;
82
83 while (n) {
84 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85
86 if (offset < entry->offset) {
87 if (!contains &&
88 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes))
90 ret = entry;
91 n = n->rb_left;
92 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) {
95 ret = entry;
96 break;
97 }
98 n = n->rb_right;
99 } else {
100 if (bytes > entry->bytes) {
101 n = n->rb_right;
102 continue;
103 }
104 ret = entry;
105 break;
106 }
107 }
108
109 return ret;
110}
111
112/*
113 * return a chunk at least bytes size, as close to offset that we can get.
114 */
115static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
116 u64 offset, u64 bytes)
117{
118 struct rb_node *n = root->rb_node;
119 struct btrfs_free_space *entry, *ret = NULL;
120
121 while (n) {
122 entry = rb_entry(n, struct btrfs_free_space, bytes_index);
123
124 if (bytes < entry->bytes) {
125 /*
126 * We prefer to get a hole size as close to the size we
127 * are asking for so we don't take small slivers out of
128 * huge holes, but we also want to get as close to the
129 * offset as possible so we don't have a whole lot of
130 * fragmentation.
131 */
132 if (offset <= entry->offset) {
133 if (!ret)
134 ret = entry;
135 else if (entry->bytes < ret->bytes)
136 ret = entry;
137 else if (entry->offset < ret->offset)
138 ret = entry;
139 }
140 n = n->rb_left;
141 } else if (bytes > entry->bytes) {
142 n = n->rb_right;
143 } else {
144 /*
145 * Ok we may have multiple chunks of the wanted size,
146 * so we don't want to take the first one we find, we
147 * want to take the one closest to our given offset, so
148 * keep searching just in case theres a better match.
149 */
150 n = n->rb_right;
151 if (offset > entry->offset)
152 continue;
153 else if (!ret || entry->offset < ret->offset)
154 ret = entry;
155 }
156 }
157
158 return ret;
159}
160
161static void unlink_free_space(struct btrfs_block_group_cache *block_group,
162 struct btrfs_free_space *info)
163{
164 rb_erase(&info->offset_index, &block_group->free_space_offset);
165 rb_erase(&info->bytes_index, &block_group->free_space_bytes);
166}
167
168static int link_free_space(struct btrfs_block_group_cache *block_group,
169 struct btrfs_free_space *info)
170{
171 int ret = 0;
172
173
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index);
176 if (ret)
177 return ret;
178
179 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
180 &info->bytes_index);
181 if (ret)
182 return ret;
183
184 return ret;
185}
186
187static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes)
189{
190 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0;
195
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info)
198 return -ENOMEM;
199
200 /*
201 * first we want to see if there is free space adjacent to the range we
202 * are adding, if there is remove that struct and add a new one to
203 * cover the entire range
204 */
205 right_info = tree_search_offset(&block_group->free_space_offset,
206 offset+bytes, 0, 1);
207 left_info = tree_search_offset(&block_group->free_space_offset,
208 offset-1, 0, 1);
209
210 if (right_info && right_info->offset == offset+bytes) {
211 unlink_free_space(block_group, right_info);
212 info = right_info;
213 info->offset = offset;
214 info->bytes += bytes;
215 } else if (right_info && right_info->offset != offset+bytes) {
216 printk(KERN_ERR "adding space in the middle of an existing "
217 "free space area. existing: offset=%Lu, bytes=%Lu. "
218 "new: offset=%Lu, bytes=%Lu\n", right_info->offset,
219 right_info->bytes, offset, bytes);
220 BUG();
221 }
222
223 if (left_info) {
224 unlink_free_space(block_group, left_info);
225
226 if (unlikely((left_info->offset + left_info->bytes) !=
227 offset)) {
228 printk(KERN_ERR "free space to the left of new free "
229 "space isn't quite right. existing: offset=%Lu,"
230 " bytes=%Lu. new: offset=%Lu, bytes=%Lu\n",
231 left_info->offset, left_info->bytes, offset,
232 bytes);
233 BUG();
234 }
235
236 if (info) {
237 info->offset = left_info->offset;
238 info->bytes += left_info->bytes;
239 kfree(left_info);
240 } else {
241 info = left_info;
242 info->bytes += bytes;
243 }
244 }
245
246 if (info) {
247 ret = link_free_space(block_group, info);
248 if (!ret)
249 info = NULL;
250 goto out;
251 }
252
253 info = alloc_info;
254 alloc_info = NULL;
255 info->offset = offset;
256 info->bytes = bytes;
257
258 ret = link_free_space(block_group, info);
259 if (ret)
260 kfree(info);
261out:
262 if (ret) {
263 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
264 if (ret == -EEXIST)
265 BUG();
266 }
267
268 if (alloc_info)
269 kfree(alloc_info);
270
271 return ret;
272}
273
274static int
275__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
276 u64 offset, u64 bytes)
277{
278 struct btrfs_free_space *info;
279 int ret = 0;
280
281 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
282 1);
283
284 if (info && info->offset == offset) {
285 if (info->bytes < bytes) {
286 printk(KERN_ERR "Found free space at %Lu, size %Lu,"
287 "trying to use %Lu\n",
288 info->offset, info->bytes, bytes);
289 WARN_ON(1);
290 ret = -EINVAL;
291 goto out;
292 }
293
294 unlink_free_space(block_group, info);
295
296 if (info->bytes == bytes) {
297 kfree(info);
298 goto out;
299 }
300
301 info->offset += bytes;
302 info->bytes -= bytes;
303
304 ret = link_free_space(block_group, info);
305 BUG_ON(ret);
306 } else if (info && info->offset < offset &&
307 info->offset + info->bytes >= offset + bytes) {
308 u64 old_start = info->offset;
309 /*
310 * we're freeing space in the middle of the info,
311 * this can happen during tree log replay
312 *
313 * first unlink the old info and then
314 * insert it again after the hole we're creating
315 */
316 unlink_free_space(block_group, info);
317 if (offset + bytes < info->offset + info->bytes) {
318 u64 old_end = info->offset + info->bytes;
319
320 info->offset = offset + bytes;
321 info->bytes = old_end - info->offset;
322 ret = link_free_space(block_group, info);
323 BUG_ON(ret);
324 } else {
325 /* the hole we're creating ends at the end
326 * of the info struct, just free the info
327 */
328 kfree(info);
329 }
330
331 /* step two, insert a new info struct to cover anything
332 * before the hole
333 */
334 ret = __btrfs_add_free_space(block_group, old_start,
335 offset - old_start);
336 BUG_ON(ret);
337 } else {
338 WARN_ON(1);
339 }
340out:
341 return ret;
342}
343
344int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
345 u64 offset, u64 bytes)
346{
347 int ret;
348 struct btrfs_free_space *sp;
349
350 mutex_lock(&block_group->alloc_mutex);
351 ret = __btrfs_add_free_space(block_group, offset, bytes);
352 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
353 BUG_ON(!sp);
354 mutex_unlock(&block_group->alloc_mutex);
355
356 return ret;
357}
358
359int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
360 u64 offset, u64 bytes)
361{
362 int ret;
363 struct btrfs_free_space *sp;
364
365 ret = __btrfs_add_free_space(block_group, offset, bytes);
366 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
367 BUG_ON(!sp);
368
369 return ret;
370}
371
372int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
373 u64 offset, u64 bytes)
374{
375 int ret = 0;
376
377 mutex_lock(&block_group->alloc_mutex);
378 ret = __btrfs_remove_free_space(block_group, offset, bytes);
379 mutex_unlock(&block_group->alloc_mutex);
380
381 return ret;
382}
383
384int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
385 u64 offset, u64 bytes)
386{
387 int ret;
388
389 ret = __btrfs_remove_free_space(block_group, offset, bytes);
390
391 return ret;
392}
393
394void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
395 u64 bytes)
396{
397 struct btrfs_free_space *info;
398 struct rb_node *n;
399 int count = 0;
400
401 for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
402 info = rb_entry(n, struct btrfs_free_space, offset_index);
403 if (info->bytes >= bytes)
404 count++;
405 //printk(KERN_INFO "offset=%Lu, bytes=%Lu\n", info->offset,
406 // info->bytes);
407 }
408 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
409 "\n", count);
410}
411
412u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
413{
414 struct btrfs_free_space *info;
415 struct rb_node *n;
416 u64 ret = 0;
417
418 for (n = rb_first(&block_group->free_space_offset); n;
419 n = rb_next(n)) {
420 info = rb_entry(n, struct btrfs_free_space, offset_index);
421 ret += info->bytes;
422 }
423
424 return ret;
425}
426
427void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
428{
429 struct btrfs_free_space *info;
430 struct rb_node *node;
431
432 mutex_lock(&block_group->alloc_mutex);
433 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
434 info = rb_entry(node, struct btrfs_free_space, bytes_index);
435 unlink_free_space(block_group, info);
436 kfree(info);
437 if (need_resched()) {
438 mutex_unlock(&block_group->alloc_mutex);
439 cond_resched();
440 mutex_lock(&block_group->alloc_mutex);
441 }
442 }
443 mutex_unlock(&block_group->alloc_mutex);
444}
445
446struct btrfs_free_space *btrfs_find_free_space_offset(struct
447 btrfs_block_group_cache
448 *block_group, u64 offset,
449 u64 bytes)
450{
451 struct btrfs_free_space *ret;
452
453 mutex_lock(&block_group->alloc_mutex);
454 ret = tree_search_offset(&block_group->free_space_offset, offset,
455 bytes, 0);
456 mutex_unlock(&block_group->alloc_mutex);
457
458 return ret;
459}
460
461struct btrfs_free_space *btrfs_find_free_space_bytes(struct
462 btrfs_block_group_cache
463 *block_group, u64 offset,
464 u64 bytes)
465{
466 struct btrfs_free_space *ret;
467
468 mutex_lock(&block_group->alloc_mutex);
469
470 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
471 mutex_unlock(&block_group->alloc_mutex);
472
473 return ret;
474}
475
476struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
477 *block_group, u64 offset,
478 u64 bytes)
479{
480 struct btrfs_free_space *ret = NULL;
481
482 ret = tree_search_offset(&block_group->free_space_offset, offset,
483 bytes, 0);
484 if (!ret)
485 ret = tree_search_bytes(&block_group->free_space_bytes,
486 offset, bytes);
487
488 return ret;
489}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
new file mode 100644
index 000000000000..2a020b276768
--- /dev/null
+++ b/fs/btrfs/hash.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __HASH__
20#define __HASH__
21
22#include "crc32c.h"
23static inline u64 btrfs_name_hash(const char *name, int len)
24{
25 return btrfs_crc32c((u32)~1, name, len);
26}
27#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
new file mode 100644
index 000000000000..d93451c66ba1
--- /dev/null
+++ b/fs/btrfs/inode-item.c
@@ -0,0 +1,206 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23int find_name_in_backref(struct btrfs_path *path, const char * name,
24 int name_len, struct btrfs_inode_ref **ref_ret)
25{
26 struct extent_buffer *leaf;
27 struct btrfs_inode_ref *ref;
28 unsigned long ptr;
29 unsigned long name_ptr;
30 u32 item_size;
31 u32 cur_offset = 0;
32 int len;
33
34 leaf = path->nodes[0];
35 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
36 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
37 while (cur_offset < item_size) {
38 ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
39 len = btrfs_inode_ref_name_len(leaf, ref);
40 name_ptr = (unsigned long)(ref + 1);
41 cur_offset += len + sizeof(*ref);
42 if (len != name_len)
43 continue;
44 if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
45 *ref_ret = ref;
46 return 1;
47 }
48 }
49 return 0;
50}
51
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root,
54 const char *name, int name_len,
55 u64 inode_objectid, u64 ref_objectid, u64 *index)
56{
57 struct btrfs_path *path;
58 struct btrfs_key key;
59 struct btrfs_inode_ref *ref;
60 struct extent_buffer *leaf;
61 unsigned long ptr;
62 unsigned long item_start;
63 u32 item_size;
64 u32 sub_item_len;
65 int ret;
66 int del_len = name_len + sizeof(*ref);
67
68 key.objectid = inode_objectid;
69 key.offset = ref_objectid;
70 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
71
72 path = btrfs_alloc_path();
73 if (!path)
74 return -ENOMEM;
75
76 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
77 if (ret > 0) {
78 ret = -ENOENT;
79 goto out;
80 } else if (ret < 0) {
81 goto out;
82 }
83 if (!find_name_in_backref(path, name, name_len, &ref)) {
84 ret = -ENOENT;
85 goto out;
86 }
87 leaf = path->nodes[0];
88 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
89
90 if (index)
91 *index = btrfs_inode_ref_index(leaf, ref);
92
93 if (del_len == item_size) {
94 ret = btrfs_del_item(trans, root, path);
95 goto out;
96 }
97 ptr = (unsigned long)ref;
98 sub_item_len = name_len + sizeof(*ref);
99 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
100 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
101 item_size - (ptr + sub_item_len - item_start));
102 ret = btrfs_truncate_item(trans, root, path,
103 item_size - sub_item_len, 1);
104 BUG_ON(ret);
105out:
106 btrfs_free_path(path);
107 return ret;
108}
109
110int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
111 struct btrfs_root *root,
112 const char *name, int name_len,
113 u64 inode_objectid, u64 ref_objectid, u64 index)
114{
115 struct btrfs_path *path;
116 struct btrfs_key key;
117 struct btrfs_inode_ref *ref;
118 unsigned long ptr;
119 int ret;
120 int ins_len = name_len + sizeof(*ref);
121
122 key.objectid = inode_objectid;
123 key.offset = ref_objectid;
124 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
125
126 path = btrfs_alloc_path();
127 if (!path)
128 return -ENOMEM;
129
130 ret = btrfs_insert_empty_item(trans, root, path, &key,
131 ins_len);
132 if (ret == -EEXIST) {
133 u32 old_size;
134
135 if (find_name_in_backref(path, name, name_len, &ref))
136 goto out;
137
138 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
139 ret = btrfs_extend_item(trans, root, path, ins_len);
140 BUG_ON(ret);
141 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
142 struct btrfs_inode_ref);
143 ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
144 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
145 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
146 ptr = (unsigned long)(ref + 1);
147 ret = 0;
148 } else if (ret < 0) {
149 goto out;
150 } else {
151 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
152 struct btrfs_inode_ref);
153 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
154 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
155 ptr = (unsigned long)(ref + 1);
156 }
157 write_extent_buffer(path->nodes[0], name, ptr, name_len);
158 btrfs_mark_buffer_dirty(path->nodes[0]);
159
160out:
161 btrfs_free_path(path);
162 return ret;
163}
164
165int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
166 struct btrfs_root *root,
167 struct btrfs_path *path, u64 objectid)
168{
169 struct btrfs_key key;
170 int ret;
171 key.objectid = objectid;
172 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
173 key.offset = 0;
174
175 ret = btrfs_insert_empty_item(trans, root, path, &key,
176 sizeof(struct btrfs_inode_item));
177 if (ret == 0 && objectid > root->highest_inode)
178 root->highest_inode = objectid;
179 return ret;
180}
181
182int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
183 *root, struct btrfs_path *path,
184 struct btrfs_key *location, int mod)
185{
186 int ins_len = mod < 0 ? -1 : 0;
187 int cow = mod != 0;
188 int ret;
189 int slot;
190 struct extent_buffer *leaf;
191 struct btrfs_key found_key;
192
193 ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
194 if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
195 location->offset == (u64)-1 && path->slots[0] != 0) {
196 slot = path->slots[0] - 1;
197 leaf = path->nodes[0];
198 btrfs_item_key_to_cpu(leaf, &found_key, slot);
199 if (found_key.objectid == location->objectid &&
200 btrfs_key_type(&found_key) == btrfs_key_type(location)) {
201 path->slots[0]--;
202 return 0;
203 }
204 }
205 return ret;
206}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
new file mode 100644
index 000000000000..80038c5ef7cf
--- /dev/null
+++ b/fs/btrfs/inode-map.c
@@ -0,0 +1,145 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
24{
25 struct btrfs_path *path;
26 int ret;
27 struct extent_buffer *l;
28 struct btrfs_key search_key;
29 struct btrfs_key found_key;
30 int slot;
31
32 path = btrfs_alloc_path();
33 BUG_ON(!path);
34
35 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
36 search_key.type = -1;
37 search_key.offset = (u64)-1;
38 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
39 if (ret < 0)
40 goto error;
41 BUG_ON(ret == 0);
42 if (path->slots[0] > 0) {
43 slot = path->slots[0] - 1;
44 l = path->nodes[0];
45 btrfs_item_key_to_cpu(l, &found_key, slot);
46 *objectid = found_key.objectid;
47 } else {
48 *objectid = BTRFS_FIRST_FREE_OBJECTID;
49 }
50 ret = 0;
51error:
52 btrfs_free_path(path);
53 return ret;
54}
55
56/*
57 * walks the btree of allocated inodes and find a hole.
58 */
59int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root,
61 u64 dirid, u64 *objectid)
62{
63 struct btrfs_path *path;
64 struct btrfs_key key;
65 int ret;
66 int slot = 0;
67 u64 last_ino = 0;
68 int start_found;
69 struct extent_buffer *l;
70 struct btrfs_key search_key;
71 u64 search_start = dirid;
72
73 mutex_lock(&root->objectid_mutex);
74 if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
75 root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
76 *objectid = ++root->last_inode_alloc;
77 mutex_unlock(&root->objectid_mutex);
78 return 0;
79 }
80 path = btrfs_alloc_path();
81 BUG_ON(!path);
82 search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
83 search_key.objectid = search_start;
84 search_key.type = 0;
85 search_key.offset = 0;
86
87 btrfs_init_path(path);
88 start_found = 0;
89 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
90 if (ret < 0)
91 goto error;
92
93 while (1) {
94 l = path->nodes[0];
95 slot = path->slots[0];
96 if (slot >= btrfs_header_nritems(l)) {
97 ret = btrfs_next_leaf(root, path);
98 if (ret == 0)
99 continue;
100 if (ret < 0)
101 goto error;
102 if (!start_found) {
103 *objectid = search_start;
104 start_found = 1;
105 goto found;
106 }
107 *objectid = last_ino > search_start ?
108 last_ino : search_start;
109 goto found;
110 }
111 btrfs_item_key_to_cpu(l, &key, slot);
112 if (key.objectid >= search_start) {
113 if (start_found) {
114 if (last_ino < search_start)
115 last_ino = search_start;
116 if (key.objectid > last_ino) {
117 *objectid = last_ino;
118 goto found;
119 }
120 } else if (key.objectid > search_start) {
121 *objectid = search_start;
122 goto found;
123 }
124 }
125 if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
126 break;
127
128 start_found = 1;
129 last_ino = key.objectid + 1;
130 path->slots[0]++;
131 }
132 // FIXME -ENOSPC
133 BUG_ON(1);
134found:
135 btrfs_release_path(root, path);
136 btrfs_free_path(path);
137 BUG_ON(*objectid < search_start);
138 mutex_unlock(&root->objectid_mutex);
139 return 0;
140error:
141 btrfs_release_path(root, path);
142 btrfs_free_path(path);
143 mutex_unlock(&root->objectid_mutex);
144 return ret;
145}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
new file mode 100644
index 000000000000..806caacff86c
--- /dev/null
+++ b/fs/btrfs/inode.c
@@ -0,0 +1,5022 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
40#include <linux/falloc.h>
41#include "ctree.h"
42#include "disk-io.h"
43#include "transaction.h"
44#include "btrfs_inode.h"
45#include "ioctl.h"
46#include "print-tree.h"
47#include "volumes.h"
48#include "ordered-data.h"
49#include "xattr.h"
50#include "compat.h"
51#include "tree-log.h"
52#include "ref-cache.h"
53#include "compression.h"
54
55struct btrfs_iget_args {
56 u64 ino;
57 struct btrfs_root *root;
58};
59
60static struct inode_operations btrfs_dir_inode_operations;
61static struct inode_operations btrfs_symlink_inode_operations;
62static struct inode_operations btrfs_dir_ro_inode_operations;
63static struct inode_operations btrfs_special_inode_operations;
64static struct inode_operations btrfs_file_inode_operations;
65static struct address_space_operations btrfs_aops;
66static struct address_space_operations btrfs_symlink_aops;
67static struct file_operations btrfs_dir_file_operations;
68static struct extent_io_ops btrfs_extent_io_ops;
69
70static struct kmem_cache *btrfs_inode_cachep;
71struct kmem_cache *btrfs_trans_handle_cachep;
72struct kmem_cache *btrfs_transaction_cachep;
73struct kmem_cache *btrfs_bit_radix_cachep;
74struct kmem_cache *btrfs_path_cachep;
75
76#define S_SHIFT 12
77static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
78 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
79 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
80 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
81 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
82 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
83 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
84 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
85};
86
87static void btrfs_truncate(struct inode *inode);
88static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
89static noinline int cow_file_range(struct inode *inode,
90 struct page *locked_page,
91 u64 start, u64 end, int *page_started,
92 unsigned long *nr_written, int unlock);
93
94/*
95 * a very lame attempt at stopping writes when the FS is 85% full. There
96 * are countless ways this is incorrect, but it is better than nothing.
97 */
98int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
99 int for_del)
100{
101 u64 total;
102 u64 used;
103 u64 thresh;
104 unsigned long flags;
105 int ret = 0;
106
107 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
108 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
109 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
110 if (for_del)
111 thresh = total * 90;
112 else
113 thresh = total * 85;
114
115 do_div(thresh, 100);
116
117 if (used + root->fs_info->delalloc_bytes + num_required > thresh)
118 ret = -ENOSPC;
119 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
120 return ret;
121}
122
123/*
124 * this does all the hard work for inserting an inline extent into
125 * the btree. The caller should have done a btrfs_drop_extents so that
126 * no overlapping inline items exist in the btree
127 */
128static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
129 struct btrfs_root *root, struct inode *inode,
130 u64 start, size_t size, size_t compressed_size,
131 struct page **compressed_pages)
132{
133 struct btrfs_key key;
134 struct btrfs_path *path;
135 struct extent_buffer *leaf;
136 struct page *page = NULL;
137 char *kaddr;
138 unsigned long ptr;
139 struct btrfs_file_extent_item *ei;
140 int err = 0;
141 int ret;
142 size_t cur_size = size;
143 size_t datasize;
144 unsigned long offset;
145 int use_compress = 0;
146
147 if (compressed_size && compressed_pages) {
148 use_compress = 1;
149 cur_size = compressed_size;
150 }
151
152 path = btrfs_alloc_path(); if (!path)
153 return -ENOMEM;
154
155 btrfs_set_trans_block_group(trans, inode);
156
157 key.objectid = inode->i_ino;
158 key.offset = start;
159 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
160 inode_add_bytes(inode, size);
161 datasize = btrfs_file_extent_calc_inline_size(cur_size);
162
163 inode_add_bytes(inode, size);
164 ret = btrfs_insert_empty_item(trans, root, path, &key,
165 datasize);
166 BUG_ON(ret);
167 if (ret) {
168 err = ret;
169 printk("got bad ret %d\n", ret);
170 goto fail;
171 }
172 leaf = path->nodes[0];
173 ei = btrfs_item_ptr(leaf, path->slots[0],
174 struct btrfs_file_extent_item);
175 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
176 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
177 btrfs_set_file_extent_encryption(leaf, ei, 0);
178 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
179 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
180 ptr = btrfs_file_extent_inline_start(ei);
181
182 if (use_compress) {
183 struct page *cpage;
184 int i = 0;
185 while(compressed_size > 0) {
186 cpage = compressed_pages[i];
187 cur_size = min_t(unsigned long, compressed_size,
188 PAGE_CACHE_SIZE);
189
190 kaddr = kmap(cpage);
191 write_extent_buffer(leaf, kaddr, ptr, cur_size);
192 kunmap(cpage);
193
194 i++;
195 ptr += cur_size;
196 compressed_size -= cur_size;
197 }
198 btrfs_set_file_extent_compression(leaf, ei,
199 BTRFS_COMPRESS_ZLIB);
200 } else {
201 page = find_get_page(inode->i_mapping,
202 start >> PAGE_CACHE_SHIFT);
203 btrfs_set_file_extent_compression(leaf, ei, 0);
204 kaddr = kmap_atomic(page, KM_USER0);
205 offset = start & (PAGE_CACHE_SIZE - 1);
206 write_extent_buffer(leaf, kaddr + offset, ptr, size);
207 kunmap_atomic(kaddr, KM_USER0);
208 page_cache_release(page);
209 }
210 btrfs_mark_buffer_dirty(leaf);
211 btrfs_free_path(path);
212
213 BTRFS_I(inode)->disk_i_size = inode->i_size;
214 btrfs_update_inode(trans, root, inode);
215 return 0;
216fail:
217 btrfs_free_path(path);
218 return err;
219}
220
221
222/*
223 * conditionally insert an inline extent into the file. This
224 * does the checks required to make sure the data is small enough
225 * to fit as an inline extent.
226 */
227static int cow_file_range_inline(struct btrfs_trans_handle *trans,
228 struct btrfs_root *root,
229 struct inode *inode, u64 start, u64 end,
230 size_t compressed_size,
231 struct page **compressed_pages)
232{
233 u64 isize = i_size_read(inode);
234 u64 actual_end = min(end + 1, isize);
235 u64 inline_len = actual_end - start;
236 u64 aligned_end = (end + root->sectorsize - 1) &
237 ~((u64)root->sectorsize - 1);
238 u64 hint_byte;
239 u64 data_len = inline_len;
240 int ret;
241
242 if (compressed_size)
243 data_len = compressed_size;
244
245 if (start > 0 ||
246 actual_end >= PAGE_CACHE_SIZE ||
247 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
248 (!compressed_size &&
249 (actual_end & (root->sectorsize - 1)) == 0) ||
250 end + 1 < isize ||
251 data_len > root->fs_info->max_inline) {
252 return 1;
253 }
254
255 ret = btrfs_drop_extents(trans, root, inode, start,
256 aligned_end, start, &hint_byte);
257 BUG_ON(ret);
258
259 if (isize > actual_end)
260 inline_len = min_t(u64, isize, actual_end);
261 ret = insert_inline_extent(trans, root, inode, start,
262 inline_len, compressed_size,
263 compressed_pages);
264 BUG_ON(ret);
265 btrfs_drop_extent_cache(inode, start, aligned_end, 0);
266 return 0;
267}
268
269struct async_extent {
270 u64 start;
271 u64 ram_size;
272 u64 compressed_size;
273 struct page **pages;
274 unsigned long nr_pages;
275 struct list_head list;
276};
277
278struct async_cow {
279 struct inode *inode;
280 struct btrfs_root *root;
281 struct page *locked_page;
282 u64 start;
283 u64 end;
284 struct list_head extents;
285 struct btrfs_work work;
286};
287
288static noinline int add_async_extent(struct async_cow *cow,
289 u64 start, u64 ram_size,
290 u64 compressed_size,
291 struct page **pages,
292 unsigned long nr_pages)
293{
294 struct async_extent *async_extent;
295
296 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
297 async_extent->start = start;
298 async_extent->ram_size = ram_size;
299 async_extent->compressed_size = compressed_size;
300 async_extent->pages = pages;
301 async_extent->nr_pages = nr_pages;
302 list_add_tail(&async_extent->list, &cow->extents);
303 return 0;
304}
305
306/*
307 * we create compressed extents in two phases. The first
308 * phase compresses a range of pages that have already been
309 * locked (both pages and state bits are locked).
310 *
311 * This is done inside an ordered work queue, and the compression
312 * is spread across many cpus. The actual IO submission is step
313 * two, and the ordered work queue takes care of making sure that
314 * happens in the same order things were put onto the queue by
315 * writepages and friends.
316 *
317 * If this code finds it can't get good compression, it puts an
318 * entry onto the work queue to write the uncompressed bytes. This
319 * makes sure that both compressed inodes and uncompressed inodes
320 * are written in the same order that pdflush sent them down.
321 */
322static noinline int compress_file_range(struct inode *inode,
323 struct page *locked_page,
324 u64 start, u64 end,
325 struct async_cow *async_cow,
326 int *num_added)
327{
328 struct btrfs_root *root = BTRFS_I(inode)->root;
329 struct btrfs_trans_handle *trans;
330 u64 num_bytes;
331 u64 orig_start;
332 u64 disk_num_bytes;
333 u64 blocksize = root->sectorsize;
334 u64 actual_end;
335 int ret = 0;
336 struct page **pages = NULL;
337 unsigned long nr_pages;
338 unsigned long nr_pages_ret = 0;
339 unsigned long total_compressed = 0;
340 unsigned long total_in = 0;
341 unsigned long max_compressed = 128 * 1024;
342 unsigned long max_uncompressed = 128 * 1024;
343 int i;
344 int will_compress;
345
346 orig_start = start;
347
348again:
349 will_compress = 0;
350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
351 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
352
353 actual_end = min_t(u64, i_size_read(inode), end + 1);
354 total_compressed = actual_end - start;
355
356 /* we want to make sure that amount of ram required to uncompress
357 * an extent is reasonable, so we limit the total size in ram
358 * of a compressed extent to 128k. This is a crucial number
359 * because it also controls how easily we can spread reads across
360 * cpus for decompression.
361 *
362 * We also want to make sure the amount of IO required to do
363 * a random read is reasonably small, so we limit the size of
364 * a compressed extent to 128k.
365 */
366 total_compressed = min(total_compressed, max_uncompressed);
367 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
368 num_bytes = max(blocksize, num_bytes);
369 disk_num_bytes = num_bytes;
370 total_in = 0;
371 ret = 0;
372
373 /*
374 * we do compression for mount -o compress and when the
375 * inode has not been flagged as nocompress. This flag can
376 * change at any time if we discover bad compression ratios.
377 */
378 if (!btrfs_test_flag(inode, NOCOMPRESS) &&
379 btrfs_test_opt(root, COMPRESS)) {
380 WARN_ON(pages);
381 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
382
383 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
384 total_compressed, pages,
385 nr_pages, &nr_pages_ret,
386 &total_in,
387 &total_compressed,
388 max_compressed);
389
390 if (!ret) {
391 unsigned long offset = total_compressed &
392 (PAGE_CACHE_SIZE - 1);
393 struct page *page = pages[nr_pages_ret - 1];
394 char *kaddr;
395
396 /* zero the tail end of the last page, we might be
397 * sending it down to disk
398 */
399 if (offset) {
400 kaddr = kmap_atomic(page, KM_USER0);
401 memset(kaddr + offset, 0,
402 PAGE_CACHE_SIZE - offset);
403 kunmap_atomic(kaddr, KM_USER0);
404 }
405 will_compress = 1;
406 }
407 }
408 if (start == 0) {
409 trans = btrfs_join_transaction(root, 1);
410 BUG_ON(!trans);
411 btrfs_set_trans_block_group(trans, inode);
412
413 /* lets try to make an inline extent */
414 if (ret || total_in < (actual_end - start)) {
415 /* we didn't compress the entire range, try
416 * to make an uncompressed inline extent.
417 */
418 ret = cow_file_range_inline(trans, root, inode,
419 start, end, 0, NULL);
420 } else {
421 /* try making a compressed inline extent */
422 ret = cow_file_range_inline(trans, root, inode,
423 start, end,
424 total_compressed, pages);
425 }
426 btrfs_end_transaction(trans, root);
427 if (ret == 0) {
428 /*
429 * inline extent creation worked, we don't need
430 * to create any more async work items. Unlock
431 * and free up our temp pages.
432 */
433 extent_clear_unlock_delalloc(inode,
434 &BTRFS_I(inode)->io_tree,
435 start, end, NULL, 1, 0,
436 0, 1, 1, 1);
437 ret = 0;
438 goto free_pages_out;
439 }
440 }
441
442 if (will_compress) {
443 /*
444 * we aren't doing an inline extent round the compressed size
445 * up to a block size boundary so the allocator does sane
446 * things
447 */
448 total_compressed = (total_compressed + blocksize - 1) &
449 ~(blocksize - 1);
450
451 /*
452 * one last check to make sure the compression is really a
453 * win, compare the page count read with the blocks on disk
454 */
455 total_in = (total_in + PAGE_CACHE_SIZE - 1) &
456 ~(PAGE_CACHE_SIZE - 1);
457 if (total_compressed >= total_in) {
458 will_compress = 0;
459 } else {
460 disk_num_bytes = total_compressed;
461 num_bytes = total_in;
462 }
463 }
464 if (!will_compress && pages) {
465 /*
466 * the compression code ran but failed to make things smaller,
467 * free any pages it allocated and our page pointer array
468 */
469 for (i = 0; i < nr_pages_ret; i++) {
470 WARN_ON(pages[i]->mapping);
471 page_cache_release(pages[i]);
472 }
473 kfree(pages);
474 pages = NULL;
475 total_compressed = 0;
476 nr_pages_ret = 0;
477
478 /* flag the file so we don't compress in the future */
479 btrfs_set_flag(inode, NOCOMPRESS);
480 }
481 if (will_compress) {
482 *num_added += 1;
483
484 /* the async work queues will take care of doing actual
485 * allocation on disk for these compressed pages,
486 * and will submit them to the elevator.
487 */
488 add_async_extent(async_cow, start, num_bytes,
489 total_compressed, pages, nr_pages_ret);
490
491 if (start + num_bytes < end) {
492 start += num_bytes;
493 pages = NULL;
494 cond_resched();
495 goto again;
496 }
497 } else {
498 /*
499 * No compression, but we still need to write the pages in
500 * the file we've been given so far. redirty the locked
501 * page if it corresponds to our extent and set things up
502 * for the async work queue to run cow_file_range to do
503 * the normal delalloc dance
504 */
505 if (page_offset(locked_page) >= start &&
506 page_offset(locked_page) <= end) {
507 __set_page_dirty_nobuffers(locked_page);
508 /* unlocked later on in the async handlers */
509 }
510 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
511 *num_added += 1;
512 }
513
514out:
515 return 0;
516
517free_pages_out:
518 for (i = 0; i < nr_pages_ret; i++) {
519 WARN_ON(pages[i]->mapping);
520 page_cache_release(pages[i]);
521 }
522 if (pages)
523 kfree(pages);
524
525 goto out;
526}
527
528/*
529 * phase two of compressed writeback. This is the ordered portion
530 * of the code, which only gets called in the order the work was
531 * queued. We walk all the async extents created by compress_file_range
532 * and send them down to the disk.
533 */
534static noinline int submit_compressed_extents(struct inode *inode,
535 struct async_cow *async_cow)
536{
537 struct async_extent *async_extent;
538 u64 alloc_hint = 0;
539 struct btrfs_trans_handle *trans;
540 struct btrfs_key ins;
541 struct extent_map *em;
542 struct btrfs_root *root = BTRFS_I(inode)->root;
543 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
544 struct extent_io_tree *io_tree;
545 int ret;
546
547 if (list_empty(&async_cow->extents))
548 return 0;
549
550 trans = btrfs_join_transaction(root, 1);
551
552 while(!list_empty(&async_cow->extents)) {
553 async_extent = list_entry(async_cow->extents.next,
554 struct async_extent, list);
555 list_del(&async_extent->list);
556
557 io_tree = &BTRFS_I(inode)->io_tree;
558
559 /* did the compression code fall back to uncompressed IO? */
560 if (!async_extent->pages) {
561 int page_started = 0;
562 unsigned long nr_written = 0;
563
564 lock_extent(io_tree, async_extent->start,
565 async_extent->start + async_extent->ram_size - 1,
566 GFP_NOFS);
567
568 /* allocate blocks */
569 cow_file_range(inode, async_cow->locked_page,
570 async_extent->start,
571 async_extent->start +
572 async_extent->ram_size - 1,
573 &page_started, &nr_written, 0);
574
575 /*
576 * if page_started, cow_file_range inserted an
577 * inline extent and took care of all the unlocking
578 * and IO for us. Otherwise, we need to submit
579 * all those pages down to the drive.
580 */
581 if (!page_started)
582 extent_write_locked_range(io_tree,
583 inode, async_extent->start,
584 async_extent->start +
585 async_extent->ram_size - 1,
586 btrfs_get_extent,
587 WB_SYNC_ALL);
588 kfree(async_extent);
589 cond_resched();
590 continue;
591 }
592
593 lock_extent(io_tree, async_extent->start,
594 async_extent->start + async_extent->ram_size - 1,
595 GFP_NOFS);
596 /*
597 * here we're doing allocation and writeback of the
598 * compressed pages
599 */
600 btrfs_drop_extent_cache(inode, async_extent->start,
601 async_extent->start +
602 async_extent->ram_size - 1, 0);
603
604 ret = btrfs_reserve_extent(trans, root,
605 async_extent->compressed_size,
606 async_extent->compressed_size,
607 0, alloc_hint,
608 (u64)-1, &ins, 1);
609 BUG_ON(ret);
610 em = alloc_extent_map(GFP_NOFS);
611 em->start = async_extent->start;
612 em->len = async_extent->ram_size;
613 em->orig_start = em->start;
614
615 em->block_start = ins.objectid;
616 em->block_len = ins.offset;
617 em->bdev = root->fs_info->fs_devices->latest_bdev;
618 set_bit(EXTENT_FLAG_PINNED, &em->flags);
619 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
620
621 while(1) {
622 spin_lock(&em_tree->lock);
623 ret = add_extent_mapping(em_tree, em);
624 spin_unlock(&em_tree->lock);
625 if (ret != -EEXIST) {
626 free_extent_map(em);
627 break;
628 }
629 btrfs_drop_extent_cache(inode, async_extent->start,
630 async_extent->start +
631 async_extent->ram_size - 1, 0);
632 }
633
634 ret = btrfs_add_ordered_extent(inode, async_extent->start,
635 ins.objectid,
636 async_extent->ram_size,
637 ins.offset,
638 BTRFS_ORDERED_COMPRESSED);
639 BUG_ON(ret);
640
641 btrfs_end_transaction(trans, root);
642
643 /*
644 * clear dirty, set writeback and unlock the pages.
645 */
646 extent_clear_unlock_delalloc(inode,
647 &BTRFS_I(inode)->io_tree,
648 async_extent->start,
649 async_extent->start +
650 async_extent->ram_size - 1,
651 NULL, 1, 1, 0, 1, 1, 0);
652
653 ret = btrfs_submit_compressed_write(inode,
654 async_extent->start,
655 async_extent->ram_size,
656 ins.objectid,
657 ins.offset, async_extent->pages,
658 async_extent->nr_pages);
659
660 BUG_ON(ret);
661 trans = btrfs_join_transaction(root, 1);
662 alloc_hint = ins.objectid + ins.offset;
663 kfree(async_extent);
664 cond_resched();
665 }
666
667 btrfs_end_transaction(trans, root);
668 return 0;
669}
670
671/*
672 * when extent_io.c finds a delayed allocation range in the file,
673 * the call backs end up in this code. The basic idea is to
674 * allocate extents on disk for the range, and create ordered data structs
675 * in ram to track those extents.
676 *
677 * locked_page is the page that writepage had locked already. We use
678 * it to make sure we don't do extra locks or unlocks.
679 *
680 * *page_started is set to one if we unlock locked_page and do everything
681 * required to start IO on it. It may be clean and already done with
682 * IO when we return.
683 */
684static noinline int cow_file_range(struct inode *inode,
685 struct page *locked_page,
686 u64 start, u64 end, int *page_started,
687 unsigned long *nr_written,
688 int unlock)
689{
690 struct btrfs_root *root = BTRFS_I(inode)->root;
691 struct btrfs_trans_handle *trans;
692 u64 alloc_hint = 0;
693 u64 num_bytes;
694 unsigned long ram_size;
695 u64 disk_num_bytes;
696 u64 cur_alloc_size;
697 u64 blocksize = root->sectorsize;
698 u64 actual_end;
699 struct btrfs_key ins;
700 struct extent_map *em;
701 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
702 int ret = 0;
703
704 trans = btrfs_join_transaction(root, 1);
705 BUG_ON(!trans);
706 btrfs_set_trans_block_group(trans, inode);
707
708 actual_end = min_t(u64, i_size_read(inode), end + 1);
709
710 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
711 num_bytes = max(blocksize, num_bytes);
712 disk_num_bytes = num_bytes;
713 ret = 0;
714
715 if (start == 0) {
716 /* lets try to make an inline extent */
717 ret = cow_file_range_inline(trans, root, inode,
718 start, end, 0, NULL);
719 if (ret == 0) {
720 extent_clear_unlock_delalloc(inode,
721 &BTRFS_I(inode)->io_tree,
722 start, end, NULL, 1, 1,
723 1, 1, 1, 1);
724 *nr_written = *nr_written +
725 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
726 *page_started = 1;
727 ret = 0;
728 goto out;
729 }
730 }
731
732 BUG_ON(disk_num_bytes >
733 btrfs_super_total_bytes(&root->fs_info->super_copy));
734
735 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
736
737 while(disk_num_bytes > 0) {
738 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
739 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
740 root->sectorsize, 0, alloc_hint,
741 (u64)-1, &ins, 1);
742 if (ret) {
743 BUG();
744 }
745 em = alloc_extent_map(GFP_NOFS);
746 em->start = start;
747 em->orig_start = em->start;
748
749 ram_size = ins.offset;
750 em->len = ins.offset;
751
752 em->block_start = ins.objectid;
753 em->block_len = ins.offset;
754 em->bdev = root->fs_info->fs_devices->latest_bdev;
755 set_bit(EXTENT_FLAG_PINNED, &em->flags);
756
757 while(1) {
758 spin_lock(&em_tree->lock);
759 ret = add_extent_mapping(em_tree, em);
760 spin_unlock(&em_tree->lock);
761 if (ret != -EEXIST) {
762 free_extent_map(em);
763 break;
764 }
765 btrfs_drop_extent_cache(inode, start,
766 start + ram_size - 1, 0);
767 }
768
769 cur_alloc_size = ins.offset;
770 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
771 ram_size, cur_alloc_size, 0);
772 BUG_ON(ret);
773
774 if (disk_num_bytes < cur_alloc_size) {
775 printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
776 cur_alloc_size);
777 break;
778 }
779 /* we're not doing compressed IO, don't unlock the first
780 * page (which the caller expects to stay locked), don't
781 * clear any dirty bits and don't set any writeback bits
782 */
783 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
784 start, start + ram_size - 1,
785 locked_page, unlock, 1,
786 1, 0, 0, 0);
787 disk_num_bytes -= cur_alloc_size;
788 num_bytes -= cur_alloc_size;
789 alloc_hint = ins.objectid + ins.offset;
790 start += cur_alloc_size;
791 }
792out:
793 ret = 0;
794 btrfs_end_transaction(trans, root);
795
796 return ret;
797}
798
799/*
800 * work queue call back to started compression on a file and pages
801 */
802static noinline void async_cow_start(struct btrfs_work *work)
803{
804 struct async_cow *async_cow;
805 int num_added = 0;
806 async_cow = container_of(work, struct async_cow, work);
807
808 compress_file_range(async_cow->inode, async_cow->locked_page,
809 async_cow->start, async_cow->end, async_cow,
810 &num_added);
811 if (num_added == 0)
812 async_cow->inode = NULL;
813}
814
815/*
816 * work queue call back to submit previously compressed pages
817 */
818static noinline void async_cow_submit(struct btrfs_work *work)
819{
820 struct async_cow *async_cow;
821 struct btrfs_root *root;
822 unsigned long nr_pages;
823
824 async_cow = container_of(work, struct async_cow, work);
825
826 root = async_cow->root;
827 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
828 PAGE_CACHE_SHIFT;
829
830 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
831
832 if (atomic_read(&root->fs_info->async_delalloc_pages) <
833 5 * 1042 * 1024 &&
834 waitqueue_active(&root->fs_info->async_submit_wait))
835 wake_up(&root->fs_info->async_submit_wait);
836
837 if (async_cow->inode) {
838 submit_compressed_extents(async_cow->inode, async_cow);
839 }
840}
841
842static noinline void async_cow_free(struct btrfs_work *work)
843{
844 struct async_cow *async_cow;
845 async_cow = container_of(work, struct async_cow, work);
846 kfree(async_cow);
847}
848
849static int cow_file_range_async(struct inode *inode, struct page *locked_page,
850 u64 start, u64 end, int *page_started,
851 unsigned long *nr_written)
852{
853 struct async_cow *async_cow;
854 struct btrfs_root *root = BTRFS_I(inode)->root;
855 unsigned long nr_pages;
856 u64 cur_end;
857 int limit = 10 * 1024 * 1042;
858
859 if (!btrfs_test_opt(root, COMPRESS)) {
860 return cow_file_range(inode, locked_page, start, end,
861 page_started, nr_written, 1);
862 }
863
864 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
865 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
866 while(start < end) {
867 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
868 async_cow->inode = inode;
869 async_cow->root = root;
870 async_cow->locked_page = locked_page;
871 async_cow->start = start;
872
873 if (btrfs_test_flag(inode, NOCOMPRESS))
874 cur_end = end;
875 else
876 cur_end = min(end, start + 512 * 1024 - 1);
877
878 async_cow->end = cur_end;
879 INIT_LIST_HEAD(&async_cow->extents);
880
881 async_cow->work.func = async_cow_start;
882 async_cow->work.ordered_func = async_cow_submit;
883 async_cow->work.ordered_free = async_cow_free;
884 async_cow->work.flags = 0;
885
886 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
887 PAGE_CACHE_SHIFT;
888 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
889
890 btrfs_queue_worker(&root->fs_info->delalloc_workers,
891 &async_cow->work);
892
893 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
894 wait_event(root->fs_info->async_submit_wait,
895 (atomic_read(&root->fs_info->async_delalloc_pages) <
896 limit));
897 }
898
899 while(atomic_read(&root->fs_info->async_submit_draining) &&
900 atomic_read(&root->fs_info->async_delalloc_pages)) {
901 wait_event(root->fs_info->async_submit_wait,
902 (atomic_read(&root->fs_info->async_delalloc_pages) ==
903 0));
904 }
905
906 *nr_written += nr_pages;
907 start = cur_end + 1;
908 }
909 *page_started = 1;
910 return 0;
911}
912
913/*
914 * when nowcow writeback call back. This checks for snapshots or COW copies
915 * of the extents that exist in the file, and COWs the file as required.
916 *
917 * If no cow copies or snapshots exist, we write directly to the existing
918 * blocks on disk
919 */
920static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
921 u64 start, u64 end, int *page_started, int force,
922 unsigned long *nr_written)
923{
924 struct btrfs_root *root = BTRFS_I(inode)->root;
925 struct btrfs_trans_handle *trans;
926 struct extent_buffer *leaf;
927 struct btrfs_path *path;
928 struct btrfs_file_extent_item *fi;
929 struct btrfs_key found_key;
930 u64 cow_start;
931 u64 cur_offset;
932 u64 extent_end;
933 u64 disk_bytenr;
934 u64 num_bytes;
935 int extent_type;
936 int ret;
937 int type;
938 int nocow;
939 int check_prev = 1;
940
941 path = btrfs_alloc_path();
942 BUG_ON(!path);
943 trans = btrfs_join_transaction(root, 1);
944 BUG_ON(!trans);
945
946 cow_start = (u64)-1;
947 cur_offset = start;
948 while (1) {
949 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
950 cur_offset, 0);
951 BUG_ON(ret < 0);
952 if (ret > 0 && path->slots[0] > 0 && check_prev) {
953 leaf = path->nodes[0];
954 btrfs_item_key_to_cpu(leaf, &found_key,
955 path->slots[0] - 1);
956 if (found_key.objectid == inode->i_ino &&
957 found_key.type == BTRFS_EXTENT_DATA_KEY)
958 path->slots[0]--;
959 }
960 check_prev = 0;
961next_slot:
962 leaf = path->nodes[0];
963 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
964 ret = btrfs_next_leaf(root, path);
965 if (ret < 0)
966 BUG_ON(1);
967 if (ret > 0)
968 break;
969 leaf = path->nodes[0];
970 }
971
972 nocow = 0;
973 disk_bytenr = 0;
974 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
975
976 if (found_key.objectid > inode->i_ino ||
977 found_key.type > BTRFS_EXTENT_DATA_KEY ||
978 found_key.offset > end)
979 break;
980
981 if (found_key.offset > cur_offset) {
982 extent_end = found_key.offset;
983 goto out_check;
984 }
985
986 fi = btrfs_item_ptr(leaf, path->slots[0],
987 struct btrfs_file_extent_item);
988 extent_type = btrfs_file_extent_type(leaf, fi);
989
990 if (extent_type == BTRFS_FILE_EXTENT_REG ||
991 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
992 struct btrfs_block_group_cache *block_group;
993 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
994 extent_end = found_key.offset +
995 btrfs_file_extent_num_bytes(leaf, fi);
996 if (extent_end <= start) {
997 path->slots[0]++;
998 goto next_slot;
999 }
1000 if (btrfs_file_extent_compression(leaf, fi) ||
1001 btrfs_file_extent_encryption(leaf, fi) ||
1002 btrfs_file_extent_other_encoding(leaf, fi))
1003 goto out_check;
1004 if (disk_bytenr == 0)
1005 goto out_check;
1006 if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1007 goto out_check;
1008 if (btrfs_cross_ref_exist(trans, root, disk_bytenr))
1009 goto out_check;
1010 block_group = btrfs_lookup_block_group(root->fs_info,
1011 disk_bytenr);
1012 if (!block_group || block_group->ro)
1013 goto out_check;
1014 disk_bytenr += btrfs_file_extent_offset(leaf, fi);
1015 nocow = 1;
1016 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1017 extent_end = found_key.offset +
1018 btrfs_file_extent_inline_len(leaf, fi);
1019 extent_end = ALIGN(extent_end, root->sectorsize);
1020 } else {
1021 BUG_ON(1);
1022 }
1023out_check:
1024 if (extent_end <= start) {
1025 path->slots[0]++;
1026 goto next_slot;
1027 }
1028 if (!nocow) {
1029 if (cow_start == (u64)-1)
1030 cow_start = cur_offset;
1031 cur_offset = extent_end;
1032 if (cur_offset > end)
1033 break;
1034 path->slots[0]++;
1035 goto next_slot;
1036 }
1037
1038 btrfs_release_path(root, path);
1039 if (cow_start != (u64)-1) {
1040 ret = cow_file_range(inode, locked_page, cow_start,
1041 found_key.offset - 1, page_started,
1042 nr_written, 1);
1043 BUG_ON(ret);
1044 cow_start = (u64)-1;
1045 }
1046
1047 disk_bytenr += cur_offset - found_key.offset;
1048 num_bytes = min(end + 1, extent_end) - cur_offset;
1049 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1050 struct extent_map *em;
1051 struct extent_map_tree *em_tree;
1052 em_tree = &BTRFS_I(inode)->extent_tree;
1053 em = alloc_extent_map(GFP_NOFS);
1054 em->start = cur_offset;
1055 em->orig_start = em->start;
1056 em->len = num_bytes;
1057 em->block_len = num_bytes;
1058 em->block_start = disk_bytenr;
1059 em->bdev = root->fs_info->fs_devices->latest_bdev;
1060 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1061 while (1) {
1062 spin_lock(&em_tree->lock);
1063 ret = add_extent_mapping(em_tree, em);
1064 spin_unlock(&em_tree->lock);
1065 if (ret != -EEXIST) {
1066 free_extent_map(em);
1067 break;
1068 }
1069 btrfs_drop_extent_cache(inode, em->start,
1070 em->start + em->len - 1, 0);
1071 }
1072 type = BTRFS_ORDERED_PREALLOC;
1073 } else {
1074 type = BTRFS_ORDERED_NOCOW;
1075 }
1076
1077 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1078 num_bytes, num_bytes, type);
1079 BUG_ON(ret);
1080
1081 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1082 cur_offset, cur_offset + num_bytes - 1,
1083 locked_page, 1, 1, 1, 0, 0, 0);
1084 cur_offset = extent_end;
1085 if (cur_offset > end)
1086 break;
1087 }
1088 btrfs_release_path(root, path);
1089
1090 if (cur_offset <= end && cow_start == (u64)-1)
1091 cow_start = cur_offset;
1092 if (cow_start != (u64)-1) {
1093 ret = cow_file_range(inode, locked_page, cow_start, end,
1094 page_started, nr_written, 1);
1095 BUG_ON(ret);
1096 }
1097
1098 ret = btrfs_end_transaction(trans, root);
1099 BUG_ON(ret);
1100 btrfs_free_path(path);
1101 return 0;
1102}
1103
1104/*
1105 * extent_io.c call back to do delayed allocation processing
1106 */
1107static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1108 u64 start, u64 end, int *page_started,
1109 unsigned long *nr_written)
1110{
1111 struct btrfs_root *root = BTRFS_I(inode)->root;
1112 int ret;
1113
1114 if (btrfs_test_opt(root, NODATACOW) ||
1115 btrfs_test_flag(inode, NODATACOW))
1116 ret = run_delalloc_nocow(inode, locked_page, start, end,
1117 page_started, 0, nr_written);
1118 else if (btrfs_test_flag(inode, PREALLOC))
1119 ret = run_delalloc_nocow(inode, locked_page, start, end,
1120 page_started, 1, nr_written);
1121 else
1122 ret = cow_file_range_async(inode, locked_page, start, end,
1123 page_started, nr_written);
1124
1125 return ret;
1126}
1127
1128/*
1129 * extent_io.c set_bit_hook, used to track delayed allocation
1130 * bytes in this file, and to maintain the list of inodes that
1131 * have pending delalloc work to be done.
1132 */
1133int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1134 unsigned long old, unsigned long bits)
1135{
1136 unsigned long flags;
1137 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1138 struct btrfs_root *root = BTRFS_I(inode)->root;
1139 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
1140 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1141 root->fs_info->delalloc_bytes += end - start + 1;
1142 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1143 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1144 &root->fs_info->delalloc_inodes);
1145 }
1146 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
1147 }
1148 return 0;
1149}
1150
1151/*
1152 * extent_io.c clear_bit_hook, see set_bit_hook for why
1153 */
1154int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1155 unsigned long old, unsigned long bits)
1156{
1157 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1158 struct btrfs_root *root = BTRFS_I(inode)->root;
1159 unsigned long flags;
1160
1161 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
1162 if (end - start + 1 > root->fs_info->delalloc_bytes) {
1163 printk("warning: delalloc account %Lu %Lu\n",
1164 end - start + 1, root->fs_info->delalloc_bytes);
1165 root->fs_info->delalloc_bytes = 0;
1166 BTRFS_I(inode)->delalloc_bytes = 0;
1167 } else {
1168 root->fs_info->delalloc_bytes -= end - start + 1;
1169 BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
1170 }
1171 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1172 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1173 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1174 }
1175 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
1176 }
1177 return 0;
1178}
1179
1180/*
1181 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1182 * we don't create bios that span stripes or chunks
1183 */
1184int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1185 size_t size, struct bio *bio,
1186 unsigned long bio_flags)
1187{
1188 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1189 struct btrfs_mapping_tree *map_tree;
1190 u64 logical = (u64)bio->bi_sector << 9;
1191 u64 length = 0;
1192 u64 map_length;
1193 int ret;
1194
1195 if (bio_flags & EXTENT_BIO_COMPRESSED)
1196 return 0;
1197
1198 length = bio->bi_size;
1199 map_tree = &root->fs_info->mapping_tree;
1200 map_length = length;
1201 ret = btrfs_map_block(map_tree, READ, logical,
1202 &map_length, NULL, 0);
1203
1204 if (map_length < length + size) {
1205 return 1;
1206 }
1207 return 0;
1208}
1209
1210/*
1211 * in order to insert checksums into the metadata in large chunks,
1212 * we wait until bio submission time. All the pages in the bio are
1213 * checksummed and sums are attached onto the ordered extent record.
1214 *
1215 * At IO completion time the cums attached on the ordered extent record
1216 * are inserted into the btree
1217 */
1218int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio,
1219 int mirror_num, unsigned long bio_flags)
1220{
1221 struct btrfs_root *root = BTRFS_I(inode)->root;
1222 int ret = 0;
1223
1224 ret = btrfs_csum_one_bio(root, inode, bio);
1225 BUG_ON(ret);
1226 return 0;
1227}
1228
1229/*
1230 * in order to insert checksums into the metadata in large chunks,
1231 * we wait until bio submission time. All the pages in the bio are
1232 * checksummed and sums are attached onto the ordered extent record.
1233 *
1234 * At IO completion time the cums attached on the ordered extent record
1235 * are inserted into the btree
1236 */
1237int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1238 int mirror_num, unsigned long bio_flags)
1239{
1240 struct btrfs_root *root = BTRFS_I(inode)->root;
1241 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
1242}
1243
1244/*
1245 * extent_io.c submission hook. This does the right thing for csum calculation on write,
1246 * or reading the csums from the tree before a read
1247 */
1248int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1249 int mirror_num, unsigned long bio_flags)
1250{
1251 struct btrfs_root *root = BTRFS_I(inode)->root;
1252 int ret = 0;
1253 int skip_sum;
1254
1255 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1256 BUG_ON(ret);
1257
1258 skip_sum = btrfs_test_opt(root, NODATASUM) ||
1259 btrfs_test_flag(inode, NODATASUM);
1260
1261 if (!(rw & (1 << BIO_RW))) {
1262
1263 if (bio_flags & EXTENT_BIO_COMPRESSED)
1264 return btrfs_submit_compressed_read(inode, bio,
1265 mirror_num, bio_flags);
1266 else if (!skip_sum)
1267 btrfs_lookup_bio_sums(root, inode, bio);
1268 goto mapit;
1269 } else if (!skip_sum) {
1270 /* we're doing a write, do the async checksumming */
1271 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1272 inode, rw, bio, mirror_num,
1273 bio_flags, __btrfs_submit_bio_start,
1274 __btrfs_submit_bio_done);
1275 }
1276
1277mapit:
1278 return btrfs_map_bio(root, rw, bio, mirror_num, 0);
1279}
1280
1281/*
1282 * given a list of ordered sums record them in the inode. This happens
1283 * at IO completion time based on sums calculated at bio submission time.
1284 */
1285static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1286 struct inode *inode, u64 file_offset,
1287 struct list_head *list)
1288{
1289 struct list_head *cur;
1290 struct btrfs_ordered_sum *sum;
1291
1292 btrfs_set_trans_block_group(trans, inode);
1293 list_for_each(cur, list) {
1294 sum = list_entry(cur, struct btrfs_ordered_sum, list);
1295 btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
1296 inode, sum);
1297 }
1298 return 0;
1299}
1300
1301int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
1302{
1303 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) {
1304 WARN_ON(1);
1305 }
1306 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1307 GFP_NOFS);
1308}
1309
1310/* see btrfs_writepage_start_hook for details on why this is required */
1311struct btrfs_writepage_fixup {
1312 struct page *page;
1313 struct btrfs_work work;
1314};
1315
1316void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1317{
1318 struct btrfs_writepage_fixup *fixup;
1319 struct btrfs_ordered_extent *ordered;
1320 struct page *page;
1321 struct inode *inode;
1322 u64 page_start;
1323 u64 page_end;
1324
1325 fixup = container_of(work, struct btrfs_writepage_fixup, work);
1326 page = fixup->page;
1327again:
1328 lock_page(page);
1329 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1330 ClearPageChecked(page);
1331 goto out_page;
1332 }
1333
1334 inode = page->mapping->host;
1335 page_start = page_offset(page);
1336 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1337
1338 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1339
1340 /* already ordered? We're done */
1341 if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
1342 EXTENT_ORDERED, 0)) {
1343 goto out;
1344 }
1345
1346 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1347 if (ordered) {
1348 unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
1349 page_end, GFP_NOFS);
1350 unlock_page(page);
1351 btrfs_start_ordered_extent(inode, ordered, 1);
1352 goto again;
1353 }
1354
1355 btrfs_set_extent_delalloc(inode, page_start, page_end);
1356 ClearPageChecked(page);
1357out:
1358 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1359out_page:
1360 unlock_page(page);
1361 page_cache_release(page);
1362}
1363
1364/*
1365 * There are a few paths in the higher layers of the kernel that directly
1366 * set the page dirty bit without asking the filesystem if it is a
1367 * good idea. This causes problems because we want to make sure COW
1368 * properly happens and the data=ordered rules are followed.
1369 *
1370 * In our case any range that doesn't have the ORDERED bit set
1371 * hasn't been properly setup for IO. We kick off an async process
1372 * to fix it up. The async helper will wait for ordered extents, set
1373 * the delalloc bit and make it safe to write the page.
1374 */
1375int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1376{
1377 struct inode *inode = page->mapping->host;
1378 struct btrfs_writepage_fixup *fixup;
1379 struct btrfs_root *root = BTRFS_I(inode)->root;
1380 int ret;
1381
1382 ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1383 EXTENT_ORDERED, 0);
1384 if (ret)
1385 return 0;
1386
1387 if (PageChecked(page))
1388 return -EAGAIN;
1389
1390 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1391 if (!fixup)
1392 return -EAGAIN;
1393
1394 SetPageChecked(page);
1395 page_cache_get(page);
1396 fixup->work.func = btrfs_writepage_fixup_worker;
1397 fixup->page = page;
1398 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1399 return -EAGAIN;
1400}
1401
1402static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1403 struct inode *inode, u64 file_pos,
1404 u64 disk_bytenr, u64 disk_num_bytes,
1405 u64 num_bytes, u64 ram_bytes,
1406 u8 compression, u8 encryption,
1407 u16 other_encoding, int extent_type)
1408{
1409 struct btrfs_root *root = BTRFS_I(inode)->root;
1410 struct btrfs_file_extent_item *fi;
1411 struct btrfs_path *path;
1412 struct extent_buffer *leaf;
1413 struct btrfs_key ins;
1414 u64 hint;
1415 int ret;
1416
1417 path = btrfs_alloc_path();
1418 BUG_ON(!path);
1419
1420 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1421 file_pos + num_bytes, file_pos, &hint);
1422 BUG_ON(ret);
1423
1424 ins.objectid = inode->i_ino;
1425 ins.offset = file_pos;
1426 ins.type = BTRFS_EXTENT_DATA_KEY;
1427 ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1428 BUG_ON(ret);
1429 leaf = path->nodes[0];
1430 fi = btrfs_item_ptr(leaf, path->slots[0],
1431 struct btrfs_file_extent_item);
1432 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1433 btrfs_set_file_extent_type(leaf, fi, extent_type);
1434 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1435 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1436 btrfs_set_file_extent_offset(leaf, fi, 0);
1437 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1438 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1439 btrfs_set_file_extent_compression(leaf, fi, compression);
1440 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1441 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1442 btrfs_mark_buffer_dirty(leaf);
1443
1444 inode_add_bytes(inode, num_bytes);
1445 btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
1446
1447 ins.objectid = disk_bytenr;
1448 ins.offset = disk_num_bytes;
1449 ins.type = BTRFS_EXTENT_ITEM_KEY;
1450 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
1451 root->root_key.objectid,
1452 trans->transid, inode->i_ino, &ins);
1453 BUG_ON(ret);
1454
1455 btrfs_free_path(path);
1456 return 0;
1457}
1458
1459/* as ordered data IO finishes, this gets called so we can finish
1460 * an ordered extent if the range of bytes in the file it covers are
1461 * fully written.
1462 */
1463static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1464{
1465 struct btrfs_root *root = BTRFS_I(inode)->root;
1466 struct btrfs_trans_handle *trans;
1467 struct btrfs_ordered_extent *ordered_extent;
1468 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1469 int compressed = 0;
1470 int ret;
1471
1472 ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
1473 if (!ret)
1474 return 0;
1475
1476 trans = btrfs_join_transaction(root, 1);
1477
1478 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1479 BUG_ON(!ordered_extent);
1480 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1481 goto nocow;
1482
1483 lock_extent(io_tree, ordered_extent->file_offset,
1484 ordered_extent->file_offset + ordered_extent->len - 1,
1485 GFP_NOFS);
1486
1487 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1488 compressed = 1;
1489 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1490 BUG_ON(compressed);
1491 ret = btrfs_mark_extent_written(trans, root, inode,
1492 ordered_extent->file_offset,
1493 ordered_extent->file_offset +
1494 ordered_extent->len);
1495 BUG_ON(ret);
1496 } else {
1497 ret = insert_reserved_file_extent(trans, inode,
1498 ordered_extent->file_offset,
1499 ordered_extent->start,
1500 ordered_extent->disk_len,
1501 ordered_extent->len,
1502 ordered_extent->len,
1503 compressed, 0, 0,
1504 BTRFS_FILE_EXTENT_REG);
1505 BUG_ON(ret);
1506 }
1507 unlock_extent(io_tree, ordered_extent->file_offset,
1508 ordered_extent->file_offset + ordered_extent->len - 1,
1509 GFP_NOFS);
1510nocow:
1511 add_pending_csums(trans, inode, ordered_extent->file_offset,
1512 &ordered_extent->list);
1513
1514 mutex_lock(&BTRFS_I(inode)->extent_mutex);
1515 btrfs_ordered_update_i_size(inode, ordered_extent);
1516 btrfs_update_inode(trans, root, inode);
1517 btrfs_remove_ordered_extent(inode, ordered_extent);
1518 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
1519
1520 /* once for us */
1521 btrfs_put_ordered_extent(ordered_extent);
1522 /* once for the tree */
1523 btrfs_put_ordered_extent(ordered_extent);
1524
1525 btrfs_end_transaction(trans, root);
1526 return 0;
1527}
1528
1529int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1530 struct extent_state *state, int uptodate)
1531{
1532 return btrfs_finish_ordered_io(page->mapping->host, start, end);
1533}
1534
1535/*
1536 * When IO fails, either with EIO or csum verification fails, we
1537 * try other mirrors that might have a good copy of the data. This
1538 * io_failure_record is used to record state as we go through all the
1539 * mirrors. If another mirror has good data, the page is set up to date
1540 * and things continue. If a good mirror can't be found, the original
1541 * bio end_io callback is called to indicate things have failed.
1542 */
1543struct io_failure_record {
1544 struct page *page;
1545 u64 start;
1546 u64 len;
1547 u64 logical;
1548 int last_mirror;
1549};
1550
1551int btrfs_io_failed_hook(struct bio *failed_bio,
1552 struct page *page, u64 start, u64 end,
1553 struct extent_state *state)
1554{
1555 struct io_failure_record *failrec = NULL;
1556 u64 private;
1557 struct extent_map *em;
1558 struct inode *inode = page->mapping->host;
1559 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1560 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1561 struct bio *bio;
1562 int num_copies;
1563 int ret;
1564 int rw;
1565 u64 logical;
1566 unsigned long bio_flags = 0;
1567
1568 ret = get_state_private(failure_tree, start, &private);
1569 if (ret) {
1570 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1571 if (!failrec)
1572 return -ENOMEM;
1573 failrec->start = start;
1574 failrec->len = end - start + 1;
1575 failrec->last_mirror = 0;
1576
1577 spin_lock(&em_tree->lock);
1578 em = lookup_extent_mapping(em_tree, start, failrec->len);
1579 if (em->start > start || em->start + em->len < start) {
1580 free_extent_map(em);
1581 em = NULL;
1582 }
1583 spin_unlock(&em_tree->lock);
1584
1585 if (!em || IS_ERR(em)) {
1586 kfree(failrec);
1587 return -EIO;
1588 }
1589 logical = start - em->start;
1590 logical = em->block_start + logical;
1591 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
1592 bio_flags = EXTENT_BIO_COMPRESSED;
1593 failrec->logical = logical;
1594 free_extent_map(em);
1595 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1596 EXTENT_DIRTY, GFP_NOFS);
1597 set_state_private(failure_tree, start,
1598 (u64)(unsigned long)failrec);
1599 } else {
1600 failrec = (struct io_failure_record *)(unsigned long)private;
1601 }
1602 num_copies = btrfs_num_copies(
1603 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1604 failrec->logical, failrec->len);
1605 failrec->last_mirror++;
1606 if (!state) {
1607 spin_lock_irq(&BTRFS_I(inode)->io_tree.lock);
1608 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1609 failrec->start,
1610 EXTENT_LOCKED);
1611 if (state && state->start != failrec->start)
1612 state = NULL;
1613 spin_unlock_irq(&BTRFS_I(inode)->io_tree.lock);
1614 }
1615 if (!state || failrec->last_mirror > num_copies) {
1616 set_state_private(failure_tree, failrec->start, 0);
1617 clear_extent_bits(failure_tree, failrec->start,
1618 failrec->start + failrec->len - 1,
1619 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1620 kfree(failrec);
1621 return -EIO;
1622 }
1623 bio = bio_alloc(GFP_NOFS, 1);
1624 bio->bi_private = state;
1625 bio->bi_end_io = failed_bio->bi_end_io;
1626 bio->bi_sector = failrec->logical >> 9;
1627 bio->bi_bdev = failed_bio->bi_bdev;
1628 bio->bi_size = 0;
1629 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1630 if (failed_bio->bi_rw & (1 << BIO_RW))
1631 rw = WRITE;
1632 else
1633 rw = READ;
1634
1635 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1636 failrec->last_mirror,
1637 bio_flags);
1638 return 0;
1639}
1640
1641/*
1642 * each time an IO finishes, we do a fast check in the IO failure tree
1643 * to see if we need to process or clean up an io_failure_record
1644 */
1645int btrfs_clean_io_failures(struct inode *inode, u64 start)
1646{
1647 u64 private;
1648 u64 private_failure;
1649 struct io_failure_record *failure;
1650 int ret;
1651
1652 private = 0;
1653 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1654 (u64)-1, 1, EXTENT_DIRTY)) {
1655 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1656 start, &private_failure);
1657 if (ret == 0) {
1658 failure = (struct io_failure_record *)(unsigned long)
1659 private_failure;
1660 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1661 failure->start, 0);
1662 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1663 failure->start,
1664 failure->start + failure->len - 1,
1665 EXTENT_DIRTY | EXTENT_LOCKED,
1666 GFP_NOFS);
1667 kfree(failure);
1668 }
1669 }
1670 return 0;
1671}
1672
1673/*
1674 * when reads are done, we need to check csums to verify the data is correct
1675 * if there's a match, we allow the bio to finish. If not, we go through
1676 * the io_failure_record routines to find good copies
1677 */
1678int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1679 struct extent_state *state)
1680{
1681 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
1682 struct inode *inode = page->mapping->host;
1683 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1684 char *kaddr;
1685 u64 private = ~(u32)0;
1686 int ret;
1687 struct btrfs_root *root = BTRFS_I(inode)->root;
1688 u32 csum = ~(u32)0;
1689 unsigned long flags;
1690
1691 if (btrfs_test_opt(root, NODATASUM) ||
1692 btrfs_test_flag(inode, NODATASUM))
1693 return 0;
1694 if (state && state->start == start) {
1695 private = state->private;
1696 ret = 0;
1697 } else {
1698 ret = get_state_private(io_tree, start, &private);
1699 }
1700 local_irq_save(flags);
1701 kaddr = kmap_atomic(page, KM_IRQ0);
1702 if (ret) {
1703 goto zeroit;
1704 }
1705 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1);
1706 btrfs_csum_final(csum, (char *)&csum);
1707 if (csum != private) {
1708 goto zeroit;
1709 }
1710 kunmap_atomic(kaddr, KM_IRQ0);
1711 local_irq_restore(flags);
1712
1713 /* if the io failure tree for this inode is non-empty,
1714 * check to see if we've recovered from a failed IO
1715 */
1716 btrfs_clean_io_failures(inode, start);
1717 return 0;
1718
1719zeroit:
1720 printk("btrfs csum failed ino %lu off %llu csum %u private %Lu\n",
1721 page->mapping->host->i_ino, (unsigned long long)start, csum,
1722 private);
1723 memset(kaddr + offset, 1, end - start + 1);
1724 flush_dcache_page(page);
1725 kunmap_atomic(kaddr, KM_IRQ0);
1726 local_irq_restore(flags);
1727 if (private == 0)
1728 return 0;
1729 return -EIO;
1730}
1731
1732/*
1733 * This creates an orphan entry for the given inode in case something goes
1734 * wrong in the middle of an unlink/truncate.
1735 */
1736int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
1737{
1738 struct btrfs_root *root = BTRFS_I(inode)->root;
1739 int ret = 0;
1740
1741 spin_lock(&root->list_lock);
1742
1743 /* already on the orphan list, we're good */
1744 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
1745 spin_unlock(&root->list_lock);
1746 return 0;
1747 }
1748
1749 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1750
1751 spin_unlock(&root->list_lock);
1752
1753 /*
1754 * insert an orphan item to track this unlinked/truncated file
1755 */
1756 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
1757
1758 return ret;
1759}
1760
1761/*
1762 * We have done the truncate/delete so we can go ahead and remove the orphan
1763 * item for this particular inode.
1764 */
1765int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
1766{
1767 struct btrfs_root *root = BTRFS_I(inode)->root;
1768 int ret = 0;
1769
1770 spin_lock(&root->list_lock);
1771
1772 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
1773 spin_unlock(&root->list_lock);
1774 return 0;
1775 }
1776
1777 list_del_init(&BTRFS_I(inode)->i_orphan);
1778 if (!trans) {
1779 spin_unlock(&root->list_lock);
1780 return 0;
1781 }
1782
1783 spin_unlock(&root->list_lock);
1784
1785 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
1786
1787 return ret;
1788}
1789
1790/*
1791 * this cleans up any orphans that may be left on the list from the last use
1792 * of this root.
1793 */
1794void btrfs_orphan_cleanup(struct btrfs_root *root)
1795{
1796 struct btrfs_path *path;
1797 struct extent_buffer *leaf;
1798 struct btrfs_item *item;
1799 struct btrfs_key key, found_key;
1800 struct btrfs_trans_handle *trans;
1801 struct inode *inode;
1802 int ret = 0, nr_unlink = 0, nr_truncate = 0;
1803
1804 path = btrfs_alloc_path();
1805 if (!path)
1806 return;
1807 path->reada = -1;
1808
1809 key.objectid = BTRFS_ORPHAN_OBJECTID;
1810 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1811 key.offset = (u64)-1;
1812
1813
1814 while (1) {
1815 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1816 if (ret < 0) {
1817 printk(KERN_ERR "Error searching slot for orphan: %d"
1818 "\n", ret);
1819 break;
1820 }
1821
1822 /*
1823 * if ret == 0 means we found what we were searching for, which
1824 * is weird, but possible, so only screw with path if we didnt
1825 * find the key and see if we have stuff that matches
1826 */
1827 if (ret > 0) {
1828 if (path->slots[0] == 0)
1829 break;
1830 path->slots[0]--;
1831 }
1832
1833 /* pull out the item */
1834 leaf = path->nodes[0];
1835 item = btrfs_item_nr(leaf, path->slots[0]);
1836 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1837
1838 /* make sure the item matches what we want */
1839 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
1840 break;
1841 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
1842 break;
1843
1844 /* release the path since we're done with it */
1845 btrfs_release_path(root, path);
1846
1847 /*
1848 * this is where we are basically btrfs_lookup, without the
1849 * crossing root thing. we store the inode number in the
1850 * offset of the orphan item.
1851 */
1852 inode = btrfs_iget_locked(root->fs_info->sb,
1853 found_key.offset, root);
1854 if (!inode)
1855 break;
1856
1857 if (inode->i_state & I_NEW) {
1858 BTRFS_I(inode)->root = root;
1859
1860 /* have to set the location manually */
1861 BTRFS_I(inode)->location.objectid = inode->i_ino;
1862 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
1863 BTRFS_I(inode)->location.offset = 0;
1864
1865 btrfs_read_locked_inode(inode);
1866 unlock_new_inode(inode);
1867 }
1868
1869 /*
1870 * add this inode to the orphan list so btrfs_orphan_del does
1871 * the proper thing when we hit it
1872 */
1873 spin_lock(&root->list_lock);
1874 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1875 spin_unlock(&root->list_lock);
1876
1877 /*
1878 * if this is a bad inode, means we actually succeeded in
1879 * removing the inode, but not the orphan record, which means
1880 * we need to manually delete the orphan since iput will just
1881 * do a destroy_inode
1882 */
1883 if (is_bad_inode(inode)) {
1884 trans = btrfs_start_transaction(root, 1);
1885 btrfs_orphan_del(trans, inode);
1886 btrfs_end_transaction(trans, root);
1887 iput(inode);
1888 continue;
1889 }
1890
1891 /* if we have links, this was a truncate, lets do that */
1892 if (inode->i_nlink) {
1893 nr_truncate++;
1894 btrfs_truncate(inode);
1895 } else {
1896 nr_unlink++;
1897 }
1898
1899 /* this will do delete_inode and everything for us */
1900 iput(inode);
1901 }
1902
1903 if (nr_unlink)
1904 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
1905 if (nr_truncate)
1906 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
1907
1908 btrfs_free_path(path);
1909}
1910
1911/*
1912 * read an inode from the btree into the in-memory inode
1913 */
1914void btrfs_read_locked_inode(struct inode *inode)
1915{
1916 struct btrfs_path *path;
1917 struct extent_buffer *leaf;
1918 struct btrfs_inode_item *inode_item;
1919 struct btrfs_timespec *tspec;
1920 struct btrfs_root *root = BTRFS_I(inode)->root;
1921 struct btrfs_key location;
1922 u64 alloc_group_block;
1923 u32 rdev;
1924 int ret;
1925
1926 path = btrfs_alloc_path();
1927 BUG_ON(!path);
1928 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
1929
1930 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
1931 if (ret)
1932 goto make_bad;
1933
1934 leaf = path->nodes[0];
1935 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1936 struct btrfs_inode_item);
1937
1938 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
1939 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
1940 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
1941 inode->i_gid = btrfs_inode_gid(leaf, inode_item);
1942 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
1943
1944 tspec = btrfs_inode_atime(inode_item);
1945 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1946 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1947
1948 tspec = btrfs_inode_mtime(inode_item);
1949 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1950 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1951
1952 tspec = btrfs_inode_ctime(inode_item);
1953 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1954 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1955
1956 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
1957 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
1958 inode->i_generation = BTRFS_I(inode)->generation;
1959 inode->i_rdev = 0;
1960 rdev = btrfs_inode_rdev(leaf, inode_item);
1961
1962 BTRFS_I(inode)->index_cnt = (u64)-1;
1963
1964 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
1965 BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
1966 alloc_group_block);
1967 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
1968 if (!BTRFS_I(inode)->block_group) {
1969 BTRFS_I(inode)->block_group = btrfs_find_block_group(root,
1970 NULL, 0,
1971 BTRFS_BLOCK_GROUP_METADATA, 0);
1972 }
1973 btrfs_free_path(path);
1974 inode_item = NULL;
1975
1976 switch (inode->i_mode & S_IFMT) {
1977 case S_IFREG:
1978 inode->i_mapping->a_ops = &btrfs_aops;
1979 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
1980 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
1981 inode->i_fop = &btrfs_file_operations;
1982 inode->i_op = &btrfs_file_inode_operations;
1983 break;
1984 case S_IFDIR:
1985 inode->i_fop = &btrfs_dir_file_operations;
1986 if (root == root->fs_info->tree_root)
1987 inode->i_op = &btrfs_dir_ro_inode_operations;
1988 else
1989 inode->i_op = &btrfs_dir_inode_operations;
1990 break;
1991 case S_IFLNK:
1992 inode->i_op = &btrfs_symlink_inode_operations;
1993 inode->i_mapping->a_ops = &btrfs_symlink_aops;
1994 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
1995 break;
1996 default:
1997 init_special_inode(inode, inode->i_mode, rdev);
1998 break;
1999 }
2000 return;
2001
2002make_bad:
2003 btrfs_free_path(path);
2004 make_bad_inode(inode);
2005}
2006
2007/*
2008 * given a leaf and an inode, copy the inode fields into the leaf
2009 */
2010static void fill_inode_item(struct btrfs_trans_handle *trans,
2011 struct extent_buffer *leaf,
2012 struct btrfs_inode_item *item,
2013 struct inode *inode)
2014{
2015 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2016 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2017 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
2018 btrfs_set_inode_mode(leaf, item, inode->i_mode);
2019 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2020
2021 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2022 inode->i_atime.tv_sec);
2023 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2024 inode->i_atime.tv_nsec);
2025
2026 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2027 inode->i_mtime.tv_sec);
2028 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2029 inode->i_mtime.tv_nsec);
2030
2031 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2032 inode->i_ctime.tv_sec);
2033 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2034 inode->i_ctime.tv_nsec);
2035
2036 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2037 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2038 btrfs_set_inode_transid(leaf, item, trans->transid);
2039 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2040 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2041 btrfs_set_inode_block_group(leaf, item,
2042 BTRFS_I(inode)->block_group->key.objectid);
2043}
2044
2045/*
2046 * copy everything in the in-memory inode into the btree.
2047 */
2048int noinline btrfs_update_inode(struct btrfs_trans_handle *trans,
2049 struct btrfs_root *root,
2050 struct inode *inode)
2051{
2052 struct btrfs_inode_item *inode_item;
2053 struct btrfs_path *path;
2054 struct extent_buffer *leaf;
2055 int ret;
2056
2057 path = btrfs_alloc_path();
2058 BUG_ON(!path);
2059 ret = btrfs_lookup_inode(trans, root, path,
2060 &BTRFS_I(inode)->location, 1);
2061 if (ret) {
2062 if (ret > 0)
2063 ret = -ENOENT;
2064 goto failed;
2065 }
2066
2067 leaf = path->nodes[0];
2068 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2069 struct btrfs_inode_item);
2070
2071 fill_inode_item(trans, leaf, inode_item, inode);
2072 btrfs_mark_buffer_dirty(leaf);
2073 btrfs_set_inode_last_trans(trans, inode);
2074 ret = 0;
2075failed:
2076 btrfs_free_path(path);
2077 return ret;
2078}
2079
2080
2081/*
2082 * unlink helper that gets used here in inode.c and in the tree logging
2083 * recovery code. It remove a link in a directory with a given name, and
2084 * also drops the back refs in the inode to the directory
2085 */
2086int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2087 struct btrfs_root *root,
2088 struct inode *dir, struct inode *inode,
2089 const char *name, int name_len)
2090{
2091 struct btrfs_path *path;
2092 int ret = 0;
2093 struct extent_buffer *leaf;
2094 struct btrfs_dir_item *di;
2095 struct btrfs_key key;
2096 u64 index;
2097
2098 path = btrfs_alloc_path();
2099 if (!path) {
2100 ret = -ENOMEM;
2101 goto err;
2102 }
2103
2104 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2105 name, name_len, -1);
2106 if (IS_ERR(di)) {
2107 ret = PTR_ERR(di);
2108 goto err;
2109 }
2110 if (!di) {
2111 ret = -ENOENT;
2112 goto err;
2113 }
2114 leaf = path->nodes[0];
2115 btrfs_dir_item_key_to_cpu(leaf, di, &key);
2116 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2117 if (ret)
2118 goto err;
2119 btrfs_release_path(root, path);
2120
2121 ret = btrfs_del_inode_ref(trans, root, name, name_len,
2122 inode->i_ino,
2123 dir->i_ino, &index);
2124 if (ret) {
2125 printk("failed to delete reference to %.*s, "
2126 "inode %lu parent %lu\n", name_len, name,
2127 inode->i_ino, dir->i_ino);
2128 goto err;
2129 }
2130
2131 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
2132 index, name, name_len, -1);
2133 if (IS_ERR(di)) {
2134 ret = PTR_ERR(di);
2135 goto err;
2136 }
2137 if (!di) {
2138 ret = -ENOENT;
2139 goto err;
2140 }
2141 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2142 btrfs_release_path(root, path);
2143
2144 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2145 inode, dir->i_ino);
2146 BUG_ON(ret != 0 && ret != -ENOENT);
2147 if (ret != -ENOENT)
2148 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2149
2150 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2151 dir, index);
2152 BUG_ON(ret);
2153err:
2154 btrfs_free_path(path);
2155 if (ret)
2156 goto out;
2157
2158 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2159 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2160 btrfs_update_inode(trans, root, dir);
2161 btrfs_drop_nlink(inode);
2162 ret = btrfs_update_inode(trans, root, inode);
2163 dir->i_sb->s_dirt = 1;
2164out:
2165 return ret;
2166}
2167
2168static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2169{
2170 struct btrfs_root *root;
2171 struct btrfs_trans_handle *trans;
2172 struct inode *inode = dentry->d_inode;
2173 int ret;
2174 unsigned long nr = 0;
2175
2176 root = BTRFS_I(dir)->root;
2177
2178 ret = btrfs_check_free_space(root, 1, 1);
2179 if (ret)
2180 goto fail;
2181
2182 trans = btrfs_start_transaction(root, 1);
2183
2184 btrfs_set_trans_block_group(trans, dir);
2185 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2186 dentry->d_name.name, dentry->d_name.len);
2187
2188 if (inode->i_nlink == 0)
2189 ret = btrfs_orphan_add(trans, inode);
2190
2191 nr = trans->blocks_used;
2192
2193 btrfs_end_transaction_throttle(trans, root);
2194fail:
2195 btrfs_btree_balance_dirty(root, nr);
2196 return ret;
2197}
2198
2199static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2200{
2201 struct inode *inode = dentry->d_inode;
2202 int err = 0;
2203 int ret;
2204 struct btrfs_root *root = BTRFS_I(dir)->root;
2205 struct btrfs_trans_handle *trans;
2206 unsigned long nr = 0;
2207
2208 /*
2209 * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
2210 * the root of a subvolume or snapshot
2211 */
2212 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2213 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
2214 return -ENOTEMPTY;
2215 }
2216
2217 ret = btrfs_check_free_space(root, 1, 1);
2218 if (ret)
2219 goto fail;
2220
2221 trans = btrfs_start_transaction(root, 1);
2222 btrfs_set_trans_block_group(trans, dir);
2223
2224 err = btrfs_orphan_add(trans, inode);
2225 if (err)
2226 goto fail_trans;
2227
2228 /* now the directory is empty */
2229 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2230 dentry->d_name.name, dentry->d_name.len);
2231 if (!err) {
2232 btrfs_i_size_write(inode, 0);
2233 }
2234
2235fail_trans:
2236 nr = trans->blocks_used;
2237 ret = btrfs_end_transaction_throttle(trans, root);
2238fail:
2239 btrfs_btree_balance_dirty(root, nr);
2240
2241 if (ret && !err)
2242 err = ret;
2243 return err;
2244}
2245
2246/*
2247 * when truncating bytes in a file, it is possible to avoid reading
2248 * the leaves that contain only checksum items. This can be the
2249 * majority of the IO required to delete a large file, but it must
2250 * be done carefully.
2251 *
2252 * The keys in the level just above the leaves are checked to make sure
2253 * the lowest key in a given leaf is a csum key, and starts at an offset
2254 * after the new size.
2255 *
2256 * Then the key for the next leaf is checked to make sure it also has
2257 * a checksum item for the same file. If it does, we know our target leaf
2258 * contains only checksum items, and it can be safely freed without reading
2259 * it.
2260 *
2261 * This is just an optimization targeted at large files. It may do
2262 * nothing. It will return 0 unless things went badly.
2263 */
2264static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
2265 struct btrfs_root *root,
2266 struct btrfs_path *path,
2267 struct inode *inode, u64 new_size)
2268{
2269 struct btrfs_key key;
2270 int ret;
2271 int nritems;
2272 struct btrfs_key found_key;
2273 struct btrfs_key other_key;
2274 struct btrfs_leaf_ref *ref;
2275 u64 leaf_gen;
2276 u64 leaf_start;
2277
2278 path->lowest_level = 1;
2279 key.objectid = inode->i_ino;
2280 key.type = BTRFS_CSUM_ITEM_KEY;
2281 key.offset = new_size;
2282again:
2283 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2284 if (ret < 0)
2285 goto out;
2286
2287 if (path->nodes[1] == NULL) {
2288 ret = 0;
2289 goto out;
2290 }
2291 ret = 0;
2292 btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
2293 nritems = btrfs_header_nritems(path->nodes[1]);
2294
2295 if (!nritems)
2296 goto out;
2297
2298 if (path->slots[1] >= nritems)
2299 goto next_node;
2300
2301 /* did we find a key greater than anything we want to delete? */
2302 if (found_key.objectid > inode->i_ino ||
2303 (found_key.objectid == inode->i_ino && found_key.type > key.type))
2304 goto out;
2305
2306 /* we check the next key in the node to make sure the leave contains
2307 * only checksum items. This comparison doesn't work if our
2308 * leaf is the last one in the node
2309 */
2310 if (path->slots[1] + 1 >= nritems) {
2311next_node:
2312 /* search forward from the last key in the node, this
2313 * will bring us into the next node in the tree
2314 */
2315 btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
2316
2317 /* unlikely, but we inc below, so check to be safe */
2318 if (found_key.offset == (u64)-1)
2319 goto out;
2320
2321 /* search_forward needs a path with locks held, do the
2322 * search again for the original key. It is possible
2323 * this will race with a balance and return a path that
2324 * we could modify, but this drop is just an optimization
2325 * and is allowed to miss some leaves.
2326 */
2327 btrfs_release_path(root, path);
2328 found_key.offset++;
2329
2330 /* setup a max key for search_forward */
2331 other_key.offset = (u64)-1;
2332 other_key.type = key.type;
2333 other_key.objectid = key.objectid;
2334
2335 path->keep_locks = 1;
2336 ret = btrfs_search_forward(root, &found_key, &other_key,
2337 path, 0, 0);
2338 path->keep_locks = 0;
2339 if (ret || found_key.objectid != key.objectid ||
2340 found_key.type != key.type) {
2341 ret = 0;
2342 goto out;
2343 }
2344
2345 key.offset = found_key.offset;
2346 btrfs_release_path(root, path);
2347 cond_resched();
2348 goto again;
2349 }
2350
2351 /* we know there's one more slot after us in the tree,
2352 * read that key so we can verify it is also a checksum item
2353 */
2354 btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
2355
2356 if (found_key.objectid < inode->i_ino)
2357 goto next_key;
2358
2359 if (found_key.type != key.type || found_key.offset < new_size)
2360 goto next_key;
2361
2362 /*
2363 * if the key for the next leaf isn't a csum key from this objectid,
2364 * we can't be sure there aren't good items inside this leaf.
2365 * Bail out
2366 */
2367 if (other_key.objectid != inode->i_ino || other_key.type != key.type)
2368 goto out;
2369
2370 leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
2371 leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
2372 /*
2373 * it is safe to delete this leaf, it contains only
2374 * csum items from this inode at an offset >= new_size
2375 */
2376 ret = btrfs_del_leaf(trans, root, path, leaf_start);
2377 BUG_ON(ret);
2378
2379 if (root->ref_cows && leaf_gen < trans->transid) {
2380 ref = btrfs_alloc_leaf_ref(root, 0);
2381 if (ref) {
2382 ref->root_gen = root->root_key.offset;
2383 ref->bytenr = leaf_start;
2384 ref->owner = 0;
2385 ref->generation = leaf_gen;
2386 ref->nritems = 0;
2387
2388 ret = btrfs_add_leaf_ref(root, ref, 0);
2389 WARN_ON(ret);
2390 btrfs_free_leaf_ref(root, ref);
2391 } else {
2392 WARN_ON(1);
2393 }
2394 }
2395next_key:
2396 btrfs_release_path(root, path);
2397
2398 if (other_key.objectid == inode->i_ino &&
2399 other_key.type == key.type && other_key.offset > key.offset) {
2400 key.offset = other_key.offset;
2401 cond_resched();
2402 goto again;
2403 }
2404 ret = 0;
2405out:
2406 /* fixup any changes we've made to the path */
2407 path->lowest_level = 0;
2408 path->keep_locks = 0;
2409 btrfs_release_path(root, path);
2410 return ret;
2411}
2412
2413/*
2414 * this can truncate away extent items, csum items and directory items.
2415 * It starts at a high offset and removes keys until it can't find
2416 * any higher than new_size
2417 *
2418 * csum items that cross the new i_size are truncated to the new size
2419 * as well.
2420 *
2421 * min_type is the minimum key type to truncate down to. If set to 0, this
2422 * will kill all the items on this inode, including the INODE_ITEM_KEY.
2423 */
2424noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2425 struct btrfs_root *root,
2426 struct inode *inode,
2427 u64 new_size, u32 min_type)
2428{
2429 int ret;
2430 struct btrfs_path *path;
2431 struct btrfs_key key;
2432 struct btrfs_key found_key;
2433 u32 found_type;
2434 struct extent_buffer *leaf;
2435 struct btrfs_file_extent_item *fi;
2436 u64 extent_start = 0;
2437 u64 extent_num_bytes = 0;
2438 u64 item_end = 0;
2439 u64 root_gen = 0;
2440 u64 root_owner = 0;
2441 int found_extent;
2442 int del_item;
2443 int pending_del_nr = 0;
2444 int pending_del_slot = 0;
2445 int extent_type = -1;
2446 int encoding;
2447 u64 mask = root->sectorsize - 1;
2448
2449 if (root->ref_cows)
2450 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2451 path = btrfs_alloc_path();
2452 path->reada = -1;
2453 BUG_ON(!path);
2454
2455 /* FIXME, add redo link to tree so we don't leak on crash */
2456 key.objectid = inode->i_ino;
2457 key.offset = (u64)-1;
2458 key.type = (u8)-1;
2459
2460 btrfs_init_path(path);
2461
2462 ret = drop_csum_leaves(trans, root, path, inode, new_size);
2463 BUG_ON(ret);
2464
2465search_again:
2466 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2467 if (ret < 0) {
2468 goto error;
2469 }
2470 if (ret > 0) {
2471 /* there are no items in the tree for us to truncate, we're
2472 * done
2473 */
2474 if (path->slots[0] == 0) {
2475 ret = 0;
2476 goto error;
2477 }
2478 path->slots[0]--;
2479 }
2480
2481 while(1) {
2482 fi = NULL;
2483 leaf = path->nodes[0];
2484 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2485 found_type = btrfs_key_type(&found_key);
2486 encoding = 0;
2487
2488 if (found_key.objectid != inode->i_ino)
2489 break;
2490
2491 if (found_type < min_type)
2492 break;
2493
2494 item_end = found_key.offset;
2495 if (found_type == BTRFS_EXTENT_DATA_KEY) {
2496 fi = btrfs_item_ptr(leaf, path->slots[0],
2497 struct btrfs_file_extent_item);
2498 extent_type = btrfs_file_extent_type(leaf, fi);
2499 encoding = btrfs_file_extent_compression(leaf, fi);
2500 encoding |= btrfs_file_extent_encryption(leaf, fi);
2501 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
2502
2503 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2504 item_end +=
2505 btrfs_file_extent_num_bytes(leaf, fi);
2506 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2507 item_end += btrfs_file_extent_inline_len(leaf,
2508 fi);
2509 }
2510 item_end--;
2511 }
2512 if (found_type == BTRFS_CSUM_ITEM_KEY) {
2513 ret = btrfs_csum_truncate(trans, root, path,
2514 new_size);
2515 BUG_ON(ret);
2516 }
2517 if (item_end < new_size) {
2518 if (found_type == BTRFS_DIR_ITEM_KEY) {
2519 found_type = BTRFS_INODE_ITEM_KEY;
2520 } else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
2521 found_type = BTRFS_CSUM_ITEM_KEY;
2522 } else if (found_type == BTRFS_EXTENT_DATA_KEY) {
2523 found_type = BTRFS_XATTR_ITEM_KEY;
2524 } else if (found_type == BTRFS_XATTR_ITEM_KEY) {
2525 found_type = BTRFS_INODE_REF_KEY;
2526 } else if (found_type) {
2527 found_type--;
2528 } else {
2529 break;
2530 }
2531 btrfs_set_key_type(&key, found_type);
2532 goto next;
2533 }
2534 if (found_key.offset >= new_size)
2535 del_item = 1;
2536 else
2537 del_item = 0;
2538 found_extent = 0;
2539
2540 /* FIXME, shrink the extent if the ref count is only 1 */
2541 if (found_type != BTRFS_EXTENT_DATA_KEY)
2542 goto delete;
2543
2544 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2545 u64 num_dec;
2546 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
2547 if (!del_item && !encoding) {
2548 u64 orig_num_bytes =
2549 btrfs_file_extent_num_bytes(leaf, fi);
2550 extent_num_bytes = new_size -
2551 found_key.offset + root->sectorsize - 1;
2552 extent_num_bytes = extent_num_bytes &
2553 ~((u64)root->sectorsize - 1);
2554 btrfs_set_file_extent_num_bytes(leaf, fi,
2555 extent_num_bytes);
2556 num_dec = (orig_num_bytes -
2557 extent_num_bytes);
2558 if (root->ref_cows && extent_start != 0)
2559 inode_sub_bytes(inode, num_dec);
2560 btrfs_mark_buffer_dirty(leaf);
2561 } else {
2562 extent_num_bytes =
2563 btrfs_file_extent_disk_num_bytes(leaf,
2564 fi);
2565 /* FIXME blocksize != 4096 */
2566 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
2567 if (extent_start != 0) {
2568 found_extent = 1;
2569 if (root->ref_cows)
2570 inode_sub_bytes(inode, num_dec);
2571 }
2572 root_gen = btrfs_header_generation(leaf);
2573 root_owner = btrfs_header_owner(leaf);
2574 }
2575 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2576 /*
2577 * we can't truncate inline items that have had
2578 * special encodings
2579 */
2580 if (!del_item &&
2581 btrfs_file_extent_compression(leaf, fi) == 0 &&
2582 btrfs_file_extent_encryption(leaf, fi) == 0 &&
2583 btrfs_file_extent_other_encoding(leaf, fi) == 0) {
2584 u32 size = new_size - found_key.offset;
2585
2586 if (root->ref_cows) {
2587 inode_sub_bytes(inode, item_end + 1 -
2588 new_size);
2589 }
2590 size =
2591 btrfs_file_extent_calc_inline_size(size);
2592 ret = btrfs_truncate_item(trans, root, path,
2593 size, 1);
2594 BUG_ON(ret);
2595 } else if (root->ref_cows) {
2596 inode_sub_bytes(inode, item_end + 1 -
2597 found_key.offset);
2598 }
2599 }
2600delete:
2601 if (del_item) {
2602 if (!pending_del_nr) {
2603 /* no pending yet, add ourselves */
2604 pending_del_slot = path->slots[0];
2605 pending_del_nr = 1;
2606 } else if (pending_del_nr &&
2607 path->slots[0] + 1 == pending_del_slot) {
2608 /* hop on the pending chunk */
2609 pending_del_nr++;
2610 pending_del_slot = path->slots[0];
2611 } else {
2612 printk("bad pending slot %d pending_del_nr %d pending_del_slot %d\n", path->slots[0], pending_del_nr, pending_del_slot);
2613 }
2614 } else {
2615 break;
2616 }
2617 if (found_extent) {
2618 ret = btrfs_free_extent(trans, root, extent_start,
2619 extent_num_bytes,
2620 leaf->start, root_owner,
2621 root_gen, inode->i_ino, 0);
2622 BUG_ON(ret);
2623 }
2624next:
2625 if (path->slots[0] == 0) {
2626 if (pending_del_nr)
2627 goto del_pending;
2628 btrfs_release_path(root, path);
2629 goto search_again;
2630 }
2631
2632 path->slots[0]--;
2633 if (pending_del_nr &&
2634 path->slots[0] + 1 != pending_del_slot) {
2635 struct btrfs_key debug;
2636del_pending:
2637 btrfs_item_key_to_cpu(path->nodes[0], &debug,
2638 pending_del_slot);
2639 ret = btrfs_del_items(trans, root, path,
2640 pending_del_slot,
2641 pending_del_nr);
2642 BUG_ON(ret);
2643 pending_del_nr = 0;
2644 btrfs_release_path(root, path);
2645 goto search_again;
2646 }
2647 }
2648 ret = 0;
2649error:
2650 if (pending_del_nr) {
2651 ret = btrfs_del_items(trans, root, path, pending_del_slot,
2652 pending_del_nr);
2653 }
2654 btrfs_free_path(path);
2655 inode->i_sb->s_dirt = 1;
2656 return ret;
2657}
2658
2659/*
2660 * taken from block_truncate_page, but does cow as it zeros out
2661 * any bytes left in the last page in the file.
2662 */
2663static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
2664{
2665 struct inode *inode = mapping->host;
2666 struct btrfs_root *root = BTRFS_I(inode)->root;
2667 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2668 struct btrfs_ordered_extent *ordered;
2669 char *kaddr;
2670 u32 blocksize = root->sectorsize;
2671 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2672 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2673 struct page *page;
2674 int ret = 0;
2675 u64 page_start;
2676 u64 page_end;
2677
2678 if ((offset & (blocksize - 1)) == 0)
2679 goto out;
2680
2681 ret = -ENOMEM;
2682again:
2683 page = grab_cache_page(mapping, index);
2684 if (!page)
2685 goto out;
2686
2687 page_start = page_offset(page);
2688 page_end = page_start + PAGE_CACHE_SIZE - 1;
2689
2690 if (!PageUptodate(page)) {
2691 ret = btrfs_readpage(NULL, page);
2692 lock_page(page);
2693 if (page->mapping != mapping) {
2694 unlock_page(page);
2695 page_cache_release(page);
2696 goto again;
2697 }
2698 if (!PageUptodate(page)) {
2699 ret = -EIO;
2700 goto out_unlock;
2701 }
2702 }
2703 wait_on_page_writeback(page);
2704
2705 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2706 set_page_extent_mapped(page);
2707
2708 ordered = btrfs_lookup_ordered_extent(inode, page_start);
2709 if (ordered) {
2710 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2711 unlock_page(page);
2712 page_cache_release(page);
2713 btrfs_start_ordered_extent(inode, ordered, 1);
2714 btrfs_put_ordered_extent(ordered);
2715 goto again;
2716 }
2717
2718 btrfs_set_extent_delalloc(inode, page_start, page_end);
2719 ret = 0;
2720 if (offset != PAGE_CACHE_SIZE) {
2721 kaddr = kmap(page);
2722 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2723 flush_dcache_page(page);
2724 kunmap(page);
2725 }
2726 ClearPageChecked(page);
2727 set_page_dirty(page);
2728 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2729
2730out_unlock:
2731 unlock_page(page);
2732 page_cache_release(page);
2733out:
2734 return ret;
2735}
2736
2737int btrfs_cont_expand(struct inode *inode, loff_t size)
2738{
2739 struct btrfs_trans_handle *trans;
2740 struct btrfs_root *root = BTRFS_I(inode)->root;
2741 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2742 struct extent_map *em;
2743 u64 mask = root->sectorsize - 1;
2744 u64 hole_start = (inode->i_size + mask) & ~mask;
2745 u64 block_end = (size + mask) & ~mask;
2746 u64 last_byte;
2747 u64 cur_offset;
2748 u64 hole_size;
2749 int err;
2750
2751 if (size <= hole_start)
2752 return 0;
2753
2754 err = btrfs_check_free_space(root, 1, 0);
2755 if (err)
2756 return err;
2757
2758 btrfs_truncate_page(inode->i_mapping, inode->i_size);
2759
2760 while (1) {
2761 struct btrfs_ordered_extent *ordered;
2762 btrfs_wait_ordered_range(inode, hole_start,
2763 block_end - hole_start);
2764 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2765 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
2766 if (!ordered)
2767 break;
2768 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2769 btrfs_put_ordered_extent(ordered);
2770 }
2771
2772 trans = btrfs_start_transaction(root, 1);
2773 btrfs_set_trans_block_group(trans, inode);
2774
2775 cur_offset = hole_start;
2776 while (1) {
2777 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2778 block_end - cur_offset, 0);
2779 BUG_ON(IS_ERR(em) || !em);
2780 last_byte = min(extent_map_end(em), block_end);
2781 last_byte = (last_byte + mask) & ~mask;
2782 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2783 u64 hint_byte = 0;
2784 hole_size = last_byte - cur_offset;
2785 err = btrfs_drop_extents(trans, root, inode,
2786 cur_offset,
2787 cur_offset + hole_size,
2788 cur_offset, &hint_byte);
2789 if (err)
2790 break;
2791 err = btrfs_insert_file_extent(trans, root,
2792 inode->i_ino, cur_offset, 0,
2793 0, hole_size, 0, hole_size,
2794 0, 0, 0);
2795 btrfs_drop_extent_cache(inode, hole_start,
2796 last_byte - 1, 0);
2797 }
2798 free_extent_map(em);
2799 cur_offset = last_byte;
2800 if (err || cur_offset >= block_end)
2801 break;
2802 }
2803
2804 btrfs_end_transaction(trans, root);
2805 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2806 return err;
2807}
2808
2809static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2810{
2811 struct inode *inode = dentry->d_inode;
2812 int err;
2813
2814 err = inode_change_ok(inode, attr);
2815 if (err)
2816 return err;
2817
2818 if (S_ISREG(inode->i_mode) &&
2819 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
2820 err = btrfs_cont_expand(inode, attr->ia_size);
2821 if (err)
2822 return err;
2823 }
2824
2825 err = inode_setattr(inode, attr);
2826
2827 if (!err && ((attr->ia_valid & ATTR_MODE)))
2828 err = btrfs_acl_chmod(inode);
2829 return err;
2830}
2831
2832void btrfs_delete_inode(struct inode *inode)
2833{
2834 struct btrfs_trans_handle *trans;
2835 struct btrfs_root *root = BTRFS_I(inode)->root;
2836 unsigned long nr;
2837 int ret;
2838
2839 truncate_inode_pages(&inode->i_data, 0);
2840 if (is_bad_inode(inode)) {
2841 btrfs_orphan_del(NULL, inode);
2842 goto no_delete;
2843 }
2844 btrfs_wait_ordered_range(inode, 0, (u64)-1);
2845
2846 btrfs_i_size_write(inode, 0);
2847 trans = btrfs_start_transaction(root, 1);
2848
2849 btrfs_set_trans_block_group(trans, inode);
2850 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
2851 if (ret) {
2852 btrfs_orphan_del(NULL, inode);
2853 goto no_delete_lock;
2854 }
2855
2856 btrfs_orphan_del(trans, inode);
2857
2858 nr = trans->blocks_used;
2859 clear_inode(inode);
2860
2861 btrfs_end_transaction(trans, root);
2862 btrfs_btree_balance_dirty(root, nr);
2863 return;
2864
2865no_delete_lock:
2866 nr = trans->blocks_used;
2867 btrfs_end_transaction(trans, root);
2868 btrfs_btree_balance_dirty(root, nr);
2869no_delete:
2870 clear_inode(inode);
2871}
2872
2873/*
2874 * this returns the key found in the dir entry in the location pointer.
2875 * If no dir entries were found, location->objectid is 0.
2876 */
2877static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
2878 struct btrfs_key *location)
2879{
2880 const char *name = dentry->d_name.name;
2881 int namelen = dentry->d_name.len;
2882 struct btrfs_dir_item *di;
2883 struct btrfs_path *path;
2884 struct btrfs_root *root = BTRFS_I(dir)->root;
2885 int ret = 0;
2886
2887 path = btrfs_alloc_path();
2888 BUG_ON(!path);
2889
2890 di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
2891 namelen, 0);
2892 if (IS_ERR(di))
2893 ret = PTR_ERR(di);
2894 if (!di || IS_ERR(di)) {
2895 goto out_err;
2896 }
2897 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
2898out:
2899 btrfs_free_path(path);
2900 return ret;
2901out_err:
2902 location->objectid = 0;
2903 goto out;
2904}
2905
2906/*
2907 * when we hit a tree root in a directory, the btrfs part of the inode
2908 * needs to be changed to reflect the root directory of the tree root. This
2909 * is kind of like crossing a mount point.
2910 */
2911static int fixup_tree_root_location(struct btrfs_root *root,
2912 struct btrfs_key *location,
2913 struct btrfs_root **sub_root,
2914 struct dentry *dentry)
2915{
2916 struct btrfs_root_item *ri;
2917
2918 if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
2919 return 0;
2920 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
2921 return 0;
2922
2923 *sub_root = btrfs_read_fs_root(root->fs_info, location,
2924 dentry->d_name.name,
2925 dentry->d_name.len);
2926 if (IS_ERR(*sub_root))
2927 return PTR_ERR(*sub_root);
2928
2929 ri = &(*sub_root)->root_item;
2930 location->objectid = btrfs_root_dirid(ri);
2931 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
2932 location->offset = 0;
2933
2934 return 0;
2935}
2936
2937static noinline void init_btrfs_i(struct inode *inode)
2938{
2939 struct btrfs_inode *bi = BTRFS_I(inode);
2940
2941 bi->i_acl = NULL;
2942 bi->i_default_acl = NULL;
2943
2944 bi->generation = 0;
2945 bi->last_trans = 0;
2946 bi->logged_trans = 0;
2947 bi->delalloc_bytes = 0;
2948 bi->disk_i_size = 0;
2949 bi->flags = 0;
2950 bi->index_cnt = (u64)-1;
2951 bi->log_dirty_trans = 0;
2952 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
2953 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
2954 inode->i_mapping, GFP_NOFS);
2955 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
2956 inode->i_mapping, GFP_NOFS);
2957 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
2958 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
2959 mutex_init(&BTRFS_I(inode)->csum_mutex);
2960 mutex_init(&BTRFS_I(inode)->extent_mutex);
2961 mutex_init(&BTRFS_I(inode)->log_mutex);
2962}
2963
2964static int btrfs_init_locked_inode(struct inode *inode, void *p)
2965{
2966 struct btrfs_iget_args *args = p;
2967 inode->i_ino = args->ino;
2968 init_btrfs_i(inode);
2969 BTRFS_I(inode)->root = args->root;
2970 return 0;
2971}
2972
2973static int btrfs_find_actor(struct inode *inode, void *opaque)
2974{
2975 struct btrfs_iget_args *args = opaque;
2976 return (args->ino == inode->i_ino &&
2977 args->root == BTRFS_I(inode)->root);
2978}
2979
2980struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
2981 struct btrfs_root *root, int wait)
2982{
2983 struct inode *inode;
2984 struct btrfs_iget_args args;
2985 args.ino = objectid;
2986 args.root = root;
2987
2988 if (wait) {
2989 inode = ilookup5(s, objectid, btrfs_find_actor,
2990 (void *)&args);
2991 } else {
2992 inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
2993 (void *)&args);
2994 }
2995 return inode;
2996}
2997
2998struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
2999 struct btrfs_root *root)
3000{
3001 struct inode *inode;
3002 struct btrfs_iget_args args;
3003 args.ino = objectid;
3004 args.root = root;
3005
3006 inode = iget5_locked(s, objectid, btrfs_find_actor,
3007 btrfs_init_locked_inode,
3008 (void *)&args);
3009 return inode;
3010}
3011
3012/* Get an inode object given its location and corresponding root.
3013 * Returns in *is_new if the inode was read from disk
3014 */
3015struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3016 struct btrfs_root *root, int *is_new)
3017{
3018 struct inode *inode;
3019
3020 inode = btrfs_iget_locked(s, location->objectid, root);
3021 if (!inode)
3022 return ERR_PTR(-EACCES);
3023
3024 if (inode->i_state & I_NEW) {
3025 BTRFS_I(inode)->root = root;
3026 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3027 btrfs_read_locked_inode(inode);
3028 unlock_new_inode(inode);
3029 if (is_new)
3030 *is_new = 1;
3031 } else {
3032 if (is_new)
3033 *is_new = 0;
3034 }
3035
3036 return inode;
3037}
3038
3039struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3040{
3041 struct inode * inode;
3042 struct btrfs_inode *bi = BTRFS_I(dir);
3043 struct btrfs_root *root = bi->root;
3044 struct btrfs_root *sub_root = root;
3045 struct btrfs_key location;
3046 int ret, new;
3047
3048 if (dentry->d_name.len > BTRFS_NAME_LEN)
3049 return ERR_PTR(-ENAMETOOLONG);
3050
3051 ret = btrfs_inode_by_name(dir, dentry, &location);
3052
3053 if (ret < 0)
3054 return ERR_PTR(ret);
3055
3056 inode = NULL;
3057 if (location.objectid) {
3058 ret = fixup_tree_root_location(root, &location, &sub_root,
3059 dentry);
3060 if (ret < 0)
3061 return ERR_PTR(ret);
3062 if (ret > 0)
3063 return ERR_PTR(-ENOENT);
3064 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
3065 if (IS_ERR(inode))
3066 return ERR_CAST(inode);
3067 }
3068 return inode;
3069}
3070
3071static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
3072 struct nameidata *nd)
3073{
3074 struct inode *inode;
3075
3076 if (dentry->d_name.len > BTRFS_NAME_LEN)
3077 return ERR_PTR(-ENAMETOOLONG);
3078
3079 inode = btrfs_lookup_dentry(dir, dentry);
3080 if (IS_ERR(inode))
3081 return ERR_CAST(inode);
3082
3083 return d_splice_alias(inode, dentry);
3084}
3085
3086static unsigned char btrfs_filetype_table[] = {
3087 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
3088};
3089
3090static int btrfs_real_readdir(struct file *filp, void *dirent,
3091 filldir_t filldir)
3092{
3093 struct inode *inode = filp->f_dentry->d_inode;
3094 struct btrfs_root *root = BTRFS_I(inode)->root;
3095 struct btrfs_item *item;
3096 struct btrfs_dir_item *di;
3097 struct btrfs_key key;
3098 struct btrfs_key found_key;
3099 struct btrfs_path *path;
3100 int ret;
3101 u32 nritems;
3102 struct extent_buffer *leaf;
3103 int slot;
3104 int advance;
3105 unsigned char d_type;
3106 int over = 0;
3107 u32 di_cur;
3108 u32 di_total;
3109 u32 di_len;
3110 int key_type = BTRFS_DIR_INDEX_KEY;
3111 char tmp_name[32];
3112 char *name_ptr;
3113 int name_len;
3114
3115 /* FIXME, use a real flag for deciding about the key type */
3116 if (root->fs_info->tree_root == root)
3117 key_type = BTRFS_DIR_ITEM_KEY;
3118
3119 /* special case for "." */
3120 if (filp->f_pos == 0) {
3121 over = filldir(dirent, ".", 1,
3122 1, inode->i_ino,
3123 DT_DIR);
3124 if (over)
3125 return 0;
3126 filp->f_pos = 1;
3127 }
3128 /* special case for .., just use the back ref */
3129 if (filp->f_pos == 1) {
3130 u64 pino = parent_ino(filp->f_path.dentry);
3131 over = filldir(dirent, "..", 2,
3132 2, pino, DT_DIR);
3133 if (over)
3134 return 0;
3135 filp->f_pos = 2;
3136 }
3137 path = btrfs_alloc_path();
3138 path->reada = 2;
3139
3140 btrfs_set_key_type(&key, key_type);
3141 key.offset = filp->f_pos;
3142 key.objectid = inode->i_ino;
3143
3144 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3145 if (ret < 0)
3146 goto err;
3147 advance = 0;
3148
3149 while (1) {
3150 leaf = path->nodes[0];
3151 nritems = btrfs_header_nritems(leaf);
3152 slot = path->slots[0];
3153 if (advance || slot >= nritems) {
3154 if (slot >= nritems - 1) {
3155 ret = btrfs_next_leaf(root, path);
3156 if (ret)
3157 break;
3158 leaf = path->nodes[0];
3159 nritems = btrfs_header_nritems(leaf);
3160 slot = path->slots[0];
3161 } else {
3162 slot++;
3163 path->slots[0]++;
3164 }
3165 }
3166
3167 advance = 1;
3168 item = btrfs_item_nr(leaf, slot);
3169 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3170
3171 if (found_key.objectid != key.objectid)
3172 break;
3173 if (btrfs_key_type(&found_key) != key_type)
3174 break;
3175 if (found_key.offset < filp->f_pos)
3176 continue;
3177
3178 filp->f_pos = found_key.offset;
3179
3180 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
3181 di_cur = 0;
3182 di_total = btrfs_item_size(leaf, item);
3183
3184 while (di_cur < di_total) {
3185 struct btrfs_key location;
3186
3187 name_len = btrfs_dir_name_len(leaf, di);
3188 if (name_len <= sizeof(tmp_name)) {
3189 name_ptr = tmp_name;
3190 } else {
3191 name_ptr = kmalloc(name_len, GFP_NOFS);
3192 if (!name_ptr) {
3193 ret = -ENOMEM;
3194 goto err;
3195 }
3196 }
3197 read_extent_buffer(leaf, name_ptr,
3198 (unsigned long)(di + 1), name_len);
3199
3200 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
3201 btrfs_dir_item_key_to_cpu(leaf, di, &location);
3202
3203 /* is this a reference to our own snapshot? If so
3204 * skip it
3205 */
3206 if (location.type == BTRFS_ROOT_ITEM_KEY &&
3207 location.objectid == root->root_key.objectid) {
3208 over = 0;
3209 goto skip;
3210 }
3211 over = filldir(dirent, name_ptr, name_len,
3212 found_key.offset, location.objectid,
3213 d_type);
3214
3215skip:
3216 if (name_ptr != tmp_name)
3217 kfree(name_ptr);
3218
3219 if (over)
3220 goto nopos;
3221 di_len = btrfs_dir_name_len(leaf, di) +
3222 btrfs_dir_data_len(leaf, di) + sizeof(*di);
3223 di_cur += di_len;
3224 di = (struct btrfs_dir_item *)((char *)di + di_len);
3225 }
3226 }
3227
3228 /* Reached end of directory/root. Bump pos past the last item. */
3229 if (key_type == BTRFS_DIR_INDEX_KEY)
3230 filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
3231 else
3232 filp->f_pos++;
3233nopos:
3234 ret = 0;
3235err:
3236 btrfs_free_path(path);
3237 return ret;
3238}
3239
3240int btrfs_write_inode(struct inode *inode, int wait)
3241{
3242 struct btrfs_root *root = BTRFS_I(inode)->root;
3243 struct btrfs_trans_handle *trans;
3244 int ret = 0;
3245
3246 if (root->fs_info->btree_inode == inode)
3247 return 0;
3248
3249 if (wait) {
3250 trans = btrfs_join_transaction(root, 1);
3251 btrfs_set_trans_block_group(trans, inode);
3252 ret = btrfs_commit_transaction(trans, root);
3253 }
3254 return ret;
3255}
3256
3257/*
3258 * This is somewhat expensive, updating the tree every time the
3259 * inode changes. But, it is most likely to find the inode in cache.
3260 * FIXME, needs more benchmarking...there are no reasons other than performance
3261 * to keep or drop this code.
3262 */
3263void btrfs_dirty_inode(struct inode *inode)
3264{
3265 struct btrfs_root *root = BTRFS_I(inode)->root;
3266 struct btrfs_trans_handle *trans;
3267
3268 trans = btrfs_join_transaction(root, 1);
3269 btrfs_set_trans_block_group(trans, inode);
3270 btrfs_update_inode(trans, root, inode);
3271 btrfs_end_transaction(trans, root);
3272}
3273
3274/*
3275 * find the highest existing sequence number in a directory
3276 * and then set the in-memory index_cnt variable to reflect
3277 * free sequence numbers
3278 */
3279static int btrfs_set_inode_index_count(struct inode *inode)
3280{
3281 struct btrfs_root *root = BTRFS_I(inode)->root;
3282 struct btrfs_key key, found_key;
3283 struct btrfs_path *path;
3284 struct extent_buffer *leaf;
3285 int ret;
3286
3287 key.objectid = inode->i_ino;
3288 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
3289 key.offset = (u64)-1;
3290
3291 path = btrfs_alloc_path();
3292 if (!path)
3293 return -ENOMEM;
3294
3295 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3296 if (ret < 0)
3297 goto out;
3298 /* FIXME: we should be able to handle this */
3299 if (ret == 0)
3300 goto out;
3301 ret = 0;
3302
3303 /*
3304 * MAGIC NUMBER EXPLANATION:
3305 * since we search a directory based on f_pos we have to start at 2
3306 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
3307 * else has to start at 2
3308 */
3309 if (path->slots[0] == 0) {
3310 BTRFS_I(inode)->index_cnt = 2;
3311 goto out;
3312 }
3313
3314 path->slots[0]--;
3315
3316 leaf = path->nodes[0];
3317 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3318
3319 if (found_key.objectid != inode->i_ino ||
3320 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
3321 BTRFS_I(inode)->index_cnt = 2;
3322 goto out;
3323 }
3324
3325 BTRFS_I(inode)->index_cnt = found_key.offset + 1;
3326out:
3327 btrfs_free_path(path);
3328 return ret;
3329}
3330
3331/*
3332 * helper to find a free sequence number in a given directory. This current
3333 * code is very simple, later versions will do smarter things in the btree
3334 */
3335int btrfs_set_inode_index(struct inode *dir, u64 *index)
3336{
3337 int ret = 0;
3338
3339 if (BTRFS_I(dir)->index_cnt == (u64)-1) {
3340 ret = btrfs_set_inode_index_count(dir);
3341 if (ret) {
3342 return ret;
3343 }
3344 }
3345
3346 *index = BTRFS_I(dir)->index_cnt;
3347 BTRFS_I(dir)->index_cnt++;
3348
3349 return ret;
3350}
3351
3352static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3353 struct btrfs_root *root,
3354 struct inode *dir,
3355 const char *name, int name_len,
3356 u64 ref_objectid,
3357 u64 objectid,
3358 struct btrfs_block_group_cache *group,
3359 int mode, u64 *index)
3360{
3361 struct inode *inode;
3362 struct btrfs_inode_item *inode_item;
3363 struct btrfs_block_group_cache *new_inode_group;
3364 struct btrfs_key *location;
3365 struct btrfs_path *path;
3366 struct btrfs_inode_ref *ref;
3367 struct btrfs_key key[2];
3368 u32 sizes[2];
3369 unsigned long ptr;
3370 int ret;
3371 int owner;
3372
3373 path = btrfs_alloc_path();
3374 BUG_ON(!path);
3375
3376 inode = new_inode(root->fs_info->sb);
3377 if (!inode)
3378 return ERR_PTR(-ENOMEM);
3379
3380 if (dir) {
3381 ret = btrfs_set_inode_index(dir, index);
3382 if (ret)
3383 return ERR_PTR(ret);
3384 }
3385 /*
3386 * index_cnt is ignored for everything but a dir,
3387 * btrfs_get_inode_index_count has an explanation for the magic
3388 * number
3389 */
3390 init_btrfs_i(inode);
3391 BTRFS_I(inode)->index_cnt = 2;
3392 BTRFS_I(inode)->root = root;
3393 BTRFS_I(inode)->generation = trans->transid;
3394
3395 if (mode & S_IFDIR)
3396 owner = 0;
3397 else
3398 owner = 1;
3399 new_inode_group = btrfs_find_block_group(root, group, 0,
3400 BTRFS_BLOCK_GROUP_METADATA, owner);
3401 if (!new_inode_group) {
3402 printk("find_block group failed\n");
3403 new_inode_group = group;
3404 }
3405 BTRFS_I(inode)->block_group = new_inode_group;
3406
3407 key[0].objectid = objectid;
3408 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
3409 key[0].offset = 0;
3410
3411 key[1].objectid = objectid;
3412 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
3413 key[1].offset = ref_objectid;
3414
3415 sizes[0] = sizeof(struct btrfs_inode_item);
3416 sizes[1] = name_len + sizeof(*ref);
3417
3418 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3419 if (ret != 0)
3420 goto fail;
3421
3422 if (objectid > root->highest_inode)
3423 root->highest_inode = objectid;
3424
3425 inode->i_uid = current->fsuid;
3426 inode->i_gid = current->fsgid;
3427 inode->i_mode = mode;
3428 inode->i_ino = objectid;
3429 inode_set_bytes(inode, 0);
3430 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
3431 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3432 struct btrfs_inode_item);
3433 fill_inode_item(trans, path->nodes[0], inode_item, inode);
3434
3435 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3436 struct btrfs_inode_ref);
3437 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
3438 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
3439 ptr = (unsigned long)(ref + 1);
3440 write_extent_buffer(path->nodes[0], name, ptr, name_len);
3441
3442 btrfs_mark_buffer_dirty(path->nodes[0]);
3443 btrfs_free_path(path);
3444
3445 location = &BTRFS_I(inode)->location;
3446 location->objectid = objectid;
3447 location->offset = 0;
3448 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
3449
3450 insert_inode_hash(inode);
3451 return inode;
3452fail:
3453 if (dir)
3454 BTRFS_I(dir)->index_cnt--;
3455 btrfs_free_path(path);
3456 return ERR_PTR(ret);
3457}
3458
3459static inline u8 btrfs_inode_type(struct inode *inode)
3460{
3461 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
3462}
3463
3464/*
3465 * utility function to add 'inode' into 'parent_inode' with
3466 * a give name and a given sequence number.
3467 * if 'add_backref' is true, also insert a backref from the
3468 * inode to the parent directory.
3469 */
3470int btrfs_add_link(struct btrfs_trans_handle *trans,
3471 struct inode *parent_inode, struct inode *inode,
3472 const char *name, int name_len, int add_backref, u64 index)
3473{
3474 int ret;
3475 struct btrfs_key key;
3476 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
3477
3478 key.objectid = inode->i_ino;
3479 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
3480 key.offset = 0;
3481
3482 ret = btrfs_insert_dir_item(trans, root, name, name_len,
3483 parent_inode->i_ino,
3484 &key, btrfs_inode_type(inode),
3485 index);
3486 if (ret == 0) {
3487 if (add_backref) {
3488 ret = btrfs_insert_inode_ref(trans, root,
3489 name, name_len,
3490 inode->i_ino,
3491 parent_inode->i_ino,
3492 index);
3493 }
3494 btrfs_i_size_write(parent_inode, parent_inode->i_size +
3495 name_len * 2);
3496 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
3497 ret = btrfs_update_inode(trans, root, parent_inode);
3498 }
3499 return ret;
3500}
3501
3502static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
3503 struct dentry *dentry, struct inode *inode,
3504 int backref, u64 index)
3505{
3506 int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3507 inode, dentry->d_name.name,
3508 dentry->d_name.len, backref, index);
3509 if (!err) {
3510 d_instantiate(dentry, inode);
3511 return 0;
3512 }
3513 if (err > 0)
3514 err = -EEXIST;
3515 return err;
3516}
3517
3518static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3519 int mode, dev_t rdev)
3520{
3521 struct btrfs_trans_handle *trans;
3522 struct btrfs_root *root = BTRFS_I(dir)->root;
3523 struct inode *inode = NULL;
3524 int err;
3525 int drop_inode = 0;
3526 u64 objectid;
3527 unsigned long nr = 0;
3528 u64 index = 0;
3529
3530 if (!new_valid_dev(rdev))
3531 return -EINVAL;
3532
3533 err = btrfs_check_free_space(root, 1, 0);
3534 if (err)
3535 goto fail;
3536
3537 trans = btrfs_start_transaction(root, 1);
3538 btrfs_set_trans_block_group(trans, dir);
3539
3540 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3541 if (err) {
3542 err = -ENOSPC;
3543 goto out_unlock;
3544 }
3545
3546 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3547 dentry->d_name.len,
3548 dentry->d_parent->d_inode->i_ino, objectid,
3549 BTRFS_I(dir)->block_group, mode, &index);
3550 err = PTR_ERR(inode);
3551 if (IS_ERR(inode))
3552 goto out_unlock;
3553
3554 err = btrfs_init_acl(inode, dir);
3555 if (err) {
3556 drop_inode = 1;
3557 goto out_unlock;
3558 }
3559
3560 btrfs_set_trans_block_group(trans, inode);
3561 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3562 if (err)
3563 drop_inode = 1;
3564 else {
3565 inode->i_op = &btrfs_special_inode_operations;
3566 init_special_inode(inode, inode->i_mode, rdev);
3567 btrfs_update_inode(trans, root, inode);
3568 }
3569 dir->i_sb->s_dirt = 1;
3570 btrfs_update_inode_block_group(trans, inode);
3571 btrfs_update_inode_block_group(trans, dir);
3572out_unlock:
3573 nr = trans->blocks_used;
3574 btrfs_end_transaction_throttle(trans, root);
3575fail:
3576 if (drop_inode) {
3577 inode_dec_link_count(inode);
3578 iput(inode);
3579 }
3580 btrfs_btree_balance_dirty(root, nr);
3581 return err;
3582}
3583
3584static int btrfs_create(struct inode *dir, struct dentry *dentry,
3585 int mode, struct nameidata *nd)
3586{
3587 struct btrfs_trans_handle *trans;
3588 struct btrfs_root *root = BTRFS_I(dir)->root;
3589 struct inode *inode = NULL;
3590 int err;
3591 int drop_inode = 0;
3592 unsigned long nr = 0;
3593 u64 objectid;
3594 u64 index = 0;
3595
3596 err = btrfs_check_free_space(root, 1, 0);
3597 if (err)
3598 goto fail;
3599 trans = btrfs_start_transaction(root, 1);
3600 btrfs_set_trans_block_group(trans, dir);
3601
3602 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3603 if (err) {
3604 err = -ENOSPC;
3605 goto out_unlock;
3606 }
3607
3608 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3609 dentry->d_name.len,
3610 dentry->d_parent->d_inode->i_ino,
3611 objectid, BTRFS_I(dir)->block_group, mode,
3612 &index);
3613 err = PTR_ERR(inode);
3614 if (IS_ERR(inode))
3615 goto out_unlock;
3616
3617 err = btrfs_init_acl(inode, dir);
3618 if (err) {
3619 drop_inode = 1;
3620 goto out_unlock;
3621 }
3622
3623 btrfs_set_trans_block_group(trans, inode);
3624 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3625 if (err)
3626 drop_inode = 1;
3627 else {
3628 inode->i_mapping->a_ops = &btrfs_aops;
3629 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3630 inode->i_fop = &btrfs_file_operations;
3631 inode->i_op = &btrfs_file_inode_operations;
3632 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3633 }
3634 dir->i_sb->s_dirt = 1;
3635 btrfs_update_inode_block_group(trans, inode);
3636 btrfs_update_inode_block_group(trans, dir);
3637out_unlock:
3638 nr = trans->blocks_used;
3639 btrfs_end_transaction_throttle(trans, root);
3640fail:
3641 if (drop_inode) {
3642 inode_dec_link_count(inode);
3643 iput(inode);
3644 }
3645 btrfs_btree_balance_dirty(root, nr);
3646 return err;
3647}
3648
3649static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3650 struct dentry *dentry)
3651{
3652 struct btrfs_trans_handle *trans;
3653 struct btrfs_root *root = BTRFS_I(dir)->root;
3654 struct inode *inode = old_dentry->d_inode;
3655 u64 index;
3656 unsigned long nr = 0;
3657 int err;
3658 int drop_inode = 0;
3659
3660 if (inode->i_nlink == 0)
3661 return -ENOENT;
3662
3663 btrfs_inc_nlink(inode);
3664 err = btrfs_check_free_space(root, 1, 0);
3665 if (err)
3666 goto fail;
3667 err = btrfs_set_inode_index(dir, &index);
3668 if (err)
3669 goto fail;
3670
3671 trans = btrfs_start_transaction(root, 1);
3672
3673 btrfs_set_trans_block_group(trans, dir);
3674 atomic_inc(&inode->i_count);
3675
3676 err = btrfs_add_nondir(trans, dentry, inode, 1, index);
3677
3678 if (err)
3679 drop_inode = 1;
3680
3681 dir->i_sb->s_dirt = 1;
3682 btrfs_update_inode_block_group(trans, dir);
3683 err = btrfs_update_inode(trans, root, inode);
3684
3685 if (err)
3686 drop_inode = 1;
3687
3688 nr = trans->blocks_used;
3689 btrfs_end_transaction_throttle(trans, root);
3690fail:
3691 if (drop_inode) {
3692 inode_dec_link_count(inode);
3693 iput(inode);
3694 }
3695 btrfs_btree_balance_dirty(root, nr);
3696 return err;
3697}
3698
3699static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3700{
3701 struct inode *inode = NULL;
3702 struct btrfs_trans_handle *trans;
3703 struct btrfs_root *root = BTRFS_I(dir)->root;
3704 int err = 0;
3705 int drop_on_err = 0;
3706 u64 objectid = 0;
3707 u64 index = 0;
3708 unsigned long nr = 1;
3709
3710 err = btrfs_check_free_space(root, 1, 0);
3711 if (err)
3712 goto out_unlock;
3713
3714 trans = btrfs_start_transaction(root, 1);
3715 btrfs_set_trans_block_group(trans, dir);
3716
3717 if (IS_ERR(trans)) {
3718 err = PTR_ERR(trans);
3719 goto out_unlock;
3720 }
3721
3722 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3723 if (err) {
3724 err = -ENOSPC;
3725 goto out_unlock;
3726 }
3727
3728 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3729 dentry->d_name.len,
3730 dentry->d_parent->d_inode->i_ino, objectid,
3731 BTRFS_I(dir)->block_group, S_IFDIR | mode,
3732 &index);
3733 if (IS_ERR(inode)) {
3734 err = PTR_ERR(inode);
3735 goto out_fail;
3736 }
3737
3738 drop_on_err = 1;
3739
3740 err = btrfs_init_acl(inode, dir);
3741 if (err)
3742 goto out_fail;
3743
3744 inode->i_op = &btrfs_dir_inode_operations;
3745 inode->i_fop = &btrfs_dir_file_operations;
3746 btrfs_set_trans_block_group(trans, inode);
3747
3748 btrfs_i_size_write(inode, 0);
3749 err = btrfs_update_inode(trans, root, inode);
3750 if (err)
3751 goto out_fail;
3752
3753 err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3754 inode, dentry->d_name.name,
3755 dentry->d_name.len, 0, index);
3756 if (err)
3757 goto out_fail;
3758
3759 d_instantiate(dentry, inode);
3760 drop_on_err = 0;
3761 dir->i_sb->s_dirt = 1;
3762 btrfs_update_inode_block_group(trans, inode);
3763 btrfs_update_inode_block_group(trans, dir);
3764
3765out_fail:
3766 nr = trans->blocks_used;
3767 btrfs_end_transaction_throttle(trans, root);
3768
3769out_unlock:
3770 if (drop_on_err)
3771 iput(inode);
3772 btrfs_btree_balance_dirty(root, nr);
3773 return err;
3774}
3775
3776/* helper for btfs_get_extent. Given an existing extent in the tree,
3777 * and an extent that you want to insert, deal with overlap and insert
3778 * the new extent into the tree.
3779 */
3780static int merge_extent_mapping(struct extent_map_tree *em_tree,
3781 struct extent_map *existing,
3782 struct extent_map *em,
3783 u64 map_start, u64 map_len)
3784{
3785 u64 start_diff;
3786
3787 BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
3788 start_diff = map_start - em->start;
3789 em->start = map_start;
3790 em->len = map_len;
3791 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
3792 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
3793 em->block_start += start_diff;
3794 em->block_len -= start_diff;
3795 }
3796 return add_extent_mapping(em_tree, em);
3797}
3798
3799static noinline int uncompress_inline(struct btrfs_path *path,
3800 struct inode *inode, struct page *page,
3801 size_t pg_offset, u64 extent_offset,
3802 struct btrfs_file_extent_item *item)
3803{
3804 int ret;
3805 struct extent_buffer *leaf = path->nodes[0];
3806 char *tmp;
3807 size_t max_size;
3808 unsigned long inline_size;
3809 unsigned long ptr;
3810
3811 WARN_ON(pg_offset != 0);
3812 max_size = btrfs_file_extent_ram_bytes(leaf, item);
3813 inline_size = btrfs_file_extent_inline_item_len(leaf,
3814 btrfs_item_nr(leaf, path->slots[0]));
3815 tmp = kmalloc(inline_size, GFP_NOFS);
3816 ptr = btrfs_file_extent_inline_start(item);
3817
3818 read_extent_buffer(leaf, tmp, ptr, inline_size);
3819
3820 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
3821 ret = btrfs_zlib_decompress(tmp, page, extent_offset,
3822 inline_size, max_size);
3823 if (ret) {
3824 char *kaddr = kmap_atomic(page, KM_USER0);
3825 unsigned long copy_size = min_t(u64,
3826 PAGE_CACHE_SIZE - pg_offset,
3827 max_size - extent_offset);
3828 memset(kaddr + pg_offset, 0, copy_size);
3829 kunmap_atomic(kaddr, KM_USER0);
3830 }
3831 kfree(tmp);
3832 return 0;
3833}
3834
3835/*
3836 * a bit scary, this does extent mapping from logical file offset to the disk.
3837 * the ugly parts come from merging extents from the disk with the
3838 * in-ram representation. This gets more complex because of the data=ordered code,
3839 * where the in-ram extents might be locked pending data=ordered completion.
3840 *
3841 * This also copies inline extents directly into the page.
3842 */
/*
 * btrfs_get_extent - return the extent map that covers file offset 'start'.
 *
 * inode:     file the mapping is looked up for
 * page:      optional page; when given and 'create' is 0, inline extent
 *            data is copied or decompressed into it
 * pg_offset: offset inside 'page' at which inline data is placed
 * start/len: file range that is looked up
 * create:    when non-zero, an inline extent is returned right away
 *            without touching 'page'
 *
 * Returns a map found in the inode's extent map tree, a newly built map
 * that was inserted into that tree, or (for the early inline return) a
 * newly built map that was not inserted.  ERR_PTR(err) on failure.
 */
3843struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 3844 size_t pg_offset, u64 start, u64 len,
 3845 int create)
 3846{
 3847 int ret;
 3848 int err = 0;
 3849 u64 bytenr;
 3850 u64 extent_start = 0;
 3851 u64 extent_end = 0;
 3852 u64 objectid = inode->i_ino;
 3853 u32 found_type;
 3854 struct btrfs_path *path = NULL;
 3855 struct btrfs_root *root = BTRFS_I(inode)->root;
 3856 struct btrfs_file_extent_item *item;
 3857 struct extent_buffer *leaf;
 3858 struct btrfs_key found_key;
 3859 struct extent_map *em = NULL;
 3860 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 3861 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 3862 struct btrfs_trans_handle *trans = NULL;
 3863 int compressed;
 3864
 3865again:
/* step 1: try the in-memory extent map tree */
 3866 spin_lock(&em_tree->lock);
 3867 em = lookup_extent_mapping(em_tree, start, len);
 3868 if (em)
 3869 em->bdev = root->fs_info->fs_devices->latest_bdev;
 3870 spin_unlock(&em_tree->lock);
 3871
/*
 * A cached map is used only if it covers 'start' and is not an inline
 * extent that still has to be copied into 'page'.
 */
 3872 if (em) {
 3873 if (em->start > start || em->start + em->len <= start)
 3874 free_extent_map(em);
 3875 else if (em->block_start == EXTENT_MAP_INLINE && page)
 3876 free_extent_map(em);
 3877 else
 3878 goto out;
 3879 }
/* step 2: build a new map, starting from placeholder values */
 3880 em = alloc_extent_map(GFP_NOFS);
 3881 if (!em) {
 3882 err = -ENOMEM;
 3883 goto out;
 3884 }
 3885 em->bdev = root->fs_info->fs_devices->latest_bdev;
 3886 em->start = EXTENT_MAP_HOLE;
 3887 em->orig_start = EXTENT_MAP_HOLE;
 3888 em->len = (u64)-1;
 3889 em->block_len = (u64)-1;
 3890
/* the path survives a 'goto again' and is allocated only once */
 3891 if (!path) {
 3892 path = btrfs_alloc_path();
 3893 BUG_ON(!path);
 3894 }
 3895
 3896 ret = btrfs_lookup_file_extent(trans, root, path,
 3897 objectid, start, trans != NULL);
 3898 if (ret < 0) {
 3899 err = ret;
 3900 goto out;
 3901 }
 3902
/* no exact match: look at the item in front of the returned slot */
 3903 if (ret != 0) {
 3904 if (path->slots[0] == 0)
 3905 goto not_found;
 3906 path->slots[0]--;
 3907 }
 3908
 3909 leaf = path->nodes[0];
 3910 item = btrfs_item_ptr(leaf, path->slots[0],
 3911 struct btrfs_file_extent_item);
 3912 /* are we inside the extent that was found? */
 3913 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 3914 found_type = btrfs_key_type(&found_key);
 3915 if (found_key.objectid != objectid ||
 3916 found_type != BTRFS_EXTENT_DATA_KEY) {
 3917 goto not_found;
 3918 }
 3919
/* from here on found_type holds the file extent type, not the key type */
 3920 found_type = btrfs_file_extent_type(leaf, item);
 3921 extent_start = found_key.offset;
 3922 compressed = btrfs_file_extent_compression(leaf, item);
 3923 if (found_type == BTRFS_FILE_EXTENT_REG ||
 3924 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 3925 extent_end = extent_start +
 3926 btrfs_file_extent_num_bytes(leaf, item);
 3927 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
/* the end of an inline extent is rounded up using the sectorsize mask */
 3928 size_t size;
 3929 size = btrfs_file_extent_inline_len(leaf, item);
 3930 extent_end = (extent_start + size + root->sectorsize - 1) &
 3931 ~((u64)root->sectorsize - 1);
 3932 }
 3933
/*
 * The item found ends at or before 'start'.  Check the next item: if it
 * is an extent of this file that begins inside the requested range, the
 * hole runs from 'start' up to that extent.
 */
 3934 if (start >= extent_end) {
 3935 path->slots[0]++;
 3936 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 3937 ret = btrfs_next_leaf(root, path);
 3938 if (ret < 0) {
 3939 err = ret;
 3940 goto out;
 3941 }
 3942 if (ret > 0)
 3943 goto not_found;
 3944 leaf = path->nodes[0];
 3945 }
 3946 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 3947 if (found_key.objectid != objectid ||
 3948 found_key.type != BTRFS_EXTENT_DATA_KEY)
 3949 goto not_found;
 3950 if (start + len <= found_key.offset)
 3951 goto not_found;
 3952 em->start = start;
 3953 em->len = found_key.offset - start;
 3954 goto not_found_em;
 3955 }
 3956
 3957 if (found_type == BTRFS_FILE_EXTENT_REG ||
 3958 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 3959 em->start = extent_start;
 3960 em->len = extent_end - extent_start;
 3961 em->orig_start = extent_start -
 3962 btrfs_file_extent_offset(leaf, item);
 3963 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
/* a disk bytenr of 0 is mapped as a hole */
 3964 if (bytenr == 0) {
 3965 em->block_start = EXTENT_MAP_HOLE;
 3966 goto insert;
 3967 }
/*
 * Compressed: the map covers the whole on-disk extent.
 * Uncompressed: the extent offset is added to the disk start.
 */
 3968 if (compressed) {
 3969 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 3970 em->block_start = bytenr;
 3971 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
 3972 item);
 3973 } else {
 3974 bytenr += btrfs_file_extent_offset(leaf, item);
 3975 em->block_start = bytenr;
 3976 em->block_len = em->len;
 3977 if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
 3978 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
 3979 }
 3980 goto insert;
 3981 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 3982 unsigned long ptr;
 3983 char *map;
 3984 size_t size;
 3985 size_t extent_offset;
 3986 size_t copy_size;
 3987
 3988 em->block_start = EXTENT_MAP_INLINE;
/*
 * Without a page, or with 'create' set, the inline map is returned as is;
 * this path skips the insertion into the extent map tree.
 */
 3989 if (!page || create) {
 3990 em->start = extent_start;
 3991 em->len = extent_end - extent_start;
 3992 goto out;
 3993 }
 3994
 3995 size = btrfs_file_extent_inline_len(leaf, item);
 3996 extent_offset = page_offset(page) + pg_offset - extent_start;
 3997 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
 3998 size - extent_offset);
 3999 em->start = extent_start + extent_offset;
 4000 em->len = (copy_size + root->sectorsize - 1) &
 4001 ~((u64)root->sectorsize - 1);
 4002 em->orig_start = EXTENT_MAP_INLINE;
 4003 if (compressed)
 4004 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 4005 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
/* fill the page from the leaf: decompress zlib data, otherwise plain copy */
 4006 if (create == 0 && !PageUptodate(page)) {
 4007 if (btrfs_file_extent_compression(leaf, item) ==
 4008 BTRFS_COMPRESS_ZLIB) {
 4009 ret = uncompress_inline(path, inode, page,
 4010 pg_offset,
 4011 extent_offset, item);
 4012 BUG_ON(ret);
 4013 } else {
 4014 map = kmap(page);
 4015 read_extent_buffer(leaf, map + pg_offset, ptr,
 4016 copy_size);
 4017 kunmap(page);
 4018 }
 4019 flush_dcache_page(page);
/*
 * NOTE(review): 'create' is always 0 here because of the early
 * "!page || create" return above, so this branch looks unreachable.
 * That also makes it the only place 'trans' is ever set.  The kunmap()
 * in the !trans case has no matching kmap() - confirm before enabling.
 */
 4020 } else if (create && PageUptodate(page)) {
 4021 if (!trans) {
 4022 kunmap(page);
 4023 free_extent_map(em);
 4024 em = NULL;
 4025 btrfs_release_path(root, path);
 4026 trans = btrfs_join_transaction(root, 1);
 4027 goto again;
 4028 }
 4029 map = kmap(page);
 4030 write_extent_buffer(leaf, map + pg_offset, ptr,
 4031 copy_size);
 4032 kunmap(page);
 4033 btrfs_mark_buffer_dirty(leaf);
 4034 }
 4035 set_extent_uptodate(io_tree, em->start,
 4036 extent_map_end(em) - 1, GFP_NOFS);
 4037 goto insert;
 4038 } else {
 4039 printk("unkknown found_type %d\n", found_type);
 4040 WARN_ON(1);
 4041 }
/* nothing on disk for the requested range: describe all of it as a hole */
 4042not_found:
 4043 em->start = start;
 4044 em->len = len;
 4045not_found_em:
 4046 em->block_start = EXTENT_MAP_HOLE;
 4047 set_bit(EXTENT_FLAG_VACANCY, &em->flags);
/* step 3: insert the new map; it has to cover 'start' */
 4048insert:
 4049 btrfs_release_path(root, path);
 4050 if (em->start > start || extent_map_end(em) <= start) {
 4051 printk("bad extent! em: [%Lu %Lu] passed [%Lu %Lu]\n", em->start, em->len, start, len);
 4052 err = -EIO;
 4053 goto out;
 4054 }
 4055
 4056 err = 0;
 4057 spin_lock(&em_tree->lock);
 4058 ret = add_extent_mapping(em_tree, em);
 4059 /* it is possible that someone inserted the extent into the tree
 4060 * while we had the lock dropped. It is also possible that
 4061 * an overlapping map exists in the tree
 4062 */
 4063 if (ret == -EEXIST) {
 4064 struct extent_map *existing;
 4065
 4066 ret = 0;
 4067
/* an existing map is used directly only if it covers 'start' */
 4068 existing = lookup_extent_mapping(em_tree, start, len);
 4069 if (existing && (existing->start > start ||
 4070 existing->start + existing->len <= start)) {
 4071 free_extent_map(existing);
 4072 existing = NULL;
 4073 }
/*
 * Otherwise look for a map overlapping the new one and retry the
 * insertion with the new map trimmed to one sector at 'start'.
 */
 4074 if (!existing) {
 4075 existing = lookup_extent_mapping(em_tree, em->start,
 4076 em->len);
 4077 if (existing) {
 4078 err = merge_extent_mapping(em_tree, existing,
 4079 em, start,
 4080 root->sectorsize);
 4081 free_extent_map(existing);
 4082 if (err) {
 4083 free_extent_map(em);
 4084 em = NULL;
 4085 }
 4086 } else {
 4087 err = -EIO;
 4088 printk("failing to insert %Lu %Lu\n",
 4089 start, len);
 4090 free_extent_map(em);
 4091 em = NULL;
 4092 }
 4093 } else {
 4094 free_extent_map(em);
 4095 em = existing;
 4096 err = 0;
 4097 }
 4098 }
 4099 spin_unlock(&em_tree->lock);
/* common exit: release the path, end a joined transaction, map err to ERR_PTR */
 4100out:
 4101 if (path)
 4102 btrfs_free_path(path);
 4103 if (trans) {
 4104 ret = btrfs_end_transaction(trans, root);
 4105 if (!err) {
 4106 err = ret;
 4107 }
 4108 }
 4109 if (err) {
 4110 free_extent_map(em);
 4111 WARN_ON(1);
 4112 return ERR_PTR(err);
 4113 }
 4114 return em;
 4115}
4116
4117static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4118 const struct iovec *iov, loff_t offset,
4119 unsigned long nr_segs)
4120{
4121 return -EINVAL;
4122}
4123
4124static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
4125{
4126 return extent_bmap(mapping, iblock, btrfs_get_extent);
4127}
4128
4129int btrfs_readpage(struct file *file, struct page *page)
4130{
4131 struct extent_io_tree *tree;
4132 tree = &BTRFS_I(page->mapping->host)->io_tree;
4133 return extent_read_full_page(tree, page, btrfs_get_extent);
4134}
4135
4136static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
4137{
4138 struct extent_io_tree *tree;
4139
4140
4141 if (current->flags & PF_MEMALLOC) {
4142 redirty_page_for_writepage(wbc, page);
4143 unlock_page(page);
4144 return 0;
4145 }
4146 tree = &BTRFS_I(page->mapping->host)->io_tree;
4147 return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
4148}
4149
4150int btrfs_writepages(struct address_space *mapping,
4151 struct writeback_control *wbc)
4152{
4153 struct extent_io_tree *tree;
4154
4155 tree = &BTRFS_I(mapping->host)->io_tree;
4156 return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
4157}
4158
4159static int
4160btrfs_readpages(struct file *file, struct address_space *mapping,
4161 struct list_head *pages, unsigned nr_pages)
4162{
4163 struct extent_io_tree *tree;
4164 tree = &BTRFS_I(mapping->host)->io_tree;
4165 return extent_readpages(tree, mapping, pages, nr_pages,
4166 btrfs_get_extent);
4167}
4168static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4169{
4170 struct extent_io_tree *tree;
4171 struct extent_map_tree *map;
4172 int ret;
4173
4174 tree = &BTRFS_I(page->mapping->host)->io_tree;
4175 map = &BTRFS_I(page->mapping->host)->extent_tree;
4176 ret = try_release_extent_mapping(map, tree, page, gfp_flags);
4177 if (ret == 1) {
4178 ClearPagePrivate(page);
4179 set_page_private(page, 0);
4180 page_cache_release(page);
4181 }
4182 return ret;
4183}
4184
4185static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4186{
4187 if (PageWriteback(page) || PageDirty(page))
4188 return 0;
4189 return __btrfs_releasepage(page, gfp_flags);
4190}
4191
4192static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4193{
4194 struct extent_io_tree *tree;
4195 struct btrfs_ordered_extent *ordered;
4196 u64 page_start = page_offset(page);
4197 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4198
4199 wait_on_page_writeback(page);
4200 tree = &BTRFS_I(page->mapping->host)->io_tree;
4201 if (offset) {
4202 btrfs_releasepage(page, GFP_NOFS);
4203 return;
4204 }
4205
4206 lock_extent(tree, page_start, page_end, GFP_NOFS);
4207 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
4208 page_offset(page));
4209 if (ordered) {
4210 /*
4211 * IO on this page will never be started, so we need
4212 * to account for any ordered extents now
4213 */
4214 clear_extent_bit(tree, page_start, page_end,
4215 EXTENT_DIRTY | EXTENT_DELALLOC |
4216 EXTENT_LOCKED, 1, 0, GFP_NOFS);
4217 btrfs_finish_ordered_io(page->mapping->host,
4218 page_start, page_end);
4219 btrfs_put_ordered_extent(ordered);
4220 lock_extent(tree, page_start, page_end, GFP_NOFS);
4221 }
4222 clear_extent_bit(tree, page_start, page_end,
4223 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4224 EXTENT_ORDERED,
4225 1, 1, GFP_NOFS);
4226 __btrfs_releasepage(page, GFP_NOFS);
4227
4228 ClearPageChecked(page);
4229 if (PagePrivate(page)) {
4230 ClearPagePrivate(page);
4231 set_page_private(page, 0);
4232 page_cache_release(page);
4233 }
4234}
4235
4236/*
4237 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
4238 * called from a page fault handler when a page is first dirtied. Hence we must
4239 * be careful to check for EOF conditions here. We set the page up correctly
4240 * for a written page which means we get ENOSPC checking when writing into
4241 * holes and correct delalloc and unwritten extent mapping on filesystems that
4242 * support these features.
4243 *
4244 * We are not allowed to take the i_mutex here so we have to play games to
4245 * protect against truncate races as the page could now be beyond EOF. Because
4246 * vmtruncate() writes the inode size before removing pages, once we have the
4247 * page lock we can determine safely if the page is beyond EOF. If it is not
4248 * beyond EOF, then the page is guaranteed safe against truncation until we
4249 * unlock the page.
4250 */
4251int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4252{
4253 struct inode *inode = fdentry(vma->vm_file)->d_inode;
4254 struct btrfs_root *root = BTRFS_I(inode)->root;
4255 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4256 struct btrfs_ordered_extent *ordered;
4257 char *kaddr;
4258 unsigned long zero_start;
4259 loff_t size;
4260 int ret;
4261 u64 page_start;
4262 u64 page_end;
4263
4264 ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
4265 if (ret)
4266 goto out;
4267
4268 ret = -EINVAL;
4269again:
4270 lock_page(page);
4271 size = i_size_read(inode);
4272 page_start = page_offset(page);
4273 page_end = page_start + PAGE_CACHE_SIZE - 1;
4274
4275 if ((page->mapping != inode->i_mapping) ||
4276 (page_start >= size)) {
4277 /* page got truncated out from underneath us */
4278 goto out_unlock;
4279 }
4280 wait_on_page_writeback(page);
4281
4282 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
4283 set_page_extent_mapped(page);
4284
4285 /*
4286 * we can't set the delalloc bits if there are pending ordered
4287 * extents. Drop our locks and wait for them to finish
4288 */
4289 ordered = btrfs_lookup_ordered_extent(inode, page_start);
4290 if (ordered) {
4291 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4292 unlock_page(page);
4293 btrfs_start_ordered_extent(inode, ordered, 1);
4294 btrfs_put_ordered_extent(ordered);
4295 goto again;
4296 }
4297
4298 btrfs_set_extent_delalloc(inode, page_start, page_end);
4299 ret = 0;
4300
4301 /* page is wholly or partially inside EOF */
4302 if (page_start + PAGE_CACHE_SIZE > size)
4303 zero_start = size & ~PAGE_CACHE_MASK;
4304 else
4305 zero_start = PAGE_CACHE_SIZE;
4306
4307 if (zero_start != PAGE_CACHE_SIZE) {
4308 kaddr = kmap(page);
4309 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
4310 flush_dcache_page(page);
4311 kunmap(page);
4312 }
4313 ClearPageChecked(page);
4314 set_page_dirty(page);
4315 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4316
4317out_unlock:
4318 unlock_page(page);
4319out:
4320 return ret;
4321}
4322
4323static void btrfs_truncate(struct inode *inode)
4324{
4325 struct btrfs_root *root = BTRFS_I(inode)->root;
4326 int ret;
4327 struct btrfs_trans_handle *trans;
4328 unsigned long nr;
4329 u64 mask = root->sectorsize - 1;
4330
4331 if (!S_ISREG(inode->i_mode))
4332 return;
4333 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4334 return;
4335
4336 btrfs_truncate_page(inode->i_mapping, inode->i_size);
4337 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4338
4339 trans = btrfs_start_transaction(root, 1);
4340 btrfs_set_trans_block_group(trans, inode);
4341 btrfs_i_size_write(inode, inode->i_size);
4342
4343 ret = btrfs_orphan_add(trans, inode);
4344 if (ret)
4345 goto out;
4346 /* FIXME, add redo link to tree so we don't leak on crash */
4347 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
4348 BTRFS_EXTENT_DATA_KEY);
4349 btrfs_update_inode(trans, root, inode);
4350
4351 ret = btrfs_orphan_del(trans, inode);
4352 BUG_ON(ret);
4353
4354out:
4355 nr = trans->blocks_used;
4356 ret = btrfs_end_transaction_throttle(trans, root);
4357 BUG_ON(ret);
4358 btrfs_btree_balance_dirty(root, nr);
4359}
4360
4361/*
4362 * Invalidate a single dcache entry at the root of the filesystem.
4363 * Needed after creation of snapshot or subvolume.
4364 */
4365void btrfs_invalidate_dcache_root(struct inode *dir, char *name,
4366 int namelen)
4367{
4368 struct dentry *alias, *entry;
4369 struct qstr qstr;
4370
4371 alias = d_find_alias(dir);
4372 if (alias) {
4373 qstr.name = name;
4374 qstr.len = namelen;
4375 /* change me if btrfs ever gets a d_hash operation */
4376 qstr.hash = full_name_hash(qstr.name, qstr.len);
4377 entry = d_lookup(alias, &qstr);
4378 dput(alias);
4379 if (entry) {
4380 d_invalidate(entry);
4381 dput(entry);
4382 }
4383 }
4384}
4385
4386/*
4387 * create a new subvolume directory/inode (helper for the ioctl).
4388 */
4389int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
4390 struct btrfs_trans_handle *trans, u64 new_dirid,
4391 struct btrfs_block_group_cache *block_group)
4392{
4393 struct inode *inode;
4394 int error;
4395 u64 index = 0;
4396
4397 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
4398 new_dirid, block_group, S_IFDIR | 0700, &index);
4399 if (IS_ERR(inode))
4400 return PTR_ERR(inode);
4401 inode->i_op = &btrfs_dir_inode_operations;
4402 inode->i_fop = &btrfs_dir_file_operations;
4403
4404 inode->i_nlink = 1;
4405 btrfs_i_size_write(inode, 0);
4406
4407 error = btrfs_update_inode(trans, new_root, inode);
4408 if (error)
4409 return error;
4410
4411 d_instantiate(dentry, inode);
4412 return 0;
4413}
4414
4415/* helper function for file defrag and space balancing. This
4416 * forces readahead on a given range of bytes in an inode
4417 */
4418unsigned long btrfs_force_ra(struct address_space *mapping,
4419 struct file_ra_state *ra, struct file *file,
4420 pgoff_t offset, pgoff_t last_index)
4421{
4422 pgoff_t req_size = last_index - offset + 1;
4423
4424 page_cache_sync_readahead(mapping, ra, file, offset, req_size);
4425 return offset + req_size;
4426}
4427
4428struct inode *btrfs_alloc_inode(struct super_block *sb)
4429{
4430 struct btrfs_inode *ei;
4431
4432 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
4433 if (!ei)
4434 return NULL;
4435 ei->last_trans = 0;
4436 ei->logged_trans = 0;
4437 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4438 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4439 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4440 INIT_LIST_HEAD(&ei->i_orphan);
4441 return &ei->vfs_inode;
4442}
4443
4444void btrfs_destroy_inode(struct inode *inode)
4445{
4446 struct btrfs_ordered_extent *ordered;
4447 WARN_ON(!list_empty(&inode->i_dentry));
4448 WARN_ON(inode->i_data.nrpages);
4449
4450 if (BTRFS_I(inode)->i_acl &&
4451 BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
4452 posix_acl_release(BTRFS_I(inode)->i_acl);
4453 if (BTRFS_I(inode)->i_default_acl &&
4454 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4455 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4456
4457 spin_lock(&BTRFS_I(inode)->root->list_lock);
4458 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4459 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4460 " list\n", inode->i_ino);
4461 dump_stack();
4462 }
4463 spin_unlock(&BTRFS_I(inode)->root->list_lock);
4464
4465 while(1) {
4466 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
4467 if (!ordered)
4468 break;
4469 else {
4470 printk("found ordered extent %Lu %Lu\n",
4471 ordered->file_offset, ordered->len);
4472 btrfs_remove_ordered_extent(inode, ordered);
4473 btrfs_put_ordered_extent(ordered);
4474 btrfs_put_ordered_extent(ordered);
4475 }
4476 }
4477 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
4478 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4479}
4480
4481static void init_once(void *foo)
4482{
4483 struct btrfs_inode *ei = (struct btrfs_inode *) foo;
4484
4485 inode_init_once(&ei->vfs_inode);
4486}
4487
4488void btrfs_destroy_cachep(void)
4489{
4490 if (btrfs_inode_cachep)
4491 kmem_cache_destroy(btrfs_inode_cachep);
4492 if (btrfs_trans_handle_cachep)
4493 kmem_cache_destroy(btrfs_trans_handle_cachep);
4494 if (btrfs_transaction_cachep)
4495 kmem_cache_destroy(btrfs_transaction_cachep);
4496 if (btrfs_bit_radix_cachep)
4497 kmem_cache_destroy(btrfs_bit_radix_cachep);
4498 if (btrfs_path_cachep)
4499 kmem_cache_destroy(btrfs_path_cachep);
4500}
4501
4502struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
4503 unsigned long extra_flags,
4504 void (*ctor)(void *))
4505{
4506 return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
4507 SLAB_MEM_SPREAD | extra_flags), ctor);
4508}
4509
4510int btrfs_init_cachep(void)
4511{
4512 btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
4513 sizeof(struct btrfs_inode),
4514 0, init_once);
4515 if (!btrfs_inode_cachep)
4516 goto fail;
4517 btrfs_trans_handle_cachep =
4518 btrfs_cache_create("btrfs_trans_handle_cache",
4519 sizeof(struct btrfs_trans_handle),
4520 0, NULL);
4521 if (!btrfs_trans_handle_cachep)
4522 goto fail;
4523 btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
4524 sizeof(struct btrfs_transaction),
4525 0, NULL);
4526 if (!btrfs_transaction_cachep)
4527 goto fail;
4528 btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
4529 sizeof(struct btrfs_path),
4530 0, NULL);
4531 if (!btrfs_path_cachep)
4532 goto fail;
4533 btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
4534 SLAB_DESTROY_BY_RCU, NULL);
4535 if (!btrfs_bit_radix_cachep)
4536 goto fail;
4537 return 0;
4538fail:
4539 btrfs_destroy_cachep();
4540 return -ENOMEM;
4541}
4542
4543static int btrfs_getattr(struct vfsmount *mnt,
4544 struct dentry *dentry, struct kstat *stat)
4545{
4546 struct inode *inode = dentry->d_inode;
4547 generic_fillattr(inode, stat);
4548 stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
4549 stat->blksize = PAGE_CACHE_SIZE;
4550 stat->blocks = (inode_get_bytes(inode) +
4551 BTRFS_I(inode)->delalloc_bytes) >> 9;
4552 return 0;
4553}
4554
4555static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
4556 struct inode * new_dir,struct dentry *new_dentry)
4557{
4558 struct btrfs_trans_handle *trans;
4559 struct btrfs_root *root = BTRFS_I(old_dir)->root;
4560 struct inode *new_inode = new_dentry->d_inode;
4561 struct inode *old_inode = old_dentry->d_inode;
4562 struct timespec ctime = CURRENT_TIME;
4563 u64 index = 0;
4564 int ret;
4565
4566 /* we're not allowed to rename between subvolumes */
4567 if (BTRFS_I(old_inode)->root->root_key.objectid !=
4568 BTRFS_I(new_dir)->root->root_key.objectid)
4569 return -EXDEV;
4570
4571 if (S_ISDIR(old_inode->i_mode) && new_inode &&
4572 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
4573 return -ENOTEMPTY;
4574 }
4575
4576 /* to rename a snapshot or subvolume, we need to juggle the
4577 * backrefs. This isn't coded yet
4578 */
4579 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
4580 return -EXDEV;
4581
4582 ret = btrfs_check_free_space(root, 1, 0);
4583 if (ret)
4584 goto out_unlock;
4585
4586 trans = btrfs_start_transaction(root, 1);
4587
4588 btrfs_set_trans_block_group(trans, new_dir);
4589
4590 btrfs_inc_nlink(old_dentry->d_inode);
4591 old_dir->i_ctime = old_dir->i_mtime = ctime;
4592 new_dir->i_ctime = new_dir->i_mtime = ctime;
4593 old_inode->i_ctime = ctime;
4594
4595 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4596 old_dentry->d_name.name,
4597 old_dentry->d_name.len);
4598 if (ret)
4599 goto out_fail;
4600
4601 if (new_inode) {
4602 new_inode->i_ctime = CURRENT_TIME;
4603 ret = btrfs_unlink_inode(trans, root, new_dir,
4604 new_dentry->d_inode,
4605 new_dentry->d_name.name,
4606 new_dentry->d_name.len);
4607 if (ret)
4608 goto out_fail;
4609 if (new_inode->i_nlink == 0) {
4610 ret = btrfs_orphan_add(trans, new_dentry->d_inode);
4611 if (ret)
4612 goto out_fail;
4613 }
4614
4615 }
4616 ret = btrfs_set_inode_index(new_dir, &index);
4617 if (ret)
4618 goto out_fail;
4619
4620 ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
4621 old_inode, new_dentry->d_name.name,
4622 new_dentry->d_name.len, 1, index);
4623 if (ret)
4624 goto out_fail;
4625
4626out_fail:
4627 btrfs_end_transaction_throttle(trans, root);
4628out_unlock:
4629 return ret;
4630}
4631
4632/*
4633 * some fairly slow code that needs optimization. This walks the list
4634 * of all the inodes with pending delalloc and forces them to disk.
4635 */
4636int btrfs_start_delalloc_inodes(struct btrfs_root *root)
4637{
4638 struct list_head *head = &root->fs_info->delalloc_inodes;
4639 struct btrfs_inode *binode;
4640 struct inode *inode;
4641 unsigned long flags;
4642
4643 if (root->fs_info->sb->s_flags & MS_RDONLY)
4644 return -EROFS;
4645
4646 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
4647 while(!list_empty(head)) {
4648 binode = list_entry(head->next, struct btrfs_inode,
4649 delalloc_inodes);
4650 inode = igrab(&binode->vfs_inode);
4651 if (!inode)
4652 list_del_init(&binode->delalloc_inodes);
4653 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
4654 if (inode) {
4655 filemap_flush(inode->i_mapping);
4656 iput(inode);
4657 }
4658 cond_resched();
4659 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
4660 }
4661 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
4662
4663 /* the filemap_flush will queue IO into the worker threads, but
4664 * we have to make sure the IO is actually started and that
4665 * ordered extents get created before we return
4666 */
4667 atomic_inc(&root->fs_info->async_submit_draining);
4668 while(atomic_read(&root->fs_info->nr_async_submits) ||
4669 atomic_read(&root->fs_info->async_delalloc_pages)) {
4670 wait_event(root->fs_info->async_submit_wait,
4671 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
4672 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
4673 }
4674 atomic_dec(&root->fs_info->async_submit_draining);
4675 return 0;
4676}
4677
4678static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4679 const char *symname)
4680{
4681 struct btrfs_trans_handle *trans;
4682 struct btrfs_root *root = BTRFS_I(dir)->root;
4683 struct btrfs_path *path;
4684 struct btrfs_key key;
4685 struct inode *inode = NULL;
4686 int err;
4687 int drop_inode = 0;
4688 u64 objectid;
4689 u64 index = 0 ;
4690 int name_len;
4691 int datasize;
4692 unsigned long ptr;
4693 struct btrfs_file_extent_item *ei;
4694 struct extent_buffer *leaf;
4695 unsigned long nr = 0;
4696
4697 name_len = strlen(symname) + 1;
4698 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4699 return -ENAMETOOLONG;
4700
4701 err = btrfs_check_free_space(root, 1, 0);
4702 if (err)
4703 goto out_fail;
4704
4705 trans = btrfs_start_transaction(root, 1);
4706 btrfs_set_trans_block_group(trans, dir);
4707
4708 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4709 if (err) {
4710 err = -ENOSPC;
4711 goto out_unlock;
4712 }
4713
4714 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4715 dentry->d_name.len,
4716 dentry->d_parent->d_inode->i_ino, objectid,
4717 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
4718 &index);
4719 err = PTR_ERR(inode);
4720 if (IS_ERR(inode))
4721 goto out_unlock;
4722
4723 err = btrfs_init_acl(inode, dir);
4724 if (err) {
4725 drop_inode = 1;
4726 goto out_unlock;
4727 }
4728
4729 btrfs_set_trans_block_group(trans, inode);
4730 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4731 if (err)
4732 drop_inode = 1;
4733 else {
4734 inode->i_mapping->a_ops = &btrfs_aops;
4735 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4736 inode->i_fop = &btrfs_file_operations;
4737 inode->i_op = &btrfs_file_inode_operations;
4738 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4739 }
4740 dir->i_sb->s_dirt = 1;
4741 btrfs_update_inode_block_group(trans, inode);
4742 btrfs_update_inode_block_group(trans, dir);
4743 if (drop_inode)
4744 goto out_unlock;
4745
4746 path = btrfs_alloc_path();
4747 BUG_ON(!path);
4748 key.objectid = inode->i_ino;
4749 key.offset = 0;
4750 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
4751 datasize = btrfs_file_extent_calc_inline_size(name_len);
4752 err = btrfs_insert_empty_item(trans, root, path, &key,
4753 datasize);
4754 if (err) {
4755 drop_inode = 1;
4756 goto out_unlock;
4757 }
4758 leaf = path->nodes[0];
4759 ei = btrfs_item_ptr(leaf, path->slots[0],
4760 struct btrfs_file_extent_item);
4761 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
4762 btrfs_set_file_extent_type(leaf, ei,
4763 BTRFS_FILE_EXTENT_INLINE);
4764 btrfs_set_file_extent_encryption(leaf, ei, 0);
4765 btrfs_set_file_extent_compression(leaf, ei, 0);
4766 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
4767 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
4768
4769 ptr = btrfs_file_extent_inline_start(ei);
4770 write_extent_buffer(leaf, symname, ptr, name_len);
4771 btrfs_mark_buffer_dirty(leaf);
4772 btrfs_free_path(path);
4773
4774 inode->i_op = &btrfs_symlink_inode_operations;
4775 inode->i_mapping->a_ops = &btrfs_symlink_aops;
4776 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4777 inode_set_bytes(inode, name_len);
4778 btrfs_i_size_write(inode, name_len - 1);
4779 err = btrfs_update_inode(trans, root, inode);
4780 if (err)
4781 drop_inode = 1;
4782
4783out_unlock:
4784 nr = trans->blocks_used;
4785 btrfs_end_transaction_throttle(trans, root);
4786out_fail:
4787 if (drop_inode) {
4788 inode_dec_link_count(inode);
4789 iput(inode);
4790 }
4791 btrfs_btree_balance_dirty(root, nr);
4792 return err;
4793}
4794
4795static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4796 u64 alloc_hint, int mode)
4797{
4798 struct btrfs_trans_handle *trans;
4799 struct btrfs_root *root = BTRFS_I(inode)->root;
4800 struct btrfs_key ins;
4801 u64 alloc_size;
4802 u64 cur_offset = start;
4803 u64 num_bytes = end - start;
4804 int ret = 0;
4805
4806 trans = btrfs_join_transaction(root, 1);
4807 BUG_ON(!trans);
4808 btrfs_set_trans_block_group(trans, inode);
4809
4810 while (num_bytes > 0) {
4811 alloc_size = min(num_bytes, root->fs_info->max_extent);
4812 ret = btrfs_reserve_extent(trans, root, alloc_size,
4813 root->sectorsize, 0, alloc_hint,
4814 (u64)-1, &ins, 1);
4815 if (ret) {
4816 WARN_ON(1);
4817 goto out;
4818 }
4819 ret = insert_reserved_file_extent(trans, inode,
4820 cur_offset, ins.objectid,
4821 ins.offset, ins.offset,
4822 ins.offset, 0, 0, 0,
4823 BTRFS_FILE_EXTENT_PREALLOC);
4824 BUG_ON(ret);
4825 num_bytes -= ins.offset;
4826 cur_offset += ins.offset;
4827 alloc_hint = ins.objectid + ins.offset;
4828 }
4829out:
4830 if (cur_offset > start) {
4831 inode->i_ctime = CURRENT_TIME;
4832 btrfs_set_flag(inode, PREALLOC);
4833 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4834 cur_offset > i_size_read(inode))
4835 btrfs_i_size_write(inode, cur_offset);
4836 ret = btrfs_update_inode(trans, root, inode);
4837 BUG_ON(ret);
4838 }
4839
4840 btrfs_end_transaction(trans, root);
4841 return ret;
4842}
4843
4844static long btrfs_fallocate(struct inode *inode, int mode,
4845 loff_t offset, loff_t len)
4846{
4847 u64 cur_offset;
4848 u64 last_byte;
4849 u64 alloc_start;
4850 u64 alloc_end;
4851 u64 alloc_hint = 0;
4852 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
4853 struct extent_map *em;
4854 int ret;
4855
4856 alloc_start = offset & ~mask;
4857 alloc_end = (offset + len + mask) & ~mask;
4858
4859 mutex_lock(&inode->i_mutex);
4860 if (alloc_start > inode->i_size) {
4861 ret = btrfs_cont_expand(inode, alloc_start);
4862 if (ret)
4863 goto out;
4864 }
4865
4866 while (1) {
4867 struct btrfs_ordered_extent *ordered;
4868 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
4869 alloc_end - 1, GFP_NOFS);
4870 ordered = btrfs_lookup_first_ordered_extent(inode,
4871 alloc_end - 1);
4872 if (ordered &&
4873 ordered->file_offset + ordered->len > alloc_start &&
4874 ordered->file_offset < alloc_end) {
4875 btrfs_put_ordered_extent(ordered);
4876 unlock_extent(&BTRFS_I(inode)->io_tree,
4877 alloc_start, alloc_end - 1, GFP_NOFS);
4878 btrfs_wait_ordered_range(inode, alloc_start,
4879 alloc_end - alloc_start);
4880 } else {
4881 if (ordered)
4882 btrfs_put_ordered_extent(ordered);
4883 break;
4884 }
4885 }
4886
4887 cur_offset = alloc_start;
4888 while (1) {
4889 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4890 alloc_end - cur_offset, 0);
4891 BUG_ON(IS_ERR(em) || !em);
4892 last_byte = min(extent_map_end(em), alloc_end);
4893 last_byte = (last_byte + mask) & ~mask;
4894 if (em->block_start == EXTENT_MAP_HOLE) {
4895 ret = prealloc_file_range(inode, cur_offset,
4896 last_byte, alloc_hint, mode);
4897 if (ret < 0) {
4898 free_extent_map(em);
4899 break;
4900 }
4901 }
4902 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
4903 alloc_hint = em->block_start;
4904 free_extent_map(em);
4905
4906 cur_offset = last_byte;
4907 if (cur_offset >= alloc_end) {
4908 ret = 0;
4909 break;
4910 }
4911 }
4912 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
4913 GFP_NOFS);
4914out:
4915 mutex_unlock(&inode->i_mutex);
4916 return ret;
4917}
4918
/*
 * set_page_dirty hook for btrfs_aops: hands @page straight to
 * __set_page_dirty_nobuffers() and returns its result.
 */
static int btrfs_set_page_dirty(struct page *page)
{
	int dirtied = __set_page_dirty_nobuffers(page);

	return dirtied;
}
4923
4924static int btrfs_permission(struct inode *inode, int mask)
4925{
4926 if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
4927 return -EACCES;
4928 return generic_permission(inode, mask, btrfs_check_acl);
4929}
4930
4931static struct inode_operations btrfs_dir_inode_operations = {
4932 .getattr = btrfs_getattr,
4933 .lookup = btrfs_lookup,
4934 .create = btrfs_create,
4935 .unlink = btrfs_unlink,
4936 .link = btrfs_link,
4937 .mkdir = btrfs_mkdir,
4938 .rmdir = btrfs_rmdir,
4939 .rename = btrfs_rename,
4940 .symlink = btrfs_symlink,
4941 .setattr = btrfs_setattr,
4942 .mknod = btrfs_mknod,
4943 .setxattr = btrfs_setxattr,
4944 .getxattr = btrfs_getxattr,
4945 .listxattr = btrfs_listxattr,
4946 .removexattr = btrfs_removexattr,
4947 .permission = btrfs_permission,
4948};
4949static struct inode_operations btrfs_dir_ro_inode_operations = {
4950 .lookup = btrfs_lookup,
4951 .permission = btrfs_permission,
4952};
4953static struct file_operations btrfs_dir_file_operations = {
4954 .llseek = generic_file_llseek,
4955 .read = generic_read_dir,
4956 .readdir = btrfs_real_readdir,
4957 .unlocked_ioctl = btrfs_ioctl,
4958#ifdef CONFIG_COMPAT
4959 .compat_ioctl = btrfs_ioctl,
4960#endif
4961 .release = btrfs_release_file,
4962 .fsync = btrfs_sync_file,
4963};
4964
4965static struct extent_io_ops btrfs_extent_io_ops = {
4966 .fill_delalloc = run_delalloc_range,
4967 .submit_bio_hook = btrfs_submit_bio_hook,
4968 .merge_bio_hook = btrfs_merge_bio_hook,
4969 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
4970 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
4971 .writepage_start_hook = btrfs_writepage_start_hook,
4972 .readpage_io_failed_hook = btrfs_io_failed_hook,
4973 .set_bit_hook = btrfs_set_bit_hook,
4974 .clear_bit_hook = btrfs_clear_bit_hook,
4975};
4976
4977static struct address_space_operations btrfs_aops = {
4978 .readpage = btrfs_readpage,
4979 .writepage = btrfs_writepage,
4980 .writepages = btrfs_writepages,
4981 .readpages = btrfs_readpages,
4982 .sync_page = block_sync_page,
4983 .bmap = btrfs_bmap,
4984 .direct_IO = btrfs_direct_IO,
4985 .invalidatepage = btrfs_invalidatepage,
4986 .releasepage = btrfs_releasepage,
4987 .set_page_dirty = btrfs_set_page_dirty,
4988};
4989
4990static struct address_space_operations btrfs_symlink_aops = {
4991 .readpage = btrfs_readpage,
4992 .writepage = btrfs_writepage,
4993 .invalidatepage = btrfs_invalidatepage,
4994 .releasepage = btrfs_releasepage,
4995};
4996
4997static struct inode_operations btrfs_file_inode_operations = {
4998 .truncate = btrfs_truncate,
4999 .getattr = btrfs_getattr,
5000 .setattr = btrfs_setattr,
5001 .setxattr = btrfs_setxattr,
5002 .getxattr = btrfs_getxattr,
5003 .listxattr = btrfs_listxattr,
5004 .removexattr = btrfs_removexattr,
5005 .permission = btrfs_permission,
5006 .fallocate = btrfs_fallocate,
5007};
5008static struct inode_operations btrfs_special_inode_operations = {
5009 .getattr = btrfs_getattr,
5010 .setattr = btrfs_setattr,
5011 .permission = btrfs_permission,
5012 .setxattr = btrfs_setxattr,
5013 .getxattr = btrfs_getxattr,
5014 .listxattr = btrfs_listxattr,
5015 .removexattr = btrfs_removexattr,
5016};
5017static struct inode_operations btrfs_symlink_inode_operations = {
5018 .readlink = generic_readlink,
5019 .follow_link = page_follow_link_light,
5020 .put_link = page_put_link,
5021 .permission = btrfs_permission,
5022};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
new file mode 100644
index 000000000000..8828109fa58e
--- /dev/null
+++ b/fs/btrfs/ioctl.c
@@ -0,0 +1,1150 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/fsnotify.h>
25#include <linux/pagemap.h>
26#include <linux/highmem.h>
27#include <linux/time.h>
28#include <linux/init.h>
29#include <linux/string.h>
30#include <linux/smp_lock.h>
31#include <linux/backing-dev.h>
32#include <linux/mount.h>
33#include <linux/mpage.h>
34#include <linux/namei.h>
35#include <linux/swap.h>
36#include <linux/writeback.h>
37#include <linux/statfs.h>
38#include <linux/compat.h>
39#include <linux/bit_spinlock.h>
40#include <linux/security.h>
41#include <linux/version.h>
42#include <linux/xattr.h>
43#include <linux/vmalloc.h>
44#include "ctree.h"
45#include "disk-io.h"
46#include "transaction.h"
47#include "btrfs_inode.h"
48#include "ioctl.h"
49#include "print-tree.h"
50#include "volumes.h"
51#include "locking.h"
52
53
54
55static noinline int create_subvol(struct btrfs_root *root,
56 struct dentry *dentry,
57 char *name, int namelen)
58{
59 struct btrfs_trans_handle *trans;
60 struct btrfs_key key;
61 struct btrfs_root_item root_item;
62 struct btrfs_inode_item *inode_item;
63 struct extent_buffer *leaf;
64 struct btrfs_root *new_root = root;
65 struct inode *dir;
66 int ret;
67 int err;
68 u64 objectid;
69 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
70 u64 index = 0;
71 unsigned long nr = 1;
72
73 ret = btrfs_check_free_space(root, 1, 0);
74 if (ret)
75 goto fail_commit;
76
77 trans = btrfs_start_transaction(root, 1);
78 BUG_ON(!trans);
79
80 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
81 0, &objectid);
82 if (ret)
83 goto fail;
84
85 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
86 objectid, trans->transid, 0, 0, 0);
87 if (IS_ERR(leaf)) {
88 ret = PTR_ERR(leaf);
89 goto fail;
90 }
91
92 btrfs_set_header_nritems(leaf, 0);
93 btrfs_set_header_level(leaf, 0);
94 btrfs_set_header_bytenr(leaf, leaf->start);
95 btrfs_set_header_generation(leaf, trans->transid);
96 btrfs_set_header_owner(leaf, objectid);
97
98 write_extent_buffer(leaf, root->fs_info->fsid,
99 (unsigned long)btrfs_header_fsid(leaf),
100 BTRFS_FSID_SIZE);
101 btrfs_mark_buffer_dirty(leaf);
102
103 inode_item = &root_item.inode;
104 memset(inode_item, 0, sizeof(*inode_item));
105 inode_item->generation = cpu_to_le64(1);
106 inode_item->size = cpu_to_le64(3);
107 inode_item->nlink = cpu_to_le32(1);
108 inode_item->nbytes = cpu_to_le64(root->leafsize);
109 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
110
111 btrfs_set_root_bytenr(&root_item, leaf->start);
112 btrfs_set_root_generation(&root_item, trans->transid);
113 btrfs_set_root_level(&root_item, 0);
114 btrfs_set_root_refs(&root_item, 1);
115 btrfs_set_root_used(&root_item, 0);
116 btrfs_set_root_last_snapshot(&root_item, 0);
117
118 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
119 root_item.drop_level = 0;
120
121 btrfs_tree_unlock(leaf);
122 free_extent_buffer(leaf);
123 leaf = NULL;
124
125 btrfs_set_root_dirid(&root_item, new_dirid);
126
127 key.objectid = objectid;
128 key.offset = 1;
129 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
130 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
131 &root_item);
132 if (ret)
133 goto fail;
134
135 /*
136 * insert the directory item
137 */
138 key.offset = (u64)-1;
139 dir = dentry->d_parent->d_inode;
140 ret = btrfs_set_inode_index(dir, &index);
141 BUG_ON(ret);
142
143 ret = btrfs_insert_dir_item(trans, root,
144 name, namelen, dir->i_ino, &key,
145 BTRFS_FT_DIR, index);
146 if (ret)
147 goto fail;
148
149 /* add the backref first */
150 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
151 objectid, BTRFS_ROOT_BACKREF_KEY,
152 root->root_key.objectid,
153 dir->i_ino, index, name, namelen);
154
155 BUG_ON(ret);
156
157 /* now add the forward ref */
158 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
159 root->root_key.objectid, BTRFS_ROOT_REF_KEY,
160 objectid,
161 dir->i_ino, index, name, namelen);
162
163 BUG_ON(ret);
164
165 ret = btrfs_commit_transaction(trans, root);
166 if (ret)
167 goto fail_commit;
168
169 new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
170 BUG_ON(!new_root);
171
172 trans = btrfs_start_transaction(new_root, 1);
173 BUG_ON(!trans);
174
175 ret = btrfs_create_subvol_root(new_root, dentry, trans, new_dirid,
176 BTRFS_I(dir)->block_group);
177 if (ret)
178 goto fail;
179
180fail:
181 nr = trans->blocks_used;
182 err = btrfs_commit_transaction(trans, new_root);
183 if (err && !ret)
184 ret = err;
185fail_commit:
186 btrfs_btree_balance_dirty(root, nr);
187 return ret;
188}
189
190static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
191 char *name, int namelen)
192{
193 struct btrfs_pending_snapshot *pending_snapshot;
194 struct btrfs_trans_handle *trans;
195 int ret = 0;
196 int err;
197 unsigned long nr = 0;
198
199 if (!root->ref_cows)
200 return -EINVAL;
201
202 ret = btrfs_check_free_space(root, 1, 0);
203 if (ret)
204 goto fail_unlock;
205
206 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
207 if (!pending_snapshot) {
208 ret = -ENOMEM;
209 goto fail_unlock;
210 }
211 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
212 if (!pending_snapshot->name) {
213 ret = -ENOMEM;
214 kfree(pending_snapshot);
215 goto fail_unlock;
216 }
217 memcpy(pending_snapshot->name, name, namelen);
218 pending_snapshot->name[namelen] = '\0';
219 pending_snapshot->dentry = dentry;
220 trans = btrfs_start_transaction(root, 1);
221 BUG_ON(!trans);
222 pending_snapshot->root = root;
223 list_add(&pending_snapshot->list,
224 &trans->transaction->pending_snapshots);
225 err = btrfs_commit_transaction(trans, root);
226
227fail_unlock:
228 btrfs_btree_balance_dirty(root, nr);
229 return ret;
230}
231
232/* copy of may_create in fs/namei.c() */
233static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
234{
235 if (child->d_inode)
236 return -EEXIST;
237 if (IS_DEADDIR(dir))
238 return -ENOENT;
239 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
240}
241
242/*
243 * Create a new subvolume below @parent. This is largely modeled after
244 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
245 * inside this filesystem so it's quite a bit simpler.
246 */
247static noinline int btrfs_mksubvol(struct path *parent, char *name,
248 int mode, int namelen,
249 struct btrfs_root *snap_src)
250{
251 struct dentry *dentry;
252 int error;
253
254 mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
255
256 dentry = lookup_one_len(name, parent->dentry, namelen);
257 error = PTR_ERR(dentry);
258 if (IS_ERR(dentry))
259 goto out_unlock;
260
261 error = -EEXIST;
262 if (dentry->d_inode)
263 goto out_dput;
264
265 if (!IS_POSIXACL(parent->dentry->d_inode))
266 mode &= ~current->fs->umask;
267
268 error = mnt_want_write(parent->mnt);
269 if (error)
270 goto out_dput;
271
272 error = btrfs_may_create(parent->dentry->d_inode, dentry);
273 if (error)
274 goto out_drop_write;
275
276 /*
277 * Actually perform the low-level subvolume creation after all
278 * this VFS fuzz.
279 *
280 * Eventually we want to pass in an inode under which we create this
281 * subvolume, but for now all are under the filesystem root.
282 *
283 * Also we should pass on the mode eventually to allow creating new
284 * subvolume with specific mode bits.
285 */
286 if (snap_src) {
287 struct dentry *dir = dentry->d_parent;
288 struct dentry *test = dir->d_parent;
289 struct btrfs_path *path = btrfs_alloc_path();
290 int ret;
291 u64 test_oid;
292 u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
293
294 test_oid = snap_src->root_key.objectid;
295
296 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
297 path, parent_oid, test_oid);
298 if (ret == 0)
299 goto create;
300 btrfs_release_path(snap_src->fs_info->tree_root, path);
301
302 /* we need to make sure we aren't creating a directory loop
303 * by taking a snapshot of something that has our current
304 * subvol in its directory tree. So, this loops through
305 * the dentries and checks the forward refs for each subvolume
306 * to see if is references the subvolume where we are
307 * placing this new snapshot.
308 */
309 while(1) {
310 if (!test ||
311 dir == snap_src->fs_info->sb->s_root ||
312 test == snap_src->fs_info->sb->s_root ||
313 test->d_inode->i_sb != snap_src->fs_info->sb) {
314 break;
315 }
316 if (S_ISLNK(test->d_inode->i_mode)) {
317 printk("Symlink in snapshot path, failed\n");
318 error = -EMLINK;
319 btrfs_free_path(path);
320 goto out_drop_write;
321 }
322 test_oid =
323 BTRFS_I(test->d_inode)->root->root_key.objectid;
324 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
325 path, test_oid, parent_oid);
326 if (ret == 0) {
327 printk("Snapshot creation failed, looping\n");
328 error = -EMLINK;
329 btrfs_free_path(path);
330 goto out_drop_write;
331 }
332 btrfs_release_path(snap_src->fs_info->tree_root, path);
333 test = test->d_parent;
334 }
335create:
336 btrfs_free_path(path);
337 error = create_snapshot(snap_src, dentry, name, namelen);
338 } else {
339 error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
340 dentry, name, namelen);
341 }
342 if (error)
343 goto out_drop_write;
344
345 fsnotify_mkdir(parent->dentry->d_inode, dentry);
346out_drop_write:
347 mnt_drop_write(parent->mnt);
348out_dput:
349 dput(dentry);
350out_unlock:
351 mutex_unlock(&parent->dentry->d_inode->i_mutex);
352 return error;
353}
354
355
356int btrfs_defrag_file(struct file *file)
357{
358 struct inode *inode = fdentry(file)->d_inode;
359 struct btrfs_root *root = BTRFS_I(inode)->root;
360 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
361 struct btrfs_ordered_extent *ordered;
362 struct page *page;
363 unsigned long last_index;
364 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
365 unsigned long total_read = 0;
366 u64 page_start;
367 u64 page_end;
368 unsigned long i;
369 int ret;
370
371 ret = btrfs_check_free_space(root, inode->i_size, 0);
372 if (ret)
373 return -ENOSPC;
374
375 mutex_lock(&inode->i_mutex);
376 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
377 for (i = 0; i <= last_index; i++) {
378 if (total_read % ra_pages == 0) {
379 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
380 min(last_index, i + ra_pages - 1));
381 }
382 total_read++;
383again:
384 page = grab_cache_page(inode->i_mapping, i);
385 if (!page)
386 goto out_unlock;
387 if (!PageUptodate(page)) {
388 btrfs_readpage(NULL, page);
389 lock_page(page);
390 if (!PageUptodate(page)) {
391 unlock_page(page);
392 page_cache_release(page);
393 goto out_unlock;
394 }
395 }
396
397 wait_on_page_writeback(page);
398
399 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
400 page_end = page_start + PAGE_CACHE_SIZE - 1;
401 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
402
403 ordered = btrfs_lookup_ordered_extent(inode, page_start);
404 if (ordered) {
405 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
406 unlock_page(page);
407 page_cache_release(page);
408 btrfs_start_ordered_extent(inode, ordered, 1);
409 btrfs_put_ordered_extent(ordered);
410 goto again;
411 }
412 set_page_extent_mapped(page);
413
414 /*
415 * this makes sure page_mkwrite is called on the
416 * page if it is dirtied again later
417 */
418 clear_page_dirty_for_io(page);
419
420 btrfs_set_extent_delalloc(inode, page_start, page_end);
421
422 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
423 set_page_dirty(page);
424 unlock_page(page);
425 page_cache_release(page);
426 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
427 }
428
429out_unlock:
430 mutex_unlock(&inode->i_mutex);
431 return 0;
432}
433
434/*
435 * Called inside transaction, so use GFP_NOFS
436 */
437
438static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
439{
440 u64 new_size;
441 u64 old_size;
442 u64 devid = 1;
443 struct btrfs_ioctl_vol_args *vol_args;
444 struct btrfs_trans_handle *trans;
445 struct btrfs_device *device = NULL;
446 char *sizestr;
447 char *devstr = NULL;
448 int ret = 0;
449 int namelen;
450 int mod = 0;
451
452 if (root->fs_info->sb->s_flags & MS_RDONLY)
453 return -EROFS;
454
455 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
456
457 if (!vol_args)
458 return -ENOMEM;
459
460 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
461 ret = -EFAULT;
462 goto out;
463 }
464
465 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
466 namelen = strlen(vol_args->name);
467
468 mutex_lock(&root->fs_info->volume_mutex);
469 sizestr = vol_args->name;
470 devstr = strchr(sizestr, ':');
471 if (devstr) {
472 char *end;
473 sizestr = devstr + 1;
474 *devstr = '\0';
475 devstr = vol_args->name;
476 devid = simple_strtoull(devstr, &end, 10);
477 printk(KERN_INFO "resizing devid %llu\n", devid);
478 }
479 device = btrfs_find_device(root, devid, NULL, NULL);
480 if (!device) {
481 printk(KERN_INFO "resizer unable to find device %llu\n", devid);
482 ret = -EINVAL;
483 goto out_unlock;
484 }
485 if (!strcmp(sizestr, "max"))
486 new_size = device->bdev->bd_inode->i_size;
487 else {
488 if (sizestr[0] == '-') {
489 mod = -1;
490 sizestr++;
491 } else if (sizestr[0] == '+') {
492 mod = 1;
493 sizestr++;
494 }
495 new_size = btrfs_parse_size(sizestr);
496 if (new_size == 0) {
497 ret = -EINVAL;
498 goto out_unlock;
499 }
500 }
501
502 old_size = device->total_bytes;
503
504 if (mod < 0) {
505 if (new_size > old_size) {
506 ret = -EINVAL;
507 goto out_unlock;
508 }
509 new_size = old_size - new_size;
510 } else if (mod > 0) {
511 new_size = old_size + new_size;
512 }
513
514 if (new_size < 256 * 1024 * 1024) {
515 ret = -EINVAL;
516 goto out_unlock;
517 }
518 if (new_size > device->bdev->bd_inode->i_size) {
519 ret = -EFBIG;
520 goto out_unlock;
521 }
522
523 do_div(new_size, root->sectorsize);
524 new_size *= root->sectorsize;
525
526 printk(KERN_INFO "new size for %s is %llu\n",
527 device->name, (unsigned long long)new_size);
528
529 if (new_size > old_size) {
530 trans = btrfs_start_transaction(root, 1);
531 ret = btrfs_grow_device(trans, device, new_size);
532 btrfs_commit_transaction(trans, root);
533 } else {
534 ret = btrfs_shrink_device(device, new_size);
535 }
536
537out_unlock:
538 mutex_unlock(&root->fs_info->volume_mutex);
539out:
540 kfree(vol_args);
541 return ret;
542}
543
544static noinline int btrfs_ioctl_snap_create(struct file *file,
545 void __user *arg, int subvol)
546{
547 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
548 struct btrfs_ioctl_vol_args *vol_args;
549 struct btrfs_dir_item *di;
550 struct btrfs_path *path;
551 struct file *src_file;
552 u64 root_dirid;
553 int namelen;
554 int ret = 0;
555
556 if (root->fs_info->sb->s_flags & MS_RDONLY)
557 return -EROFS;
558
559 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
560
561 if (!vol_args)
562 return -ENOMEM;
563
564 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
565 ret = -EFAULT;
566 goto out;
567 }
568
569 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
570 namelen = strlen(vol_args->name);
571 if (strchr(vol_args->name, '/')) {
572 ret = -EINVAL;
573 goto out;
574 }
575
576 path = btrfs_alloc_path();
577 if (!path) {
578 ret = -ENOMEM;
579 goto out;
580 }
581
582 root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
583 di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
584 path, root_dirid,
585 vol_args->name, namelen, 0);
586 btrfs_free_path(path);
587
588 if (di && !IS_ERR(di)) {
589 ret = -EEXIST;
590 goto out;
591 }
592
593 if (IS_ERR(di)) {
594 ret = PTR_ERR(di);
595 goto out;
596 }
597
598 if (subvol) {
599 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
600 file->f_path.dentry->d_inode->i_mode,
601 namelen, NULL);
602 } else {
603 struct inode *src_inode;
604 src_file = fget(vol_args->fd);
605 if (!src_file) {
606 ret = -EINVAL;
607 goto out;
608 }
609
610 src_inode = src_file->f_path.dentry->d_inode;
611 if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
612 printk("btrfs: Snapshot src from another FS\n");
613 ret = -EINVAL;
614 fput(src_file);
615 goto out;
616 }
617 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
618 file->f_path.dentry->d_inode->i_mode,
619 namelen, BTRFS_I(src_inode)->root);
620 fput(src_file);
621 }
622
623out:
624 kfree(vol_args);
625 return ret;
626}
627
628static int btrfs_ioctl_defrag(struct file *file)
629{
630 struct inode *inode = fdentry(file)->d_inode;
631 struct btrfs_root *root = BTRFS_I(inode)->root;
632 int ret;
633
634 ret = mnt_want_write(file->f_path.mnt);
635 if (ret)
636 return ret;
637
638 switch (inode->i_mode & S_IFMT) {
639 case S_IFDIR:
640 btrfs_defrag_root(root, 0);
641 btrfs_defrag_root(root->fs_info->extent_root, 0);
642 break;
643 case S_IFREG:
644 btrfs_defrag_file(file);
645 break;
646 }
647
648 return 0;
649}
650
651long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
652{
653 struct btrfs_ioctl_vol_args *vol_args;
654 int ret;
655
656 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
657
658 if (!vol_args)
659 return -ENOMEM;
660
661 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
662 ret = -EFAULT;
663 goto out;
664 }
665 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
666 ret = btrfs_init_new_device(root, vol_args->name);
667
668out:
669 kfree(vol_args);
670 return ret;
671}
672
673long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
674{
675 struct btrfs_ioctl_vol_args *vol_args;
676 int ret;
677
678 if (root->fs_info->sb->s_flags & MS_RDONLY)
679 return -EROFS;
680
681 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
682
683 if (!vol_args)
684 return -ENOMEM;
685
686 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
687 ret = -EFAULT;
688 goto out;
689 }
690 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
691 ret = btrfs_rm_device(root, vol_args->name);
692
693out:
694 kfree(vol_args);
695 return ret;
696}
697
698long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 off,
699 u64 olen, u64 destoff)
700{
701 struct inode *inode = fdentry(file)->d_inode;
702 struct btrfs_root *root = BTRFS_I(inode)->root;
703 struct file *src_file;
704 struct inode *src;
705 struct btrfs_trans_handle *trans;
706 struct btrfs_path *path;
707 struct extent_buffer *leaf;
708 char *buf;
709 struct btrfs_key key;
710 u32 nritems;
711 int slot;
712 int ret;
713 u64 len = olen;
714 u64 bs = root->fs_info->sb->s_blocksize;
715 u64 hint_byte;
716
717 /*
718 * TODO:
719 * - split compressed inline extents. annoying: we need to
720 * decompress into destination's address_space (the file offset
721 * may change, so source mapping won't do), then recompress (or
722 * otherwise reinsert) a subrange.
723 * - allow ranges within the same file to be cloned (provided
724 * they don't overlap)?
725 */
726
727 ret = mnt_want_write(file->f_path.mnt);
728 if (ret)
729 return ret;
730
731 src_file = fget(srcfd);
732 if (!src_file)
733 return -EBADF;
734 src = src_file->f_dentry->d_inode;
735
736 ret = -EINVAL;
737 if (src == inode)
738 goto out_fput;
739
740 ret = -EISDIR;
741 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
742 goto out_fput;
743
744 ret = -EXDEV;
745 if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
746 goto out_fput;
747
748 ret = -ENOMEM;
749 buf = vmalloc(btrfs_level_size(root, 0));
750 if (!buf)
751 goto out_fput;
752
753 path = btrfs_alloc_path();
754 if (!path) {
755 vfree(buf);
756 goto out_fput;
757 }
758 path->reada = 2;
759
760 if (inode < src) {
761 mutex_lock(&inode->i_mutex);
762 mutex_lock(&src->i_mutex);
763 } else {
764 mutex_lock(&src->i_mutex);
765 mutex_lock(&inode->i_mutex);
766 }
767
768 /* determine range to clone */
769 ret = -EINVAL;
770 if (off >= src->i_size || off + len > src->i_size)
771 goto out_unlock;
772 if (len == 0)
773 olen = len = src->i_size - off;
774 /* if we extend to eof, continue to block boundary */
775 if (off + len == src->i_size)
776 len = ((src->i_size + bs-1) & ~(bs-1))
777 - off;
778
779 /* verify the end result is block aligned */
780 if ((off & (bs-1)) ||
781 ((off + len) & (bs-1)))
782 goto out_unlock;
783
784 printk("final src extent is %llu~%llu\n", off, len);
785 printk("final dst extent is %llu~%llu\n", destoff, len);
786
787 /* do any pending delalloc/csum calc on src, one way or
788 another, and lock file content */
789 while (1) {
790 struct btrfs_ordered_extent *ordered;
791 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
792 ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
793 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
794 break;
795 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
796 if (ordered)
797 btrfs_put_ordered_extent(ordered);
798 btrfs_wait_ordered_range(src, off, off+len);
799 }
800
801 trans = btrfs_start_transaction(root, 1);
802 BUG_ON(!trans);
803
804 /* punch hole in destination first */
805 btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte);
806
807 /* clone data */
808 key.objectid = src->i_ino;
809 key.type = BTRFS_EXTENT_DATA_KEY;
810 key.offset = 0;
811
812 while (1) {
813 /*
814 * note the key will change type as we walk through the
815 * tree.
816 */
817 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
818 if (ret < 0)
819 goto out;
820
821 nritems = btrfs_header_nritems(path->nodes[0]);
822 if (path->slots[0] >= nritems) {
823 ret = btrfs_next_leaf(root, path);
824 if (ret < 0)
825 goto out;
826 if (ret > 0)
827 break;
828 nritems = btrfs_header_nritems(path->nodes[0]);
829 }
830 leaf = path->nodes[0];
831 slot = path->slots[0];
832
833 btrfs_item_key_to_cpu(leaf, &key, slot);
834 if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
835 key.objectid != src->i_ino)
836 break;
837
838 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
839 struct btrfs_file_extent_item *extent;
840 int type;
841 u32 size;
842 struct btrfs_key new_key;
843 u64 disko = 0, diskl = 0;
844 u64 datao = 0, datal = 0;
845 u8 comp;
846
847 size = btrfs_item_size_nr(leaf, slot);
848 read_extent_buffer(leaf, buf,
849 btrfs_item_ptr_offset(leaf, slot),
850 size);
851
852 extent = btrfs_item_ptr(leaf, slot,
853 struct btrfs_file_extent_item);
854 comp = btrfs_file_extent_compression(leaf, extent);
855 type = btrfs_file_extent_type(leaf, extent);
856 if (type == BTRFS_FILE_EXTENT_REG) {
857 disko = btrfs_file_extent_disk_bytenr(leaf, extent);
858 diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
859 datao = btrfs_file_extent_offset(leaf, extent);
860 datal = btrfs_file_extent_num_bytes(leaf, extent);
861 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
862 /* take upper bound, may be compressed */
863 datal = btrfs_file_extent_ram_bytes(leaf,
864 extent);
865 }
866 btrfs_release_path(root, path);
867
868 if (key.offset + datal < off ||
869 key.offset >= off+len)
870 goto next;
871
872 memcpy(&new_key, &key, sizeof(new_key));
873 new_key.objectid = inode->i_ino;
874 new_key.offset = key.offset + destoff - off;
875
876 if (type == BTRFS_FILE_EXTENT_REG) {
877 ret = btrfs_insert_empty_item(trans, root, path,
878 &new_key, size);
879 if (ret)
880 goto out;
881
882 leaf = path->nodes[0];
883 slot = path->slots[0];
884 write_extent_buffer(leaf, buf,
885 btrfs_item_ptr_offset(leaf, slot),
886 size);
887
888 extent = btrfs_item_ptr(leaf, slot,
889 struct btrfs_file_extent_item);
890 printk(" orig disk %llu~%llu data %llu~%llu\n",
891 disko, diskl, datao, datal);
892
893 if (off > key.offset) {
894 datao += off - key.offset;
895 datal -= off - key.offset;
896 }
897 if (key.offset + datao + datal + key.offset >
898 off + len)
899 datal = off + len - key.offset - datao;
900 /* disko == 0 means it's a hole */
901 if (!disko)
902 datao = 0;
903 printk(" final disk %llu~%llu data %llu~%llu\n",
904 disko, diskl, datao, datal);
905
906 btrfs_set_file_extent_offset(leaf, extent,
907 datao);
908 btrfs_set_file_extent_num_bytes(leaf, extent,
909 datal);
910 if (disko) {
911 inode_add_bytes(inode, datal);
912 ret = btrfs_inc_extent_ref(trans, root,
913 disko, diskl, leaf->start,
914 root->root_key.objectid,
915 trans->transid,
916 inode->i_ino);
917 BUG_ON(ret);
918 }
919 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
920 u64 skip = 0;
921 u64 trim = 0;
922 if (off > key.offset) {
923 skip = off - key.offset;
924 new_key.offset += skip;
925 }
926 if (key.offset + datal > off+len)
927 trim = key.offset + datal - (off+len);
928 printk("len %lld skip %lld trim %lld\n",
929 datal, skip, trim);
930 if (comp && (skip || trim)) {
931 printk("btrfs clone_range can't split compressed inline extents yet\n");
932 ret = -EINVAL;
933 goto out;
934 }
935 size -= skip + trim;
936 datal -= skip + trim;
937 ret = btrfs_insert_empty_item(trans, root, path,
938 &new_key, size);
939 if (ret)
940 goto out;
941
942 if (skip) {
943 u32 start = btrfs_file_extent_calc_inline_size(0);
944 memmove(buf+start, buf+start+skip,
945 datal);
946 }
947
948 leaf = path->nodes[0];
949 slot = path->slots[0];
950 write_extent_buffer(leaf, buf,
951 btrfs_item_ptr_offset(leaf, slot),
952 size);
953 inode_add_bytes(inode, datal);
954 }
955
956 btrfs_mark_buffer_dirty(leaf);
957 }
958
959 if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
960 u32 size;
961 struct btrfs_key new_key;
962 u64 coverslen;
963 int coff, clen;
964
965 size = btrfs_item_size_nr(leaf, slot);
966 coverslen = (size / BTRFS_CRC32_SIZE) <<
967 root->fs_info->sb->s_blocksize_bits;
968 printk("csums for %llu~%llu\n",
969 key.offset, coverslen);
970 if (key.offset + coverslen < off ||
971 key.offset >= off+len)
972 goto next;
973
974 read_extent_buffer(leaf, buf,
975 btrfs_item_ptr_offset(leaf, slot),
976 size);
977 btrfs_release_path(root, path);
978
979 coff = 0;
980 if (off > key.offset)
981 coff = ((off - key.offset) >>
982 root->fs_info->sb->s_blocksize_bits) *
983 BTRFS_CRC32_SIZE;
984 clen = size - coff;
985 if (key.offset + coverslen > off+len)
986 clen -= ((key.offset+coverslen-off-len) >>
987 root->fs_info->sb->s_blocksize_bits) *
988 BTRFS_CRC32_SIZE;
989 printk(" will dup %d~%d of %d\n",
990 coff, clen, size);
991
992 memcpy(&new_key, &key, sizeof(new_key));
993 new_key.objectid = inode->i_ino;
994 new_key.offset = key.offset + destoff - off;
995
996 ret = btrfs_insert_empty_item(trans, root, path,
997 &new_key, clen);
998 if (ret)
999 goto out;
1000
1001 leaf = path->nodes[0];
1002 slot = path->slots[0];
1003 write_extent_buffer(leaf, buf + coff,
1004 btrfs_item_ptr_offset(leaf, slot),
1005 clen);
1006 btrfs_mark_buffer_dirty(leaf);
1007 }
1008
1009 next:
1010 btrfs_release_path(root, path);
1011 key.offset++;
1012 }
1013 ret = 0;
1014out:
1015 btrfs_release_path(root, path);
1016 if (ret == 0) {
1017 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1018 if (destoff + olen > inode->i_size)
1019 btrfs_i_size_write(inode, destoff + olen);
1020 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1021 ret = btrfs_update_inode(trans, root, inode);
1022 }
1023 btrfs_end_transaction(trans, root);
1024 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1025 if (ret)
1026 vmtruncate(inode, 0);
1027out_unlock:
1028 mutex_unlock(&src->i_mutex);
1029 mutex_unlock(&inode->i_mutex);
1030 vfree(buf);
1031 btrfs_free_path(path);
1032out_fput:
1033 fput(src_file);
1034 return ret;
1035}
1036
1037long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr)
1038{
1039 struct btrfs_ioctl_clone_range_args args;
1040
1041 if (copy_from_user(&args, (void *)argptr, sizeof(args)))
1042 return -EFAULT;
1043 return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
1044 args.src_length, args.dest_offset);
1045}
1046
1047/*
1048 * there are many ways the trans_start and trans_end ioctls can lead
1049 * to deadlocks. They should only be used by applications that
1050 * basically own the machine, and have a very in depth understanding
1051 * of all the possible deadlocks and enospc problems.
1052 */
1053long btrfs_ioctl_trans_start(struct file *file)
1054{
1055 struct inode *inode = fdentry(file)->d_inode;
1056 struct btrfs_root *root = BTRFS_I(inode)->root;
1057 struct btrfs_trans_handle *trans;
1058 int ret = 0;
1059
1060 if (!capable(CAP_SYS_ADMIN))
1061 return -EPERM;
1062
1063 if (file->private_data) {
1064 ret = -EINPROGRESS;
1065 goto out;
1066 }
1067
1068 ret = mnt_want_write(file->f_path.mnt);
1069 if (ret)
1070 goto out;
1071
1072 mutex_lock(&root->fs_info->trans_mutex);
1073 root->fs_info->open_ioctl_trans++;
1074 mutex_unlock(&root->fs_info->trans_mutex);
1075
1076 trans = btrfs_start_ioctl_transaction(root, 0);
1077 if (trans)
1078 file->private_data = trans;
1079 else
1080 ret = -ENOMEM;
1081 /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
1082out:
1083 return ret;
1084}
1085
1086/*
1087 * there are many ways the trans_start and trans_end ioctls can lead
1088 * to deadlocks. They should only be used by applications that
1089 * basically own the machine, and have a very in depth understanding
1090 * of all the possible deadlocks and enospc problems.
1091 */
1092long btrfs_ioctl_trans_end(struct file *file)
1093{
1094 struct inode *inode = fdentry(file)->d_inode;
1095 struct btrfs_root *root = BTRFS_I(inode)->root;
1096 struct btrfs_trans_handle *trans;
1097 int ret = 0;
1098
1099 trans = file->private_data;
1100 if (!trans) {
1101 ret = -EINVAL;
1102 goto out;
1103 }
1104 btrfs_end_transaction(trans, root);
1105 file->private_data = NULL;
1106
1107 mutex_lock(&root->fs_info->trans_mutex);
1108 root->fs_info->open_ioctl_trans--;
1109 mutex_unlock(&root->fs_info->trans_mutex);
1110
1111out:
1112 return ret;
1113}
1114
1115long btrfs_ioctl(struct file *file, unsigned int
1116 cmd, unsigned long arg)
1117{
1118 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1119
1120 switch (cmd) {
1121 case BTRFS_IOC_SNAP_CREATE:
1122 return btrfs_ioctl_snap_create(file, (void __user *)arg, 0);
1123 case BTRFS_IOC_SUBVOL_CREATE:
1124 return btrfs_ioctl_snap_create(file, (void __user *)arg, 1);
1125 case BTRFS_IOC_DEFRAG:
1126 return btrfs_ioctl_defrag(file);
1127 case BTRFS_IOC_RESIZE:
1128 return btrfs_ioctl_resize(root, (void __user *)arg);
1129 case BTRFS_IOC_ADD_DEV:
1130 return btrfs_ioctl_add_dev(root, (void __user *)arg);
1131 case BTRFS_IOC_RM_DEV:
1132 return btrfs_ioctl_rm_dev(root, (void __user *)arg);
1133 case BTRFS_IOC_BALANCE:
1134 return btrfs_balance(root->fs_info->dev_root);
1135 case BTRFS_IOC_CLONE:
1136 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
1137 case BTRFS_IOC_CLONE_RANGE:
1138 return btrfs_ioctl_clone_range(file, arg);
1139 case BTRFS_IOC_TRANS_START:
1140 return btrfs_ioctl_trans_start(file);
1141 case BTRFS_IOC_TRANS_END:
1142 return btrfs_ioctl_trans_end(file);
1143 case BTRFS_IOC_SYNC:
1144 btrfs_start_delalloc_inodes(root);
1145 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1146 return 0;
1147 }
1148
1149 return -ENOTTY;
1150}
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
new file mode 100644
index 000000000000..78049ea208db
--- /dev/null
+++ b/fs/btrfs/ioctl.h
@@ -0,0 +1,67 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __IOCTL_
20#define __IOCTL_
21#include <linux/ioctl.h>
22
/* type byte shared by every BTRFS_IOC_* command number in this header */
23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255
/*
 * largest string length carried in btrfs_ioctl_vol_args.name; the array
 * holds one extra byte and the ioctl handlers force name[BTRFS_PATH_NAME_MAX]
 * to '\0' after copying the struct in
 */
25#define BTRFS_PATH_NAME_MAX 3072
26
/*
 * Argument block used by the ioctls declared with this struct.
 *
 * fd:   read by the snapshot create handler as the descriptor of the
 *       snapshot source; the other handlers in ioctl.c do not read it
 * name: command specific string (new subvolume/snapshot name, device
 *       name, or the "[devid:]size" resize request)
 */
27struct btrfs_ioctl_vol_args {
28 __s64 fd;
29 char name[BTRFS_PATH_NAME_MAX + 1];
30};
31
/*
 * ioctl command numbers.  btrfs_ioctl() in ioctl.c dispatches on these.
 * BTRFS_IOC_SCAN_DEV is not among the cases handled there - presumably it
 * is served by another entry point (TODO confirm).  Number 5 is unused.
 */
32#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
33 struct btrfs_ioctl_vol_args)
34#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
35 struct btrfs_ioctl_vol_args)
36#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
37 struct btrfs_ioctl_vol_args)
38#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
39 struct btrfs_ioctl_vol_args)
40/* trans start and trans end are dangerous, and only for
41 * use by applications that know how to avoid the
42 * resulting deadlocks
43 */
44#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
45#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
46#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
47
/* the argument of BTRFS_IOC_CLONE is the source file descriptor itself */
48#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
49#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
50 struct btrfs_ioctl_vol_args)
51#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
52 struct btrfs_ioctl_vol_args)
/* the handler for BTRFS_IOC_BALANCE in btrfs_ioctl() does not read the argument */
53#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
54 struct btrfs_ioctl_vol_args)
/*
 * Argument block for BTRFS_IOC_CLONE_RANGE.
 *
 * src_fd:      descriptor of the file to clone from
 * src_offset:  start of the range in the source file
 * src_length:  length of the range; btrfs_ioctl_clone() treats 0 as
 *              "up to the end of the source file"
 * dest_offset: offset in the destination file matching src_offset
 */
55struct btrfs_ioctl_clone_range_args {
56 __s64 src_fd;
57 __u64 src_offset, src_length;
58 __u64 dest_offset;
59};
60
61#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
62 struct btrfs_ioctl_clone_range_args)
63
64#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
65 struct btrfs_ioctl_vol_args)
66
67#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 000000000000..e30aa6e2958f
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,87 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h>
21#include <linux/spinlock.h>
22#include <linux/page-flags.h>
23#include <asm/bug.h>
24#include "ctree.h"
25#include "extent_io.h"
26#include "locking.h"
27
28/*
29 * locks the per buffer mutex in an extent buffer. This uses adaptive locks
30 * and the spin is not tuned very extensively. The spinning does make a big
31 * difference in almost every workload, but spinning for the right amount of
32 * time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree searches,
35 * and we should give up if they are in more expensive code.
36 */
37int btrfs_tree_lock(struct extent_buffer *eb)
38{
39 int i;
40
41 if (mutex_trylock(&eb->mutex))
42 return 0;
43 for (i = 0; i < 512; i++) {
44 cpu_relax();
45 if (mutex_trylock(&eb->mutex))
46 return 0;
47 }
48 cpu_relax();
49 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
50 return 0;
51}
52
53int btrfs_try_tree_lock(struct extent_buffer *eb)
54{
55 return mutex_trylock(&eb->mutex);
56}
57
58int btrfs_tree_unlock(struct extent_buffer *eb)
59{
60 mutex_unlock(&eb->mutex);
61 return 0;
62}
63
64int btrfs_tree_locked(struct extent_buffer *eb)
65{
66 return mutex_is_locked(&eb->mutex);
67}
68
69/*
70 * btrfs_search_slot uses this to decide if it should drop its locks
71 * before doing something expensive like allocating free blocks for cow.
72 */
73int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
74{
75 int i;
76 struct extent_buffer *eb;
77 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
78 eb = path->nodes[i];
79 if (!eb)
80 break;
81 smp_mb();
82 if (!list_empty(&eb->mutex.wait_list))
83 return 1;
84 }
85 return 0;
86}
87
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 000000000000..bc1faef12519
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_LOCKING_
20#define __BTRFS_LOCKING_
21
/* lock eb->mutex, spinning on trylock before sleeping; returns 0 */
22int btrfs_tree_lock(struct extent_buffer *eb);
/* unlock eb->mutex; returns 0 */
23int btrfs_tree_unlock(struct extent_buffer *eb);
/* result of mutex_is_locked() on eb->mutex */
24int btrfs_tree_locked(struct extent_buffer *eb);
/* result of a single mutex_trylock() on eb->mutex */
25int btrfs_try_tree_lock(struct extent_buffer *eb);
/* 1 if a buffer at level or level + 1 of the path has waiters on its mutex */
26int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
27#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 000000000000..027ad6b3839e
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,733 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/gfp.h>
20#include <linux/slab.h>
21#include <linux/blkdev.h>
22#include <linux/writeback.h>
23#include <linux/pagevec.h>
24#include "ctree.h"
25#include "transaction.h"
26#include "btrfs_inode.h"
27#include "extent_io.h"
28
29static u64 entry_end(struct btrfs_ordered_extent *entry)
30{
31 if (entry->file_offset + entry->len < entry->file_offset)
32 return (u64)-1;
33 return entry->file_offset + entry->len;
34}
35
36/* returns NULL if the insertion worked, or it returns the node it did find
37 * in the tree
38 */
39static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
40 struct rb_node *node)
41{
42 struct rb_node ** p = &root->rb_node;
43 struct rb_node * parent = NULL;
44 struct btrfs_ordered_extent *entry;
45
46 while(*p) {
47 parent = *p;
48 entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
49
50 if (file_offset < entry->file_offset)
51 p = &(*p)->rb_left;
52 else if (file_offset >= entry_end(entry))
53 p = &(*p)->rb_right;
54 else
55 return parent;
56 }
57
58 rb_link_node(node, parent, p);
59 rb_insert_color(node, root);
60 return NULL;
61}
62
63/*
64 * look for a given offset in the tree, and if it can't be found return the
65 * first lesser offset
66 */
67static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
68 struct rb_node **prev_ret)
69{
70 struct rb_node * n = root->rb_node;
71 struct rb_node *prev = NULL;
72 struct rb_node *test;
73 struct btrfs_ordered_extent *entry;
74 struct btrfs_ordered_extent *prev_entry = NULL;
75
76 while(n) {
77 entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
78 prev = n;
79 prev_entry = entry;
80
81 if (file_offset < entry->file_offset)
82 n = n->rb_left;
83 else if (file_offset >= entry_end(entry))
84 n = n->rb_right;
85 else
86 return n;
87 }
88 if (!prev_ret)
89 return NULL;
90
91 while(prev && file_offset >= entry_end(prev_entry)) {
92 test = rb_next(prev);
93 if (!test)
94 break;
95 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
96 rb_node);
97 if (file_offset < entry_end(prev_entry))
98 break;
99
100 prev = test;
101 }
102 if (prev)
103 prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
104 rb_node);
105 while(prev && file_offset < entry_end(prev_entry)) {
106 test = rb_prev(prev);
107 if (!test)
108 break;
109 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
110 rb_node);
111 prev = test;
112 }
113 *prev_ret = prev;
114 return NULL;
115}
116
117/*
118 * helper to check if a given offset is inside a given entry
119 */
120static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
121{
122 if (file_offset < entry->file_offset ||
123 entry->file_offset + entry->len <= file_offset)
124 return 0;
125 return 1;
126}
127
128/*
129 * look find the first ordered struct that has this offset, otherwise
130 * the first one less than this offset
131 */
132static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
133 u64 file_offset)
134{
135 struct rb_root *root = &tree->tree;
136 struct rb_node *prev;
137 struct rb_node *ret;
138 struct btrfs_ordered_extent *entry;
139
140 if (tree->last) {
141 entry = rb_entry(tree->last, struct btrfs_ordered_extent,
142 rb_node);
143 if (offset_in_entry(entry, file_offset))
144 return tree->last;
145 }
146 ret = __tree_search(root, file_offset, &prev);
147 if (!ret)
148 ret = prev;
149 if (ret)
150 tree->last = ret;
151 return ret;
152}
153
154/* allocate and add a new ordered_extent into the per-inode tree.
155 * file_offset is the logical offset in the file
156 *
157 * start is the disk block number of an extent already reserved in the
158 * extent allocation tree
159 *
160 * len is the length of the extent
161 *
162 * This also sets the EXTENT_ORDERED bit on the range in the inode.
163 *
164 * The tree is given a single reference on the ordered extent that was
165 * inserted.
166 */
167int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
168 u64 start, u64 len, u64 disk_len, int type)
169{
170 struct btrfs_ordered_inode_tree *tree;
171 struct rb_node *node;
172 struct btrfs_ordered_extent *entry;
173
174 tree = &BTRFS_I(inode)->ordered_tree;
175 entry = kzalloc(sizeof(*entry), GFP_NOFS);
176 if (!entry)
177 return -ENOMEM;
178
179 mutex_lock(&tree->mutex);
180 entry->file_offset = file_offset;
181 entry->start = start;
182 entry->len = len;
183 entry->disk_len = disk_len;
184 entry->inode = inode;
185 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
186 set_bit(type, &entry->flags);
187
188 /* one ref for the tree */
189 atomic_set(&entry->refs, 1);
190 init_waitqueue_head(&entry->wait);
191 INIT_LIST_HEAD(&entry->list);
192 INIT_LIST_HEAD(&entry->root_extent_list);
193
194 node = tree_insert(&tree->tree, file_offset,
195 &entry->rb_node);
196 if (node) {
197 printk("warning dup entry from add_ordered_extent\n");
198 BUG();
199 }
200 set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
201 entry_end(entry) - 1, GFP_NOFS);
202
203 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
204 list_add_tail(&entry->root_extent_list,
205 &BTRFS_I(inode)->root->fs_info->ordered_extents);
206 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
207
208 mutex_unlock(&tree->mutex);
209 BUG_ON(node);
210 return 0;
211}
212
213/*
214 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
215 * when an ordered extent is finished. If the list covers more than one
216 * ordered extent, it is split across multiples.
217 */
218int btrfs_add_ordered_sum(struct inode *inode,
219 struct btrfs_ordered_extent *entry,
220 struct btrfs_ordered_sum *sum)
221{
222 struct btrfs_ordered_inode_tree *tree;
223
224 tree = &BTRFS_I(inode)->ordered_tree;
225 mutex_lock(&tree->mutex);
226 list_add_tail(&sum->list, &entry->list);
227 mutex_unlock(&tree->mutex);
228 return 0;
229}
230
231/*
232 * this is used to account for finished IO across a given range
233 * of the file. The IO should not span ordered extents. If
234 * a given ordered_extent is completely done, 1 is returned, otherwise
235 * 0.
236 *
237 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
238 * to make sure this function only returns 1 once for a given ordered extent.
239 */
240int btrfs_dec_test_ordered_pending(struct inode *inode,
241 u64 file_offset, u64 io_size)
242{
243 struct btrfs_ordered_inode_tree *tree;
244 struct rb_node *node;
245 struct btrfs_ordered_extent *entry;
246 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
247 int ret;
248
249 tree = &BTRFS_I(inode)->ordered_tree;
250 mutex_lock(&tree->mutex);
251 clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
252 GFP_NOFS);
253 node = tree_search(tree, file_offset);
254 if (!node) {
255 ret = 1;
256 goto out;
257 }
258
259 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
260 if (!offset_in_entry(entry, file_offset)) {
261 ret = 1;
262 goto out;
263 }
264
265 ret = test_range_bit(io_tree, entry->file_offset,
266 entry->file_offset + entry->len - 1,
267 EXTENT_ORDERED, 0);
268 if (ret == 0)
269 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
270out:
271 mutex_unlock(&tree->mutex);
272 return ret == 0;
273}
274
275/*
276 * used to drop a reference on an ordered extent. This will free
277 * the extent if the last reference is dropped
278 */
279int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
280{
281 struct list_head *cur;
282 struct btrfs_ordered_sum *sum;
283
284 if (atomic_dec_and_test(&entry->refs)) {
285 while(!list_empty(&entry->list)) {
286 cur = entry->list.next;
287 sum = list_entry(cur, struct btrfs_ordered_sum, list);
288 list_del(&sum->list);
289 kfree(sum);
290 }
291 kfree(entry);
292 }
293 return 0;
294}
295
296/*
297 * remove an ordered extent from the tree. No references are dropped
298 * but, anyone waiting on this extent is woken up.
299 */
300int btrfs_remove_ordered_extent(struct inode *inode,
301 struct btrfs_ordered_extent *entry)
302{
303 struct btrfs_ordered_inode_tree *tree;
304 struct rb_node *node;
305
306 tree = &BTRFS_I(inode)->ordered_tree;
307 mutex_lock(&tree->mutex);
308 node = &entry->rb_node;
309 rb_erase(node, &tree->tree);
310 tree->last = NULL;
311 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
312
313 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
314 list_del_init(&entry->root_extent_list);
315 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
316
317 mutex_unlock(&tree->mutex);
318 wake_up(&entry->wait);
319 return 0;
320}
321
322/*
323 * wait for all the ordered extents in a root. This is done when balancing
324 * space between drives.
325 */
326int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
327{
328 struct list_head splice;
329 struct list_head *cur;
330 struct btrfs_ordered_extent *ordered;
331 struct inode *inode;
332
333 INIT_LIST_HEAD(&splice);
334
335 spin_lock(&root->fs_info->ordered_extent_lock);
336 list_splice_init(&root->fs_info->ordered_extents, &splice);
337 while (!list_empty(&splice)) {
338 cur = splice.next;
339 ordered = list_entry(cur, struct btrfs_ordered_extent,
340 root_extent_list);
341 if (nocow_only &&
342 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
343 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
344 list_move(&ordered->root_extent_list,
345 &root->fs_info->ordered_extents);
346 cond_resched_lock(&root->fs_info->ordered_extent_lock);
347 continue;
348 }
349
350 list_del_init(&ordered->root_extent_list);
351 atomic_inc(&ordered->refs);
352
353 /*
354 * the inode may be getting freed (in sys_unlink path).
355 */
356 inode = igrab(ordered->inode);
357
358 spin_unlock(&root->fs_info->ordered_extent_lock);
359
360 if (inode) {
361 btrfs_start_ordered_extent(inode, ordered, 1);
362 btrfs_put_ordered_extent(ordered);
363 iput(inode);
364 } else {
365 btrfs_put_ordered_extent(ordered);
366 }
367
368 spin_lock(&root->fs_info->ordered_extent_lock);
369 }
370 spin_unlock(&root->fs_info->ordered_extent_lock);
371 return 0;
372}
373
374/*
375 * Used to start IO or wait for a given ordered extent to finish.
376 *
377 * If wait is one, this effectively waits on page writeback for all the pages
378 * in the extent, and it waits on the io completion code to insert
379 * metadata into the btree corresponding to the extent
380 */
381void btrfs_start_ordered_extent(struct inode *inode,
382 struct btrfs_ordered_extent *entry,
383 int wait)
384{
385 u64 start = entry->file_offset;
386 u64 end = start + entry->len - 1;
387
388 /*
389 * pages in the range can be dirty, clean or writeback. We
390 * start IO on any dirty ones so the wait doesn't stall waiting
391 * for pdflush to find them
392 */
393 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
394 if (wait) {
395 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
396 &entry->flags));
397 }
398}
399
400/*
401 * Used to wait on ordered extents across a large range of bytes.
402 */
403int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
404{
405 u64 end;
406 u64 orig_end;
407 u64 wait_end;
408 struct btrfs_ordered_extent *ordered;
409
410 if (start + len < start) {
411 orig_end = INT_LIMIT(loff_t);
412 } else {
413 orig_end = start + len - 1;
414 if (orig_end > INT_LIMIT(loff_t))
415 orig_end = INT_LIMIT(loff_t);
416 }
417 wait_end = orig_end;
418again:
419 /* start IO across the range first to instantiate any delalloc
420 * extents
421 */
422 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
423
424 /* The compression code will leave pages locked but return from
425 * writepage without setting the page writeback. Starting again
426 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
427 */
428 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
429
430 btrfs_wait_on_page_writeback_range(inode->i_mapping,
431 start >> PAGE_CACHE_SHIFT,
432 orig_end >> PAGE_CACHE_SHIFT);
433
434 end = orig_end;
435 while(1) {
436 ordered = btrfs_lookup_first_ordered_extent(inode, end);
437 if (!ordered) {
438 break;
439 }
440 if (ordered->file_offset > orig_end) {
441 btrfs_put_ordered_extent(ordered);
442 break;
443 }
444 if (ordered->file_offset + ordered->len < start) {
445 btrfs_put_ordered_extent(ordered);
446 break;
447 }
448 btrfs_start_ordered_extent(inode, ordered, 1);
449 end = ordered->file_offset;
450 btrfs_put_ordered_extent(ordered);
451 if (end == 0 || end == start)
452 break;
453 end--;
454 }
455 if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
456 EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
457 schedule_timeout(1);
458 goto again;
459 }
460 return 0;
461}
462
463/*
464 * find an ordered extent corresponding to file_offset. return NULL if
465 * nothing is found, otherwise take a reference on the extent and return it
466 */
467struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
468 u64 file_offset)
469{
470 struct btrfs_ordered_inode_tree *tree;
471 struct rb_node *node;
472 struct btrfs_ordered_extent *entry = NULL;
473
474 tree = &BTRFS_I(inode)->ordered_tree;
475 mutex_lock(&tree->mutex);
476 node = tree_search(tree, file_offset);
477 if (!node)
478 goto out;
479
480 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
481 if (!offset_in_entry(entry, file_offset))
482 entry = NULL;
483 if (entry)
484 atomic_inc(&entry->refs);
485out:
486 mutex_unlock(&tree->mutex);
487 return entry;
488}
489
490/*
491 * lookup and return any extent before 'file_offset'. NULL is returned
492 * if none is found
493 */
494struct btrfs_ordered_extent *
495btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset)
496{
497 struct btrfs_ordered_inode_tree *tree;
498 struct rb_node *node;
499 struct btrfs_ordered_extent *entry = NULL;
500
501 tree = &BTRFS_I(inode)->ordered_tree;
502 mutex_lock(&tree->mutex);
503 node = tree_search(tree, file_offset);
504 if (!node)
505 goto out;
506
507 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
508 atomic_inc(&entry->refs);
509out:
510 mutex_unlock(&tree->mutex);
511 return entry;
512}
513
514/*
515 * After an extent is done, call this to conditionally update the on disk
516 * i_size. i_size is updated to cover any fully written part of the file.
517 */
518int btrfs_ordered_update_i_size(struct inode *inode,
519 struct btrfs_ordered_extent *ordered)
520{
521 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
522 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
523 u64 disk_i_size;
524 u64 new_i_size;
525 u64 i_size_test;
526 struct rb_node *node;
527 struct btrfs_ordered_extent *test;
528
529 mutex_lock(&tree->mutex);
530 disk_i_size = BTRFS_I(inode)->disk_i_size;
531
532 /*
533 * if the disk i_size is already at the inode->i_size, or
534 * this ordered extent is inside the disk i_size, we're done
535 */
536 if (disk_i_size >= inode->i_size ||
537 ordered->file_offset + ordered->len <= disk_i_size) {
538 goto out;
539 }
540
541 /*
542 * we can't update the disk_isize if there are delalloc bytes
543 * between disk_i_size and this ordered extent
544 */
545 if (test_range_bit(io_tree, disk_i_size,
546 ordered->file_offset + ordered->len - 1,
547 EXTENT_DELALLOC, 0)) {
548 goto out;
549 }
550 /*
551 * walk backward from this ordered extent to disk_i_size.
552 * if we find an ordered extent then we can't update disk i_size
553 * yet
554 */
555 node = &ordered->rb_node;
556 while(1) {
557 node = rb_prev(node);
558 if (!node)
559 break;
560 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
561 if (test->file_offset + test->len <= disk_i_size)
562 break;
563 if (test->file_offset >= inode->i_size)
564 break;
565 if (test->file_offset >= disk_i_size)
566 goto out;
567 }
568 new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
569
570 /*
571 * at this point, we know we can safely update i_size to at least
572 * the offset from this ordered extent. But, we need to
573 * walk forward and see if ios from higher up in the file have
574 * finished.
575 */
576 node = rb_next(&ordered->rb_node);
577 i_size_test = 0;
578 if (node) {
579 /*
580 * do we have an area where IO might have finished
581 * between our ordered extent and the next one.
582 */
583 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
584 if (test->file_offset > entry_end(ordered)) {
585 i_size_test = test->file_offset;
586 }
587 } else {
588 i_size_test = i_size_read(inode);
589 }
590
591 /*
592 * i_size_test is the end of a region after this ordered
593 * extent where there are no ordered extents. As long as there
594 * are no delalloc bytes in this area, it is safe to update
595 * disk_i_size to the end of the region.
596 */
597 if (i_size_test > entry_end(ordered) &&
598 !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
599 EXTENT_DELALLOC, 0)) {
600 new_i_size = min_t(u64, i_size_test, i_size_read(inode));
601 }
602 BTRFS_I(inode)->disk_i_size = new_i_size;
603out:
604 mutex_unlock(&tree->mutex);
605 return 0;
606}
607
608/*
609 * search the ordered extents for one corresponding to 'offset' and
610 * try to find a checksum. This is used because we allow pages to
611 * be reclaimed before their checksum is actually put into the btree
612 */
613int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
614{
615 struct btrfs_ordered_sum *ordered_sum;
616 struct btrfs_sector_sum *sector_sums;
617 struct btrfs_ordered_extent *ordered;
618 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
619 struct list_head *cur;
620 unsigned long num_sectors;
621 unsigned long i;
622 u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
623 int ret = 1;
624
625 ordered = btrfs_lookup_ordered_extent(inode, offset);
626 if (!ordered)
627 return 1;
628
629 mutex_lock(&tree->mutex);
630 list_for_each_prev(cur, &ordered->list) {
631 ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
632 if (offset >= ordered_sum->file_offset) {
633 num_sectors = ordered_sum->len / sectorsize;
634 sector_sums = ordered_sum->sums;
635 for (i = 0; i < num_sectors; i++) {
636 if (sector_sums[i].offset == offset) {
637 *sum = sector_sums[i].sum;
638 ret = 0;
639 goto out;
640 }
641 }
642 }
643 }
644out:
645 mutex_unlock(&tree->mutex);
646 btrfs_put_ordered_extent(ordered);
647 return ret;
648}
649
650
651/**
652 * taken from mm/filemap.c because it isn't exported
653 *
654 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
655 * @mapping: address space structure to write
656 * @start: offset in bytes where the range starts
657 * @end: offset in bytes where the range ends (inclusive)
658 * @sync_mode: enable synchronous operation
659 *
660 * Start writeback against all of a mapping's dirty pages that lie
661 * within the byte offsets <start, end> inclusive.
662 *
663 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
664 * opposed to a regular memory cleansing writeback. The difference between
665 * these two operations is that if a dirty page/buffer is encountered, it must
666 * be waited upon, and not just skipped over.
667 */
668int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
669 loff_t end, int sync_mode)
670{
671 struct writeback_control wbc = {
672 .sync_mode = sync_mode,
673 .nr_to_write = mapping->nrpages * 2,
674 .range_start = start,
675 .range_end = end,
676 .for_writepages = 1,
677 };
678 return btrfs_writepages(mapping, &wbc);
679}
680
681/**
682 * taken from mm/filemap.c because it isn't exported
683 *
684 * wait_on_page_writeback_range - wait for writeback to complete
685 * @mapping: target address_space
686 * @start: beginning page index
687 * @end: ending page index
688 *
689 * Wait for writeback to complete against pages indexed by start->end
690 * inclusive
691 */
692int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
693 pgoff_t start, pgoff_t end)
694{
695 struct pagevec pvec;
696 int nr_pages;
697 int ret = 0;
698 pgoff_t index;
699
700 if (end < start)
701 return 0;
702
703 pagevec_init(&pvec, 0);
704 index = start;
705 while ((index <= end) &&
706 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
707 PAGECACHE_TAG_WRITEBACK,
708 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
709 unsigned i;
710
711 for (i = 0; i < nr_pages; i++) {
712 struct page *page = pvec.pages[i];
713
714 /* until radix tree lookup accepts end_index */
715 if (page->index > end)
716 continue;
717
718 wait_on_page_writeback(page);
719 if (PageError(page))
720 ret = -EIO;
721 }
722 pagevec_release(&pvec);
723 cond_resched();
724 }
725
726 /* Check for outstanding write errors */
727 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
728 ret = -ENOSPC;
729 if (test_and_clear_bit(AS_EIO, &mapping->flags))
730 ret = -EIO;
731
732 return ret;
733}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000000..260bf95dfe0c
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,156 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_ORDERED_DATA__
20#define __BTRFS_ORDERED_DATA__
21
/* one of these per inode */
struct btrfs_ordered_inode_tree {
	/* held around lookups and changes to the tree in ordered-data.c */
	struct mutex mutex;
	/* rbtree of struct btrfs_ordered_extent, linked through rb_node */
	struct rb_root tree;
	/*
	 * most recent node handed back by tree_search(), checked first on
	 * the next search.  Reset to NULL when an extent is removed.
	 */
	struct rb_node *last;
};
28
/*
 * these are used to collect checksums done just before bios submission.
 * They are attached via a list into the ordered extent, and
 * checksum items are inserted into the tree after all the blocks in
 * the ordered extent are on disk
 */
struct btrfs_sector_sum {
	/*
	 * btrfs_find_ordered_sum() compares this against the offset it is
	 * looking for -- presumably the file offset of the sector
	 */
	u64 offset;
	/* the checksum value copied out by btrfs_find_ordered_sum() */
	u32 sum;
};
39
/*
 * one batch of sector checksums.  Size the allocation with
 * btrfs_ordered_sum_size() so there is room for the trailing array.
 */
struct btrfs_ordered_sum {
	/* sums with a larger file_offset are skipped when searching */
	u64 file_offset;
	/*
	 * this is the length in bytes covered by the sums array below.
	 * But, the sums array may not be contiguous in the file.
	 */
	unsigned long len;
	/* links this batch into btrfs_ordered_extent.list */
	struct list_head list;
	/* last field is a variable length array of btrfs_sector_sums */
	struct btrfs_sector_sum sums[];
};
51
/*
 * bits for the flags field:
 *
 * These are bit numbers (not masks); they are used with set_bit(),
 * test_bit() and test_and_set_bit() on btrfs_ordered_extent.flags.
 *
 * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
 * It is used to make sure metadata is inserted into the tree only once
 * per extent.
 *
 * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
 * rbtree, just before waking any waiters.  It is used to indicate the
 * IO is done and any metadata is inserted into the tree.
 */
#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */

#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */

#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */

#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */

#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
72
/*
 * one pending extent of an inode.  Created by btrfs_add_ordered_extent()
 * and freed by btrfs_put_ordered_extent() when the last reference is
 * dropped.
 */
struct btrfs_ordered_extent {
	/* logical offset in the file */
	u64 file_offset;

	/* disk byte number */
	u64 start;

	/* ram length of the extent in bytes */
	u64 len;

	/* extent length on disk */
	u64 disk_len;

	/* flags (described above) */
	unsigned long flags;

	/* reference count */
	atomic_t refs;

	/* the inode we belong to */
	struct inode *inode;

	/* list of checksums for insertion when the extent io is done */
	struct list_head list;

	/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
	wait_queue_head_t wait;

	/* our friendly rbtree entry */
	struct rb_node rb_node;

	/*
	 * a per root list of all the pending ordered extents
	 * (linked on fs_info->ordered_extents under ordered_extent_lock)
	 */
	struct list_head root_extent_list;
};
107
108
109/*
110 * calculates the total size you need to allocate for an ordered sum
111 * structure spanning 'bytes' in the file
112 */
113static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
114 unsigned long bytes)
115{
116 unsigned long num_sectors = (bytes + root->sectorsize - 1) /
117 root->sectorsize;
118 num_sectors++;
119 return sizeof(struct btrfs_ordered_sum) +
120 num_sectors * sizeof(struct btrfs_sector_sum);
121}
122
123static inline void
124btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
125{
126 mutex_init(&t->mutex);
127 t->tree.rb_node = NULL;
128 t->last = NULL;
129}
130
131int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
132int btrfs_remove_ordered_extent(struct inode *inode,
133 struct btrfs_ordered_extent *entry);
134int btrfs_dec_test_ordered_pending(struct inode *inode,
135 u64 file_offset, u64 io_size);
136int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
137 u64 start, u64 len, u64 disk_len, int tyep);
138int btrfs_add_ordered_sum(struct inode *inode,
139 struct btrfs_ordered_extent *entry,
140 struct btrfs_ordered_sum *sum);
141struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
142 u64 file_offset);
143void btrfs_start_ordered_extent(struct inode *inode,
144 struct btrfs_ordered_extent *entry, int wait);
145int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
146struct btrfs_ordered_extent *
147btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
148int btrfs_ordered_update_i_size(struct inode *inode,
149 struct btrfs_ordered_extent *ordered);
150int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);
151int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
152 pgoff_t start, pgoff_t end);
153int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
154 loff_t end, int sync_mode);
155int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
156#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 000000000000..3c0d52af4f80
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,67 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21
22int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root, u64 offset)
24{
25 struct btrfs_path *path;
26 struct btrfs_key key;
27 int ret = 0;
28
29 key.objectid = BTRFS_ORPHAN_OBJECTID;
30 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
31 key.offset = offset;
32
33 path = btrfs_alloc_path();
34 if (!path)
35 return -ENOMEM;
36
37 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
38
39 btrfs_free_path(path);
40 return ret;
41}
42
43int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
44 struct btrfs_root *root, u64 offset)
45{
46 struct btrfs_path *path;
47 struct btrfs_key key;
48 int ret = 0;
49
50 key.objectid = BTRFS_ORPHAN_OBJECTID;
51 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
52 key.offset = offset;
53
54 path = btrfs_alloc_path();
55 if (!path)
56 return -ENOMEM;
57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret)
60 goto out;
61
62 ret = btrfs_del_item(trans, root, path);
63
64out:
65 btrfs_free_path(path);
66 return ret;
67}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
new file mode 100644
index 000000000000..64725c13aa11
--- /dev/null
+++ b/fs/btrfs/print-tree.c
@@ -0,0 +1,201 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "print-tree.h"
22
/*
 * Log one chunk item: a summary line (length, owner, type, stripe count)
 * followed by one line per stripe with its device id and offset.
 */
static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
{
	int stripe_count = btrfs_chunk_num_stripes(eb, chunk);
	int idx = 0;

	printk("\t\tchunk length %llu owner %llu type %llu num_stripes %d\n",
	       (unsigned long long)btrfs_chunk_length(eb, chunk),
	       (unsigned long long)btrfs_chunk_owner(eb, chunk),
	       (unsigned long long)btrfs_chunk_type(eb, chunk),
	       stripe_count);

	while (idx < stripe_count) {
		printk("\t\t\tstripe %d devid %llu offset %llu\n", idx,
		       (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, idx),
		       (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, idx));
		idx++;
	}
}
/*
 * Log one device item: device id, total bytes and bytes used.
 *
 * All three values are cast to unsigned long long and printed with %llu
 * (the last one previously used the non-standard %Lu spelling).
 */
static void print_dev_item(struct extent_buffer *eb,
			   struct btrfs_dev_item *dev_item)
{
	printk("\t\tdev item devid %llu "
	       "total_bytes %llu bytes used %llu\n",
	       (unsigned long long)btrfs_device_id(eb, dev_item),
	       (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
	       (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
}
47void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
48{
49 int i;
50 u32 nr = btrfs_header_nritems(l);
51 struct btrfs_item *item;
52 struct btrfs_extent_item *ei;
53 struct btrfs_root_item *ri;
54 struct btrfs_dir_item *di;
55 struct btrfs_inode_item *ii;
56 struct btrfs_block_group_item *bi;
57 struct btrfs_file_extent_item *fi;
58 struct btrfs_key key;
59 struct btrfs_key found_key;
60 struct btrfs_extent_ref *ref;
61 struct btrfs_dev_extent *dev_extent;
62 u32 type;
63
64 printk("leaf %llu total ptrs %d free space %d\n",
65 (unsigned long long)btrfs_header_bytenr(l), nr,
66 btrfs_leaf_free_space(root, l));
67 for (i = 0 ; i < nr ; i++) {
68 item = btrfs_item_nr(l, i);
69 btrfs_item_key_to_cpu(l, &key, i);
70 type = btrfs_key_type(&key);
71 printk("\titem %d key (%llu %x %llu) itemoff %d itemsize %d\n",
72 i,
73 (unsigned long long)key.objectid, type,
74 (unsigned long long)key.offset,
75 btrfs_item_offset(l, item), btrfs_item_size(l, item));
76 switch (type) {
77 case BTRFS_INODE_ITEM_KEY:
78 ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
79 printk("\t\tinode generation %llu size %llu mode %o\n",
80 (unsigned long long)btrfs_inode_generation(l, ii),
81 (unsigned long long)btrfs_inode_size(l, ii),
82 btrfs_inode_mode(l, ii));
83 break;
84 case BTRFS_DIR_ITEM_KEY:
85 di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
86 btrfs_dir_item_key_to_cpu(l, di, &found_key);
87 printk("\t\tdir oid %llu type %u\n",
88 (unsigned long long)found_key.objectid,
89 btrfs_dir_type(l, di));
90 break;
91 case BTRFS_ROOT_ITEM_KEY:
92 ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
93 printk("\t\troot data bytenr %llu refs %u\n",
94 (unsigned long long)btrfs_disk_root_bytenr(l, ri),
95 btrfs_disk_root_refs(l, ri));
96 break;
97 case BTRFS_EXTENT_ITEM_KEY:
98 ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
99 printk("\t\textent data refs %u\n",
100 btrfs_extent_refs(l, ei));
101 break;
102 case BTRFS_EXTENT_REF_KEY:
103 ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
104 printk("\t\textent back ref root %llu gen %llu "
105 "owner %llu num_refs %lu\n",
106 (unsigned long long)btrfs_ref_root(l, ref),
107 (unsigned long long)btrfs_ref_generation(l, ref),
108 (unsigned long long)btrfs_ref_objectid(l, ref),
109 (unsigned long)btrfs_ref_num_refs(l, ref));
110 break;
111
112 case BTRFS_EXTENT_DATA_KEY:
113 fi = btrfs_item_ptr(l, i,
114 struct btrfs_file_extent_item);
115 if (btrfs_file_extent_type(l, fi) ==
116 BTRFS_FILE_EXTENT_INLINE) {
117 printk("\t\tinline extent data size %u\n",
118 btrfs_file_extent_inline_len(l, fi));
119 break;
120 }
121 printk("\t\textent data disk bytenr %llu nr %llu\n",
122 (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
123 (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
124 printk("\t\textent data offset %llu nr %llu ram %llu\n",
125 (unsigned long long)btrfs_file_extent_offset(l, fi),
126 (unsigned long long)btrfs_file_extent_num_bytes(l, fi),
127 (unsigned long long)btrfs_file_extent_ram_bytes(l, fi));
128 break;
129 case BTRFS_BLOCK_GROUP_ITEM_KEY:
130 bi = btrfs_item_ptr(l, i,
131 struct btrfs_block_group_item);
132 printk("\t\tblock group used %llu\n",
133 (unsigned long long)btrfs_disk_block_group_used(l, bi));
134 break;
135 case BTRFS_CHUNK_ITEM_KEY:
136 print_chunk(l, btrfs_item_ptr(l, i, struct btrfs_chunk));
137 break;
138 case BTRFS_DEV_ITEM_KEY:
139 print_dev_item(l, btrfs_item_ptr(l, i,
140 struct btrfs_dev_item));
141 break;
142 case BTRFS_DEV_EXTENT_KEY:
143 dev_extent = btrfs_item_ptr(l, i,
144 struct btrfs_dev_extent);
145 printk("\t\tdev extent chunk_tree %llu\n"
146 "\t\tchunk objectid %llu chunk offset %llu "
147 "length %llu\n",
148 (unsigned long long)
149 btrfs_dev_extent_chunk_tree(l, dev_extent),
150 (unsigned long long)
151 btrfs_dev_extent_chunk_objectid(l, dev_extent),
152 (unsigned long long)
153 btrfs_dev_extent_chunk_offset(l, dev_extent),
154 (unsigned long long)
155 btrfs_dev_extent_length(l, dev_extent));
156 };
157 }
158}
159
160void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
161{
162 int i; u32 nr;
163 struct btrfs_key key;
164 int level;
165
166 if (!c)
167 return;
168 nr = btrfs_header_nritems(c);
169 level = btrfs_header_level(c);
170 if (level == 0) {
171 btrfs_print_leaf(root, c);
172 return;
173 }
174 printk("node %llu level %d total ptrs %d free spc %u\n",
175 (unsigned long long)btrfs_header_bytenr(c),
176 btrfs_header_level(c), nr,
177 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
178 for (i = 0; i < nr; i++) {
179 btrfs_node_key_to_cpu(c, &key, i);
180 printk("\tkey %d (%llu %u %llu) block %llu\n",
181 i,
182 (unsigned long long)key.objectid,
183 key.type,
184 (unsigned long long)key.offset,
185 (unsigned long long)btrfs_node_blockptr(c, i));
186 }
187 for (i = 0; i < nr; i++) {
188 struct extent_buffer *next = read_tree_block(root,
189 btrfs_node_blockptr(c, i),
190 btrfs_level_size(root, level - 1),
191 btrfs_node_ptr_generation(c, i));
192 if (btrfs_is_leaf(next) &&
193 btrfs_header_level(c) != 1)
194 BUG();
195 if (btrfs_header_level(next) !=
196 btrfs_header_level(c) - 1)
197 BUG();
198 btrfs_print_tree(root, next);
199 free_extent_buffer(next);
200 }
201}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
new file mode 100644
index 000000000000..da75efe534d5
--- /dev/null
+++ b/fs/btrfs/print-tree.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __PRINT_TREE_
20#define __PRINT_TREE_
21void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
22void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
23#endif
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
new file mode 100644
index 000000000000..a50ebb67055d
--- /dev/null
+++ b/fs/btrfs/ref-cache.c
@@ -0,0 +1,230 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "ref-cache.h"
22#include "transaction.h"
23
24/*
25 * leaf refs are used to cache the information about which extents
26 * a given leaf has references on. This allows us to process that leaf
27 * in btrfs_drop_snapshot without needing to read it back from disk.
28 */
29
30/*
31 * kmalloc a leaf reference struct and update the counters for the
32 * total ref cache size
33 */
34struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
35 int nr_extents)
36{
37 struct btrfs_leaf_ref *ref;
38 size_t size = btrfs_leaf_ref_size(nr_extents);
39
40 ref = kmalloc(size, GFP_NOFS);
41 if (ref) {
42 spin_lock(&root->fs_info->ref_cache_lock);
43 root->fs_info->total_ref_cache_size += size;
44 spin_unlock(&root->fs_info->ref_cache_lock);
45
46 memset(ref, 0, sizeof(*ref));
47 atomic_set(&ref->usage, 1);
48 INIT_LIST_HEAD(&ref->list);
49 }
50 return ref;
51}
52
53/*
54 * free a leaf reference struct and update the counters for the
55 * total ref cache size
56 */
57void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
58{
59 if (!ref)
60 return;
61 WARN_ON(atomic_read(&ref->usage) == 0);
62 if (atomic_dec_and_test(&ref->usage)) {
63 size_t size = btrfs_leaf_ref_size(ref->nritems);
64
65 BUG_ON(ref->in_tree);
66 kfree(ref);
67
68 spin_lock(&root->fs_info->ref_cache_lock);
69 root->fs_info->total_ref_cache_size -= size;
70 spin_unlock(&root->fs_info->ref_cache_lock);
71 }
72}
73
74static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
75 struct rb_node *node)
76{
77 struct rb_node ** p = &root->rb_node;
78 struct rb_node * parent = NULL;
79 struct btrfs_leaf_ref *entry;
80
81 while(*p) {
82 parent = *p;
83 entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
84
85 if (bytenr < entry->bytenr)
86 p = &(*p)->rb_left;
87 else if (bytenr > entry->bytenr)
88 p = &(*p)->rb_right;
89 else
90 return parent;
91 }
92
93 entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
94 rb_link_node(node, parent, p);
95 rb_insert_color(node, root);
96 return NULL;
97}
98
99static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
100{
101 struct rb_node * n = root->rb_node;
102 struct btrfs_leaf_ref *entry;
103
104 while(n) {
105 entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
106 WARN_ON(!entry->in_tree);
107
108 if (bytenr < entry->bytenr)
109 n = n->rb_left;
110 else if (bytenr > entry->bytenr)
111 n = n->rb_right;
112 else
113 return n;
114 }
115 return NULL;
116}
117
118int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
119 int shared)
120{
121 struct btrfs_leaf_ref *ref = NULL;
122 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
123
124 if (shared)
125 tree = &root->fs_info->shared_ref_tree;
126 if (!tree)
127 return 0;
128
129 spin_lock(&tree->lock);
130 while(!list_empty(&tree->list)) {
131 ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
132 BUG_ON(ref->tree != tree);
133 if (ref->root_gen > max_root_gen)
134 break;
135 if (!xchg(&ref->in_tree, 0)) {
136 cond_resched_lock(&tree->lock);
137 continue;
138 }
139
140 rb_erase(&ref->rb_node, &tree->root);
141 list_del_init(&ref->list);
142
143 spin_unlock(&tree->lock);
144 btrfs_free_leaf_ref(root, ref);
145 cond_resched();
146 spin_lock(&tree->lock);
147 }
148 spin_unlock(&tree->lock);
149 return 0;
150}
151
152/*
153 * find the leaf ref for a given extent. This returns the ref struct with
154 * a usage reference incremented
155 */
156struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
157 u64 bytenr)
158{
159 struct rb_node *rb;
160 struct btrfs_leaf_ref *ref = NULL;
161 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
162again:
163 if (tree) {
164 spin_lock(&tree->lock);
165 rb = tree_search(&tree->root, bytenr);
166 if (rb)
167 ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
168 if (ref)
169 atomic_inc(&ref->usage);
170 spin_unlock(&tree->lock);
171 if (ref)
172 return ref;
173 }
174 if (tree != &root->fs_info->shared_ref_tree) {
175 tree = &root->fs_info->shared_ref_tree;
176 goto again;
177 }
178 return NULL;
179}
180
181/*
182 * add a fully filled in leaf ref struct
183 * remove all the refs older than a given root generation
184 */
185int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
186 int shared)
187{
188 int ret = 0;
189 struct rb_node *rb;
190 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
191
192 if (shared)
193 tree = &root->fs_info->shared_ref_tree;
194
195 spin_lock(&tree->lock);
196 rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
197 if (rb) {
198 ret = -EEXIST;
199 } else {
200 atomic_inc(&ref->usage);
201 ref->tree = tree;
202 ref->in_tree = 1;
203 list_add_tail(&ref->list, &tree->list);
204 }
205 spin_unlock(&tree->lock);
206 return ret;
207}
208
209/*
210 * remove a single leaf ref from the tree. This drops the ref held by the tree
211 * only
212 */
213int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
214{
215 struct btrfs_leaf_ref_tree *tree;
216
217 if (!xchg(&ref->in_tree, 0))
218 return 0;
219
220 tree = ref->tree;
221 spin_lock(&tree->lock);
222
223 rb_erase(&ref->rb_node, &tree->root);
224 list_del_init(&ref->list);
225
226 spin_unlock(&tree->lock);
227
228 btrfs_free_leaf_ref(root, ref);
229 return 0;
230}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
new file mode 100644
index 000000000000..16f3183d7c59
--- /dev/null
+++ b/fs/btrfs/ref-cache.h
@@ -0,0 +1,77 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __REFCACHE__
19#define __REFCACHE__
20
21struct btrfs_extent_info {
22 /* bytenr and num_bytes find the extent in the extent allocation tree */
23 u64 bytenr;
24 u64 num_bytes;
25
26 /* objectid and offset find the back reference for the file */
27 u64 objectid;
28 u64 offset;
29};
30
31struct btrfs_leaf_ref {
32 struct rb_node rb_node;
33 struct btrfs_leaf_ref_tree *tree;
34 int in_tree;
35 atomic_t usage;
36
37 u64 root_gen;
38 u64 bytenr;
39 u64 owner;
40 u64 generation;
41 int nritems;
42
43 struct list_head list;
44 struct btrfs_extent_info extents[];
45};
46
47static inline size_t btrfs_leaf_ref_size(int nr_extents)
48{
49 return sizeof(struct btrfs_leaf_ref) +
50 sizeof(struct btrfs_extent_info) * nr_extents;
51}
52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{
55 tree->root.rb_node = NULL;
56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock);
58}
59
60static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
61{
62 return RB_EMPTY_ROOT(&tree->root);
63}
64
65void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
66struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
67 int nr_extents);
68void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
69struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
70 u64 bytenr);
71int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
72 int shared);
73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
74 int shared);
75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76
77#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
new file mode 100644
index 000000000000..dbe20d4c6ea4
--- /dev/null
+++ b/fs/btrfs/root-tree.c
@@ -0,0 +1,367 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "transaction.h"
21#include "disk-io.h"
22#include "print-tree.h"
23
24/*
25 * search forward for a root, starting with objectid 'search_start'
26 * if a root key is found, the objectid we find is filled into 'found_objectid'
27 * and 0 is returned. < 0 is returned on error, 1 if there is nothing
28 * left in the tree.
29 */
30int btrfs_search_root(struct btrfs_root *root, u64 search_start,
31 u64 *found_objectid)
32{
33 struct btrfs_path *path;
34 struct btrfs_key search_key;
35 int ret;
36
37 root = root->fs_info->tree_root;
38 search_key.objectid = search_start;
39 search_key.type = (u8)-1;
40 search_key.offset = (u64)-1;
41
42 path = btrfs_alloc_path();
43 BUG_ON(!path);
44again:
45 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
46 if (ret < 0)
47 goto out;
48 if (ret == 0) {
49 ret = 1;
50 goto out;
51 }
52 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
53 ret = btrfs_next_leaf(root, path);
54 if (ret)
55 goto out;
56 }
57 btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
58 if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
59 search_key.offset++;
60 btrfs_release_path(root, path);
61 goto again;
62 }
63 ret = 0;
64 *found_objectid = search_key.objectid;
65
66out:
67 btrfs_free_path(path);
68 return ret;
69}
70
71/*
72 * lookup the root with the highest offset for a given objectid. The key we do
73 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
74 * on error.
75 */
76int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
77 struct btrfs_root_item *item, struct btrfs_key *key)
78{
79 struct btrfs_path *path;
80 struct btrfs_key search_key;
81 struct btrfs_key found_key;
82 struct extent_buffer *l;
83 int ret;
84 int slot;
85
86 search_key.objectid = objectid;
87 search_key.type = BTRFS_ROOT_ITEM_KEY;
88 search_key.offset = (u64)-1;
89
90 path = btrfs_alloc_path();
91 BUG_ON(!path);
92 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
93 if (ret < 0)
94 goto out;
95
96 BUG_ON(ret == 0);
97 l = path->nodes[0];
98 BUG_ON(path->slots[0] == 0);
99 slot = path->slots[0] - 1;
100 btrfs_item_key_to_cpu(l, &found_key, slot);
101 if (found_key.objectid != objectid) {
102 ret = 1;
103 goto out;
104 }
105 read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
106 sizeof(*item));
107 memcpy(key, &found_key, sizeof(found_key));
108 ret = 0;
109out:
110 btrfs_free_path(path);
111 return ret;
112}
113
114/*
115 * copy the data in 'item' into the btree
116 */
117int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
118 *root, struct btrfs_key *key, struct btrfs_root_item
119 *item)
120{
121 struct btrfs_path *path;
122 struct extent_buffer *l;
123 int ret;
124 int slot;
125 unsigned long ptr;
126
127 path = btrfs_alloc_path();
128 BUG_ON(!path);
129 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
130 if (ret < 0)
131 goto out;
132
133 if (ret != 0) {
134 btrfs_print_leaf(root, path->nodes[0]);
135 printk("unable to update root key %Lu %u %Lu\n",
136 key->objectid, key->type, key->offset);
137 BUG_ON(1);
138 }
139
140 l = path->nodes[0];
141 slot = path->slots[0];
142 ptr = btrfs_item_ptr_offset(l, slot);
143 write_extent_buffer(l, item, ptr, sizeof(*item));
144 btrfs_mark_buffer_dirty(path->nodes[0]);
145out:
146 btrfs_release_path(root, path);
147 btrfs_free_path(path);
148 return ret;
149}
150
151int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
152 *root, struct btrfs_key *key, struct btrfs_root_item
153 *item)
154{
155 int ret;
156 ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
157 return ret;
158}
159
160/*
161 * at mount time we want to find all the old transaction snapshots that were in
162 * the process of being deleted if we crashed. This is any root item with an offset
163 * lower than the latest root. They need to be queued for deletion to finish
164 * what was happening when we crashed.
165 */
166int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
167 struct btrfs_root *latest)
168{
169 struct btrfs_root *dead_root;
170 struct btrfs_item *item;
171 struct btrfs_root_item *ri;
172 struct btrfs_key key;
173 struct btrfs_key found_key;
174 struct btrfs_path *path;
175 int ret;
176 u32 nritems;
177 struct extent_buffer *leaf;
178 int slot;
179
180 key.objectid = objectid;
181 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
182 key.offset = 0;
183 path = btrfs_alloc_path();
184 if (!path)
185 return -ENOMEM;
186
187again:
188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
189 if (ret < 0)
190 goto err;
191 while(1) {
192 leaf = path->nodes[0];
193 nritems = btrfs_header_nritems(leaf);
194 slot = path->slots[0];
195 if (slot >= nritems) {
196 ret = btrfs_next_leaf(root, path);
197 if (ret)
198 break;
199 leaf = path->nodes[0];
200 nritems = btrfs_header_nritems(leaf);
201 slot = path->slots[0];
202 }
203 item = btrfs_item_nr(leaf, slot);
204 btrfs_item_key_to_cpu(leaf, &key, slot);
205 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
206 goto next;
207
208 if (key.objectid < objectid)
209 goto next;
210
211 if (key.objectid > objectid)
212 break;
213
214 ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
215 if (btrfs_disk_root_refs(leaf, ri) != 0)
216 goto next;
217
218 memcpy(&found_key, &key, sizeof(key));
219 key.offset++;
220 btrfs_release_path(root, path);
221 dead_root =
222 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
223 &found_key);
224 if (IS_ERR(dead_root)) {
225 ret = PTR_ERR(dead_root);
226 goto err;
227 }
228
229 if (objectid == BTRFS_TREE_RELOC_OBJECTID)
230 ret = btrfs_add_dead_reloc_root(dead_root);
231 else
232 ret = btrfs_add_dead_root(dead_root, latest);
233 if (ret)
234 goto err;
235 goto again;
236next:
237 slot++;
238 path->slots[0]++;
239 }
240 ret = 0;
241err:
242 btrfs_free_path(path);
243 return ret;
244}
245
246/* drop the root item for 'key' from 'root' */
247int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
248 struct btrfs_key *key)
249{
250 struct btrfs_path *path;
251 int ret;
252 u32 refs;
253 struct btrfs_root_item *ri;
254 struct extent_buffer *leaf;
255
256 path = btrfs_alloc_path();
257 BUG_ON(!path);
258 ret = btrfs_search_slot(trans, root, key, path, -1, 1);
259 if (ret < 0)
260 goto out;
261 if (ret) {
262btrfs_print_leaf(root, path->nodes[0]);
263printk("failed to del %Lu %u %Lu\n", key->objectid, key->type, key->offset);
264
265 }
266 BUG_ON(ret != 0);
267 leaf = path->nodes[0];
268 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
269
270 refs = btrfs_disk_root_refs(leaf, ri);
271 BUG_ON(refs != 0);
272 ret = btrfs_del_item(trans, root, path);
273out:
274 btrfs_release_path(root, path);
275 btrfs_free_path(path);
276 return ret;
277}
278
279int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
280 struct btrfs_root *tree_root,
281 u64 root_id, u8 type, u64 ref_id)
282{
283 struct btrfs_key key;
284 int ret;
285 struct btrfs_path *path;
286
287 path = btrfs_alloc_path();
288
289 key.objectid = root_id;
290 key.type = type;
291 key.offset = ref_id;
292
293 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
294 BUG_ON(ret);
295
296 ret = btrfs_del_item(trans, tree_root, path);
297 BUG_ON(ret);
298
299 btrfs_free_path(path);
300 return ret;
301}
302
303int btrfs_find_root_ref(struct btrfs_root *tree_root,
304 struct btrfs_path *path,
305 u64 root_id, u64 ref_id)
306{
307 struct btrfs_key key;
308 int ret;
309
310 key.objectid = root_id;
311 key.type = BTRFS_ROOT_REF_KEY;
312 key.offset = ref_id;
313
314 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
315 return ret;
316}
317
318
319/*
320 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
321 * or BTRFS_ROOT_BACKREF_KEY.
322 *
323 * The dirid, sequence, name and name_len refer to the directory entry
324 * that is referencing the root.
325 *
326 * For a forward ref, the root_id is the id of the tree referencing
327 * the root and ref_id is the id of the subvol or snapshot.
328 *
329 * For a back ref the root_id is the id of the subvol or snapshot and
330 * ref_id is the id of the tree referencing it.
331 */
332int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
333 struct btrfs_root *tree_root,
334 u64 root_id, u8 type, u64 ref_id,
335 u64 dirid, u64 sequence,
336 const char *name, int name_len)
337{
338 struct btrfs_key key;
339 int ret;
340 struct btrfs_path *path;
341 struct btrfs_root_ref *ref;
342 struct extent_buffer *leaf;
343 unsigned long ptr;
344
345
346 path = btrfs_alloc_path();
347
348 key.objectid = root_id;
349 key.type = type;
350 key.offset = ref_id;
351
352 ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
353 sizeof(*ref) + name_len);
354 BUG_ON(ret);
355
356 leaf = path->nodes[0];
357 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
358 btrfs_set_root_ref_dirid(leaf, ref, dirid);
359 btrfs_set_root_ref_sequence(leaf, ref, sequence);
360 btrfs_set_root_ref_name_len(leaf, ref, name_len);
361 ptr = (unsigned long)(ref + 1);
362 write_extent_buffer(leaf, name, ptr, name_len);
363 btrfs_mark_buffer_dirty(leaf);
364
365 btrfs_free_path(path);
366 return ret;
367}
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
new file mode 100644
index 000000000000..cdedbe144d45
--- /dev/null
+++ b/fs/btrfs/struct-funcs.c
@@ -0,0 +1,132 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/highmem.h>
20
21/* this is some deeply nasty code. ctree.h has a different
22 * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
23 *
24 * The end result is that anyone who #includes ctree.h gets a
25 * declaration for the btrfs_set_foo functions and btrfs_foo functions
26 *
27 * This file declares the macros and then #includes ctree.h, which results
28 * in cpp creating the function here based on the template below.
29 *
30 * These setget functions do all the extent_buffer related mapping
31 * required to efficiently read and write specific fields in the extent
32 * buffers. Every pointer to metadata items in btrfs is really just
33 * an unsigned long offset into the extent buffer which has been
34 * cast to a specific type. This gives us all the gcc type checking.
35 *
36 * The extent buffer api is used to do all the kmapping and page
37 * spanning work required to get extent buffers in highmem and have
38 * a metadata blocksize different from the page size.
39 */
40
/*
 * BTRFS_SETGET_FUNCS(name, type, member, bits) generates
 *
 *	u<bits> btrfs_<name>(eb, s)          read  s->member
 *	void btrfs_set_<name>(eb, s, val)    write s->member
 *
 * where 's' is really an offset into the extent buffer 'eb' cast to a
 * 'type *', and 'member' is a little endian field that is 'bits' wide.
 *
 * Both functions first try the mapping cached in the extent buffer.  If
 * the field is not covered by it they map the range themselves, and if
 * mapping fails they fall back to read_eb_member()/write_eb_member().
 * A mapping is only torn down again when the buffer had no cached
 * map_token on entry.
 *
 * CPU order values live in u<bits> variables and on-disk values in
 * __le<bits> variables, so the two are never mixed up.
 */
#define BTRFS_SETGET_FUNCS(name, type, member, bits)			\
u##bits btrfs_##name(struct extent_buffer *eb,				\
				   type *s)				\
{									\
	unsigned long part_offset = (unsigned long)s;			\
	unsigned long offset = part_offset + offsetof(type, member);	\
	type *p;							\
	/* ugly, but we want the fast path here */			\
	if (eb->map_token && offset >= eb->map_start &&			\
	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
	    eb->map_len) {						\
		p = (type *)(eb->kaddr + part_offset - eb->map_start);	\
		return le##bits##_to_cpu(p->member);			\
	}								\
	{								\
		int err;						\
		char *map_token;					\
		char *kaddr;						\
		int unmap_on_exit = (eb->map_token == NULL);		\
		unsigned long map_start;				\
		unsigned long map_len;					\
		__le##bits disk_val;					\
		u##bits res;						\
		err = map_extent_buffer(eb, offset,			\
				sizeof(((type *)0)->member),		\
				&map_token, &kaddr,			\
				&map_start, &map_len, KM_USER1);	\
		if (err) {						\
			/* mapping failed, copy the field out instead */ \
			read_eb_member(eb, s, type, member, &disk_val);	\
			return le##bits##_to_cpu(disk_val);		\
		}							\
		p = (type *)(kaddr + part_offset - map_start);		\
		res = le##bits##_to_cpu(p->member);			\
		if (unmap_on_exit)					\
			unmap_extent_buffer(eb, map_token, KM_USER1);	\
		return res;						\
	}								\
}									\
void btrfs_set_##name(struct extent_buffer *eb,				\
				    type *s, u##bits val)		\
{									\
	unsigned long part_offset = (unsigned long)s;			\
	unsigned long offset = part_offset + offsetof(type, member);	\
	type *p;							\
	/* ugly, but we want the fast path here */			\
	if (eb->map_token && offset >= eb->map_start &&			\
	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
	    eb->map_len) {						\
		p = (type *)(eb->kaddr + part_offset - eb->map_start);	\
		p->member = cpu_to_le##bits(val);			\
		return;							\
	}								\
	{								\
		int err;						\
		char *map_token;					\
		char *kaddr;						\
		int unmap_on_exit = (eb->map_token == NULL);		\
		unsigned long map_start;				\
		unsigned long map_len;					\
		__le##bits disk_val;					\
		err = map_extent_buffer(eb, offset,			\
				sizeof(((type *)0)->member),		\
				&map_token, &kaddr,			\
				&map_start, &map_len, KM_USER1);	\
		if (err) {						\
			/* mapping failed, copy the field in instead */	\
			disk_val = cpu_to_le##bits(val);		\
			write_eb_member(eb, s, type, member, &disk_val); \
			return;						\
		}							\
		p = (type *)(kaddr + part_offset - map_start);		\
		p->member = cpu_to_le##bits(val);			\
		if (unmap_on_exit)					\
			unmap_extent_buffer(eb, map_token, KM_USER1);	\
	}								\
}
114
115#include "ctree.h"
116
117void btrfs_node_key(struct extent_buffer *eb,
118 struct btrfs_disk_key *disk_key, int nr)
119{
120 unsigned long ptr = btrfs_node_key_ptr_offset(nr);
121 if (eb->map_token && ptr >= eb->map_start &&
122 ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
123 memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
124 sizeof(*disk_key));
125 return;
126 } else if (eb->map_token) {
127 unmap_extent_buffer(eb, eb->map_token, KM_USER1);
128 eb->map_token = NULL;
129 }
130 read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
131 struct btrfs_key_ptr, key, disk_key);
132}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
new file mode 100644
index 000000000000..77c5eff3e209
--- /dev/null
+++ b/fs/btrfs/super.c
@@ -0,0 +1,713 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/module.h>
21#include <linux/buffer_head.h>
22#include <linux/fs.h>
23#include <linux/pagemap.h>
24#include <linux/highmem.h>
25#include <linux/time.h>
26#include <linux/init.h>
27#include <linux/string.h>
28#include <linux/smp_lock.h>
29#include <linux/backing-dev.h>
30#include <linux/mount.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/parser.h>
37#include <linux/ctype.h>
38#include <linux/namei.h>
39#include <linux/miscdevice.h>
40#include "ctree.h"
41#include "disk-io.h"
42#include "transaction.h"
43#include "btrfs_inode.h"
44#include "ioctl.h"
45#include "print-tree.h"
46#include "xattr.h"
47#include "volumes.h"
48#include "version.h"
49#include "export.h"
50#include "compression.h"
51
52#define BTRFS_SUPER_MAGIC 0x9123683E
53
54static struct super_operations btrfs_super_ops;
55
56static void btrfs_put_super (struct super_block * sb)
57{
58 struct btrfs_root *root = btrfs_sb(sb);
59 struct btrfs_fs_info *fs = root->fs_info;
60 int ret;
61
62 ret = close_ctree(root);
63 if (ret) {
64 printk("close ctree returns %d\n", ret);
65 }
66 btrfs_sysfs_del_super(fs);
67 sb->s_fs_info = NULL;
68}
69
70enum {
71 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
72 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
73 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err,
74};
75
76static match_table_t tokens = {
77 {Opt_degraded, "degraded"},
78 {Opt_subvol, "subvol=%s"},
79 {Opt_device, "device=%s"},
80 {Opt_nodatasum, "nodatasum"},
81 {Opt_nodatacow, "nodatacow"},
82 {Opt_nobarrier, "nobarrier"},
83 {Opt_max_extent, "max_extent=%s"},
84 {Opt_max_inline, "max_inline=%s"},
85 {Opt_alloc_start, "alloc_start=%s"},
86 {Opt_thread_pool, "thread_pool=%d"},
87 {Opt_compress, "compress"},
88 {Opt_ssd, "ssd"},
89 {Opt_noacl, "noacl"},
90 {Opt_err, NULL},
91};
92
93u64 btrfs_parse_size(char *str)
94{
95 u64 res;
96 int mult = 1;
97 char *end;
98 char last;
99
100 res = simple_strtoul(str, &end, 10);
101
102 last = end[0];
103 if (isalpha(last)) {
104 last = tolower(last);
105 switch (last) {
106 case 'g':
107 mult *= 1024;
108 case 'm':
109 mult *= 1024;
110 case 'k':
111 mult *= 1024;
112 }
113 res = res * mult;
114 }
115 return res;
116}
117
118/*
119 * Regular mount options parser. Everything that is needed only when
120 * reading in a new superblock is parsed here.
121 */
122int btrfs_parse_options(struct btrfs_root *root, char *options)
123{
124 struct btrfs_fs_info *info = root->fs_info;
125 substring_t args[MAX_OPT_ARGS];
126 char *p, *num;
127 int intarg;
128
129 if (!options)
130 return 0;
131
132 /*
133 * strsep changes the string, duplicate it because parse_options
134 * gets called twice
135 */
136 options = kstrdup(options, GFP_NOFS);
137 if (!options)
138 return -ENOMEM;
139
140
141 while ((p = strsep(&options, ",")) != NULL) {
142 int token;
143 if (!*p)
144 continue;
145
146 token = match_token(p, tokens, args);
147 switch (token) {
148 case Opt_degraded:
149 printk(KERN_INFO "btrfs: allowing degraded mounts\n");
150 btrfs_set_opt(info->mount_opt, DEGRADED);
151 break;
152 case Opt_subvol:
153 case Opt_device:
154 /*
155 * These are parsed by btrfs_parse_early_options
156 * and can be happily ignored here.
157 */
158 break;
159 case Opt_nodatasum:
160 printk(KERN_INFO "btrfs: setting nodatacsum\n");
161 btrfs_set_opt(info->mount_opt, NODATASUM);
162 break;
163 case Opt_nodatacow:
164 printk(KERN_INFO "btrfs: setting nodatacow\n");
165 btrfs_set_opt(info->mount_opt, NODATACOW);
166 btrfs_set_opt(info->mount_opt, NODATASUM);
167 break;
168 case Opt_compress:
169 printk(KERN_INFO "btrfs: use compression\n");
170 btrfs_set_opt(info->mount_opt, COMPRESS);
171 break;
172 case Opt_ssd:
173 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
174 btrfs_set_opt(info->mount_opt, SSD);
175 break;
176 case Opt_nobarrier:
177 printk(KERN_INFO "btrfs: turning off barriers\n");
178 btrfs_set_opt(info->mount_opt, NOBARRIER);
179 break;
180 case Opt_thread_pool:
181 intarg = 0;
182 match_int(&args[0], &intarg);
183 if (intarg) {
184 info->thread_pool_size = intarg;
185 printk(KERN_INFO "btrfs: thread pool %d\n",
186 info->thread_pool_size);
187 }
188 break;
189 case Opt_max_extent:
190 num = match_strdup(&args[0]);
191 if (num) {
192 info->max_extent = btrfs_parse_size(num);
193 kfree(num);
194
195 info->max_extent = max_t(u64,
196 info->max_extent, root->sectorsize);
197 printk(KERN_INFO "btrfs: max_extent at %llu\n",
198 info->max_extent);
199 }
200 break;
201 case Opt_max_inline:
202 num = match_strdup(&args[0]);
203 if (num) {
204 info->max_inline = btrfs_parse_size(num);
205 kfree(num);
206
207 if (info->max_inline) {
208 info->max_inline = max_t(u64,
209 info->max_inline,
210 root->sectorsize);
211 }
212 printk(KERN_INFO "btrfs: max_inline at %llu\n",
213 info->max_inline);
214 }
215 break;
216 case Opt_alloc_start:
217 num = match_strdup(&args[0]);
218 if (num) {
219 info->alloc_start = btrfs_parse_size(num);
220 kfree(num);
221 printk(KERN_INFO
222 "btrfs: allocations start at %llu\n",
223 info->alloc_start);
224 }
225 break;
226 case Opt_noacl:
227 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
228 break;
229 default:
230 break;
231 }
232 }
233 kfree(options);
234 return 0;
235}
236
237/*
238 * Parse mount options that are required early in the mount process.
239 *
240 * All other options will be parsed on much later in the mount process and
241 * only when we need to allocate a new super block.
242 */
243static int btrfs_parse_early_options(const char *options, int flags,
244 void *holder, char **subvol_name,
245 struct btrfs_fs_devices **fs_devices)
246{
247 substring_t args[MAX_OPT_ARGS];
248 char *opts, *p;
249 int error = 0;
250
251 if (!options)
252 goto out;
253
254 /*
255 * strsep changes the string, duplicate it because parse_options
256 * gets called twice
257 */
258 opts = kstrdup(options, GFP_KERNEL);
259 if (!opts)
260 return -ENOMEM;
261
262 while ((p = strsep(&opts, ",")) != NULL) {
263 int token;
264 if (!*p)
265 continue;
266
267 token = match_token(p, tokens, args);
268 switch (token) {
269 case Opt_subvol:
270 *subvol_name = match_strdup(&args[0]);
271 break;
272 case Opt_device:
273 error = btrfs_scan_one_device(match_strdup(&args[0]),
274 flags, holder, fs_devices);
275 if (error)
276 goto out_free_opts;
277 break;
278 default:
279 break;
280 }
281 }
282
283 out_free_opts:
284 kfree(opts);
285 out:
286 /*
287 * If no subvolume name is specified we use the default one. Allocate
288 * a copy of the string "." here so that code later in the
289 * mount path doesn't care if it's the default volume or another one.
290 */
291 if (!*subvol_name) {
292 *subvol_name = kstrdup(".", GFP_KERNEL);
293 if (!*subvol_name)
294 return -ENOMEM;
295 }
296 return error;
297}
298
299static int btrfs_fill_super(struct super_block * sb,
300 struct btrfs_fs_devices *fs_devices,
301 void * data, int silent)
302{
303 struct inode * inode;
304 struct dentry * root_dentry;
305 struct btrfs_super_block *disk_super;
306 struct btrfs_root *tree_root;
307 struct btrfs_inode *bi;
308 int err;
309
310 sb->s_maxbytes = MAX_LFS_FILESIZE;
311 sb->s_magic = BTRFS_SUPER_MAGIC;
312 sb->s_op = &btrfs_super_ops;
313 sb->s_export_op = &btrfs_export_ops;
314 sb->s_xattr = btrfs_xattr_handlers;
315 sb->s_time_gran = 1;
316 sb->s_flags |= MS_POSIXACL;
317
318 tree_root = open_ctree(sb, fs_devices, (char *)data);
319
320 if (IS_ERR(tree_root)) {
321 printk("btrfs: open_ctree failed\n");
322 return PTR_ERR(tree_root);
323 }
324 sb->s_fs_info = tree_root;
325 disk_super = &tree_root->fs_info->super_copy;
326 inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID,
327 tree_root->fs_info->fs_root);
328 bi = BTRFS_I(inode);
329 bi->location.objectid = inode->i_ino;
330 bi->location.offset = 0;
331 bi->root = tree_root->fs_info->fs_root;
332
333 btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
334
335 if (!inode) {
336 err = -ENOMEM;
337 goto fail_close;
338 }
339 if (inode->i_state & I_NEW) {
340 btrfs_read_locked_inode(inode);
341 unlock_new_inode(inode);
342 }
343
344 root_dentry = d_alloc_root(inode);
345 if (!root_dentry) {
346 iput(inode);
347 err = -ENOMEM;
348 goto fail_close;
349 }
350
351 /* this does the super kobj at the same time */
352 err = btrfs_sysfs_add_super(tree_root->fs_info);
353 if (err)
354 goto fail_close;
355
356 sb->s_root = root_dentry;
357
358 save_mount_options(sb, data);
359 return 0;
360
361fail_close:
362 close_ctree(tree_root);
363 return err;
364}
365
366int btrfs_sync_fs(struct super_block *sb, int wait)
367{
368 struct btrfs_trans_handle *trans;
369 struct btrfs_root *root;
370 int ret;
371 root = btrfs_sb(sb);
372
373 if (sb->s_flags & MS_RDONLY)
374 return 0;
375
376 sb->s_dirt = 0;
377 if (!wait) {
378 filemap_flush(root->fs_info->btree_inode->i_mapping);
379 return 0;
380 }
381
382 btrfs_start_delalloc_inodes(root);
383 btrfs_wait_ordered_extents(root, 0);
384
385 btrfs_clean_old_snapshots(root);
386 trans = btrfs_start_transaction(root, 1);
387 ret = btrfs_commit_transaction(trans, root);
388 sb->s_dirt = 0;
389 return ret;
390}
391
392static void btrfs_write_super(struct super_block *sb)
393{
394 sb->s_dirt = 0;
395}
396
397static int btrfs_test_super(struct super_block *s, void *data)
398{
399 struct btrfs_fs_devices *test_fs_devices = data;
400 struct btrfs_root *root = btrfs_sb(s);
401
402 return root->fs_info->fs_devices == test_fs_devices;
403}
404
405/*
406 * Find a superblock for the given device / mount point.
407 *
408 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
409 * for multiple device setup. Make sure to keep it in sync.
410 */
411static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
412 const char *dev_name, void *data, struct vfsmount *mnt)
413{
414 char *subvol_name = NULL;
415 struct block_device *bdev = NULL;
416 struct super_block *s;
417 struct dentry *root;
418 struct btrfs_fs_devices *fs_devices = NULL;
419 int error = 0;
420
421 error = btrfs_parse_early_options(data, flags, fs_type,
422 &subvol_name, &fs_devices);
423 if (error)
424 goto error;
425
426 error = btrfs_scan_one_device(dev_name, flags, fs_type, &fs_devices);
427 if (error)
428 goto error_free_subvol_name;
429
430 error = btrfs_open_devices(fs_devices, flags, fs_type);
431 if (error)
432 goto error_free_subvol_name;
433
434 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
435 error = -EACCES;
436 goto error_close_devices;
437 }
438
439 bdev = fs_devices->latest_bdev;
440 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
441 if (IS_ERR(s))
442 goto error_s;
443
444 if (s->s_root) {
445 if ((flags ^ s->s_flags) & MS_RDONLY) {
446 up_write(&s->s_umount);
447 deactivate_super(s);
448 error = -EBUSY;
449 goto error_close_devices;
450 }
451
452 btrfs_close_devices(fs_devices);
453 } else {
454 char b[BDEVNAME_SIZE];
455
456 s->s_flags = flags;
457 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
458 error = btrfs_fill_super(s, fs_devices, data,
459 flags & MS_SILENT ? 1 : 0);
460 if (error) {
461 up_write(&s->s_umount);
462 deactivate_super(s);
463 goto error;
464 }
465
466 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
467 s->s_flags |= MS_ACTIVE;
468 }
469
470 if (!strcmp(subvol_name, "."))
471 root = dget(s->s_root);
472 else {
473 mutex_lock(&s->s_root->d_inode->i_mutex);
474 root = lookup_one_len(subvol_name, s->s_root, strlen(subvol_name));
475 mutex_unlock(&s->s_root->d_inode->i_mutex);
476 if (IS_ERR(root)) {
477 up_write(&s->s_umount);
478 deactivate_super(s);
479 error = PTR_ERR(root);
480 goto error;
481 }
482 if (!root->d_inode) {
483 dput(root);
484 up_write(&s->s_umount);
485 deactivate_super(s);
486 error = -ENXIO;
487 goto error;
488 }
489 }
490
491 mnt->mnt_sb = s;
492 mnt->mnt_root = root;
493
494 kfree(subvol_name);
495 return 0;
496
497error_s:
498 error = PTR_ERR(s);
499error_close_devices:
500 btrfs_close_devices(fs_devices);
501error_free_subvol_name:
502 kfree(subvol_name);
503error:
504 return error;
505}
506
507static int btrfs_remount(struct super_block *sb, int *flags, char *data)
508{
509 struct btrfs_root *root = btrfs_sb(sb);
510 int ret;
511
512 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
513 return 0;
514
515 if (*flags & MS_RDONLY) {
516 sb->s_flags |= MS_RDONLY;
517
518 ret = btrfs_commit_super(root);
519 WARN_ON(ret);
520 } else {
521 if (root->fs_info->fs_devices->rw_devices == 0)
522 return -EACCES;
523
524 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
525 return -EINVAL;
526
527 ret = btrfs_cleanup_reloc_trees(root);
528 WARN_ON(ret);
529
530 ret = btrfs_cleanup_fs_roots(root->fs_info);
531 WARN_ON(ret);
532
533 sb->s_flags &= ~MS_RDONLY;
534 }
535
536 return 0;
537}
538
539static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
540{
541 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
542 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
543 int bits = dentry->d_sb->s_blocksize_bits;
544 __be32 *fsid = (__be32 *)root->fs_info->fsid;
545
546 buf->f_namelen = BTRFS_NAME_LEN;
547 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
548 buf->f_bfree = buf->f_blocks -
549 (btrfs_super_bytes_used(disk_super) >> bits);
550 buf->f_bavail = buf->f_bfree;
551 buf->f_bsize = dentry->d_sb->s_blocksize;
552 buf->f_type = BTRFS_SUPER_MAGIC;
553 /* We treat it as constant endianness (it doesn't matter _which_)
554 because we want the fsid to come out the same whether mounted
555 on a big-endian or little-endian host */
556 buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
557 buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
558 /* Mask in the root object ID too, to disambiguate subvols */
559 buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32;
560 buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid;
561
562 return 0;
563}
564
565static struct file_system_type btrfs_fs_type = {
566 .owner = THIS_MODULE,
567 .name = "btrfs",
568 .get_sb = btrfs_get_sb,
569 .kill_sb = kill_anon_super,
570 .fs_flags = FS_REQUIRES_DEV,
571};
572
573/*
574 * used by btrfsctl to scan devices when no FS is mounted
575 */
576static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
577 unsigned long arg)
578{
579 struct btrfs_ioctl_vol_args *vol;
580 struct btrfs_fs_devices *fs_devices;
581 int ret = 0;
582 int len;
583
584 vol = kmalloc(sizeof(*vol), GFP_KERNEL);
585 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
586 ret = -EFAULT;
587 goto out;
588 }
589 len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
590 switch (cmd) {
591 case BTRFS_IOC_SCAN_DEV:
592 ret = btrfs_scan_one_device(vol->name, MS_RDONLY,
593 &btrfs_fs_type, &fs_devices);
594 break;
595 }
596out:
597 kfree(vol);
598 return ret;
599}
600
601static void btrfs_write_super_lockfs(struct super_block *sb)
602{
603 struct btrfs_root *root = btrfs_sb(sb);
604 mutex_lock(&root->fs_info->transaction_kthread_mutex);
605 mutex_lock(&root->fs_info->cleaner_mutex);
606}
607
608static void btrfs_unlockfs(struct super_block *sb)
609{
610 struct btrfs_root *root = btrfs_sb(sb);
611 mutex_unlock(&root->fs_info->cleaner_mutex);
612 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
613}
614
615static struct super_operations btrfs_super_ops = {
616 .delete_inode = btrfs_delete_inode,
617 .put_super = btrfs_put_super,
618 .write_super = btrfs_write_super,
619 .sync_fs = btrfs_sync_fs,
620 .show_options = generic_show_options,
621 .write_inode = btrfs_write_inode,
622 .dirty_inode = btrfs_dirty_inode,
623 .alloc_inode = btrfs_alloc_inode,
624 .destroy_inode = btrfs_destroy_inode,
625 .statfs = btrfs_statfs,
626 .remount_fs = btrfs_remount,
627 .write_super_lockfs = btrfs_write_super_lockfs,
628 .unlockfs = btrfs_unlockfs,
629};
630
631static const struct file_operations btrfs_ctl_fops = {
632 .unlocked_ioctl = btrfs_control_ioctl,
633 .compat_ioctl = btrfs_control_ioctl,
634 .owner = THIS_MODULE,
635};
636
637static struct miscdevice btrfs_misc = {
638 .minor = MISC_DYNAMIC_MINOR,
639 .name = "btrfs-control",
640 .fops = &btrfs_ctl_fops
641};
642
643static int btrfs_interface_init(void)
644{
645 return misc_register(&btrfs_misc);
646}
647
648void btrfs_interface_exit(void)
649{
650 if (misc_deregister(&btrfs_misc) < 0)
651 printk("misc_deregister failed for control device");
652}
653
654static int __init init_btrfs_fs(void)
655{
656 int err;
657
658 err = btrfs_init_sysfs();
659 if (err)
660 return err;
661
662 err = btrfs_init_cachep();
663 if (err)
664 goto free_sysfs;
665
666 err = extent_io_init();
667 if (err)
668 goto free_cachep;
669
670 err = extent_map_init();
671 if (err)
672 goto free_extent_io;
673
674 err = btrfs_interface_init();
675 if (err)
676 goto free_extent_map;
677
678 err = register_filesystem(&btrfs_fs_type);
679 if (err)
680 goto unregister_ioctl;
681
682 printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
683 return 0;
684
685unregister_ioctl:
686 btrfs_interface_exit();
687free_extent_map:
688 extent_map_exit();
689free_extent_io:
690 extent_io_exit();
691free_cachep:
692 btrfs_destroy_cachep();
693free_sysfs:
694 btrfs_exit_sysfs();
695 return err;
696}
697
698static void __exit exit_btrfs_fs(void)
699{
700 btrfs_destroy_cachep();
701 extent_map_exit();
702 extent_io_exit();
703 btrfs_interface_exit();
704 unregister_filesystem(&btrfs_fs_type);
705 btrfs_exit_sysfs();
706 btrfs_cleanup_fs_uuids();
707 btrfs_zlib_exit();
708}
709
710module_init(init_btrfs_fs)
711module_exit(exit_btrfs_fs)
712
713MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
new file mode 100644
index 000000000000..300076e66765
--- /dev/null
+++ b/fs/btrfs/sysfs.c
@@ -0,0 +1,268 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/spinlock.h>
22#include <linux/completion.h>
23#include <linux/buffer_head.h>
24#include <linux/module.h>
25#include <linux/kobject.h>
26
27#include "ctree.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
32{
33 return snprintf(buf, PAGE_SIZE, "%llu\n",
34 (unsigned long long)btrfs_root_used(&root->root_item));
35}
36
37static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
38{
39 return snprintf(buf, PAGE_SIZE, "%llu\n",
40 (unsigned long long)btrfs_root_limit(&root->root_item));
41}
42
43static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
44{
45
46 return snprintf(buf, PAGE_SIZE, "%llu\n",
47 (unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
48}
49
50static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
51{
52 return snprintf(buf, PAGE_SIZE, "%llu\n",
53 (unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
54}
55
56static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
57{
58 return snprintf(buf, PAGE_SIZE, "%llu\n",
59 (unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
60}
61
62/* this is for root attrs (subvols/snapshots) */
63struct btrfs_root_attr {
64 struct attribute attr;
65 ssize_t (*show)(struct btrfs_root *, char *);
66 ssize_t (*store)(struct btrfs_root *, const char *, size_t);
67};
68
69#define ROOT_ATTR(name, mode, show, store) \
70static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, show, store)
71
72ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL);
73ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL);
74
75static struct attribute *btrfs_root_attrs[] = {
76 &btrfs_root_attr_blocks_used.attr,
77 &btrfs_root_attr_block_limit.attr,
78 NULL,
79};
80
81/* this is for super attrs (actual full fs) */
82struct btrfs_super_attr {
83 struct attribute attr;
84 ssize_t (*show)(struct btrfs_fs_info *, char *);
85 ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
86};
87
88#define SUPER_ATTR(name, mode, show, store) \
89static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, show, store)
90
91SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL);
92SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL);
93SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL);
94
95static struct attribute *btrfs_super_attrs[] = {
96 &btrfs_super_attr_blocks_used.attr,
97 &btrfs_super_attr_total_blocks.attr,
98 &btrfs_super_attr_blocksize.attr,
99 NULL,
100};
101
102static ssize_t btrfs_super_attr_show(struct kobject *kobj,
103 struct attribute *attr, char *buf)
104{
105 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
106 super_kobj);
107 struct btrfs_super_attr *a = container_of(attr,
108 struct btrfs_super_attr,
109 attr);
110
111 return a->show ? a->show(fs, buf) : 0;
112}
113
114static ssize_t btrfs_super_attr_store(struct kobject *kobj,
115 struct attribute *attr,
116 const char *buf, size_t len)
117{
118 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
119 super_kobj);
120 struct btrfs_super_attr *a = container_of(attr,
121 struct btrfs_super_attr,
122 attr);
123
124 return a->store ? a->store(fs, buf, len) : 0;
125}
126
127static ssize_t btrfs_root_attr_show(struct kobject *kobj,
128 struct attribute *attr, char *buf)
129{
130 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
131 root_kobj);
132 struct btrfs_root_attr *a = container_of(attr,
133 struct btrfs_root_attr,
134 attr);
135
136 return a->show ? a->show(root, buf) : 0;
137}
138
139static ssize_t btrfs_root_attr_store(struct kobject *kobj,
140 struct attribute *attr,
141 const char *buf, size_t len)
142{
143 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
144 root_kobj);
145 struct btrfs_root_attr *a = container_of(attr,
146 struct btrfs_root_attr,
147 attr);
148 return a->store ? a->store(root, buf, len) : 0;
149}
150
151static void btrfs_super_release(struct kobject *kobj)
152{
153 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
154 super_kobj);
155 complete(&fs->kobj_unregister);
156}
157
158static void btrfs_root_release(struct kobject *kobj)
159{
160 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
161 root_kobj);
162 complete(&root->kobj_unregister);
163}
164
165static struct sysfs_ops btrfs_super_attr_ops = {
166 .show = btrfs_super_attr_show,
167 .store = btrfs_super_attr_store,
168};
169
170static struct sysfs_ops btrfs_root_attr_ops = {
171 .show = btrfs_root_attr_show,
172 .store = btrfs_root_attr_store,
173};
174
175static struct kobj_type btrfs_root_ktype = {
176 .default_attrs = btrfs_root_attrs,
177 .sysfs_ops = &btrfs_root_attr_ops,
178 .release = btrfs_root_release,
179};
180
181static struct kobj_type btrfs_super_ktype = {
182 .default_attrs = btrfs_super_attrs,
183 .sysfs_ops = &btrfs_super_attr_ops,
184 .release = btrfs_super_release,
185};
186
187/* /sys/fs/btrfs/ entry */
188static struct kset *btrfs_kset;
189
190int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
191{
192 int error;
193 char *name;
194 char c;
195 int len = strlen(fs->sb->s_id) + 1;
196 int i;
197
198 name = kmalloc(len, GFP_NOFS);
199 if (!name) {
200 error = -ENOMEM;
201 goto fail;
202 }
203
204 for (i = 0; i < len; i++) {
205 c = fs->sb->s_id[i];
206 if (c == '/' || c == '\\')
207 c = '!';
208 name[i] = c;
209 }
210 name[len] = '\0';
211
212 fs->super_kobj.kset = btrfs_kset;
213 error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
214 NULL, "%s", name);
215 if (error)
216 goto fail;
217
218 kfree(name);
219 return 0;
220
221fail:
222 kfree(name);
223 printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
224 return error;
225}
226
227int btrfs_sysfs_add_root(struct btrfs_root *root)
228{
229 int error;
230
231 error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
232 &root->fs_info->super_kobj,
233 "%s", root->name);
234 if (error)
235 goto fail;
236
237 return 0;
238
239fail:
240 printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
241 return error;
242}
243
244void btrfs_sysfs_del_root(struct btrfs_root *root)
245{
246 kobject_put(&root->root_kobj);
247 wait_for_completion(&root->kobj_unregister);
248}
249
250void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
251{
252 kobject_put(&fs->super_kobj);
253 wait_for_completion(&fs->kobj_unregister);
254}
255
256int btrfs_init_sysfs(void)
257{
258 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
259 if (!btrfs_kset)
260 return -ENOMEM;
261 return 0;
262}
263
264void btrfs_exit_sysfs(void)
265{
266 kset_unregister(btrfs_kset);
267}
268
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
new file mode 100644
index 000000000000..c2c3b4281962
--- /dev/null
+++ b/fs/btrfs/transaction.c
@@ -0,0 +1,1102 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/sched.h>
21#include <linux/writeback.h>
22#include <linux/pagemap.h>
23#include <linux/blkdev.h>
24#include "ctree.h"
25#include "disk-io.h"
26#include "transaction.h"
27#include "locking.h"
28#include "ref-cache.h"
29#include "tree-log.h"
30
/*
 * Count of btrfs_transaction structs currently allocated: incremented in
 * join_transaction() and decremented on the final put_transaction().
 */
31static int total_trans = 0;
/* slab caches for handles and transactions; defined outside this file */
32extern struct kmem_cache *btrfs_trans_handle_cachep;
33extern struct kmem_cache *btrfs_transaction_cachep;
34
/*
 * Radix tree tag set on a root by btrfs_record_root_in_trans() and
 * looked up / cleared by add_dirty_roots().
 */
35#define BTRFS_ROOT_TRANS_TAG 0
36
37static noinline void put_transaction(struct btrfs_transaction *transaction)
38{
39 WARN_ON(transaction->use_count == 0);
40 transaction->use_count--;
41 if (transaction->use_count == 0) {
42 WARN_ON(total_trans == 0);
43 total_trans--;
44 list_del_init(&transaction->list);
45 memset(transaction, 0, sizeof(*transaction));
46 kmem_cache_free(btrfs_transaction_cachep, transaction);
47 }
48}
49
50/*
51 * either allocate a new transaction or hop into the existing one
52 */
53static noinline int join_transaction(struct btrfs_root *root)
54{
55 struct btrfs_transaction *cur_trans;
56 cur_trans = root->fs_info->running_transaction;
57 if (!cur_trans) {
58 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
59 GFP_NOFS);
60 total_trans++;
61 BUG_ON(!cur_trans);
62 root->fs_info->generation++;
63 root->fs_info->last_alloc = 0;
64 root->fs_info->last_data_alloc = 0;
65 cur_trans->num_writers = 1;
66 cur_trans->num_joined = 0;
67 cur_trans->transid = root->fs_info->generation;
68 init_waitqueue_head(&cur_trans->writer_wait);
69 init_waitqueue_head(&cur_trans->commit_wait);
70 cur_trans->in_commit = 0;
71 cur_trans->blocked = 0;
72 cur_trans->use_count = 1;
73 cur_trans->commit_done = 0;
74 cur_trans->start_time = get_seconds();
75 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
76 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
77 extent_io_tree_init(&cur_trans->dirty_pages,
78 root->fs_info->btree_inode->i_mapping,
79 GFP_NOFS);
80 spin_lock(&root->fs_info->new_trans_lock);
81 root->fs_info->running_transaction = cur_trans;
82 spin_unlock(&root->fs_info->new_trans_lock);
83 } else {
84 cur_trans->num_writers++;
85 cur_trans->num_joined++;
86 }
87
88 return 0;
89}
90
91/*
92 * this does all the record keeping required to make sure that a
93 * reference counted root is properly recorded in a given transaction.
94 * This is required to make sure the old root from before we joined the transaction
95 * is deleted when the transaction commits
96 */
/*
 * Dereferences fs_info->running_transaction unconditionally, so a running
 * transaction must exist when this is called.  Always returns 0; allocation
 * failures are BUG_ON()s.
 */
97noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
98{
99 struct btrfs_dirty_root *dirty;
100 u64 running_trans_id = root->fs_info->running_transaction->transid;
/* only ref_cows roots that were not yet recorded in this transaction */
101 if (root->ref_cows && root->last_trans < running_trans_id) {
102 WARN_ON(root == root->fs_info->extent_root);
103 if (root->root_item.refs != 0) {
/* tag the root so add_dirty_roots() can find it by this tag */
104 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
105 (unsigned long)root->root_key.objectid,
106 BTRFS_ROOT_TRANS_TAG);
107
108 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
109 BUG_ON(!dirty);
110 dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
111 BUG_ON(!dirty->root);
112 dirty->latest_root = root;
113 INIT_LIST_HEAD(&dirty->list);
114
/* remember the root node as it is right now */
115 root->commit_root = btrfs_root_node(root);
116
/*
 * dirty->root becomes a byte copy of the live root; its locks and list
 * head are re-initialised afterwards so the copy has its own state
 */
117 memcpy(dirty->root, root, sizeof(*root));
118 spin_lock_init(&dirty->root->node_lock);
119 spin_lock_init(&dirty->root->list_lock);
120 mutex_init(&dirty->root->objectid_mutex);
121 mutex_init(&dirty->root->log_mutex);
122 INIT_LIST_HEAD(&dirty->root->dead_list);
/* the copy points at the node saved above and has no commit_root itself */
123 dirty->root->node = root->commit_root;
124 dirty->root->commit_root = NULL;
125
/* chain the copy onto the live root's dead_list */
126 spin_lock(&root->list_lock);
127 list_add(&dirty->root->dead_list, &root->dead_list);
128 spin_unlock(&root->list_lock);
129
130 root->dirty_root = dirty;
131 } else {
/* a ref_cows root with zero refs is unexpected here */
132 WARN_ON(1);
133 }
/* mark the root as recorded even when the WARN_ON branch was taken */
134 root->last_trans = running_trans_id;
135 }
136 return 0;
137}
138
139/* wait for commit against the current transaction to become unblocked
140 * when this is done, it is safe to start a new transaction, but the current
141 * transaction might not be fully on disk.
142 */
143static void wait_current_trans(struct btrfs_root *root)
144{
145 struct btrfs_transaction *cur_trans;
146
147 cur_trans = root->fs_info->running_transaction;
148 if (cur_trans && cur_trans->blocked) {
149 DEFINE_WAIT(wait);
150 cur_trans->use_count++;
151 while(1) {
152 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
153 TASK_UNINTERRUPTIBLE);
154 if (cur_trans->blocked) {
155 mutex_unlock(&root->fs_info->trans_mutex);
156 schedule();
157 mutex_lock(&root->fs_info->trans_mutex);
158 finish_wait(&root->fs_info->transaction_wait,
159 &wait);
160 } else {
161 finish_wait(&root->fs_info->transaction_wait,
162 &wait);
163 break;
164 }
165 }
166 put_transaction(cur_trans);
167 }
168}
169
170static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
171 int num_blocks, int wait)
172{
173 struct btrfs_trans_handle *h =
174 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
175 int ret;
176
177 mutex_lock(&root->fs_info->trans_mutex);
178 if (!root->fs_info->log_root_recovering &&
179 ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
180 wait_current_trans(root);
181 ret = join_transaction(root);
182 BUG_ON(ret);
183
184 btrfs_record_root_in_trans(root);
185 h->transid = root->fs_info->running_transaction->transid;
186 h->transaction = root->fs_info->running_transaction;
187 h->blocks_reserved = num_blocks;
188 h->blocks_used = 0;
189 h->block_group = NULL;
190 h->alloc_exclude_nr = 0;
191 h->alloc_exclude_start = 0;
192 root->fs_info->running_transaction->use_count++;
193 mutex_unlock(&root->fs_info->trans_mutex);
194 return h;
195}
196
/*
 * Start a transaction handle with wait mode 1: wait for a blocked
 * transaction unless an ioctl transaction is open.
 */
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_blocks)
{
	return start_transaction(root, num_blocks, 1);
}
/*
 * Start a transaction handle with wait mode 0: join the running
 * transaction without waiting, even if it is blocked.
 */
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
						  int num_blocks)
{
	return start_transaction(root, num_blocks, 0);
}
207
/*
 * Start a transaction handle with wait mode 2: wait for a blocked
 * transaction regardless of fs_info->open_ioctl_trans.
 */
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
							 int num_blocks)
{
	return start_transaction(r, num_blocks, 2);
}
213
214/* wait for a transaction commit to be fully complete */
215static noinline int wait_for_commit(struct btrfs_root *root,
216 struct btrfs_transaction *commit)
217{
218 DEFINE_WAIT(wait);
219 mutex_lock(&root->fs_info->trans_mutex);
220 while(!commit->commit_done) {
221 prepare_to_wait(&commit->commit_wait, &wait,
222 TASK_UNINTERRUPTIBLE);
223 if (commit->commit_done)
224 break;
225 mutex_unlock(&root->fs_info->trans_mutex);
226 schedule();
227 mutex_lock(&root->fs_info->trans_mutex);
228 }
229 mutex_unlock(&root->fs_info->trans_mutex);
230 finish_wait(&commit->commit_wait, &wait);
231 return 0;
232}
233
234/*
235 * rate limit against the drop_snapshot code. This helps to slow down new operations
236 * if the drop_snapshot code isn't able to keep up.
237 */
238static void throttle_on_drops(struct btrfs_root *root)
239{
240 struct btrfs_fs_info *info = root->fs_info;
241 int harder_count = 0;
242
243harder:
244 if (atomic_read(&info->throttles)) {
245 DEFINE_WAIT(wait);
246 int thr;
247 thr = atomic_read(&info->throttle_gen);
248
249 do {
250 prepare_to_wait(&info->transaction_throttle,
251 &wait, TASK_UNINTERRUPTIBLE);
252 if (!atomic_read(&info->throttles)) {
253 finish_wait(&info->transaction_throttle, &wait);
254 break;
255 }
256 schedule();
257 finish_wait(&info->transaction_throttle, &wait);
258 } while (thr == atomic_read(&info->throttle_gen));
259 harder_count++;
260
261 if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
262 harder_count < 2)
263 goto harder;
264
265 if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
266 harder_count < 10)
267 goto harder;
268
269 if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
270 harder_count < 20)
271 goto harder;
272 }
273}
274
275void btrfs_throttle(struct btrfs_root *root)
276{
277 mutex_lock(&root->fs_info->trans_mutex);
278 if (!root->fs_info->open_ioctl_trans)
279 wait_current_trans(root);
280 mutex_unlock(&root->fs_info->trans_mutex);
281
282 throttle_on_drops(root);
283}
284
285static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
286 struct btrfs_root *root, int throttle)
287{
288 struct btrfs_transaction *cur_trans;
289 struct btrfs_fs_info *info = root->fs_info;
290
291 mutex_lock(&info->trans_mutex);
292 cur_trans = info->running_transaction;
293 WARN_ON(cur_trans != trans->transaction);
294 WARN_ON(cur_trans->num_writers < 1);
295 cur_trans->num_writers--;
296
297 if (waitqueue_active(&cur_trans->writer_wait))
298 wake_up(&cur_trans->writer_wait);
299 put_transaction(cur_trans);
300 mutex_unlock(&info->trans_mutex);
301 memset(trans, 0, sizeof(*trans));
302 kmem_cache_free(btrfs_trans_handle_cachep, trans);
303
304 if (throttle)
305 throttle_on_drops(root);
306
307 return 0;
308}
309
/* End a transaction handle without snapshot-drop throttling. */
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0);
}
315
/* End a transaction handle and then throttle against snapshot drops. */
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1);
}
321
322/*
323 * when btree blocks are allocated, they have some corresponding bits set for
324 * them in one of two extent_io trees. This is used to make sure all of
325 * those extents are on disk for transaction or log commit
326 */
/*
 * Two passes over the EXTENT_DIRTY ranges of @dirty_pages, using the btree
 * inode's page cache:
 *   1. start writeback on every page found in each dirty range;
 *   2. clear the dirty bits range by range and wait for writeback on each
 *      page, writing again any page that is found dirty.
 * Returns 0, or the most recent non-zero result of write_one_page().
 */
327int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
328 struct extent_io_tree *dirty_pages)
329{
330 int ret;
331 int err = 0;
332 int werr = 0;
333 struct page *page;
334 struct inode *btree_inode = root->fs_info->btree_inode;
335 u64 start = 0;
336 u64 end;
337 unsigned long index;
338
/* pass 1: the search resumes from wherever the previous range ended */
339 while(1) {
340 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
341 EXTENT_DIRTY);
342 if (ret)
343 break;
344 while(start <= end) {
345 cond_resched();
346
/* advance start to the next page before any 'continue' below */
347 index = start >> PAGE_CACHE_SHIFT;
348 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
349 page = find_get_page(btree_inode->i_mapping, index);
350 if (!page)
351 continue;
352
353 btree_lock_page_hook(page);
/* page no longer belongs to a mapping: nothing to write */
354 if (!page->mapping) {
355 unlock_page(page);
356 page_cache_release(page);
357 continue;
358 }
359
/*
 * already under writeback: wait only if it was dirtied again,
 * otherwise the in-flight write covers it and pass 2 will wait
 */
360 if (PageWriteback(page)) {
361 if (PageDirty(page))
362 wait_on_page_writeback(page);
363 else {
364 unlock_page(page);
365 page_cache_release(page);
366 continue;
367 }
368 }
/* NOTE(review): no unlock_page() follows, so write_one_page() presumably unlocks the page - confirm */
369 err = write_one_page(page, 0);
370 if (err)
371 werr = err;
372 page_cache_release(page);
373 }
374 }
/* pass 2: always search from 0; each range is cleared once handled */
375 while(1) {
376 ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
377 EXTENT_DIRTY);
378 if (ret)
379 break;
380
381 clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
382 while(start <= end) {
383 index = start >> PAGE_CACHE_SHIFT;
384 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
385 page = find_get_page(btree_inode->i_mapping, index);
386 if (!page)
387 continue;
/* dirtied again since pass 1: write it out once more */
388 if (PageDirty(page)) {
389 btree_lock_page_hook(page);
390 wait_on_page_writeback(page);
391 err = write_one_page(page, 0);
392 if (err)
393 werr = err;
394 }
395 wait_on_page_writeback(page);
396 page_cache_release(page);
397 cond_resched();
398 }
399 }
/* err holds only the result of the last write_one_page() call */
400 if (err)
401 werr = err;
402 return werr;
403}
404
405int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
406 struct btrfs_root *root)
407{
408 if (!trans || !trans->transaction) {
409 struct inode *btree_inode;
410 btree_inode = root->fs_info->btree_inode;
411 return filemap_write_and_wait(btree_inode->i_mapping);
412 }
413 return btrfs_write_and_wait_marked_extents(root,
414 &trans->transaction->dirty_pages);
415}
416
417/*
418 * this is used to update the root pointer in the tree of tree roots.
419 *
420 * But, in the case of the extent allocation tree, updating the root
421 * pointer may allocate blocks which may change the root of the extent
422 * allocation tree.
423 *
424 * So, this loops and repeats and makes sure the cowonly root didn't
425 * change while the root pointer was being updated in the metadata.
426 */
427static int update_cowonly_root(struct btrfs_trans_handle *trans,
428 struct btrfs_root *root)
429{
430 int ret;
431 u64 old_root_bytenr;
432 struct btrfs_root *tree_root = root->fs_info->tree_root;
433
434 btrfs_extent_post_op(trans, root);
435 btrfs_write_dirty_block_groups(trans, root);
436 btrfs_extent_post_op(trans, root);
437
438 while(1) {
439 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
440 if (old_root_bytenr == root->node->start)
441 break;
442 btrfs_set_root_bytenr(&root->root_item,
443 root->node->start);
444 btrfs_set_root_level(&root->root_item,
445 btrfs_header_level(root->node));
446 btrfs_set_root_generation(&root->root_item, trans->transid);
447
448 btrfs_extent_post_op(trans, root);
449
450 ret = btrfs_update_root(trans, tree_root,
451 &root->root_key,
452 &root->root_item);
453 BUG_ON(ret);
454 btrfs_write_dirty_block_groups(trans, root);
455 btrfs_extent_post_op(trans, root);
456 }
457 return 0;
458}
459
460/*
461 * update all the cowonly tree roots on disk
462 */
463int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
464 struct btrfs_root *root)
465{
466 struct btrfs_fs_info *fs_info = root->fs_info;
467 struct list_head *next;
468 struct extent_buffer *eb;
469
470 btrfs_extent_post_op(trans, fs_info->tree_root);
471
472 eb = btrfs_lock_root_node(fs_info->tree_root);
473 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
474 btrfs_tree_unlock(eb);
475 free_extent_buffer(eb);
476
477 btrfs_extent_post_op(trans, fs_info->tree_root);
478
479 while(!list_empty(&fs_info->dirty_cowonly_roots)) {
480 next = fs_info->dirty_cowonly_roots.next;
481 list_del_init(next);
482 root = list_entry(next, struct btrfs_root, dirty_list);
483
484 update_cowonly_root(trans, root);
485 }
486 return 0;
487}
488
489/*
490 * dead roots are old snapshots that need to be deleted. This allocates
491 * a dirty root struct and adds it into the list of dead roots that need to
492 * be deleted
493 */
494int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
495{
496 struct btrfs_dirty_root *dirty;
497
498 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
499 if (!dirty)
500 return -ENOMEM;
501 dirty->root = root;
502 dirty->latest_root = latest;
503
504 mutex_lock(&root->fs_info->trans_mutex);
505 list_add(&dirty->list, &latest->fs_info->dead_roots);
506 mutex_unlock(&root->fs_info->trans_mutex);
507 return 0;
508}
509
510/*
511 * at transaction commit time we need to schedule the old roots for
512 * deletion via btrfs_drop_snapshot. This runs through all the
513 * reference counted roots that were modified in the current
514 * transaction and puts them into the drop list
515 */
/*
 * @radix: radix tree of roots; entries carrying BTRFS_ROOT_TRANS_TAG are
 *         processed and have the tag cleared
 * @list:  receives the btrfs_dirty_root entries whose old root is to be
 *         dropped
 * Returns the err value left by the last root-tree operation.
 */
516static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
517 struct radix_tree_root *radix,
518 struct list_head *list)
519{
520 struct btrfs_dirty_root *dirty;
521 struct btrfs_root *gang[8];
522 struct btrfs_root *root;
523 int i;
524 int ret;
525 int err = 0;
526 u32 refs;
527
/* tags are cleared as roots are handled, so looking up from 0 each time terminates */
528 while(1) {
529 ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
530 ARRAY_SIZE(gang),
531 BTRFS_ROOT_TRANS_TAG);
532 if (ret == 0)
533 break;
534 for (i = 0; i < ret; i++) {
535 root = gang[i];
536 radix_tree_tag_clear(radix,
537 (unsigned long)root->root_key.objectid,
538 BTRFS_ROOT_TRANS_TAG);
539
540 BUG_ON(!root->ref_tree);
541 dirty = root->dirty_root;
542
543 btrfs_free_log(trans, root);
544 btrfs_free_reloc_root(trans, root);
545
/*
 * root node is still the one saved when the root was recorded:
 * there is no old root to drop, so release the saved reference
 * and the dirty copy and just refresh the root item
 */
546 if (root->commit_root == root->node) {
547 WARN_ON(root->node->start !=
548 btrfs_root_bytenr(&root->root_item));
549
550 free_extent_buffer(root->commit_root);
551 root->commit_root = NULL;
552 root->dirty_root = NULL;
553
554 spin_lock(&root->list_lock);
555 list_del_init(&dirty->root->dead_list);
556 spin_unlock(&root->list_lock);
557
558 kfree(dirty->root);
559 kfree(dirty);
560
561 /* make sure to update the root on disk
562 * so we get any updates to the block used
563 * counts
564 */
/* NOTE(review): this err is not checked and a later iteration may overwrite it */
565 err = btrfs_update_root(trans,
566 root->fs_info->tree_root,
567 &root->root_key,
568 &root->root_item);
569 continue;
570 }
571
/*
 * root changed: reset the drop state and insert a new root item
 * keyed by the current generation that points at the new node
 */
572 memset(&root->root_item.drop_progress, 0,
573 sizeof(struct btrfs_disk_key));
574 root->root_item.drop_level = 0;
575 root->commit_root = NULL;
576 root->dirty_root = NULL;
577 root->root_key.offset = root->fs_info->generation;
578 btrfs_set_root_bytenr(&root->root_item,
579 root->node->start);
580 btrfs_set_root_level(&root->root_item,
581 btrfs_header_level(root->node));
582 btrfs_set_root_generation(&root->root_item,
583 root->root_key.offset);
584
585 err = btrfs_insert_root(trans, root->fs_info->tree_root,
586 &root->root_key,
587 &root->root_item);
/* NOTE(review): this leaves only the for loop; the outer while(1) runs the lookup again */
588 if (err)
589 break;
590
/* drop one reference from the old copy of the root and write that back */
591 refs = btrfs_root_refs(&dirty->root->root_item);
592 btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
593 err = btrfs_update_root(trans, root->fs_info->tree_root,
594 &dirty->root->root_key,
595 &dirty->root->root_item);
596
597 BUG_ON(err);
/* that was the last reference: hand the old root to the caller's list */
598 if (refs == 1) {
599 list_add(&dirty->list, list);
600 } else {
601 WARN_ON(1);
602 free_extent_buffer(dirty->root->node);
603 kfree(dirty->root);
604 kfree(dirty);
605 }
606 }
607 }
608 return err;
609}
610
611/*
612 * defrag a given btree. If cacheonly == 1, this won't read from the disk,
613 * otherwise every leaf in the btree is read and defragged.
614 */
615int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
616{
617 struct btrfs_fs_info *info = root->fs_info;
618 int ret;
619 struct btrfs_trans_handle *trans;
620 unsigned long nr;
621
622 smp_mb();
623 if (root->defrag_running)
624 return 0;
625 trans = btrfs_start_transaction(root, 1);
626 while (1) {
627 root->defrag_running = 1;
628 ret = btrfs_defrag_leaves(trans, root, cacheonly);
629 nr = trans->blocks_used;
630 btrfs_end_transaction(trans, root);
631 btrfs_btree_balance_dirty(info->tree_root, nr);
632 cond_resched();
633
634 trans = btrfs_start_transaction(root, 1);
635 if (root->fs_info->closing || ret != -EAGAIN)
636 break;
637 }
638 root->defrag_running = 0;
639 smp_mb();
640 btrfs_end_transaction(trans, root);
641 return 0;
642}
643
644/*
645 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
646 * all of them
647 */
/*
 * Entries are taken from the tail of @list.  Each one is dropped in as many
 * transactions on @tree_root as btrfs_drop_snapshot() needs (it returns
 * -EAGAIN to ask for another round), then its root item is deleted and the
 * entry is freed.  fs_info->throttles is raised while an entry is being
 * dropped.  Returns the result of the last operation; failures are BUGs.
 */
648static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
649 struct list_head *list)
650{
651 struct btrfs_dirty_root *dirty;
652 struct btrfs_trans_handle *trans;
653 unsigned long nr;
654 u64 num_bytes;
655 u64 bytes_used;
656 u64 max_useless;
657 int ret = 0;
658 int err;
659
660 while(!list_empty(list)) {
661 struct btrfs_root *root;
662
663 dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
664 list_del_init(&dirty->list);
665
/* bytes used by the old root before dropping, compared again below */
666 num_bytes = btrfs_root_used(&dirty->root->root_item);
667 root = dirty->latest_root;
668 atomic_inc(&root->fs_info->throttles);
669
670 while(1) {
671 trans = btrfs_start_transaction(tree_root, 1);
672 mutex_lock(&root->fs_info->drop_mutex);
673 ret = btrfs_drop_snapshot(trans, dirty->root);
/* leaves the loop with drop_mutex held and trans still open */
674 if (ret != -EAGAIN) {
675 break;
676 }
677 mutex_unlock(&root->fs_info->drop_mutex);
678
/* more to do: save the old root's item, then end this transaction */
679 err = btrfs_update_root(trans,
680 tree_root,
681 &dirty->root->root_key,
682 &dirty->root->root_item);
/* NOTE(review): ret is overwritten two statements below, so err is effectively discarded */
683 if (err)
684 ret = err;
685 nr = trans->blocks_used;
686 ret = btrfs_end_transaction(trans, tree_root);
687 BUG_ON(ret);
688
689 btrfs_btree_balance_dirty(tree_root, nr);
690 cond_resched();
691 }
692 BUG_ON(ret);
693 atomic_dec(&root->fs_info->throttles);
694 wake_up(&root->fs_info->transaction_throttle);
695
/* subtract whatever the drop freed from the latest root's used count */
696 num_bytes -= btrfs_root_used(&dirty->root->root_item);
697 bytes_used = btrfs_root_used(&root->root_item);
698 if (num_bytes) {
699 btrfs_record_root_in_trans(root);
700 btrfs_set_root_used(&root->root_item,
701 bytes_used - num_bytes);
702 }
703
704 ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
705 if (ret) {
706 BUG();
707 break;
708 }
709 mutex_unlock(&root->fs_info->drop_mutex);
710
/*
 * take the dropped root off the dead_list; max_useless is derived
 * from the entry at the tail of that list, or from the latest root
 * when the list is empty, and passed to btrfs_remove_leaf_refs()
 */
711 spin_lock(&root->list_lock);
712 list_del_init(&dirty->root->dead_list);
713 if (!list_empty(&root->dead_list)) {
714 struct btrfs_root *oldest;
715 oldest = list_entry(root->dead_list.prev,
716 struct btrfs_root, dead_list);
717 max_useless = oldest->root_key.offset - 1;
718 } else {
719 max_useless = root->root_key.offset - 1;
720 }
721 spin_unlock(&root->list_lock);
722
723 nr = trans->blocks_used;
724 ret = btrfs_end_transaction(trans, tree_root);
725 BUG_ON(ret);
726
727 ret = btrfs_remove_leaf_refs(root, max_useless, 0);
728 BUG_ON(ret);
729
730 free_extent_buffer(dirty->root->node);
731 kfree(dirty->root);
732 kfree(dirty);
733
734 btrfs_btree_balance_dirty(tree_root, nr);
735 cond_resched();
736 }
737 return ret;
738}
739
740/*
741 * new snapshots need to be created at a very specific time in the
742 * transaction commit. This does the actual creation
743 */
/*
 * Copies the root item of pending->root, COWs and copies its root node
 * under a new objectid, and inserts the new root item into the tree of tree
 * roots.  On success pending->root_key holds the new root's key with offset
 * (u64)-1.  Returns 0, -ENOMEM, or the error from
 * btrfs_find_free_objectid() / btrfs_insert_root().
 */
744static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
745 struct btrfs_fs_info *fs_info,
746 struct btrfs_pending_snapshot *pending)
747{
748 struct btrfs_key key;
749 struct btrfs_root_item *new_root_item;
750 struct btrfs_root *tree_root = fs_info->tree_root;
751 struct btrfs_root *root = pending->root;
752 struct extent_buffer *tmp;
753 struct extent_buffer *old;
754 int ret;
755 u64 objectid;
756
757 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
758 if (!new_root_item) {
759 ret = -ENOMEM;
760 goto fail;
761 }
762 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
763 if (ret)
764 goto fail;
765
/* the new item starts out as a copy of the source root's item */
766 btrfs_record_root_in_trans(root);
767 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
768 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
769
770 key.objectid = objectid;
771 key.offset = trans->transid;
772 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
773
/* COW the source root node, then copy it for the new objectid */
774 old = btrfs_lock_root_node(root);
775 btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
776
777 btrfs_copy_root(trans, root, old, &tmp, objectid);
778 btrfs_tree_unlock(old);
779 free_extent_buffer(old);
780
781 btrfs_set_root_bytenr(new_root_item, tmp->start);
782 btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
783 btrfs_set_root_generation(new_root_item, trans->transid);
784 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
785 new_root_item);
/* NOTE(review): tmp is unlocked here, so btrfs_copy_root() presumably returns it locked - confirm */
786 btrfs_tree_unlock(tmp);
787 free_extent_buffer(tmp);
788 if (ret)
789 goto fail;
790
/* the key handed back to the caller uses offset (u64)-1, not the transid */
791 key.offset = (u64)-1;
792 memcpy(&pending->root_key, &key, sizeof(key));
/* also reached on success; new_root_item may be NULL here */
793fail:
794 kfree(new_root_item);
795 return ret;
796}
797
798static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
799 struct btrfs_pending_snapshot *pending)
800{
801 int ret;
802 int namelen;
803 u64 index = 0;
804 struct btrfs_trans_handle *trans;
805 struct inode *parent_inode;
806 struct inode *inode;
807 struct btrfs_root *parent_root;
808
809 parent_inode = pending->dentry->d_parent->d_inode;
810 parent_root = BTRFS_I(parent_inode)->root;
811 trans = btrfs_start_transaction(parent_root, 1);
812
813 /*
814 * insert the directory item
815 */
816 namelen = strlen(pending->name);
817 ret = btrfs_set_inode_index(parent_inode, &index);
818 ret = btrfs_insert_dir_item(trans, parent_root,
819 pending->name, namelen,
820 parent_inode->i_ino,
821 &pending->root_key, BTRFS_FT_DIR, index);
822
823 if (ret)
824 goto fail;
825
826 /* add the backref first */
827 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
828 pending->root_key.objectid,
829 BTRFS_ROOT_BACKREF_KEY,
830 parent_root->root_key.objectid,
831 parent_inode->i_ino, index, pending->name,
832 namelen);
833
834 BUG_ON(ret);
835
836 /* now add the forward ref */
837 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
838 parent_root->root_key.objectid,
839 BTRFS_ROOT_REF_KEY,
840 pending->root_key.objectid,
841 parent_inode->i_ino, index, pending->name,
842 namelen);
843
844 inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
845 d_instantiate(pending->dentry, inode);
846fail:
847 btrfs_end_transaction(trans, fs_info->fs_root);
848 return ret;
849}
850
851/*
852 * create all the snapshots we've scheduled for creation
853 */
854static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
855 struct btrfs_fs_info *fs_info)
856{
857 struct btrfs_pending_snapshot *pending;
858 struct list_head *head = &trans->transaction->pending_snapshots;
859 struct list_head *cur;
860 int ret;
861
862 list_for_each(cur, head) {
863 pending = list_entry(cur, struct btrfs_pending_snapshot, list);
864 ret = create_pending_snapshot(trans, fs_info, pending);
865 BUG_ON(ret);
866 }
867 return 0;
868}
869
870static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
871 struct btrfs_fs_info *fs_info)
872{
873 struct btrfs_pending_snapshot *pending;
874 struct list_head *head = &trans->transaction->pending_snapshots;
875 int ret;
876
877 while(!list_empty(head)) {
878 pending = list_entry(head->next,
879 struct btrfs_pending_snapshot, list);
880 ret = finish_pending_snapshot(fs_info, pending);
881 BUG_ON(ret);
882 list_del(&pending->list);
883 kfree(pending->name);
884 kfree(pending);
885 }
886 return 0;
887}
888
889int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
890 struct btrfs_root *root)
891{
892 unsigned long joined = 0;
893 unsigned long timeout = 1;
894 struct btrfs_transaction *cur_trans;
895 struct btrfs_transaction *prev_trans = NULL;
896 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
897 struct list_head dirty_fs_roots;
898 struct extent_io_tree *pinned_copy;
899 DEFINE_WAIT(wait);
900 int ret;
901
902 INIT_LIST_HEAD(&dirty_fs_roots);
903 mutex_lock(&root->fs_info->trans_mutex);
904 if (trans->transaction->in_commit) {
905 cur_trans = trans->transaction;
906 trans->transaction->use_count++;
907 mutex_unlock(&root->fs_info->trans_mutex);
908 btrfs_end_transaction(trans, root);
909
910 ret = wait_for_commit(root, cur_trans);
911 BUG_ON(ret);
912
913 mutex_lock(&root->fs_info->trans_mutex);
914 put_transaction(cur_trans);
915 mutex_unlock(&root->fs_info->trans_mutex);
916
917 return 0;
918 }
919
920 pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
921 if (!pinned_copy)
922 return -ENOMEM;
923
924 extent_io_tree_init(pinned_copy,
925 root->fs_info->btree_inode->i_mapping, GFP_NOFS);
926
927 trans->transaction->in_commit = 1;
928 trans->transaction->blocked = 1;
929 cur_trans = trans->transaction;
930 if (cur_trans->list.prev != &root->fs_info->trans_list) {
931 prev_trans = list_entry(cur_trans->list.prev,
932 struct btrfs_transaction, list);
933 if (!prev_trans->commit_done) {
934 prev_trans->use_count++;
935 mutex_unlock(&root->fs_info->trans_mutex);
936
937 wait_for_commit(root, prev_trans);
938
939 mutex_lock(&root->fs_info->trans_mutex);
940 put_transaction(prev_trans);
941 }
942 }
943
944 do {
945 int snap_pending = 0;
946 joined = cur_trans->num_joined;
947 if (!list_empty(&trans->transaction->pending_snapshots))
948 snap_pending = 1;
949
950 WARN_ON(cur_trans != trans->transaction);
951 prepare_to_wait(&cur_trans->writer_wait, &wait,
952 TASK_UNINTERRUPTIBLE);
953
954 if (cur_trans->num_writers > 1)
955 timeout = MAX_SCHEDULE_TIMEOUT;
956 else
957 timeout = 1;
958
959 mutex_unlock(&root->fs_info->trans_mutex);
960
961 if (snap_pending) {
962 ret = btrfs_wait_ordered_extents(root, 1);
963 BUG_ON(ret);
964 }
965
966 schedule_timeout(timeout);
967
968 mutex_lock(&root->fs_info->trans_mutex);
969 finish_wait(&cur_trans->writer_wait, &wait);
970 } while (cur_trans->num_writers > 1 ||
971 (cur_trans->num_joined != joined));
972
973 ret = create_pending_snapshots(trans, root->fs_info);
974 BUG_ON(ret);
975
976 WARN_ON(cur_trans != trans->transaction);
977
978 /* btrfs_commit_tree_roots is responsible for getting the
979 * various roots consistent with each other. Every pointer
980 * in the tree of tree roots has to point to the most up to date
981 * root for every subvolume and other tree. So, we have to keep
982 * the tree logging code from jumping in and changing any
983 * of the trees.
984 *
985 * At this point in the commit, there can't be any tree-log
986 * writers, but a little lower down we drop the trans mutex
987 * and let new people in. By holding the tree_log_mutex
988 * from now until after the super is written, we avoid races
989 * with the tree-log code.
990 */
991 mutex_lock(&root->fs_info->tree_log_mutex);
992 /*
993 * keep tree reloc code from adding new reloc trees
994 */
995 mutex_lock(&root->fs_info->tree_reloc_mutex);
996
997
998 ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
999 &dirty_fs_roots);
1000 BUG_ON(ret);
1001
1002 /* add_dirty_roots gets rid of all the tree log roots, it is now
1003 * safe to free the root of tree log roots
1004 */
1005 btrfs_free_log_root_tree(trans, root->fs_info);
1006
1007 ret = btrfs_commit_tree_roots(trans, root);
1008 BUG_ON(ret);
1009
1010 cur_trans = root->fs_info->running_transaction;
1011 spin_lock(&root->fs_info->new_trans_lock);
1012 root->fs_info->running_transaction = NULL;
1013 spin_unlock(&root->fs_info->new_trans_lock);
1014 btrfs_set_super_generation(&root->fs_info->super_copy,
1015 cur_trans->transid);
1016 btrfs_set_super_root(&root->fs_info->super_copy,
1017 root->fs_info->tree_root->node->start);
1018 btrfs_set_super_root_level(&root->fs_info->super_copy,
1019 btrfs_header_level(root->fs_info->tree_root->node));
1020
1021 btrfs_set_super_chunk_root(&root->fs_info->super_copy,
1022 chunk_root->node->start);
1023 btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
1024 btrfs_header_level(chunk_root->node));
1025 btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
1026 btrfs_header_generation(chunk_root->node));
1027
1028 if (!root->fs_info->log_root_recovering) {
1029 btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
1030 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
1031 }
1032
1033 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
1034 sizeof(root->fs_info->super_copy));
1035
1036 btrfs_copy_pinned(root, pinned_copy);
1037
1038 trans->transaction->blocked = 0;
1039 wake_up(&root->fs_info->transaction_throttle);
1040 wake_up(&root->fs_info->transaction_wait);
1041
1042 mutex_unlock(&root->fs_info->trans_mutex);
1043 ret = btrfs_write_and_wait_transaction(trans, root);
1044 BUG_ON(ret);
1045 write_ctree_super(trans, root);
1046
1047 /*
1048 * the super is written, we can safely allow the tree-loggers
1049 * to go about their business
1050 */
1051 mutex_unlock(&root->fs_info->tree_log_mutex);
1052
1053 btrfs_finish_extent_commit(trans, root, pinned_copy);
1054 kfree(pinned_copy);
1055
1056 btrfs_drop_dead_reloc_roots(root);
1057 mutex_unlock(&root->fs_info->tree_reloc_mutex);
1058
1059 /* do the directory inserts of any pending snapshot creations */
1060 finish_pending_snapshots(trans, root->fs_info);
1061
1062 mutex_lock(&root->fs_info->trans_mutex);
1063
1064 cur_trans->commit_done = 1;
1065 root->fs_info->last_trans_committed = cur_trans->transid;
1066 wake_up(&cur_trans->commit_wait);
1067
1068 put_transaction(cur_trans);
1069 put_transaction(cur_trans);
1070
1071 list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
1072 if (root->fs_info->closing)
1073 list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
1074
1075 mutex_unlock(&root->fs_info->trans_mutex);
1076
1077 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1078
1079 if (root->fs_info->closing) {
1080 drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
1081 }
1082 return ret;
1083}
1084
1085/*
1086 * interface function to delete all the snapshots we have scheduled for deletion
1087 */
1088int btrfs_clean_old_snapshots(struct btrfs_root *root)
1089{
1090 struct list_head dirty_roots;
1091 INIT_LIST_HEAD(&dirty_roots);
1092again:
1093 mutex_lock(&root->fs_info->trans_mutex);
1094 list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
1095 mutex_unlock(&root->fs_info->trans_mutex);
1096
1097 if (!list_empty(&dirty_roots)) {
1098 drop_dirty_roots(root, &dirty_roots);
1099 goto again;
1100 }
1101 return 0;
1102}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
new file mode 100644
index 000000000000..202c8be6c05d
--- /dev/null
+++ b/fs/btrfs/transaction.h
@@ -0,0 +1,106 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h"
22
23struct btrfs_transaction {
24 u64 transid;
25 unsigned long num_writers;
26 unsigned long num_joined;
27 int in_commit;
28 int use_count;
29 int commit_done;
30 int blocked;
31 struct list_head list;
32 struct extent_io_tree dirty_pages;
33 unsigned long start_time;
34 wait_queue_head_t writer_wait;
35 wait_queue_head_t commit_wait;
36 struct list_head pending_snapshots;
37};
38
39struct btrfs_trans_handle {
40 u64 transid;
41 unsigned long blocks_reserved;
42 unsigned long blocks_used;
43 struct btrfs_transaction *transaction;
44 struct btrfs_block_group_cache *block_group;
45 u64 alloc_exclude_start;
46 u64 alloc_exclude_nr;
47};
48
49struct btrfs_pending_snapshot {
50 struct dentry *dentry;
51 struct btrfs_root *root;
52 char *name;
53 struct btrfs_key root_key;
54 struct list_head list;
55};
56
57struct btrfs_dirty_root {
58 struct list_head list;
59 struct btrfs_root *root;
60 struct btrfs_root *latest_root;
61};
62
63static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
64 struct inode *inode)
65{
66 trans->block_group = BTRFS_I(inode)->block_group;
67}
68
69static inline void btrfs_update_inode_block_group(struct
70 btrfs_trans_handle *trans,
71 struct inode *inode)
72{
73 BTRFS_I(inode)->block_group = trans->block_group;
74}
75
76static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
77 struct inode *inode)
78{
79 BTRFS_I(inode)->last_trans = trans->transaction->transid;
80}
81
82int btrfs_end_transaction(struct btrfs_trans_handle *trans,
83 struct btrfs_root *root);
84struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
85 int num_blocks);
86struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
87 int num_blocks);
88struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
89 int num_blocks);
90int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
91 struct btrfs_root *root);
92int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
93 struct btrfs_root *root);
94
95int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest);
96int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
97int btrfs_clean_old_snapshots(struct btrfs_root *root);
98int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
99 struct btrfs_root *root);
100int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
101 struct btrfs_root *root);
102void btrfs_throttle(struct btrfs_root *root);
103int btrfs_record_root_in_trans(struct btrfs_root *root);
104int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
105 struct extent_io_tree *dirty_pages);
106#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 000000000000..a6a3956cedfb
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,146 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "print-tree.h"
23#include "transaction.h"
24#include "locking.h"
25
26/* defrag all the leaves in a given btree. If cache_only == 1, don't read things
27 * from disk, otherwise read all the leaves and try to get key order to
28 * better reflect disk order
29 */
30int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, int cache_only)
32{
33 struct btrfs_path *path = NULL;
34 struct btrfs_key key;
35 int ret = 0;
36 int wret;
37 int level;
38 int orig_level;
39 int is_extent = 0;
40 int next_key_ret = 0;
41 u64 last_ret = 0;
42 u64 min_trans = 0;
43
44 if (cache_only)
45 goto out;
46
47 if (root->fs_info->extent_root == root) {
48 /*
49 * there's recursion here right now in the tree locking,
50 * we can't defrag the extent root without deadlock
51 */
52 goto out;
53 }
54
55 if (root->ref_cows == 0 && !is_extent)
56 goto out;
57
58 if (btrfs_test_opt(root, SSD))
59 goto out;
60
61 path = btrfs_alloc_path();
62 if (!path)
63 return -ENOMEM;
64
65 level = btrfs_header_level(root->node);
66 orig_level = level;
67
68 if (level == 0) {
69 goto out;
70 }
71 if (root->defrag_progress.objectid == 0) {
72 struct extent_buffer *root_node;
73 u32 nritems;
74
75 root_node = btrfs_lock_root_node(root);
76 nritems = btrfs_header_nritems(root_node);
77 root->defrag_max.objectid = 0;
78 /* from above we know this is not a leaf */
79 btrfs_node_key_to_cpu(root_node, &root->defrag_max,
80 nritems - 1);
81 btrfs_tree_unlock(root_node);
82 free_extent_buffer(root_node);
83 memset(&key, 0, sizeof(key));
84 } else {
85 memcpy(&key, &root->defrag_progress, sizeof(key));
86 }
87
88 path->keep_locks = 1;
89 if (cache_only)
90 min_trans = root->defrag_trans_start;
91
92 ret = btrfs_search_forward(root, &key, NULL, path,
93 cache_only, min_trans);
94 if (ret < 0)
95 goto out;
96 if (ret > 0) {
97 ret = 0;
98 goto out;
99 }
100 btrfs_release_path(root, path);
101 wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
102
103 if (wret < 0) {
104 ret = wret;
105 goto out;
106 }
107 if (!path->nodes[1]) {
108 ret = 0;
109 goto out;
110 }
111 path->slots[1] = btrfs_header_nritems(path->nodes[1]);
112 next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
113 min_trans);
114 ret = btrfs_realloc_node(trans, root,
115 path->nodes[1], 0,
116 cache_only, &last_ret,
117 &root->defrag_progress);
118 WARN_ON(ret && ret != -EAGAIN);
119 if (next_key_ret == 0) {
120 memcpy(&root->defrag_progress, &key, sizeof(key));
121 ret = -EAGAIN;
122 }
123
124 btrfs_release_path(root, path);
125 if (is_extent)
126 btrfs_extent_post_op(trans, root);
127out:
128 if (path)
129 btrfs_free_path(path);
130 if (ret == -EAGAIN) {
131 if (root->defrag_max.objectid > root->defrag_progress.objectid)
132 goto done;
133 if (root->defrag_max.type > root->defrag_progress.type)
134 goto done;
135 if (root->defrag_max.offset > root->defrag_progress.offset)
136 goto done;
137 ret = 0;
138 }
139done:
140 if (ret != -EAGAIN) {
141 memset(&root->defrag_progress, 0,
142 sizeof(root->defrag_progress));
143 root->defrag_trans_start = trans->transid;
144 }
145 return ret;
146}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 000000000000..be4fc30a30e4
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,2896 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "transaction.h"
22#include "disk-io.h"
23#include "locking.h"
24#include "print-tree.h"
25#include "compat.h"
26
27/* magic values for the inode_only field in btrfs_log_inode:
28 *
29 * LOG_INODE_ALL means to log everything
30 * LOG_INODE_EXISTS means to log just enough to recreate the inode
31 * during log replay
32 */
33#define LOG_INODE_ALL 0
34#define LOG_INODE_EXISTS 1
35
36/*
37 * stages for the tree walking. The first
38 * stage (0) is to only pin down the blocks we find
39 * the second stage (1) is to make sure that all the inodes
40 * we find in the log are created in the subvolume.
41 *
42 * The last stage is to deal with directories and links and extents
43 * and all the other fun semantics
44 */
45#define LOG_WALK_PIN_ONLY 0
46#define LOG_WALK_REPLAY_INODES 1
47#define LOG_WALK_REPLAY_ALL 2
48
49static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
50 struct btrfs_root *root, struct inode *inode,
51 int inode_only);
52
53/*
54 * tree logging is a special write ahead log used to make sure that
55 * fsyncs and O_SYNCs can happen without doing full tree commits.
56 *
57 * Full tree commits are expensive because they require commonly
58 * modified blocks to be recowed, creating many dirty pages in the
59 * extent tree and a 4x-6x higher write load than ext3.
60 *
61 * Instead of doing a tree commit on every fsync, we use the
62 * key ranges and transaction ids to find items for a given file or directory
63 * that have changed in this transaction. Those items are copied into
64 * a special tree (one per subvolume root), that tree is written to disk
65 * and then the fsync is considered complete.
66 *
67 * After a crash, items are copied out of the log-tree back into the
68 * subvolume tree. Any file data extents found are recorded in the extent
69 * allocation tree, and the log-tree freed.
70 *
71 * The log tree is read three times: once to pin down all the extents it is
72 * using in ram, once to create all the inodes logged in the tree
73 * and once to do all the other items.
74 */
75
76/*
77 * btrfs_add_log_tree adds a new per-subvolume log tree into the
78 * tree of log tree roots. This must be called with a tree log transaction
79 * running (see start_log_trans).
80 */
81int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
82 struct btrfs_root *root)
83{
84 struct btrfs_key key;
85 struct btrfs_root_item root_item;
86 struct btrfs_inode_item *inode_item;
87 struct extent_buffer *leaf;
88 struct btrfs_root *new_root = root;
89 int ret;
90 u64 objectid = root->root_key.objectid;
91
92 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
93 BTRFS_TREE_LOG_OBJECTID,
94 trans->transid, 0, 0, 0);
95 if (IS_ERR(leaf)) {
96 ret = PTR_ERR(leaf);
97 return ret;
98 }
99
100 btrfs_set_header_nritems(leaf, 0);
101 btrfs_set_header_level(leaf, 0);
102 btrfs_set_header_bytenr(leaf, leaf->start);
103 btrfs_set_header_generation(leaf, trans->transid);
104 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
105
106 write_extent_buffer(leaf, root->fs_info->fsid,
107 (unsigned long)btrfs_header_fsid(leaf),
108 BTRFS_FSID_SIZE);
109 btrfs_mark_buffer_dirty(leaf);
110
111 inode_item = &root_item.inode;
112 memset(inode_item, 0, sizeof(*inode_item));
113 inode_item->generation = cpu_to_le64(1);
114 inode_item->size = cpu_to_le64(3);
115 inode_item->nlink = cpu_to_le32(1);
116 inode_item->nbytes = cpu_to_le64(root->leafsize);
117 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
118
119 btrfs_set_root_bytenr(&root_item, leaf->start);
120 btrfs_set_root_generation(&root_item, trans->transid);
121 btrfs_set_root_level(&root_item, 0);
122 btrfs_set_root_refs(&root_item, 0);
123 btrfs_set_root_used(&root_item, 0);
124
125 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
126 root_item.drop_level = 0;
127
128 btrfs_tree_unlock(leaf);
129 free_extent_buffer(leaf);
130 leaf = NULL;
131
132 btrfs_set_root_dirid(&root_item, 0);
133
134 key.objectid = BTRFS_TREE_LOG_OBJECTID;
135 key.offset = objectid;
136 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
137 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
138 &root_item);
139 if (ret)
140 goto fail;
141
142 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
143 &key);
144 BUG_ON(!new_root);
145
146 WARN_ON(root->log_root);
147 root->log_root = new_root;
148
149 /*
150 * log trees do not get reference counted because they go away
151 * before a real commit is actually done. They do store pointers
152 * to file data extents, and those reference counts still get
153 * updated (along with back refs to the log tree).
154 */
155 new_root->ref_cows = 0;
156 new_root->last_trans = trans->transid;
157fail:
158 return ret;
159}
160
161/*
162 * start a sub transaction and setup the log tree
163 * this increments the log tree writer count to make the people
164 * syncing the tree wait for us to finish
165 */
166static int start_log_trans(struct btrfs_trans_handle *trans,
167 struct btrfs_root *root)
168{
169 int ret;
170 mutex_lock(&root->fs_info->tree_log_mutex);
171 if (!root->fs_info->log_root_tree) {
172 ret = btrfs_init_log_root_tree(trans, root->fs_info);
173 BUG_ON(ret);
174 }
175 if (!root->log_root) {
176 ret = btrfs_add_log_tree(trans, root);
177 BUG_ON(ret);
178 }
179 atomic_inc(&root->fs_info->tree_log_writers);
180 root->fs_info->tree_log_batch++;
181 mutex_unlock(&root->fs_info->tree_log_mutex);
182 return 0;
183}
184
185/*
186 * returns 0 if there was a log transaction running and we were able
187 * to join, or returns -ENOENT if there were not transactions
188 * in progress
189 */
190static int join_running_log_trans(struct btrfs_root *root)
191{
192 int ret = -ENOENT;
193
194 smp_mb();
195 if (!root->log_root)
196 return -ENOENT;
197
198 mutex_lock(&root->fs_info->tree_log_mutex);
199 if (root->log_root) {
200 ret = 0;
201 atomic_inc(&root->fs_info->tree_log_writers);
202 root->fs_info->tree_log_batch++;
203 }
204 mutex_unlock(&root->fs_info->tree_log_mutex);
205 return ret;
206}
207
208/*
209 * indicate we're done making changes to the log tree
210 * and wake up anyone waiting to do a sync
211 */
212static int end_log_trans(struct btrfs_root *root)
213{
214 atomic_dec(&root->fs_info->tree_log_writers);
215 smp_mb();
216 if (waitqueue_active(&root->fs_info->tree_log_wait))
217 wake_up(&root->fs_info->tree_log_wait);
218 return 0;
219}
220
221
222/*
223 * the walk control struct is used to pass state down the chain when
224 * processing the log tree. The stage field tells us which part
225 * of the log tree processing we are currently doing. The others
226 * are state fields used for that specific part
227 */
228struct walk_control {
229 /* should we free the extent on disk when done? This is used
230 * at transaction commit time while freeing a log tree
231 */
232 int free;
233
234 /* should we write out the extent buffer? This is used
235 * while flushing the log tree to disk during a sync
236 */
237 int write;
238
239 /* should we wait for the extent buffer io to finish? Also used
240 * while flushing the log tree to disk for a sync
241 */
242 int wait;
243
244 /* pin only walk, we record which extents on disk belong to the
245 * log trees
246 */
247 int pin;
248
249 /* what stage of the replay code we're currently in */
250 int stage;
251
252 /* the root we are currently replaying */
253 struct btrfs_root *replay_dest;
254
255 /* the trans handle for the current replay */
256 struct btrfs_trans_handle *trans;
257
258 /* the function that gets used to process blocks we find in the
259 * tree. Note the extent_buffer might not be up to date when it is
260 * passed in, and it must be checked or read if you need the data
261 * inside it
262 */
263 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
264 struct walk_control *wc, u64 gen);
265};
266
267/*
268 * process_func used to pin down extents, write them or wait on them
269 */
270static int process_one_buffer(struct btrfs_root *log,
271 struct extent_buffer *eb,
272 struct walk_control *wc, u64 gen)
273{
274 if (wc->pin) {
275 mutex_lock(&log->fs_info->pinned_mutex);
276 btrfs_update_pinned_extents(log->fs_info->extent_root,
277 eb->start, eb->len, 1);
278 mutex_unlock(&log->fs_info->pinned_mutex);
279 }
280
281 if (btrfs_buffer_uptodate(eb, gen)) {
282 if (wc->write)
283 btrfs_write_tree_block(eb);
284 if (wc->wait)
285 btrfs_wait_tree_block_writeback(eb);
286 }
287 return 0;
288}
289
290/*
291 * Item overwrite used by replay and tree logging. eb, slot and key all refer
292 * to the src data we are copying out.
293 *
294 * root is the tree we are copying into, and path is a scratch
295 * path for use in this function (it should be released on entry and
296 * will be released on exit).
297 *
298 * If the key is already in the destination tree the existing item is
299 * overwritten. If the existing item isn't big enough, it is extended.
300 * If it is too large, it is truncated.
301 *
302 * If the key isn't in the destination yet, a new item is inserted.
303 */
304static noinline int overwrite_item(struct btrfs_trans_handle *trans,
305 struct btrfs_root *root,
306 struct btrfs_path *path,
307 struct extent_buffer *eb, int slot,
308 struct btrfs_key *key)
309{
310 int ret;
311 u32 item_size;
312 u64 saved_i_size = 0;
313 int save_old_i_size = 0;
314 unsigned long src_ptr;
315 unsigned long dst_ptr;
316 int overwrite_root = 0;
317
318 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
319 overwrite_root = 1;
320
321 item_size = btrfs_item_size_nr(eb, slot);
322 src_ptr = btrfs_item_ptr_offset(eb, slot);
323
324 /* look for the key in the destination tree */
325 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
326 if (ret == 0) {
327 char *src_copy;
328 char *dst_copy;
329 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
330 path->slots[0]);
331 if (dst_size != item_size)
332 goto insert;
333
334 if (item_size == 0) {
335 btrfs_release_path(root, path);
336 return 0;
337 }
338 dst_copy = kmalloc(item_size, GFP_NOFS);
339 src_copy = kmalloc(item_size, GFP_NOFS);
340
341 read_extent_buffer(eb, src_copy, src_ptr, item_size);
342
343 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
344 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
345 item_size);
346 ret = memcmp(dst_copy, src_copy, item_size);
347
348 kfree(dst_copy);
349 kfree(src_copy);
350 /*
351 * they have the same contents, just return, this saves
352 * us from cowing blocks in the destination tree and doing
353 * extra writes that may not have been done by a previous
354 * sync
355 */
356 if (ret == 0) {
357 btrfs_release_path(root, path);
358 return 0;
359 }
360
361 }
362insert:
363 btrfs_release_path(root, path);
364 /* try to insert the key into the destination tree */
365 ret = btrfs_insert_empty_item(trans, root, path,
366 key, item_size);
367
368 /* make sure any existing item is the correct size */
369 if (ret == -EEXIST) {
370 u32 found_size;
371 found_size = btrfs_item_size_nr(path->nodes[0],
372 path->slots[0]);
373 if (found_size > item_size) {
374 btrfs_truncate_item(trans, root, path, item_size, 1);
375 } else if (found_size < item_size) {
376 ret = btrfs_del_item(trans, root,
377 path);
378 BUG_ON(ret);
379
380 btrfs_release_path(root, path);
381 ret = btrfs_insert_empty_item(trans,
382 root, path, key, item_size);
383 BUG_ON(ret);
384 }
385 } else if (ret) {
386 BUG();
387 }
388 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
389 path->slots[0]);
390
391 /* don't overwrite an existing inode if the generation number
392 * was logged as zero. This is done when the tree logging code
393 * is just logging an inode to make sure it exists after recovery.
394 *
395 * Also, don't overwrite i_size on directories during replay.
396 * log replay inserts and removes directory items based on the
397 * state of the tree found in the subvolume, and i_size is modified
398 * as it goes
399 */
400 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
401 struct btrfs_inode_item *src_item;
402 struct btrfs_inode_item *dst_item;
403
404 src_item = (struct btrfs_inode_item *)src_ptr;
405 dst_item = (struct btrfs_inode_item *)dst_ptr;
406
407 if (btrfs_inode_generation(eb, src_item) == 0)
408 goto no_copy;
409
410 if (overwrite_root &&
411 S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
412 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
413 save_old_i_size = 1;
414 saved_i_size = btrfs_inode_size(path->nodes[0],
415 dst_item);
416 }
417 }
418
419 copy_extent_buffer(path->nodes[0], eb, dst_ptr,
420 src_ptr, item_size);
421
422 if (save_old_i_size) {
423 struct btrfs_inode_item *dst_item;
424 dst_item = (struct btrfs_inode_item *)dst_ptr;
425 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
426 }
427
428 /* make sure the generation is filled in */
429 if (key->type == BTRFS_INODE_ITEM_KEY) {
430 struct btrfs_inode_item *dst_item;
431 dst_item = (struct btrfs_inode_item *)dst_ptr;
432 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
433 btrfs_set_inode_generation(path->nodes[0], dst_item,
434 trans->transid);
435 }
436 }
437
438 if (overwrite_root &&
439 key->type == BTRFS_EXTENT_DATA_KEY) {
440 int extent_type;
441 struct btrfs_file_extent_item *fi;
442
443 fi = (struct btrfs_file_extent_item *)dst_ptr;
444 extent_type = btrfs_file_extent_type(path->nodes[0], fi);
445 if (extent_type == BTRFS_FILE_EXTENT_REG ||
446 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
447 struct btrfs_key ins;
448 ins.objectid = btrfs_file_extent_disk_bytenr(
449 path->nodes[0], fi);
450 ins.offset = btrfs_file_extent_disk_num_bytes(
451 path->nodes[0], fi);
452 ins.type = BTRFS_EXTENT_ITEM_KEY;
453
454 /*
455 * is this extent already allocated in the extent
456 * allocation tree? If so, just add a reference
457 */
458 ret = btrfs_lookup_extent(root, ins.objectid,
459 ins.offset);
460 if (ret == 0) {
461 ret = btrfs_inc_extent_ref(trans, root,
462 ins.objectid, ins.offset,
463 path->nodes[0]->start,
464 root->root_key.objectid,
465 trans->transid, key->objectid);
466 } else {
467 /*
468 * insert the extent pointer in the extent
469 * allocation tree
470 */
471 ret = btrfs_alloc_logged_extent(trans, root,
472 path->nodes[0]->start,
473 root->root_key.objectid,
474 trans->transid, key->objectid,
475 &ins);
476 BUG_ON(ret);
477 }
478 }
479 }
480no_copy:
481 btrfs_mark_buffer_dirty(path->nodes[0]);
482 btrfs_release_path(root, path);
483 return 0;
484}
485
486/*
487 * simple helper to read an inode off the disk from a given root
488 * This can only be called for subvolume roots and not for the log
489 */
490static noinline struct inode *read_one_inode(struct btrfs_root *root,
491 u64 objectid)
492{
493 struct inode *inode;
494 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
495 if (inode->i_state & I_NEW) {
496 BTRFS_I(inode)->root = root;
497 BTRFS_I(inode)->location.objectid = objectid;
498 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
499 BTRFS_I(inode)->location.offset = 0;
500 btrfs_read_locked_inode(inode);
501 unlock_new_inode(inode);
502
503 }
504 if (is_bad_inode(inode)) {
505 iput(inode);
506 inode = NULL;
507 }
508 return inode;
509}
510
511/* replays a single extent in 'eb' at 'slot' with 'key' into the
512 * subvolume 'root'. path is released on entry and should be released
513 * on exit.
514 *
515 * extents in the log tree have not been allocated out of the extent
516 * tree yet. So, this completes the allocation, taking a reference
517 * as required if the extent already exists or creating a new extent
518 * if it isn't in the extent allocation tree yet.
519 *
520 * The extent is inserted into the file, dropping any existing extents
521 * from the file that overlap the new one.
522 */
523static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
524 struct btrfs_root *root,
525 struct btrfs_path *path,
526 struct extent_buffer *eb, int slot,
527 struct btrfs_key *key)
528{
529 int found_type;
530 u64 mask = root->sectorsize - 1;
531 u64 extent_end;
532 u64 alloc_hint;
533 u64 start = key->offset;
534 struct btrfs_file_extent_item *item;
535 struct inode *inode = NULL;
536 unsigned long size;
537 int ret = 0;
538
539 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
540 found_type = btrfs_file_extent_type(eb, item);
541
542 if (found_type == BTRFS_FILE_EXTENT_REG ||
543 found_type == BTRFS_FILE_EXTENT_PREALLOC)
544 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
545 else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
546 size = btrfs_file_extent_inline_len(eb, item);
547 extent_end = (start + size + mask) & ~mask;
548 } else {
549 ret = 0;
550 goto out;
551 }
552
553 inode = read_one_inode(root, key->objectid);
554 if (!inode) {
555 ret = -EIO;
556 goto out;
557 }
558
559 /*
560 * first check to see if we already have this extent in the
561 * file. This must be done before the btrfs_drop_extents run
562 * so we don't try to drop this extent.
563 */
564 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
565 start, 0);
566
567 if (ret == 0 &&
568 (found_type == BTRFS_FILE_EXTENT_REG ||
569 found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
570 struct btrfs_file_extent_item cmp1;
571 struct btrfs_file_extent_item cmp2;
572 struct btrfs_file_extent_item *existing;
573 struct extent_buffer *leaf;
574
575 leaf = path->nodes[0];
576 existing = btrfs_item_ptr(leaf, path->slots[0],
577 struct btrfs_file_extent_item);
578
579 read_extent_buffer(eb, &cmp1, (unsigned long)item,
580 sizeof(cmp1));
581 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
582 sizeof(cmp2));
583
584 /*
585 * we already have a pointer to this exact extent,
586 * we don't have to do anything
587 */
588 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
589 btrfs_release_path(root, path);
590 goto out;
591 }
592 }
593 btrfs_release_path(root, path);
594
595 /* drop any overlapping extents */
596 ret = btrfs_drop_extents(trans, root, inode,
597 start, extent_end, start, &alloc_hint);
598 BUG_ON(ret);
599
600 /* insert the extent */
601 ret = overwrite_item(trans, root, path, eb, slot, key);
602 BUG_ON(ret);
603
604 /* btrfs_drop_extents changes i_bytes & i_blocks, update it here */
605 inode_add_bytes(inode, extent_end - start);
606 btrfs_update_inode(trans, root, inode);
607out:
608 if (inode)
609 iput(inode);
610 return ret;
611}
612
613/*
614 * when cleaning up conflicts between the directory names in the
615 * subvolume, directory names in the log and directory names in the
616 * inode back references, we may have to unlink inodes from directories.
617 *
618 * This is a helper function to do the unlink of a specific directory
619 * item
620 */
621static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
622 struct btrfs_root *root,
623 struct btrfs_path *path,
624 struct inode *dir,
625 struct btrfs_dir_item *di)
626{
627 struct inode *inode;
628 char *name;
629 int name_len;
630 struct extent_buffer *leaf;
631 struct btrfs_key location;
632 int ret;
633
634 leaf = path->nodes[0];
635
636 btrfs_dir_item_key_to_cpu(leaf, di, &location);
637 name_len = btrfs_dir_name_len(leaf, di);
638 name = kmalloc(name_len, GFP_NOFS);
639 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
640 btrfs_release_path(root, path);
641
642 inode = read_one_inode(root, location.objectid);
643 BUG_ON(!inode);
644
645 btrfs_inc_nlink(inode);
646 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
647 kfree(name);
648
649 iput(inode);
650 return ret;
651}
652
653/*
654 * helper function to see if a given name and sequence number found
655 * in an inode back reference are already in a directory and correctly
656 * point to this inode
657 */
658static noinline int inode_in_dir(struct btrfs_root *root,
659 struct btrfs_path *path,
660 u64 dirid, u64 objectid, u64 index,
661 const char *name, int name_len)
662{
663 struct btrfs_dir_item *di;
664 struct btrfs_key location;
665 int match = 0;
666
667 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
668 index, name, name_len, 0);
669 if (di && !IS_ERR(di)) {
670 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
671 if (location.objectid != objectid)
672 goto out;
673 } else
674 goto out;
675 btrfs_release_path(root, path);
676
677 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
678 if (di && !IS_ERR(di)) {
679 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
680 if (location.objectid != objectid)
681 goto out;
682 } else
683 goto out;
684 match = 1;
685out:
686 btrfs_release_path(root, path);
687 return match;
688}
689
690/*
691 * helper function to check a log tree for a named back reference in
692 * an inode. This is used to decide if a back reference that is
693 * found in the subvolume conflicts with what we find in the log.
694 *
695 * inode backreferences may have multiple refs in a single item,
696 * during replay we process one reference at a time, and we don't
697 * want to delete valid links to a file from the subvolume if that
698 * link is also in the log.
699 */
700static noinline int backref_in_log(struct btrfs_root *log,
701 struct btrfs_key *key,
702 char *name, int namelen)
703{
704 struct btrfs_path *path;
705 struct btrfs_inode_ref *ref;
706 unsigned long ptr;
707 unsigned long ptr_end;
708 unsigned long name_ptr;
709 int found_name_len;
710 int item_size;
711 int ret;
712 int match = 0;
713
714 path = btrfs_alloc_path();
715 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
716 if (ret != 0)
717 goto out;
718
719 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
720 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
721 ptr_end = ptr + item_size;
722 while (ptr < ptr_end) {
723 ref = (struct btrfs_inode_ref *)ptr;
724 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
725 if (found_name_len == namelen) {
726 name_ptr = (unsigned long)(ref + 1);
727 ret = memcmp_extent_buffer(path->nodes[0], name,
728 name_ptr, namelen);
729 if (ret == 0) {
730 match = 1;
731 goto out;
732 }
733 }
734 ptr = (unsigned long)(ref + 1) + found_name_len;
735 }
736out:
737 btrfs_free_path(path);
738 return match;
739}
740
741
742/*
743 * replay one inode back reference item found in the log tree.
744 * eb, slot and key refer to the buffer and key found in the log tree.
745 * root is the destination we are replaying into, and path is for temp
746 * use by this function. (it should be released on return).
747 */
748static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
749 struct btrfs_root *root,
750 struct btrfs_root *log,
751 struct btrfs_path *path,
752 struct extent_buffer *eb, int slot,
753 struct btrfs_key *key)
754{
755 struct inode *dir;
756 int ret;
757 struct btrfs_key location;
758 struct btrfs_inode_ref *ref;
759 struct btrfs_dir_item *di;
760 struct inode *inode;
761 char *name;
762 int namelen;
763 unsigned long ref_ptr;
764 unsigned long ref_end;
765
766 location.objectid = key->objectid;
767 location.type = BTRFS_INODE_ITEM_KEY;
768 location.offset = 0;
769
770 /*
771 * it is possible that we didn't log all the parent directories
772 * for a given inode. If we don't find the dir, just don't
773 * copy the back ref in. The link count fixup code will take
774 * care of the rest
775 */
776 dir = read_one_inode(root, key->offset);
777 if (!dir)
778 return -ENOENT;
779
780 inode = read_one_inode(root, key->objectid);
781 BUG_ON(!dir);
782
783 ref_ptr = btrfs_item_ptr_offset(eb, slot);
784 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
785
786again:
787 ref = (struct btrfs_inode_ref *)ref_ptr;
788
789 namelen = btrfs_inode_ref_name_len(eb, ref);
790 name = kmalloc(namelen, GFP_NOFS);
791 BUG_ON(!name);
792
793 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
794
795 /* if we already have a perfect match, we're done */
796 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
797 btrfs_inode_ref_index(eb, ref),
798 name, namelen)) {
799 goto out;
800 }
801
802 /*
803 * look for a conflicting back reference in the metadata.
804 * if we find one we have to unlink that name of the file
805 * before we add our new link. Later on, we overwrite any
806 * existing back reference, and we don't want to create
807 * dangling pointers in the directory.
808 */
809conflict_again:
810 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
811 if (ret == 0) {
812 char *victim_name;
813 int victim_name_len;
814 struct btrfs_inode_ref *victim_ref;
815 unsigned long ptr;
816 unsigned long ptr_end;
817 struct extent_buffer *leaf = path->nodes[0];
818
819 /* are we trying to overwrite a back ref for the root directory
820 * if so, just jump out, we're done
821 */
822 if (key->objectid == key->offset)
823 goto out_nowrite;
824
825 /* check all the names in this back reference to see
826 * if they are in the log. if so, we allow them to stay
827 * otherwise they must be unlinked as a conflict
828 */
829 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
830 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
831 while(ptr < ptr_end) {
832 victim_ref = (struct btrfs_inode_ref *)ptr;
833 victim_name_len = btrfs_inode_ref_name_len(leaf,
834 victim_ref);
835 victim_name = kmalloc(victim_name_len, GFP_NOFS);
836 BUG_ON(!victim_name);
837
838 read_extent_buffer(leaf, victim_name,
839 (unsigned long)(victim_ref + 1),
840 victim_name_len);
841
842 if (!backref_in_log(log, key, victim_name,
843 victim_name_len)) {
844 btrfs_inc_nlink(inode);
845 btrfs_release_path(root, path);
846 ret = btrfs_unlink_inode(trans, root, dir,
847 inode, victim_name,
848 victim_name_len);
849 kfree(victim_name);
850 btrfs_release_path(root, path);
851 goto conflict_again;
852 }
853 kfree(victim_name);
854 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
855 }
856 BUG_ON(ret);
857 }
858 btrfs_release_path(root, path);
859
860 /* look for a conflicting sequence number */
861 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
862 btrfs_inode_ref_index(eb, ref),
863 name, namelen, 0);
864 if (di && !IS_ERR(di)) {
865 ret = drop_one_dir_item(trans, root, path, dir, di);
866 BUG_ON(ret);
867 }
868 btrfs_release_path(root, path);
869
870
871 /* look for a conflicting name */
872 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
873 name, namelen, 0);
874 if (di && !IS_ERR(di)) {
875 ret = drop_one_dir_item(trans, root, path, dir, di);
876 BUG_ON(ret);
877 }
878 btrfs_release_path(root, path);
879
880 /* insert our name */
881 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
882 btrfs_inode_ref_index(eb, ref));
883 BUG_ON(ret);
884
885 btrfs_update_inode(trans, root, inode);
886
887out:
888 ref_ptr = (unsigned long)(ref + 1) + namelen;
889 kfree(name);
890 if (ref_ptr < ref_end)
891 goto again;
892
893 /* finally write the back reference in the inode */
894 ret = overwrite_item(trans, root, path, eb, slot, key);
895 BUG_ON(ret);
896
897out_nowrite:
898 btrfs_release_path(root, path);
899 iput(dir);
900 iput(inode);
901 return 0;
902}
903
904/*
905 * replay one csum item from the log tree into the subvolume 'root'
906 * eb, slot and key all refer to the log tree
907 * path is for temp use by this function and should be released on return
908 *
909 * This copies the checksums out of the log tree and inserts them into
910 * the subvolume. Any existing checksums for this range in the file
911 * are overwritten, and new items are added where required.
912 *
913 * We keep this simple by reusing the btrfs_ordered_sum code from
914 * the data=ordered mode. This basically means making a copy
915 * of all the checksums in ram, which we have to do anyway for kmap
916 * rules.
917 *
918 * The copy is then sent down to btrfs_csum_file_blocks, which
919 * does all the hard work of finding existing items in the file
920 * or adding new ones.
921 */
922static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
923 struct btrfs_root *root,
924 struct btrfs_path *path,
925 struct extent_buffer *eb, int slot,
926 struct btrfs_key *key)
927{
928 int ret;
929 u32 item_size = btrfs_item_size_nr(eb, slot);
930 u64 cur_offset;
931 unsigned long file_bytes;
932 struct btrfs_ordered_sum *sums;
933 struct btrfs_sector_sum *sector_sum;
934 struct inode *inode;
935 unsigned long ptr;
936
937 file_bytes = (item_size / BTRFS_CRC32_SIZE) * root->sectorsize;
938 inode = read_one_inode(root, key->objectid);
939 if (!inode) {
940 return -EIO;
941 }
942
943 sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
944 if (!sums) {
945 iput(inode);
946 return -ENOMEM;
947 }
948
949 INIT_LIST_HEAD(&sums->list);
950 sums->len = file_bytes;
951 sums->file_offset = key->offset;
952
953 /*
954 * copy all the sums into the ordered sum struct
955 */
956 sector_sum = sums->sums;
957 cur_offset = key->offset;
958 ptr = btrfs_item_ptr_offset(eb, slot);
959 while(item_size > 0) {
960 sector_sum->offset = cur_offset;
961 read_extent_buffer(eb, &sector_sum->sum, ptr, BTRFS_CRC32_SIZE);
962 sector_sum++;
963 item_size -= BTRFS_CRC32_SIZE;
964 ptr += BTRFS_CRC32_SIZE;
965 cur_offset += root->sectorsize;
966 }
967
968 /* let btrfs_csum_file_blocks add them into the file */
969 ret = btrfs_csum_file_blocks(trans, root, inode, sums);
970 BUG_ON(ret);
971 kfree(sums);
972 iput(inode);
973
974 return 0;
975}
976/*
977 * There are a few corners where the link count of the file can't
978 * be properly maintained during replay. So, instead of adding
979 * lots of complexity to the log code, we just scan the backrefs
980 * for any file that has been through replay.
981 *
982 * The scan will update the link count on the inode to reflect the
983 * number of back refs found. If it goes down to zero, the iput
984 * will free the inode.
985 */
986static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
987 struct btrfs_root *root,
988 struct inode *inode)
989{
990 struct btrfs_path *path;
991 int ret;
992 struct btrfs_key key;
993 u64 nlink = 0;
994 unsigned long ptr;
995 unsigned long ptr_end;
996 int name_len;
997
998 key.objectid = inode->i_ino;
999 key.type = BTRFS_INODE_REF_KEY;
1000 key.offset = (u64)-1;
1001
1002 path = btrfs_alloc_path();
1003
1004 while(1) {
1005 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1006 if (ret < 0)
1007 break;
1008 if (ret > 0) {
1009 if (path->slots[0] == 0)
1010 break;
1011 path->slots[0]--;
1012 }
1013 btrfs_item_key_to_cpu(path->nodes[0], &key,
1014 path->slots[0]);
1015 if (key.objectid != inode->i_ino ||
1016 key.type != BTRFS_INODE_REF_KEY)
1017 break;
1018 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1019 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1020 path->slots[0]);
1021 while(ptr < ptr_end) {
1022 struct btrfs_inode_ref *ref;
1023
1024 ref = (struct btrfs_inode_ref *)ptr;
1025 name_len = btrfs_inode_ref_name_len(path->nodes[0],
1026 ref);
1027 ptr = (unsigned long)(ref + 1) + name_len;
1028 nlink++;
1029 }
1030
1031 if (key.offset == 0)
1032 break;
1033 key.offset--;
1034 btrfs_release_path(root, path);
1035 }
1036 btrfs_free_path(path);
1037 if (nlink != inode->i_nlink) {
1038 inode->i_nlink = nlink;
1039 btrfs_update_inode(trans, root, inode);
1040 }
1041 BTRFS_I(inode)->index_cnt = (u64)-1;
1042
1043 return 0;
1044}
1045
1046static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1047 struct btrfs_root *root,
1048 struct btrfs_path *path)
1049{
1050 int ret;
1051 struct btrfs_key key;
1052 struct inode *inode;
1053
1054 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1055 key.type = BTRFS_ORPHAN_ITEM_KEY;
1056 key.offset = (u64)-1;
1057 while(1) {
1058 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1059 if (ret < 0)
1060 break;
1061
1062 if (ret == 1) {
1063 if (path->slots[0] == 0)
1064 break;
1065 path->slots[0]--;
1066 }
1067
1068 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1069 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1070 key.type != BTRFS_ORPHAN_ITEM_KEY)
1071 break;
1072
1073 ret = btrfs_del_item(trans, root, path);
1074 BUG_ON(ret);
1075
1076 btrfs_release_path(root, path);
1077 inode = read_one_inode(root, key.offset);
1078 BUG_ON(!inode);
1079
1080 ret = fixup_inode_link_count(trans, root, inode);
1081 BUG_ON(ret);
1082
1083 iput(inode);
1084
1085 if (key.offset == 0)
1086 break;
1087 key.offset--;
1088 }
1089 btrfs_release_path(root, path);
1090 return 0;
1091}
1092
1093
1094/*
1095 * record a given inode in the fixup dir so we can check its link
1096 * count when replay is done. The link count is incremented here
1097 * so the inode won't go away until we check it
1098 */
1099static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1100 struct btrfs_root *root,
1101 struct btrfs_path *path,
1102 u64 objectid)
1103{
1104 struct btrfs_key key;
1105 int ret = 0;
1106 struct inode *inode;
1107
1108 inode = read_one_inode(root, objectid);
1109 BUG_ON(!inode);
1110
1111 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1112 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1113 key.offset = objectid;
1114
1115 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1116
1117 btrfs_release_path(root, path);
1118 if (ret == 0) {
1119 btrfs_inc_nlink(inode);
1120 btrfs_update_inode(trans, root, inode);
1121 } else if (ret == -EEXIST) {
1122 ret = 0;
1123 } else {
1124 BUG();
1125 }
1126 iput(inode);
1127
1128 return ret;
1129}
1130
1131/*
1132 * when replaying the log for a directory, we only insert names
1133 * for inodes that actually exist. This means an fsync on a directory
1134 * does not implicitly fsync all the new files in it
1135 */
1136static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1137 struct btrfs_root *root,
1138 struct btrfs_path *path,
1139 u64 dirid, u64 index,
1140 char *name, int name_len, u8 type,
1141 struct btrfs_key *location)
1142{
1143 struct inode *inode;
1144 struct inode *dir;
1145 int ret;
1146
1147 inode = read_one_inode(root, location->objectid);
1148 if (!inode)
1149 return -ENOENT;
1150
1151 dir = read_one_inode(root, dirid);
1152 if (!dir) {
1153 iput(inode);
1154 return -EIO;
1155 }
1156 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1157
1158 /* FIXME, put inode into FIXUP list */
1159
1160 iput(inode);
1161 iput(dir);
1162 return ret;
1163}
1164
1165/*
1166 * take a single entry in a log directory item and replay it into
1167 * the subvolume.
1168 *
1169 * if a conflicting item exists in the subdirectory already,
1170 * the inode it points to is unlinked and put into the link count
1171 * fix up tree.
1172 *
1173 * If a name from the log points to a file or directory that does
1174 * not exist in the FS, it is skipped. fsyncs on directories
1175 * do not force down inodes inside that directory, just changes to the
1176 * names or unlinks in a directory.
1177 */
1178static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1179 struct btrfs_root *root,
1180 struct btrfs_path *path,
1181 struct extent_buffer *eb,
1182 struct btrfs_dir_item *di,
1183 struct btrfs_key *key)
1184{
1185 char *name;
1186 int name_len;
1187 struct btrfs_dir_item *dst_di;
1188 struct btrfs_key found_key;
1189 struct btrfs_key log_key;
1190 struct inode *dir;
1191 u8 log_type;
1192 int exists;
1193 int ret;
1194
1195 dir = read_one_inode(root, key->objectid);
1196 BUG_ON(!dir);
1197
1198 name_len = btrfs_dir_name_len(eb, di);
1199 name = kmalloc(name_len, GFP_NOFS);
1200 log_type = btrfs_dir_type(eb, di);
1201 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1202 name_len);
1203
1204 btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1205 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1206 if (exists == 0)
1207 exists = 1;
1208 else
1209 exists = 0;
1210 btrfs_release_path(root, path);
1211
1212 if (key->type == BTRFS_DIR_ITEM_KEY) {
1213 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1214 name, name_len, 1);
1215 }
1216 else if (key->type == BTRFS_DIR_INDEX_KEY) {
1217 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1218 key->objectid,
1219 key->offset, name,
1220 name_len, 1);
1221 } else {
1222 BUG();
1223 }
1224 if (!dst_di || IS_ERR(dst_di)) {
1225 /* we need a sequence number to insert, so we only
1226 * do inserts for the BTRFS_DIR_INDEX_KEY types
1227 */
1228 if (key->type != BTRFS_DIR_INDEX_KEY)
1229 goto out;
1230 goto insert;
1231 }
1232
1233 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1234 /* the existing item matches the logged item */
1235 if (found_key.objectid == log_key.objectid &&
1236 found_key.type == log_key.type &&
1237 found_key.offset == log_key.offset &&
1238 btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1239 goto out;
1240 }
1241
1242 /*
1243 * don't drop the conflicting directory entry if the inode
1244 * for the new entry doesn't exist
1245 */
1246 if (!exists)
1247 goto out;
1248
1249 ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1250 BUG_ON(ret);
1251
1252 if (key->type == BTRFS_DIR_INDEX_KEY)
1253 goto insert;
1254out:
1255 btrfs_release_path(root, path);
1256 kfree(name);
1257 iput(dir);
1258 return 0;
1259
1260insert:
1261 btrfs_release_path(root, path);
1262 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1263 name, name_len, log_type, &log_key);
1264
1265 if (ret && ret != -ENOENT)
1266 BUG();
1267 goto out;
1268}
1269
1270/*
1271 * find all the names in a directory item and reconcile them into
1272 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
1273 * one name in a directory item, but the same code gets used for
1274 * both directory index types
1275 */
1276static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1277 struct btrfs_root *root,
1278 struct btrfs_path *path,
1279 struct extent_buffer *eb, int slot,
1280 struct btrfs_key *key)
1281{
1282 int ret;
1283 u32 item_size = btrfs_item_size_nr(eb, slot);
1284 struct btrfs_dir_item *di;
1285 int name_len;
1286 unsigned long ptr;
1287 unsigned long ptr_end;
1288
1289 ptr = btrfs_item_ptr_offset(eb, slot);
1290 ptr_end = ptr + item_size;
1291 while(ptr < ptr_end) {
1292 di = (struct btrfs_dir_item *)ptr;
1293 name_len = btrfs_dir_name_len(eb, di);
1294 ret = replay_one_name(trans, root, path, eb, di, key);
1295 BUG_ON(ret);
1296 ptr = (unsigned long)(di + 1);
1297 ptr += name_len;
1298 }
1299 return 0;
1300}
1301
1302/*
1303 * directory replay has two parts. There are the standard directory
1304 * items in the log copied from the subvolume, and range items
1305 * created in the log while the subvolume was logged.
1306 *
1307 * The range items tell us which parts of the key space the log
1308 * is authoritative for. During replay, if a key in the subvolume
1309 * directory is in a logged range item, but not actually in the log
1310 * that means it was deleted from the directory before the fsync
1311 * and should be removed.
1312 */
1313static noinline int find_dir_range(struct btrfs_root *root,
1314 struct btrfs_path *path,
1315 u64 dirid, int key_type,
1316 u64 *start_ret, u64 *end_ret)
1317{
1318 struct btrfs_key key;
1319 u64 found_end;
1320 struct btrfs_dir_log_item *item;
1321 int ret;
1322 int nritems;
1323
1324 if (*start_ret == (u64)-1)
1325 return 1;
1326
1327 key.objectid = dirid;
1328 key.type = key_type;
1329 key.offset = *start_ret;
1330
1331 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1332 if (ret < 0)
1333 goto out;
1334 if (ret > 0) {
1335 if (path->slots[0] == 0)
1336 goto out;
1337 path->slots[0]--;
1338 }
1339 if (ret != 0)
1340 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1341
1342 if (key.type != key_type || key.objectid != dirid) {
1343 ret = 1;
1344 goto next;
1345 }
1346 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1347 struct btrfs_dir_log_item);
1348 found_end = btrfs_dir_log_end(path->nodes[0], item);
1349
1350 if (*start_ret >= key.offset && *start_ret <= found_end) {
1351 ret = 0;
1352 *start_ret = key.offset;
1353 *end_ret = found_end;
1354 goto out;
1355 }
1356 ret = 1;
1357next:
1358 /* check the next slot in the tree to see if it is a valid item */
1359 nritems = btrfs_header_nritems(path->nodes[0]);
1360 if (path->slots[0] >= nritems) {
1361 ret = btrfs_next_leaf(root, path);
1362 if (ret)
1363 goto out;
1364 } else {
1365 path->slots[0]++;
1366 }
1367
1368 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1369
1370 if (key.type != key_type || key.objectid != dirid) {
1371 ret = 1;
1372 goto out;
1373 }
1374 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1375 struct btrfs_dir_log_item);
1376 found_end = btrfs_dir_log_end(path->nodes[0], item);
1377 *start_ret = key.offset;
1378 *end_ret = found_end;
1379 ret = 0;
1380out:
1381 btrfs_release_path(root, path);
1382 return ret;
1383}
1384
1385/*
1386 * this looks for a given directory item in the log. If the directory
1387 * item is not in the log, the item is removed and the inode it points
1388 * to is unlinked
1389 */
1390static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1391 struct btrfs_root *root,
1392 struct btrfs_root *log,
1393 struct btrfs_path *path,
1394 struct btrfs_path *log_path,
1395 struct inode *dir,
1396 struct btrfs_key *dir_key)
1397{
1398 int ret;
1399 struct extent_buffer *eb;
1400 int slot;
1401 u32 item_size;
1402 struct btrfs_dir_item *di;
1403 struct btrfs_dir_item *log_di;
1404 int name_len;
1405 unsigned long ptr;
1406 unsigned long ptr_end;
1407 char *name;
1408 struct inode *inode;
1409 struct btrfs_key location;
1410
1411again:
1412 eb = path->nodes[0];
1413 slot = path->slots[0];
1414 item_size = btrfs_item_size_nr(eb, slot);
1415 ptr = btrfs_item_ptr_offset(eb, slot);
1416 ptr_end = ptr + item_size;
1417 while(ptr < ptr_end) {
1418 di = (struct btrfs_dir_item *)ptr;
1419 name_len = btrfs_dir_name_len(eb, di);
1420 name = kmalloc(name_len, GFP_NOFS);
1421 if (!name) {
1422 ret = -ENOMEM;
1423 goto out;
1424 }
1425 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1426 name_len);
1427 log_di = NULL;
1428 if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
1429 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1430 dir_key->objectid,
1431 name, name_len, 0);
1432 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
1433 log_di = btrfs_lookup_dir_index_item(trans, log,
1434 log_path,
1435 dir_key->objectid,
1436 dir_key->offset,
1437 name, name_len, 0);
1438 }
1439 if (!log_di || IS_ERR(log_di)) {
1440 btrfs_dir_item_key_to_cpu(eb, di, &location);
1441 btrfs_release_path(root, path);
1442 btrfs_release_path(log, log_path);
1443 inode = read_one_inode(root, location.objectid);
1444 BUG_ON(!inode);
1445
1446 ret = link_to_fixup_dir(trans, root,
1447 path, location.objectid);
1448 BUG_ON(ret);
1449 btrfs_inc_nlink(inode);
1450 ret = btrfs_unlink_inode(trans, root, dir, inode,
1451 name, name_len);
1452 BUG_ON(ret);
1453 kfree(name);
1454 iput(inode);
1455
1456 /* there might still be more names under this key
1457 * check and repeat if required
1458 */
1459 ret = btrfs_search_slot(NULL, root, dir_key, path,
1460 0, 0);
1461 if (ret == 0)
1462 goto again;
1463 ret = 0;
1464 goto out;
1465 }
1466 btrfs_release_path(log, log_path);
1467 kfree(name);
1468
1469 ptr = (unsigned long)(di + 1);
1470 ptr += name_len;
1471 }
1472 ret = 0;
1473out:
1474 btrfs_release_path(root, path);
1475 btrfs_release_path(log, log_path);
1476 return ret;
1477}
1478
1479/*
1480 * deletion replay happens before we copy any new directory items
1481 * out of the log or out of backreferences from inodes. It
1482 * scans the log to find ranges of keys that log is authoritative for,
1483 * and then scans the directory to find items in those ranges that are
1484 * not present in the log.
1485 *
1486 * Anything we don't find in the log is unlinked and removed from the
1487 * directory.
1488 */
1489static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1490 struct btrfs_root *root,
1491 struct btrfs_root *log,
1492 struct btrfs_path *path,
1493 u64 dirid)
1494{
1495 u64 range_start;
1496 u64 range_end;
1497 int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1498 int ret = 0;
1499 struct btrfs_key dir_key;
1500 struct btrfs_key found_key;
1501 struct btrfs_path *log_path;
1502 struct inode *dir;
1503
1504 dir_key.objectid = dirid;
1505 dir_key.type = BTRFS_DIR_ITEM_KEY;
1506 log_path = btrfs_alloc_path();
1507 if (!log_path)
1508 return -ENOMEM;
1509
1510 dir = read_one_inode(root, dirid);
1511 /* it isn't an error if the inode isn't there, that can happen
1512 * because we replay the deletes before we copy in the inode item
1513 * from the log
1514 */
1515 if (!dir) {
1516 btrfs_free_path(log_path);
1517 return 0;
1518 }
1519again:
1520 range_start = 0;
1521 range_end = 0;
1522 while(1) {
1523 ret = find_dir_range(log, path, dirid, key_type,
1524 &range_start, &range_end);
1525 if (ret != 0)
1526 break;
1527
1528 dir_key.offset = range_start;
1529 while(1) {
1530 int nritems;
1531 ret = btrfs_search_slot(NULL, root, &dir_key, path,
1532 0, 0);
1533 if (ret < 0)
1534 goto out;
1535
1536 nritems = btrfs_header_nritems(path->nodes[0]);
1537 if (path->slots[0] >= nritems) {
1538 ret = btrfs_next_leaf(root, path);
1539 if (ret)
1540 break;
1541 }
1542 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1543 path->slots[0]);
1544 if (found_key.objectid != dirid ||
1545 found_key.type != dir_key.type)
1546 goto next_type;
1547
1548 if (found_key.offset > range_end)
1549 break;
1550
1551 ret = check_item_in_log(trans, root, log, path,
1552 log_path, dir, &found_key);
1553 BUG_ON(ret);
1554 if (found_key.offset == (u64)-1)
1555 break;
1556 dir_key.offset = found_key.offset + 1;
1557 }
1558 btrfs_release_path(root, path);
1559 if (range_end == (u64)-1)
1560 break;
1561 range_start = range_end + 1;
1562 }
1563
1564next_type:
1565 ret = 0;
1566 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1567 key_type = BTRFS_DIR_LOG_INDEX_KEY;
1568 dir_key.type = BTRFS_DIR_INDEX_KEY;
1569 btrfs_release_path(root, path);
1570 goto again;
1571 }
1572out:
1573 btrfs_release_path(root, path);
1574 btrfs_free_path(log_path);
1575 iput(dir);
1576 return ret;
1577}
1578
1579/*
1580 * the process_func used to replay items from the log tree. This
1581 * gets called in two different stages. The first stage just looks
1582 * for inodes and makes sure they are all copied into the subvolume.
1583 *
1584 * The second stage copies all the other item types from the log into
1585 * the subvolume. The two stage approach is slower, but gets rid of
1586 * lots of complexity around inodes referencing other inodes that exist
1587 * only in the log (references come from either directory items or inode
1588 * back refs).
1589 */
1590static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1591 struct walk_control *wc, u64 gen)
1592{
1593 int nritems;
1594 struct btrfs_path *path;
1595 struct btrfs_root *root = wc->replay_dest;
1596 struct btrfs_key key;
1597 u32 item_size;
1598 int level;
1599 int i;
1600 int ret;
1601
1602 btrfs_read_buffer(eb, gen);
1603
1604 level = btrfs_header_level(eb);
1605
1606 if (level != 0)
1607 return 0;
1608
1609 path = btrfs_alloc_path();
1610 BUG_ON(!path);
1611
1612 nritems = btrfs_header_nritems(eb);
1613 for (i = 0; i < nritems; i++) {
1614 btrfs_item_key_to_cpu(eb, &key, i);
1615 item_size = btrfs_item_size_nr(eb, i);
1616
1617 /* inode keys are done during the first stage */
1618 if (key.type == BTRFS_INODE_ITEM_KEY &&
1619 wc->stage == LOG_WALK_REPLAY_INODES) {
1620 struct inode *inode;
1621 struct btrfs_inode_item *inode_item;
1622 u32 mode;
1623
1624 inode_item = btrfs_item_ptr(eb, i,
1625 struct btrfs_inode_item);
1626 mode = btrfs_inode_mode(eb, inode_item);
1627 if (S_ISDIR(mode)) {
1628 ret = replay_dir_deletes(wc->trans,
1629 root, log, path, key.objectid);
1630 BUG_ON(ret);
1631 }
1632 ret = overwrite_item(wc->trans, root, path,
1633 eb, i, &key);
1634 BUG_ON(ret);
1635
1636 /* for regular files, truncate away
1637 * extents past the new EOF
1638 */
1639 if (S_ISREG(mode)) {
1640 inode = read_one_inode(root,
1641 key.objectid);
1642 BUG_ON(!inode);
1643
1644 ret = btrfs_truncate_inode_items(wc->trans,
1645 root, inode, inode->i_size,
1646 BTRFS_EXTENT_DATA_KEY);
1647 BUG_ON(ret);
1648 iput(inode);
1649 }
1650 ret = link_to_fixup_dir(wc->trans, root,
1651 path, key.objectid);
1652 BUG_ON(ret);
1653 }
1654 if (wc->stage < LOG_WALK_REPLAY_ALL)
1655 continue;
1656
1657 /* these keys are simply copied */
1658 if (key.type == BTRFS_XATTR_ITEM_KEY) {
1659 ret = overwrite_item(wc->trans, root, path,
1660 eb, i, &key);
1661 BUG_ON(ret);
1662 } else if (key.type == BTRFS_INODE_REF_KEY) {
1663 ret = add_inode_ref(wc->trans, root, log, path,
1664 eb, i, &key);
1665 BUG_ON(ret && ret != -ENOENT);
1666 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1667 ret = replay_one_extent(wc->trans, root, path,
1668 eb, i, &key);
1669 BUG_ON(ret);
1670 } else if (key.type == BTRFS_CSUM_ITEM_KEY) {
1671 ret = replay_one_csum(wc->trans, root, path,
1672 eb, i, &key);
1673 BUG_ON(ret);
1674 } else if (key.type == BTRFS_DIR_ITEM_KEY ||
1675 key.type == BTRFS_DIR_INDEX_KEY) {
1676 ret = replay_one_dir_item(wc->trans, root, path,
1677 eb, i, &key);
1678 BUG_ON(ret);
1679 }
1680 }
1681 btrfs_free_path(path);
1682 return 0;
1683}
1684
1685static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
1686 struct btrfs_root *root,
1687 struct btrfs_path *path, int *level,
1688 struct walk_control *wc)
1689{
1690 u64 root_owner;
1691 u64 root_gen;
1692 u64 bytenr;
1693 u64 ptr_gen;
1694 struct extent_buffer *next;
1695 struct extent_buffer *cur;
1696 struct extent_buffer *parent;
1697 u32 blocksize;
1698 int ret = 0;
1699
1700 WARN_ON(*level < 0);
1701 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1702
1703 while(*level > 0) {
1704 WARN_ON(*level < 0);
1705 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1706 cur = path->nodes[*level];
1707
1708 if (btrfs_header_level(cur) != *level)
1709 WARN_ON(1);
1710
1711 if (path->slots[*level] >=
1712 btrfs_header_nritems(cur))
1713 break;
1714
1715 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1716 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
1717 blocksize = btrfs_level_size(root, *level - 1);
1718
1719 parent = path->nodes[*level];
1720 root_owner = btrfs_header_owner(parent);
1721 root_gen = btrfs_header_generation(parent);
1722
1723 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1724
1725 wc->process_func(root, next, wc, ptr_gen);
1726
1727 if (*level == 1) {
1728 path->slots[*level]++;
1729 if (wc->free) {
1730 btrfs_read_buffer(next, ptr_gen);
1731
1732 btrfs_tree_lock(next);
1733 clean_tree_block(trans, root, next);
1734 btrfs_wait_tree_block_writeback(next);
1735 btrfs_tree_unlock(next);
1736
1737 ret = btrfs_drop_leaf_ref(trans, root, next);
1738 BUG_ON(ret);
1739
1740 WARN_ON(root_owner !=
1741 BTRFS_TREE_LOG_OBJECTID);
1742 ret = btrfs_free_reserved_extent(root,
1743 bytenr, blocksize);
1744 BUG_ON(ret);
1745 }
1746 free_extent_buffer(next);
1747 continue;
1748 }
1749 btrfs_read_buffer(next, ptr_gen);
1750
1751 WARN_ON(*level <= 0);
1752 if (path->nodes[*level-1])
1753 free_extent_buffer(path->nodes[*level-1]);
1754 path->nodes[*level-1] = next;
1755 *level = btrfs_header_level(next);
1756 path->slots[*level] = 0;
1757 cond_resched();
1758 }
1759 WARN_ON(*level < 0);
1760 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1761
1762 if (path->nodes[*level] == root->node) {
1763 parent = path->nodes[*level];
1764 } else {
1765 parent = path->nodes[*level + 1];
1766 }
1767 bytenr = path->nodes[*level]->start;
1768
1769 blocksize = btrfs_level_size(root, *level);
1770 root_owner = btrfs_header_owner(parent);
1771 root_gen = btrfs_header_generation(parent);
1772
1773 wc->process_func(root, path->nodes[*level], wc,
1774 btrfs_header_generation(path->nodes[*level]));
1775
1776 if (wc->free) {
1777 next = path->nodes[*level];
1778 btrfs_tree_lock(next);
1779 clean_tree_block(trans, root, next);
1780 btrfs_wait_tree_block_writeback(next);
1781 btrfs_tree_unlock(next);
1782
1783 if (*level == 0) {
1784 ret = btrfs_drop_leaf_ref(trans, root, next);
1785 BUG_ON(ret);
1786 }
1787 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1788 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1789 BUG_ON(ret);
1790 }
1791 free_extent_buffer(path->nodes[*level]);
1792 path->nodes[*level] = NULL;
1793 *level += 1;
1794
1795 cond_resched();
1796 return 0;
1797}
1798
1799static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
1800 struct btrfs_root *root,
1801 struct btrfs_path *path, int *level,
1802 struct walk_control *wc)
1803{
1804 u64 root_owner;
1805 u64 root_gen;
1806 int i;
1807 int slot;
1808 int ret;
1809
1810 for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1811 slot = path->slots[i];
1812 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
1813 struct extent_buffer *node;
1814 node = path->nodes[i];
1815 path->slots[i]++;
1816 *level = i;
1817 WARN_ON(*level == 0);
1818 return 0;
1819 } else {
1820 struct extent_buffer *parent;
1821 if (path->nodes[*level] == root->node)
1822 parent = path->nodes[*level];
1823 else
1824 parent = path->nodes[*level + 1];
1825
1826 root_owner = btrfs_header_owner(parent);
1827 root_gen = btrfs_header_generation(parent);
1828 wc->process_func(root, path->nodes[*level], wc,
1829 btrfs_header_generation(path->nodes[*level]));
1830 if (wc->free) {
1831 struct extent_buffer *next;
1832
1833 next = path->nodes[*level];
1834
1835 btrfs_tree_lock(next);
1836 clean_tree_block(trans, root, next);
1837 btrfs_wait_tree_block_writeback(next);
1838 btrfs_tree_unlock(next);
1839
1840 if (*level == 0) {
1841 ret = btrfs_drop_leaf_ref(trans, root,
1842 next);
1843 BUG_ON(ret);
1844 }
1845
1846 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1847 ret = btrfs_free_reserved_extent(root,
1848 path->nodes[*level]->start,
1849 path->nodes[*level]->len);
1850 BUG_ON(ret);
1851 }
1852 free_extent_buffer(path->nodes[*level]);
1853 path->nodes[*level] = NULL;
1854 *level = i + 1;
1855 }
1856 }
1857 return 1;
1858}
1859
1860/*
1861 * drop the reference count on the tree rooted at 'snap'. This traverses
1862 * the tree freeing any blocks that have a ref count of zero after being
1863 * decremented.
1864 */
1865static int walk_log_tree(struct btrfs_trans_handle *trans,
1866 struct btrfs_root *log, struct walk_control *wc)
1867{
1868 int ret = 0;
1869 int wret;
1870 int level;
1871 struct btrfs_path *path;
1872 int i;
1873 int orig_level;
1874
1875 path = btrfs_alloc_path();
1876 BUG_ON(!path);
1877
1878 level = btrfs_header_level(log->node);
1879 orig_level = level;
1880 path->nodes[level] = log->node;
1881 extent_buffer_get(log->node);
1882 path->slots[level] = 0;
1883
1884 while(1) {
1885 wret = walk_down_log_tree(trans, log, path, &level, wc);
1886 if (wret > 0)
1887 break;
1888 if (wret < 0)
1889 ret = wret;
1890
1891 wret = walk_up_log_tree(trans, log, path, &level, wc);
1892 if (wret > 0)
1893 break;
1894 if (wret < 0)
1895 ret = wret;
1896 }
1897
1898 /* was the root node processed? if not, catch it here */
1899 if (path->nodes[orig_level]) {
1900 wc->process_func(log, path->nodes[orig_level], wc,
1901 btrfs_header_generation(path->nodes[orig_level]));
1902 if (wc->free) {
1903 struct extent_buffer *next;
1904
1905 next = path->nodes[orig_level];
1906
1907 btrfs_tree_lock(next);
1908 clean_tree_block(trans, log, next);
1909 btrfs_wait_tree_block_writeback(next);
1910 btrfs_tree_unlock(next);
1911
1912 if (orig_level == 0) {
1913 ret = btrfs_drop_leaf_ref(trans, log,
1914 next);
1915 BUG_ON(ret);
1916 }
1917 WARN_ON(log->root_key.objectid !=
1918 BTRFS_TREE_LOG_OBJECTID);
1919 ret = btrfs_free_reserved_extent(log, next->start,
1920 next->len);
1921 BUG_ON(ret);
1922 }
1923 }
1924
1925 for (i = 0; i <= orig_level; i++) {
1926 if (path->nodes[i]) {
1927 free_extent_buffer(path->nodes[i]);
1928 path->nodes[i] = NULL;
1929 }
1930 }
1931 btrfs_free_path(path);
1932 if (wc->free)
1933 free_extent_buffer(log->node);
1934 return ret;
1935}
1936
1937int wait_log_commit(struct btrfs_root *log)
1938{
1939 DEFINE_WAIT(wait);
1940 u64 transid = log->fs_info->tree_log_transid;
1941
1942 do {
1943 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1944 TASK_UNINTERRUPTIBLE);
1945 mutex_unlock(&log->fs_info->tree_log_mutex);
1946 if (atomic_read(&log->fs_info->tree_log_commit))
1947 schedule();
1948 finish_wait(&log->fs_info->tree_log_wait, &wait);
1949 mutex_lock(&log->fs_info->tree_log_mutex);
1950 } while(transid == log->fs_info->tree_log_transid &&
1951 atomic_read(&log->fs_info->tree_log_commit));
1952 return 0;
1953}
1954
1955/*
1956 * btrfs_sync_log does sends a given tree log down to the disk and
1957 * updates the super blocks to record it. When this call is done,
1958 * you know that any inodes previously logged are safely on disk
1959 */
1960int btrfs_sync_log(struct btrfs_trans_handle *trans,
1961 struct btrfs_root *root)
1962{
1963 int ret;
1964 unsigned long batch;
1965 struct btrfs_root *log = root->log_root;
1966
1967 mutex_lock(&log->fs_info->tree_log_mutex);
1968 if (atomic_read(&log->fs_info->tree_log_commit)) {
1969 wait_log_commit(log);
1970 goto out;
1971 }
1972 atomic_set(&log->fs_info->tree_log_commit, 1);
1973
1974 while(1) {
1975 batch = log->fs_info->tree_log_batch;
1976 mutex_unlock(&log->fs_info->tree_log_mutex);
1977 schedule_timeout_uninterruptible(1);
1978 mutex_lock(&log->fs_info->tree_log_mutex);
1979
1980 while(atomic_read(&log->fs_info->tree_log_writers)) {
1981 DEFINE_WAIT(wait);
1982 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1983 TASK_UNINTERRUPTIBLE);
1984 mutex_unlock(&log->fs_info->tree_log_mutex);
1985 if (atomic_read(&log->fs_info->tree_log_writers))
1986 schedule();
1987 mutex_lock(&log->fs_info->tree_log_mutex);
1988 finish_wait(&log->fs_info->tree_log_wait, &wait);
1989 }
1990 if (batch == log->fs_info->tree_log_batch)
1991 break;
1992 }
1993
1994 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1995 BUG_ON(ret);
1996 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
1997 &root->fs_info->log_root_tree->dirty_log_pages);
1998 BUG_ON(ret);
1999
2000 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
2001 log->fs_info->log_root_tree->node->start);
2002 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
2003 btrfs_header_level(log->fs_info->log_root_tree->node));
2004
2005 write_ctree_super(trans, log->fs_info->tree_root);
2006 log->fs_info->tree_log_transid++;
2007 log->fs_info->tree_log_batch = 0;
2008 atomic_set(&log->fs_info->tree_log_commit, 0);
2009 smp_mb();
2010 if (waitqueue_active(&log->fs_info->tree_log_wait))
2011 wake_up(&log->fs_info->tree_log_wait);
2012out:
2013 mutex_unlock(&log->fs_info->tree_log_mutex);
2014 return 0;
2015
2016}
2017
2018/* * free all the extents used by the tree log. This should be called
2019 * at commit time of the full transaction
2020 */
2021int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2022{
2023 int ret;
2024 struct btrfs_root *log;
2025 struct key;
2026 u64 start;
2027 u64 end;
2028 struct walk_control wc = {
2029 .free = 1,
2030 .process_func = process_one_buffer
2031 };
2032
2033 if (!root->log_root)
2034 return 0;
2035
2036 log = root->log_root;
2037 ret = walk_log_tree(trans, log, &wc);
2038 BUG_ON(ret);
2039
2040 while(1) {
2041 ret = find_first_extent_bit(&log->dirty_log_pages,
2042 0, &start, &end, EXTENT_DIRTY);
2043 if (ret)
2044 break;
2045
2046 clear_extent_dirty(&log->dirty_log_pages,
2047 start, end, GFP_NOFS);
2048 }
2049
2050 log = root->log_root;
2051 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2052 &log->root_key);
2053 BUG_ON(ret);
2054 root->log_root = NULL;
2055 kfree(root->log_root);
2056 return 0;
2057}
2058
2059/*
2060 * helper function to update the item for a given subvolumes log root
2061 * in the tree of log roots
2062 */
2063static int update_log_root(struct btrfs_trans_handle *trans,
2064 struct btrfs_root *log)
2065{
2066 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2067 int ret;
2068
2069 if (log->node->start == bytenr)
2070 return 0;
2071
2072 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2073 btrfs_set_root_generation(&log->root_item, trans->transid);
2074 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2075 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2076 &log->root_key, &log->root_item);
2077 BUG_ON(ret);
2078 return ret;
2079}
2080
2081/*
2082 * If both a file and directory are logged, and unlinks or renames are
2083 * mixed in, we have a few interesting corners:
2084 *
2085 * create file X in dir Y
2086 * link file X to X.link in dir Y
2087 * fsync file X
2088 * unlink file X but leave X.link
2089 * fsync dir Y
2090 *
2091 * After a crash we would expect only X.link to exist. But file X
2092 * didn't get fsync'd again so the log has back refs for X and X.link.
2093 *
2094 * We solve this by removing directory entries and inode backrefs from the
2095 * log when a file that was logged in the current transaction is
2096 * unlinked. Any later fsync will include the updated log entries, and
2097 * we'll be able to reconstruct the proper directory items from backrefs.
2098 *
2099 * This optimizations allows us to avoid relogging the entire inode
2100 * or the entire directory.
2101 */
2102int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2103 struct btrfs_root *root,
2104 const char *name, int name_len,
2105 struct inode *dir, u64 index)
2106{
2107 struct btrfs_root *log;
2108 struct btrfs_dir_item *di;
2109 struct btrfs_path *path;
2110 int ret;
2111 int bytes_del = 0;
2112
2113 if (BTRFS_I(dir)->logged_trans < trans->transid)
2114 return 0;
2115
2116 ret = join_running_log_trans(root);
2117 if (ret)
2118 return 0;
2119
2120 mutex_lock(&BTRFS_I(dir)->log_mutex);
2121
2122 log = root->log_root;
2123 path = btrfs_alloc_path();
2124 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2125 name, name_len, -1);
2126 if (di && !IS_ERR(di)) {
2127 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2128 bytes_del += name_len;
2129 BUG_ON(ret);
2130 }
2131 btrfs_release_path(log, path);
2132 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2133 index, name, name_len, -1);
2134 if (di && !IS_ERR(di)) {
2135 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2136 bytes_del += name_len;
2137 BUG_ON(ret);
2138 }
2139
2140 /* update the directory size in the log to reflect the names
2141 * we have removed
2142 */
2143 if (bytes_del) {
2144 struct btrfs_key key;
2145
2146 key.objectid = dir->i_ino;
2147 key.offset = 0;
2148 key.type = BTRFS_INODE_ITEM_KEY;
2149 btrfs_release_path(log, path);
2150
2151 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2152 if (ret == 0) {
2153 struct btrfs_inode_item *item;
2154 u64 i_size;
2155
2156 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2157 struct btrfs_inode_item);
2158 i_size = btrfs_inode_size(path->nodes[0], item);
2159 if (i_size > bytes_del)
2160 i_size -= bytes_del;
2161 else
2162 i_size = 0;
2163 btrfs_set_inode_size(path->nodes[0], item, i_size);
2164 btrfs_mark_buffer_dirty(path->nodes[0]);
2165 } else
2166 ret = 0;
2167 btrfs_release_path(log, path);
2168 }
2169
2170 btrfs_free_path(path);
2171 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2172 end_log_trans(root);
2173
2174 return 0;
2175}
2176
2177/* see comments for btrfs_del_dir_entries_in_log */
2178int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2179 struct btrfs_root *root,
2180 const char *name, int name_len,
2181 struct inode *inode, u64 dirid)
2182{
2183 struct btrfs_root *log;
2184 u64 index;
2185 int ret;
2186
2187 if (BTRFS_I(inode)->logged_trans < trans->transid)
2188 return 0;
2189
2190 ret = join_running_log_trans(root);
2191 if (ret)
2192 return 0;
2193 log = root->log_root;
2194 mutex_lock(&BTRFS_I(inode)->log_mutex);
2195
2196 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2197 dirid, &index);
2198 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2199 end_log_trans(root);
2200
2201 return ret;
2202}
2203
2204/*
2205 * creates a range item in the log for 'dirid'. first_offset and
2206 * last_offset tell us which parts of the key space the log should
2207 * be considered authoritative for.
2208 */
2209static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2210 struct btrfs_root *log,
2211 struct btrfs_path *path,
2212 int key_type, u64 dirid,
2213 u64 first_offset, u64 last_offset)
2214{
2215 int ret;
2216 struct btrfs_key key;
2217 struct btrfs_dir_log_item *item;
2218
2219 key.objectid = dirid;
2220 key.offset = first_offset;
2221 if (key_type == BTRFS_DIR_ITEM_KEY)
2222 key.type = BTRFS_DIR_LOG_ITEM_KEY;
2223 else
2224 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2225 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2226 BUG_ON(ret);
2227
2228 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2229 struct btrfs_dir_log_item);
2230 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2231 btrfs_mark_buffer_dirty(path->nodes[0]);
2232 btrfs_release_path(log, path);
2233 return 0;
2234}
2235
2236/*
2237 * log all the items included in the current transaction for a given
2238 * directory. This also creates the range items in the log tree required
2239 * to replay anything deleted before the fsync
2240 */
2241static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2242 struct btrfs_root *root, struct inode *inode,
2243 struct btrfs_path *path,
2244 struct btrfs_path *dst_path, int key_type,
2245 u64 min_offset, u64 *last_offset_ret)
2246{
2247 struct btrfs_key min_key;
2248 struct btrfs_key max_key;
2249 struct btrfs_root *log = root->log_root;
2250 struct extent_buffer *src;
2251 int ret;
2252 int i;
2253 int nritems;
2254 u64 first_offset = min_offset;
2255 u64 last_offset = (u64)-1;
2256
2257 log = root->log_root;
2258 max_key.objectid = inode->i_ino;
2259 max_key.offset = (u64)-1;
2260 max_key.type = key_type;
2261
2262 min_key.objectid = inode->i_ino;
2263 min_key.type = key_type;
2264 min_key.offset = min_offset;
2265
2266 path->keep_locks = 1;
2267
2268 ret = btrfs_search_forward(root, &min_key, &max_key,
2269 path, 0, trans->transid);
2270
2271 /*
2272 * we didn't find anything from this transaction, see if there
2273 * is anything at all
2274 */
2275 if (ret != 0 || min_key.objectid != inode->i_ino ||
2276 min_key.type != key_type) {
2277 min_key.objectid = inode->i_ino;
2278 min_key.type = key_type;
2279 min_key.offset = (u64)-1;
2280 btrfs_release_path(root, path);
2281 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2282 if (ret < 0) {
2283 btrfs_release_path(root, path);
2284 return ret;
2285 }
2286 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2287
2288 /* if ret == 0 there are items for this type,
2289 * create a range to tell us the last key of this type.
2290 * otherwise, there are no items in this directory after
2291 * *min_offset, and we create a range to indicate that.
2292 */
2293 if (ret == 0) {
2294 struct btrfs_key tmp;
2295 btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2296 path->slots[0]);
2297 if (key_type == tmp.type) {
2298 first_offset = max(min_offset, tmp.offset) + 1;
2299 }
2300 }
2301 goto done;
2302 }
2303
2304 /* go backward to find any previous key */
2305 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2306 if (ret == 0) {
2307 struct btrfs_key tmp;
2308 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2309 if (key_type == tmp.type) {
2310 first_offset = tmp.offset;
2311 ret = overwrite_item(trans, log, dst_path,
2312 path->nodes[0], path->slots[0],
2313 &tmp);
2314 }
2315 }
2316 btrfs_release_path(root, path);
2317
2318 /* find the first key from this transaction again */
2319 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2320 if (ret != 0) {
2321 WARN_ON(1);
2322 goto done;
2323 }
2324
2325 /*
2326 * we have a block from this transaction, log every item in it
2327 * from our directory
2328 */
2329 while(1) {
2330 struct btrfs_key tmp;
2331 src = path->nodes[0];
2332 nritems = btrfs_header_nritems(src);
2333 for (i = path->slots[0]; i < nritems; i++) {
2334 btrfs_item_key_to_cpu(src, &min_key, i);
2335
2336 if (min_key.objectid != inode->i_ino ||
2337 min_key.type != key_type)
2338 goto done;
2339 ret = overwrite_item(trans, log, dst_path, src, i,
2340 &min_key);
2341 BUG_ON(ret);
2342 }
2343 path->slots[0] = nritems;
2344
2345 /*
2346 * look ahead to the next item and see if it is also
2347 * from this directory and from this transaction
2348 */
2349 ret = btrfs_next_leaf(root, path);
2350 if (ret == 1) {
2351 last_offset = (u64)-1;
2352 goto done;
2353 }
2354 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2355 if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
2356 last_offset = (u64)-1;
2357 goto done;
2358 }
2359 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2360 ret = overwrite_item(trans, log, dst_path,
2361 path->nodes[0], path->slots[0],
2362 &tmp);
2363
2364 BUG_ON(ret);
2365 last_offset = tmp.offset;
2366 goto done;
2367 }
2368 }
2369done:
2370 *last_offset_ret = last_offset;
2371 btrfs_release_path(root, path);
2372 btrfs_release_path(log, dst_path);
2373
2374 /* insert the log range keys to indicate where the log is valid */
2375 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
2376 first_offset, last_offset);
2377 BUG_ON(ret);
2378 return 0;
2379}
2380
2381/*
2382 * logging directories is very similar to logging inodes, We find all the items
2383 * from the current transaction and write them to the log.
2384 *
2385 * The recovery code scans the directory in the subvolume, and if it finds a
2386 * key in the range logged that is not present in the log tree, then it means
2387 * that dir entry was unlinked during the transaction.
2388 *
2389 * In order for that scan to work, we must include one key smaller than
2390 * the smallest logged by this transaction and one key larger than the largest
2391 * key logged by this transaction.
2392 */
2393static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2394 struct btrfs_root *root, struct inode *inode,
2395 struct btrfs_path *path,
2396 struct btrfs_path *dst_path)
2397{
2398 u64 min_key;
2399 u64 max_key;
2400 int ret;
2401 int key_type = BTRFS_DIR_ITEM_KEY;
2402
2403again:
2404 min_key = 0;
2405 max_key = 0;
2406 while(1) {
2407 ret = log_dir_items(trans, root, inode, path,
2408 dst_path, key_type, min_key,
2409 &max_key);
2410 BUG_ON(ret);
2411 if (max_key == (u64)-1)
2412 break;
2413 min_key = max_key + 1;
2414 }
2415
2416 if (key_type == BTRFS_DIR_ITEM_KEY) {
2417 key_type = BTRFS_DIR_INDEX_KEY;
2418 goto again;
2419 }
2420 return 0;
2421}
2422
2423/*
2424 * a helper function to drop items from the log before we relog an
2425 * inode. max_key_type indicates the highest item type to remove.
2426 * This cannot be run for file data extents because it does not
2427 * free the extents they point to.
2428 */
2429static int drop_objectid_items(struct btrfs_trans_handle *trans,
2430 struct btrfs_root *log,
2431 struct btrfs_path *path,
2432 u64 objectid, int max_key_type)
2433{
2434 int ret;
2435 struct btrfs_key key;
2436 struct btrfs_key found_key;
2437
2438 key.objectid = objectid;
2439 key.type = max_key_type;
2440 key.offset = (u64)-1;
2441
2442 while(1) {
2443 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2444
2445 if (ret != 1)
2446 break;
2447
2448 if (path->slots[0] == 0)
2449 break;
2450
2451 path->slots[0]--;
2452 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2453 path->slots[0]);
2454
2455 if (found_key.objectid != objectid)
2456 break;
2457
2458 ret = btrfs_del_item(trans, log, path);
2459 BUG_ON(ret);
2460 btrfs_release_path(log, path);
2461 }
2462 btrfs_release_path(log, path);
2463 return 0;
2464}
2465
2466static noinline int copy_items(struct btrfs_trans_handle *trans,
2467 struct btrfs_root *log,
2468 struct btrfs_path *dst_path,
2469 struct extent_buffer *src,
2470 int start_slot, int nr, int inode_only)
2471{
2472 unsigned long src_offset;
2473 unsigned long dst_offset;
2474 struct btrfs_file_extent_item *extent;
2475 struct btrfs_inode_item *inode_item;
2476 int ret;
2477 struct btrfs_key *ins_keys;
2478 u32 *ins_sizes;
2479 char *ins_data;
2480 int i;
2481
2482 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2483 nr * sizeof(u32), GFP_NOFS);
2484 ins_sizes = (u32 *)ins_data;
2485 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2486
2487 for (i = 0; i < nr; i++) {
2488 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
2489 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
2490 }
2491 ret = btrfs_insert_empty_items(trans, log, dst_path,
2492 ins_keys, ins_sizes, nr);
2493 BUG_ON(ret);
2494
2495 for (i = 0; i < nr; i++) {
2496 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2497 dst_path->slots[0]);
2498
2499 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2500
2501 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2502 src_offset, ins_sizes[i]);
2503
2504 if (inode_only == LOG_INODE_EXISTS &&
2505 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2506 inode_item = btrfs_item_ptr(dst_path->nodes[0],
2507 dst_path->slots[0],
2508 struct btrfs_inode_item);
2509 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
2510
2511 /* set the generation to zero so the recover code
2512 * can tell the difference between an logging
2513 * just to say 'this inode exists' and a logging
2514 * to say 'update this inode with these values'
2515 */
2516 btrfs_set_inode_generation(dst_path->nodes[0],
2517 inode_item, 0);
2518 }
2519 /* take a reference on file data extents so that truncates
2520 * or deletes of this inode don't have to relog the inode
2521 * again
2522 */
2523 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
2524 int found_type;
2525 extent = btrfs_item_ptr(src, start_slot + i,
2526 struct btrfs_file_extent_item);
2527
2528 found_type = btrfs_file_extent_type(src, extent);
2529 if (found_type == BTRFS_FILE_EXTENT_REG ||
2530 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2531 u64 ds = btrfs_file_extent_disk_bytenr(src,
2532 extent);
2533 u64 dl = btrfs_file_extent_disk_num_bytes(src,
2534 extent);
2535 /* ds == 0 is a hole */
2536 if (ds != 0) {
2537 ret = btrfs_inc_extent_ref(trans, log,
2538 ds, dl,
2539 dst_path->nodes[0]->start,
2540 BTRFS_TREE_LOG_OBJECTID,
2541 trans->transid,
2542 ins_keys[i].objectid);
2543 BUG_ON(ret);
2544 }
2545 }
2546 }
2547 dst_path->slots[0]++;
2548 }
2549
2550 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2551 btrfs_release_path(log, dst_path);
2552 kfree(ins_data);
2553 return 0;
2554}
2555
2556/* log a single inode in the tree log.
2557 * At least one parent directory for this inode must exist in the tree
2558 * or be logged already.
2559 *
2560 * Any items from this inode changed by the current transaction are copied
2561 * to the log tree. An extra reference is taken on any extents in this
2562 * file, allowing us to avoid a whole pile of corner cases around logging
2563 * blocks that have been removed from the tree.
2564 *
2565 * See LOG_INODE_ALL and related defines for a description of what inode_only
2566 * does.
2567 *
2568 * This handles both files and directories.
2569 */
2570static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2571 struct btrfs_root *root, struct inode *inode,
2572 int inode_only)
2573{
2574 struct btrfs_path *path;
2575 struct btrfs_path *dst_path;
2576 struct btrfs_key min_key;
2577 struct btrfs_key max_key;
2578 struct btrfs_root *log = root->log_root;
2579 struct extent_buffer *src = NULL;
2580 u32 size;
2581 int ret;
2582 int nritems;
2583 int ins_start_slot = 0;
2584 int ins_nr;
2585
2586 log = root->log_root;
2587
2588 path = btrfs_alloc_path();
2589 dst_path = btrfs_alloc_path();
2590
2591 min_key.objectid = inode->i_ino;
2592 min_key.type = BTRFS_INODE_ITEM_KEY;
2593 min_key.offset = 0;
2594
2595 max_key.objectid = inode->i_ino;
2596 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2597 max_key.type = BTRFS_XATTR_ITEM_KEY;
2598 else
2599 max_key.type = (u8)-1;
2600 max_key.offset = (u64)-1;
2601
2602 /*
2603 * if this inode has already been logged and we're in inode_only
2604 * mode, we don't want to delete the things that have already
2605 * been written to the log.
2606 *
2607 * But, if the inode has been through an inode_only log,
2608 * the logged_trans field is not set. This allows us to catch
2609 * any new names for this inode in the backrefs by logging it
2610 * again
2611 */
2612 if (inode_only == LOG_INODE_EXISTS &&
2613 BTRFS_I(inode)->logged_trans == trans->transid) {
2614 btrfs_free_path(path);
2615 btrfs_free_path(dst_path);
2616 goto out;
2617 }
2618 mutex_lock(&BTRFS_I(inode)->log_mutex);
2619
2620 /*
2621 * a brute force approach to making sure we get the most uptodate
2622 * copies of everything.
2623 */
2624 if (S_ISDIR(inode->i_mode)) {
2625 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
2626
2627 if (inode_only == LOG_INODE_EXISTS)
2628 max_key_type = BTRFS_XATTR_ITEM_KEY;
2629 ret = drop_objectid_items(trans, log, path,
2630 inode->i_ino, max_key_type);
2631 } else {
2632 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2633 }
2634 BUG_ON(ret);
2635 path->keep_locks = 1;
2636
2637 while(1) {
2638 ins_nr = 0;
2639 ret = btrfs_search_forward(root, &min_key, &max_key,
2640 path, 0, trans->transid);
2641 if (ret != 0)
2642 break;
2643again:
2644 /* note, ins_nr might be > 0 here, cleanup outside the loop */
2645 if (min_key.objectid != inode->i_ino)
2646 break;
2647 if (min_key.type > max_key.type)
2648 break;
2649
2650 src = path->nodes[0];
2651 size = btrfs_item_size_nr(src, path->slots[0]);
2652 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2653 ins_nr++;
2654 goto next_slot;
2655 } else if (!ins_nr) {
2656 ins_start_slot = path->slots[0];
2657 ins_nr = 1;
2658 goto next_slot;
2659 }
2660
2661 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2662 ins_nr, inode_only);
2663 BUG_ON(ret);
2664 ins_nr = 1;
2665 ins_start_slot = path->slots[0];
2666next_slot:
2667
2668 nritems = btrfs_header_nritems(path->nodes[0]);
2669 path->slots[0]++;
2670 if (path->slots[0] < nritems) {
2671 btrfs_item_key_to_cpu(path->nodes[0], &min_key,
2672 path->slots[0]);
2673 goto again;
2674 }
2675 if (ins_nr) {
2676 ret = copy_items(trans, log, dst_path, src,
2677 ins_start_slot,
2678 ins_nr, inode_only);
2679 BUG_ON(ret);
2680 ins_nr = 0;
2681 }
2682 btrfs_release_path(root, path);
2683
2684 if (min_key.offset < (u64)-1)
2685 min_key.offset++;
2686 else if (min_key.type < (u8)-1)
2687 min_key.type++;
2688 else if (min_key.objectid < (u64)-1)
2689 min_key.objectid++;
2690 else
2691 break;
2692 }
2693 if (ins_nr) {
2694 ret = copy_items(trans, log, dst_path, src,
2695 ins_start_slot,
2696 ins_nr, inode_only);
2697 BUG_ON(ret);
2698 ins_nr = 0;
2699 }
2700 WARN_ON(ins_nr);
2701 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2702 btrfs_release_path(root, path);
2703 btrfs_release_path(log, dst_path);
2704 BTRFS_I(inode)->log_dirty_trans = 0;
2705 ret = log_directory_changes(trans, root, inode, path, dst_path);
2706 BUG_ON(ret);
2707 }
2708 BTRFS_I(inode)->logged_trans = trans->transid;
2709 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2710
2711 btrfs_free_path(path);
2712 btrfs_free_path(dst_path);
2713
2714 mutex_lock(&root->fs_info->tree_log_mutex);
2715 ret = update_log_root(trans, log);
2716 BUG_ON(ret);
2717 mutex_unlock(&root->fs_info->tree_log_mutex);
2718out:
2719 return 0;
2720}
2721
/*
 * Log one inode inside its own log transaction: the call to
 * __btrfs_log_inode() is bracketed by start_log_trans() and
 * end_log_trans().  Returns whatever __btrfs_log_inode() returns.
 */
int btrfs_log_inode(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct inode *inode,
		    int inode_only)
{
	int err;

	start_log_trans(trans, root);
	err = __btrfs_log_inode(trans, root, inode, inode_only);
	end_log_trans(root);

	return err;
}
2733
2734/*
2735 * helper function around btrfs_log_inode to make sure newly created
2736 * parent directories also end up in the log. A minimal inode and backref
2737 * only logging is done of any parent directories that are older than
2738 * the last committed transaction
2739 */
2740int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2741 struct btrfs_root *root, struct dentry *dentry)
2742{
2743 int inode_only = LOG_INODE_ALL;
2744 struct super_block *sb;
2745 int ret;
2746
2747 start_log_trans(trans, root);
2748 sb = dentry->d_inode->i_sb;
2749 while(1) {
2750 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2751 inode_only);
2752 BUG_ON(ret);
2753 inode_only = LOG_INODE_EXISTS;
2754
2755 dentry = dentry->d_parent;
2756 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
2757 break;
2758
2759 if (BTRFS_I(dentry->d_inode)->generation <=
2760 root->fs_info->last_trans_committed)
2761 break;
2762 }
2763 end_log_trans(root);
2764 return 0;
2765}
2766
2767/*
2768 * it is not safe to log dentry if the chunk root has added new
2769 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
2770 * If this returns 1, you must commit the transaction to safely get your
2771 * data on disk.
2772 */
2773int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2774 struct btrfs_root *root, struct dentry *dentry)
2775{
2776 u64 gen;
2777 gen = root->fs_info->last_trans_new_blockgroup;
2778 if (gen > root->fs_info->last_trans_committed)
2779 return 1;
2780 else
2781 return btrfs_log_dentry(trans, root, dentry);
2782}
2783
2784/*
2785 * should be called during mount to recover any replay any log trees
2786 * from the FS
2787 */
2788int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
2789{
2790 int ret;
2791 struct btrfs_path *path;
2792 struct btrfs_trans_handle *trans;
2793 struct btrfs_key key;
2794 struct btrfs_key found_key;
2795 struct btrfs_key tmp_key;
2796 struct btrfs_root *log;
2797 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
2798 u64 highest_inode;
2799 struct walk_control wc = {
2800 .process_func = process_one_buffer,
2801 .stage = 0,
2802 };
2803
2804 fs_info->log_root_recovering = 1;
2805 path = btrfs_alloc_path();
2806 BUG_ON(!path);
2807
2808 trans = btrfs_start_transaction(fs_info->tree_root, 1);
2809
2810 wc.trans = trans;
2811 wc.pin = 1;
2812
2813 walk_log_tree(trans, log_root_tree, &wc);
2814
2815again:
2816 key.objectid = BTRFS_TREE_LOG_OBJECTID;
2817 key.offset = (u64)-1;
2818 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2819
2820 while(1) {
2821 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
2822 if (ret < 0)
2823 break;
2824 if (ret > 0) {
2825 if (path->slots[0] == 0)
2826 break;
2827 path->slots[0]--;
2828 }
2829 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2830 path->slots[0]);
2831 btrfs_release_path(log_root_tree, path);
2832 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
2833 break;
2834
2835 log = btrfs_read_fs_root_no_radix(log_root_tree,
2836 &found_key);
2837 BUG_ON(!log);
2838
2839
2840 tmp_key.objectid = found_key.offset;
2841 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
2842 tmp_key.offset = (u64)-1;
2843
2844 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
2845
2846 BUG_ON(!wc.replay_dest);
2847
2848 btrfs_record_root_in_trans(wc.replay_dest);
2849 ret = walk_log_tree(trans, log, &wc);
2850 BUG_ON(ret);
2851
2852 if (wc.stage == LOG_WALK_REPLAY_ALL) {
2853 ret = fixup_inode_link_counts(trans, wc.replay_dest,
2854 path);
2855 BUG_ON(ret);
2856 }
2857 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
2858 if (ret == 0) {
2859 wc.replay_dest->highest_inode = highest_inode;
2860 wc.replay_dest->last_inode_alloc = highest_inode;
2861 }
2862
2863 key.offset = found_key.offset - 1;
2864 free_extent_buffer(log->node);
2865 kfree(log);
2866
2867 if (found_key.offset == 0)
2868 break;
2869 }
2870 btrfs_release_path(log_root_tree, path);
2871
2872 /* step one is to pin it all, step two is to replay just inodes */
2873 if (wc.pin) {
2874 wc.pin = 0;
2875 wc.process_func = replay_one_buffer;
2876 wc.stage = LOG_WALK_REPLAY_INODES;
2877 goto again;
2878 }
2879 /* step three is to replay everything */
2880 if (wc.stage < LOG_WALK_REPLAY_ALL) {
2881 wc.stage++;
2882 goto again;
2883 }
2884
2885 btrfs_free_path(path);
2886
2887 free_extent_buffer(log_root_tree->node);
2888 log_root_tree->log_root = NULL;
2889 fs_info->log_root_recovering = 0;
2890
2891 /* step 4: commit the transaction, which also unpins the blocks */
2892 btrfs_commit_transaction(trans, fs_info->tree_root);
2893
2894 kfree(log_root_tree);
2895 return 0;
2896}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 000000000000..b9409b32ed02
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __TREE_LOG_
20#define __TREE_LOG_
21
22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root,
35 const char *name, int name_len,
36 struct inode *dir, u64 index);
37int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root,
39 const char *name, int name_len,
40 struct inode *inode, u64 dirid);
41#endif
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
new file mode 100644
index 000000000000..9bf3946d5ef2
--- /dev/null
+++ b/fs/btrfs/version.h
@@ -0,0 +1,4 @@
1#ifndef __BTRFS_VERSION_H
2#define __BTRFS_VERSION_H
3#define BTRFS_BUILD_VERSION "Btrfs"
4#endif
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
new file mode 100644
index 000000000000..0f57f24404d9
--- /dev/null
+++ b/fs/btrfs/version.sh
@@ -0,0 +1,43 @@
#!/bin/bash
#
# determine-version -- report a useful version for releases
#
# Copyright 2008, Aron Griffis <agriffis@n01se.net>
# Copyright 2008, Oracle
# Released under the GNU GPLv2
#
# Writes version.h with a BTRFS_BUILD_VERSION string.  The version falls
# back to the hard-coded value below unless the tree is a mercurial
# checkout and the hg command is available.

v="v0.16"

# Test for hg directly in the condition: checking $? after `[ -d .hg ]`
# would report the result of the directory test, not of the hg lookup.
if [ -d .hg ] && which hg > /dev/null 2>&1; then
	last=$(hg tags | grep -m1 -o '^v[0-9.]\+')

	# now check if the repo has commits since then...
	if [[ $(hg id -t) == $last || \
	    $(hg di -r "$last:." | awk '/^diff/{print $NF}' | sort -u) == .hgtags ]]
	then
		# check if it's dirty
		if [[ $(hg id | cut -d' ' -f1) == *+ ]]; then
			v=$last+
		else
			v=$last
		fi
	else
		# includes dirty flag
		v=$last+$(hg id -i)
	fi
fi

echo "#ifndef __BUILD_VERSION" > .build-version.h
echo "#define __BUILD_VERSION" >> .build-version.h
echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h
echo "#endif" >> .build-version.h

# leave version.h alone when nothing changed, so it keeps its timestamp
if diff -q version.h .build-version.h > /dev/null 2>&1; then
	rm .build-version.h
	exit 0
fi

mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 000000000000..ecf0633ab8cc
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,3117 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/buffer_head.h>
21#include <linux/blkdev.h>
22#include <linux/random.h>
23#include <asm/div64.h>
24#include "ctree.h"
25#include "extent_map.h"
26#include "disk-io.h"
27#include "transaction.h"
28#include "print-tree.h"
29#include "volumes.h"
30#include "async-thread.h"
31
/*
 * Lookup record made of a set of fixed fields followed by a flexible
 * array of struct btrfs_bio_stripe entries.  The map_lookup_size(n)
 * macro in this file computes the allocation size for a record that
 * carries n stripes.
 */
struct map_lookup {
	u64 type;
	int io_align;
	int io_width;
	int stripe_len;
	int sector_size;
	int num_stripes;	/* presumably the number of entries in stripes[] -- TODO confirm */
	int sub_stripes;
	struct btrfs_bio_stripe stripes[];	/* flexible array member, must stay last */
};
42
43static int init_first_rw_device(struct btrfs_trans_handle *trans,
44 struct btrfs_root *root,
45 struct btrfs_device *device);
46static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
47
48
49#define map_lookup_size(n) (sizeof(struct map_lookup) + \
50 (sizeof(struct btrfs_bio_stripe) * (n)))
51
52static DEFINE_MUTEX(uuid_mutex);
53static LIST_HEAD(fs_uuids);
54
/* Acquire the file-local uuid_mutex on behalf of callers outside this file. */
void btrfs_lock_volumes(void)
{
	mutex_lock(&uuid_mutex);
}
59
/* Release the file-local uuid_mutex; counterpart of btrfs_lock_volumes(). */
void btrfs_unlock_volumes(void)
{
	mutex_unlock(&uuid_mutex);
}
64
/* Take the per-filesystem chunk_mutex reached through @root's fs_info. */
static void lock_chunks(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->chunk_mutex);
}
69
/* Drop the per-filesystem chunk_mutex reached through @root's fs_info. */
static void unlock_chunks(struct btrfs_root *root)
{
	mutex_unlock(&root->fs_info->chunk_mutex);
}
74
75int btrfs_cleanup_fs_uuids(void)
76{
77 struct btrfs_fs_devices *fs_devices;
78 struct btrfs_device *dev;
79
80 while (!list_empty(&fs_uuids)) {
81 fs_devices = list_entry(fs_uuids.next,
82 struct btrfs_fs_devices, list);
83 list_del(&fs_devices->list);
84 while(!list_empty(&fs_devices->devices)) {
85 dev = list_entry(fs_devices->devices.next,
86 struct btrfs_device, dev_list);
87 if (dev->bdev) {
88 close_bdev_excl(dev->bdev);
89 fs_devices->open_devices--;
90 }
91 fs_devices->num_devices--;
92 if (dev->writeable)
93 fs_devices->rw_devices--;
94 list_del(&dev->dev_list);
95 list_del(&dev->dev_alloc_list);
96 kfree(dev->name);
97 kfree(dev);
98 }
99 WARN_ON(fs_devices->num_devices);
100 WARN_ON(fs_devices->open_devices);
101 WARN_ON(fs_devices->rw_devices);
102 kfree(fs_devices);
103 }
104 return 0;
105}
106
107static noinline struct btrfs_device *__find_device(struct list_head *head,
108 u64 devid, u8 *uuid)
109{
110 struct btrfs_device *dev;
111 struct list_head *cur;
112
113 list_for_each(cur, head) {
114 dev = list_entry(cur, struct btrfs_device, dev_list);
115 if (dev->devid == devid &&
116 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
117 return dev;
118 }
119 }
120 return NULL;
121}
122
123static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
124{
125 struct list_head *cur;
126 struct btrfs_fs_devices *fs_devices;
127
128 list_for_each(cur, &fs_uuids) {
129 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
130 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
131 return fs_devices;
132 }
133 return NULL;
134}
135
/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 *
 * The function detaches the device's whole pending list under io_lock,
 * submits the bios one by one without the lock, and repeats until it
 * finds the list empty.  It always returns 0.
 */
static int noinline run_scheduled_bios(struct btrfs_device *device)
{
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_fs_info *fs_info;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run = 0;
	unsigned long limit;

	bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
	fs_info = device->dev_root->fs_info;
	/* waiters are woken once the async bio count drops below 2/3 of the limit */
	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

loop:
	spin_lock(&device->io_lock);

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	pending = device->pending_bios;
	tail = device->pending_bio_tail;
	WARN_ON(pending && !tail);
	device->pending_bios = NULL;
	device->pending_bio_tail = NULL;

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (pending) {
		again = 1;
		device->running_pending = 1;
	} else {
		again = 0;
		device->running_pending = 0;
	}
	spin_unlock(&device->io_lock);

	while(pending) {
		/* unlink the head bio from the private chain before submitting it */
		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;
		atomic_dec(&fs_info->nr_async_bios);

		if (atomic_read(&fs_info->nr_async_bios) < limit &&
		    waitqueue_active(&fs_info->async_submit_wait))
			wake_up(&fs_info->async_submit_wait);

		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
		/* hold our own reference across the submit_bio() call */
		bio_get(cur);
		submit_bio(cur->bi_rw, cur);
		bio_put(cur);
		num_run++;

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) &&
		    fs_info->fs_devices->open_devices > 1) {
			struct bio *old_head;

			spin_lock(&device->io_lock);

			/*
			 * put the unsubmitted remainder back at the front of
			 * the device list; anything queued in the meantime
			 * (old_head) is chained behind our saved tail
			 */
			old_head = device->pending_bios;
			device->pending_bios = pending;
			if (device->pending_bio_tail)
				tail->bi_next = old_head;
			else
				device->pending_bio_tail = tail;

			spin_unlock(&device->io_lock);
			btrfs_requeue_work(&device->work);
			goto done;
		}
	}
	if (again)
		goto loop;
done:
	return 0;
}
238
/*
 * Work callback: recover the btrfs_device that embeds @work and run its
 * scheduled bios.  device_list_add() installs this as device->work.func.
 */
void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}
246
247static noinline int device_list_add(const char *path,
248 struct btrfs_super_block *disk_super,
249 u64 devid, struct btrfs_fs_devices **fs_devices_ret)
250{
251 struct btrfs_device *device;
252 struct btrfs_fs_devices *fs_devices;
253 u64 found_transid = btrfs_super_generation(disk_super);
254
255 fs_devices = find_fsid(disk_super->fsid);
256 if (!fs_devices) {
257 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
258 if (!fs_devices)
259 return -ENOMEM;
260 INIT_LIST_HEAD(&fs_devices->devices);
261 INIT_LIST_HEAD(&fs_devices->alloc_list);
262 list_add(&fs_devices->list, &fs_uuids);
263 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
264 fs_devices->latest_devid = devid;
265 fs_devices->latest_trans = found_transid;
266 device = NULL;
267 } else {
268 device = __find_device(&fs_devices->devices, devid,
269 disk_super->dev_item.uuid);
270 }
271 if (!device) {
272 if (fs_devices->opened)
273 return -EBUSY;
274
275 device = kzalloc(sizeof(*device), GFP_NOFS);
276 if (!device) {
277 /* we can safely leave the fs_devices entry around */
278 return -ENOMEM;
279 }
280 device->devid = devid;
281 device->work.func = pending_bios_fn;
282 memcpy(device->uuid, disk_super->dev_item.uuid,
283 BTRFS_UUID_SIZE);
284 device->barriers = 1;
285 spin_lock_init(&device->io_lock);
286 device->name = kstrdup(path, GFP_NOFS);
287 if (!device->name) {
288 kfree(device);
289 return -ENOMEM;
290 }
291 INIT_LIST_HEAD(&device->dev_alloc_list);
292 list_add(&device->dev_list, &fs_devices->devices);
293 device->fs_devices = fs_devices;
294 fs_devices->num_devices++;
295 }
296
297 if (found_transid > fs_devices->latest_trans) {
298 fs_devices->latest_devid = devid;
299 fs_devices->latest_trans = found_transid;
300 }
301 *fs_devices_ret = fs_devices;
302 return 0;
303}
304
305int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
306{
307 struct list_head *tmp;
308 struct list_head *cur;
309 struct btrfs_device *device;
310 int seed_devices = 0;
311
312 mutex_lock(&uuid_mutex);
313again:
314 list_for_each_safe(cur, tmp, &fs_devices->devices) {
315 device = list_entry(cur, struct btrfs_device, dev_list);
316 if (device->in_fs_metadata)
317 continue;
318
319 if (device->bdev) {
320 close_bdev_excl(device->bdev);
321 device->bdev = NULL;
322 fs_devices->open_devices--;
323 }
324 if (device->writeable) {
325 list_del_init(&device->dev_alloc_list);
326 device->writeable = 0;
327 fs_devices->rw_devices--;
328 }
329 if (!seed_devices) {
330 list_del_init(&device->dev_list);
331 fs_devices->num_devices--;
332 kfree(device->name);
333 kfree(device);
334 }
335 }
336
337 if (fs_devices->seed) {
338 fs_devices = fs_devices->seed;
339 seed_devices = 1;
340 goto again;
341 }
342
343 mutex_unlock(&uuid_mutex);
344 return 0;
345}
346
347static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
348{
349 struct btrfs_fs_devices *seed_devices;
350 struct list_head *cur;
351 struct btrfs_device *device;
352again:
353 if (--fs_devices->opened > 0)
354 return 0;
355
356 list_for_each(cur, &fs_devices->devices) {
357 device = list_entry(cur, struct btrfs_device, dev_list);
358 if (device->bdev) {
359 close_bdev_excl(device->bdev);
360 fs_devices->open_devices--;
361 }
362 if (device->writeable) {
363 list_del_init(&device->dev_alloc_list);
364 fs_devices->rw_devices--;
365 }
366
367 device->bdev = NULL;
368 device->writeable = 0;
369 device->in_fs_metadata = 0;
370 }
371 fs_devices->opened = 0;
372 fs_devices->seeding = 0;
373 fs_devices->sprouted = 0;
374
375 seed_devices = fs_devices->seed;
376 fs_devices->seed = NULL;
377 if (seed_devices) {
378 fs_devices = seed_devices;
379 goto again;
380 }
381 return 0;
382}
383
/*
 * Locked wrapper: run __btrfs_close_devices() on @fs_devices while holding
 * uuid_mutex and return its result.
 */
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
	mutex_unlock(&uuid_mutex);
	return ret;
}
393
394int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
395{
396 struct block_device *bdev;
397 struct list_head *head = &fs_devices->devices;
398 struct list_head *cur;
399 struct btrfs_device *device;
400 struct block_device *latest_bdev = NULL;
401 struct buffer_head *bh;
402 struct btrfs_super_block *disk_super;
403 u64 latest_devid = 0;
404 u64 latest_transid = 0;
405 u64 devid;
406 int seeding = 1;
407 int ret = 0;
408
409 list_for_each(cur, head) {
410 device = list_entry(cur, struct btrfs_device, dev_list);
411 if (device->bdev)
412 continue;
413 if (!device->name)
414 continue;
415
416 bdev = open_bdev_excl(device->name, MS_RDONLY, holder);
417 if (IS_ERR(bdev)) {
418 printk("open %s failed\n", device->name);
419 goto error;
420 }
421 set_blocksize(bdev, 4096);
422
423 bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
424 if (!bh)
425 goto error_close;
426
427 disk_super = (struct btrfs_super_block *)bh->b_data;
428 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
429 sizeof(disk_super->magic)))
430 goto error_brelse;
431
432 devid = le64_to_cpu(disk_super->dev_item.devid);
433 if (devid != device->devid)
434 goto error_brelse;
435
436 if (memcmp(device->uuid, disk_super->dev_item.uuid,
437 BTRFS_UUID_SIZE))
438 goto error_brelse;
439
440 device->generation = btrfs_super_generation(disk_super);
441 if (!latest_transid || device->generation > latest_transid) {
442 latest_devid = devid;
443 latest_transid = device->generation;
444 latest_bdev = bdev;
445 }
446
447 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
448 device->writeable = 0;
449 } else {
450 device->writeable = !bdev_read_only(bdev);
451 seeding = 0;
452 }
453
454 device->bdev = bdev;
455 device->in_fs_metadata = 0;
456 fs_devices->open_devices++;
457 if (device->writeable) {
458 fs_devices->rw_devices++;
459 list_add(&device->dev_alloc_list,
460 &fs_devices->alloc_list);
461 }
462 continue;
463
464error_brelse:
465 brelse(bh);
466error_close:
467 close_bdev_excl(bdev);
468error:
469 continue;
470 }
471 if (fs_devices->open_devices == 0) {
472 ret = -EIO;
473 goto out;
474 }
475 fs_devices->seeding = seeding;
476 fs_devices->opened = 1;
477 fs_devices->latest_bdev = latest_bdev;
478 fs_devices->latest_devid = latest_devid;
479 fs_devices->latest_trans = latest_transid;
480 fs_devices->total_rw_bytes = 0;
481out:
482 return ret;
483}
484
485int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
486 int flags, void *holder)
487{
488 int ret;
489
490 mutex_lock(&uuid_mutex);
491 if (fs_devices->opened) {
492 if (fs_devices->sprouted) {
493 ret = -EBUSY;
494 } else {
495 fs_devices->opened++;
496 ret = 0;
497 }
498 } else {
499 ret = __btrfs_open_devices(fs_devices, holder);
500 }
501 mutex_unlock(&uuid_mutex);
502 return ret;
503}
504
505int btrfs_scan_one_device(const char *path, int flags, void *holder,
506 struct btrfs_fs_devices **fs_devices_ret)
507{
508 struct btrfs_super_block *disk_super;
509 struct block_device *bdev;
510 struct buffer_head *bh;
511 int ret;
512 u64 devid;
513 u64 transid;
514
515 mutex_lock(&uuid_mutex);
516
517 bdev = open_bdev_excl(path, flags, holder);
518
519 if (IS_ERR(bdev)) {
520 ret = PTR_ERR(bdev);
521 goto error;
522 }
523
524 ret = set_blocksize(bdev, 4096);
525 if (ret)
526 goto error_close;
527 bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
528 if (!bh) {
529 ret = -EIO;
530 goto error_close;
531 }
532 disk_super = (struct btrfs_super_block *)bh->b_data;
533 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
534 sizeof(disk_super->magic))) {
535 ret = -EINVAL;
536 goto error_brelse;
537 }
538 devid = le64_to_cpu(disk_super->dev_item.devid);
539 transid = btrfs_super_generation(disk_super);
540 if (disk_super->label[0])
541 printk("device label %s ", disk_super->label);
542 else {
543 /* FIXME, make a readl uuid parser */
544 printk("device fsid %llx-%llx ",
545 *(unsigned long long *)disk_super->fsid,
546 *(unsigned long long *)(disk_super->fsid + 8));
547 }
548 printk("devid %Lu transid %Lu %s\n", devid, transid, path);
549 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
550
551error_brelse:
552 brelse(bh);
553error_close:
554 close_bdev_excl(bdev);
555error:
556 mutex_unlock(&uuid_mutex);
557 return ret;
558}
559
/*
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * Scan the dev extent items of @device in key order and look for a gap
 * of at least @num_bytes between the end of one extent and the start of
 * the next, or after the last extent.  The search never starts below 1MB.
 *
 * On success returns 0 and stores the start of the gap in *@start.
 * Returns -ENOMEM if no path can be allocated, -ENOSPC if no suitable
 * gap lies below device->total_bytes, or a negative error from the tree
 * search helpers.
 */
static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
					 struct btrfs_device *device,
					 u64 num_bytes, u64 *start)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 hole_size = 0;
	u64 last_byte = 0;	/* end of the most recent dev extent seen */
	u64 search_start = 0;
	u64 search_end = device->total_bytes;
	int ret;
	int slot = 0;
	int start_found;	/* set once a dev extent item has been seen */
	struct extent_buffer *l;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = 2;
	start_found = 0;

	/* FIXME use last free of some kind */

	/* we don't want to overwrite the superblock on the drive,
	 * so we make sure to start at an offset of at least 1MB
	 */
	search_start = max((u64)1024 * 1024, search_start);

	/* honour fs_info->alloc_start only if the request still fits behind it */
	if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
		search_start = max(root->fs_info->alloc_start, search_start);

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	/* step back one item so an extent that begins before search_start is examined too */
	ret = btrfs_previous_item(root, path, 0, key.type);
	if (ret < 0)
		goto error;
	l = path->nodes[0];
	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			/*
			 * reached by falling through when the tree is
			 * exhausted, or by jumping here once an item of a
			 * higher devid is found
			 */
no_more_items:
			if (!start_found) {
				/* no dev extent was seen at all */
				if (search_start >= search_end) {
					ret = -ENOSPC;
					goto error;
				}
				*start = search_start;
				start_found = 1;
				goto check_pending;
			}
			/* the candidate is whatever follows the last extent */
			*start = last_byte > search_start ?
				last_byte : search_start;
			if (search_end <= *start) {
				ret = -ENOSPC;
				goto error;
			}
			goto check_pending;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			goto no_more_items;

		/* is the hole between the previous extent and this item big enough? */
		if (key.offset >= search_start && key.offset > last_byte &&
		    start_found) {
			if (last_byte < search_start)
				last_byte = search_start;
			hole_size = key.offset - last_byte;
			if (key.offset > last_byte &&
			    hole_size >= num_bytes) {
				*start = last_byte;
				goto check_pending;
			}
		}
		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) {
			goto next;
		}

		/* remember where this extent ends for the next hole check */
		start_found = 1;
		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
next:
		path->slots[0]++;
		cond_resched();
	}
check_pending:
	/* we have to make sure we didn't find an extent that has already
	 * been allocated by the map tree or the original allocation
	 */
	BUG_ON(*start < search_start);

	if (*start + num_bytes > search_end) {
		ret = -ENOSPC;
		goto error;
	}
	/* check for pending inserts here */
	ret = 0;

error:
	btrfs_free_path(path);
	return ret;
}
683
684int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
685 struct btrfs_device *device,
686 u64 start)
687{
688 int ret;
689 struct btrfs_path *path;
690 struct btrfs_root *root = device->dev_root;
691 struct btrfs_key key;
692 struct btrfs_key found_key;
693 struct extent_buffer *leaf = NULL;
694 struct btrfs_dev_extent *extent = NULL;
695
696 path = btrfs_alloc_path();
697 if (!path)
698 return -ENOMEM;
699
700 key.objectid = device->devid;
701 key.offset = start;
702 key.type = BTRFS_DEV_EXTENT_KEY;
703
704 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
705 if (ret > 0) {
706 ret = btrfs_previous_item(root, path, key.objectid,
707 BTRFS_DEV_EXTENT_KEY);
708 BUG_ON(ret);
709 leaf = path->nodes[0];
710 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
711 extent = btrfs_item_ptr(leaf, path->slots[0],
712 struct btrfs_dev_extent);
713 BUG_ON(found_key.offset > start || found_key.offset +
714 btrfs_dev_extent_length(leaf, extent) < start);
715 ret = 0;
716 } else if (ret == 0) {
717 leaf = path->nodes[0];
718 extent = btrfs_item_ptr(leaf, path->slots[0],
719 struct btrfs_dev_extent);
720 }
721 BUG_ON(ret);
722
723 if (device->bytes_used > 0)
724 device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
725 ret = btrfs_del_item(trans, root, path);
726 BUG_ON(ret);
727
728 btrfs_free_path(path);
729 return ret;
730}
731
732int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
733 struct btrfs_device *device,
734 u64 chunk_tree, u64 chunk_objectid,
735 u64 chunk_offset, u64 start, u64 num_bytes)
736{
737 int ret;
738 struct btrfs_path *path;
739 struct btrfs_root *root = device->dev_root;
740 struct btrfs_dev_extent *extent;
741 struct extent_buffer *leaf;
742 struct btrfs_key key;
743
744 WARN_ON(!device->in_fs_metadata);
745 path = btrfs_alloc_path();
746 if (!path)
747 return -ENOMEM;
748
749 key.objectid = device->devid;
750 key.offset = start;
751 key.type = BTRFS_DEV_EXTENT_KEY;
752 ret = btrfs_insert_empty_item(trans, root, path, &key,
753 sizeof(*extent));
754 BUG_ON(ret);
755
756 leaf = path->nodes[0];
757 extent = btrfs_item_ptr(leaf, path->slots[0],
758 struct btrfs_dev_extent);
759 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
760 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
761 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
762
763 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
764 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
765 BTRFS_UUID_SIZE);
766
767 btrfs_set_dev_extent_length(leaf, extent, num_bytes);
768 btrfs_mark_buffer_dirty(leaf);
769 btrfs_free_path(path);
770 return ret;
771}
772
773static noinline int find_next_chunk(struct btrfs_root *root,
774 u64 objectid, u64 *offset)
775{
776 struct btrfs_path *path;
777 int ret;
778 struct btrfs_key key;
779 struct btrfs_chunk *chunk;
780 struct btrfs_key found_key;
781
782 path = btrfs_alloc_path();
783 BUG_ON(!path);
784
785 key.objectid = objectid;
786 key.offset = (u64)-1;
787 key.type = BTRFS_CHUNK_ITEM_KEY;
788
789 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
790 if (ret < 0)
791 goto error;
792
793 BUG_ON(ret == 0);
794
795 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
796 if (ret) {
797 *offset = 0;
798 } else {
799 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
800 path->slots[0]);
801 if (found_key.objectid != objectid)
802 *offset = 0;
803 else {
804 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
805 struct btrfs_chunk);
806 *offset = found_key.offset +
807 btrfs_chunk_length(path->nodes[0], chunk);
808 }
809 }
810 ret = 0;
811error:
812 btrfs_free_path(path);
813 return ret;
814}
815
816static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
817{
818 int ret;
819 struct btrfs_key key;
820 struct btrfs_key found_key;
821 struct btrfs_path *path;
822
823 root = root->fs_info->chunk_root;
824
825 path = btrfs_alloc_path();
826 if (!path)
827 return -ENOMEM;
828
829 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
830 key.type = BTRFS_DEV_ITEM_KEY;
831 key.offset = (u64)-1;
832
833 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
834 if (ret < 0)
835 goto error;
836
837 BUG_ON(ret == 0);
838
839 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
840 BTRFS_DEV_ITEM_KEY);
841 if (ret) {
842 *objectid = 1;
843 } else {
844 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
845 path->slots[0]);
846 *objectid = found_key.offset + 1;
847 }
848 ret = 0;
849error:
850 btrfs_free_path(path);
851 return ret;
852}
853
854/*
855 * the device information is stored in the chunk root
856 * the btrfs_device struct should be fully filled in
857 */
858int btrfs_add_device(struct btrfs_trans_handle *trans,
859 struct btrfs_root *root,
860 struct btrfs_device *device)
861{
862 int ret;
863 struct btrfs_path *path;
864 struct btrfs_dev_item *dev_item;
865 struct extent_buffer *leaf;
866 struct btrfs_key key;
867 unsigned long ptr;
868
869 root = root->fs_info->chunk_root;
870
871 path = btrfs_alloc_path();
872 if (!path)
873 return -ENOMEM;
874
875 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
876 key.type = BTRFS_DEV_ITEM_KEY;
877 key.offset = device->devid;
878
879 ret = btrfs_insert_empty_item(trans, root, path, &key,
880 sizeof(*dev_item));
881 if (ret)
882 goto out;
883
884 leaf = path->nodes[0];
885 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
886
887 btrfs_set_device_id(leaf, dev_item, device->devid);
888 btrfs_set_device_generation(leaf, dev_item, 0);
889 btrfs_set_device_type(leaf, dev_item, device->type);
890 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
891 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
892 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
893 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
894 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
895 btrfs_set_device_group(leaf, dev_item, 0);
896 btrfs_set_device_seek_speed(leaf, dev_item, 0);
897 btrfs_set_device_bandwidth(leaf, dev_item, 0);
898
899 ptr = (unsigned long)btrfs_device_uuid(dev_item);
900 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
901 ptr = (unsigned long)btrfs_device_fsid(dev_item);
902 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
903 btrfs_mark_buffer_dirty(leaf);
904
905 ret = 0;
906out:
907 btrfs_free_path(path);
908 return ret;
909}
910
911static int btrfs_rm_dev_item(struct btrfs_root *root,
912 struct btrfs_device *device)
913{
914 int ret;
915 struct btrfs_path *path;
916 struct btrfs_key key;
917 struct btrfs_trans_handle *trans;
918
919 root = root->fs_info->chunk_root;
920
921 path = btrfs_alloc_path();
922 if (!path)
923 return -ENOMEM;
924
925 trans = btrfs_start_transaction(root, 1);
926 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
927 key.type = BTRFS_DEV_ITEM_KEY;
928 key.offset = device->devid;
929 lock_chunks(root);
930
931 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
932 if (ret < 0)
933 goto out;
934
935 if (ret > 0) {
936 ret = -ENOENT;
937 goto out;
938 }
939
940 ret = btrfs_del_item(trans, root, path);
941 if (ret)
942 goto out;
943out:
944 btrfs_free_path(path);
945 unlock_chunks(root);
946 btrfs_commit_transaction(trans, root);
947 return ret;
948}
949
950int btrfs_rm_device(struct btrfs_root *root, char *device_path)
951{
952 struct btrfs_device *device;
953 struct btrfs_device *next_device;
954 struct block_device *bdev;
955 struct buffer_head *bh = NULL;
956 struct btrfs_super_block *disk_super;
957 u64 all_avail;
958 u64 devid;
959 u64 num_devices;
960 u8 *dev_uuid;
961 int ret = 0;
962
963 mutex_lock(&uuid_mutex);
964 mutex_lock(&root->fs_info->volume_mutex);
965
966 all_avail = root->fs_info->avail_data_alloc_bits |
967 root->fs_info->avail_system_alloc_bits |
968 root->fs_info->avail_metadata_alloc_bits;
969
970 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
971 root->fs_info->fs_devices->rw_devices <= 4) {
972 printk("btrfs: unable to go below four devices on raid10\n");
973 ret = -EINVAL;
974 goto out;
975 }
976
977 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
978 root->fs_info->fs_devices->rw_devices <= 2) {
979 printk("btrfs: unable to go below two devices on raid1\n");
980 ret = -EINVAL;
981 goto out;
982 }
983
984 if (strcmp(device_path, "missing") == 0) {
985 struct list_head *cur;
986 struct list_head *devices;
987 struct btrfs_device *tmp;
988
989 device = NULL;
990 devices = &root->fs_info->fs_devices->devices;
991 list_for_each(cur, devices) {
992 tmp = list_entry(cur, struct btrfs_device, dev_list);
993 if (tmp->in_fs_metadata && !tmp->bdev) {
994 device = tmp;
995 break;
996 }
997 }
998 bdev = NULL;
999 bh = NULL;
1000 disk_super = NULL;
1001 if (!device) {
1002 printk("btrfs: no missing devices found to remove\n");
1003 goto out;
1004 }
1005 } else {
1006 bdev = open_bdev_excl(device_path, MS_RDONLY,
1007 root->fs_info->bdev_holder);
1008 if (IS_ERR(bdev)) {
1009 ret = PTR_ERR(bdev);
1010 goto out;
1011 }
1012
1013 set_blocksize(bdev, 4096);
1014 bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
1015 if (!bh) {
1016 ret = -EIO;
1017 goto error_close;
1018 }
1019 disk_super = (struct btrfs_super_block *)bh->b_data;
1020 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
1021 sizeof(disk_super->magic))) {
1022 ret = -ENOENT;
1023 goto error_brelse;
1024 }
1025 devid = le64_to_cpu(disk_super->dev_item.devid);
1026 dev_uuid = disk_super->dev_item.uuid;
1027 device = btrfs_find_device(root, devid, dev_uuid,
1028 disk_super->fsid);
1029 if (!device) {
1030 ret = -ENOENT;
1031 goto error_brelse;
1032 }
1033 }
1034
1035 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1036 printk("btrfs: unable to remove the only writeable device\n");
1037 ret = -EINVAL;
1038 goto error_brelse;
1039 }
1040
1041 if (device->writeable) {
1042 list_del_init(&device->dev_alloc_list);
1043 root->fs_info->fs_devices->rw_devices--;
1044 }
1045
1046 ret = btrfs_shrink_device(device, 0);
1047 if (ret)
1048 goto error_brelse;
1049
1050 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1051 if (ret)
1052 goto error_brelse;
1053
1054 device->in_fs_metadata = 0;
1055 if (device->fs_devices == root->fs_info->fs_devices) {
1056 list_del_init(&device->dev_list);
1057 root->fs_info->fs_devices->num_devices--;
1058 if (device->bdev)
1059 device->fs_devices->open_devices--;
1060 }
1061
1062 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1063 struct btrfs_device, dev_list);
1064 if (device->bdev == root->fs_info->sb->s_bdev)
1065 root->fs_info->sb->s_bdev = next_device->bdev;
1066 if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1067 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1068
1069 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1070 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
1071
1072 if (device->fs_devices != root->fs_info->fs_devices) {
1073 BUG_ON(device->writeable);
1074 brelse(bh);
1075 if (bdev)
1076 close_bdev_excl(bdev);
1077
1078 if (device->bdev) {
1079 close_bdev_excl(device->bdev);
1080 device->bdev = NULL;
1081 device->fs_devices->open_devices--;
1082 }
1083 if (device->fs_devices->open_devices == 0) {
1084 struct btrfs_fs_devices *fs_devices;
1085 fs_devices = root->fs_info->fs_devices;
1086 while (fs_devices) {
1087 if (fs_devices->seed == device->fs_devices)
1088 break;
1089 fs_devices = fs_devices->seed;
1090 }
1091 fs_devices->seed = device->fs_devices->seed;
1092 device->fs_devices->seed = NULL;
1093 __btrfs_close_devices(device->fs_devices);
1094 }
1095 ret = 0;
1096 goto out;
1097 }
1098
1099 /*
1100 * at this point, the device is zero sized. We want to
1101 * remove it from the devices list and zero out the old super
1102 */
1103 if (device->writeable) {
1104 /* make sure this device isn't detected as part of
1105 * the FS anymore
1106 */
1107 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1108 set_buffer_dirty(bh);
1109 sync_dirty_buffer(bh);
1110 }
1111 brelse(bh);
1112
1113 if (device->bdev) {
1114 /* one close for the device struct or super_block */
1115 close_bdev_excl(device->bdev);
1116 }
1117 if (bdev) {
1118 /* one close for us */
1119 close_bdev_excl(bdev);
1120 }
1121 kfree(device->name);
1122 kfree(device);
1123 ret = 0;
1124 goto out;
1125
1126error_brelse:
1127 brelse(bh);
1128error_close:
1129 if (bdev)
1130 close_bdev_excl(bdev);
1131out:
1132 mutex_unlock(&root->fs_info->volume_mutex);
1133 mutex_unlock(&uuid_mutex);
1134 return ret;
1135}
1136
1137/*
1138 * does all the dirty work required for changing file system's UUID.
1139 */
1140static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1141 struct btrfs_root *root)
1142{
1143 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1144 struct btrfs_fs_devices *old_devices;
1145 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
1146 struct btrfs_device *device;
1147 u64 super_flags;
1148
1149 BUG_ON(!mutex_is_locked(&uuid_mutex));
1150 if (!fs_devices->seeding || fs_devices->opened != 1)
1151 return -EINVAL;
1152
1153 old_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1154 if (!old_devices)
1155 return -ENOMEM;
1156
1157 memcpy(old_devices, fs_devices, sizeof(*old_devices));
1158 old_devices->opened = 1;
1159 old_devices->sprouted = 1;
1160 INIT_LIST_HEAD(&old_devices->devices);
1161 INIT_LIST_HEAD(&old_devices->alloc_list);
1162 list_splice_init(&fs_devices->devices, &old_devices->devices);
1163 list_splice_init(&fs_devices->alloc_list, &old_devices->alloc_list);
1164 list_for_each_entry(device, &old_devices->devices, dev_list) {
1165 device->fs_devices = old_devices;
1166 }
1167 list_add(&old_devices->list, &fs_uuids);
1168
1169 fs_devices->seeding = 0;
1170 fs_devices->num_devices = 0;
1171 fs_devices->open_devices = 0;
1172 fs_devices->seed = old_devices;
1173
1174 generate_random_uuid(fs_devices->fsid);
1175 memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1176 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1177 super_flags = btrfs_super_flags(disk_super) &
1178 ~BTRFS_SUPER_FLAG_SEEDING;
1179 btrfs_set_super_flags(disk_super, super_flags);
1180
1181 return 0;
1182}
1183
1184/*
1185 * strore the expected generation for seed devices in device items.
1186 */
1187static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1188 struct btrfs_root *root)
1189{
1190 struct btrfs_path *path;
1191 struct extent_buffer *leaf;
1192 struct btrfs_dev_item *dev_item;
1193 struct btrfs_device *device;
1194 struct btrfs_key key;
1195 u8 fs_uuid[BTRFS_UUID_SIZE];
1196 u8 dev_uuid[BTRFS_UUID_SIZE];
1197 u64 devid;
1198 int ret;
1199
1200 path = btrfs_alloc_path();
1201 if (!path)
1202 return -ENOMEM;
1203
1204 root = root->fs_info->chunk_root;
1205 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1206 key.offset = 0;
1207 key.type = BTRFS_DEV_ITEM_KEY;
1208
1209 while (1) {
1210 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1211 if (ret < 0)
1212 goto error;
1213
1214 leaf = path->nodes[0];
1215next_slot:
1216 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1217 ret = btrfs_next_leaf(root, path);
1218 if (ret > 0)
1219 break;
1220 if (ret < 0)
1221 goto error;
1222 leaf = path->nodes[0];
1223 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1224 btrfs_release_path(root, path);
1225 continue;
1226 }
1227
1228 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1229 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1230 key.type != BTRFS_DEV_ITEM_KEY)
1231 break;
1232
1233 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1234 struct btrfs_dev_item);
1235 devid = btrfs_device_id(leaf, dev_item);
1236 read_extent_buffer(leaf, dev_uuid,
1237 (unsigned long)btrfs_device_uuid(dev_item),
1238 BTRFS_UUID_SIZE);
1239 read_extent_buffer(leaf, fs_uuid,
1240 (unsigned long)btrfs_device_fsid(dev_item),
1241 BTRFS_UUID_SIZE);
1242 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1243 BUG_ON(!device);
1244
1245 if (device->fs_devices->seeding) {
1246 btrfs_set_device_generation(leaf, dev_item,
1247 device->generation);
1248 btrfs_mark_buffer_dirty(leaf);
1249 }
1250
1251 path->slots[0]++;
1252 goto next_slot;
1253 }
1254 ret = 0;
1255error:
1256 btrfs_free_path(path);
1257 return ret;
1258}
1259
1260int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1261{
1262 struct btrfs_trans_handle *trans;
1263 struct btrfs_device *device;
1264 struct block_device *bdev;
1265 struct list_head *cur;
1266 struct list_head *devices;
1267 struct super_block *sb = root->fs_info->sb;
1268 u64 total_bytes;
1269 int seeding_dev = 0;
1270 int ret = 0;
1271
1272 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1273 return -EINVAL;
1274
1275 bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
1276 if (!bdev) {
1277 return -EIO;
1278 }
1279
1280 if (root->fs_info->fs_devices->seeding) {
1281 seeding_dev = 1;
1282 down_write(&sb->s_umount);
1283 mutex_lock(&uuid_mutex);
1284 }
1285
1286 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1287 mutex_lock(&root->fs_info->volume_mutex);
1288
1289 devices = &root->fs_info->fs_devices->devices;
1290 list_for_each(cur, devices) {
1291 device = list_entry(cur, struct btrfs_device, dev_list);
1292 if (device->bdev == bdev) {
1293 ret = -EEXIST;
1294 goto error;
1295 }
1296 }
1297
1298 device = kzalloc(sizeof(*device), GFP_NOFS);
1299 if (!device) {
1300 /* we can safely leave the fs_devices entry around */
1301 ret = -ENOMEM;
1302 goto error;
1303 }
1304
1305 device->name = kstrdup(device_path, GFP_NOFS);
1306 if (!device->name) {
1307 kfree(device);
1308 ret = -ENOMEM;
1309 goto error;
1310 }
1311
1312 ret = find_next_devid(root, &device->devid);
1313 if (ret) {
1314 kfree(device);
1315 goto error;
1316 }
1317
1318 trans = btrfs_start_transaction(root, 1);
1319 lock_chunks(root);
1320
1321 device->barriers = 1;
1322 device->writeable = 1;
1323 device->work.func = pending_bios_fn;
1324 generate_random_uuid(device->uuid);
1325 spin_lock_init(&device->io_lock);
1326 device->generation = trans->transid;
1327 device->io_width = root->sectorsize;
1328 device->io_align = root->sectorsize;
1329 device->sector_size = root->sectorsize;
1330 device->total_bytes = i_size_read(bdev->bd_inode);
1331 device->dev_root = root->fs_info->dev_root;
1332 device->bdev = bdev;
1333 device->in_fs_metadata = 1;
1334 set_blocksize(device->bdev, 4096);
1335
1336 if (seeding_dev) {
1337 sb->s_flags &= ~MS_RDONLY;
1338 ret = btrfs_prepare_sprout(trans, root);
1339 BUG_ON(ret);
1340 }
1341
1342 device->fs_devices = root->fs_info->fs_devices;
1343 list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1344 list_add(&device->dev_alloc_list,
1345 &root->fs_info->fs_devices->alloc_list);
1346 root->fs_info->fs_devices->num_devices++;
1347 root->fs_info->fs_devices->open_devices++;
1348 root->fs_info->fs_devices->rw_devices++;
1349 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1350
1351 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1352 btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1353 total_bytes + device->total_bytes);
1354
1355 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1356 btrfs_set_super_num_devices(&root->fs_info->super_copy,
1357 total_bytes + 1);
1358
1359 if (seeding_dev) {
1360 ret = init_first_rw_device(trans, root, device);
1361 BUG_ON(ret);
1362 ret = btrfs_finish_sprout(trans, root);
1363 BUG_ON(ret);
1364 } else {
1365 ret = btrfs_add_device(trans, root, device);
1366 }
1367
1368 unlock_chunks(root);
1369 btrfs_commit_transaction(trans, root);
1370
1371 if (seeding_dev) {
1372 mutex_unlock(&uuid_mutex);
1373 up_write(&sb->s_umount);
1374
1375 ret = btrfs_relocate_sys_chunks(root);
1376 BUG_ON(ret);
1377 }
1378out:
1379 mutex_unlock(&root->fs_info->volume_mutex);
1380 return ret;
1381error:
1382 close_bdev_excl(bdev);
1383 if (seeding_dev) {
1384 mutex_unlock(&uuid_mutex);
1385 up_write(&sb->s_umount);
1386 }
1387 goto out;
1388}
1389
1390int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
1391 struct btrfs_device *device)
1392{
1393 int ret;
1394 struct btrfs_path *path;
1395 struct btrfs_root *root;
1396 struct btrfs_dev_item *dev_item;
1397 struct extent_buffer *leaf;
1398 struct btrfs_key key;
1399
1400 root = device->dev_root->fs_info->chunk_root;
1401
1402 path = btrfs_alloc_path();
1403 if (!path)
1404 return -ENOMEM;
1405
1406 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1407 key.type = BTRFS_DEV_ITEM_KEY;
1408 key.offset = device->devid;
1409
1410 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1411 if (ret < 0)
1412 goto out;
1413
1414 if (ret > 0) {
1415 ret = -ENOENT;
1416 goto out;
1417 }
1418
1419 leaf = path->nodes[0];
1420 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1421
1422 btrfs_set_device_id(leaf, dev_item, device->devid);
1423 btrfs_set_device_type(leaf, dev_item, device->type);
1424 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1425 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1426 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1427 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1428 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1429 btrfs_mark_buffer_dirty(leaf);
1430
1431out:
1432 btrfs_free_path(path);
1433 return ret;
1434}
1435
1436static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1437 struct btrfs_device *device, u64 new_size)
1438{
1439 struct btrfs_super_block *super_copy =
1440 &device->dev_root->fs_info->super_copy;
1441 u64 old_total = btrfs_super_total_bytes(super_copy);
1442 u64 diff = new_size - device->total_bytes;
1443
1444 if (!device->writeable)
1445 return -EACCES;
1446 if (new_size <= device->total_bytes)
1447 return -EINVAL;
1448
1449 btrfs_set_super_total_bytes(super_copy, old_total + diff);
1450 device->fs_devices->total_rw_bytes += diff;
1451
1452 device->total_bytes = new_size;
1453 return btrfs_update_device(trans, device);
1454}
1455
1456int btrfs_grow_device(struct btrfs_trans_handle *trans,
1457 struct btrfs_device *device, u64 new_size)
1458{
1459 int ret;
1460 lock_chunks(device->dev_root);
1461 ret = __btrfs_grow_device(trans, device, new_size);
1462 unlock_chunks(device->dev_root);
1463 return ret;
1464}
1465
1466static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1467 struct btrfs_root *root,
1468 u64 chunk_tree, u64 chunk_objectid,
1469 u64 chunk_offset)
1470{
1471 int ret;
1472 struct btrfs_path *path;
1473 struct btrfs_key key;
1474
1475 root = root->fs_info->chunk_root;
1476 path = btrfs_alloc_path();
1477 if (!path)
1478 return -ENOMEM;
1479
1480 key.objectid = chunk_objectid;
1481 key.offset = chunk_offset;
1482 key.type = BTRFS_CHUNK_ITEM_KEY;
1483
1484 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1485 BUG_ON(ret);
1486
1487 ret = btrfs_del_item(trans, root, path);
1488 BUG_ON(ret);
1489
1490 btrfs_free_path(path);
1491 return 0;
1492}
1493
1494int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1495 chunk_offset)
1496{
1497 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1498 struct btrfs_disk_key *disk_key;
1499 struct btrfs_chunk *chunk;
1500 u8 *ptr;
1501 int ret = 0;
1502 u32 num_stripes;
1503 u32 array_size;
1504 u32 len = 0;
1505 u32 cur;
1506 struct btrfs_key key;
1507
1508 array_size = btrfs_super_sys_array_size(super_copy);
1509
1510 ptr = super_copy->sys_chunk_array;
1511 cur = 0;
1512
1513 while (cur < array_size) {
1514 disk_key = (struct btrfs_disk_key *)ptr;
1515 btrfs_disk_key_to_cpu(&key, disk_key);
1516
1517 len = sizeof(*disk_key);
1518
1519 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1520 chunk = (struct btrfs_chunk *)(ptr + len);
1521 num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1522 len += btrfs_chunk_item_size(num_stripes);
1523 } else {
1524 ret = -EIO;
1525 break;
1526 }
1527 if (key.objectid == chunk_objectid &&
1528 key.offset == chunk_offset) {
1529 memmove(ptr, ptr + len, array_size - (cur + len));
1530 array_size -= len;
1531 btrfs_set_super_sys_array_size(super_copy, array_size);
1532 } else {
1533 ptr += len;
1534 cur += len;
1535 }
1536 }
1537 return ret;
1538}
1539
1540int btrfs_relocate_chunk(struct btrfs_root *root,
1541 u64 chunk_tree, u64 chunk_objectid,
1542 u64 chunk_offset)
1543{
1544 struct extent_map_tree *em_tree;
1545 struct btrfs_root *extent_root;
1546 struct btrfs_trans_handle *trans;
1547 struct extent_map *em;
1548 struct map_lookup *map;
1549 int ret;
1550 int i;
1551
1552 printk("btrfs relocating chunk %llu\n",
1553 (unsigned long long)chunk_offset);
1554 root = root->fs_info->chunk_root;
1555 extent_root = root->fs_info->extent_root;
1556 em_tree = &root->fs_info->mapping_tree.map_tree;
1557
1558 /* step one, relocate all the extents inside this chunk */
1559 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1560 BUG_ON(ret);
1561
1562 trans = btrfs_start_transaction(root, 1);
1563 BUG_ON(!trans);
1564
1565 lock_chunks(root);
1566
1567 /*
1568 * step two, delete the device extents and the
1569 * chunk tree entries
1570 */
1571 spin_lock(&em_tree->lock);
1572 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1573 spin_unlock(&em_tree->lock);
1574
1575 BUG_ON(em->start > chunk_offset ||
1576 em->start + em->len < chunk_offset);
1577 map = (struct map_lookup *)em->bdev;
1578
1579 for (i = 0; i < map->num_stripes; i++) {
1580 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1581 map->stripes[i].physical);
1582 BUG_ON(ret);
1583
1584 if (map->stripes[i].dev) {
1585 ret = btrfs_update_device(trans, map->stripes[i].dev);
1586 BUG_ON(ret);
1587 }
1588 }
1589 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1590 chunk_offset);
1591
1592 BUG_ON(ret);
1593
1594 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1595 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1596 BUG_ON(ret);
1597 }
1598
1599 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1600 BUG_ON(ret);
1601
1602 spin_lock(&em_tree->lock);
1603 remove_extent_mapping(em_tree, em);
1604 spin_unlock(&em_tree->lock);
1605
1606 kfree(map);
1607 em->bdev = NULL;
1608
1609 /* once for the tree */
1610 free_extent_map(em);
1611 /* once for us */
1612 free_extent_map(em);
1613
1614 unlock_chunks(root);
1615 btrfs_end_transaction(trans, root);
1616 return 0;
1617}
1618
1619static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1620{
1621 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
1622 struct btrfs_path *path;
1623 struct extent_buffer *leaf;
1624 struct btrfs_chunk *chunk;
1625 struct btrfs_key key;
1626 struct btrfs_key found_key;
1627 u64 chunk_tree = chunk_root->root_key.objectid;
1628 u64 chunk_type;
1629 int ret;
1630
1631 path = btrfs_alloc_path();
1632 if (!path)
1633 return -ENOMEM;
1634
1635 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1636 key.offset = (u64)-1;
1637 key.type = BTRFS_CHUNK_ITEM_KEY;
1638
1639 while (1) {
1640 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1641 if (ret < 0)
1642 goto error;
1643 BUG_ON(ret == 0);
1644
1645 ret = btrfs_previous_item(chunk_root, path, key.objectid,
1646 key.type);
1647 if (ret < 0)
1648 goto error;
1649 if (ret > 0)
1650 break;
1651
1652 leaf = path->nodes[0];
1653 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1654
1655 chunk = btrfs_item_ptr(leaf, path->slots[0],
1656 struct btrfs_chunk);
1657 chunk_type = btrfs_chunk_type(leaf, chunk);
1658 btrfs_release_path(chunk_root, path);
1659
1660 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1661 ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1662 found_key.objectid,
1663 found_key.offset);
1664 BUG_ON(ret);
1665 }
1666
1667 if (found_key.offset == 0)
1668 break;
1669 key.offset = found_key.offset - 1;
1670 }
1671 ret = 0;
1672error:
1673 btrfs_free_path(path);
1674 return ret;
1675}
1676
1677static u64 div_factor(u64 num, int factor)
1678{
1679 if (factor == 10)
1680 return num;
1681 num *= factor;
1682 do_div(num, 10);
1683 return num;
1684}
1685
1686int btrfs_balance(struct btrfs_root *dev_root)
1687{
1688 int ret;
1689 struct list_head *cur;
1690 struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1691 struct btrfs_device *device;
1692 u64 old_size;
1693 u64 size_to_free;
1694 struct btrfs_path *path;
1695 struct btrfs_key key;
1696 struct btrfs_chunk *chunk;
1697 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1698 struct btrfs_trans_handle *trans;
1699 struct btrfs_key found_key;
1700
1701 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1702 return -EROFS;
1703
1704 mutex_lock(&dev_root->fs_info->volume_mutex);
1705 dev_root = dev_root->fs_info->dev_root;
1706
1707 /* step one make some room on all the devices */
1708 list_for_each(cur, devices) {
1709 device = list_entry(cur, struct btrfs_device, dev_list);
1710 old_size = device->total_bytes;
1711 size_to_free = div_factor(old_size, 1);
1712 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
1713 if (!device->writeable ||
1714 device->total_bytes - device->bytes_used > size_to_free)
1715 continue;
1716
1717 ret = btrfs_shrink_device(device, old_size - size_to_free);
1718 BUG_ON(ret);
1719
1720 trans = btrfs_start_transaction(dev_root, 1);
1721 BUG_ON(!trans);
1722
1723 ret = btrfs_grow_device(trans, device, old_size);
1724 BUG_ON(ret);
1725
1726 btrfs_end_transaction(trans, dev_root);
1727 }
1728
1729 /* step two, relocate all the chunks */
1730 path = btrfs_alloc_path();
1731 BUG_ON(!path);
1732
1733 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1734 key.offset = (u64)-1;
1735 key.type = BTRFS_CHUNK_ITEM_KEY;
1736
1737 while(1) {
1738 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1739 if (ret < 0)
1740 goto error;
1741
1742 /*
1743 * this shouldn't happen, it means the last relocate
1744 * failed
1745 */
1746 if (ret == 0)
1747 break;
1748
1749 ret = btrfs_previous_item(chunk_root, path, 0,
1750 BTRFS_CHUNK_ITEM_KEY);
1751 if (ret)
1752 break;
1753
1754 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1755 path->slots[0]);
1756 if (found_key.objectid != key.objectid)
1757 break;
1758
1759 chunk = btrfs_item_ptr(path->nodes[0],
1760 path->slots[0],
1761 struct btrfs_chunk);
1762 key.offset = found_key.offset;
1763 /* chunk zero is special */
1764 if (key.offset == 0)
1765 break;
1766
1767 btrfs_release_path(chunk_root, path);
1768 ret = btrfs_relocate_chunk(chunk_root,
1769 chunk_root->root_key.objectid,
1770 found_key.objectid,
1771 found_key.offset);
1772 BUG_ON(ret);
1773 }
1774 ret = 0;
1775error:
1776 btrfs_free_path(path);
1777 mutex_unlock(&dev_root->fs_info->volume_mutex);
1778 return ret;
1779}
1780
1781/*
1782 * shrinking a device means finding all of the device extents past
1783 * the new size, and then following the back refs to the chunks.
1784 * The chunk relocation code actually frees the device extent
1785 */
1786int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1787{
1788 struct btrfs_trans_handle *trans;
1789 struct btrfs_root *root = device->dev_root;
1790 struct btrfs_dev_extent *dev_extent = NULL;
1791 struct btrfs_path *path;
1792 u64 length;
1793 u64 chunk_tree;
1794 u64 chunk_objectid;
1795 u64 chunk_offset;
1796 int ret;
1797 int slot;
1798 struct extent_buffer *l;
1799 struct btrfs_key key;
1800 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1801 u64 old_total = btrfs_super_total_bytes(super_copy);
1802 u64 diff = device->total_bytes - new_size;
1803
1804 if (new_size >= device->total_bytes)
1805 return -EINVAL;
1806
1807 path = btrfs_alloc_path();
1808 if (!path)
1809 return -ENOMEM;
1810
1811 trans = btrfs_start_transaction(root, 1);
1812 if (!trans) {
1813 ret = -ENOMEM;
1814 goto done;
1815 }
1816
1817 path->reada = 2;
1818
1819 lock_chunks(root);
1820
1821 device->total_bytes = new_size;
1822 if (device->writeable)
1823 device->fs_devices->total_rw_bytes -= diff;
1824 ret = btrfs_update_device(trans, device);
1825 if (ret) {
1826 unlock_chunks(root);
1827 btrfs_end_transaction(trans, root);
1828 goto done;
1829 }
1830 WARN_ON(diff > old_total);
1831 btrfs_set_super_total_bytes(super_copy, old_total - diff);
1832 unlock_chunks(root);
1833 btrfs_end_transaction(trans, root);
1834
1835 key.objectid = device->devid;
1836 key.offset = (u64)-1;
1837 key.type = BTRFS_DEV_EXTENT_KEY;
1838
1839 while (1) {
1840 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1841 if (ret < 0)
1842 goto done;
1843
1844 ret = btrfs_previous_item(root, path, 0, key.type);
1845 if (ret < 0)
1846 goto done;
1847 if (ret) {
1848 ret = 0;
1849 goto done;
1850 }
1851
1852 l = path->nodes[0];
1853 slot = path->slots[0];
1854 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1855
1856 if (key.objectid != device->devid)
1857 goto done;
1858
1859 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1860 length = btrfs_dev_extent_length(l, dev_extent);
1861
1862 if (key.offset + length <= new_size)
1863 goto done;
1864
1865 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1866 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
1867 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
1868 btrfs_release_path(root, path);
1869
1870 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
1871 chunk_offset);
1872 if (ret)
1873 goto done;
1874 }
1875
1876done:
1877 btrfs_free_path(path);
1878 return ret;
1879}
1880
1881int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
1882 struct btrfs_root *root,
1883 struct btrfs_key *key,
1884 struct btrfs_chunk *chunk, int item_size)
1885{
1886 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1887 struct btrfs_disk_key disk_key;
1888 u32 array_size;
1889 u8 *ptr;
1890
1891 array_size = btrfs_super_sys_array_size(super_copy);
1892 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
1893 return -EFBIG;
1894
1895 ptr = super_copy->sys_chunk_array + array_size;
1896 btrfs_cpu_key_to_disk(&disk_key, key);
1897 memcpy(ptr, &disk_key, sizeof(disk_key));
1898 ptr += sizeof(disk_key);
1899 memcpy(ptr, chunk, item_size);
1900 item_size += sizeof(disk_key);
1901 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
1902 return 0;
1903}
1904
1905static u64 noinline chunk_bytes_by_type(u64 type, u64 calc_size,
1906 int num_stripes, int sub_stripes)
1907{
1908 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
1909 return calc_size;
1910 else if (type & BTRFS_BLOCK_GROUP_RAID10)
1911 return calc_size * (num_stripes / sub_stripes);
1912 else
1913 return calc_size * num_stripes;
1914}
1915
/*
 * first half of allocating a chunk of the given type at logical offset
 * 'start': choose the devices and physical offsets for its stripes,
 * insert the extent mapping for it, create the block group and allocate
 * the device extents.  The chunk item itself is written afterwards by
 * __finish_chunk_alloc().
 *
 * on success *map_ret holds the stripe map (it is also stashed in the
 * extent map's bdev pointer), *stripe_size the size of one stripe and
 * *num_bytes the logical size of the chunk.
 *
 * returns 0, -ENOSPC when not enough devices or space are available,
 * -ENOMEM on allocation failure.
 */
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
			       struct btrfs_root *extent_root,
			       struct map_lookup **map_ret,
			       u64 *num_bytes, u64 *stripe_size,
			       u64 start, u64 type)
{
	struct btrfs_fs_info *info = extent_root->fs_info;
	struct btrfs_device *device = NULL;
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	struct list_head *cur;
	struct map_lookup *map = NULL;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct list_head private_devs;
	int min_stripe_size = 1 * 1024 * 1024;
	u64 calc_size = 1024 * 1024 * 1024;
	u64 max_chunk_size = calc_size;
	u64 min_free;
	u64 avail;
	u64 max_avail = 0;
	u64 dev_offset;
	int num_stripes = 1;
	int min_stripes = 1;
	int sub_stripes = 0;
	int looped = 0;
	int ret;
	int index;
	int stripe_len = 64 * 1024;

	/* RAID1 and DUP together make no sense, RAID1 wins */
	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
	    (type & BTRFS_BLOCK_GROUP_DUP)) {
		WARN_ON(1);
		type &= ~BTRFS_BLOCK_GROUP_DUP;
	}
	if (list_empty(&fs_devices->alloc_list))
		return -ENOSPC;

	/* wanted and minimum stripe counts for each profile */
	if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
		num_stripes = fs_devices->rw_devices;
		min_stripes = 2;
	}
	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
		num_stripes = 2;
		min_stripes = 2;
	}
	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
		num_stripes = min_t(u64, 2, fs_devices->rw_devices);
		if (num_stripes < 2)
			return -ENOSPC;
		min_stripes = 2;
	}
	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
		num_stripes = fs_devices->rw_devices;
		if (num_stripes < 4)
			return -ENOSPC;
		/* round down to an even number of stripes */
		num_stripes &= ~(u32)1;
		sub_stripes = 2;
		min_stripes = 4;
	}

	/* size limits depend on what the chunk is going to hold */
	if (type & BTRFS_BLOCK_GROUP_DATA) {
		max_chunk_size = 10 * calc_size;
		min_stripe_size = 64 * 1024 * 1024;
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		max_chunk_size = 4 * calc_size;
		min_stripe_size = 32 * 1024 * 1024;
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		calc_size = 8 * 1024 * 1024;
		max_chunk_size = calc_size * 2;
		min_stripe_size = 1 * 1024 * 1024;
	}

	/* we don't want a chunk larger than 10% of writeable space */
	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
			     max_chunk_size);

again:
	/* (re)allocate the map whenever the stripe count has changed */
	if (!map || map->num_stripes != num_stripes) {
		kfree(map);
		map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
		if (!map)
			return -ENOMEM;
		map->num_stripes = num_stripes;
	}

	if (calc_size * num_stripes > max_chunk_size) {
		calc_size = max_chunk_size;
		do_div(calc_size, num_stripes);
		do_div(calc_size, stripe_len);
		calc_size *= stripe_len;
	}
	/* we don't want tiny stripes */
	calc_size = max_t(u64, min_stripe_size, calc_size);

	/* round the stripe size down to a multiple of stripe_len */
	do_div(calc_size, stripe_len);
	calc_size *= stripe_len;

	cur = fs_devices->alloc_list.next;
	index = 0;

	/* DUP puts both copies on the same device */
	if (type & BTRFS_BLOCK_GROUP_DUP)
		min_free = calc_size * 2;
	else
		min_free = calc_size;

	/*
	 * we add 1MB because we never use the first 1MB of the device, unless
	 * we've looped, then we are likely allocating the maximum amount of
	 * space left already
	 */
	if (!looped)
		min_free += 1024 * 1024;

	/*
	 * devices picked for a stripe are parked on private_devs and
	 * spliced back onto the alloc list once the scan is over
	 */
	INIT_LIST_HEAD(&private_devs);
	while(index < num_stripes) {
		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
		BUG_ON(!device->writeable);
		if (device->total_bytes > device->bytes_used)
			avail = device->total_bytes - device->bytes_used;
		else
			avail = 0;
		cur = cur->next;

		if (device->in_fs_metadata && avail >= min_free) {
			ret = find_free_dev_extent(trans, device,
						   min_free, &dev_offset);
			if (ret == 0) {
				list_move_tail(&device->dev_alloc_list,
					       &private_devs);
				map->stripes[index].dev = device;
				map->stripes[index].physical = dev_offset;
				index++;
				if (type & BTRFS_BLOCK_GROUP_DUP) {
					/* second copy right behind the first */
					map->stripes[index].dev = device;
					map->stripes[index].physical =
						dev_offset + calc_size;
					index++;
				}
			}
		} else if (device->in_fs_metadata && avail > max_avail)
			max_avail = avail;
		if (cur == &fs_devices->alloc_list)
			break;
	}
	list_splice(&private_devs, &fs_devices->alloc_list);
	if (index < num_stripes) {
		/* fewer stripes than wanted, retry with what was found */
		if (index >= min_stripes) {
			num_stripes = index;
			if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
				num_stripes /= sub_stripes;
				num_stripes *= sub_stripes;
			}
			looped = 1;
			goto again;
		}
		/* one more try with the largest free space seen */
		if (!looped && max_avail > 0) {
			looped = 1;
			calc_size = max_avail;
			goto again;
		}
		kfree(map);
		return -ENOSPC;
	}
	map->sector_size = extent_root->sectorsize;
	map->stripe_len = stripe_len;
	map->io_align = stripe_len;
	map->io_width = stripe_len;
	map->type = type;
	map->num_stripes = num_stripes;
	map->sub_stripes = sub_stripes;

	*map_ret = map;
	*stripe_size = calc_size;
	*num_bytes = chunk_bytes_by_type(type, calc_size,
					 num_stripes, sub_stripes);

	em = alloc_extent_map(GFP_NOFS);
	if (!em) {
		kfree(map);
		return -ENOMEM;
	}
	/* the stripe map rides along in the bdev pointer of the mapping */
	em->bdev = (struct block_device *)map;
	em->start = start;
	em->len = *num_bytes;
	em->block_start = 0;
	em->block_len = em->len;

	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
	spin_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em);
	spin_unlock(&em_tree->lock);
	BUG_ON(ret);
	free_extent_map(em);

	ret = btrfs_make_block_group(trans, extent_root, 0, type,
				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
				     start, *num_bytes);
	BUG_ON(ret);

	/* one device extent of calc_size bytes per stripe */
	index = 0;
	while (index < map->num_stripes) {
		device = map->stripes[index].dev;
		dev_offset = map->stripes[index].physical;

		ret = btrfs_alloc_dev_extent(trans, device,
				info->chunk_root->root_key.objectid,
				BTRFS_FIRST_CHUNK_TREE_OBJECTID,
				start, dev_offset, calc_size);
		BUG_ON(ret);
		index++;
	}

	return 0;
}
2130
2131static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2132 struct btrfs_root *extent_root,
2133 struct map_lookup *map, u64 chunk_offset,
2134 u64 chunk_size, u64 stripe_size)
2135{
2136 u64 dev_offset;
2137 struct btrfs_key key;
2138 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2139 struct btrfs_device *device;
2140 struct btrfs_chunk *chunk;
2141 struct btrfs_stripe *stripe;
2142 size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2143 int index = 0;
2144 int ret;
2145
2146 chunk = kzalloc(item_size, GFP_NOFS);
2147 if (!chunk)
2148 return -ENOMEM;
2149
2150 index = 0;
2151 while (index < map->num_stripes) {
2152 device = map->stripes[index].dev;
2153 device->bytes_used += stripe_size;
2154 ret = btrfs_update_device(trans, device);
2155 BUG_ON(ret);
2156 index++;
2157 }
2158
2159 index = 0;
2160 stripe = &chunk->stripe;
2161 while (index < map->num_stripes) {
2162 device = map->stripes[index].dev;
2163 dev_offset = map->stripes[index].physical;
2164
2165 btrfs_set_stack_stripe_devid(stripe, device->devid);
2166 btrfs_set_stack_stripe_offset(stripe, dev_offset);
2167 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2168 stripe++;
2169 index++;
2170 }
2171
2172 btrfs_set_stack_chunk_length(chunk, chunk_size);
2173 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2174 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2175 btrfs_set_stack_chunk_type(chunk, map->type);
2176 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2177 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2178 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2179 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2180 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2181
2182 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2183 key.type = BTRFS_CHUNK_ITEM_KEY;
2184 key.offset = chunk_offset;
2185
2186 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2187 BUG_ON(ret);
2188
2189 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2190 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2191 item_size);
2192 BUG_ON(ret);
2193 }
2194 kfree(chunk);
2195 return 0;
2196}
2197
2198/*
2199 * Chunk allocation falls into two parts. The first part does works
2200 * that make the new allocated chunk useable, but not do any operation
2201 * that modifies the chunk tree. The second part does the works that
2202 * require modifying the chunk tree. This division is important for the
2203 * bootstrap process of adding storage to a seed btrfs.
2204 */
2205int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2206 struct btrfs_root *extent_root, u64 type)
2207{
2208 u64 chunk_offset;
2209 u64 chunk_size;
2210 u64 stripe_size;
2211 struct map_lookup *map;
2212 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2213 int ret;
2214
2215 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2216 &chunk_offset);
2217 if (ret)
2218 return ret;
2219
2220 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2221 &stripe_size, chunk_offset, type);
2222 if (ret)
2223 return ret;
2224
2225 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2226 chunk_size, stripe_size);
2227 BUG_ON(ret);
2228 return 0;
2229}
2230
2231static int noinline init_first_rw_device(struct btrfs_trans_handle *trans,
2232 struct btrfs_root *root,
2233 struct btrfs_device *device)
2234{
2235 u64 chunk_offset;
2236 u64 sys_chunk_offset;
2237 u64 chunk_size;
2238 u64 sys_chunk_size;
2239 u64 stripe_size;
2240 u64 sys_stripe_size;
2241 u64 alloc_profile;
2242 struct map_lookup *map;
2243 struct map_lookup *sys_map;
2244 struct btrfs_fs_info *fs_info = root->fs_info;
2245 struct btrfs_root *extent_root = fs_info->extent_root;
2246 int ret;
2247
2248 ret = find_next_chunk(fs_info->chunk_root,
2249 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2250 BUG_ON(ret);
2251
2252 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2253 (fs_info->metadata_alloc_profile &
2254 fs_info->avail_metadata_alloc_bits);
2255 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2256
2257 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2258 &stripe_size, chunk_offset, alloc_profile);
2259 BUG_ON(ret);
2260
2261 sys_chunk_offset = chunk_offset + chunk_size;
2262
2263 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2264 (fs_info->system_alloc_profile &
2265 fs_info->avail_system_alloc_bits);
2266 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2267
2268 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2269 &sys_chunk_size, &sys_stripe_size,
2270 sys_chunk_offset, alloc_profile);
2271 BUG_ON(ret);
2272
2273 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2274 BUG_ON(ret);
2275
2276 /*
2277 * Modifying chunk tree needs allocating new blocks from both
2278 * system block group and metadata block group. So we only can
2279 * do operations require modifying the chunk tree after both
2280 * block groups were created.
2281 */
2282 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2283 chunk_size, stripe_size);
2284 BUG_ON(ret);
2285
2286 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2287 sys_chunk_offset, sys_chunk_size,
2288 sys_stripe_size);
2289 BUG_ON(ret);
2290 return 0;
2291}
2292
2293int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2294{
2295 struct extent_map *em;
2296 struct map_lookup *map;
2297 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2298 int readonly = 0;
2299 int i;
2300
2301 spin_lock(&map_tree->map_tree.lock);
2302 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2303 spin_unlock(&map_tree->map_tree.lock);
2304 if (!em)
2305 return 1;
2306
2307 map = (struct map_lookup *)em->bdev;
2308 for (i = 0; i < map->num_stripes; i++) {
2309 if (!map->stripes[i].dev->writeable) {
2310 readonly = 1;
2311 break;
2312 }
2313 }
2314 free_extent_map(em);
2315 return readonly;
2316}
2317
2318void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2319{
2320 extent_map_tree_init(&tree->map_tree, GFP_NOFS);
2321}
2322
2323void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2324{
2325 struct extent_map *em;
2326
2327 while(1) {
2328 spin_lock(&tree->map_tree.lock);
2329 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2330 if (em)
2331 remove_extent_mapping(&tree->map_tree, em);
2332 spin_unlock(&tree->map_tree.lock);
2333 if (!em)
2334 break;
2335 kfree(em->bdev);
2336 /* once for us */
2337 free_extent_map(em);
2338 /* once for the tree */
2339 free_extent_map(em);
2340 }
2341}
2342
2343int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2344{
2345 struct extent_map *em;
2346 struct map_lookup *map;
2347 struct extent_map_tree *em_tree = &map_tree->map_tree;
2348 int ret;
2349
2350 spin_lock(&em_tree->lock);
2351 em = lookup_extent_mapping(em_tree, logical, len);
2352 spin_unlock(&em_tree->lock);
2353 BUG_ON(!em);
2354
2355 BUG_ON(em->start > logical || em->start + em->len < logical);
2356 map = (struct map_lookup *)em->bdev;
2357 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2358 ret = map->num_stripes;
2359 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2360 ret = map->sub_stripes;
2361 else
2362 ret = 1;
2363 free_extent_map(em);
2364 return ret;
2365}
2366
2367static int find_live_mirror(struct map_lookup *map, int first, int num,
2368 int optimal)
2369{
2370 int i;
2371 if (map->stripes[optimal].dev->bdev)
2372 return optimal;
2373 for (i = first; i < first + num; i++) {
2374 if (map->stripes[i].dev->bdev)
2375 return i;
2376 }
2377 /* we couldn't find one that doesn't fail. Just return something
2378 * and the io error handling code will clean up eventually
2379 */
2380 return optimal;
2381}
2382
2383static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2384 u64 logical, u64 *length,
2385 struct btrfs_multi_bio **multi_ret,
2386 int mirror_num, struct page *unplug_page)
2387{
2388 struct extent_map *em;
2389 struct map_lookup *map;
2390 struct extent_map_tree *em_tree = &map_tree->map_tree;
2391 u64 offset;
2392 u64 stripe_offset;
2393 u64 stripe_nr;
2394 int stripes_allocated = 8;
2395 int stripes_required = 1;
2396 int stripe_index;
2397 int i;
2398 int num_stripes;
2399 int max_errors = 0;
2400 struct btrfs_multi_bio *multi = NULL;
2401
2402 if (multi_ret && !(rw & (1 << BIO_RW))) {
2403 stripes_allocated = 1;
2404 }
2405again:
2406 if (multi_ret) {
2407 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
2408 GFP_NOFS);
2409 if (!multi)
2410 return -ENOMEM;
2411
2412 atomic_set(&multi->error, 0);
2413 }
2414
2415 spin_lock(&em_tree->lock);
2416 em = lookup_extent_mapping(em_tree, logical, *length);
2417 spin_unlock(&em_tree->lock);
2418
2419 if (!em && unplug_page)
2420 return 0;
2421
2422 if (!em) {
2423 printk("unable to find logical %Lu len %Lu\n", logical, *length);
2424 BUG();
2425 }
2426
2427 BUG_ON(em->start > logical || em->start + em->len < logical);
2428 map = (struct map_lookup *)em->bdev;
2429 offset = logical - em->start;
2430
2431 if (mirror_num > map->num_stripes)
2432 mirror_num = 0;
2433
2434 /* if our multi bio struct is too small, back off and try again */
2435 if (rw & (1 << BIO_RW)) {
2436 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2437 BTRFS_BLOCK_GROUP_DUP)) {
2438 stripes_required = map->num_stripes;
2439 max_errors = 1;
2440 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2441 stripes_required = map->sub_stripes;
2442 max_errors = 1;
2443 }
2444 }
2445 if (multi_ret && rw == WRITE &&
2446 stripes_allocated < stripes_required) {
2447 stripes_allocated = map->num_stripes;
2448 free_extent_map(em);
2449 kfree(multi);
2450 goto again;
2451 }
2452 stripe_nr = offset;
2453 /*
2454 * stripe_nr counts the total number of stripes we have to stride
2455 * to get to this block
2456 */
2457 do_div(stripe_nr, map->stripe_len);
2458
2459 stripe_offset = stripe_nr * map->stripe_len;
2460 BUG_ON(offset < stripe_offset);
2461
2462 /* stripe_offset is the offset of this block in its stripe*/
2463 stripe_offset = offset - stripe_offset;
2464
2465 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2466 BTRFS_BLOCK_GROUP_RAID10 |
2467 BTRFS_BLOCK_GROUP_DUP)) {
2468 /* we limit the length of each bio to what fits in a stripe */
2469 *length = min_t(u64, em->len - offset,
2470 map->stripe_len - stripe_offset);
2471 } else {
2472 *length = em->len - offset;
2473 }
2474
2475 if (!multi_ret && !unplug_page)
2476 goto out;
2477
2478 num_stripes = 1;
2479 stripe_index = 0;
2480 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2481 if (unplug_page || (rw & (1 << BIO_RW)))
2482 num_stripes = map->num_stripes;
2483 else if (mirror_num)
2484 stripe_index = mirror_num - 1;
2485 else {
2486 stripe_index = find_live_mirror(map, 0,
2487 map->num_stripes,
2488 current->pid % map->num_stripes);
2489 }
2490
2491 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2492 if (rw & (1 << BIO_RW))
2493 num_stripes = map->num_stripes;
2494 else if (mirror_num)
2495 stripe_index = mirror_num - 1;
2496
2497 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2498 int factor = map->num_stripes / map->sub_stripes;
2499
2500 stripe_index = do_div(stripe_nr, factor);
2501 stripe_index *= map->sub_stripes;
2502
2503 if (unplug_page || (rw & (1 << BIO_RW)))
2504 num_stripes = map->sub_stripes;
2505 else if (mirror_num)
2506 stripe_index += mirror_num - 1;
2507 else {
2508 stripe_index = find_live_mirror(map, stripe_index,
2509 map->sub_stripes, stripe_index +
2510 current->pid % map->sub_stripes);
2511 }
2512 } else {
2513 /*
2514 * after this do_div call, stripe_nr is the number of stripes
2515 * on this device we have to walk to find the data, and
2516 * stripe_index is the number of our device in the stripe array
2517 */
2518 stripe_index = do_div(stripe_nr, map->num_stripes);
2519 }
2520 BUG_ON(stripe_index >= map->num_stripes);
2521
2522 for (i = 0; i < num_stripes; i++) {
2523 if (unplug_page) {
2524 struct btrfs_device *device;
2525 struct backing_dev_info *bdi;
2526
2527 device = map->stripes[stripe_index].dev;
2528 if (device->bdev) {
2529 bdi = blk_get_backing_dev_info(device->bdev);
2530 if (bdi->unplug_io_fn) {
2531 bdi->unplug_io_fn(bdi, unplug_page);
2532 }
2533 }
2534 } else {
2535 multi->stripes[i].physical =
2536 map->stripes[stripe_index].physical +
2537 stripe_offset + stripe_nr * map->stripe_len;
2538 multi->stripes[i].dev = map->stripes[stripe_index].dev;
2539 }
2540 stripe_index++;
2541 }
2542 if (multi_ret) {
2543 *multi_ret = multi;
2544 multi->num_stripes = num_stripes;
2545 multi->max_errors = max_errors;
2546 }
2547out:
2548 free_extent_map(em);
2549 return 0;
2550}
2551
2552int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2553 u64 logical, u64 *length,
2554 struct btrfs_multi_bio **multi_ret, int mirror_num)
2555{
2556 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
2557 mirror_num, NULL);
2558}
2559
2560int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
2561 u64 logical, struct page *page)
2562{
2563 u64 length = PAGE_CACHE_SIZE;
2564 return __btrfs_map_block(map_tree, READ, logical, &length,
2565 NULL, 0, page);
2566}
2567
2568
2569static void end_bio_multi_stripe(struct bio *bio, int err)
2570{
2571 struct btrfs_multi_bio *multi = bio->bi_private;
2572 int is_orig_bio = 0;
2573
2574 if (err)
2575 atomic_inc(&multi->error);
2576
2577 if (bio == multi->orig_bio)
2578 is_orig_bio = 1;
2579
2580 if (atomic_dec_and_test(&multi->stripes_pending)) {
2581 if (!is_orig_bio) {
2582 bio_put(bio);
2583 bio = multi->orig_bio;
2584 }
2585 bio->bi_private = multi->private;
2586 bio->bi_end_io = multi->end_io;
2587 /* only send an error to the higher layers if it is
2588 * beyond the tolerance of the multi-bio
2589 */
2590 if (atomic_read(&multi->error) > multi->max_errors) {
2591 err = -EIO;
2592 } else if (err) {
2593 /*
2594 * this bio is actually up to date, we didn't
2595 * go over the max number of errors
2596 */
2597 set_bit(BIO_UPTODATE, &bio->bi_flags);
2598 err = 0;
2599 }
2600 kfree(multi);
2601
2602 bio_endio(bio, err);
2603 } else if (!is_orig_bio) {
2604 bio_put(bio);
2605 }
2606}
2607
/*
 * Groups a bio with its rw flags, the filesystem info and a btrfs_work
 * item.
 * NOTE(review): nothing in the visible part of this file references this
 * struct -- confirm it is still used before relying on it.
 */
2608struct async_sched {
2609 struct bio *bio;
2610 int rw;
2611 struct btrfs_fs_info *info;
2612 struct btrfs_work work;
2613};
2614
2615/*
2616 * see run_scheduled_bios for a description of why bios are collected for
2617 * async submit.
2618 *
2619 * This will add one bio to the pending list for a device and make sure
2620 * the work struct is scheduled.
2621 */
2622static int noinline schedule_bio(struct btrfs_root *root,
2623 struct btrfs_device *device,
2624 int rw, struct bio *bio)
2625{
2626 int should_queue = 1;
2627
2628 /* don't bother with additional async steps for reads, right now */
2629 if (!(rw & (1 << BIO_RW))) {
2630 bio_get(bio);
2631 submit_bio(rw, bio);
2632 bio_put(bio);
2633 return 0;
2634 }
2635
2636 /*
2637 * nr_async_bios allows us to reliably return congestion to the
2638 * higher layers. Otherwise, the async bio makes it appear we have
2639 * made progress against dirty pages when we've really just put it
2640 * on a queue for later
2641 */
2642 atomic_inc(&root->fs_info->nr_async_bios);
2643 WARN_ON(bio->bi_next);
2644 bio->bi_next = NULL;
2645 bio->bi_rw |= rw;
2646
2647 spin_lock(&device->io_lock);
2648
2649 if (device->pending_bio_tail)
2650 device->pending_bio_tail->bi_next = bio;
2651
2652 device->pending_bio_tail = bio;
2653 if (!device->pending_bios)
2654 device->pending_bios = bio;
2655 if (device->running_pending)
2656 should_queue = 0;
2657
2658 spin_unlock(&device->io_lock);
2659
2660 if (should_queue)
2661 btrfs_queue_worker(&root->fs_info->submit_workers,
2662 &device->work);
2663 return 0;
2664}
2665
2666int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
2667 int mirror_num, int async_submit)
2668{
2669 struct btrfs_mapping_tree *map_tree;
2670 struct btrfs_device *dev;
2671 struct bio *first_bio = bio;
2672 u64 logical = (u64)bio->bi_sector << 9;
2673 u64 length = 0;
2674 u64 map_length;
2675 struct btrfs_multi_bio *multi = NULL;
2676 int ret;
2677 int dev_nr = 0;
2678 int total_devs = 1;
2679
2680 length = bio->bi_size;
2681 map_tree = &root->fs_info->mapping_tree;
2682 map_length = length;
2683
2684 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
2685 mirror_num);
2686 BUG_ON(ret);
2687
2688 total_devs = multi->num_stripes;
2689 if (map_length < length) {
2690 printk("mapping failed logical %Lu bio len %Lu "
2691 "len %Lu\n", logical, length, map_length);
2692 BUG();
2693 }
2694 multi->end_io = first_bio->bi_end_io;
2695 multi->private = first_bio->bi_private;
2696 multi->orig_bio = first_bio;
2697 atomic_set(&multi->stripes_pending, multi->num_stripes);
2698
2699 while(dev_nr < total_devs) {
2700 if (total_devs > 1) {
2701 if (dev_nr < total_devs - 1) {
2702 bio = bio_clone(first_bio, GFP_NOFS);
2703 BUG_ON(!bio);
2704 } else {
2705 bio = first_bio;
2706 }
2707 bio->bi_private = multi;
2708 bio->bi_end_io = end_bio_multi_stripe;
2709 }
2710 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
2711 dev = multi->stripes[dev_nr].dev;
2712 BUG_ON(rw == WRITE && !dev->writeable);
2713 if (dev && dev->bdev) {
2714 bio->bi_bdev = dev->bdev;
2715 if (async_submit)
2716 schedule_bio(root, dev, rw, bio);
2717 else
2718 submit_bio(rw, bio);
2719 } else {
2720 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
2721 bio->bi_sector = logical >> 9;
2722 bio_endio(bio, -EIO);
2723 }
2724 dev_nr++;
2725 }
2726 if (total_devs == 1)
2727 kfree(multi);
2728 return 0;
2729}
2730
2731struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
2732 u8 *uuid, u8 *fsid)
2733{
2734 struct btrfs_device *device;
2735 struct btrfs_fs_devices *cur_devices;
2736
2737 cur_devices = root->fs_info->fs_devices;
2738 while (cur_devices) {
2739 if (!fsid ||
2740 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2741 device = __find_device(&cur_devices->devices,
2742 devid, uuid);
2743 if (device)
2744 return device;
2745 }
2746 cur_devices = cur_devices->seed;
2747 }
2748 return NULL;
2749}
2750
2751static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
2752 u64 devid, u8 *dev_uuid)
2753{
2754 struct btrfs_device *device;
2755 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2756
2757 device = kzalloc(sizeof(*device), GFP_NOFS);
2758 if (!device)
2759 return NULL;
2760 list_add(&device->dev_list,
2761 &fs_devices->devices);
2762 device->barriers = 1;
2763 device->dev_root = root->fs_info->dev_root;
2764 device->devid = devid;
2765 device->work.func = pending_bios_fn;
2766 fs_devices->num_devices++;
2767 spin_lock_init(&device->io_lock);
2768 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
2769 return device;
2770}
2771
2772static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2773 struct extent_buffer *leaf,
2774 struct btrfs_chunk *chunk)
2775{
2776 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2777 struct map_lookup *map;
2778 struct extent_map *em;
2779 u64 logical;
2780 u64 length;
2781 u64 devid;
2782 u8 uuid[BTRFS_UUID_SIZE];
2783 int num_stripes;
2784 int ret;
2785 int i;
2786
2787 logical = key->offset;
2788 length = btrfs_chunk_length(leaf, chunk);
2789
2790 spin_lock(&map_tree->map_tree.lock);
2791 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
2792 spin_unlock(&map_tree->map_tree.lock);
2793
2794 /* already mapped? */
2795 if (em && em->start <= logical && em->start + em->len > logical) {
2796 free_extent_map(em);
2797 return 0;
2798 } else if (em) {
2799 free_extent_map(em);
2800 }
2801
2802 map = kzalloc(sizeof(*map), GFP_NOFS);
2803 if (!map)
2804 return -ENOMEM;
2805
2806 em = alloc_extent_map(GFP_NOFS);
2807 if (!em)
2808 return -ENOMEM;
2809 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2810 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2811 if (!map) {
2812 free_extent_map(em);
2813 return -ENOMEM;
2814 }
2815
2816 em->bdev = (struct block_device *)map;
2817 em->start = logical;
2818 em->len = length;
2819 em->block_start = 0;
2820 em->block_len = em->len;
2821
2822 map->num_stripes = num_stripes;
2823 map->io_width = btrfs_chunk_io_width(leaf, chunk);
2824 map->io_align = btrfs_chunk_io_align(leaf, chunk);
2825 map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
2826 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
2827 map->type = btrfs_chunk_type(leaf, chunk);
2828 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
2829 for (i = 0; i < num_stripes; i++) {
2830 map->stripes[i].physical =
2831 btrfs_stripe_offset_nr(leaf, chunk, i);
2832 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
2833 read_extent_buffer(leaf, uuid, (unsigned long)
2834 btrfs_stripe_dev_uuid_nr(chunk, i),
2835 BTRFS_UUID_SIZE);
2836 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
2837 NULL);
2838 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
2839 kfree(map);
2840 free_extent_map(em);
2841 return -EIO;
2842 }
2843 if (!map->stripes[i].dev) {
2844 map->stripes[i].dev =
2845 add_missing_dev(root, devid, uuid);
2846 if (!map->stripes[i].dev) {
2847 kfree(map);
2848 free_extent_map(em);
2849 return -EIO;
2850 }
2851 }
2852 map->stripes[i].dev->in_fs_metadata = 1;
2853 }
2854
2855 spin_lock(&map_tree->map_tree.lock);
2856 ret = add_extent_mapping(&map_tree->map_tree, em);
2857 spin_unlock(&map_tree->map_tree.lock);
2858 BUG_ON(ret);
2859 free_extent_map(em);
2860
2861 return 0;
2862}
2863
2864static int fill_device_from_item(struct extent_buffer *leaf,
2865 struct btrfs_dev_item *dev_item,
2866 struct btrfs_device *device)
2867{
2868 unsigned long ptr;
2869
2870 device->devid = btrfs_device_id(leaf, dev_item);
2871 device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
2872 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2873 device->type = btrfs_device_type(leaf, dev_item);
2874 device->io_align = btrfs_device_io_align(leaf, dev_item);
2875 device->io_width = btrfs_device_io_width(leaf, dev_item);
2876 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
2877
2878 ptr = (unsigned long)btrfs_device_uuid(dev_item);
2879 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
2880
2881 return 0;
2882}
2883
2884static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
2885{
2886 struct btrfs_fs_devices *fs_devices;
2887 int ret;
2888
2889 mutex_lock(&uuid_mutex);
2890
2891 fs_devices = root->fs_info->fs_devices->seed;
2892 while (fs_devices) {
2893 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2894 ret = 0;
2895 goto out;
2896 }
2897 fs_devices = fs_devices->seed;
2898 }
2899
2900 fs_devices = find_fsid(fsid);
2901 if (!fs_devices) {
2902 ret = -ENOENT;
2903 goto out;
2904 }
2905 if (fs_devices->opened) {
2906 ret = -EBUSY;
2907 goto out;
2908 }
2909
2910 ret = __btrfs_open_devices(fs_devices, root->fs_info->bdev_holder);
2911 if (ret)
2912 goto out;
2913
2914 if (!fs_devices->seeding) {
2915 __btrfs_close_devices(fs_devices);
2916 ret = -EINVAL;
2917 goto out;
2918 }
2919
2920 fs_devices->seed = root->fs_info->fs_devices->seed;
2921 root->fs_info->fs_devices->seed = fs_devices;
2922 fs_devices->sprouted = 1;
2923out:
2924 mutex_unlock(&uuid_mutex);
2925 return ret;
2926}
2927
2928static int read_one_dev(struct btrfs_root *root,
2929 struct extent_buffer *leaf,
2930 struct btrfs_dev_item *dev_item)
2931{
2932 struct btrfs_device *device;
2933 u64 devid;
2934 int ret;
2935 int seed_devices = 0;
2936 u8 fs_uuid[BTRFS_UUID_SIZE];
2937 u8 dev_uuid[BTRFS_UUID_SIZE];
2938
2939 devid = btrfs_device_id(leaf, dev_item);
2940 read_extent_buffer(leaf, dev_uuid,
2941 (unsigned long)btrfs_device_uuid(dev_item),
2942 BTRFS_UUID_SIZE);
2943 read_extent_buffer(leaf, fs_uuid,
2944 (unsigned long)btrfs_device_fsid(dev_item),
2945 BTRFS_UUID_SIZE);
2946
2947 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
2948 ret = open_seed_devices(root, fs_uuid);
2949 if (ret)
2950 return ret;
2951 seed_devices = 1;
2952 }
2953
2954 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
2955 if (!device || !device->bdev) {
2956 if (!btrfs_test_opt(root, DEGRADED) || seed_devices)
2957 return -EIO;
2958
2959 if (!device) {
2960 printk("warning devid %Lu missing\n", devid);
2961 device = add_missing_dev(root, devid, dev_uuid);
2962 if (!device)
2963 return -ENOMEM;
2964 }
2965 }
2966
2967 if (device->fs_devices != root->fs_info->fs_devices) {
2968 BUG_ON(device->writeable);
2969 if (device->generation !=
2970 btrfs_device_generation(leaf, dev_item))
2971 return -EINVAL;
2972 }
2973
2974 fill_device_from_item(leaf, dev_item, device);
2975 device->dev_root = root->fs_info->dev_root;
2976 device->in_fs_metadata = 1;
2977 if (device->writeable)
2978 device->fs_devices->total_rw_bytes += device->total_bytes;
2979 ret = 0;
2980#if 0
2981 ret = btrfs_open_device(device);
2982 if (ret) {
2983 kfree(device);
2984 }
2985#endif
2986 return ret;
2987}
2988
2989int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
2990{
2991 struct btrfs_dev_item *dev_item;
2992
2993 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
2994 dev_item);
2995 return read_one_dev(root, buf, dev_item);
2996}
2997
2998int btrfs_read_sys_array(struct btrfs_root *root)
2999{
3000 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
3001 struct extent_buffer *sb;
3002 struct btrfs_disk_key *disk_key;
3003 struct btrfs_chunk *chunk;
3004 u8 *ptr;
3005 unsigned long sb_ptr;
3006 int ret = 0;
3007 u32 num_stripes;
3008 u32 array_size;
3009 u32 len = 0;
3010 u32 cur;
3011 struct btrfs_key key;
3012
3013 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
3014 BTRFS_SUPER_INFO_SIZE);
3015 if (!sb)
3016 return -ENOMEM;
3017 btrfs_set_buffer_uptodate(sb);
3018 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3019 array_size = btrfs_super_sys_array_size(super_copy);
3020
3021 ptr = super_copy->sys_chunk_array;
3022 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
3023 cur = 0;
3024
3025 while (cur < array_size) {
3026 disk_key = (struct btrfs_disk_key *)ptr;
3027 btrfs_disk_key_to_cpu(&key, disk_key);
3028
3029 len = sizeof(*disk_key); ptr += len;
3030 sb_ptr += len;
3031 cur += len;
3032
3033 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3034 chunk = (struct btrfs_chunk *)sb_ptr;
3035 ret = read_one_chunk(root, &key, sb, chunk);
3036 if (ret)
3037 break;
3038 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
3039 len = btrfs_chunk_item_size(num_stripes);
3040 } else {
3041 ret = -EIO;
3042 break;
3043 }
3044 ptr += len;
3045 sb_ptr += len;
3046 cur += len;
3047 }
3048 free_extent_buffer(sb);
3049 return ret;
3050}
3051
3052int btrfs_read_chunk_tree(struct btrfs_root *root)
3053{
3054 struct btrfs_path *path;
3055 struct extent_buffer *leaf;
3056 struct btrfs_key key;
3057 struct btrfs_key found_key;
3058 int ret;
3059 int slot;
3060
3061 root = root->fs_info->chunk_root;
3062
3063 path = btrfs_alloc_path();
3064 if (!path)
3065 return -ENOMEM;
3066
3067 /* first we search for all of the device items, and then we
3068 * read in all of the chunk items. This way we can create chunk
3069 * mappings that reference all of the devices that are afound
3070 */
3071 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
3072 key.offset = 0;
3073 key.type = 0;
3074again:
3075 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3076 while(1) {
3077 leaf = path->nodes[0];
3078 slot = path->slots[0];
3079 if (slot >= btrfs_header_nritems(leaf)) {
3080 ret = btrfs_next_leaf(root, path);
3081 if (ret == 0)
3082 continue;
3083 if (ret < 0)
3084 goto error;
3085 break;
3086 }
3087 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3088 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3089 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
3090 break;
3091 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
3092 struct btrfs_dev_item *dev_item;
3093 dev_item = btrfs_item_ptr(leaf, slot,
3094 struct btrfs_dev_item);
3095 ret = read_one_dev(root, leaf, dev_item);
3096 if (ret)
3097 goto error;
3098 }
3099 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
3100 struct btrfs_chunk *chunk;
3101 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3102 ret = read_one_chunk(root, &found_key, leaf, chunk);
3103 if (ret)
3104 goto error;
3105 }
3106 path->slots[0]++;
3107 }
3108 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3109 key.objectid = 0;
3110 btrfs_release_path(root, path);
3111 goto again;
3112 }
3113 ret = 0;
3114error:
3115 btrfs_free_path(path);
3116 return ret;
3117}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 000000000000..1f6f25a5787f
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,158 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_VOLUMES_
20#define __BTRFS_VOLUMES_
21
22#include <linux/bio.h>
23#include "async-thread.h"
24
25struct buffer_head;
/*
 * In-memory state of one device belonging to a btrfs filesystem.
 */
26struct btrfs_device {
/* link in btrfs_fs_devices.devices */
27 struct list_head dev_list;
28 struct list_head dev_alloc_list;
/* device set this device belongs to */
29 struct btrfs_fs_devices *fs_devices;
30 struct btrfs_root *dev_root;
31 struct buffer_head *pending_io;
/*
 * Head and tail of the list of write bios queued by schedule_bio(),
 * chained through bi_next and modified under io_lock.
 */
32 struct bio *pending_bios;
33 struct bio *pending_bio_tail;
/* when set, schedule_bio() does not queue the work item again */
34 int running_pending;
/* compared against the device item's generation in read_one_dev() */
35 u64 generation;
36
37 int barriers;
/* cleared for devices that must not be written to */
38 int writeable;
/* set once a chunk item or device item referencing the device was read */
39 int in_fs_metadata;
40
/* protects the pending bio list */
41 spinlock_t io_lock;
42
/* NULL when the device is missing; I/O to it is then failed with -EIO */
43 struct block_device *bdev;
44
45 char *name;
46
47 /* the internal btrfs device id */
48 u64 devid;
49
50 /* size of the device */
51 u64 total_bytes;
52
53 /* bytes used */
54 u64 bytes_used;
55
56 /* optimal io alignment for this device */
57 u32 io_align;
58
59 /* optimal io width for this device */
60 u32 io_width;
61
62 /* minimal io size for this device */
63 u32 sector_size;
64
65 /* type and info about this device */
66 u64 type;
67
68 /* physical drive uuid (or lvm uuid) */
69 u8 uuid[BTRFS_UUID_SIZE];
70
/* work item queued on the submit workers by schedule_bio() */
71 struct btrfs_work work;
72};
73
/*
 * The set of devices that make up one filesystem, identified by fsid.
 */
74struct btrfs_fs_devices {
75 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
76
77 /* the device with this id has the most recent copy of the super */
78 u64 latest_devid;
79 u64 latest_trans;
/* incremented when add_missing_dev() adds a placeholder device */
80 u64 num_devices;
81 u64 open_devices;
82 u64 rw_devices;
/* sum of total_bytes of the writeable devices seen by read_one_dev() */
83 u64 total_rw_bytes;
/* block device assigned to bios that btrfs_map_bio() fails with -EIO */
84 struct block_device *latest_bdev;
85 /* all of the devices in the FS */
86 struct list_head devices;
87
88 /* devices not currently being allocated */
89 struct list_head alloc_list;
90 struct list_head list;
91
/* next device set on the chain of seed filesystems, NULL at the end */
92 struct btrfs_fs_devices *seed;
/* must be set for a device set to be accepted by open_seed_devices() */
93 int seeding;
/* set by open_seed_devices() once the set is linked into a seed chain */
94 int sprouted;
95
/* non-zero while the set is open; open_seed_devices() then returns -EBUSY */
96 int opened;
97};
98
/*
 * One target of a mapped bio: the device and the physical byte offset
 * on that device.
 */
99struct btrfs_bio_stripe {
100 struct btrfs_device *dev;
101 u64 physical;
102};
103
/*
 * Result of mapping a logical range, and the completion state shared by
 * the bios submitted for it.
 */
104struct btrfs_multi_bio {
/* stripes still in flight; the last completion ends the original bio */
105 atomic_t stripes_pending;
/* end_io and private data saved from the original bio */
106 bio_end_io_t *end_io;
107 struct bio *orig_bio;
108 void *private;
/* number of stripe bios that completed with an error */
109 atomic_t error;
/* an error is reported upwards only when error exceeds this */
110 int max_errors;
/* number of valid entries in stripes[] */
111 int num_stripes;
112 struct btrfs_bio_stripe stripes[];
113};
114
/* bytes needed for a struct btrfs_multi_bio with room for n stripes */
115#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
116 (sizeof(struct btrfs_bio_stripe) * (n)))
117
118int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
119 struct btrfs_device *device,
120 u64 chunk_tree, u64 chunk_objectid,
121 u64 chunk_offset, u64 start, u64 num_bytes);
122int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
123 u64 logical, u64 *length,
124 struct btrfs_multi_bio **multi_ret, int mirror_num);
125int btrfs_read_sys_array(struct btrfs_root *root);
126int btrfs_read_chunk_tree(struct btrfs_root *root);
127int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
128 struct btrfs_root *extent_root, u64 type);
129void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
130void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
131int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
132 int mirror_num, int async_submit);
133int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
134int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
135 int flags, void *holder);
136int btrfs_scan_one_device(const char *path, int flags, void *holder,
137 struct btrfs_fs_devices **fs_devices_ret);
138int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
139int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
140int btrfs_add_device(struct btrfs_trans_handle *trans,
141 struct btrfs_root *root,
142 struct btrfs_device *device);
143int btrfs_rm_device(struct btrfs_root *root, char *device_path);
144int btrfs_cleanup_fs_uuids(void);
145int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
146int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
147 u64 logical, struct page *page);
148int btrfs_grow_device(struct btrfs_trans_handle *trans,
149 struct btrfs_device *device, u64 new_size);
150struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
151 u8 *uuid, u8 *fsid);
152int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
153int btrfs_init_new_device(struct btrfs_root *root, char *path);
154int btrfs_balance(struct btrfs_root *dev_root);
155void btrfs_unlock_volumes(void);
156void btrfs_lock_volumes(void);
157int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
158#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
new file mode 100644
index 000000000000..adb4b32a9d51
--- /dev/null
+++ b/fs/btrfs/xattr.c
@@ -0,0 +1,321 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/init.h>
20#include <linux/fs.h>
21#include <linux/slab.h>
22#include <linux/rwsem.h>
23#include <linux/xattr.h>
24#include "ctree.h"
25#include "btrfs_inode.h"
26#include "transaction.h"
27#include "xattr.h"
28#include "disk-io.h"
29
30
31ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
32 void *buffer, size_t size)
33{
34 struct btrfs_dir_item *di;
35 struct btrfs_root *root = BTRFS_I(inode)->root;
36 struct btrfs_path *path;
37 struct extent_buffer *leaf;
38 int ret = 0;
39 unsigned long data_ptr;
40
41 path = btrfs_alloc_path();
42 if (!path)
43 return -ENOMEM;
44
45 /* lookup the xattr by name */
46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
47 strlen(name), 0);
48 if (!di || IS_ERR(di)) {
49 ret = -ENODATA;
50 goto out;
51 }
52
53 leaf = path->nodes[0];
54 /* if size is 0, that means we want the size of the attr */
55 if (!size) {
56 ret = btrfs_dir_data_len(leaf, di);
57 goto out;
58 }
59
60 /* now get the data out of our dir_item */
61 if (btrfs_dir_data_len(leaf, di) > size) {
62 ret = -ERANGE;
63 goto out;
64 }
65 data_ptr = (unsigned long)((char *)(di + 1) +
66 btrfs_dir_name_len(leaf, di));
67 read_extent_buffer(leaf, buffer, data_ptr,
68 btrfs_dir_data_len(leaf, di));
69 ret = btrfs_dir_data_len(leaf, di);
70
71out:
72 btrfs_free_path(path);
73 return ret;
74}
75
76int __btrfs_setxattr(struct inode *inode, const char *name,
77 const void *value, size_t size, int flags)
78{
79 struct btrfs_dir_item *di;
80 struct btrfs_root *root = BTRFS_I(inode)->root;
81 struct btrfs_trans_handle *trans;
82 struct btrfs_path *path;
83 int ret = 0, mod = 0;
84
85 path = btrfs_alloc_path();
86 if (!path)
87 return -ENOMEM;
88
89 trans = btrfs_start_transaction(root, 1);
90 btrfs_set_trans_block_group(trans, inode);
91
92 /* first lets see if we already have this xattr */
93 di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
94 strlen(name), -1);
95 if (IS_ERR(di)) {
96 ret = PTR_ERR(di);
97 goto out;
98 }
99
100 /* ok we already have this xattr, lets remove it */
101 if (di) {
102 /* if we want create only exit */
103 if (flags & XATTR_CREATE) {
104 ret = -EEXIST;
105 goto out;
106 }
107
108 ret = btrfs_delete_one_dir_name(trans, root, path, di);
109 if (ret)
110 goto out;
111 btrfs_release_path(root, path);
112
113 /* if we don't have a value then we are removing the xattr */
114 if (!value) {
115 mod = 1;
116 goto out;
117 }
118 } else {
119 btrfs_release_path(root, path);
120
121 if (flags & XATTR_REPLACE) {
122 /* we couldn't find the attr to replace */
123 ret = -ENODATA;
124 goto out;
125 }
126 }
127
128 /* ok we have to create a completely new xattr */
129 ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
130 value, size, inode->i_ino);
131 if (ret)
132 goto out;
133 mod = 1;
134
135out:
136 if (mod) {
137 inode->i_ctime = CURRENT_TIME;
138 ret = btrfs_update_inode(trans, root, inode);
139 }
140
141 btrfs_end_transaction(trans, root);
142 btrfs_free_path(path);
143 return ret;
144}
145
146ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
147{
148 struct btrfs_key key, found_key;
149 struct inode *inode = dentry->d_inode;
150 struct btrfs_root *root = BTRFS_I(inode)->root;
151 struct btrfs_path *path;
152 struct btrfs_item *item;
153 struct extent_buffer *leaf;
154 struct btrfs_dir_item *di;
155 int ret = 0, slot, advance;
156 size_t total_size = 0, size_left = size;
157 unsigned long name_ptr;
158 size_t name_len;
159 u32 nritems;
160
161 /*
162 * ok we want all objects associated with this id.
163 * NOTE: we set key.offset = 0; because we want to start with the
164 * first xattr that we find and walk forward
165 */
166 key.objectid = inode->i_ino;
167 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
168 key.offset = 0;
169
170 path = btrfs_alloc_path();
171 if (!path)
172 return -ENOMEM;
173 path->reada = 2;
174
175 /* search for our xattrs */
176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
177 if (ret < 0)
178 goto err;
179 ret = 0;
180 advance = 0;
181 while (1) {
182 leaf = path->nodes[0];
183 nritems = btrfs_header_nritems(leaf);
184 slot = path->slots[0];
185
186 /* this is where we start walking through the path */
187 if (advance || slot >= nritems) {
188 /*
189 * if we've reached the last slot in this leaf we need
190 * to go to the next leaf and reset everything
191 */
192 if (slot >= nritems-1) {
193 ret = btrfs_next_leaf(root, path);
194 if (ret)
195 break;
196 leaf = path->nodes[0];
197 nritems = btrfs_header_nritems(leaf);
198 slot = path->slots[0];
199 } else {
200 /*
201 * just walking through the slots on this leaf
202 */
203 slot++;
204 path->slots[0]++;
205 }
206 }
207 advance = 1;
208
209 item = btrfs_item_nr(leaf, slot);
210 btrfs_item_key_to_cpu(leaf, &found_key, slot);
211
212 /* check to make sure this item is what we want */
213 if (found_key.objectid != key.objectid)
214 break;
215 if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
216 break;
217
218 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
219
220 name_len = btrfs_dir_name_len(leaf, di);
221 total_size += name_len + 1;
222
223 /* we are just looking for how big our buffer needs to be */
224 if (!size)
225 continue;
226
227 if (!buffer || (name_len + 1) > size_left) {
228 ret = -ERANGE;
229 break;
230 }
231
232 name_ptr = (unsigned long)(di + 1);
233 read_extent_buffer(leaf, buffer, name_ptr, name_len);
234 buffer[name_len] = '\0';
235
236 size_left -= name_len + 1;
237 buffer += name_len + 1;
238 }
239 ret = total_size;
240
241err:
242 btrfs_free_path(path);
243
244 return ret;
245}
246
/*
 * List of handlers for synthetic system.* attributes.  All real ondisk
 * attributes are handled directly.
 *
 * The table is NULL-terminated.  The two POSIX ACL handlers are only
 * compiled in when CONFIG_FS_POSIX_ACL is set; without it the table holds
 * nothing but the terminator.
 */
struct xattr_handler *btrfs_xattr_handlers[] = {
#ifdef CONFIG_FS_POSIX_ACL
	&btrfs_xattr_acl_access_handler,
	&btrfs_xattr_acl_default_handler,
#endif
	NULL,
};
258
259/*
260 * Check if the attribute is in a supported namespace.
261 *
262 * This applied after the check for the synthetic attributes in the system
263 * namespace.
264 */
265static bool btrfs_is_valid_xattr(const char *name)
266{
267 return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
268 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
269 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
270 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
271}
272
273ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
274 void *buffer, size_t size)
275{
276 /*
277 * If this is a request for a synthetic attribute in the system.*
278 * namespace use the generic infrastructure to resolve a handler
279 * for it via sb->s_xattr.
280 */
281 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
282 return generic_getxattr(dentry, name, buffer, size);
283
284 if (!btrfs_is_valid_xattr(name))
285 return -EOPNOTSUPP;
286 return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
287}
288
289int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
290 size_t size, int flags)
291{
292 /*
293 * If this is a request for a synthetic attribute in the system.*
294 * namespace use the generic infrastructure to resolve a handler
295 * for it via sb->s_xattr.
296 */
297 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
298 return generic_setxattr(dentry, name, value, size, flags);
299
300 if (!btrfs_is_valid_xattr(name))
301 return -EOPNOTSUPP;
302
303 if (size == 0)
304 value = ""; /* empty EA, do not remove */
305 return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
306}
307
308int btrfs_removexattr(struct dentry *dentry, const char *name)
309{
310 /*
311 * If this is a request for a synthetic attribute in the system.*
312 * namespace use the generic infrastructure to resolve a handler
313 * for it via sb->s_xattr.
314 */
315 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
316 return generic_removexattr(dentry, name);
317
318 if (!btrfs_is_valid_xattr(name))
319 return -EOPNOTSUPP;
320 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
321}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 000000000000..5b1d08f8e68d
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
#ifndef __XATTR__
#define __XATTR__

#include <linux/xattr.h>

/*
 * POSIX ACL handlers, and the NULL-terminated handler table that lists
 * them when CONFIG_FS_POSIX_ACL is enabled.
 */
extern struct xattr_handler btrfs_xattr_acl_access_handler;
extern struct xattr_handler btrfs_xattr_acl_default_handler;
extern struct xattr_handler *btrfs_xattr_handlers[];

/*
 * Inode-level helpers.  These do no namespace checking; a NULL value passed
 * to __btrfs_setxattr() removes an existing attribute.
 */
extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
		void *buffer, size_t size);
extern int __btrfs_setxattr(struct inode *inode, const char *name,
		const void *value, size_t size, int flags);

/*
 * Dentry-level entry points.  They hand system.* names to the generic
 * xattr code and reject unsupported namespaces with -EOPNOTSUPP.
 */
extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
		void *buffer, size_t size);
extern int btrfs_setxattr(struct dentry *dentry, const char *name,
		const void *value, size_t size, int flags);
extern int btrfs_removexattr(struct dentry *dentry, const char *name);

#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000000..5b9f7002513c
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,638 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 *
18 * Based on jffs2 zlib code:
19 * Copyright © 2001-2007 Red Hat, Inc.
20 * Created by David Woodhouse <dwmw2@infradead.org>
21 */
22
23#include <linux/kernel.h>
24#include <linux/slab.h>
25#include <linux/zlib.h>
26#include <linux/zutil.h>
27#include <linux/vmalloc.h>
28#include <linux/init.h>
29#include <linux/err.h>
30#include <linux/sched.h>
31#include <linux/pagemap.h>
32#include <linux/bio.h>
33
/* Plan: call deflate() with avail_in == *sourcelen,
	avail_out = *dstlen - 12 and flush == Z_FINISH.
	If it doesn't manage to finish, call it again with
	avail_in == 0 and avail_out set to the remaining 12
	bytes for it to clean up.
   Q: Is 12 bytes sufficient?

   NOTE(review): none of the functions in this file reference
   STREAM_END_SPACE, and the compress path below does not follow the plan
   above; this looks like a leftover from the jffs2 code this file is
   based on -- confirm before relying on it.
*/
#define STREAM_END_SPACE 12
42
/*
 * One reusable set of zlib state: an inflate stream, a deflate stream and
 * a bounce buffer of PAGE_CACHE_SIZE bytes that the decompress paths
 * inflate into.
 */
struct workspace {
	z_stream inf_strm;
	z_stream def_strm;
	char *buf;
	/* links the workspace into idle_workspace while it is not in use */
	struct list_head list;
};

/* workspaces that are allocated but currently unused */
static LIST_HEAD(idle_workspace);
/* protects idle_workspace and num_workspace */
static DEFINE_SPINLOCK(workspace_lock);
/* number of entries on idle_workspace */
static unsigned long num_workspace;
/* number of workspaces currently allocated, idle or in use */
static atomic_t alloc_workspace = ATOMIC_INIT(0);
/* tasks waiting for a workspace to be returned or freed */
static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
55
56/*
57 * this finds an available zlib workspace or allocates a new one
58 * NULL or an ERR_PTR is returned if things go bad.
59 */
60static struct workspace *find_zlib_workspace(void)
61{
62 struct workspace *workspace;
63 int ret;
64 int cpus = num_online_cpus();
65
66again:
67 spin_lock(&workspace_lock);
68 if (!list_empty(&idle_workspace)) {
69 workspace = list_entry(idle_workspace.next, struct workspace,
70 list);
71 list_del(&workspace->list);
72 num_workspace--;
73 spin_unlock(&workspace_lock);
74 return workspace;
75
76 }
77 spin_unlock(&workspace_lock);
78 if (atomic_read(&alloc_workspace) > cpus) {
79 DEFINE_WAIT(wait);
80 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
81 if (atomic_read(&alloc_workspace) > cpus)
82 schedule();
83 finish_wait(&workspace_wait, &wait);
84 goto again;
85 }
86 atomic_inc(&alloc_workspace);
87 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
88 if (!workspace) {
89 ret = -ENOMEM;
90 goto fail;
91 }
92
93 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
94 if (!workspace->def_strm.workspace) {
95 ret = -ENOMEM;
96 goto fail;
97 }
98 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
99 if (!workspace->inf_strm.workspace) {
100 ret = -ENOMEM;
101 goto fail_inflate;
102 }
103 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
104 if (!workspace->buf) {
105 ret = -ENOMEM;
106 goto fail_kmalloc;
107 }
108 return workspace;
109
110fail_kmalloc:
111 vfree(workspace->inf_strm.workspace);
112fail_inflate:
113 vfree(workspace->def_strm.workspace);
114fail:
115 kfree(workspace);
116 atomic_dec(&alloc_workspace);
117 wake_up(&workspace_wait);
118 return ERR_PTR(ret);
119}
120
121/*
122 * put a workspace struct back on the list or free it if we have enough
123 * idle ones sitting around
124 */
125static int free_workspace(struct workspace *workspace)
126{
127 spin_lock(&workspace_lock);
128 if (num_workspace < num_online_cpus()) {
129 list_add_tail(&workspace->list, &idle_workspace);
130 num_workspace++;
131 spin_unlock(&workspace_lock);
132 if (waitqueue_active(&workspace_wait))
133 wake_up(&workspace_wait);
134 return 0;
135 }
136 spin_unlock(&workspace_lock);
137 vfree(workspace->def_strm.workspace);
138 vfree(workspace->inf_strm.workspace);
139 kfree(workspace->buf);
140 kfree(workspace);
141
142 atomic_dec(&alloc_workspace);
143 if (waitqueue_active(&workspace_wait))
144 wake_up(&workspace_wait);
145 return 0;
146}
147
148/*
149 * cleanup function for module exit
150 */
151static void free_workspaces(void)
152{
153 struct workspace *workspace;
154 while(!list_empty(&idle_workspace)) {
155 workspace = list_entry(idle_workspace.next, struct workspace,
156 list);
157 list_del(&workspace->list);
158 vfree(workspace->def_strm.workspace);
159 vfree(workspace->inf_strm.workspace);
160 kfree(workspace->buf);
161 kfree(workspace);
162 atomic_dec(&alloc_workspace);
163 }
164}
165
166/*
167 * given an address space and start/len, compress the bytes.
168 *
169 * pages are allocated to hold the compressed result and stored
170 * in 'pages'
171 *
172 * out_pages is used to return the number of pages allocated. There
173 * may be pages allocated even if we return an error
174 *
175 * total_in is used to return the number of bytes actually read. It
176 * may be smaller then len if we had to exit early because we
177 * ran out of room in the pages array or because we cross the
178 * max_out threshold.
179 *
180 * total_out is used to return the total number of compressed bytes
181 *
182 * max_out tells us the max number of bytes that we're allowed to
183 * stuff into pages
184 */
185int btrfs_zlib_compress_pages(struct address_space *mapping,
186 u64 start, unsigned long len,
187 struct page **pages,
188 unsigned long nr_dest_pages,
189 unsigned long *out_pages,
190 unsigned long *total_in,
191 unsigned long *total_out,
192 unsigned long max_out)
193{
194 int ret;
195 struct workspace *workspace;
196 char *data_in;
197 char *cpage_out;
198 int nr_pages = 0;
199 struct page *in_page = NULL;
200 struct page *out_page = NULL;
201 int out_written = 0;
202 int in_read = 0;
203 unsigned long bytes_left;
204
205 *out_pages = 0;
206 *total_out = 0;
207 *total_in = 0;
208
209 workspace = find_zlib_workspace();
210 if (!workspace)
211 return -1;
212
213 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
214 printk(KERN_WARNING "deflateInit failed\n");
215 ret = -1;
216 goto out;
217 }
218
219 workspace->def_strm.total_in = 0;
220 workspace->def_strm.total_out = 0;
221
222 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
223 data_in = kmap(in_page);
224
225 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
226 cpage_out = kmap(out_page);
227 pages[0] = out_page;
228 nr_pages = 1;
229
230 workspace->def_strm.next_in = data_in;
231 workspace->def_strm.next_out = cpage_out;
232 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
233 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
234
235 out_written = 0;
236 in_read = 0;
237
238 while (workspace->def_strm.total_in < len) {
239 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
240 if (ret != Z_OK) {
241 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
242 ret);
243 zlib_deflateEnd(&workspace->def_strm);
244 ret = -1;
245 goto out;
246 }
247
248 /* we're making it bigger, give up */
249 if (workspace->def_strm.total_in > 8192 &&
250 workspace->def_strm.total_in <
251 workspace->def_strm.total_out) {
252 ret = -1;
253 goto out;
254 }
255 /* we need another page for writing out. Test this
256 * before the total_in so we will pull in a new page for
257 * the stream end if required
258 */
259 if (workspace->def_strm.avail_out == 0) {
260 kunmap(out_page);
261 if (nr_pages == nr_dest_pages) {
262 out_page = NULL;
263 ret = -1;
264 goto out;
265 }
266 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
267 cpage_out = kmap(out_page);
268 pages[nr_pages] = out_page;
269 nr_pages++;
270 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
271 workspace->def_strm.next_out = cpage_out;
272 }
273 /* we're all done */
274 if (workspace->def_strm.total_in >= len)
275 break;
276
277 /* we've read in a full page, get a new one */
278 if (workspace->def_strm.avail_in == 0) {
279 if (workspace->def_strm.total_out > max_out)
280 break;
281
282 bytes_left = len - workspace->def_strm.total_in;
283 kunmap(in_page);
284 page_cache_release(in_page);
285
286 start += PAGE_CACHE_SIZE;
287 in_page = find_get_page(mapping,
288 start >> PAGE_CACHE_SHIFT);
289 data_in = kmap(in_page);
290 workspace->def_strm.avail_in = min(bytes_left,
291 PAGE_CACHE_SIZE);
292 workspace->def_strm.next_in = data_in;
293 }
294 }
295 workspace->def_strm.avail_in = 0;
296 ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
297 zlib_deflateEnd(&workspace->def_strm);
298
299 if (ret != Z_STREAM_END) {
300 ret = -1;
301 goto out;
302 }
303
304 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
305 ret = -1;
306 goto out;
307 }
308
309 ret = 0;
310 *total_out = workspace->def_strm.total_out;
311 *total_in = workspace->def_strm.total_in;
312out:
313 *out_pages = nr_pages;
314 if (out_page)
315 kunmap(out_page);
316
317 if (in_page) {
318 kunmap(in_page);
319 page_cache_release(in_page);
320 }
321 free_workspace(workspace);
322 return ret;
323}
324
325/*
326 * pages_in is an array of pages with compressed data.
327 *
328 * disk_start is the starting logical offset of this array in the file
329 *
330 * bvec is a bio_vec of pages from the file that we want to decompress into
331 *
332 * vcnt is the count of pages in the biovec
333 *
334 * srclen is the number of bytes in pages_in
335 *
336 * The basic idea is that we have a bio that was created by readpages.
337 * The pages in the bio are for the uncompressed data, and they may not
338 * be contiguous. They all correspond to the range of bytes covered by
339 * the compressed extent.
340 */
341int btrfs_zlib_decompress_biovec(struct page **pages_in,
342 u64 disk_start,
343 struct bio_vec *bvec,
344 int vcnt,
345 size_t srclen)
346{
347 int ret = 0;
348 int wbits = MAX_WBITS;
349 struct workspace *workspace;
350 char *data_in;
351 size_t total_out = 0;
352 unsigned long page_bytes_left;
353 unsigned long page_in_index = 0;
354 unsigned long page_out_index = 0;
355 struct page *page_out;
356 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
357 PAGE_CACHE_SIZE;
358 unsigned long buf_start;
359 unsigned long buf_offset;
360 unsigned long bytes;
361 unsigned long working_bytes;
362 unsigned long pg_offset;
363 unsigned long start_byte;
364 unsigned long current_buf_start;
365 char *kaddr;
366
367 workspace = find_zlib_workspace();
368 if (!workspace)
369 return -ENOMEM;
370
371 data_in = kmap(pages_in[page_in_index]);
372 workspace->inf_strm.next_in = data_in;
373 workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
374 workspace->inf_strm.total_in = 0;
375
376 workspace->inf_strm.total_out = 0;
377 workspace->inf_strm.next_out = workspace->buf;
378 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
379 page_out = bvec[page_out_index].bv_page;
380 page_bytes_left = PAGE_CACHE_SIZE;
381 pg_offset = 0;
382
383 /* If it's deflate, and it's got no preset dictionary, then
384 we can tell zlib to skip the adler32 check. */
385 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
386 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
387 !(((data_in[0]<<8) + data_in[1]) % 31)) {
388
389 wbits = -((data_in[0] >> 4) + 8);
390 workspace->inf_strm.next_in += 2;
391 workspace->inf_strm.avail_in -= 2;
392 }
393
394 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
395 printk(KERN_WARNING "inflateInit failed\n");
396 ret = -1;
397 goto out;
398 }
399 while(workspace->inf_strm.total_in < srclen) {
400 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
401 if (ret != Z_OK && ret != Z_STREAM_END) {
402 break;
403 }
404
405 /*
406 * buf start is the byte offset we're of the start of
407 * our workspace buffer
408 */
409 buf_start = total_out;
410
411 /* total_out is the last byte of the workspace buffer */
412 total_out = workspace->inf_strm.total_out;
413
414 working_bytes = total_out - buf_start;
415
416 /*
417 * start byte is the first byte of the page we're currently
418 * copying into relative to the start of the compressed data.
419 */
420 start_byte = page_offset(page_out) - disk_start;
421
422 if (working_bytes == 0) {
423 /* we didn't make progress in this inflate
424 * call, we're done
425 */
426 if (ret != Z_STREAM_END) {
427 ret = -1;
428 }
429 break;
430 }
431
432 /* we haven't yet hit data corresponding to this page */
433 if (total_out <= start_byte) {
434 goto next;
435 }
436
437 /*
438 * the start of the data we care about is offset into
439 * the middle of our working buffer
440 */
441 if (total_out > start_byte && buf_start < start_byte) {
442 buf_offset = start_byte - buf_start;
443 working_bytes -= buf_offset;
444 } else {
445 buf_offset = 0;
446 }
447 current_buf_start = buf_start;
448
449 /* copy bytes from the working buffer into the pages */
450 while(working_bytes > 0) {
451 bytes = min(PAGE_CACHE_SIZE - pg_offset,
452 PAGE_CACHE_SIZE - buf_offset);
453 bytes = min(bytes, working_bytes);
454 kaddr = kmap_atomic(page_out, KM_USER0);
455 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
456 bytes);
457 kunmap_atomic(kaddr, KM_USER0);
458 flush_dcache_page(page_out);
459
460 pg_offset += bytes;
461 page_bytes_left -= bytes;
462 buf_offset += bytes;
463 working_bytes -= bytes;
464 current_buf_start += bytes;
465
466 /* check if we need to pick another page */
467 if (page_bytes_left == 0) {
468 page_out_index++;
469 if (page_out_index >= vcnt) {
470 ret = 0;
471 goto done;
472 }
473 page_out = bvec[page_out_index].bv_page;
474 pg_offset = 0;
475 page_bytes_left = PAGE_CACHE_SIZE;
476 start_byte = page_offset(page_out) - disk_start;
477
478 /*
479 * make sure our new page is covered by this
480 * working buffer
481 */
482 if (total_out <= start_byte) {
483 goto next;
484 }
485
486 /* the next page in the biovec might not
487 * be adjacent to the last page, but it
488 * might still be found inside this working
489 * buffer. bump our offset pointer
490 */
491 if (total_out > start_byte &&
492 current_buf_start < start_byte) {
493 buf_offset = start_byte - buf_start;
494 working_bytes = total_out - start_byte;
495 current_buf_start = buf_start +
496 buf_offset;
497 }
498 }
499 }
500next:
501 workspace->inf_strm.next_out = workspace->buf;
502 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
503
504 if (workspace->inf_strm.avail_in == 0) {
505 unsigned long tmp;
506 kunmap(pages_in[page_in_index]);
507 page_in_index++;
508 if (page_in_index >= total_pages_in) {
509 data_in = NULL;
510 break;
511 }
512 data_in = kmap(pages_in[page_in_index]);
513 workspace->inf_strm.next_in = data_in;
514 tmp = srclen - workspace->inf_strm.total_in;
515 workspace->inf_strm.avail_in = min(tmp,
516 PAGE_CACHE_SIZE);
517 }
518 }
519 if (ret != Z_STREAM_END) {
520 ret = -1;
521 } else {
522 ret = 0;
523 }
524done:
525 zlib_inflateEnd(&workspace->inf_strm);
526 if (data_in)
527 kunmap(pages_in[page_in_index]);
528out:
529 free_workspace(workspace);
530 return ret;
531}
532
533/*
534 * a less complex decompression routine. Our compressed data fits in a
535 * single page, and we want to read a single page out of it.
536 * start_byte tells us the offset into the compressed data we're interested in
537 */
538int btrfs_zlib_decompress(unsigned char *data_in,
539 struct page *dest_page,
540 unsigned long start_byte,
541 size_t srclen, size_t destlen)
542{
543 int ret = 0;
544 int wbits = MAX_WBITS;
545 struct workspace *workspace;
546 unsigned long bytes_left = destlen;
547 unsigned long total_out = 0;
548 char *kaddr;
549
550 if (destlen > PAGE_CACHE_SIZE)
551 return -ENOMEM;
552
553 workspace = find_zlib_workspace();
554 if (!workspace)
555 return -ENOMEM;
556
557 workspace->inf_strm.next_in = data_in;
558 workspace->inf_strm.avail_in = srclen;
559 workspace->inf_strm.total_in = 0;
560
561 workspace->inf_strm.next_out = workspace->buf;
562 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
563 workspace->inf_strm.total_out = 0;
564 /* If it's deflate, and it's got no preset dictionary, then
565 we can tell zlib to skip the adler32 check. */
566 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
567 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
568 !(((data_in[0]<<8) + data_in[1]) % 31)) {
569
570 wbits = -((data_in[0] >> 4) + 8);
571 workspace->inf_strm.next_in += 2;
572 workspace->inf_strm.avail_in -= 2;
573 }
574
575 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
576 printk(KERN_WARNING "inflateInit failed\n");
577 ret = -1;
578 goto out;
579 }
580
581 while(bytes_left > 0) {
582 unsigned long buf_start;
583 unsigned long buf_offset;
584 unsigned long bytes;
585 unsigned long pg_offset = 0;
586
587 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
588 if (ret != Z_OK && ret != Z_STREAM_END) {
589 break;
590 }
591
592 buf_start = total_out;
593 total_out = workspace->inf_strm.total_out;
594
595 if (total_out == buf_start) {
596 ret = -1;
597 break;
598 }
599
600 if (total_out <= start_byte) {
601 goto next;
602 }
603
604 if (total_out > start_byte && buf_start < start_byte) {
605 buf_offset = start_byte - buf_start;
606 } else {
607 buf_offset = 0;
608 }
609
610 bytes = min(PAGE_CACHE_SIZE - pg_offset,
611 PAGE_CACHE_SIZE - buf_offset);
612 bytes = min(bytes, bytes_left);
613
614 kaddr = kmap_atomic(dest_page, KM_USER0);
615 memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
616 kunmap_atomic(kaddr, KM_USER0);
617
618 pg_offset += bytes;
619 bytes_left -= bytes;
620next:
621 workspace->inf_strm.next_out = workspace->buf;
622 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
623 }
624 if (ret != Z_STREAM_END && bytes_left != 0) {
625 ret = -1;
626 } else {
627 ret = 0;
628 }
629 zlib_inflateEnd(&workspace->inf_strm);
630out:
631 free_workspace(workspace);
632 return ret;
633}
634
/*
 * Teardown hook for the zlib code: frees every workspace that is still
 * parked on the idle list.
 */
void btrfs_zlib_exit(void)
{
	free_workspaces();
}