path: root/fs

Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/COPYING              356
-rw-r--r--  fs/btrfs/INSTALL               48
-rw-r--r--  fs/btrfs/Makefile              29
-rw-r--r--  fs/btrfs/TODO                  20
-rw-r--r--  fs/btrfs/acl.c                352
-rw-r--r--  fs/btrfs/async-thread.c       343
-rw-r--r--  fs/btrfs/async-thread.h        82
-rw-r--r--  fs/btrfs/bit-radix.c          130
-rw-r--r--  fs/btrfs/bit-radix.h           33
-rw-r--r--  fs/btrfs/btrfs_inode.h         85
-rw-r--r--  fs/btrfs/compat.h              60
-rw-r--r--  fs/btrfs/crc32c.h             108
-rw-r--r--  fs/btrfs/ctree.c             3450
-rw-r--r--  fs/btrfs/ctree.h             1875
-rw-r--r--  fs/btrfs/dir-item.c           345
-rw-r--r--  fs/btrfs/disk-io.c           2056
-rw-r--r--  fs/btrfs/disk-io.h             84
-rw-r--r--  fs/btrfs/export.c             207
-rw-r--r--  fs/btrfs/export.h              19
-rw-r--r--  fs/btrfs/extent-tree.c       4034
-rw-r--r--  fs/btrfs/extent_io.c         3441
-rw-r--r--  fs/btrfs/extent_io.h          247
-rw-r--r--  fs/btrfs/extent_map.c         332
-rw-r--r--  fs/btrfs/extent_map.h          57
-rw-r--r--  fs/btrfs/file-item.c          512
-rw-r--r--  fs/btrfs/file.c              1133
-rw-r--r--  fs/btrfs/free-space-cache.c   415
-rw-r--r--  fs/btrfs/hash.h                27
-rw-r--r--  fs/btrfs/inode-item.c         206
-rw-r--r--  fs/btrfs/inode-map.c          141
-rw-r--r--  fs/btrfs/inode.c             3774
-rw-r--r--  fs/btrfs/ioctl.c              790
-rw-r--r--  fs/btrfs/ioctl.h               55
-rw-r--r--  fs/btrfs/locking.c             74
-rw-r--r--  fs/btrfs/locking.h             27
-rw-r--r--  fs/btrfs/ordered-data.c       709
-rw-r--r--  fs/btrfs/ordered-data.h       149
-rw-r--r--  fs/btrfs/orphan.c              67
-rw-r--r--  fs/btrfs/print-tree.c         201
-rw-r--r--  fs/btrfs/print-tree.h          23
-rw-r--r--  fs/btrfs/ref-cache.c          187
-rw-r--r--  fs/btrfs/ref-cache.h           71
-rw-r--r--  fs/btrfs/root-tree.c          257
-rw-r--r--  fs/btrfs/struct-funcs.c       111
-rw-r--r--  fs/btrfs/super.c              663
-rw-r--r--  fs/btrfs/sysfs.c              301
-rw-r--r--  fs/btrfs/transaction.c        950
-rw-r--r--  fs/btrfs/transaction.h        104
-rw-r--r--  fs/btrfs/tree-defrag.c        145
-rw-r--r--  fs/btrfs/tree-log.c          2892
-rw-r--r--  fs/btrfs/tree-log.h            41
-rw-r--r--  fs/btrfs/version.sh            43
-rw-r--r--  fs/btrfs/volumes.c           2565
-rw-r--r--  fs/btrfs/volumes.h            150
-rw-r--r--  fs/btrfs/xattr.c              321
-rw-r--r--  fs/btrfs/xattr.h               39
56 files changed, 34936 insertions, 0 deletions
diff --git a/fs/btrfs/COPYING b/fs/btrfs/COPYING
new file mode 100644
index 000000000000..ca442d313d86
--- /dev/null
+++ b/fs/btrfs/COPYING
@@ -0,0 +1,356 @@
+
+   NOTE! This copyright does *not* cover user programs that use kernel
+ services by normal system calls - this is merely considered normal use
+ of the kernel, and does *not* fall under the heading of "derived work".
+ Also note that the GPL below is copyrighted by the Free Software
+ Foundation, but the instance of code that it refers to (the Linux
+ kernel) is copyrighted by me and others who actually wrote it.
+
+ Also note that the only valid version of the GPL as far as the kernel
+ is concerned is _this_ particular version of the license (ie v2, not
+ v2.2 or v3.x or whatever), unless explicitly otherwise stated.
+
+			Linus Torvalds
+
+----------------------------------------
+
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+                       51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year  name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/fs/btrfs/INSTALL b/fs/btrfs/INSTALL
new file mode 100644
index 000000000000..16b45a56878d
--- /dev/null
+++ b/fs/btrfs/INSTALL
@@ -0,0 +1,48 @@
+Install Instructions
+
+Btrfs puts snapshots and subvolumes into the root directory of the FS.  This
+directory can only be changed by btrfsctl right now, and normal filesystem
+operations do not work on it.  The default subvolume is called 'default',
+and you can create files and directories in mount_point/default.
+
+Btrfs uses libcrc32c in the kernel for file and metadata checksums.  You need
+to compile the kernel with:
+
+CONFIG_LIBCRC32C=m
+
+libcrc32c can be static as well.  Once your kernel is set up, typing make in
+the btrfs module sources will build against the running kernel.  When the
+build is complete:
+
+modprobe libcrc32c
+insmod btrfs.ko
+
+The Btrfs utility programs require libuuid to build.  This can be found
+in the e2fsprogs sources, and is usually available as libuuid or
+e2fsprogs-devel from various distros.
+
+Building the utilities is just make ; make install.  The programs go
+into /usr/local/bin.  The commands available are:
+
+mkfs.btrfs: create a filesystem
+
+btrfsctl: control program to create snapshots and subvolumes:
+
+	mount /dev/sda2 /mnt
+	btrfsctl -s new_subvol_name /mnt
+	btrfsctl -s snapshot_of_default /mnt/default
+	btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name
+	btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol
+	ls /mnt
+	default snapshot_of_a_snapshot snapshot_of_new_subvol
+	new_subvol_name snapshot_of_default
+
+	Snapshots and subvolumes cannot be deleted right now, but you can
+	rm -rf all the files and directories inside them.
+
+btrfsck: do a limited check of the FS extent trees.
+
+debug-tree: print all of the FS metadata in text form.  Example:
+
+	debug-tree /dev/sda2 >& big_output_file
+
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 000000000000..eb36ae981bdc
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,29 @@
+ifneq ($(KERNELRELEASE),)
+# kbuild part of makefile
+
+obj-m  := btrfs.o
+btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
+	   file-item.o inode-item.o inode-map.o disk-io.o \
+	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
+	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
+	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
+	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o
+else
+
+# Normal Makefile
+
+KERNELDIR := /lib/modules/`uname -r`/build
+all: version
+	$(MAKE) -C $(KERNELDIR) M=`pwd` modules
+
+version:
+	bash version.sh
+
+modules_install:
+	$(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
+clean:
+	$(MAKE) -C $(KERNELDIR) M=`pwd` clean
+
+tester:
+	$(MAKE) -C $(KERNELDIR) M=`pwd` tree-defrag.o transaction.o sysfs.o super.o root-tree.o inode-map.o inode-item.o inode.o file-item.o file.o extent_map.o disk-io.o ctree.o dir-item.o extent-tree.o
+endif
diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
new file mode 100644
index 000000000000..d9b6d38c603a
--- /dev/null
+++ b/fs/btrfs/TODO
@@ -0,0 +1,20 @@
+* cleanup, add more error checking, get rid of BUG_ONs
+* Fix ENOSPC handling
+* Make allocator smarter
+* add a block group to struct inode
+* Do actual block accounting
+* Check compat and incompat flags on the inode
+* Get rid of struct ctree_path, limiting tree levels held at one time
+* Add generation number to key pointer in nodes
+* Add generation number to inode
+* forbid cross subvolume renames and hardlinks
+* Release
+* Do real tree locking
+* Add extent mirroring (backup copies of blocks)
+* Add fancy interface to get access to incremental backups
+* Add fancy striped extents to make big reads faster
+* Use relocation to try and fix write errors
+* Make allocator much smarter
+* xattrs (directory streams for regular files)
+* Scrub & defrag
+
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
new file mode 100644
index 000000000000..867eaf1f8efb
--- /dev/null
+++ b/fs/btrfs/acl.c
@@ -0,0 +1,352 @@
+/*
+ * Copyright (C) 2007 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/sched.h>
+
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "xattr.h"
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+static void btrfs_update_cached_acl(struct inode *inode,
+				    struct posix_acl **p_acl,
+				    struct posix_acl *acl)
+{
+	spin_lock(&inode->i_lock);
+	if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED)
+		posix_acl_release(*p_acl);
+	*p_acl = posix_acl_dup(acl);
+	spin_unlock(&inode->i_lock);
+}
+
+static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
+{
+	int size;
+	const char *name;
+	char *value = NULL;
+	struct posix_acl *acl = NULL, **p_acl;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		p_acl = &BTRFS_I(inode)->i_acl;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		p_acl = &BTRFS_I(inode)->i_default_acl;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	spin_lock(&inode->i_lock);
+	if (*p_acl != BTRFS_ACL_NOT_CACHED)
+		acl = posix_acl_dup(*p_acl);
+	spin_unlock(&inode->i_lock);
+
+	if (acl)
+		return acl;
+
+
+	size = __btrfs_getxattr(inode, name, "", 0);
+	if (size > 0) {
+		value = kzalloc(size, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		size = __btrfs_getxattr(inode, name, value, size);
+		if (size > 0) {
+			acl = posix_acl_from_xattr(value, size);
+			btrfs_update_cached_acl(inode, p_acl, acl);
+		}
+		kfree(value);
+	} else if (size == -ENOENT) {
+		acl = NULL;
+		btrfs_update_cached_acl(inode, p_acl, acl);
+	}
+
+	return acl;
+}
+
+static int btrfs_xattr_get_acl(struct inode *inode, int type,
+			       void *value, size_t size)
+{
+	struct posix_acl *acl;
+	int ret = 0;
+
+	acl = btrfs_get_acl(inode, type);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	ret = posix_acl_to_xattr(acl, value, size);
+	posix_acl_release(acl);
+
+	return ret;
+}
+
+/*
+ * Needs to be called with fs_mutex held
+ */
+static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	int ret, size = 0;
+	const char *name;
+	struct posix_acl **p_acl;
+	char *value = NULL;
+	mode_t mode;
+
+	if (acl) {
+		ret = posix_acl_valid(acl);
+		if (ret < 0)
+			return ret;
+		ret = 0;
+	}
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		mode = inode->i_mode;
+		ret = posix_acl_equiv_mode(acl, &mode);
+		if (ret < 0)
+			return ret;
+		ret = 0;
+		inode->i_mode = mode;
+		name = POSIX_ACL_XATTR_ACCESS;
+		p_acl = &BTRFS_I(inode)->i_acl;
+		break;
+	case ACL_TYPE_DEFAULT:
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EINVAL : 0;
+		name = POSIX_ACL_XATTR_DEFAULT;
+		p_acl = &BTRFS_I(inode)->i_default_acl;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		size = posix_acl_xattr_size(acl->a_count);
+		value = kmalloc(size, GFP_NOFS);
+		if (!value) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ret = posix_acl_to_xattr(acl, value, size);
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = __btrfs_setxattr(inode, name, value, size, 0);
+
+out:
+	if (value)
+		kfree(value);
+
+	if (!ret)
+		btrfs_update_cached_acl(inode, p_acl, acl);
+
+	return ret;
+}
+
+static int btrfs_xattr_set_acl(struct inode *inode, int type,
+			       const void *value, size_t size)
+{
+	int ret = 0;
+	struct posix_acl *acl = NULL;
+
+	if (value) {
+		acl = posix_acl_from_xattr(value, size);
+		if (acl == NULL) {
+			value = NULL;
+			size = 0;
+		} else if (IS_ERR(acl)) {
+			return PTR_ERR(acl);
+		}
+	}
+
+	ret = btrfs_set_acl(inode, acl, type);
+
+	posix_acl_release(acl);
+
+	return ret;
+}
+
+
+static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
+				      void *value, size_t size)
+{
+	return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+
+static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
+				      const void *value, size_t size, int flags)
+{
+	return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+
+static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
+				       void *value, size_t size)
+{
+	return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+
+static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
+				       const void *value, size_t size, int flags)
+{
+	return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+
+int btrfs_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl;
+	int error = -EAGAIN;
+
+	acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+	}
+
+	return error;
+}
+
+/*
+ * btrfs_init_acl is already generally called under fs_mutex, so the locking
+ * stuff has been fixed to work with that.  If the locking stuff changes, we
+ * need to re-evaluate the acl locking stuff.
+ */
+int btrfs_init_acl(struct inode *inode, struct inode *dir)
+{
+	struct posix_acl *acl = NULL;
+	int ret = 0;
+
+	/* this happens with subvols */
+	if (!dir)
+		return 0;
+
+	if (!S_ISLNK(inode->i_mode)) {
+		if (IS_POSIXACL(dir)) {
+			acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
+			if (IS_ERR(acl))
+				return PTR_ERR(acl);
+		}
+
+		if (!acl)
+			inode->i_mode &= ~current->fs->umask;
+	}
+
+	if (IS_POSIXACL(dir) && acl) {
+		struct posix_acl *clone;
+		mode_t mode;
+
+		if (S_ISDIR(inode->i_mode)) {
+			ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
+			if (ret)
+				goto failed;
+		}
+		clone = posix_acl_clone(acl, GFP_NOFS);
+		ret = -ENOMEM;
+		if (!clone)
+			goto failed;
+
+		mode = inode->i_mode;
+		ret = posix_acl_create_masq(clone, &mode);
+		if (ret >= 0) {
+			inode->i_mode = mode;
+			if (ret > 0) {
+				/* we need an acl */
+				ret = btrfs_set_acl(inode, clone,
+						    ACL_TYPE_ACCESS);
+			}
+		}
+	}
+failed:
+	posix_acl_release(acl);
+
+	return ret;
+}
+
+int btrfs_acl_chmod(struct inode *inode)
+{
+	struct posix_acl *acl, *clone;
+	int ret = 0;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	if (!IS_POSIXACL(inode))
+		return 0;
+
+	acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl) || !acl)
+		return PTR_ERR(acl);
+
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	posix_acl_release(acl);
+	if (!clone)
+		return -ENOMEM;
+
+	ret = posix_acl_chmod_masq(clone, inode->i_mode);
+	if (!ret)
+		ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
+
+	posix_acl_release(clone);
+
+	return ret;
+}
+
+struct xattr_handler btrfs_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.get	= btrfs_xattr_acl_default_get,
+	.set	= btrfs_xattr_acl_default_set,
+};
+
+struct xattr_handler btrfs_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.get	= btrfs_xattr_acl_access_get,
+	.set	= btrfs_xattr_acl_access_set,
+};
+
+#else /* CONFIG_FS_POSIX_ACL */
+
+int btrfs_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+
+int btrfs_init_acl(struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+
+int btrfs_check_acl(struct inode *inode, int mask)
+{
+	return 0;
+}
+
+#endif /* CONFIG_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 000000000000..2ee301740195
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,343 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/version.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+# include <linux/freezer.h>
+#else
+# include <linux/sched.h>
+#endif
+
+#include "async-thread.h"
+
+/*
+ * container for the kthread task pointer and the list of pending work
+ * One of these is allocated per thread.
+ */
+struct btrfs_worker_thread {
+	/* pool we belong to */
+	struct btrfs_workers *workers;
+
+	/* list of struct btrfs_work that are waiting for service */
+	struct list_head pending;
+
+	/* list of worker threads from struct btrfs_workers */
+	struct list_head worker_list;
+
+	/* kthread */
+	struct task_struct *task;
+
+	/* number of things on the pending list */
+	atomic_t num_pending;
+
+	unsigned long sequence;
+
+	/* protects the pending list. */
+	spinlock_t lock;
+
+	/* set to non-zero when this thread is already awake and kicking */
+	int working;
+
+	/* are we currently idle */
+	int idle;
+};
+
+/*
+ * helper function to move a thread onto the idle list after it
+ * has finished some requests.
+ */
+static void check_idle_worker(struct btrfs_worker_thread *worker)
+{
+	if (!worker->idle && atomic_read(&worker->num_pending) <
+	    worker->workers->idle_thresh / 2) {
+		unsigned long flags;
+		spin_lock_irqsave(&worker->workers->lock, flags);
+		worker->idle = 1;
+		list_move(&worker->worker_list, &worker->workers->idle_list);
+		spin_unlock_irqrestore(&worker->workers->lock, flags);
+	}
+}
+
+/*
+ * helper function to move a thread off the idle list after new
+ * pending work is added.
+ */
+static void check_busy_worker(struct btrfs_worker_thread *worker)
+{
+	if (worker->idle && atomic_read(&worker->num_pending) >=
+	    worker->workers->idle_thresh) {
+		unsigned long flags;
+		spin_lock_irqsave(&worker->workers->lock, flags);
+		worker->idle = 0;
+		list_move_tail(&worker->worker_list,
+			       &worker->workers->worker_list);
+		spin_unlock_irqrestore(&worker->workers->lock, flags);
+	}
+}
+
+/*
+ * main loop for servicing work items
+ */
+static int worker_loop(void *arg)
+{
+	struct btrfs_worker_thread *worker = arg;
+	struct list_head *cur;
+	struct btrfs_work *work;
+	do {
+		spin_lock_irq(&worker->lock);
+		while (!list_empty(&worker->pending)) {
+			cur = worker->pending.next;
+			work = list_entry(cur, struct btrfs_work, list);
+			list_del(&work->list);
+			clear_bit(0, &work->flags);
+
+			work->worker = worker;
+			spin_unlock_irq(&worker->lock);
+
+			work->func(work);
+
+			atomic_dec(&worker->num_pending);
+			spin_lock_irq(&worker->lock);
+			check_idle_worker(worker);
+		}
+		worker->working = 0;
+		if (freezing(current)) {
+			refrigerator();
+		} else {
+			set_current_state(TASK_INTERRUPTIBLE);
+			spin_unlock_irq(&worker->lock);
+			schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+	} while (!kthread_should_stop());
+	return 0;
+}
+
+/*
+ * this will wait for all the worker threads to shutdown
+ */
+int btrfs_stop_workers(struct btrfs_workers *workers)
+{
+	struct list_head *cur;
+	struct btrfs_worker_thread *worker;
+
+	list_splice_init(&workers->idle_list, &workers->worker_list);
+	while (!list_empty(&workers->worker_list)) {
+		cur = workers->worker_list.next;
+		worker = list_entry(cur, struct btrfs_worker_thread,
+				    worker_list);
+		kthread_stop(worker->task);
+		list_del(&worker->worker_list);
+		kfree(worker);
+	}
+	return 0;
+}
+
+/*
+ * simple init on struct btrfs_workers
+ */
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
+{
+	workers->num_workers = 0;
+	INIT_LIST_HEAD(&workers->worker_list);
+	INIT_LIST_HEAD(&workers->idle_list);
+	spin_lock_init(&workers->lock);
+	workers->max_workers = max;
+	workers->idle_thresh = 32;
+	workers->name = name;
+}
+
+/*
+ * starts new worker threads.  This does not enforce the max worker
+ * count in case you need to temporarily go past it.
+ */
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+	struct btrfs_worker_thread *worker;
+	int ret = 0;
+	int i;
+
+	for (i = 0; i < num_workers; i++) {
+		worker = kzalloc(sizeof(*worker), GFP_NOFS);
+		if (!worker) {
+			ret = -ENOMEM;
+			goto fail;
+		}
+
+		INIT_LIST_HEAD(&worker->pending);
+		INIT_LIST_HEAD(&worker->worker_list);
+		spin_lock_init(&worker->lock);
+		atomic_set(&worker->num_pending, 0);
+		worker->task = kthread_run(worker_loop, worker,
+					   "btrfs-%s-%d", workers->name,
+					   workers->num_workers + i);
+		worker->workers = workers;
+		if (IS_ERR(worker->task)) {
+			kfree(worker);
+			ret = PTR_ERR(worker->task);
+			goto fail;
+		}
+
+		spin_lock_irq(&workers->lock);
+		list_add_tail(&worker->worker_list, &workers->idle_list);
+		worker->idle = 1;
+		workers->num_workers++;
+		spin_unlock_irq(&workers->lock);
+	}
+	return 0;
+fail:
+	btrfs_stop_workers(workers);
+	return ret;
+}
+
+/*
+ * run through the list and find a worker thread that doesn't have a lot
+ * to do right now.  This can return null if we aren't yet at the thread
+ * count limit and all of the threads are busy.
+ */
+static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
+{
+	struct btrfs_worker_thread *worker;
+	struct list_head *next;
+	int enforce_min = workers->num_workers < workers->max_workers;
+
+	/*
+	 * if we find an idle thread, don't move it to the end of the
+	 * idle list.  This improves the chance that the next submission
+	 * will reuse the same thread, and maybe catch it while it is still
+	 * working
+	 */
+	if (!list_empty(&workers->idle_list)) {
+		next = workers->idle_list.next;
+		worker = list_entry(next, struct btrfs_worker_thread,
+				    worker_list);
+		return worker;
+	}
+	if (enforce_min || list_empty(&workers->worker_list))
+		return NULL;
+
+	/*
+	 * if we pick a busy task, move the task to the end of the list.
+	 * hopefully this will keep things somewhat evenly balanced
+	 */
+	next = workers->worker_list.next;
+	worker = list_entry(next, struct btrfs_worker_thread, worker_list);
+	atomic_inc(&worker->num_pending);
+	worker->sequence++;
+	if (worker->sequence % workers->idle_thresh == 0)
+		list_move_tail(next, &workers->worker_list);
+	return worker;
+}
+
+static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
+{
+	struct btrfs_worker_thread *worker;
+	unsigned long flags;
+
+again:
+	spin_lock_irqsave(&workers->lock, flags);
+	worker = next_worker(workers);
+	spin_unlock_irqrestore(&workers->lock, flags);
+
+	if (!worker) {
+		spin_lock_irqsave(&workers->lock, flags);
+		if (workers->num_workers >= workers->max_workers) {
+			struct list_head *fallback = NULL;
+			/*
+			 * we have failed to find any idle workers, just
+			 * fall back to the first thread we can find
+			 */
+			if (!list_empty(&workers->worker_list))
+				fallback = workers->worker_list.next;
+			if (!list_empty(&workers->idle_list))
+				fallback = workers->idle_list.next;
+			BUG_ON(!fallback);
+			worker = list_entry(fallback,
+				  struct btrfs_worker_thread, worker_list);
+			spin_unlock_irqrestore(&workers->lock, flags);
+		} else {
+			spin_unlock_irqrestore(&workers->lock, flags);
+			/* we're below the limit, start another worker */
+			btrfs_start_workers(workers, 1);
+			goto again;
+		}
+	}
+	return worker;
+}
+
+/*
+ * btrfs_requeue_work just puts the work item back on the tail of the list
+ * it was taken from.  It is intended for use with long running work functions
+ * that make some progress and want to give the cpu up for others.
+ */
+int btrfs_requeue_work(struct btrfs_work *work)
+{
+	struct btrfs_worker_thread *worker = work->worker;
+	unsigned long flags;
+
+	if (test_and_set_bit(0, &work->flags))
+		goto out;
+
+	spin_lock_irqsave(&worker->lock, flags);
+	atomic_inc(&worker->num_pending);
+	list_add_tail(&work->list, &worker->pending);
+	check_busy_worker(worker);
+	spin_unlock_irqrestore(&worker->lock, flags);
+out:
+	return 0;
+}
+
+/*
+ * places a struct btrfs_work into the pending queue of one of the kthreads
+ */
+int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+{
+	struct btrfs_worker_thread *worker;
+	unsigned long flags;
+	int wake = 0;
+
+	/* don't requeue something already on a list */
+	if (test_and_set_bit(0, &work->flags))
+		goto out;
+
+	worker = find_worker(workers);
+
+	spin_lock_irqsave(&worker->lock, flags);
+	atomic_inc(&worker->num_pending);
+	check_busy_worker(worker);
+	list_add_tail(&work->list, &worker->pending);
+
+	/*
+	 * avoid calling into wake_up_process if this thread has already
+	 * been kicked
+	 */
+	if (!worker->working)
+		wake = 1;
+	worker->working = 1;
+
+	spin_unlock_irqrestore(&worker->lock, flags);
+
+	if (wake)
+		wake_up_process(worker->task);
+out:
+	return 0;
+}
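
btrfs_requeue_work() above supports a cooperative pattern for long-running
jobs: the work function does a bounded amount of work, puts itself back on
its thread's pending list, and returns so other items get a turn.  A hedged
sketch of such a work function; my_job, its fields and process_one_chunk()
are hypothetical names, not part of this commit:

	static void my_long_running_fn(struct btrfs_work *work)
	{
		/* hypothetical container with an embedded btrfs_work */
		struct my_job *job = container_of(work, struct my_job, work);

		/* do one bounded chunk; nonzero means more work remains */
		if (process_one_chunk(job)) {
			/* back onto the tail of this thread's queue */
			btrfs_requeue_work(work);
			return;
		}
		kfree(job);
	}

Requeueing from inside the handler works because worker_loop() clears bit 0
of work->flags before calling func(), so the test_and_set_bit() inside
btrfs_requeue_work() sees the item as off-list.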
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 000000000000..43e44d115dd1
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_ASYNC_THREAD_
+#define __BTRFS_ASYNC_THREAD_
+
+struct btrfs_worker_thread;
+
+/*
+ * This is similar to a workqueue, but it is meant to spread the operations
+ * across all available cpus instead of just the CPU that was used to
+ * queue the work.  There is also some batching introduced to try and
+ * cut down on context switches.
+ *
+ * By default threads are added on demand up to 2 * the number of cpus.
+ * Changing struct btrfs_workers->max_workers is one way to prevent
+ * demand creation of kthreads.
+ *
+ * the basic model of these worker threads is to embed a btrfs_work
+ * structure in your own data struct, and use container_of in a
+ * work function to get back to your data struct.
+ */
+struct btrfs_work {
+	/*
+	 * only func should be set to the function you want called
+	 * your work struct is passed as the only arg
+	 */
+	void (*func)(struct btrfs_work *work);
+
+	/*
+	 * flags should be set to zero.  It is used to make sure the
+	 * struct is only inserted once into the list.
+	 */
+	unsigned long flags;
+
+	/* don't touch these */
+	struct btrfs_worker_thread *worker;
+	struct list_head list;
+};
+
+struct btrfs_workers {
+	/* current number of running workers */
+	int num_workers;
+
+	/* max number of workers allowed.  changed by btrfs_start_workers */
+	int max_workers;
+
+	/* once a worker has this many requests or fewer, it is idle */
+	int idle_thresh;
+
+	/* list with all the work threads */
+	struct list_head worker_list;
+	struct list_head idle_list;
+
+	/* lock for finding the next worker thread to queue on */
+	spinlock_t lock;
+
+	/* extra name for this worker */
+	char *name;
+};
+
+int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+int btrfs_stop_workers(struct btrfs_workers *workers);
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
+int btrfs_requeue_work(struct btrfs_work *work);
+#endif
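
The comment block above describes the intended calling convention: embed a
struct btrfs_work inside your own job structure, point func at a handler,
and recover the container with container_of() when the handler runs.  A
minimal sketch of that pattern, assuming a hypothetical my_job type, handle()
helper, and an already-started pool (none of these appear in the commit):

	struct my_job {
		int payload;			/* per-job data */
		struct btrfs_work work;		/* embedded work item */
	};

	static void my_job_fn(struct btrfs_work *work)
	{
		/* map the embedded member back to the containing struct */
		struct my_job *job = container_of(work, struct my_job, work);

		handle(job->payload);		/* hypothetical helper */
		kfree(job);
	}

	/* submission side: func must be set and flags must start zeroed */
	job->work.func = my_job_fn;
	job->work.flags = 0;
	btrfs_queue_worker(&pool, &job->work);	/* pool: struct btrfs_workers */

The pool itself would be prepared once with btrfs_init_workers() followed by
btrfs_start_workers().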
diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c
new file mode 100644
index 000000000000..e8bf876db393
--- /dev/null
+++ b/fs/btrfs/bit-radix.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "bit-radix.h"
+
+#define BIT_ARRAY_BYTES 256
+#define BIT_RADIX_BITS_PER_ARRAY ((BIT_ARRAY_BYTES - sizeof(unsigned long)) * 8)
+
+extern struct kmem_cache *btrfs_bit_radix_cachep;
+int set_radix_bit(struct radix_tree_root *radix, unsigned long bit)
+{
+	unsigned long *bits;
+	unsigned long slot;
+	int bit_slot;
+	int ret;
+
+	slot = bit / BIT_RADIX_BITS_PER_ARRAY;
+	bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
+
+	bits = radix_tree_lookup(radix, slot);
+	if (!bits) {
+		bits = kmem_cache_alloc(btrfs_bit_radix_cachep, GFP_NOFS);
+		if (!bits)
+			return -ENOMEM;
+		memset(bits + 1, 0, BIT_ARRAY_BYTES - sizeof(unsigned long));
+		bits[0] = slot;
+		ret = radix_tree_insert(radix, slot, bits);
+		if (ret)
+			return ret;
+	}
+	ret = test_and_set_bit(bit_slot, bits + 1);
+	if (ret < 0)
+		ret = 1;
+	return ret;
+}
+
+int test_radix_bit(struct radix_tree_root *radix, unsigned long bit)
+{
+	unsigned long *bits;
+	unsigned long slot;
+	int bit_slot;
+
+	slot = bit / BIT_RADIX_BITS_PER_ARRAY;
+	bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
+
+	bits = radix_tree_lookup(radix, slot);
+	if (!bits)
+		return 0;
+	return test_bit(bit_slot, bits + 1);
+}
+
+int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit)
+{
+	unsigned long *bits;
+	unsigned long slot;
+	int bit_slot;
+	int i;
+	int empty = 1;
+
+	slot = bit / BIT_RADIX_BITS_PER_ARRAY;
+	bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
+
+	bits = radix_tree_lookup(radix, slot);
+	if (!bits)
+		return 0;
+	clear_bit(bit_slot, bits + 1);
+	for (i = 1; i < BIT_ARRAY_BYTES / sizeof(unsigned long); i++) {
+		if (bits[i]) {
+			empty = 0;
+			break;
+		}
+	}
+	if (empty) {
+		bits = radix_tree_delete(radix, slot);
+		BUG_ON(!bits);
+		kmem_cache_free(btrfs_bit_radix_cachep, bits);
+	}
+	return 0;
+}
+
+int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
+			 unsigned long start, int nr)
+{
+	unsigned long *bits;
+	unsigned long *gang[4];
+	int found;
+	int ret;
+	int i;
+	int total_found = 0;
+	unsigned long slot;
+
+	slot = start / BIT_RADIX_BITS_PER_ARRAY;
+	ret = radix_tree_gang_lookup(radix, (void **)gang, slot,
+				     ARRAY_SIZE(gang));
+	found = start % BIT_RADIX_BITS_PER_ARRAY;
+	for (i = 0; i < ret && nr > 0; i++) {
+		bits = gang[i];
+		while (nr > 0) {
+			found = find_next_bit(bits + 1,
+					      BIT_RADIX_BITS_PER_ARRAY,
+					      found);
+			if (found < BIT_RADIX_BITS_PER_ARRAY) {
+				*retbits = bits[0] *
+					BIT_RADIX_BITS_PER_ARRAY + found;
+				retbits++;
+				nr--;
+				total_found++;
+				found++;
+			} else
+				break;
+		}
+		found = 0;
+	}
+	return total_found;
+}
diff --git a/fs/btrfs/bit-radix.h b/fs/btrfs/bit-radix.h
new file mode 100644
index 000000000000..c100f54d5c32
--- /dev/null
+++ b/fs/btrfs/bit-radix.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BIT_RADIX__
+#define __BIT_RADIX__
+#include <linux/radix-tree.h>
+
+int set_radix_bit(struct radix_tree_root *radix, unsigned long bit);
+int test_radix_bit(struct radix_tree_root *radix, unsigned long bit);
+int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit);
+int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
+			 unsigned long start, int nr);
+
+static inline void init_bit_radix(struct radix_tree_root *radix)
+{
+	INIT_RADIX_TREE(radix, GFP_NOFS);
+}
+#endif
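
Taken together, the five calls above behave like a sparse bitmap over
arbitrary unsigned long indexes, with 256-byte bit arrays allocated on demand
and freed again once their last bit is cleared.  An illustrative usage sketch
(the variable names here are made up, not from the commit):

	struct radix_tree_root bits;
	unsigned long found[4];
	int n;

	init_bit_radix(&bits);
	set_radix_bit(&bits, 123456);	/* allocates the covering bit array */
	if (test_radix_bit(&bits, 123456)) {
		/* gather up to 4 set bits, scanning upward from bit 0 */
		n = find_first_radix_bit(&bits, found, 0, 4);
		/* here n == 1 and found[0] == 123456 */
	}
	clear_radix_bit(&bits, 123456);	/* frees the now-empty bit array */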
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 000000000000..0577fda2168a
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_I__
+#define __BTRFS_I__
+
+#include "extent_map.h"
+#include "extent_io.h"
+#include "ordered-data.h"
+
+/* in memory btrfs inode */
+struct btrfs_inode {
+	struct btrfs_root *root;
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_key location;
+	struct extent_map_tree extent_tree;
+	struct extent_io_tree io_tree;
+	struct extent_io_tree io_failure_tree;
+	struct mutex csum_mutex;
+	struct mutex extent_mutex;
+	struct mutex log_mutex;
+	struct inode vfs_inode;
+	struct btrfs_ordered_inode_tree ordered_tree;
+
+	struct posix_acl *i_acl;
+	struct posix_acl *i_default_acl;
+
+	/* for keeping track of orphaned inodes */
+	struct list_head i_orphan;
+
+	struct list_head delalloc_inodes;
+
+	/* full 64 bit generation number */
+	u64 generation;
+
+	/*
+	 * transid of the trans_handle that last modified this inode
+	 */
+	u64 last_trans;
+	/*
+	 * transid that last logged this inode
+	 */
+	u64 logged_trans;
+
+	/* trans that last made a change that should be fully fsync'd */
+	u64 log_dirty_trans;
+	u64 delalloc_bytes;
+	u64 disk_i_size;
+	u32 flags;
+
+	/*
+	 * if this is a directory then index_cnt is the counter for the index
+	 * number for new files that are created
+	 */
+	u64 index_cnt;
+};
+
+static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
+{
+	return container_of(inode, struct btrfs_inode, vfs_inode);
+}
+
+static inline void btrfs_i_size_write(struct inode *inode, u64 size)
+{
+	inode->i_size = size;
+	BTRFS_I(inode)->disk_i_size = size;
+}
+
+
+#endif
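
The vfs_inode member above is the usual kernel embedding idiom: the VFS only
ever handles a struct inode pointer, and BTRFS_I() recovers the surrounding
btrfs_inode with container_of().  A hedged sketch of how a super_operations
alloc_inode hook hands out the embedded member; this is illustrative only
(the real allocation in this commit's inode.c goes through a dedicated
kmem_cache rather than kzalloc):

	static struct inode *sketch_alloc_inode(struct super_block *sb)
	{
		struct btrfs_inode *bi = kzalloc(sizeof(*bi), GFP_KERNEL);

		if (!bi)
			return NULL;
		return &bi->vfs_inode;	/* the VFS sees only this member */
	}

Later, any inode callback can get the container back with BTRFS_I(inode).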
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
new file mode 100644
index 000000000000..b0ed1887d9b1
--- /dev/null
+++ b/fs/btrfs/compat.h
@@ -0,0 +1,60 @@
+#ifndef _COMPAT_H_
+#define _COMPAT_H_
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,26)
+#define trylock_page(page) (!TestSetPageLocked(page))
+#endif
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27)
+static inline struct dentry *d_obtain_alias(struct inode *inode)
+{
+	struct dentry *d;
+
+	if (!inode)
+		return NULL;
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	d = d_alloc_anon(inode);
+	if (!d)
+		iput(inode);
+	return d;
+}
+#endif
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+static inline void btrfs_drop_nlink(struct inode *inode)
+{
+	inode->i_nlink--;
+}
+
+static inline void btrfs_inc_nlink(struct inode *inode)
+{
+	inode->i_nlink++;
+}
+#else
+# define btrfs_drop_nlink(inode) drop_nlink(inode)
+# define btrfs_inc_nlink(inode) inc_nlink(inode)
+#endif
+
+/*
+ * Even if AppArmor isn't enabled, it still has different prototypes.
+ * Add more distro/version pairs here to declare which has AppArmor applied.
+ */
+#if defined(CONFIG_SUSE_KERNEL)
+# if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
+#  define REMOVE_SUID_PATH 1
+# endif
+#endif
+
+/*
+ * catch any other distros that have patched in apparmor.  This isn't
+ * 100% reliable because it won't catch people that hand compile their
+ * own distro kernels without apparmor compiled in.  But, it is better
+ * than nothing.
+ */
+#ifdef CONFIG_SECURITY_APPARMOR
+# define REMOVE_SUID_PATH 1
+#endif
+
+#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
new file mode 100644
index 000000000000..bf6c12e85730
--- /dev/null
+++ b/fs/btrfs/crc32c.h
@@ -0,0 +1,108 @@
1#ifndef __BTRFS_CRC32C__
2#define __BTRFS_CRC32C__
3#include <asm/byteorder.h>
4#include <linux/crc32c.h>
5#include <linux/version.h>
6
7/* #define CONFIG_BTRFS_HW_SUM 1 */
8
9#ifdef CONFIG_BTRFS_HW_SUM
10#ifdef CONFIG_X86
11/*
12 * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
13 * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
14 * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
15 * http://www.intel.com/products/processor/manuals/
16 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
17 * Volume 2A: Instruction Set Reference, A-M
18 */
19
20#include <asm/cpufeature.h>
21#include <asm/processor.h>
22
23#define X86_FEATURE_XMM4_2 (4*32+20) /* Streaming SIMD Extensions-4.2 */
24#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
25
26#ifdef CONFIG_X86_64
27#define REX_PRE "0x48, "
28#define SCALE_F 8
29#else
30#define REX_PRE
31#define SCALE_F 4
32#endif
33
34static inline u32 btrfs_crc32c_le_hw_byte(u32 crc, unsigned char const *data,
35 size_t length)
36{
37 while (length--) {
38 __asm__ __volatile__(
39 ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
40 :"=S"(crc)
41 :"0"(crc), "c"(*data)
42 );
43 data++;
44 }
45
46 return crc;
47}
48
49static inline u32 __pure btrfs_crc32c_le_hw(u32 crc, unsigned char const *p,
50 size_t len)
51{
52 unsigned int iquotient = len / SCALE_F;
53 unsigned int iremainder = len % SCALE_F;
54#ifdef CONFIG_X86_64
55 u64 *ptmp = (u64 *)p;
56#else
57 u32 *ptmp = (u32 *)p;
58#endif
59
60 while (iquotient--) {
61 __asm__ __volatile__(
62 ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
63 :"=S"(crc)
64 :"0"(crc), "c"(*ptmp)
65 );
66 ptmp++;
67 }
68
69 if (iremainder)
70 crc = btrfs_crc32c_le_hw_byte(crc, (unsigned char *)ptmp,
71 iremainder);
72
73 return crc;
74}
75#endif /* CONFIG_BTRFS_HW_SUM */
76
77static inline u32 __btrfs_crc32c(u32 crc, unsigned char const *address,
78 size_t len)
79{
80#ifdef CONFIG_BTRFS_HW_SUM
81 if (cpu_has_xmm4_2)
82 return btrfs_crc32c_le_hw(crc, address, len);
83#endif
84 return crc32c_le(crc, address, len);
85}
86
87#else
88
89#define __btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
90
91#endif /* CONFIG_X86 */
92
93/**
 94 * The implementation of crc32c_le() changed in linux-2.6.23,
 95 * and as of v0.13 btrfs-progs uses the latest version.
 96 * We must work around older implementations of crc32c_le()
 97 * found on older kernel versions.
97 * found on older kernel versions.
98 */
99#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
100#define btrfs_crc32c(seed, data, length) \
101 __cpu_to_le32( __btrfs_crc32c( __le32_to_cpu(seed), \
102 (unsigned char const *)data, length) )
103#else
104#define btrfs_crc32c(seed, data, length) \
105 __btrfs_crc32c(seed, (unsigned char const *)data, length)
106#endif
107#endif
108
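
Editor's note: a minimal usage sketch for the btrfs_crc32c() macro defined above, assuming the conventional all-ones CRC32C seed; example_csum() is illustrative, not a function from this patch.

static u32 example_csum(const void *data, size_t len)
{
	u32 crc = ~(u32)0;	/* standard CRC32C initial seed */

	/* dispatches to the SSE4.2 path or crc32c_le() as configured */
	return btrfs_crc32c(crc, data, len);
}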
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
new file mode 100644
index 000000000000..50aea8cb653a
--- /dev/null
+++ b/fs/btrfs/ctree.c
@@ -0,0 +1,3450 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "transaction.h"
23#include "print-tree.h"
24#include "locking.h"
25
26static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
27 *root, struct btrfs_path *path, int level);
28static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29 *root, struct btrfs_key *ins_key,
30 struct btrfs_path *path, int data_size, int extend);
31static int push_node_left(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, struct extent_buffer *dst,
33 struct extent_buffer *src, int empty);
34static int balance_node_right(struct btrfs_trans_handle *trans,
35 struct btrfs_root *root,
36 struct extent_buffer *dst_buf,
37 struct extent_buffer *src_buf);
38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39 struct btrfs_path *path, int level, int slot);
40
41inline void btrfs_init_path(struct btrfs_path *p)
42{
43 memset(p, 0, sizeof(*p));
44}
45
46struct btrfs_path *btrfs_alloc_path(void)
47{
48 struct btrfs_path *path;
49 path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
50 if (path) {
51 btrfs_init_path(path);
52 path->reada = 1;
53 }
54 return path;
55}
56
57void btrfs_free_path(struct btrfs_path *p)
58{
59 btrfs_release_path(NULL, p);
60 kmem_cache_free(btrfs_path_cachep, p);
61}
62
63void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
64{
65 int i;
66
67 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
68 p->slots[i] = 0;
69 if (!p->nodes[i])
70 continue;
71 if (p->locks[i]) {
72 btrfs_tree_unlock(p->nodes[i]);
73 p->locks[i] = 0;
74 }
75 free_extent_buffer(p->nodes[i]);
76 p->nodes[i] = NULL;
77 }
78}
79
80struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
81{
82 struct extent_buffer *eb;
83 spin_lock(&root->node_lock);
84 eb = root->node;
85 extent_buffer_get(eb);
86 spin_unlock(&root->node_lock);
87 return eb;
88}
89
90struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
91{
92 struct extent_buffer *eb;
93
94 while(1) {
95 eb = btrfs_root_node(root);
96 btrfs_tree_lock(eb);
97
98 spin_lock(&root->node_lock);
99 if (eb == root->node) {
100 spin_unlock(&root->node_lock);
101 break;
102 }
103 spin_unlock(&root->node_lock);
104
105 btrfs_tree_unlock(eb);
106 free_extent_buffer(eb);
107 }
108 return eb;
109}
110
111static void add_root_to_dirty_list(struct btrfs_root *root)
112{
113 if (root->track_dirty && list_empty(&root->dirty_list)) {
114 list_add(&root->dirty_list,
115 &root->fs_info->dirty_cowonly_roots);
116 }
117}
118
119int btrfs_copy_root(struct btrfs_trans_handle *trans,
120 struct btrfs_root *root,
121 struct extent_buffer *buf,
122 struct extent_buffer **cow_ret, u64 new_root_objectid)
123{
124 struct extent_buffer *cow;
125 u32 nritems;
126 int ret = 0;
127 int level;
128 struct btrfs_root *new_root;
129
130 new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
131 if (!new_root)
132 return -ENOMEM;
133
134 memcpy(new_root, root, sizeof(*new_root));
135 new_root->root_key.objectid = new_root_objectid;
136
137 WARN_ON(root->ref_cows && trans->transid !=
138 root->fs_info->running_transaction->transid);
139 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
140
141 level = btrfs_header_level(buf);
142 nritems = btrfs_header_nritems(buf);
143
144 cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
145 new_root_objectid, trans->transid,
146 level, buf->start, 0);
147 if (IS_ERR(cow)) {
148 kfree(new_root);
149 return PTR_ERR(cow);
150 }
151
152 copy_extent_buffer(cow, buf, 0, 0, cow->len);
153 btrfs_set_header_bytenr(cow, cow->start);
154 btrfs_set_header_generation(cow, trans->transid);
155 btrfs_set_header_owner(cow, new_root_objectid);
156 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
157
158 WARN_ON(btrfs_header_generation(buf) > trans->transid);
159 ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
160 kfree(new_root);
161
162 if (ret)
163 return ret;
164
165 btrfs_mark_buffer_dirty(cow);
166 *cow_ret = cow;
167 return 0;
168}
169
170int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
171 struct btrfs_root *root,
172 struct extent_buffer *buf,
173 struct extent_buffer *parent, int parent_slot,
174 struct extent_buffer **cow_ret,
175 u64 search_start, u64 empty_size,
176 u64 prealloc_dest)
177{
178 u64 parent_start;
179 struct extent_buffer *cow;
180 u32 nritems;
181 int ret = 0;
182 int different_trans = 0;
183 int level;
184 int unlock_orig = 0;
185
186 if (*cow_ret == buf)
187 unlock_orig = 1;
188
189 WARN_ON(!btrfs_tree_locked(buf));
190
191 if (parent)
192 parent_start = parent->start;
193 else
194 parent_start = 0;
195
196 WARN_ON(root->ref_cows && trans->transid !=
197 root->fs_info->running_transaction->transid);
198 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
199
200 level = btrfs_header_level(buf);
201 nritems = btrfs_header_nritems(buf);
202
203 if (prealloc_dest) {
204 struct btrfs_key ins;
205
206 ins.objectid = prealloc_dest;
207 ins.offset = buf->len;
208 ins.type = BTRFS_EXTENT_ITEM_KEY;
209
210 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
211 root->root_key.objectid,
212 trans->transid, level, 0,
213 &ins);
214 BUG_ON(ret);
215 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
216 buf->len);
217 } else {
218 cow = btrfs_alloc_free_block(trans, root, buf->len,
219 parent_start,
220 root->root_key.objectid,
221 trans->transid, level,
222 search_start, empty_size);
223 }
224 if (IS_ERR(cow))
225 return PTR_ERR(cow);
226
227 copy_extent_buffer(cow, buf, 0, 0, cow->len);
228 btrfs_set_header_bytenr(cow, cow->start);
229 btrfs_set_header_generation(cow, trans->transid);
230 btrfs_set_header_owner(cow, root->root_key.objectid);
231 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
232
233 WARN_ON(btrfs_header_generation(buf) > trans->transid);
234 if (btrfs_header_generation(buf) != trans->transid) {
235 u32 nr_extents;
236 different_trans = 1;
237 ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
238 if (ret)
239 return ret;
240
241 ret = btrfs_cache_ref(trans, root, buf, nr_extents);
242 WARN_ON(ret);
243 } else {
244 ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
245 if (ret)
246 return ret;
247 clean_tree_block(trans, root, buf);
248 }
249
250 if (buf == root->node) {
251 WARN_ON(parent && parent != buf);
252
253 spin_lock(&root->node_lock);
254 root->node = cow;
255 extent_buffer_get(cow);
256 spin_unlock(&root->node_lock);
257
258 if (buf != root->commit_root) {
259 btrfs_free_extent(trans, root, buf->start,
260 buf->len, buf->start,
261 root->root_key.objectid,
262 btrfs_header_generation(buf),
263 0, 0, 1);
264 }
265 free_extent_buffer(buf);
266 add_root_to_dirty_list(root);
267 } else {
268 btrfs_set_node_blockptr(parent, parent_slot,
269 cow->start);
270 WARN_ON(trans->transid == 0);
271 btrfs_set_node_ptr_generation(parent, parent_slot,
272 trans->transid);
273 btrfs_mark_buffer_dirty(parent);
274 WARN_ON(btrfs_header_generation(parent) != trans->transid);
275 btrfs_free_extent(trans, root, buf->start, buf->len,
276 parent_start, btrfs_header_owner(parent),
277 btrfs_header_generation(parent), 0, 0, 1);
278 }
279 if (unlock_orig)
280 btrfs_tree_unlock(buf);
281 free_extent_buffer(buf);
282 btrfs_mark_buffer_dirty(cow);
283 *cow_ret = cow;
284 return 0;
285}
286
287int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
288 struct btrfs_root *root, struct extent_buffer *buf,
289 struct extent_buffer *parent, int parent_slot,
290 struct extent_buffer **cow_ret, u64 prealloc_dest)
291{
292 u64 search_start;
293 u64 header_trans;
294 int ret;
295
296 if (trans->transaction != root->fs_info->running_transaction) {
297 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
298 root->fs_info->running_transaction->transid);
299 WARN_ON(1);
300 }
301 if (trans->transid != root->fs_info->generation) {
302 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
303 root->fs_info->generation);
304 WARN_ON(1);
305 }
306
307 header_trans = btrfs_header_generation(buf);
308 spin_lock(&root->fs_info->hash_lock);
309 if (header_trans == trans->transid &&
310 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
311 *cow_ret = buf;
312 spin_unlock(&root->fs_info->hash_lock);
313 WARN_ON(prealloc_dest);
314 return 0;
315 }
316 spin_unlock(&root->fs_info->hash_lock);
317 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
318 ret = __btrfs_cow_block(trans, root, buf, parent,
319 parent_slot, cow_ret, search_start, 0,
320 prealloc_dest);
321 return ret;
322}
323
324static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
325{
326 if (blocknr < other && other - (blocknr + blocksize) < 32768)
327 return 1;
328 if (blocknr > other && blocknr - (other + blocksize) < 32768)
329 return 1;
330 return 0;
331}
332
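
Editor's note: a worked example for close_blocks(), with made-up numbers. Given a 4096-byte blocksize, blocknr 100000 and other 120000 count as close because 120000 - (100000 + 4096) = 15904 < 32768, so the defragmentation pass below leaves such neighbours in place rather than COWing them to new locations.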
333/*
334 * compare two keys in a memcmp fashion
335 */
336static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
337{
338 struct btrfs_key k1;
339
340 btrfs_disk_key_to_cpu(&k1, disk);
341
342 if (k1.objectid > k2->objectid)
343 return 1;
344 if (k1.objectid < k2->objectid)
345 return -1;
346 if (k1.type > k2->type)
347 return 1;
348 if (k1.type < k2->type)
349 return -1;
350 if (k1.offset > k2->offset)
351 return 1;
352 if (k1.offset < k2->offset)
353 return -1;
354 return 0;
355}
356
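
Editor's note: a worked illustration of the (objectid, type, offset) ordering that comp_keys() implements, with made-up values. {256, 1, 0} sorts before {256, 2, 0}, which sorts before {256, 2, 5}, and any key with objectid 256 sorts before any key with objectid 257: objectid is the most significant field and offset the least, as if the three fields were compared as one big-endian integer.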
357
358int btrfs_realloc_node(struct btrfs_trans_handle *trans,
359 struct btrfs_root *root, struct extent_buffer *parent,
360 int start_slot, int cache_only, u64 *last_ret,
361 struct btrfs_key *progress)
362{
363 struct extent_buffer *cur;
364 u64 blocknr;
365 u64 gen;
366 u64 search_start = *last_ret;
367 u64 last_block = 0;
368 u64 other;
369 u32 parent_nritems;
370 int end_slot;
371 int i;
372 int err = 0;
373 int parent_level;
374 int uptodate;
375 u32 blocksize;
376 int progress_passed = 0;
377 struct btrfs_disk_key disk_key;
378
379 parent_level = btrfs_header_level(parent);
380 if (cache_only && parent_level != 1)
381 return 0;
382
383 if (trans->transaction != root->fs_info->running_transaction) {
384 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
385 root->fs_info->running_transaction->transid);
386 WARN_ON(1);
387 }
388 if (trans->transid != root->fs_info->generation) {
389 printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
390 root->fs_info->generation);
391 WARN_ON(1);
392 }
393
394 parent_nritems = btrfs_header_nritems(parent);
395 blocksize = btrfs_level_size(root, parent_level - 1);
396 end_slot = parent_nritems;
397
398 if (parent_nritems == 1)
399 return 0;
400
401 for (i = start_slot; i < end_slot; i++) {
402 int close = 1;
403
404 if (!parent->map_token) {
405 map_extent_buffer(parent,
406 btrfs_node_key_ptr_offset(i),
407 sizeof(struct btrfs_key_ptr),
408 &parent->map_token, &parent->kaddr,
409 &parent->map_start, &parent->map_len,
410 KM_USER1);
411 }
412 btrfs_node_key(parent, &disk_key, i);
413 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
414 continue;
415
416 progress_passed = 1;
417 blocknr = btrfs_node_blockptr(parent, i);
418 gen = btrfs_node_ptr_generation(parent, i);
419 if (last_block == 0)
420 last_block = blocknr;
421
422 if (i > 0) {
423 other = btrfs_node_blockptr(parent, i - 1);
424 close = close_blocks(blocknr, other, blocksize);
425 }
426 if (!close && i < end_slot - 2) {
427 other = btrfs_node_blockptr(parent, i + 1);
428 close = close_blocks(blocknr, other, blocksize);
429 }
430 if (close) {
431 last_block = blocknr;
432 continue;
433 }
434 if (parent->map_token) {
435 unmap_extent_buffer(parent, parent->map_token,
436 KM_USER1);
437 parent->map_token = NULL;
438 }
439
440 cur = btrfs_find_tree_block(root, blocknr, blocksize);
441 if (cur)
442 uptodate = btrfs_buffer_uptodate(cur, gen);
443 else
444 uptodate = 0;
445 if (!cur || !uptodate) {
446 if (cache_only) {
447 free_extent_buffer(cur);
448 continue;
449 }
450 if (!cur) {
451 cur = read_tree_block(root, blocknr,
452 blocksize, gen);
453 } else if (!uptodate) {
454 btrfs_read_buffer(cur, gen);
455 }
456 }
457 if (search_start == 0)
458 search_start = last_block;
459
460 btrfs_tree_lock(cur);
461 err = __btrfs_cow_block(trans, root, cur, parent, i,
462 &cur, search_start,
463 min(16 * blocksize,
464 (end_slot - i) * blocksize), 0);
465 if (err) {
466 btrfs_tree_unlock(cur);
467 free_extent_buffer(cur);
468 break;
469 }
470 search_start = cur->start;
471 last_block = cur->start;
472 *last_ret = search_start;
473 btrfs_tree_unlock(cur);
474 free_extent_buffer(cur);
475 }
476 if (parent->map_token) {
477 unmap_extent_buffer(parent, parent->map_token,
478 KM_USER1);
479 parent->map_token = NULL;
480 }
481 return err;
482}
483
484/*
485 * The leaf data grows from end-to-front in the node.
 486 * This returns the offset of the start of the last item,
 487 * which is the low end of the leaf data stack.
488 */
489static inline unsigned int leaf_data_end(struct btrfs_root *root,
490 struct extent_buffer *leaf)
491{
492 u32 nr = btrfs_header_nritems(leaf);
493 if (nr == 0)
494 return BTRFS_LEAF_DATA_SIZE(root);
495 return btrfs_item_offset_nr(leaf, nr - 1);
496}
497
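
Editor's note: a worked picture of the end-to-front layout that leaf_data_end() relies on, with made-up sizes. In a 4096-byte data area holding two items whose data is 100 and 50 bytes, item 0's data occupies [3996, 4096) and item 1's occupies [3946, 3996), so leaf_data_end() returns 3946: the lowest used offset, where the next item's data would be placed.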
498static int check_node(struct btrfs_root *root, struct btrfs_path *path,
499 int level)
500{
501 struct extent_buffer *parent = NULL;
502 struct extent_buffer *node = path->nodes[level];
503 struct btrfs_disk_key parent_key;
504 struct btrfs_disk_key node_key;
505 int parent_slot;
506 int slot;
507 struct btrfs_key cpukey;
508 u32 nritems = btrfs_header_nritems(node);
509
510 if (path->nodes[level + 1])
511 parent = path->nodes[level + 1];
512
513 slot = path->slots[level];
514 BUG_ON(nritems == 0);
515 if (parent) {
516 parent_slot = path->slots[level + 1];
517 btrfs_node_key(parent, &parent_key, parent_slot);
518 btrfs_node_key(node, &node_key, 0);
519 BUG_ON(memcmp(&parent_key, &node_key,
520 sizeof(struct btrfs_disk_key)));
521 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
522 btrfs_header_bytenr(node));
523 }
524 BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
525 if (slot != 0) {
526 btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
527 btrfs_node_key(node, &node_key, slot);
528 BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
529 }
530 if (slot < nritems - 1) {
531 btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
532 btrfs_node_key(node, &node_key, slot);
533 BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
534 }
535 return 0;
536}
537
538static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
539 int level)
540{
541 struct extent_buffer *leaf = path->nodes[level];
542 struct extent_buffer *parent = NULL;
543 int parent_slot;
544 struct btrfs_key cpukey;
545 struct btrfs_disk_key parent_key;
546 struct btrfs_disk_key leaf_key;
547 int slot = path->slots[0];
548
549 u32 nritems = btrfs_header_nritems(leaf);
550
551 if (path->nodes[level + 1])
552 parent = path->nodes[level + 1];
553
554 if (nritems == 0)
555 return 0;
556
557 if (parent) {
558 parent_slot = path->slots[level + 1];
559 btrfs_node_key(parent, &parent_key, parent_slot);
560 btrfs_item_key(leaf, &leaf_key, 0);
561
562 BUG_ON(memcmp(&parent_key, &leaf_key,
563 sizeof(struct btrfs_disk_key)));
564 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
565 btrfs_header_bytenr(leaf));
566 }
567#if 0
568 for (i = 0; nritems > 1 && i < nritems - 2; i++) {
569 btrfs_item_key_to_cpu(leaf, &cpukey, i + 1);
570 btrfs_item_key(leaf, &leaf_key, i);
571 if (comp_keys(&leaf_key, &cpukey) >= 0) {
572 btrfs_print_leaf(root, leaf);
573 printk("slot %d offset bad key\n", i);
574 BUG_ON(1);
575 }
576 if (btrfs_item_offset_nr(leaf, i) !=
577 btrfs_item_end_nr(leaf, i + 1)) {
578 btrfs_print_leaf(root, leaf);
579 printk("slot %d offset bad\n", i);
580 BUG_ON(1);
581 }
582 if (i == 0) {
583 if (btrfs_item_offset_nr(leaf, i) +
584 btrfs_item_size_nr(leaf, i) !=
585 BTRFS_LEAF_DATA_SIZE(root)) {
586 btrfs_print_leaf(root, leaf);
587 printk("slot %d first offset bad\n", i);
588 BUG_ON(1);
589 }
590 }
591 }
592 if (nritems > 0) {
593 if (btrfs_item_size_nr(leaf, nritems - 1) > 4096) {
594 btrfs_print_leaf(root, leaf);
595 printk("slot %d bad size \n", nritems - 1);
596 BUG_ON(1);
597 }
598 }
599#endif
600 if (slot != 0 && slot < nritems - 1) {
601 btrfs_item_key(leaf, &leaf_key, slot);
602 btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
603 if (comp_keys(&leaf_key, &cpukey) <= 0) {
604 btrfs_print_leaf(root, leaf);
605 printk("slot %d offset bad key\n", slot);
606 BUG_ON(1);
607 }
608 if (btrfs_item_offset_nr(leaf, slot - 1) !=
609 btrfs_item_end_nr(leaf, slot)) {
610 btrfs_print_leaf(root, leaf);
611 printk("slot %d offset bad\n", slot);
612 BUG_ON(1);
613 }
614 }
615 if (slot < nritems - 1) {
616 btrfs_item_key(leaf, &leaf_key, slot);
617 btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
618 BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
619 if (btrfs_item_offset_nr(leaf, slot) !=
620 btrfs_item_end_nr(leaf, slot + 1)) {
621 btrfs_print_leaf(root, leaf);
622 printk("slot %d offset bad\n", slot);
623 BUG_ON(1);
624 }
625 }
626 BUG_ON(btrfs_item_offset_nr(leaf, 0) +
627 btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
628 return 0;
629}
630
631static int noinline check_block(struct btrfs_root *root,
632 struct btrfs_path *path, int level)
633{
634 u64 found_start;
635 return 0;
636 if (btrfs_header_level(path->nodes[level]) != level)
637 printk("warning: bad level %Lu wanted %d found %d\n",
638 path->nodes[level]->start, level,
639 btrfs_header_level(path->nodes[level]));
640 found_start = btrfs_header_bytenr(path->nodes[level]);
641 if (found_start != path->nodes[level]->start) {
 642 printk("warning: bad bytenr %Lu found %Lu\n",
643 path->nodes[level]->start, found_start);
644 }
645#if 0
646 struct extent_buffer *buf = path->nodes[level];
647
648 if (memcmp_extent_buffer(buf, root->fs_info->fsid,
649 (unsigned long)btrfs_header_fsid(buf),
650 BTRFS_FSID_SIZE)) {
651 printk("warning bad block %Lu\n", buf->start);
652 return 1;
653 }
654#endif
655 if (level == 0)
656 return check_leaf(root, path, level);
657 return check_node(root, path, level);
658}
659
660/*
661 * search for key in the extent_buffer. The items start at offset p,
662 * and they are item_size apart. There are 'max' items in p.
663 *
664 * the slot in the array is returned via slot, and it points to
665 * the place where you would insert key if it is not found in
666 * the array.
667 *
668 * slot may point to max if the key is bigger than all of the keys
669 */
670static noinline int generic_bin_search(struct extent_buffer *eb,
671 unsigned long p,
672 int item_size, struct btrfs_key *key,
673 int max, int *slot)
674{
675 int low = 0;
676 int high = max;
677 int mid;
678 int ret;
679 struct btrfs_disk_key *tmp = NULL;
680 struct btrfs_disk_key unaligned;
681 unsigned long offset;
682 char *map_token = NULL;
683 char *kaddr = NULL;
684 unsigned long map_start = 0;
685 unsigned long map_len = 0;
686 int err;
687
 688 while (low < high) {
689 mid = (low + high) / 2;
690 offset = p + mid * item_size;
691
692 if (!map_token || offset < map_start ||
693 (offset + sizeof(struct btrfs_disk_key)) >
694 map_start + map_len) {
695 if (map_token) {
696 unmap_extent_buffer(eb, map_token, KM_USER0);
697 map_token = NULL;
698 }
699 err = map_extent_buffer(eb, offset,
700 sizeof(struct btrfs_disk_key),
701 &map_token, &kaddr,
702 &map_start, &map_len, KM_USER0);
703
704 if (!err) {
705 tmp = (struct btrfs_disk_key *)(kaddr + offset -
706 map_start);
707 } else {
708 read_extent_buffer(eb, &unaligned,
709 offset, sizeof(unaligned));
710 tmp = &unaligned;
711 }
712
713 } else {
714 tmp = (struct btrfs_disk_key *)(kaddr + offset -
715 map_start);
716 }
717 ret = comp_keys(tmp, key);
718
719 if (ret < 0)
720 low = mid + 1;
721 else if (ret > 0)
722 high = mid;
723 else {
724 *slot = mid;
725 if (map_token)
726 unmap_extent_buffer(eb, map_token, KM_USER0);
727 return 0;
728 }
729 }
730 *slot = low;
731 if (map_token)
732 unmap_extent_buffer(eb, map_token, KM_USER0);
733 return 1;
734}
735
736/*
737 * simple bin_search frontend that does the right thing for
738 * leaves vs nodes
739 */
740static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
741 int level, int *slot)
742{
743 if (level == 0) {
744 return generic_bin_search(eb,
745 offsetof(struct btrfs_leaf, items),
746 sizeof(struct btrfs_item),
747 key, btrfs_header_nritems(eb),
748 slot);
749 } else {
750 return generic_bin_search(eb,
751 offsetof(struct btrfs_node, ptrs),
752 sizeof(struct btrfs_key_ptr),
753 key, btrfs_header_nritems(eb),
754 slot);
755 }
756 return -1;
757}
758
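
Editor's note: the insertion-slot contract documented above is easiest to see on a plain array. This is a minimal model of generic_bin_search(), ignoring the extent_buffer mapping machinery; example_bin_search() and the int keys are illustrative only.

static int example_bin_search(const int *keys, int max, int key, int *slot)
{
	int low = 0;
	int high = max;

	while (low < high) {
		int mid = (low + high) / 2;

		if (keys[mid] < key)
			low = mid + 1;
		else if (keys[mid] > key)
			high = mid;
		else {
			*slot = mid;	/* exact match */
			return 0;
		}
	}
	*slot = low;	/* insertion point; may equal max */
	return 1;
}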
759static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
760 struct extent_buffer *parent, int slot)
761{
762 int level = btrfs_header_level(parent);
763 if (slot < 0)
764 return NULL;
765 if (slot >= btrfs_header_nritems(parent))
766 return NULL;
767
768 BUG_ON(level == 0);
769
770 return read_tree_block(root, btrfs_node_blockptr(parent, slot),
771 btrfs_level_size(root, level - 1),
772 btrfs_node_ptr_generation(parent, slot));
773}
774
775static noinline int balance_level(struct btrfs_trans_handle *trans,
776 struct btrfs_root *root,
777 struct btrfs_path *path, int level)
778{
779 struct extent_buffer *right = NULL;
780 struct extent_buffer *mid;
781 struct extent_buffer *left = NULL;
782 struct extent_buffer *parent = NULL;
783 int ret = 0;
784 int wret;
785 int pslot;
786 int orig_slot = path->slots[level];
787 int err_on_enospc = 0;
788 u64 orig_ptr;
789
790 if (level == 0)
791 return 0;
792
793 mid = path->nodes[level];
794 WARN_ON(!path->locks[level]);
795 WARN_ON(btrfs_header_generation(mid) != trans->transid);
796
797 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
798
799 if (level < BTRFS_MAX_LEVEL - 1)
800 parent = path->nodes[level + 1];
801 pslot = path->slots[level + 1];
802
803 /*
804 * deal with the case where there is only one pointer in the root
805 * by promoting the node below to a root
806 */
807 if (!parent) {
808 struct extent_buffer *child;
809
810 if (btrfs_header_nritems(mid) != 1)
811 return 0;
812
813 /* promote the child to a root */
814 child = read_node_slot(root, mid, 0);
 815 BUG_ON(!child);
 816 btrfs_tree_lock(child);
817 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
818 BUG_ON(ret);
819
820 spin_lock(&root->node_lock);
821 root->node = child;
822 spin_unlock(&root->node_lock);
823
824 ret = btrfs_update_extent_ref(trans, root, child->start,
825 mid->start, child->start,
826 root->root_key.objectid,
827 trans->transid, level - 1, 0);
828 BUG_ON(ret);
829
830 add_root_to_dirty_list(root);
831 btrfs_tree_unlock(child);
832 path->locks[level] = 0;
833 path->nodes[level] = NULL;
834 clean_tree_block(trans, root, mid);
835 btrfs_tree_unlock(mid);
836 /* once for the path */
837 free_extent_buffer(mid);
838 ret = btrfs_free_extent(trans, root, mid->start, mid->len,
839 mid->start, root->root_key.objectid,
840 btrfs_header_generation(mid), 0, 0, 1);
841 /* once for the root ptr */
842 free_extent_buffer(mid);
843 return ret;
844 }
845 if (btrfs_header_nritems(mid) >
846 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
847 return 0;
848
849 if (btrfs_header_nritems(mid) < 2)
850 err_on_enospc = 1;
851
852 left = read_node_slot(root, parent, pslot - 1);
853 if (left) {
854 btrfs_tree_lock(left);
855 wret = btrfs_cow_block(trans, root, left,
856 parent, pslot - 1, &left, 0);
857 if (wret) {
858 ret = wret;
859 goto enospc;
860 }
861 }
862 right = read_node_slot(root, parent, pslot + 1);
863 if (right) {
864 btrfs_tree_lock(right);
865 wret = btrfs_cow_block(trans, root, right,
866 parent, pslot + 1, &right, 0);
867 if (wret) {
868 ret = wret;
869 goto enospc;
870 }
871 }
872
873 /* first, try to make some room in the middle buffer */
874 if (left) {
875 orig_slot += btrfs_header_nritems(left);
876 wret = push_node_left(trans, root, left, mid, 1);
877 if (wret < 0)
878 ret = wret;
879 if (btrfs_header_nritems(mid) < 2)
880 err_on_enospc = 1;
881 }
882
883 /*
884 * then try to empty the right most buffer into the middle
885 */
886 if (right) {
887 wret = push_node_left(trans, root, mid, right, 1);
888 if (wret < 0 && wret != -ENOSPC)
889 ret = wret;
890 if (btrfs_header_nritems(right) == 0) {
891 u64 bytenr = right->start;
892 u64 generation = btrfs_header_generation(parent);
893 u32 blocksize = right->len;
894
895 clean_tree_block(trans, root, right);
896 btrfs_tree_unlock(right);
897 free_extent_buffer(right);
898 right = NULL;
899 wret = del_ptr(trans, root, path, level + 1, pslot +
900 1);
901 if (wret)
902 ret = wret;
903 wret = btrfs_free_extent(trans, root, bytenr,
904 blocksize, parent->start,
905 btrfs_header_owner(parent),
906 generation, 0, 0, 1);
907 if (wret)
908 ret = wret;
909 } else {
910 struct btrfs_disk_key right_key;
911 btrfs_node_key(right, &right_key, 0);
912 btrfs_set_node_key(parent, &right_key, pslot + 1);
913 btrfs_mark_buffer_dirty(parent);
914 }
915 }
916 if (btrfs_header_nritems(mid) == 1) {
917 /*
918 * we're not allowed to leave a node with one item in the
919 * tree during a delete. A deletion from lower in the tree
920 * could try to delete the only pointer in this node.
921 * So, pull some keys from the left.
922 * There has to be a left pointer at this point because
923 * otherwise we would have pulled some pointers from the
924 * right
925 */
926 BUG_ON(!left);
927 wret = balance_node_right(trans, root, mid, left);
928 if (wret < 0) {
929 ret = wret;
930 goto enospc;
931 }
932 if (wret == 1) {
933 wret = push_node_left(trans, root, left, mid, 1);
934 if (wret < 0)
935 ret = wret;
936 }
937 BUG_ON(wret == 1);
938 }
939 if (btrfs_header_nritems(mid) == 0) {
940 /* we've managed to empty the middle node, drop it */
941 u64 root_gen = btrfs_header_generation(parent);
942 u64 bytenr = mid->start;
943 u32 blocksize = mid->len;
944
945 clean_tree_block(trans, root, mid);
946 btrfs_tree_unlock(mid);
947 free_extent_buffer(mid);
948 mid = NULL;
949 wret = del_ptr(trans, root, path, level + 1, pslot);
950 if (wret)
951 ret = wret;
952 wret = btrfs_free_extent(trans, root, bytenr, blocksize,
953 parent->start,
954 btrfs_header_owner(parent),
955 root_gen, 0, 0, 1);
956 if (wret)
957 ret = wret;
958 } else {
959 /* update the parent key to reflect our changes */
960 struct btrfs_disk_key mid_key;
961 btrfs_node_key(mid, &mid_key, 0);
962 btrfs_set_node_key(parent, &mid_key, pslot);
963 btrfs_mark_buffer_dirty(parent);
964 }
965
966 /* update the path */
967 if (left) {
968 if (btrfs_header_nritems(left) > orig_slot) {
969 extent_buffer_get(left);
970 /* left was locked after cow */
971 path->nodes[level] = left;
972 path->slots[level + 1] -= 1;
973 path->slots[level] = orig_slot;
974 if (mid) {
975 btrfs_tree_unlock(mid);
976 free_extent_buffer(mid);
977 }
978 } else {
979 orig_slot -= btrfs_header_nritems(left);
980 path->slots[level] = orig_slot;
981 }
982 }
983 /* double check we haven't messed things up */
984 check_block(root, path, level);
985 if (orig_ptr !=
986 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
987 BUG();
988enospc:
989 if (right) {
990 btrfs_tree_unlock(right);
991 free_extent_buffer(right);
992 }
993 if (left) {
994 if (path->nodes[level] != left)
995 btrfs_tree_unlock(left);
996 free_extent_buffer(left);
997 }
998 return ret;
999}
1000
1001/* returns zero if the push worked, non-zero otherwise */
1002static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
1003 struct btrfs_root *root,
1004 struct btrfs_path *path, int level)
1005{
1006 struct extent_buffer *right = NULL;
1007 struct extent_buffer *mid;
1008 struct extent_buffer *left = NULL;
1009 struct extent_buffer *parent = NULL;
1010 int ret = 0;
1011 int wret;
1012 int pslot;
1013 int orig_slot = path->slots[level];
1014 u64 orig_ptr;
1015
1016 if (level == 0)
1017 return 1;
1018
1019 mid = path->nodes[level];
1020 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1021 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1022
1023 if (level < BTRFS_MAX_LEVEL - 1)
1024 parent = path->nodes[level + 1];
1025 pslot = path->slots[level + 1];
1026
1027 if (!parent)
1028 return 1;
1029
1030 left = read_node_slot(root, parent, pslot - 1);
1031
1032 /* first, try to make some room in the middle buffer */
1033 if (left) {
1034 u32 left_nr;
1035
1036 btrfs_tree_lock(left);
1037 left_nr = btrfs_header_nritems(left);
1038 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1039 wret = 1;
1040 } else {
1041 ret = btrfs_cow_block(trans, root, left, parent,
1042 pslot - 1, &left, 0);
1043 if (ret)
1044 wret = 1;
1045 else {
1046 wret = push_node_left(trans, root,
1047 left, mid, 0);
1048 }
1049 }
1050 if (wret < 0)
1051 ret = wret;
1052 if (wret == 0) {
1053 struct btrfs_disk_key disk_key;
1054 orig_slot += left_nr;
1055 btrfs_node_key(mid, &disk_key, 0);
1056 btrfs_set_node_key(parent, &disk_key, pslot);
1057 btrfs_mark_buffer_dirty(parent);
1058 if (btrfs_header_nritems(left) > orig_slot) {
1059 path->nodes[level] = left;
1060 path->slots[level + 1] -= 1;
1061 path->slots[level] = orig_slot;
1062 btrfs_tree_unlock(mid);
1063 free_extent_buffer(mid);
1064 } else {
1065 orig_slot -=
1066 btrfs_header_nritems(left);
1067 path->slots[level] = orig_slot;
1068 btrfs_tree_unlock(left);
1069 free_extent_buffer(left);
1070 }
1071 return 0;
1072 }
1073 btrfs_tree_unlock(left);
1074 free_extent_buffer(left);
1075 }
1076 right = read_node_slot(root, parent, pslot + 1);
1077
1078 /*
1079 * then try to empty the right most buffer into the middle
1080 */
1081 if (right) {
1082 u32 right_nr;
1083 btrfs_tree_lock(right);
1084 right_nr = btrfs_header_nritems(right);
1085 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1086 wret = 1;
1087 } else {
1088 ret = btrfs_cow_block(trans, root, right,
1089 parent, pslot + 1,
1090 &right, 0);
1091 if (ret)
1092 wret = 1;
1093 else {
1094 wret = balance_node_right(trans, root,
1095 right, mid);
1096 }
1097 }
1098 if (wret < 0)
1099 ret = wret;
1100 if (wret == 0) {
1101 struct btrfs_disk_key disk_key;
1102
1103 btrfs_node_key(right, &disk_key, 0);
1104 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1105 btrfs_mark_buffer_dirty(parent);
1106
1107 if (btrfs_header_nritems(mid) <= orig_slot) {
1108 path->nodes[level] = right;
1109 path->slots[level + 1] += 1;
1110 path->slots[level] = orig_slot -
1111 btrfs_header_nritems(mid);
1112 btrfs_tree_unlock(mid);
1113 free_extent_buffer(mid);
1114 } else {
1115 btrfs_tree_unlock(right);
1116 free_extent_buffer(right);
1117 }
1118 return 0;
1119 }
1120 btrfs_tree_unlock(right);
1121 free_extent_buffer(right);
1122 }
1123 return 1;
1124}
1125
1126/*
1127 * readahead one full node of leaves
1128 */
1129static noinline void reada_for_search(struct btrfs_root *root,
1130 struct btrfs_path *path,
1131 int level, int slot, u64 objectid)
1132{
1133 struct extent_buffer *node;
1134 struct btrfs_disk_key disk_key;
1135 u32 nritems;
1136 u64 search;
1137 u64 lowest_read;
1138 u64 highest_read;
1139 u64 nread = 0;
1140 int direction = path->reada;
1141 struct extent_buffer *eb;
1142 u32 nr;
1143 u32 blocksize;
1144 u32 nscan = 0;
1145
1146 if (level != 1)
1147 return;
1148
1149 if (!path->nodes[level])
1150 return;
1151
1152 node = path->nodes[level];
1153
1154 search = btrfs_node_blockptr(node, slot);
1155 blocksize = btrfs_level_size(root, level - 1);
1156 eb = btrfs_find_tree_block(root, search, blocksize);
1157 if (eb) {
1158 free_extent_buffer(eb);
1159 return;
1160 }
1161
1162 highest_read = search;
1163 lowest_read = search;
1164
1165 nritems = btrfs_header_nritems(node);
1166 nr = slot;
 1167 while (1) {
1168 if (direction < 0) {
1169 if (nr == 0)
1170 break;
1171 nr--;
1172 } else if (direction > 0) {
1173 nr++;
1174 if (nr >= nritems)
1175 break;
1176 }
1177 if (path->reada < 0 && objectid) {
1178 btrfs_node_key(node, &disk_key, nr);
1179 if (btrfs_disk_key_objectid(&disk_key) != objectid)
1180 break;
1181 }
1182 search = btrfs_node_blockptr(node, nr);
1183 if ((search >= lowest_read && search <= highest_read) ||
1184 (search < lowest_read && lowest_read - search <= 32768) ||
1185 (search > highest_read && search - highest_read <= 32768)) {
1186 readahead_tree_block(root, search, blocksize,
1187 btrfs_node_ptr_generation(node, nr));
1188 nread += blocksize;
1189 }
1190 nscan++;
1191 if (path->reada < 2 && (nread > (256 * 1024) || nscan > 32))
1192 break;
 1193 if (nread > (1024 * 1024) || nscan > 128)
1194 break;
1195
1196 if (search < lowest_read)
1197 lowest_read = search;
1198 if (search > highest_read)
1199 highest_read = search;
1200 }
1201}
1202
1203static noinline void unlock_up(struct btrfs_path *path, int level,
1204 int lowest_unlock)
1205{
1206 int i;
1207 int skip_level = level;
1208 int no_skips = 0;
1209 struct extent_buffer *t;
1210
1211 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1212 if (!path->nodes[i])
1213 break;
1214 if (!path->locks[i])
1215 break;
1216 if (!no_skips && path->slots[i] == 0) {
1217 skip_level = i + 1;
1218 continue;
1219 }
1220 if (!no_skips && path->keep_locks) {
1221 u32 nritems;
1222 t = path->nodes[i];
1223 nritems = btrfs_header_nritems(t);
1224 if (nritems < 1 || path->slots[i] >= nritems - 1) {
1225 skip_level = i + 1;
1226 continue;
1227 }
1228 }
1229 if (skip_level < i && i >= lowest_unlock)
1230 no_skips = 1;
1231
1232 t = path->nodes[i];
1233 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1234 btrfs_tree_unlock(t);
1235 path->locks[i] = 0;
1236 }
1237 }
1238}
1239
1240/*
1241 * look for key in the tree. path is filled in with nodes along the way
1242 * if key is found, we return zero and you can find the item in the leaf
1243 * level of the path (level 0)
1244 *
1245 * If the key isn't found, the path points to the slot where it should
1246 * be inserted, and 1 is returned. If there are other errors during the
1247 * search a negative error number is returned.
1248 *
1249 * if ins_len > 0, nodes and leaves will be split as we walk down the
1250 * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
1251 * possible)
1252 */
1253int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1254 *root, struct btrfs_key *key, struct btrfs_path *p, int
1255 ins_len, int cow)
1256{
1257 struct extent_buffer *b;
1258 struct extent_buffer *tmp;
1259 int slot;
1260 int ret;
1261 int level;
1262 int should_reada = p->reada;
1263 int lowest_unlock = 1;
1264 int blocksize;
1265 u8 lowest_level = 0;
1266 u64 blocknr;
1267 u64 gen;
1268 struct btrfs_key prealloc_block;
1269
1270 lowest_level = p->lowest_level;
1271 WARN_ON(lowest_level && ins_len);
1272 WARN_ON(p->nodes[0] != NULL);
1273 WARN_ON(cow && root == root->fs_info->extent_root &&
1274 !mutex_is_locked(&root->fs_info->alloc_mutex));
1275 if (ins_len < 0)
1276 lowest_unlock = 2;
1277
1278 prealloc_block.objectid = 0;
1279
1280again:
1281 if (p->skip_locking)
1282 b = btrfs_root_node(root);
1283 else
1284 b = btrfs_lock_root_node(root);
1285
1286 while (b) {
1287 level = btrfs_header_level(b);
1288
1289 /*
1290 * setup the path here so we can release it under lock
1291 * contention with the cow code
1292 */
1293 p->nodes[level] = b;
1294 if (!p->skip_locking)
1295 p->locks[level] = 1;
1296
1297 if (cow) {
1298 int wret;
1299
1300 /* is a cow on this block not required */
1301 spin_lock(&root->fs_info->hash_lock);
1302 if (btrfs_header_generation(b) == trans->transid &&
1303 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1304 spin_unlock(&root->fs_info->hash_lock);
1305 goto cow_done;
1306 }
1307 spin_unlock(&root->fs_info->hash_lock);
1308
1309 /* ok, we have to cow, is our old prealloc the right
1310 * size?
1311 */
1312 if (prealloc_block.objectid &&
1313 prealloc_block.offset != b->len) {
1314 btrfs_free_reserved_extent(root,
1315 prealloc_block.objectid,
1316 prealloc_block.offset);
1317 prealloc_block.objectid = 0;
1318 }
1319
1320 /*
1321 * for higher level blocks, try not to allocate blocks
1322 * with the block and the parent locks held.
1323 */
1324 if (level > 1 && !prealloc_block.objectid &&
1325 btrfs_path_lock_waiting(p, level)) {
1326 u32 size = b->len;
1327 u64 hint = b->start;
1328
1329 btrfs_release_path(root, p);
1330 ret = btrfs_reserve_extent(trans, root,
1331 size, size, 0,
1332 hint, (u64)-1,
1333 &prealloc_block, 0);
1334 BUG_ON(ret);
1335 goto again;
1336 }
1337
1338 wret = btrfs_cow_block(trans, root, b,
1339 p->nodes[level + 1],
1340 p->slots[level + 1],
1341 &b, prealloc_block.objectid);
1342 prealloc_block.objectid = 0;
1343 if (wret) {
1344 free_extent_buffer(b);
1345 ret = wret;
1346 goto done;
1347 }
1348 }
1349cow_done:
1350 BUG_ON(!cow && ins_len);
1351 if (level != btrfs_header_level(b))
1352 WARN_ON(1);
1353 level = btrfs_header_level(b);
1354
1355 p->nodes[level] = b;
1356 if (!p->skip_locking)
1357 p->locks[level] = 1;
1358
1359 ret = check_block(root, p, level);
1360 if (ret) {
1361 ret = -1;
1362 goto done;
1363 }
1364
1365 ret = bin_search(b, key, level, &slot);
1366 if (level != 0) {
1367 if (ret && slot > 0)
1368 slot -= 1;
1369 p->slots[level] = slot;
1370 if (ins_len > 0 && btrfs_header_nritems(b) >=
1371 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1372 int sret = split_node(trans, root, p, level);
1373 BUG_ON(sret > 0);
1374 if (sret) {
1375 ret = sret;
1376 goto done;
1377 }
1378 b = p->nodes[level];
1379 slot = p->slots[level];
1380 } else if (ins_len < 0) {
1381 int sret = balance_level(trans, root, p,
1382 level);
1383 if (sret) {
1384 ret = sret;
1385 goto done;
1386 }
1387 b = p->nodes[level];
1388 if (!b) {
1389 btrfs_release_path(NULL, p);
1390 goto again;
1391 }
1392 slot = p->slots[level];
1393 BUG_ON(btrfs_header_nritems(b) == 1);
1394 }
1395 unlock_up(p, level, lowest_unlock);
1396
1397 /* this is only true while dropping a snapshot */
1398 if (level == lowest_level) {
1399 break;
1400 }
1401
1402 blocknr = btrfs_node_blockptr(b, slot);
1403 gen = btrfs_node_ptr_generation(b, slot);
1404 blocksize = btrfs_level_size(root, level - 1);
1405
1406 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1407 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1408 b = tmp;
1409 } else {
1410 /*
1411 * reduce lock contention at high levels
1412 * of the btree by dropping locks before
1413 * we read.
1414 */
1415 if (level > 1) {
1416 btrfs_release_path(NULL, p);
1417 if (tmp)
1418 free_extent_buffer(tmp);
1419 if (should_reada)
1420 reada_for_search(root, p,
1421 level, slot,
1422 key->objectid);
1423
1424 tmp = read_tree_block(root, blocknr,
1425 blocksize, gen);
1426 if (tmp)
1427 free_extent_buffer(tmp);
1428 goto again;
1429 } else {
1430 if (tmp)
1431 free_extent_buffer(tmp);
1432 if (should_reada)
1433 reada_for_search(root, p,
1434 level, slot,
1435 key->objectid);
1436 b = read_node_slot(root, b, slot);
1437 }
1438 }
1439 if (!p->skip_locking)
1440 btrfs_tree_lock(b);
1441 } else {
1442 p->slots[level] = slot;
1443 if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
1444 sizeof(struct btrfs_item) + ins_len) {
1445 int sret = split_leaf(trans, root, key,
1446 p, ins_len, ret == 0);
1447 BUG_ON(sret > 0);
1448 if (sret) {
1449 ret = sret;
1450 goto done;
1451 }
1452 }
1453 unlock_up(p, level, lowest_unlock);
1454 goto done;
1455 }
1456 }
1457 ret = 1;
1458done:
1459 if (prealloc_block.objectid) {
1460 btrfs_free_reserved_extent(root,
1461 prealloc_block.objectid,
1462 prealloc_block.offset);
1463 }
1464
1465 return ret;
1466}
1467
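
Editor's note: a minimal read-only lookup sketch using the calling convention documented above (trans == NULL, ins_len == 0, cow == 0). lookup_example() is illustrative; btrfs_alloc_path(), btrfs_search_slot() and btrfs_free_path() are the functions defined in this file.

static int lookup_example(struct btrfs_root *root, struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		/* found: item is at path->nodes[0], slot path->slots[0] */
	} else if (ret == 1) {
		/* not found: the slot is where the key would be inserted */
	}
	/* ret < 0 is an error from the search itself */

	btrfs_free_path(path);
	return ret;
}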
1468/*
1469 * adjust the pointers going up the tree, starting at level
 1470 * making sure the right key of each node points to 'key'.
1471 * This is used after shifting pointers to the left, so it stops
1472 * fixing up pointers when a given leaf/node is not in slot 0 of the
1473 * higher levels
1474 *
1475 * If this fails to write a tree block, it returns -1, but continues
1476 * fixing up the blocks in ram so the tree is consistent.
1477 */
1478static int fixup_low_keys(struct btrfs_trans_handle *trans,
1479 struct btrfs_root *root, struct btrfs_path *path,
1480 struct btrfs_disk_key *key, int level)
1481{
1482 int i;
1483 int ret = 0;
1484 struct extent_buffer *t;
1485
1486 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1487 int tslot = path->slots[i];
1488 if (!path->nodes[i])
1489 break;
1490 t = path->nodes[i];
1491 btrfs_set_node_key(t, key, tslot);
1492 btrfs_mark_buffer_dirty(path->nodes[i]);
1493 if (tslot != 0)
1494 break;
1495 }
1496 return ret;
1497}
1498
1499/*
1500 * update item key.
1501 *
1502 * This function isn't completely safe. It's the caller's responsibility
1503 * that the new key won't break the order
1504 */
1505int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1506 struct btrfs_root *root, struct btrfs_path *path,
1507 struct btrfs_key *new_key)
1508{
1509 struct btrfs_disk_key disk_key;
1510 struct extent_buffer *eb;
1511 int slot;
1512
1513 eb = path->nodes[0];
1514 slot = path->slots[0];
1515 if (slot > 0) {
1516 btrfs_item_key(eb, &disk_key, slot - 1);
1517 if (comp_keys(&disk_key, new_key) >= 0)
1518 return -1;
1519 }
1520 if (slot < btrfs_header_nritems(eb) - 1) {
1521 btrfs_item_key(eb, &disk_key, slot + 1);
1522 if (comp_keys(&disk_key, new_key) <= 0)
1523 return -1;
1524 }
1525
1526 btrfs_cpu_key_to_disk(&disk_key, new_key);
1527 btrfs_set_item_key(eb, &disk_key, slot);
1528 btrfs_mark_buffer_dirty(eb);
1529 if (slot == 0)
1530 fixup_low_keys(trans, root, path, &disk_key, 1);
1531 return 0;
1532}
1533
1534/*
1535 * try to push data from one node into the next node left in the
1536 * tree.
1537 *
1538 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1539 * error, and > 0 if there was no room in the left hand block.
1540 */
1541static int push_node_left(struct btrfs_trans_handle *trans,
1542 struct btrfs_root *root, struct extent_buffer *dst,
1543 struct extent_buffer *src, int empty)
1544{
1545 int push_items = 0;
1546 int src_nritems;
1547 int dst_nritems;
1548 int ret = 0;
1549
1550 src_nritems = btrfs_header_nritems(src);
1551 dst_nritems = btrfs_header_nritems(dst);
1552 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1553 WARN_ON(btrfs_header_generation(src) != trans->transid);
1554 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1555
1556 if (!empty && src_nritems <= 8)
1557 return 1;
1558
1559 if (push_items <= 0) {
1560 return 1;
1561 }
1562
1563 if (empty) {
1564 push_items = min(src_nritems, push_items);
1565 if (push_items < src_nritems) {
1566 /* leave at least 8 pointers in the node if
1567 * we aren't going to empty it
1568 */
1569 if (src_nritems - push_items < 8) {
1570 if (push_items <= 8)
1571 return 1;
1572 push_items -= 8;
1573 }
1574 }
1575 } else
1576 push_items = min(src_nritems - 8, push_items);
1577
1578 copy_extent_buffer(dst, src,
1579 btrfs_node_key_ptr_offset(dst_nritems),
1580 btrfs_node_key_ptr_offset(0),
1581 push_items * sizeof(struct btrfs_key_ptr));
1582
1583 if (push_items < src_nritems) {
1584 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
1585 btrfs_node_key_ptr_offset(push_items),
1586 (src_nritems - push_items) *
1587 sizeof(struct btrfs_key_ptr));
1588 }
1589 btrfs_set_header_nritems(src, src_nritems - push_items);
1590 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1591 btrfs_mark_buffer_dirty(src);
1592 btrfs_mark_buffer_dirty(dst);
1593
1594 ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
1595 BUG_ON(ret);
1596
1597 return ret;
1598}
1599
1600/*
1601 * try to push data from one node into the next node right in the
1602 * tree.
1603 *
1604 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
1605 * error, and > 0 if there was no room in the right hand block.
1606 *
1607 * this will only push up to 1/2 the contents of the left node over
1608 */
1609static int balance_node_right(struct btrfs_trans_handle *trans,
1610 struct btrfs_root *root,
1611 struct extent_buffer *dst,
1612 struct extent_buffer *src)
1613{
1614 int push_items = 0;
1615 int max_push;
1616 int src_nritems;
1617 int dst_nritems;
1618 int ret = 0;
1619
1620 WARN_ON(btrfs_header_generation(src) != trans->transid);
1621 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1622
1623 src_nritems = btrfs_header_nritems(src);
1624 dst_nritems = btrfs_header_nritems(dst);
1625 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1626 if (push_items <= 0) {
1627 return 1;
1628 }
1629
1630 if (src_nritems < 4) {
1631 return 1;
1632 }
1633
1634 max_push = src_nritems / 2 + 1;
1635 /* don't try to empty the node */
1636 if (max_push >= src_nritems) {
1637 return 1;
1638 }
1639
1640 if (max_push < push_items)
1641 push_items = max_push;
1642
1643 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
1644 btrfs_node_key_ptr_offset(0),
1645 (dst_nritems) *
1646 sizeof(struct btrfs_key_ptr));
1647
1648 copy_extent_buffer(dst, src,
1649 btrfs_node_key_ptr_offset(0),
1650 btrfs_node_key_ptr_offset(src_nritems - push_items),
1651 push_items * sizeof(struct btrfs_key_ptr));
1652
1653 btrfs_set_header_nritems(src, src_nritems - push_items);
1654 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1655
1656 btrfs_mark_buffer_dirty(src);
1657 btrfs_mark_buffer_dirty(dst);
1658
1659 ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
1660 BUG_ON(ret);
1661
1662 return ret;
1663}
1664
1665/*
1666 * helper function to insert a new root level in the tree.
1667 * A new node is allocated, and a single item is inserted to
1668 * point to the existing root
1669 *
1670 * returns zero on success or < 0 on failure.
1671 */
1672static int noinline insert_new_root(struct btrfs_trans_handle *trans,
1673 struct btrfs_root *root,
1674 struct btrfs_path *path, int level)
1675{
1676 u64 lower_gen;
1677 struct extent_buffer *lower;
1678 struct extent_buffer *c;
1679 struct extent_buffer *old;
1680 struct btrfs_disk_key lower_key;
1681 int ret;
1682
1683 BUG_ON(path->nodes[level]);
1684 BUG_ON(path->nodes[level-1] != root->node);
1685
1686 lower = path->nodes[level-1];
1687 if (level == 1)
1688 btrfs_item_key(lower, &lower_key, 0);
1689 else
1690 btrfs_node_key(lower, &lower_key, 0);
1691
1692 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
1693 root->root_key.objectid, trans->transid,
1694 level, root->node->start, 0);
1695 if (IS_ERR(c))
1696 return PTR_ERR(c);
1697
1698 memset_extent_buffer(c, 0, 0, root->nodesize);
1699 btrfs_set_header_nritems(c, 1);
1700 btrfs_set_header_level(c, level);
1701 btrfs_set_header_bytenr(c, c->start);
1702 btrfs_set_header_generation(c, trans->transid);
1703 btrfs_set_header_owner(c, root->root_key.objectid);
1704
1705 write_extent_buffer(c, root->fs_info->fsid,
1706 (unsigned long)btrfs_header_fsid(c),
1707 BTRFS_FSID_SIZE);
1708
1709 write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
1710 (unsigned long)btrfs_header_chunk_tree_uuid(c),
1711 BTRFS_UUID_SIZE);
1712
1713 btrfs_set_node_key(c, &lower_key, 0);
1714 btrfs_set_node_blockptr(c, 0, lower->start);
1715 lower_gen = btrfs_header_generation(lower);
1716 WARN_ON(lower_gen != trans->transid);
1717
1718 btrfs_set_node_ptr_generation(c, 0, lower_gen);
1719
1720 btrfs_mark_buffer_dirty(c);
1721
1722 spin_lock(&root->node_lock);
1723 old = root->node;
1724 root->node = c;
1725 spin_unlock(&root->node_lock);
1726
1727 ret = btrfs_update_extent_ref(trans, root, lower->start,
1728 lower->start, c->start,
1729 root->root_key.objectid,
1730 trans->transid, level - 1, 0);
1731 BUG_ON(ret);
1732
1733 /* the super has an extra ref to root->node */
1734 free_extent_buffer(old);
1735
1736 add_root_to_dirty_list(root);
1737 extent_buffer_get(c);
1738 path->nodes[level] = c;
1739 path->locks[level] = 1;
1740 path->slots[level] = 0;
1741 return 0;
1742}
1743
1744/*
1745 * worker function to insert a single pointer in a node.
1746 * the node should have enough room for the pointer already
1747 *
1748 * slot and level indicate where you want the key to go, and
1749 * blocknr is the block the key points to.
1750 *
1751 * returns zero on success and < 0 on any error
1752 */
1753static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
1754 *root, struct btrfs_path *path, struct btrfs_disk_key
1755 *key, u64 bytenr, int slot, int level)
1756{
1757 struct extent_buffer *lower;
1758 int nritems;
1759
1760 BUG_ON(!path->nodes[level]);
1761 lower = path->nodes[level];
1762 nritems = btrfs_header_nritems(lower);
1763 if (slot > nritems)
1764 BUG();
1765 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
1766 BUG();
1767 if (slot != nritems) {
1768 memmove_extent_buffer(lower,
1769 btrfs_node_key_ptr_offset(slot + 1),
1770 btrfs_node_key_ptr_offset(slot),
1771 (nritems - slot) * sizeof(struct btrfs_key_ptr));
1772 }
1773 btrfs_set_node_key(lower, key, slot);
1774 btrfs_set_node_blockptr(lower, slot, bytenr);
1775 WARN_ON(trans->transid == 0);
1776 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
1777 btrfs_set_header_nritems(lower, nritems + 1);
1778 btrfs_mark_buffer_dirty(lower);
1779 return 0;
1780}
1781
1782/*
1783 * split the node at the specified level in path in two.
1784 * The path is corrected to point to the appropriate node after the split
1785 *
1786 * Before splitting this tries to make some room in the node by pushing
1787 * left and right, if either one works, it returns right away.
1788 *
1789 * returns 0 on success and < 0 on failure
1790 */
1791static noinline int split_node(struct btrfs_trans_handle *trans,
1792 struct btrfs_root *root,
1793 struct btrfs_path *path, int level)
1794{
1795 struct extent_buffer *c;
1796 struct extent_buffer *split;
1797 struct btrfs_disk_key disk_key;
1798 int mid;
1799 int ret;
1800 int wret;
1801 u32 c_nritems;
1802
1803 c = path->nodes[level];
1804 WARN_ON(btrfs_header_generation(c) != trans->transid);
1805 if (c == root->node) {
1806 /* trying to split the root, lets make a new one */
1807 ret = insert_new_root(trans, root, path, level + 1);
1808 if (ret)
1809 return ret;
1810 } else {
1811 ret = push_nodes_for_insert(trans, root, path, level);
1812 c = path->nodes[level];
1813 if (!ret && btrfs_header_nritems(c) <
1814 BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
1815 return 0;
1816 if (ret < 0)
1817 return ret;
1818 }
1819
1820 c_nritems = btrfs_header_nritems(c);
1821
1822 split = btrfs_alloc_free_block(trans, root, root->nodesize,
1823 path->nodes[level + 1]->start,
1824 root->root_key.objectid,
1825 trans->transid, level, c->start, 0);
1826 if (IS_ERR(split))
1827 return PTR_ERR(split);
1828
1829 btrfs_set_header_flags(split, btrfs_header_flags(c));
1830 btrfs_set_header_level(split, btrfs_header_level(c));
1831 btrfs_set_header_bytenr(split, split->start);
1832 btrfs_set_header_generation(split, trans->transid);
1833 btrfs_set_header_owner(split, root->root_key.objectid);
1834 btrfs_set_header_flags(split, 0);
1835 write_extent_buffer(split, root->fs_info->fsid,
1836 (unsigned long)btrfs_header_fsid(split),
1837 BTRFS_FSID_SIZE);
1838 write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
1839 (unsigned long)btrfs_header_chunk_tree_uuid(split),
1840 BTRFS_UUID_SIZE);
1841
1842 mid = (c_nritems + 1) / 2;
1843
1844 copy_extent_buffer(split, c,
1845 btrfs_node_key_ptr_offset(0),
1846 btrfs_node_key_ptr_offset(mid),
1847 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
1848 btrfs_set_header_nritems(split, c_nritems - mid);
1849 btrfs_set_header_nritems(c, mid);
1850 ret = 0;
1851
1852 btrfs_mark_buffer_dirty(c);
1853 btrfs_mark_buffer_dirty(split);
1854
1855 btrfs_node_key(split, &disk_key, 0);
1856 wret = insert_ptr(trans, root, path, &disk_key, split->start,
1857 path->slots[level + 1] + 1,
1858 level + 1);
1859 if (wret)
1860 ret = wret;
1861
1862 ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
1863 BUG_ON(ret);
1864
1865 if (path->slots[level] >= mid) {
1866 path->slots[level] -= mid;
1867 btrfs_tree_unlock(c);
1868 free_extent_buffer(c);
1869 path->nodes[level] = split;
1870 path->slots[level + 1] += 1;
1871 } else {
1872 btrfs_tree_unlock(split);
1873 free_extent_buffer(split);
1874 }
1875 return ret;
1876}
1877
1878/*
1879 * how many bytes are required to store the items in a leaf. start
1880 * and nr indicate which items in the leaf to check. This totals up the
1881 * space used both by the item structs and the item data
1882 */
1883static int leaf_space_used(struct extent_buffer *l, int start, int nr)
1884{
1885 int data_len;
1886 int nritems = btrfs_header_nritems(l);
1887 int end = min(nritems, start + nr) - 1;
1888
1889 if (!nr)
1890 return 0;
1891 data_len = btrfs_item_end_nr(l, start);
1892 data_len = data_len - btrfs_item_offset_nr(l, end);
1893 data_len += sizeof(struct btrfs_item) * nr;
1894 WARN_ON(data_len < 0);
1895 return data_len;
1896}
1897
1898/*
1899 * The space between the end of the leaf items and
1900 * the start of the leaf data. IOW, how much room
1901 * the leaf has left for both items and data
1902 */
1903int noinline btrfs_leaf_free_space(struct btrfs_root *root,
1904 struct extent_buffer *leaf)
1905{
1906 int nritems = btrfs_header_nritems(leaf);
1907 int ret;
1908 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
1909 if (ret < 0) {
1910 printk("leaf free space ret %d, leaf data size %lu, used %d nritems %d\n",
1911 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
1912 leaf_space_used(leaf, 0, nritems), nritems);
1913 }
1914 return ret;
1915}
1916
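
Editor's note: a worked example for the space accounting above, with made-up sizes. If BTRFS_LEAF_DATA_SIZE(root) is 4096 and the leaf holds three items whose data is 100, 50 and 20 bytes, then leaf_space_used(leaf, 0, 3) is (100 + 50 + 20) + 3 * sizeof(struct btrfs_item), and btrfs_leaf_free_space() returns 4096 minus that total: the room left for both new item headers and their data.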
1917/*
1918 * push some data in the path leaf to the right, trying to free up at
1919 * least data_size bytes. returns zero if the push worked, nonzero otherwise
1920 *
1921 * returns 1 if the push failed because the other node didn't have enough
1922 * room, 0 if everything worked out and < 0 if there were major errors.
1923 */
1924static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
1925 *root, struct btrfs_path *path, int data_size,
1926 int empty)
1927{
1928 struct extent_buffer *left = path->nodes[0];
1929 struct extent_buffer *right;
1930 struct extent_buffer *upper;
1931 struct btrfs_disk_key disk_key;
1932 int slot;
1933 u32 i;
1934 int free_space;
1935 int push_space = 0;
1936 int push_items = 0;
1937 struct btrfs_item *item;
1938 u32 left_nritems;
1939 u32 nr;
1940 u32 right_nritems;
1941 u32 data_end;
1942 u32 this_item_size;
1943 int ret;
1944
1945 slot = path->slots[1];
1946 if (!path->nodes[1]) {
1947 return 1;
1948 }
1949 upper = path->nodes[1];
1950 if (slot >= btrfs_header_nritems(upper) - 1)
1951 return 1;
1952
1953 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
1954
1955 right = read_node_slot(root, upper, slot + 1);
1956 btrfs_tree_lock(right);
1957 free_space = btrfs_leaf_free_space(root, right);
1958 if (free_space < data_size + sizeof(struct btrfs_item))
1959 goto out_unlock;
1960
1961 /* cow and double check */
1962 ret = btrfs_cow_block(trans, root, right, upper,
1963 slot + 1, &right, 0);
1964 if (ret)
1965 goto out_unlock;
1966
1967 free_space = btrfs_leaf_free_space(root, right);
1968 if (free_space < data_size + sizeof(struct btrfs_item))
1969 goto out_unlock;
1970
1971 left_nritems = btrfs_header_nritems(left);
1972 if (left_nritems == 0)
1973 goto out_unlock;
1974
1975 if (empty)
1976 nr = 0;
1977 else
1978 nr = 1;
1979
1980 if (path->slots[0] >= left_nritems)
1981 push_space += data_size + sizeof(*item);
1982
1983 i = left_nritems - 1;
1984 while (i >= nr) {
1985 item = btrfs_item_nr(left, i);
1986
1987 if (!empty && push_items > 0) {
1988 if (path->slots[0] > i)
1989 break;
1990 if (path->slots[0] == i) {
1991 int space = btrfs_leaf_free_space(root, left);
1992 if (space + push_space * 2 > free_space)
1993 break;
1994 }
1995 }
1996
1997 if (path->slots[0] == i)
1998 push_space += data_size + sizeof(*item);
1999
2000 if (!left->map_token) {
2001 map_extent_buffer(left, (unsigned long)item,
2002 sizeof(struct btrfs_item),
2003 &left->map_token, &left->kaddr,
2004 &left->map_start, &left->map_len,
2005 KM_USER1);
2006 }
2007
2008 this_item_size = btrfs_item_size(left, item);
2009 if (this_item_size + sizeof(*item) + push_space > free_space)
2010 break;
2011
2012 push_items++;
2013 push_space += this_item_size + sizeof(*item);
2014 if (i == 0)
2015 break;
2016 i--;
2017 }
2018 if (left->map_token) {
2019 unmap_extent_buffer(left, left->map_token, KM_USER1);
2020 left->map_token = NULL;
2021 }
2022
2023 if (push_items == 0)
2024 goto out_unlock;
2025
2026 if (!empty && push_items == left_nritems)
2027 WARN_ON(1);
2028
2029 /* push left to right */
2030 right_nritems = btrfs_header_nritems(right);
2031
2032 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2033 push_space -= leaf_data_end(root, left);
2034
2035 /* make room in the right data area */
2036 data_end = leaf_data_end(root, right);
2037 memmove_extent_buffer(right,
2038 btrfs_leaf_data(right) + data_end - push_space,
2039 btrfs_leaf_data(right) + data_end,
2040 BTRFS_LEAF_DATA_SIZE(root) - data_end);
2041
2042 /* copy from the left data area */
2043 copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2044 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2045 btrfs_leaf_data(left) + leaf_data_end(root, left),
2046 push_space);
2047
2048 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2049 btrfs_item_nr_offset(0),
2050 right_nritems * sizeof(struct btrfs_item));
2051
2052 /* copy the items from left to right */
2053 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2054 btrfs_item_nr_offset(left_nritems - push_items),
2055 push_items * sizeof(struct btrfs_item));
2056
2057 /* update the item pointers */
2058 right_nritems += push_items;
2059 btrfs_set_header_nritems(right, right_nritems);
2060 push_space = BTRFS_LEAF_DATA_SIZE(root);
2061 for (i = 0; i < right_nritems; i++) {
2062 item = btrfs_item_nr(right, i);
2063 if (!right->map_token) {
2064 map_extent_buffer(right, (unsigned long)item,
2065 sizeof(struct btrfs_item),
2066 &right->map_token, &right->kaddr,
2067 &right->map_start, &right->map_len,
2068 KM_USER1);
2069 }
2070 push_space -= btrfs_item_size(right, item);
2071 btrfs_set_item_offset(right, item, push_space);
2072 }
2073
2074 if (right->map_token) {
2075 unmap_extent_buffer(right, right->map_token, KM_USER1);
2076 right->map_token = NULL;
2077 }
2078 left_nritems -= push_items;
2079 btrfs_set_header_nritems(left, left_nritems);
2080
2081 if (left_nritems)
2082 btrfs_mark_buffer_dirty(left);
2083 btrfs_mark_buffer_dirty(right);
2084
2085 ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2086 BUG_ON(ret);
2087
2088 btrfs_item_key(right, &disk_key, 0);
2089 btrfs_set_node_key(upper, &disk_key, slot + 1);
2090 btrfs_mark_buffer_dirty(upper);
2091
2092 /* then fixup the leaf pointer in the path */
2093 if (path->slots[0] >= left_nritems) {
2094 path->slots[0] -= left_nritems;
2095 if (btrfs_header_nritems(path->nodes[0]) == 0)
2096 clean_tree_block(trans, root, path->nodes[0]);
2097 btrfs_tree_unlock(path->nodes[0]);
2098 free_extent_buffer(path->nodes[0]);
2099 path->nodes[0] = right;
2100 path->slots[1] += 1;
2101 } else {
2102 btrfs_tree_unlock(right);
2103 free_extent_buffer(right);
2104 }
2105 return 0;
2106
2107out_unlock:
2108 btrfs_tree_unlock(right);
2109 free_extent_buffer(right);
2110 return 1;
2111}
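
/*
 * The calling pattern, as a simplified sketch of what split_leaf and
 * btrfs_del_items below actually do: try the right neighbor first,
 * then the left, and only split (or merge) if neither push helps.
 *
 *	wret = push_leaf_right(trans, root, path, data_size, 0);
 *	if (wret == 1)
 *		wret = push_leaf_left(trans, root, path, data_size, 0);
 *	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
 *		return 0;	// the pushes freed enough room
 */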
2112
2113/*
2114 * push some data in the path leaf to the left, trying to free up at
2115 * least data_size bytes. returns 1 if the push failed, 0 if it worked and < 0 on error.
2116 */
2117static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2118 *root, struct btrfs_path *path, int data_size,
2119 int empty)
2120{
2121 struct btrfs_disk_key disk_key;
2122 struct extent_buffer *right = path->nodes[0];
2123 struct extent_buffer *left;
2124 int slot;
2125 int i;
2126 int free_space;
2127 int push_space = 0;
2128 int push_items = 0;
2129 struct btrfs_item *item;
2130 u32 old_left_nritems;
2131 u32 right_nritems;
2132 u32 nr;
2133 int ret = 0;
2134 int wret;
2135 u32 this_item_size;
2136 u32 old_left_item_size;
2137
2138 slot = path->slots[1];
2139 if (slot == 0)
2140 return 1;
2141 if (!path->nodes[1])
2142 return 1;
2143
2144 right_nritems = btrfs_header_nritems(right);
2145 if (right_nritems == 0) {
2146 return 1;
2147 }
2148
2149 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2150
2151 left = read_node_slot(root, path->nodes[1], slot - 1);
2152 btrfs_tree_lock(left);
2153 free_space = btrfs_leaf_free_space(root, left);
2154 if (free_space < data_size + sizeof(struct btrfs_item)) {
2155 ret = 1;
2156 goto out;
2157 }
2158
2159 /* cow and double check */
2160 ret = btrfs_cow_block(trans, root, left,
2161 path->nodes[1], slot - 1, &left, 0);
2162 if (ret) {
2163 /* we hit -ENOSPC, but it isn't fatal here */
2164 ret = 1;
2165 goto out;
2166 }
2167
2168 free_space = btrfs_leaf_free_space(root, left);
2169 if (free_space < data_size + sizeof(struct btrfs_item)) {
2170 ret = 1;
2171 goto out;
2172 }
2173
2174 if (empty)
2175 nr = right_nritems;
2176 else
2177 nr = right_nritems - 1;
2178
2179 for (i = 0; i < nr; i++) {
2180 item = btrfs_item_nr(right, i);
2181 if (!right->map_token) {
2182 map_extent_buffer(right, (unsigned long)item,
2183 sizeof(struct btrfs_item),
2184 &right->map_token, &right->kaddr,
2185 &right->map_start, &right->map_len,
2186 KM_USER1);
2187 }
2188
2189 if (!empty && push_items > 0) {
2190 if (path->slots[0] < i)
2191 break;
2192 if (path->slots[0] == i) {
2193 int space = btrfs_leaf_free_space(root, right);
2194 if (space + push_space * 2 > free_space)
2195 break;
2196 }
2197 }
2198
2199 if (path->slots[0] == i)
2200 push_space += data_size + sizeof(*item);
2201
2202 this_item_size = btrfs_item_size(right, item);
2203 if (this_item_size + sizeof(*item) + push_space > free_space)
2204 break;
2205
2206 push_items++;
2207 push_space += this_item_size + sizeof(*item);
2208 }
2209
2210 if (right->map_token) {
2211 unmap_extent_buffer(right, right->map_token, KM_USER1);
2212 right->map_token = NULL;
2213 }
2214
2215 if (push_items == 0) {
2216 ret = 1;
2217 goto out;
2218 }
2219 if (!empty && push_items == btrfs_header_nritems(right))
2220 WARN_ON(1);
2221
2222 /* push data from right to left */
2223 copy_extent_buffer(left, right,
2224 btrfs_item_nr_offset(btrfs_header_nritems(left)),
2225 btrfs_item_nr_offset(0),
2226 push_items * sizeof(struct btrfs_item));
2227
2228 push_space = BTRFS_LEAF_DATA_SIZE(root) -
2229 btrfs_item_offset_nr(right, push_items - 1);
2230
2231 copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2232 leaf_data_end(root, left) - push_space,
2233 btrfs_leaf_data(right) +
2234 btrfs_item_offset_nr(right, push_items - 1),
2235 push_space);
2236 old_left_nritems = btrfs_header_nritems(left);
2237 BUG_ON(old_left_nritems < 0);
2238
2239 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2240 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2241 u32 ioff;
2242
2243 item = btrfs_item_nr(left, i);
2244 if (!left->map_token) {
2245 map_extent_buffer(left, (unsigned long)item,
2246 sizeof(struct btrfs_item),
2247 &left->map_token, &left->kaddr,
2248 &left->map_start, &left->map_len,
2249 KM_USER1);
2250 }
2251
2252 ioff = btrfs_item_offset(left, item);
2253 btrfs_set_item_offset(left, item,
2254 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2255 }
2256 btrfs_set_header_nritems(left, old_left_nritems + push_items);
2257 if (left->map_token) {
2258 unmap_extent_buffer(left, left->map_token, KM_USER1);
2259 left->map_token = NULL;
2260 }
2261
2262 /* fixup right node */
2263 if (push_items > right_nritems) {
2264 printk("push items %d nr %u\n", push_items, right_nritems);
2265 WARN_ON(1);
2266 }
2267
2268 if (push_items < right_nritems) {
2269 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2270 leaf_data_end(root, right);
2271 memmove_extent_buffer(right, btrfs_leaf_data(right) +
2272 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2273 btrfs_leaf_data(right) +
2274 leaf_data_end(root, right), push_space);
2275
2276 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2277 btrfs_item_nr_offset(push_items),
2278 (btrfs_header_nritems(right) - push_items) *
2279 sizeof(struct btrfs_item));
2280 }
2281 right_nritems -= push_items;
2282 btrfs_set_header_nritems(right, right_nritems);
2283 push_space = BTRFS_LEAF_DATA_SIZE(root);
2284 for (i = 0; i < right_nritems; i++) {
2285 item = btrfs_item_nr(right, i);
2286
2287 if (!right->map_token) {
2288 map_extent_buffer(right, (unsigned long)item,
2289 sizeof(struct btrfs_item),
2290 &right->map_token, &right->kaddr,
2291 &right->map_start, &right->map_len,
2292 KM_USER1);
2293 }
2294
2295 push_space = push_space - btrfs_item_size(right, item);
2296 btrfs_set_item_offset(right, item, push_space);
2297 }
2298 if (right->map_token) {
2299 unmap_extent_buffer(right, right->map_token, KM_USER1);
2300 right->map_token = NULL;
2301 }
2302
2303 btrfs_mark_buffer_dirty(left);
2304 if (right_nritems)
2305 btrfs_mark_buffer_dirty(right);
2306
2307 ret = btrfs_update_ref(trans, root, right, left,
2308 old_left_nritems, push_items);
2309 BUG_ON(ret);
2310
2311 btrfs_item_key(right, &disk_key, 0);
2312 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2313 if (wret)
2314 ret = wret;
2315
2316 /* then fixup the leaf pointer in the path */
2317 if (path->slots[0] < push_items) {
2318 path->slots[0] += old_left_nritems;
2319 if (btrfs_header_nritems(path->nodes[0]) == 0)
2320 clean_tree_block(trans, root, path->nodes[0]);
2321 btrfs_tree_unlock(path->nodes[0]);
2322 free_extent_buffer(path->nodes[0]);
2323 path->nodes[0] = left;
2324 path->slots[1] -= 1;
2325 } else {
2326 btrfs_tree_unlock(left);
2327 free_extent_buffer(left);
2328 path->slots[0] -= push_items;
2329 }
2330 BUG_ON(path->slots[0] < 0);
2331 return ret;
2332out:
2333 btrfs_tree_unlock(left);
2334 free_extent_buffer(left);
2335 return ret;
2336}
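
/*
 * Worked example of the offset fixup above (illustrative numbers):
 * item data offsets are relative to each leaf's own data area, so data
 * copied in from the right leaf must be rebased.  If the left leaf's
 * data area is 4000 bytes and its last old item started at offset 3000,
 * an incoming item that sat at offset 3900 in the right leaf lands at
 * 3900 - (4000 - 3000) = 2900, directly below the existing data.
 */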
2337
2338/*
2339 * split the path's leaf in two, making sure there is at least data_size
2340 * bytes available for the resulting leaf level of the path.
2341 *
2342 * returns 0 if all went well and < 0 on failure.
2343 */
2344static noinline int split_leaf(struct btrfs_trans_handle *trans,
2345 struct btrfs_root *root,
2346 struct btrfs_key *ins_key,
2347 struct btrfs_path *path, int data_size,
2348 int extend)
2349{
2350 struct extent_buffer *l;
2351 u32 nritems;
2352 int mid;
2353 int slot;
2354 struct extent_buffer *right;
2355 int space_needed = data_size + sizeof(struct btrfs_item);
2356 int data_copy_size;
2357 int rt_data_off;
2358 int i;
2359 int ret = 0;
2360 int wret;
2361 int double_split;
2362 int num_doubles = 0;
2363 struct btrfs_disk_key disk_key;
2364
2365 if (extend)
2366 space_needed = data_size;
2367
2368 /* first try to make some room by pushing left and right */
2369 if (ins_key->type != BTRFS_DIR_ITEM_KEY) {
2370 wret = push_leaf_right(trans, root, path, data_size, 0);
2371 if (wret < 0) {
2372 return wret;
2373 }
2374 if (wret) {
2375 wret = push_leaf_left(trans, root, path, data_size, 0);
2376 if (wret < 0)
2377 return wret;
2378 }
2379 l = path->nodes[0];
2380
2381 /* did the pushes work? */
2382 if (btrfs_leaf_free_space(root, l) >= space_needed)
2383 return 0;
2384 }
2385
2386 if (!path->nodes[1]) {
2387 ret = insert_new_root(trans, root, path, 1);
2388 if (ret)
2389 return ret;
2390 }
2391again:
2392 double_split = 0;
2393 l = path->nodes[0];
2394 slot = path->slots[0];
2395 nritems = btrfs_header_nritems(l);
2396 mid = (nritems + 1) / 2;
2397
2398 right = btrfs_alloc_free_block(trans, root, root->leafsize,
2399 path->nodes[1]->start,
2400 root->root_key.objectid,
2401 trans->transid, 0, l->start, 0);
2402 if (IS_ERR(right)) {
2403 BUG_ON(1);
2404 return PTR_ERR(right);
2405 }
2406
2407 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2408 btrfs_set_header_bytenr(right, right->start);
2409 btrfs_set_header_generation(right, trans->transid);
2410 btrfs_set_header_owner(right, root->root_key.objectid);
2411 btrfs_set_header_level(right, 0);
2412 write_extent_buffer(right, root->fs_info->fsid,
2413 (unsigned long)btrfs_header_fsid(right),
2414 BTRFS_FSID_SIZE);
2415
2416 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2417 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2418 BTRFS_UUID_SIZE);
2419 if (mid <= slot) {
2420 if (nritems == 1 ||
2421 leaf_space_used(l, mid, nritems - mid) + space_needed >
2422 BTRFS_LEAF_DATA_SIZE(root)) {
2423 if (slot >= nritems) {
2424 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2425 btrfs_set_header_nritems(right, 0);
2426 wret = insert_ptr(trans, root, path,
2427 &disk_key, right->start,
2428 path->slots[1] + 1, 1);
2429 if (wret)
2430 ret = wret;
2431
2432 btrfs_tree_unlock(path->nodes[0]);
2433 free_extent_buffer(path->nodes[0]);
2434 path->nodes[0] = right;
2435 path->slots[0] = 0;
2436 path->slots[1] += 1;
2437 btrfs_mark_buffer_dirty(right);
2438 return ret;
2439 }
2440 mid = slot;
2441 if (mid != nritems &&
2442 leaf_space_used(l, mid, nritems - mid) +
2443 space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
2444 double_split = 1;
2445 }
2446 }
2447 } else {
2448 if (leaf_space_used(l, 0, mid + 1) + space_needed >
2449 BTRFS_LEAF_DATA_SIZE(root)) {
2450 if (!extend && slot == 0) {
2451 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2452 btrfs_set_header_nritems(right, 0);
2453 wret = insert_ptr(trans, root, path,
2454 &disk_key,
2455 right->start,
2456 path->slots[1], 1);
2457 if (wret)
2458 ret = wret;
2459 btrfs_tree_unlock(path->nodes[0]);
2460 free_extent_buffer(path->nodes[0]);
2461 path->nodes[0] = right;
2462 path->slots[0] = 0;
2463 if (path->slots[1] == 0) {
2464 wret = fixup_low_keys(trans, root,
2465 path, &disk_key, 1);
2466 if (wret)
2467 ret = wret;
2468 }
2469 btrfs_mark_buffer_dirty(right);
2470 return ret;
2471 } else if (extend && slot == 0) {
2472 mid = 1;
2473 } else {
2474 mid = slot;
2475 if (mid != nritems &&
2476 leaf_space_used(l, mid, nritems - mid) +
2477 space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
2478 double_split = 1;
2479 }
2480 }
2481 }
2482 }
2483 nritems = nritems - mid;
2484 btrfs_set_header_nritems(right, nritems);
2485 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2486
2487 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2488 btrfs_item_nr_offset(mid),
2489 nritems * sizeof(struct btrfs_item));
2490
2491 copy_extent_buffer(right, l,
2492 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2493 data_copy_size, btrfs_leaf_data(l) +
2494 leaf_data_end(root, l), data_copy_size);
2495
2496 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2497 btrfs_item_end_nr(l, mid);
2498
2499 for (i = 0; i < nritems; i++) {
2500 struct btrfs_item *item = btrfs_item_nr(right, i);
2501 u32 ioff;
2502
2503 if (!right->map_token) {
2504 map_extent_buffer(right, (unsigned long)item,
2505 sizeof(struct btrfs_item),
2506 &right->map_token, &right->kaddr,
2507 &right->map_start, &right->map_len,
2508 KM_USER1);
2509 }
2510
2511 ioff = btrfs_item_offset(right, item);
2512 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2513 }
2514
2515 if (right->map_token) {
2516 unmap_extent_buffer(right, right->map_token, KM_USER1);
2517 right->map_token = NULL;
2518 }
2519
2520 btrfs_set_header_nritems(l, mid);
2521 ret = 0;
2522 btrfs_item_key(right, &disk_key, 0);
2523 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2524 path->slots[1] + 1, 1);
2525 if (wret)
2526 ret = wret;
2527
2528 btrfs_mark_buffer_dirty(right);
2529 btrfs_mark_buffer_dirty(l);
2530 BUG_ON(path->slots[0] != slot);
2531
2532 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2533 BUG_ON(ret);
2534
2535 if (mid <= slot) {
2536 btrfs_tree_unlock(path->nodes[0]);
2537 free_extent_buffer(path->nodes[0]);
2538 path->nodes[0] = right;
2539 path->slots[0] -= mid;
2540 path->slots[1] += 1;
2541 } else {
2542 btrfs_tree_unlock(right);
2543 free_extent_buffer(right);
2544 }
2545
2546 BUG_ON(path->slots[0] < 0);
2547
2548 if (double_split) {
2549 BUG_ON(num_doubles != 0);
2550 num_doubles++;
2551 goto again;
2552 }
2553 return ret;
2554}
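
/*
 * Worked example (illustrative): with nritems = 7 the first split point
 * is mid = (7 + 1) / 2 = 4, so items 4..6 move to the new right leaf.
 * If the chosen half still can't hold space_needed bytes, double_split
 * forces one more pass around the insertion slot; the BUG_ON above
 * caps that at a single retry.
 */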
2555
2556int btrfs_truncate_item(struct btrfs_trans_handle *trans,
2557 struct btrfs_root *root,
2558 struct btrfs_path *path,
2559 u32 new_size, int from_end)
2560{
2561 int ret = 0;
2562 int slot;
2563 int slot_orig;
2564 struct extent_buffer *leaf;
2565 struct btrfs_item *item;
2566 u32 nritems;
2567 unsigned int data_end;
2568 unsigned int old_data_start;
2569 unsigned int old_size;
2570 unsigned int size_diff;
2571 int i;
2572
2573 slot_orig = path->slots[0];
2574 leaf = path->nodes[0];
2575 slot = path->slots[0];
2576
2577 old_size = btrfs_item_size_nr(leaf, slot);
2578 if (old_size == new_size)
2579 return 0;
2580
2581 nritems = btrfs_header_nritems(leaf);
2582 data_end = leaf_data_end(root, leaf);
2583
2584 old_data_start = btrfs_item_offset_nr(leaf, slot);
2585
2586 size_diff = old_size - new_size;
2587
2588 BUG_ON(slot < 0);
2589 BUG_ON(slot >= nritems);
2590
2591 /*
2592 * item0..itemN ... dataN.offset..dataN.size .. data0.size
2593 */
2594 /* first correct the data pointers */
2595 for (i = slot; i < nritems; i++) {
2596 u32 ioff;
2597 item = btrfs_item_nr(leaf, i);
2598
2599 if (!leaf->map_token) {
2600 map_extent_buffer(leaf, (unsigned long)item,
2601 sizeof(struct btrfs_item),
2602 &leaf->map_token, &leaf->kaddr,
2603 &leaf->map_start, &leaf->map_len,
2604 KM_USER1);
2605 }
2606
2607 ioff = btrfs_item_offset(leaf, item);
2608 btrfs_set_item_offset(leaf, item, ioff + size_diff);
2609 }
2610
2611 if (leaf->map_token) {
2612 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2613 leaf->map_token = NULL;
2614 }
2615
2616 /* shift the data */
2617 if (from_end) {
2618 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2619 data_end + size_diff, btrfs_leaf_data(leaf) +
2620 data_end, old_data_start + new_size - data_end);
2621 } else {
2622 struct btrfs_disk_key disk_key;
2623 u64 offset;
2624
2625 btrfs_item_key(leaf, &disk_key, slot);
2626
2627 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
2628 unsigned long ptr;
2629 struct btrfs_file_extent_item *fi;
2630
2631 fi = btrfs_item_ptr(leaf, slot,
2632 struct btrfs_file_extent_item);
2633 fi = (struct btrfs_file_extent_item *)(
2634 (unsigned long)fi - size_diff);
2635
2636 if (btrfs_file_extent_type(leaf, fi) ==
2637 BTRFS_FILE_EXTENT_INLINE) {
2638 ptr = btrfs_item_ptr_offset(leaf, slot);
2639 memmove_extent_buffer(leaf, ptr,
2640 (unsigned long)fi,
2641 offsetof(struct btrfs_file_extent_item,
2642 disk_bytenr));
2643 }
2644 }
2645
2646 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2647 data_end + size_diff, btrfs_leaf_data(leaf) +
2648 data_end, old_data_start - data_end);
2649
2650 offset = btrfs_disk_key_offset(&disk_key);
2651 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
2652 btrfs_set_item_key(leaf, &disk_key, slot);
2653 if (slot == 0)
2654 fixup_low_keys(trans, root, path, &disk_key, 1);
2655 }
2656
2657 item = btrfs_item_nr(leaf, slot);
2658 btrfs_set_item_size(leaf, item, new_size);
2659 btrfs_mark_buffer_dirty(leaf);
2660
2661 ret = 0;
2662 if (btrfs_leaf_free_space(root, leaf) < 0) {
2663 btrfs_print_leaf(root, leaf);
2664 BUG();
2665 }
2666 return ret;
2667}
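
/*
 * A minimal usage sketch (hypothetical caller; the path must already
 * point at the item being shrunk):
 *
 *	// drop bytes from the tail, key unchanged
 *	ret = btrfs_truncate_item(trans, root, path, new_size, 1);
 *
 *	// drop bytes from the front; the key offset is bumped by the
 *	// number of bytes removed, as fixup_low_keys above requires
 *	ret = btrfs_truncate_item(trans, root, path, new_size, 0);
 */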
2668
2669int btrfs_extend_item(struct btrfs_trans_handle *trans,
2670 struct btrfs_root *root, struct btrfs_path *path,
2671 u32 data_size)
2672{
2673 int ret = 0;
2674 int slot;
2675 int slot_orig;
2676 struct extent_buffer *leaf;
2677 struct btrfs_item *item;
2678 u32 nritems;
2679 unsigned int data_end;
2680 unsigned int old_data;
2681 unsigned int old_size;
2682 int i;
2683
2684 slot_orig = path->slots[0];
2685 leaf = path->nodes[0];
2686
2687 nritems = btrfs_header_nritems(leaf);
2688 data_end = leaf_data_end(root, leaf);
2689
2690 if (btrfs_leaf_free_space(root, leaf) < data_size) {
2691 btrfs_print_leaf(root, leaf);
2692 BUG();
2693 }
2694 slot = path->slots[0];
2695 old_data = btrfs_item_end_nr(leaf, slot);
2696
2697 BUG_ON(slot < 0);
2698 if (slot >= nritems) {
2699 btrfs_print_leaf(root, leaf);
2700 printk("slot %d too large, nritems %d\n", slot, nritems);
2701 BUG_ON(1);
2702 }
2703
2704 /*
2705 * item0..itemN ... dataN.offset..dataN.size .. data0.size
2706 */
2707 /* first correct the data pointers */
2708 for (i = slot; i < nritems; i++) {
2709 u32 ioff;
2710 item = btrfs_item_nr(leaf, i);
2711
2712 if (!leaf->map_token) {
2713 map_extent_buffer(leaf, (unsigned long)item,
2714 sizeof(struct btrfs_item),
2715 &leaf->map_token, &leaf->kaddr,
2716 &leaf->map_start, &leaf->map_len,
2717 KM_USER1);
2718 }
2719 ioff = btrfs_item_offset(leaf, item);
2720 btrfs_set_item_offset(leaf, item, ioff - data_size);
2721 }
2722
2723 if (leaf->map_token) {
2724 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2725 leaf->map_token = NULL;
2726 }
2727
2728 /* shift the data */
2729 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2730 data_end - data_size, btrfs_leaf_data(leaf) +
2731 data_end, old_data - data_end);
2732
2733 data_end = old_data;
2734 old_size = btrfs_item_size_nr(leaf, slot);
2735 item = btrfs_item_nr(leaf, slot);
2736 btrfs_set_item_size(leaf, item, old_size + data_size);
2737 btrfs_mark_buffer_dirty(leaf);
2738
2739 ret = 0;
2740 if (btrfs_leaf_free_space(root, leaf) < 0) {
2741 btrfs_print_leaf(root, leaf);
2742 BUG();
2743 }
2744 return ret;
2745}
2746
2747/*
2748 * Given a key and some data, insert an item into the tree.
2749 * This does all the path init required, making room in the tree if needed.
2750 */
2751int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
2752 struct btrfs_root *root,
2753 struct btrfs_path *path,
2754 struct btrfs_key *cpu_key, u32 *data_size,
2755 int nr)
2756{
2757 struct extent_buffer *leaf;
2758 struct btrfs_item *item;
2759 int ret = 0;
2760 int slot;
2761 int slot_orig;
2762 int i;
2763 u32 nritems;
2764 u32 total_size = 0;
2765 u32 total_data = 0;
2766 unsigned int data_end;
2767 struct btrfs_disk_key disk_key;
2768
2769 for (i = 0; i < nr; i++) {
2770 total_data += data_size[i];
2771 }
2772
2773 total_size = total_data + (nr * sizeof(struct btrfs_item));
2774 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
2775 if (ret == 0)
2776 return -EEXIST;
2777 if (ret < 0)
2778 goto out;
2779
2780 slot_orig = path->slots[0];
2781 leaf = path->nodes[0];
2782
2783 nritems = btrfs_header_nritems(leaf);
2784 data_end = leaf_data_end(root, leaf);
2785
2786 if (btrfs_leaf_free_space(root, leaf) < total_size) {
2787 btrfs_print_leaf(root, leaf);
2788 printk("not enough freespace need %u have %d\n",
2789 total_size, btrfs_leaf_free_space(root, leaf));
2790 BUG();
2791 }
2792
2793 slot = path->slots[0];
2794 BUG_ON(slot < 0);
2795
2796 if (slot != nritems) {
2797 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
2798
2799 if (old_data < data_end) {
2800 btrfs_print_leaf(root, leaf);
2801 printk("slot %d old_data %d data_end %d\n",
2802 slot, old_data, data_end);
2803 BUG_ON(1);
2804 }
2805 /*
2806 * item0..itemN ... dataN.offset..dataN.size .. data0.size
2807 */
2808 /* first correct the data pointers */
2809 WARN_ON(leaf->map_token);
2810 for (i = slot; i < nritems; i++) {
2811 u32 ioff;
2812
2813 item = btrfs_item_nr(leaf, i);
2814 if (!leaf->map_token) {
2815 map_extent_buffer(leaf, (unsigned long)item,
2816 sizeof(struct btrfs_item),
2817 &leaf->map_token, &leaf->kaddr,
2818 &leaf->map_start, &leaf->map_len,
2819 KM_USER1);
2820 }
2821
2822 ioff = btrfs_item_offset(leaf, item);
2823 btrfs_set_item_offset(leaf, item, ioff - total_data);
2824 }
2825 if (leaf->map_token) {
2826 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2827 leaf->map_token = NULL;
2828 }
2829
2830 /* shift the items */
2831 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
2832 btrfs_item_nr_offset(slot),
2833 (nritems - slot) * sizeof(struct btrfs_item));
2834
2835 /* shift the data */
2836 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2837 data_end - total_data, btrfs_leaf_data(leaf) +
2838 data_end, old_data - data_end);
2839 data_end = old_data;
2840 }
2841
2842 /* setup the item for the new data */
2843 for (i = 0; i < nr; i++) {
2844 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
2845 btrfs_set_item_key(leaf, &disk_key, slot + i);
2846 item = btrfs_item_nr(leaf, slot + i);
2847 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
2848 data_end -= data_size[i];
2849 btrfs_set_item_size(leaf, item, data_size[i]);
2850 }
2851 btrfs_set_header_nritems(leaf, nritems + nr);
2852 btrfs_mark_buffer_dirty(leaf);
2853
2854 ret = 0;
2855 if (slot == 0) {
2856 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
2857 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
2858 }
2859
2860 if (btrfs_leaf_free_space(root, leaf) < 0) {
2861 btrfs_print_leaf(root, leaf);
2862 BUG();
2863 }
2864out:
2865 return ret;
2866}
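
/*
 * A minimal sketch of a multi-item insert (hypothetical helper and
 * sizes; keys[] must already be in tree order).  The items come back
 * empty and the caller fills in each one's data afterwards.
 */
static int example_insert_two(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct btrfs_key *keys)
{
	struct btrfs_path *path;
	u32 sizes[2] = { 8, 16 };	/* arbitrary per-item data sizes */
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
	if (ret == 0) {
		/* write item data at btrfs_item_ptr_offset(leaf, slot) */
		btrfs_mark_buffer_dirty(path->nodes[0]);
	}
	btrfs_free_path(path);
	return ret;
}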
2867
2868/*
2869 * Given a key and some data, insert an item into the tree.
2870 * This does all the path init required, making room in the tree if needed.
2871 */
2872int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
2873 *root, struct btrfs_key *cpu_key, void *data, u32
2874 data_size)
2875{
2876 int ret = 0;
2877 struct btrfs_path *path;
2878 struct extent_buffer *leaf;
2879 unsigned long ptr;
2880
2881 path = btrfs_alloc_path();
2882 BUG_ON(!path);
2883 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
2884 if (!ret) {
2885 leaf = path->nodes[0];
2886 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
2887 write_extent_buffer(leaf, data, ptr, data_size);
2888 btrfs_mark_buffer_dirty(leaf);
2889 }
2890 btrfs_free_path(path);
2891 return ret;
2892}
2893
2894/*
2895 * delete the pointer from a given node.
2896 *
2897 * If the delete empties a node, the node is removed from the tree,
2898 * continuing all the way to the root if required. The root is converted into
2899 * a leaf if all the nodes are emptied.
2900 */
2901static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2902 struct btrfs_path *path, int level, int slot)
2903{
2904 struct extent_buffer *parent = path->nodes[level];
2905 u32 nritems;
2906 int ret = 0;
2907 int wret;
2908
2909 nritems = btrfs_header_nritems(parent);
2910 if (slot != nritems - 1) {
2911 memmove_extent_buffer(parent,
2912 btrfs_node_key_ptr_offset(slot),
2913 btrfs_node_key_ptr_offset(slot + 1),
2914 sizeof(struct btrfs_key_ptr) *
2915 (nritems - slot - 1));
2916 }
2917 nritems--;
2918 btrfs_set_header_nritems(parent, nritems);
2919 if (nritems == 0 && parent == root->node) {
2920 BUG_ON(btrfs_header_level(root->node) != 1);
2921 /* just turn the root into a leaf and break */
2922 btrfs_set_header_level(root->node, 0);
2923 } else if (slot == 0) {
2924 struct btrfs_disk_key disk_key;
2925
2926 btrfs_node_key(parent, &disk_key, 0);
2927 wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
2928 if (wret)
2929 ret = wret;
2930 }
2931 btrfs_mark_buffer_dirty(parent);
2932 return ret;
2933}
2934
2935/*
2936 * delete the item at the leaf level in path. If that empties
2937 * the leaf, remove it from the tree
2938 */
2939int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2940 struct btrfs_path *path, int slot, int nr)
2941{
2942 struct extent_buffer *leaf;
2943 struct btrfs_item *item;
2944 int last_off;
2945 int dsize = 0;
2946 int ret = 0;
2947 int wret;
2948 int i;
2949 u32 nritems;
2950
2951 leaf = path->nodes[0];
2952 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
2953
2954 for (i = 0; i < nr; i++)
2955 dsize += btrfs_item_size_nr(leaf, slot + i);
2956
2957 nritems = btrfs_header_nritems(leaf);
2958
2959 if (slot + nr != nritems) {
2960 int data_end = leaf_data_end(root, leaf);
2961
2962 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2963 data_end + dsize,
2964 btrfs_leaf_data(leaf) + data_end,
2965 last_off - data_end);
2966
2967 for (i = slot + nr; i < nritems; i++) {
2968 u32 ioff;
2969
2970 item = btrfs_item_nr(leaf, i);
2971 if (!leaf->map_token) {
2972 map_extent_buffer(leaf, (unsigned long)item,
2973 sizeof(struct btrfs_item),
2974 &leaf->map_token, &leaf->kaddr,
2975 &leaf->map_start, &leaf->map_len,
2976 KM_USER1);
2977 }
2978 ioff = btrfs_item_offset(leaf, item);
2979 btrfs_set_item_offset(leaf, item, ioff + dsize);
2980 }
2981
2982 if (leaf->map_token) {
2983 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2984 leaf->map_token = NULL;
2985 }
2986
2987 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
2988 btrfs_item_nr_offset(slot + nr),
2989 sizeof(struct btrfs_item) *
2990 (nritems - slot - nr));
2991 }
2992 btrfs_set_header_nritems(leaf, nritems - nr);
2993 nritems -= nr;
2994
2995 /* delete the leaf if we've emptied it */
2996 if (nritems == 0) {
2997 if (leaf == root->node) {
2998 btrfs_set_header_level(leaf, 0);
2999 } else {
3000 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3001 wret = del_ptr(trans, root, path, 1, path->slots[1]);
3002 if (wret)
3003 ret = wret;
3004 wret = btrfs_free_extent(trans, root,
3005 leaf->start, leaf->len,
3006 path->nodes[1]->start,
3007 btrfs_header_owner(path->nodes[1]),
3008 root_gen, 0, 0, 1);
3009 if (wret)
3010 ret = wret;
3011 }
3012 } else {
3013 int used = leaf_space_used(leaf, 0, nritems);
3014 if (slot == 0) {
3015 struct btrfs_disk_key disk_key;
3016
3017 btrfs_item_key(leaf, &disk_key, 0);
3018 wret = fixup_low_keys(trans, root, path,
3019 &disk_key, 1);
3020 if (wret)
3021 ret = wret;
3022 }
3023
3024 /* delete the leaf if it is mostly empty */
3025 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3026 /* push_leaf_left fixes the path.
3027 * make sure the path still points to our leaf
3028 * for possible call to del_ptr below
3029 */
3030 slot = path->slots[1];
3031 extent_buffer_get(leaf);
3032
3033 wret = push_leaf_left(trans, root, path, 1, 1);
3034 if (wret < 0 && wret != -ENOSPC)
3035 ret = wret;
3036
3037 if (path->nodes[0] == leaf &&
3038 btrfs_header_nritems(leaf)) {
3039 wret = push_leaf_right(trans, root, path, 1, 1);
3040 if (wret < 0 && wret != -ENOSPC)
3041 ret = wret;
3042 }
3043
3044 if (btrfs_header_nritems(leaf) == 0) {
3045 u64 root_gen;
3046 u64 bytenr = leaf->start;
3047 u32 blocksize = leaf->len;
3048
3049 root_gen = btrfs_header_generation(
3050 path->nodes[1]);
3051
3052 wret = del_ptr(trans, root, path, 1, slot);
3053 if (wret)
3054 ret = wret;
3055
3056 free_extent_buffer(leaf);
3057 wret = btrfs_free_extent(trans, root, bytenr,
3058 blocksize, path->nodes[1]->start,
3059 btrfs_header_owner(path->nodes[1]),
3060 root_gen, 0, 0, 1);
3061 if (wret)
3062 ret = wret;
3063 } else {
3064 /* if we're still in the path, make sure
3065 * we're dirty. Otherwise, one of the
3066 * push_leaf functions must have already
3067 * dirtied this buffer
3068 */
3069 if (path->nodes[0] == leaf)
3070 btrfs_mark_buffer_dirty(leaf);
3071 free_extent_buffer(leaf);
3072 }
3073 } else {
3074 btrfs_mark_buffer_dirty(leaf);
3075 }
3076 }
3077 return ret;
3078}
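
/*
 * A minimal deletion sketch (hypothetical caller): search with
 * ins_len == -1 and cow == 1 so the leaf is writable and allowed to
 * shrink, then delete the single item the path landed on:
 *
 *	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 *	if (ret == 0)
 *		ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
 */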
3079
3080/*
3081 * search the tree again to find a leaf with lesser keys
3082 * returns 0 if it found something or 1 if there are no lesser leaves.
3083 * returns < 0 on io errors.
3084 */
3085int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3086{
3087 struct btrfs_key key;
3088 struct btrfs_disk_key found_key;
3089 int ret;
3090
3091 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3092
3093 if (key.offset > 0)
3094 key.offset--;
3095 else if (key.type > 0)
3096 key.type--;
3097 else if (key.objectid > 0)
3098 key.objectid--;
3099 else
3100 return 1;
3101
3102 btrfs_release_path(root, path);
3103 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3104 if (ret < 0)
3105 return ret;
3106 btrfs_item_key(path->nodes[0], &found_key, 0);
3107 ret = comp_keys(&found_key, &key);
3108 if (ret < 0)
3109 return 0;
3110 return 1;
3111}
3112
3113/*
3114 * A helper function to walk down the tree starting at min_key, and looking
3115 * for nodes or leaves that are either in cache or have a minimum
3116 * transaction id. This is used by the btree defrag code, but could
3117 * also be used to search for blocks that have changed since a given
3118 * transaction id.
3119 *
3120 * This does not cow, but it does stuff the starting key it finds back
3121 * into min_key, so you can call btrfs_search_slot with cow=1 on the
3122 * key and get a writable path.
3123 *
3124 * This does lock as it descends, and path->keep_locks should be set
3125 * to 1 by the caller.
3126 *
3127 * This honors path->lowest_level to prevent descent past a given level
3128 * of the tree.
3129 *
3130 * returns zero if something useful was found, < 0 on error and 1 if there
3131 * was nothing in the tree that matched the search criteria.
3132 */
3133int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3134 struct btrfs_key *max_key,
3135 struct btrfs_path *path, int cache_only,
3136 u64 min_trans)
3137{
3138 struct extent_buffer *cur;
3139 struct btrfs_key found_key;
3140 int slot;
3141 int sret;
3142 u32 nritems;
3143 int level;
3144 int ret = 1;
3145
3146again:
3147 cur = btrfs_lock_root_node(root);
3148 level = btrfs_header_level(cur);
3149 WARN_ON(path->nodes[level]);
3150 path->nodes[level] = cur;
3151 path->locks[level] = 1;
3152
3153 if (btrfs_header_generation(cur) < min_trans) {
3154 ret = 1;
3155 goto out;
3156 }
3157 while (1) {
3158 nritems = btrfs_header_nritems(cur);
3159 level = btrfs_header_level(cur);
3160 sret = bin_search(cur, min_key, level, &slot);
3161
3162 /* at level = 0, we're done, setup the path and exit */
3163 if (level == 0) {
3164 if (slot >= nritems)
3165 goto find_next_key;
3166 ret = 0;
3167 path->slots[level] = slot;
3168 btrfs_item_key_to_cpu(cur, &found_key, slot);
3169 goto out;
3170 }
3171 if (sret && slot > 0)
3172 slot--;
3173 /*
3174 * check this node pointer against the cache_only and
3175 * min_trans parameters. If it isn't in cache or is too
3176 * old, skip to the next one.
3177 */
3178 while (slot < nritems) {
3179 u64 blockptr;
3180 u64 gen;
3181 struct extent_buffer *tmp;
3182 struct btrfs_disk_key disk_key;
3183
3184 blockptr = btrfs_node_blockptr(cur, slot);
3185 gen = btrfs_node_ptr_generation(cur, slot);
3186 if (gen < min_trans) {
3187 slot++;
3188 continue;
3189 }
3190 if (!cache_only)
3191 break;
3192
3193 if (max_key) {
3194 btrfs_node_key(cur, &disk_key, slot);
3195 if (comp_keys(&disk_key, max_key) >= 0) {
3196 ret = 1;
3197 goto out;
3198 }
3199 }
3200
3201 tmp = btrfs_find_tree_block(root, blockptr,
3202 btrfs_level_size(root, level - 1));
3203
3204 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
3205 free_extent_buffer(tmp);
3206 break;
3207 }
3208 if (tmp)
3209 free_extent_buffer(tmp);
3210 slot++;
3211 }
3212find_next_key:
3213 /*
3214 * we didn't find a candidate key in this node, walk forward
3215 * and find another one
3216 */
3217 if (slot >= nritems) {
3218 path->slots[level] = slot;
3219 sret = btrfs_find_next_key(root, path, min_key, level,
3220 cache_only, min_trans);
3221 if (sret == 0) {
3222 btrfs_release_path(root, path);
3223 goto again;
3224 } else {
3225 goto out;
3226 }
3227 }
3228 /* save our key to hand back to the caller */
3229 btrfs_node_key_to_cpu(cur, &found_key, slot);
3230 path->slots[level] = slot;
3231 if (level == path->lowest_level) {
3232 ret = 0;
3233 unlock_up(path, level, 1);
3234 goto out;
3235 }
3236 cur = read_node_slot(root, cur, slot);
3237
3238 btrfs_tree_lock(cur);
3239 path->locks[level - 1] = 1;
3240 path->nodes[level - 1] = cur;
3241 unlock_up(path, level, 1);
3242 }
3243out:
3244 if (ret == 0)
3245 memcpy(min_key, &found_key, sizeof(found_key));
3246 return ret;
3247}
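
/*
 * A minimal scan sketch (hypothetical caller, loosely modeled on the
 * btree defrag code): visit everything modified since min_trans.  The
 * key advance shown is simplistic; a real caller steps the key past
 * whatever it just processed.
 *
 *	path->keep_locks = 1;
 *	while (1) {
 *		ret = btrfs_search_forward(root, &min_key, NULL, path,
 *					   0, min_trans);
 *		if (ret)	// 1 == nothing left, < 0 == error
 *			break;
 *		// use path->nodes[path->lowest_level] here
 *		btrfs_release_path(root, path);
 *		min_key.offset++;	// simplistic key advance
 *	}
 */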
3248
3249/*
3250 * this is similar to btrfs_next_leaf, but does not try to preserve
3251 * and fixup the path. It looks for and returns the next key in the
3252 * tree based on the current path and the cache_only and min_trans
3253 * parameters.
3254 *
3255 * 0 is returned if another key is found, < 0 if there are any errors
3256 * and 1 is returned if there are no higher keys in the tree
3257 *
3258 * path->keep_locks should be set to 1 on the search made before
3259 * calling this function.
3260 */
3261int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3262 struct btrfs_key *key, int lowest_level,
3263 int cache_only, u64 min_trans)
3264{
3265 int level = lowest_level;
3266 int slot;
3267 struct extent_buffer *c;
3268
3269 while (level < BTRFS_MAX_LEVEL) {
3270 if (!path->nodes[level])
3271 return 1;
3272
3273 slot = path->slots[level] + 1;
3274 c = path->nodes[level];
3275next:
3276 if (slot >= btrfs_header_nritems(c)) {
3277 level++;
3278 if (level == BTRFS_MAX_LEVEL) {
3279 return 1;
3280 }
3281 continue;
3282 }
3283 if (level == 0)
3284 btrfs_item_key_to_cpu(c, key, slot);
3285 else {
3286 u64 blockptr = btrfs_node_blockptr(c, slot);
3287 u64 gen = btrfs_node_ptr_generation(c, slot);
3288
3289 if (cache_only) {
3290 struct extent_buffer *cur;
3291 cur = btrfs_find_tree_block(root, blockptr,
3292 btrfs_level_size(root, level - 1));
3293 if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
3294 slot++;
3295 if (cur)
3296 free_extent_buffer(cur);
3297 goto next;
3298 }
3299 free_extent_buffer(cur);
3300 }
3301 if (gen < min_trans) {
3302 slot++;
3303 goto next;
3304 }
3305 btrfs_node_key_to_cpu(c, key, slot);
3306 }
3307 return 0;
3308 }
3309 return 1;
3310}
3311
3312/*
3313 * search the tree again to find a leaf with greater keys
3314 * returns 0 if it found something or 1 if there are no greater leaves.
3315 * returns < 0 on io errors.
3316 */
3317int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3318{
3319 int slot;
3320 int level = 1;
3321 struct extent_buffer *c;
3322 struct extent_buffer *next = NULL;
3323 struct btrfs_key key;
3324 u32 nritems;
3325 int ret;
3326
3327 nritems = btrfs_header_nritems(path->nodes[0]);
3328 if (nritems == 0) {
3329 return 1;
3330 }
3331
3332 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
3333
3334 btrfs_release_path(root, path);
3335 path->keep_locks = 1;
3336 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3337 path->keep_locks = 0;
3338
3339 if (ret < 0)
3340 return ret;
3341
3342 nritems = btrfs_header_nritems(path->nodes[0]);
3343 /*
3344 * by releasing the path above we dropped all our locks. A balance
3345 * could have added more items next to the key that used to be
3346 * at the very end of the block. So, check again here and
3347 * advance the path if there are now more items available.
3348 */
3349 if (nritems > 0 && path->slots[0] < nritems - 1) {
3350 path->slots[0]++;
3351 goto done;
3352 }
3353
3354 while (level < BTRFS_MAX_LEVEL) {
3355 if (!path->nodes[level])
3356 return 1;
3357
3358 slot = path->slots[level] + 1;
3359 c = path->nodes[level];
3360 if (slot >= btrfs_header_nritems(c)) {
3361 level++;
3362 if (level == BTRFS_MAX_LEVEL) {
3363 return 1;
3364 }
3365 continue;
3366 }
3367
3368 if (next) {
3369 btrfs_tree_unlock(next);
3370 free_extent_buffer(next);
3371 }
3372
3373 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3374 path->reada)
3375 reada_for_search(root, path, level, slot, 0);
3376
3377 next = read_node_slot(root, c, slot);
3378 if (!path->skip_locking) {
3379 WARN_ON(!btrfs_tree_locked(c));
3380 btrfs_tree_lock(next);
3381 }
3382 break;
3383 }
3384 path->slots[level] = slot;
3385 while (1) {
3386 level--;
3387 c = path->nodes[level];
3388 if (path->locks[level])
3389 btrfs_tree_unlock(c);
3390 free_extent_buffer(c);
3391 path->nodes[level] = next;
3392 path->slots[level] = 0;
3393 if (!path->skip_locking)
3394 path->locks[level] = 1;
3395 if (!level)
3396 break;
3397 if (level == 1 && path->locks[1] && path->reada)
3398 reada_for_search(root, path, level, slot, 0);
3399 next = read_node_slot(root, next, 0);
3400 if (!path->skip_locking) {
3401 WARN_ON(!btrfs_tree_locked(path->nodes[level]));
3402 btrfs_tree_lock(next);
3403 }
3404 }
3405done:
3406 unlock_up(path, 0, 1);
3407 return 0;
3408}
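
/*
 * The canonical iteration pattern built on btrfs_next_leaf (a sketch;
 * hypothetical caller): walk items in key order, hopping to the next
 * leaf whenever the current one runs out.
 *
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	while (ret >= 0) {
 *		leaf = path->nodes[0];
 *		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 *			ret = btrfs_next_leaf(root, path);
 *			if (ret)	// 1 == no more leaves
 *				break;
 *			continue;
 *		}
 *		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 *		// ... examine the item ...
 *		path->slots[0]++;
 *	}
 */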
3409
3410/*
3411 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
3412 * searching until it gets past min_objectid or finds an item of 'type'
3413 *
3414 * returns 0 if something is found, 1 if nothing was found and < 0 on error
3415 */
3416int btrfs_previous_item(struct btrfs_root *root,
3417 struct btrfs_path *path, u64 min_objectid,
3418 int type)
3419{
3420 struct btrfs_key found_key;
3421 struct extent_buffer *leaf;
3422 u32 nritems;
3423 int ret;
3424
3425 while (1) {
3426 if (path->slots[0] == 0) {
3427 ret = btrfs_prev_leaf(root, path);
3428 if (ret != 0)
3429 return ret;
3430 } else {
3431 path->slots[0]--;
3432 }
3433 leaf = path->nodes[0];
3434 nritems = btrfs_header_nritems(leaf);
3435 if (nritems == 0)
3436 return 1;
3437 if (path->slots[0] == nritems)
3438 path->slots[0]--;
3439
3440 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3441 if (found_key.type == type)
3442 return 0;
3443 if (found_key.objectid < min_objectid)
3444 break;
3445 if (found_key.objectid == min_objectid &&
3446 found_key.type < type)
3447 break;
3448 }
3449 return 1;
3450}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 000000000000..138c157bbc45
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,1875 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_CTREE__
20#define __BTRFS_CTREE__
21
22#include <linux/version.h>
23#include <linux/mm.h>
24#include <linux/highmem.h>
25#include <linux/fs.h>
26#include <linux/completion.h>
27#include <linux/backing-dev.h>
28#include <linux/wait.h>
29#include <asm/kmap_types.h>
30#include "bit-radix.h"
31#include "extent_io.h"
32#include "extent_map.h"
33#include "async-thread.h"
34
35struct btrfs_trans_handle;
36struct btrfs_transaction;
37extern struct kmem_cache *btrfs_trans_handle_cachep;
38extern struct kmem_cache *btrfs_transaction_cachep;
39extern struct kmem_cache *btrfs_bit_radix_cachep;
40extern struct kmem_cache *btrfs_path_cachep;
41struct btrfs_ordered_sum;
42
43#define BTRFS_MAGIC "_B9RfS_M"
44
45#define BTRFS_ACL_NOT_CACHED ((void *)-1)
46
47#ifdef CONFIG_LOCKDEP
48# define BTRFS_MAX_LEVEL 7
49#else
50# define BTRFS_MAX_LEVEL 8
51#endif
52
53/* holds pointers to all of the tree roots */
54#define BTRFS_ROOT_TREE_OBJECTID 1ULL
55
56/* stores information about which extents are in use, and reference counts */
57#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
58
59/*
60 * chunk tree stores translations from logical -> physical block numbering.
61 * the super block points to the chunk tree
62 */
63#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
64
65/*
66 * stores information about which areas of a given device are in use.
67 * one per device. The tree of tree roots points to the device tree
68 */
69#define BTRFS_DEV_TREE_OBJECTID 4ULL
70
71/* one per subvolume, storing files and directories */
72#define BTRFS_FS_TREE_OBJECTID 5ULL
73
74/* directory objectid inside the root tree */
75#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
76
77/* orphan objectid for tracking unlinked/truncated files */
78#define BTRFS_ORPHAN_OBJECTID -5ULL
79
80/* does write ahead logging to speed up fsyncs */
81#define BTRFS_TREE_LOG_OBJECTID -6ULL
82#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
83
84/* dummy objectid represents multiple objectids */
85#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
86
87/*
88 * All files have objectids in this range.
89 */
90#define BTRFS_FIRST_FREE_OBJECTID 256ULL
91#define BTRFS_LAST_FREE_OBJECTID -256ULL
92#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
93
94
95/*
96 * the device items go into the chunk tree. The key is in the form
97 * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
98 */
99#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
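
/*
 * e.g. the key for device 2's item would be built as (a sketch; the
 * BTRFS_DEV_ITEM_KEY type value is defined with the other key types
 * further down this header):
 *
 *	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 *	key.type = BTRFS_DEV_ITEM_KEY;
 *	key.offset = 2;		// the devid
 */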
100
101/*
102 * we can actually store much bigger names, but let's not confuse the rest
103 * of linux
104 */
105#define BTRFS_NAME_LEN 255
106
107/* 32 bytes in various csum fields */
108#define BTRFS_CSUM_SIZE 32
109/* four bytes for CRC32 */
110#define BTRFS_CRC32_SIZE 4
111#define BTRFS_EMPTY_DIR_SIZE 0
112
113#define BTRFS_FT_UNKNOWN 0
114#define BTRFS_FT_REG_FILE 1
115#define BTRFS_FT_DIR 2
116#define BTRFS_FT_CHRDEV 3
117#define BTRFS_FT_BLKDEV 4
118#define BTRFS_FT_FIFO 5
119#define BTRFS_FT_SOCK 6
120#define BTRFS_FT_SYMLINK 7
121#define BTRFS_FT_XATTR 8
122#define BTRFS_FT_MAX 9
123
124/*
125 * the key defines the order in the tree, and so it also defines (optimal)
126 * block layout. objectid corresponds to the inode number. The flags
127 * field tells us things about the object, and is a kind of stream selector.
128 * so for a given inode, keys with flags of 1 might refer to the inode
129 * data, flags of 2 may point to file data in the btree and flags == 3
130 * may point to extents.
131 *
132 * offset is the starting byte offset for this key in the stream.
133 *
134 * btrfs_disk_key is in disk byte order. struct btrfs_key is always
135 * in cpu native order. Otherwise they are identical and their sizes
136 * should be the same (ie both packed)
137 */
138struct btrfs_disk_key {
139 __le64 objectid;
140 u8 type;
141 __le64 offset;
142} __attribute__ ((__packed__));
143
144struct btrfs_key {
145 u64 objectid;
146 u8 type;
147 u64 offset;
148} __attribute__ ((__packed__));
149
150struct btrfs_mapping_tree {
151 struct extent_map_tree map_tree;
152};
153
154#define BTRFS_UUID_SIZE 16
155struct btrfs_dev_item {
156 /* the internal btrfs device id */
157 __le64 devid;
158
159 /* size of the device */
160 __le64 total_bytes;
161
162 /* bytes used */
163 __le64 bytes_used;
164
165 /* optimal io alignment for this device */
166 __le32 io_align;
167
168 /* optimal io width for this device */
169 __le32 io_width;
170
171 /* minimal io size for this device */
172 __le32 sector_size;
173
174 /* type and info about this device */
175 __le64 type;
176
177 /* grouping information for allocation decisions */
178 __le32 dev_group;
179
180 /* seek speed 0-100 where 100 is fastest */
181 u8 seek_speed;
182
183 /* bandwidth 0-100 where 100 is fastest */
184 u8 bandwidth;
185
186 /* btrfs generated uuid for this device */
187 u8 uuid[BTRFS_UUID_SIZE];
188} __attribute__ ((__packed__));
189
190struct btrfs_stripe {
191 __le64 devid;
192 __le64 offset;
193 u8 dev_uuid[BTRFS_UUID_SIZE];
194} __attribute__ ((__packed__));
195
196struct btrfs_chunk {
197 /* size of this chunk in bytes */
198 __le64 length;
199
200 /* objectid of the root referencing this chunk */
201 __le64 owner;
202
203 __le64 stripe_len;
204 __le64 type;
205
206 /* optimal io alignment for this chunk */
207 __le32 io_align;
208
209 /* optimal io width for this chunk */
210 __le32 io_width;
211
212 /* minimal io size for this chunk */
213 __le32 sector_size;
214
215 /* 2^16 stripes is quite a lot; a second limit is the size of a single
216 * item in the btree
217 */
218 __le16 num_stripes;
219
220 /* sub stripes only matter for raid10 */
221 __le16 sub_stripes;
222 struct btrfs_stripe stripe;
223 /* additional stripes go here */
224} __attribute__ ((__packed__));
225
226static inline unsigned long btrfs_chunk_item_size(int num_stripes)
227{
228 BUG_ON(num_stripes == 0);
229 return sizeof(struct btrfs_chunk) +
230 sizeof(struct btrfs_stripe) * (num_stripes - 1);
231}
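
/*
 * e.g. a four-stripe chunk item takes sizeof(struct btrfs_chunk) +
 * 3 * sizeof(struct btrfs_stripe) bytes: the first stripe is already
 * embedded in struct btrfs_chunk, hence the num_stripes - 1 above.
 */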
232
233#define BTRFS_FSID_SIZE 16
234#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)
235
236/*
237 * every tree block (leaf or node) starts with this header.
238 */
239struct btrfs_header {
240 /* these first four must match the super block */
241 u8 csum[BTRFS_CSUM_SIZE];
242 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
243 __le64 bytenr; /* which block this node is supposed to live in */
244 __le64 flags;
245
246 /* allowed to be different from the super from here on down */
247 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
248 __le64 generation;
249 __le64 owner;
250 __le32 nritems;
251 u8 level;
252} __attribute__ ((__packed__));
253
254#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
255 sizeof(struct btrfs_header)) / \
256 sizeof(struct btrfs_key_ptr))
257#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
258#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
259#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
260 sizeof(struct btrfs_item) - \
261 sizeof(struct btrfs_file_extent_item))
262
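/*
 * Worked sizes (from the packed structs above): struct btrfs_header is
 * 32 + 16 + 8 + 8 + 16 + 8 + 8 + 4 + 1 = 101 bytes.  With 4K blocks,
 * for example, a leaf has 4096 - 101 = 3995 bytes for items plus data,
 * and a node holds (4096 - 101) / 33 = 121 key pointers, since a packed
 * struct btrfs_key_ptr is 17 + 8 + 8 = 33 bytes.
 */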
263
264/*
265 * this is a very generous portion of the super block, giving us
266 * room to translate 14 chunks with 3 stripes each.
267 */
268#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
269#define BTRFS_LABEL_SIZE 256
270
271/*
272 * the super block basically lists the main trees of the FS
273 * it currently lacks any block count etc etc
274 */
275struct btrfs_super_block {
276 u8 csum[BTRFS_CSUM_SIZE];
277 /* the first 4 fields must match struct btrfs_header */
278 u8 fsid[16]; /* FS specific uuid */
279 __le64 bytenr; /* this block number */
280 __le64 flags;
281
282 /* allowed to be different from the btrfs_header from here on down */
283 __le64 magic;
284 __le64 generation;
285 __le64 root;
286 __le64 chunk_root;
287 __le64 log_root;
288 __le64 total_bytes;
289 __le64 bytes_used;
290 __le64 root_dir_objectid;
291 __le64 num_devices;
292 __le32 sectorsize;
293 __le32 nodesize;
294 __le32 leafsize;
295 __le32 stripesize;
296 __le32 sys_chunk_array_size;
297 u8 root_level;
298 u8 chunk_root_level;
299 u8 log_root_level;
300 struct btrfs_dev_item dev_item;
301 char label[BTRFS_LABEL_SIZE];
302 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
303} __attribute__ ((__packed__));
304
305/*
306 * A leaf is full of items. offset and size tell us where to find
307 * the item in the leaf (relative to the start of the data area)
308 */
309struct btrfs_item {
310 struct btrfs_disk_key key;
311 __le32 offset;
312 __le32 size;
313} __attribute__ ((__packed__));
314
315/*
316 * leaves have an item area and a data area:
317 * [item0, item1....itemN] [free space] [dataN...data1, data0]
318 *
319 * The data is separate from the items to get the keys closer together
320 * during searches.
321 */
322struct btrfs_leaf {
323 struct btrfs_header header;
324 struct btrfs_item items[];
325} __attribute__ ((__packed__));
326
327/*
328 * all non-leaf blocks are nodes, they hold only keys and pointers to
329 * other blocks
330 */
331struct btrfs_key_ptr {
332 struct btrfs_disk_key key;
333 __le64 blockptr;
334 __le64 generation;
335} __attribute__ ((__packed__));
336
337struct btrfs_node {
338 struct btrfs_header header;
339 struct btrfs_key_ptr ptrs[];
340} __attribute__ ((__packed__));
341
342/*
343 * btrfs_paths remember the path taken from the root down to the leaf.
344 * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
345 * to any other levels that are present.
346 *
347 * The slots array records the index of the item or block pointer
348 * used while walking the tree.
349 */
350struct btrfs_path {
351 struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
352 int slots[BTRFS_MAX_LEVEL];
353 /* if there is real range locking, this locks field will change */
354 int locks[BTRFS_MAX_LEVEL];
355 int reada;
356 /* keep some upper locks as we walk down */
357 int keep_locks;
358 int skip_locking;
359 int lowest_level;
360};
361
362/*
363 * items in the extent btree are used to record the objectid of the
364 * owner of the block and the number of references
365 */
366struct btrfs_extent_item {
367 __le32 refs;
368} __attribute__ ((__packed__));
369
370struct btrfs_extent_ref {
371 __le64 root;
372 __le64 generation;
373 __le64 objectid;
374 __le64 offset;
375 __le32 num_refs;
376} __attribute__ ((__packed__));
377
378/* dev extents record free space on individual devices. The owner
379 * field points back to the chunk allocation mapping tree that allocated
380 * the extent. The chunk tree uuid field is a way to double check the owner
381 */
382struct btrfs_dev_extent {
383 __le64 chunk_tree;
384 __le64 chunk_objectid;
385 __le64 chunk_offset;
386 __le64 length;
387 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
388} __attribute__ ((__packed__));
389
390struct btrfs_inode_ref {
391 __le64 index;
392 __le16 name_len;
393 /* name goes here */
394} __attribute__ ((__packed__));
395
396struct btrfs_timespec {
397 __le64 sec;
398 __le32 nsec;
399} __attribute__ ((__packed__));
400
401/*
402 * there is no padding here on purpose. If you want to extend the inode,
403 * make a new item type
404 */
405struct btrfs_inode_item {
406 /* nfs style generation number */
407 __le64 generation;
408 /* transid that last touched this inode */
409 __le64 transid;
410 __le64 size;
411 __le64 nblocks;
412 __le64 block_group;
413 __le32 nlink;
414 __le32 uid;
415 __le32 gid;
416 __le32 mode;
417 __le64 rdev;
418 __le16 flags;
419 __le16 compat_flags;
420 struct btrfs_timespec atime;
421 struct btrfs_timespec ctime;
422 struct btrfs_timespec mtime;
423 struct btrfs_timespec otime;
424} __attribute__ ((__packed__));
425
426struct btrfs_dir_log_item {
427 __le64 end;
428} __attribute__ ((__packed__));
429
430struct btrfs_dir_item {
431 struct btrfs_disk_key location;
432 __le64 transid;
433 __le16 data_len;
434 __le16 name_len;
435 u8 type;
436} __attribute__ ((__packed__));
437
438struct btrfs_root_item {
439 struct btrfs_inode_item inode;
440 __le64 root_dirid;
441 __le64 bytenr;
442 __le64 byte_limit;
443 __le64 bytes_used;
444 __le32 flags;
445 __le32 refs;
446 struct btrfs_disk_key drop_progress;
447 u8 drop_level;
448 u8 level;
449} __attribute__ ((__packed__));
450
451#define BTRFS_FILE_EXTENT_REG 0
452#define BTRFS_FILE_EXTENT_INLINE 1
453
454struct btrfs_file_extent_item {
455 __le64 generation;
456 u8 type;
457 /*
458 * disk space consumed by the extent, checksum blocks are included
459 * in these numbers
460 */
461 __le64 disk_bytenr;
462 __le64 disk_num_bytes;
463 /*
464 * the logical offset in file blocks (no csums)
465 * this extent record is for. This allows a file extent to point
466 * into the middle of an existing extent on disk, sharing it
467 * between two snapshots (useful if some bytes in the middle of the
468 * extent have changed)
469 */
470 __le64 offset;
471 /*
472 * the logical number of file blocks (no csums included)
473 */
474 __le64 num_bytes;
475} __attribute__ ((__packed__));
476
477struct btrfs_csum_item {
478 u8 csum;
479} __attribute__ ((__packed__));
480
481/* different types of block groups (and chunks) */
482#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
483#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
484#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
485#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
486#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
487#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
488#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
489
490struct btrfs_block_group_item {
491 __le64 used;
492 __le64 chunk_objectid;
493 __le64 flags;
494} __attribute__ ((__packed__));
495
496struct btrfs_space_info {
497 u64 flags;
498 u64 total_bytes;
499 u64 bytes_used;
500 u64 bytes_pinned;
501 int full;
502 int force_alloc;
503 struct list_head list;
504
505 /* for block groups in our same type */
506 struct list_head block_groups;
507 spinlock_t lock;
508};
509
510struct btrfs_free_space {
511 struct rb_node bytes_index;
512 struct rb_node offset_index;
513 u64 offset;
514 u64 bytes;
515};
516
517struct btrfs_block_group_cache {
518 struct btrfs_key key;
519 struct btrfs_block_group_item item;
520 spinlock_t lock;
521 u64 pinned;
522 u64 flags;
523 int cached;
524 int ro;
525 int dirty;
526
527 struct btrfs_space_info *space_info;
528
529 /* free space cache stuff */
530 struct rb_root free_space_bytes;
531 struct rb_root free_space_offset;
532
533 /* block group cache stuff */
534 struct rb_node cache_node;
535
536 /* for block groups in the same raid type */
537 struct list_head list;
538};
539
540struct btrfs_device;
541struct btrfs_fs_devices;
542struct btrfs_fs_info {
543 u8 fsid[BTRFS_FSID_SIZE];
544 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
545 struct btrfs_root *extent_root;
546 struct btrfs_root *tree_root;
547 struct btrfs_root *chunk_root;
548 struct btrfs_root *dev_root;
549
550 /* the log root tree is a directory of all the other log roots */
551 struct btrfs_root *log_root_tree;
552 struct radix_tree_root fs_roots_radix;
553
554 /* block group cache stuff */
555 spinlock_t block_group_cache_lock;
556 struct rb_root block_group_cache_tree;
557
558 struct extent_io_tree pinned_extents;
559 struct extent_io_tree pending_del;
560 struct extent_io_tree extent_ins;
561
562 /* logical->physical extent mapping */
563 struct btrfs_mapping_tree mapping_tree;
564
565 u64 generation;
566 u64 last_trans_committed;
567 u64 last_trans_new_blockgroup;
568 u64 open_ioctl_trans;
569 unsigned long mount_opt;
570 u64 max_extent;
571 u64 max_inline;
572 u64 alloc_start;
573 struct btrfs_transaction *running_transaction;
574 wait_queue_head_t transaction_throttle;
575 wait_queue_head_t transaction_wait;
576 wait_queue_head_t async_submit_wait;
577
578 wait_queue_head_t tree_log_wait;
579
580 struct btrfs_super_block super_copy;
581 struct btrfs_super_block super_for_commit;
582 struct block_device *__bdev;
583 struct super_block *sb;
584 struct inode *btree_inode;
585 struct backing_dev_info bdi;
586 spinlock_t hash_lock;
587 struct mutex trans_mutex;
588 struct mutex tree_log_mutex;
589 struct mutex transaction_kthread_mutex;
590 struct mutex cleaner_mutex;
591 struct mutex alloc_mutex;
592 struct mutex chunk_mutex;
593 struct mutex drop_mutex;
594 struct mutex volume_mutex;
595 struct list_head trans_list;
596 struct list_head hashers;
597 struct list_head dead_roots;
598
599 atomic_t nr_async_submits;
600 atomic_t nr_async_bios;
601 atomic_t tree_log_writers;
602 atomic_t tree_log_commit;
603 unsigned long tree_log_batch;
604 u64 tree_log_transid;
605
606 /*
607 * this is used by the balancing code to wait for all the pending
608 * ordered extents
609 */
610 spinlock_t ordered_extent_lock;
611 struct list_head ordered_extents;
612 struct list_head delalloc_inodes;
613
614 /*
615 * there is a pool of worker threads for checksumming during writes
616 * and a pool for checksumming after reads. This is because readers
617 * can run with FS locks held, and the writers may be waiting for
618 * those locks. We don't want ordering in the pending list to cause
619 * deadlocks, and so the two are serviced separately.
620 *
621 * A third pool does submit_bio to avoid deadlocking with the other
622 * two
623 */
624 struct btrfs_workers workers;
625 struct btrfs_workers endio_workers;
626 struct btrfs_workers endio_write_workers;
627 struct btrfs_workers submit_workers;
628 /*
629 * fixup workers take dirty pages that didn't properly go through
630	 * the cow mechanism and make them safe to write.  This happens
631	 * in the sys_munmap() call path.
632 */
633 struct btrfs_workers fixup_workers;
634 struct task_struct *transaction_kthread;
635 struct task_struct *cleaner_kthread;
636 int thread_pool_size;
637
638 struct kobject super_kobj;
639 struct completion kobj_unregister;
640 int do_barriers;
641 int closing;
642 int log_root_recovering;
643 atomic_t throttles;
644 atomic_t throttle_gen;
645
646 u64 total_pinned;
647 struct list_head dirty_cowonly_roots;
648
649 struct btrfs_fs_devices *fs_devices;
650 struct list_head space_info;
651 spinlock_t delalloc_lock;
652 spinlock_t new_trans_lock;
653 u64 delalloc_bytes;
654 u64 last_alloc;
655 u64 last_data_alloc;
656 u64 last_log_alloc;
657
658 spinlock_t ref_cache_lock;
659 u64 total_ref_cache_size;
660
661 u64 avail_data_alloc_bits;
662 u64 avail_metadata_alloc_bits;
663 u64 avail_system_alloc_bits;
664 u64 data_alloc_profile;
665 u64 metadata_alloc_profile;
666 u64 system_alloc_profile;
667
668 void *bdev_holder;
669};
670
671struct btrfs_leaf_ref_tree {
672 struct rb_root root;
673 struct btrfs_leaf_ref *last;
674 struct list_head list;
675 spinlock_t lock;
676};
677
678/*
679 * in ram representation of the tree.  extent_root is used for all
680 * allocations and is the root of the extent tree.
681 */
682struct btrfs_dirty_root;
683struct btrfs_root {
684 struct extent_buffer *node;
685
686 /* the node lock is held while changing the node pointer */
687 spinlock_t node_lock;
688
689 struct extent_buffer *commit_root;
690 struct btrfs_leaf_ref_tree *ref_tree;
691 struct btrfs_leaf_ref_tree ref_tree_struct;
692 struct btrfs_dirty_root *dirty_root;
693 struct btrfs_root *log_root;
694
695 struct btrfs_root_item root_item;
696 struct btrfs_key root_key;
697 struct btrfs_fs_info *fs_info;
698 struct inode *inode;
699 struct extent_io_tree dirty_log_pages;
700
701 struct kobject root_kobj;
702 struct completion kobj_unregister;
703 struct mutex objectid_mutex;
704 struct mutex log_mutex;
705
706 u64 objectid;
707 u64 last_trans;
708
709 /* data allocations are done in sectorsize units */
710 u32 sectorsize;
711
712 /* node allocations are done in nodesize units */
713 u32 nodesize;
714
715 /* leaf allocations are done in leafsize units */
716 u32 leafsize;
717
718 u32 stripesize;
719
720 u32 type;
721 u64 highest_inode;
722 u64 last_inode_alloc;
723 int ref_cows;
724 int track_dirty;
725 u64 defrag_trans_start;
726 struct btrfs_key defrag_progress;
727 struct btrfs_key defrag_max;
728 int defrag_running;
729 int defrag_level;
730 char *name;
731 int in_sysfs;
732
733 /* the dirty list is only used by non-reference counted roots */
734 struct list_head dirty_list;
735
736 spinlock_t list_lock;
737 struct list_head dead_list;
738 struct list_head orphan_list;
739};
740
741/*
742 * inode items have the data typically returned from stat and store other
743 * info about object characteristics.  There is one for every file and dir
744 * in the FS.
745 */
747#define BTRFS_INODE_ITEM_KEY 1
748#define BTRFS_INODE_REF_KEY 2
749#define BTRFS_XATTR_ITEM_KEY 8
750#define BTRFS_ORPHAN_ITEM_KEY 9
751/* reserve 2-15 close to the inode for later flexibility */
752
753/*
754 * dir items are the name -> inode pointers in a directory. There is one
755 * for every name in a directory.
756 */
757#define BTRFS_DIR_LOG_ITEM_KEY 14
758#define BTRFS_DIR_LOG_INDEX_KEY 15
759#define BTRFS_DIR_ITEM_KEY 16
760#define BTRFS_DIR_INDEX_KEY 17
761/*
762 * extent data is for file data
763 */
764#define BTRFS_EXTENT_DATA_KEY 18
765/*
766 * csum items have the checksums for data in the extents
767 */
768#define BTRFS_CSUM_ITEM_KEY 19
769
770
771/* reserve 21-31 for other file/dir stuff */
772
773/*
774 * root items point to tree roots.  They are typically in the root
775 * tree used by the super block to find all the other trees.
776 */
777#define BTRFS_ROOT_ITEM_KEY 32
778/*
779 * extent items are in the extent map tree. These record which blocks
780 * are used, and how many references there are to each block
781 */
782#define BTRFS_EXTENT_ITEM_KEY 33
783#define BTRFS_EXTENT_REF_KEY 34
784
785/*
786 * block groups give us hints into the extent allocation trees about
787 * which blocks are free, etc.
788 */
789#define BTRFS_BLOCK_GROUP_ITEM_KEY 50
790
791#define BTRFS_DEV_EXTENT_KEY 75
792#define BTRFS_DEV_ITEM_KEY 76
793#define BTRFS_CHUNK_ITEM_KEY 77
794
795/*
796 * string items are for debugging. They just store a short string of
797 * data in the FS
798 */
799#define BTRFS_STRING_ITEM_KEY 253
800
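/*
 * For orientation (a sketch, not from the original header): keys sort by
 * (objectid, type, offset), so the small type numbers above keep all of
 * an inode's items packed together in the tree.  Assuming an inode with
 * objectid 257, its neighborhood looks like:
 *
 *	(257, BTRFS_INODE_ITEM_KEY, 0)      the stat data
 *	(257, BTRFS_INODE_REF_KEY, parent)  link back to the parent dir
 *	(257, BTRFS_XATTR_ITEM_KEY, hash)   xattrs, keyed by name hash
 *	(257, BTRFS_EXTENT_DATA_KEY, off)   file extents, by file offset
 */
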
801#define BTRFS_MOUNT_NODATASUM (1 << 0)
802#define BTRFS_MOUNT_NODATACOW (1 << 1)
803#define BTRFS_MOUNT_NOBARRIER (1 << 2)
804#define BTRFS_MOUNT_SSD (1 << 3)
805#define BTRFS_MOUNT_DEGRADED (1 << 4)
806
807#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
808#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
809#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
810 BTRFS_MOUNT_##opt)
811/*
812 * Inode flags
813 */
814#define BTRFS_INODE_NODATASUM (1 << 0)
815#define BTRFS_INODE_NODATACOW (1 << 1)
816#define BTRFS_INODE_READONLY (1 << 2)
817#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \
818 ~BTRFS_INODE_##flag)
819#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \
820 BTRFS_INODE_##flag)
821#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \
822 BTRFS_INODE_##flag)
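
/*
 * Usage sketch (info and inode are assumed to be in scope): the helpers
 * above paste the flag name onto a prefix, for example:
 *
 *	btrfs_set_opt(info->mount_opt, SSD);
 *		becomes  ((info->mount_opt) |= BTRFS_MOUNT_SSD)
 *	if (btrfs_test_flag(inode, NODATACOW))
 *		becomes  (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
 */
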
823/* some macros to generate set/get funcs for the struct fields. This
824 * assumes there is a lefoo_to_cpu for every type, so let's make a simple
825 * one for u8:
826 */
827#define le8_to_cpu(v) (v)
828#define cpu_to_le8(v) (v)
829#define __le8 u8
830
831#define read_eb_member(eb, ptr, type, member, result) ( \
832 read_extent_buffer(eb, (char *)(result), \
833 ((unsigned long)(ptr)) + \
834 offsetof(type, member), \
835 sizeof(((type *)0)->member)))
836
837#define write_eb_member(eb, ptr, type, member, result) ( \
838 write_extent_buffer(eb, (char *)(result), \
839 ((unsigned long)(ptr)) + \
840 offsetof(type, member), \
841 sizeof(((type *)0)->member)))
842
843#ifndef BTRFS_SETGET_FUNCS
844#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
845u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
846void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
847#endif
848
849#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
850static inline u##bits btrfs_##name(struct extent_buffer *eb) \
851{ \
852 type *p = kmap_atomic(eb->first_page, KM_USER0); \
853 u##bits res = le##bits##_to_cpu(p->member); \
854 kunmap_atomic(p, KM_USER0); \
855 return res; \
856} \
857static inline void btrfs_set_##name(struct extent_buffer *eb, \
858 u##bits val) \
859{ \
860 type *p = kmap_atomic(eb->first_page, KM_USER0); \
861 p->member = cpu_to_le##bits(val); \
862 kunmap_atomic(p, KM_USER0); \
863}
864
865#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
866static inline u##bits btrfs_##name(type *s) \
867{ \
868 return le##bits##_to_cpu(s->member); \
869} \
870static inline void btrfs_set_##name(type *s, u##bits val) \
871{ \
872 s->member = cpu_to_le##bits(val); \
873}
874
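/*
 * As a sketch of what the generator above produces, expanding
 * BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32)
 * yields (modulo whitespace) this pair of endian-safe accessors:
 *
 *	static inline u32 btrfs_root_refs(struct btrfs_root_item *s)
 *	{
 *		return le32_to_cpu(s->refs);
 *	}
 *	static inline void btrfs_set_root_refs(struct btrfs_root_item *s,
 *					       u32 val)
 *	{
 *		s->refs = cpu_to_le32(val);
 *	}
 */
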
875BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
876BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
877BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
878BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
879BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
880BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
881BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
882BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
883BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
884BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
885
886BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
887BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
888 total_bytes, 64);
889BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
890 bytes_used, 64);
891BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
892 io_align, 32);
893BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
894 io_width, 32);
895BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
896 sector_size, 32);
897BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
898BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
899 dev_group, 32);
900BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
901 seek_speed, 8);
902BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
903 bandwidth, 8);
904
905static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
906{
907 return (char *)d + offsetof(struct btrfs_dev_item, uuid);
908}
909
910BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
911BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
912BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
913BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
914BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
915BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
916BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
917BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
918BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
919BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
920BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
921
922static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
923{
924 return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
925}
926
927BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
928BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
929BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
930 stripe_len, 64);
931BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
932 io_align, 32);
933BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
934 io_width, 32);
935BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
936 sector_size, 32);
937BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
938BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
939 num_stripes, 16);
940BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
941 sub_stripes, 16);
942BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
943BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
944
945static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
946 int nr)
947{
948 unsigned long offset = (unsigned long)c;
949 offset += offsetof(struct btrfs_chunk, stripe);
950 offset += nr * sizeof(struct btrfs_stripe);
951 return (struct btrfs_stripe *)offset;
952}
953
954static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
955{
956 return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
957}
958
959static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
960 struct btrfs_chunk *c, int nr)
961{
962 return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
963}
964
965static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
966 struct btrfs_chunk *c, int nr,
967 u64 val)
968{
969 btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
970}
971
972static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
973 struct btrfs_chunk *c, int nr)
974{
975 return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
976}
977
978static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
979 struct btrfs_chunk *c, int nr,
980 u64 val)
981{
982 btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
983}
984
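/*
 * Example (a sketch; leaf and slot are assumed to come from a prior
 * search): walking every stripe in a chunk with the _nr helpers above:
 *
 *	struct btrfs_chunk *chunk;
 *	int i, nr;
 *
 *	chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
 *	nr = btrfs_chunk_num_stripes(leaf, chunk);
 *	for (i = 0; i < nr; i++) {
 *		u64 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
 *		u64 physical = btrfs_stripe_offset_nr(leaf, chunk, i);
 *		...
 *	}
 */
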
985/* struct btrfs_block_group_item */
986BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
987 used, 64);
988BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
989 used, 64);
990BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
991 struct btrfs_block_group_item, chunk_objectid, 64);
992
993BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
994 struct btrfs_block_group_item, chunk_objectid, 64);
995BTRFS_SETGET_FUNCS(disk_block_group_flags,
996 struct btrfs_block_group_item, flags, 64);
997BTRFS_SETGET_STACK_FUNCS(block_group_flags,
998 struct btrfs_block_group_item, flags, 64);
999
1000/* struct btrfs_inode_ref */
1001BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
1002BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
1003
1004/* struct btrfs_inode_item */
1005BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
1006BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
1007BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
1008BTRFS_SETGET_FUNCS(inode_nblocks, struct btrfs_inode_item, nblocks, 64);
1009BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
1010BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
1011BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
1012BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
1013BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
1014BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
1015BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 16);
1016BTRFS_SETGET_FUNCS(inode_compat_flags, struct btrfs_inode_item,
1017 compat_flags, 16);
1018
1019static inline struct btrfs_timespec *
1020btrfs_inode_atime(struct btrfs_inode_item *inode_item)
1021{
1022 unsigned long ptr = (unsigned long)inode_item;
1023 ptr += offsetof(struct btrfs_inode_item, atime);
1024 return (struct btrfs_timespec *)ptr;
1025}
1026
1027static inline struct btrfs_timespec *
1028btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
1029{
1030 unsigned long ptr = (unsigned long)inode_item;
1031 ptr += offsetof(struct btrfs_inode_item, mtime);
1032 return (struct btrfs_timespec *)ptr;
1033}
1034
1035static inline struct btrfs_timespec *
1036btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
1037{
1038 unsigned long ptr = (unsigned long)inode_item;
1039 ptr += offsetof(struct btrfs_inode_item, ctime);
1040 return (struct btrfs_timespec *)ptr;
1041}
1042
1043static inline struct btrfs_timespec *
1044btrfs_inode_otime(struct btrfs_inode_item *inode_item)
1045{
1046 unsigned long ptr = (unsigned long)inode_item;
1047 ptr += offsetof(struct btrfs_inode_item, otime);
1048 return (struct btrfs_timespec *)ptr;
1049}
1050
1051BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
1052BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
1053
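/*
 * Usage sketch (leaf and slot assumed from a prior search): timestamps
 * are embedded in the inode item, so locate the btrfs_timespec inside
 * the item and read it through the extent buffer:
 *
 *	struct btrfs_inode_item *ii;
 *	struct btrfs_timespec *ts;
 *	u64 sec;
 *
 *	ii = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
 *	ts = btrfs_inode_atime(ii);
 *	sec = btrfs_timespec_sec(leaf, ts);
 */
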
1054/* struct btrfs_dev_extent */
1055BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
1056 chunk_tree, 64);
1057BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
1058 chunk_objectid, 64);
1059BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
1060 chunk_offset, 64);
1061BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
1062
1063static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
1064{
1065 unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
1066 return (u8 *)((unsigned long)dev + ptr);
1067}
1068
1069/* struct btrfs_extent_ref */
1070BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
1071BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
1072BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
1073BTRFS_SETGET_FUNCS(ref_offset, struct btrfs_extent_ref, offset, 64);
1074BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
1075
1076BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
1077BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
1078 generation, 64);
1079BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
1080 objectid, 64);
1081BTRFS_SETGET_STACK_FUNCS(stack_ref_offset, struct btrfs_extent_ref,
1082 offset, 64);
1083BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
1084 num_refs, 32);
1085
1086/* struct btrfs_extent_item */
1087BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
1088BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
1089 refs, 32);
1090
1091/* struct btrfs_node */
1092BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
1093BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
1094
1095static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
1096{
1097 unsigned long ptr;
1098 ptr = offsetof(struct btrfs_node, ptrs) +
1099 sizeof(struct btrfs_key_ptr) * nr;
1100 return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
1101}
1102
1103static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
1104 int nr, u64 val)
1105{
1106 unsigned long ptr;
1107 ptr = offsetof(struct btrfs_node, ptrs) +
1108 sizeof(struct btrfs_key_ptr) * nr;
1109 btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
1110}
1111
1112static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
1113{
1114 unsigned long ptr;
1115 ptr = offsetof(struct btrfs_node, ptrs) +
1116 sizeof(struct btrfs_key_ptr) * nr;
1117 return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
1118}
1119
1120static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
1121 int nr, u64 val)
1122{
1123 unsigned long ptr;
1124 ptr = offsetof(struct btrfs_node, ptrs) +
1125 sizeof(struct btrfs_key_ptr) * nr;
1126 btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
1127}
1128
1129static inline unsigned long btrfs_node_key_ptr_offset(int nr)
1130{
1131 return offsetof(struct btrfs_node, ptrs) +
1132 sizeof(struct btrfs_key_ptr) * nr;
1133}
1134
1135void btrfs_node_key(struct extent_buffer *eb,
1136 struct btrfs_disk_key *disk_key, int nr);
1137
1138static inline void btrfs_set_node_key(struct extent_buffer *eb,
1139 struct btrfs_disk_key *disk_key, int nr)
1140{
1141 unsigned long ptr;
1142 ptr = btrfs_node_key_ptr_offset(nr);
1143 write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
1144 struct btrfs_key_ptr, key, disk_key);
1145}
1146
1147/* struct btrfs_item */
1148BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
1149BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
1150
1151static inline unsigned long btrfs_item_nr_offset(int nr)
1152{
1153 return offsetof(struct btrfs_leaf, items) +
1154 sizeof(struct btrfs_item) * nr;
1155}
1156
1157static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
1158 int nr)
1159{
1160 return (struct btrfs_item *)btrfs_item_nr_offset(nr);
1161}
1162
1163static inline u32 btrfs_item_end(struct extent_buffer *eb,
1164 struct btrfs_item *item)
1165{
1166 return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
1167}
1168
1169static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
1170{
1171 return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
1172}
1173
1174static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
1175{
1176 return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
1177}
1178
1179static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
1180{
1181 return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
1182}
1183
1184static inline void btrfs_item_key(struct extent_buffer *eb,
1185 struct btrfs_disk_key *disk_key, int nr)
1186{
1187 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1188 read_eb_member(eb, item, struct btrfs_item, key, disk_key);
1189}
1190
1191static inline void btrfs_set_item_key(struct extent_buffer *eb,
1192 struct btrfs_disk_key *disk_key, int nr)
1193{
1194 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1195 write_eb_member(eb, item, struct btrfs_item, key, disk_key);
1196}
1197
1198BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
1199
1200/* struct btrfs_dir_item */
1201BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
1202BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
1203BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
1204BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
1205
1206static inline void btrfs_dir_item_key(struct extent_buffer *eb,
1207 struct btrfs_dir_item *item,
1208 struct btrfs_disk_key *key)
1209{
1210 read_eb_member(eb, item, struct btrfs_dir_item, location, key);
1211}
1212
1213static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
1214 struct btrfs_dir_item *item,
1215 struct btrfs_disk_key *key)
1216{
1217 write_eb_member(eb, item, struct btrfs_dir_item, location, key);
1218}
1219
1220/* struct btrfs_disk_key */
1221BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
1222 objectid, 64);
1223BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
1224BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
1225
1226static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
1227 struct btrfs_disk_key *disk)
1228{
1229 cpu->offset = le64_to_cpu(disk->offset);
1230 cpu->type = disk->type;
1231 cpu->objectid = le64_to_cpu(disk->objectid);
1232}
1233
1234static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
1235 struct btrfs_key *cpu)
1236{
1237 disk->offset = cpu_to_le64(cpu->offset);
1238 disk->type = cpu->type;
1239 disk->objectid = cpu_to_le64(cpu->objectid);
1240}
1241
1242static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
1243 struct btrfs_key *key, int nr)
1244{
1245 struct btrfs_disk_key disk_key;
1246 btrfs_node_key(eb, &disk_key, nr);
1247 btrfs_disk_key_to_cpu(key, &disk_key);
1248}
1249
1250static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
1251 struct btrfs_key *key, int nr)
1252{
1253 struct btrfs_disk_key disk_key;
1254 btrfs_item_key(eb, &disk_key, nr);
1255 btrfs_disk_key_to_cpu(key, &disk_key);
1256}
1257
1258static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
1259 struct btrfs_dir_item *item,
1260 struct btrfs_key *key)
1261{
1262 struct btrfs_disk_key disk_key;
1263 btrfs_dir_item_key(eb, item, &disk_key);
1264 btrfs_disk_key_to_cpu(key, &disk_key);
1265}
1266
1267
1268static inline u8 btrfs_key_type(struct btrfs_key *key)
1269{
1270 return key->type;
1271}
1272
1273static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
1274{
1275 key->type = val;
1276}
1277
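/*
 * Sketch of how these key helpers feed btrfs_search_slot() (declared
 * later in this header); root, path and objectid are assumed, and a
 * NULL trans handle means a read-only search:
 *
 *	struct btrfs_key key;
 *	int ret;
 *
 *	key.objectid = objectid;
 *	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 *	key.offset = 0;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 */
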
1278/* struct btrfs_header */
1279BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
1280BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
1281 generation, 64);
1282BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
1283BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
1284BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
1285BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
1286
1287static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
1288{
1289 return (btrfs_header_flags(eb) & flag) == flag;
1290}
1291
1292static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
1293{
1294 u64 flags = btrfs_header_flags(eb);
1295 btrfs_set_header_flags(eb, flags | flag);
1296 return (flags & flag) == flag;
1297}
1298
1299static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
1300{
1301 u64 flags = btrfs_header_flags(eb);
1302 btrfs_set_header_flags(eb, flags & ~flag);
1303 return (flags & flag) == flag;
1304}
1305
1306static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
1307{
1308 unsigned long ptr = offsetof(struct btrfs_header, fsid);
1309 return (u8 *)ptr;
1310}
1311
1312static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
1313{
1314 unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
1315 return (u8 *)ptr;
1316}
1317
1318static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
1319{
1320 unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
1321 return (u8 *)ptr;
1322}
1323
1324static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
1325{
1326 unsigned long ptr = offsetof(struct btrfs_header, csum);
1327 return (u8 *)ptr;
1328}
1329
1330static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
1331{
1332 return NULL;
1333}
1334
1335static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
1336{
1337 return NULL;
1338}
1339
1340static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
1341{
1342 return NULL;
1343}
1344
1345static inline int btrfs_is_leaf(struct extent_buffer *eb)
1346{
1347 return (btrfs_header_level(eb) == 0);
1348}
1349
1350/* struct btrfs_root_item */
1351BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
1352BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
1353BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
1354
1355BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
1356BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
1357BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
1358BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
1359BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 32);
1360BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
1361BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
1362
1363/* struct btrfs_super_block */
1364BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
1365BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
1366BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
1367 generation, 64);
1368BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
1369BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
1370 struct btrfs_super_block, sys_chunk_array_size, 32);
1371BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
1372 root_level, 8);
1373BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
1374 chunk_root, 64);
1375BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
1376 chunk_root_level, 8);
1377BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
1378 log_root, 64);
1379BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
1380 log_root_level, 8);
1381BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
1382 total_bytes, 64);
1383BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
1384 bytes_used, 64);
1385BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
1386 sectorsize, 32);
1387BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
1388 nodesize, 32);
1389BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
1390 leafsize, 32);
1391BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
1392 stripesize, 32);
1393BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
1394 root_dir_objectid, 64);
1395BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
1396 num_devices, 64);
1397
1398static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
1399{
1400 return offsetof(struct btrfs_leaf, items);
1401}
1402
1403/* struct btrfs_file_extent_item */
1404BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
1405
1406static inline unsigned long btrfs_file_extent_inline_start(struct
1407 btrfs_file_extent_item *e)
1408{
1409 unsigned long offset = (unsigned long)e;
1410 offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
1411 return offset;
1412}
1413
1414static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
1415{
1416 return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
1417}
1418
1419static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
1420 struct btrfs_item *e)
1421{
1422 unsigned long offset;
1423 offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
1424 return btrfs_item_size(eb, e) - offset;
1425}
1426
1427BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
1428 disk_bytenr, 64);
1429BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
1430 generation, 64);
1431BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
1432 disk_num_bytes, 64);
1433BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
1434 offset, 64);
1435BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
1436 num_bytes, 64);
1437
1438static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
1439{
1440 return sb->s_fs_info;
1441}
1442
1443static inline int btrfs_set_root_name(struct btrfs_root *root,
1444 const char *name, int len)
1445{
1446 /* if we already have a name just free it */
1447 if (root->name)
1448 kfree(root->name);
1449
1450	root->name = kmalloc(len + 1, GFP_KERNEL);
1451 if (!root->name)
1452 return -ENOMEM;
1453
1454 memcpy(root->name, name, len);
1455	root->name[len] = '\0';
1456
1457 return 0;
1458}
1459
1460static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
{
1461 if (level == 0)
1462 return root->leafsize;
1463 return root->nodesize;
1464}
1465
1466/* helper macro to cast into the data area of the leaf. */
1467#define btrfs_item_ptr(leaf, slot, type) \
1468 ((type *)(btrfs_leaf_data(leaf) + \
1469 btrfs_item_offset_nr(leaf, slot)))
1470
1471#define btrfs_item_ptr_offset(leaf, slot) \
1472 ((unsigned long)(btrfs_leaf_data(leaf) + \
1473 btrfs_item_offset_nr(leaf, slot)))
1474
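/*
 * Sketch of btrfs_item_ptr() in action (leaf and slot assumed): cast
 * the data area at a slot to a typed item, then use the accessors:
 *
 *	struct btrfs_file_extent_item *fi;
 *	u64 disk_bytenr;
 *
 *	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 *	if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_REG)
 *		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 */
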
1475static inline struct dentry *fdentry(struct file *file)
{
1476#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
1477 return file->f_dentry;
1478#else
1479 return file->f_path.dentry;
1480#endif
1481}
1482
1483/* extent-tree.c */
1484int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1485int btrfs_update_pinned_extents(struct btrfs_root *root,
1486 u64 bytenr, u64 num, int pin);
1487int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1488 struct btrfs_root *root, struct extent_buffer *leaf);
1489int btrfs_cross_ref_exists(struct btrfs_trans_handle *trans,
1490 struct btrfs_root *root,
1491 struct btrfs_key *key, u64 bytenr);
1492int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1493 struct btrfs_root *root);
1494int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1495struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
1496 btrfs_fs_info *info,
1497 u64 bytenr);
1498struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
1499 struct btrfs_block_group_cache
1500 *hint, u64 search_start,
1501 int data, int owner);
1502struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1503 struct btrfs_root *root,
1504 u32 blocksize, u64 parent,
1505 u64 root_objectid,
1506 u64 ref_generation,
1507 int level,
1508 u64 hint,
1509 u64 empty_size);
1510struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1511 struct btrfs_root *root,
1512 u64 bytenr, u32 blocksize);
1513int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size);
1514int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
1515 struct btrfs_root *root,
1516 struct btrfs_path *path,
1517 u64 bytenr, u64 parent,
1518 u64 root_objectid, u64 ref_generation,
1519 u64 owner, u64 owner_offset);
1520int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1521 struct btrfs_root *root,
1522 u64 num_bytes, u64 parent, u64 min_bytes,
1523 u64 root_objectid, u64 ref_generation,
1524 u64 owner, u64 owner_offset,
1525 u64 empty_size, u64 hint_byte,
1526 u64 search_end, struct btrfs_key *ins, u64 data);
1527int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
1528 struct btrfs_root *root, u64 parent,
1529 u64 root_objectid, u64 ref_generation,
1530 u64 owner, u64 owner_offset,
1531 struct btrfs_key *ins);
1532int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
1533 struct btrfs_root *root, u64 parent,
1534 u64 root_objectid, u64 ref_generation,
1535 u64 owner, u64 owner_offset,
1536 struct btrfs_key *ins);
1537int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
1538 struct btrfs_root *root,
1539 u64 num_bytes, u64 min_alloc_size,
1540 u64 empty_size, u64 hint_byte,
1541 u64 search_end, struct btrfs_key *ins,
1542 u64 data);
1543int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1544 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1545 u32 *nr_extents);
1546int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1547 struct extent_buffer *buf, u32 nr_extents);
1548int btrfs_update_ref(struct btrfs_trans_handle *trans,
1549 struct btrfs_root *root, struct extent_buffer *orig_buf,
1550 struct extent_buffer *buf, int start_slot, int nr);
1551int btrfs_free_extent(struct btrfs_trans_handle *trans,
1552 struct btrfs_root *root,
1553 u64 bytenr, u64 num_bytes, u64 parent,
1554 u64 root_objectid, u64 ref_generation,
1555 u64 owner_objectid, u64 owner_offset, int pin);
1556int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
1557int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1558 struct btrfs_root *root,
1559 struct extent_io_tree *unpin);
1560int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1561 struct btrfs_root *root,
1562 u64 bytenr, u64 num_bytes, u64 parent,
1563 u64 root_objectid, u64 ref_generation,
1564 u64 owner, u64 owner_offset);
1565int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1566 struct btrfs_root *root, u64 bytenr,
1567 u64 orig_parent, u64 parent,
1568 u64 root_objectid, u64 ref_generation,
1569 u64 owner, u64 owner_offset);
1570int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1571 struct btrfs_root *root);
1572int btrfs_free_block_groups(struct btrfs_fs_info *info);
1573int btrfs_read_block_groups(struct btrfs_root *root);
1574int btrfs_make_block_group(struct btrfs_trans_handle *trans,
1575 struct btrfs_root *root, u64 bytes_used,
1576 u64 type, u64 chunk_objectid, u64 chunk_offset,
1577 u64 size);
1578/* ctree.c */
1579int btrfs_previous_item(struct btrfs_root *root,
1580 struct btrfs_path *path, u64 min_objectid,
1581 int type);
1582int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1583 struct btrfs_root *root, struct btrfs_path *path,
1584 struct btrfs_key *new_key);
1585struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
1586struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
1587int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
1588 struct btrfs_key *key, int lowest_level,
1589 int cache_only, u64 min_trans);
1590int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
1591 struct btrfs_key *max_key,
1592 struct btrfs_path *path, int cache_only,
1593 u64 min_trans);
1594int btrfs_cow_block(struct btrfs_trans_handle *trans,
1595 struct btrfs_root *root, struct extent_buffer *buf,
1596 struct extent_buffer *parent, int parent_slot,
1597 struct extent_buffer **cow_ret, u64 prealloc_dest);
1598int btrfs_copy_root(struct btrfs_trans_handle *trans,
1599 struct btrfs_root *root,
1600 struct extent_buffer *buf,
1601 struct extent_buffer **cow_ret, u64 new_root_objectid);
1602int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
1603 *root, struct btrfs_path *path, u32 data_size);
1604int btrfs_truncate_item(struct btrfs_trans_handle *trans,
1605 struct btrfs_root *root,
1606 struct btrfs_path *path,
1607 u32 new_size, int from_end);
1608int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1609 *root, struct btrfs_key *key, struct btrfs_path *p, int
1610 ins_len, int cow);
1611int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1612 struct btrfs_root *root, struct extent_buffer *parent,
1613 int start_slot, int cache_only, u64 *last_ret,
1614 struct btrfs_key *progress);
1615void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1616struct btrfs_path *btrfs_alloc_path(void);
1617void btrfs_free_path(struct btrfs_path *p);
1618void btrfs_init_path(struct btrfs_path *p);
1619int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1620 struct btrfs_path *path, int slot, int nr);
1621
1622static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
1623 struct btrfs_root *root,
1624 struct btrfs_path *path)
1625{
1626 return btrfs_del_items(trans, root, path, path->slots[0], 1);
1627}
1628
1629int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
1630 *root, struct btrfs_key *key, void *data, u32 data_size);
1631int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
1632 struct btrfs_root *root,
1633 struct btrfs_path *path,
1634 struct btrfs_key *cpu_key, u32 *data_size, int nr);
1635
1636static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
1637 struct btrfs_root *root,
1638 struct btrfs_path *path,
1639 struct btrfs_key *key,
1640 u32 data_size)
1641{
1642 return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
1643}
1644
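/*
 * Sketch (trans, root, path, key and name_len assumed): the common
 * pattern is to reserve room with btrfs_insert_empty_item() and then
 * fill the new item in place:
 *
 *	struct btrfs_inode_ref *ref;
 *	int ret;
 *
 *	ret = btrfs_insert_empty_item(trans, root, path, &key,
 *				      sizeof(*ref) + name_len);
 *	if (ret == 0) {
 *		ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
 *				     struct btrfs_inode_ref);
 *		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
 *	}
 */
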
1645int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
1646int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
1647int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
1648int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
1649 *root);
1650/* root-item.c */
1651int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1652 struct btrfs_key *key);
1653int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
1654 *root, struct btrfs_key *key, struct btrfs_root_item
1655 *item);
1656int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
1657 *root, struct btrfs_key *key, struct btrfs_root_item
1658 *item);
1659int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
1660 btrfs_root_item *item, struct btrfs_key *key);
1661int btrfs_search_root(struct btrfs_root *root, u64 search_start,
1662 u64 *found_objectid);
1663int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
1664 struct btrfs_root *latest_root);
1665/* dir-item.c */
1666int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
1667 *root, const char *name, int name_len, u64 dir,
1668 struct btrfs_key *location, u8 type, u64 index);
1669struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
1670 struct btrfs_root *root,
1671 struct btrfs_path *path, u64 dir,
1672 const char *name, int name_len,
1673 int mod);
1674struct btrfs_dir_item *
1675btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
1676 struct btrfs_root *root,
1677 struct btrfs_path *path, u64 dir,
1678 u64 objectid, const char *name, int name_len,
1679 int mod);
1680struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
1681 struct btrfs_path *path,
1682 const char *name, int name_len);
1683int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
1684 struct btrfs_root *root,
1685 struct btrfs_path *path,
1686 struct btrfs_dir_item *di);
1687int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
1688 struct btrfs_root *root, const char *name,
1689 u16 name_len, const void *data, u16 data_len,
1690 u64 dir);
1691struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
1692 struct btrfs_root *root,
1693 struct btrfs_path *path, u64 dir,
1694 const char *name, u16 name_len,
1695 int mod);
1696
1697/* orphan.c */
1698int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
1699 struct btrfs_root *root, u64 offset);
1700int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
1701 struct btrfs_root *root, u64 offset);
1702
1703/* inode-map.c */
1704int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
1705 struct btrfs_root *fs_root,
1706 u64 dirid, u64 *objectid);
1707int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
1708
1709/* inode-item.c */
1710int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
1711 struct btrfs_root *root,
1712 const char *name, int name_len,
1713 u64 inode_objectid, u64 ref_objectid, u64 index);
1714int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
1715 struct btrfs_root *root,
1716 const char *name, int name_len,
1717 u64 inode_objectid, u64 ref_objectid, u64 *index);
1718int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
1719 struct btrfs_root *root,
1720 struct btrfs_path *path, u64 objectid);
1721int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
1722 *root, struct btrfs_path *path,
1723 struct btrfs_key *location, int mod);
1724
1725/* file-item.c */
1726int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
1727 struct bio *bio);
1728int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
1729 struct btrfs_root *root,
1730 u64 objectid, u64 pos, u64 disk_offset,
1731 u64 disk_num_bytes,
1732 u64 num_bytes, u64 offset);
1733int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
1734 struct btrfs_root *root,
1735 struct btrfs_path *path, u64 objectid,
1736 u64 bytenr, int mod);
1737int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
1738 struct btrfs_root *root, struct inode *inode,
1739 struct btrfs_ordered_sum *sums);
1740int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
1741 struct bio *bio);
1742struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
1743 struct btrfs_root *root,
1744 struct btrfs_path *path,
1745 u64 objectid, u64 offset,
1746 int cow);
1747int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
1748 struct btrfs_root *root, struct btrfs_path *path,
1749 u64 isize);
1750/* inode.c */
1751
1752/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
1753#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
1754#define ClearPageChecked ClearPageFsMisc
1755#define SetPageChecked SetPageFsMisc
1756#define PageChecked PageFsMisc
1757#endif
1758
1759int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
1760 struct btrfs_root *root,
1761 struct inode *dir, struct inode *inode,
1762 const char *name, int name_len);
1763int btrfs_add_link(struct btrfs_trans_handle *trans,
1764 struct inode *parent_inode, struct inode *inode,
1765 const char *name, int name_len, int add_backref, u64 index);
1766int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
1767 struct btrfs_root *root,
1768 struct inode *inode, u64 new_size,
1769 u32 min_type);
1770
1771int btrfs_start_delalloc_inodes(struct btrfs_root *root);
1772int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
1773int btrfs_writepages(struct address_space *mapping,
1774 struct writeback_control *wbc);
1775int btrfs_create_subvol_root(struct btrfs_root *new_root,
1776 struct btrfs_trans_handle *trans, u64 new_dirid,
1777 struct btrfs_block_group_cache *block_group);
1778
1779void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
1780 int namelen);
1781
1782int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1783 size_t size, struct bio *bio);
1784
1785static inline void dec_i_blocks(struct inode *inode, u64 dec)
1786{
1787 dec = dec >> 9;
1788 if (dec <= inode->i_blocks)
1789 inode->i_blocks -= dec;
1790 else
1791 inode->i_blocks = 0;
1792}
1793
1794unsigned long btrfs_force_ra(struct address_space *mapping,
1795 struct file_ra_state *ra, struct file *file,
1796 pgoff_t offset, pgoff_t last_index);
1797int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
1798 int for_del);
1799int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
1800int btrfs_readpage(struct file *file, struct page *page);
1801void btrfs_delete_inode(struct inode *inode);
1802void btrfs_put_inode(struct inode *inode);
1803void btrfs_read_locked_inode(struct inode *inode);
1804int btrfs_write_inode(struct inode *inode, int wait);
1805void btrfs_dirty_inode(struct inode *inode);
1806struct inode *btrfs_alloc_inode(struct super_block *sb);
1807void btrfs_destroy_inode(struct inode *inode);
1808int btrfs_init_cachep(void);
1809void btrfs_destroy_cachep(void);
1810long btrfs_ioctl_trans_end(struct file *file);
1811struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
1812 struct btrfs_root *root);
1813struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
1814 struct btrfs_root *root, int *is_new);
1815int btrfs_commit_write(struct file *file, struct page *page,
1816 unsigned from, unsigned to);
1817struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
1818 size_t page_offset, u64 start, u64 end,
1819 int create);
1820int btrfs_update_inode(struct btrfs_trans_handle *trans,
1821 struct btrfs_root *root,
1822 struct inode *inode);
1823
1824/* ioctl.c */
1825long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
1826
1827/* file.c */
1828int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
1829int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end);
1830int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
1831extern struct file_operations btrfs_file_operations;
1832int btrfs_drop_extents(struct btrfs_trans_handle *trans,
1833 struct btrfs_root *root, struct inode *inode,
1834 u64 start, u64 end, u64 inline_limit, u64 *hint_block);
1835int btrfs_release_file(struct inode *inode, struct file *file);
1836
1837/* tree-defrag.c */
1838int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
1839 struct btrfs_root *root, int cache_only);
1840
1841/* sysfs.c */
1842int btrfs_init_sysfs(void);
1843void btrfs_exit_sysfs(void);
1844int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
1845int btrfs_sysfs_add_root(struct btrfs_root *root);
1846void btrfs_sysfs_del_root(struct btrfs_root *root);
1847void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
1848
1849/* xattr.c */
1850ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
1851
1852/* super.c */
1853u64 btrfs_parse_size(char *str);
1854int btrfs_parse_options(struct btrfs_root *root, char *options);
1855int btrfs_sync_fs(struct super_block *sb, int wait);
1856
1857/* acl.c */
1858int btrfs_check_acl(struct inode *inode, int mask);
1859int btrfs_init_acl(struct inode *inode, struct inode *dir);
1860int btrfs_acl_chmod(struct inode *inode);
1861
1862/* free-space-cache.c */
1863int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1864 u64 bytenr, u64 size);
1865int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
1866 u64 bytenr, u64 size);
1867void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
1868 *block_group);
1869struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
1870 *block_group, u64 offset,
1871 u64 bytes);
1872void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
1873 u64 bytes);
1874u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
1875#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
new file mode 100644
index 000000000000..e4f30090d640
--- /dev/null
+++ b/fs/btrfs/dir-item.c
@@ -0,0 +1,345 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "hash.h"
22#include "transaction.h"
23
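/*
 * dir items and xattrs key on a hash of the name, so two different names
 * can collide on the same key.  When that happens the entries share one
 * item: on -EEXIST, insert_with_overflow() checks for an exact name match
 * and otherwise grows the existing item by data_size bytes, returning a
 * pointer to the newly added tail.
 */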
24static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
25 *trans,
26 struct btrfs_root *root,
27 struct btrfs_path *path,
28 struct btrfs_key *cpu_key,
29 u32 data_size,
30 const char *name,
31 int name_len)
32{
33 int ret;
34 char *ptr;
35 struct btrfs_item *item;
36 struct extent_buffer *leaf;
37
38 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
39 if (ret == -EEXIST) {
40 struct btrfs_dir_item *di;
41 di = btrfs_match_dir_item_name(root, path, name, name_len);
42 if (di)
43 return ERR_PTR(-EEXIST);
44 ret = btrfs_extend_item(trans, root, path, data_size);
45 WARN_ON(ret > 0);
46 }
47 if (ret < 0)
48 return ERR_PTR(ret);
49 WARN_ON(ret > 0);
50 leaf = path->nodes[0];
51 item = btrfs_item_nr(leaf, path->slots[0]);
52 ptr = btrfs_item_ptr(leaf, path->slots[0], char);
53 BUG_ON(data_size > btrfs_item_size(leaf, item));
54 ptr += btrfs_item_size(leaf, item) - data_size;
55 return (struct btrfs_dir_item *)ptr;
56}
57
58int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
59 struct btrfs_root *root, const char *name,
60 u16 name_len, const void *data, u16 data_len,
61 u64 dir)
62{
63 int ret = 0;
64 struct btrfs_path *path;
65 struct btrfs_dir_item *dir_item;
66 unsigned long name_ptr, data_ptr;
67 struct btrfs_key key, location;
68 struct btrfs_disk_key disk_key;
69 struct extent_buffer *leaf;
70 u32 data_size;
71
72 key.objectid = dir;
73 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
74 key.offset = btrfs_name_hash(name, name_len);
75	if (name_len + data_len + sizeof(struct btrfs_dir_item) >
76	    BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
77		return -ENOSPC;
78	path = btrfs_alloc_path();
79	if (!path)
80		return -ENOMEM;
81
82 data_size = sizeof(*dir_item) + name_len + data_len;
83 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
84 name, name_len);
85	/*
86	 * FIXME: at some point we should handle xattrs that are larger than
87	 * what we can fit in our leaf.  We set location to zero because we
88	 * aren't pointing at anything else; that will change if we store the
89	 * xattr data in a separate inode.
90	 */
91 BUG_ON(IS_ERR(dir_item));
92 memset(&location, 0, sizeof(location));
93
94 leaf = path->nodes[0];
95 btrfs_cpu_key_to_disk(&disk_key, &location);
96 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
97 btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
98 btrfs_set_dir_name_len(leaf, dir_item, name_len);
99 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
100 btrfs_set_dir_data_len(leaf, dir_item, data_len);
101 name_ptr = (unsigned long)(dir_item + 1);
102 data_ptr = (unsigned long)((char *)name_ptr + name_len);
103
104 write_extent_buffer(leaf, name, name_ptr, name_len);
105 write_extent_buffer(leaf, data, data_ptr, data_len);
106 btrfs_mark_buffer_dirty(path->nodes[0]);
107
108 btrfs_free_path(path);
109 return ret;
110}
111
112int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
113 *root, const char *name, int name_len, u64 dir,
114 struct btrfs_key *location, u8 type, u64 index)
115{
116 int ret = 0;
117 int ret2 = 0;
118 struct btrfs_path *path;
119 struct btrfs_dir_item *dir_item;
120 struct extent_buffer *leaf;
121 unsigned long name_ptr;
122 struct btrfs_key key;
123 struct btrfs_disk_key disk_key;
124 u32 data_size;
125
126 key.objectid = dir;
127 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
128 key.offset = btrfs_name_hash(name, name_len);
129	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
130 data_size = sizeof(*dir_item) + name_len;
131 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
132 name, name_len);
133 if (IS_ERR(dir_item)) {
134 ret = PTR_ERR(dir_item);
135 if (ret == -EEXIST)
136 goto second_insert;
137 goto out;
138 }
139
140 leaf = path->nodes[0];
141 btrfs_cpu_key_to_disk(&disk_key, location);
142 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
143 btrfs_set_dir_type(leaf, dir_item, type);
144 btrfs_set_dir_data_len(leaf, dir_item, 0);
145 btrfs_set_dir_name_len(leaf, dir_item, name_len);
146 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
147 name_ptr = (unsigned long)(dir_item + 1);
148
149 write_extent_buffer(leaf, name, name_ptr, name_len);
150 btrfs_mark_buffer_dirty(leaf);
151
152second_insert:
153 /* FIXME, use some real flag for selecting the extra index */
154 if (root == root->fs_info->tree_root) {
155 ret = 0;
156 goto out;
157 }
158 btrfs_release_path(root, path);
159
160 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
161 key.offset = index;
162 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
163 name, name_len);
164 if (IS_ERR(dir_item)) {
165 ret2 = PTR_ERR(dir_item);
166 goto out;
167 }
168 leaf = path->nodes[0];
169 btrfs_cpu_key_to_disk(&disk_key, location);
170 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
171 btrfs_set_dir_type(leaf, dir_item, type);
172 btrfs_set_dir_data_len(leaf, dir_item, 0);
173 btrfs_set_dir_name_len(leaf, dir_item, name_len);
174 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
175 name_ptr = (unsigned long)(dir_item + 1);
176 write_extent_buffer(leaf, name, name_ptr, name_len);
177 btrfs_mark_buffer_dirty(leaf);
178out:
179 btrfs_free_path(path);
180 if (ret)
181 return ret;
182 if (ret2)
183 return ret2;
184 return 0;
185}
186
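/*
 * For a normal fs root the helper above leaves two items behind; a
 * sketch of the resulting keys (hash and index values illustrative):
 *
 *	(dir, BTRFS_DIR_ITEM_KEY,  btrfs_name_hash(name)) -> name + location
 *	(dir, BTRFS_DIR_INDEX_KEY, index)                 -> name + location
 *
 * The first key serves lookup-by-name, the second gives readdir a
 * stable insertion order.  The tree root skips the index item, see
 * the FIXME at second_insert above.
 */
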
187struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
188 struct btrfs_root *root,
189 struct btrfs_path *path, u64 dir,
190 const char *name, int name_len,
191 int mod)
192{
193 int ret;
194 struct btrfs_key key;
195 int ins_len = mod < 0 ? -1 : 0;
196 int cow = mod != 0;
197 struct btrfs_key found_key;
198 struct extent_buffer *leaf;
199
200 key.objectid = dir;
201 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
202
203 key.offset = btrfs_name_hash(name, name_len);
204
205 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
206 if (ret < 0)
207 return ERR_PTR(ret);
208 if (ret > 0) {
209 if (path->slots[0] == 0)
210 return NULL;
211 path->slots[0]--;
212 }
213
214 leaf = path->nodes[0];
215 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
216
217 if (found_key.objectid != dir ||
218 btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
219 found_key.offset != key.offset)
220 return NULL;
221
222 return btrfs_match_dir_item_name(root, path, name, name_len);
223}
224
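/*
 * Usage sketch for the lookup above (hypothetical caller): mod < 0
 * reserves space for a delete, mod > 0 COWs the path for an update,
 * and mod == 0 is a plain read-only search.  A delete looks roughly
 * like:
 *
 *	di = btrfs_lookup_dir_item(trans, root, path, dir, name,
 *				   name_len, -1);
 *	if (!IS_ERR(di) && di)
 *		ret = btrfs_delete_one_dir_name(trans, root, path, di);
 *
 * NULL means the name was not found; an ERR_PTR means the tree
 * search itself failed.
 */
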
225struct btrfs_dir_item *
226btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
227 struct btrfs_root *root,
228 struct btrfs_path *path, u64 dir,
229 u64 objectid, const char *name, int name_len,
230 int mod)
231{
232 int ret;
233 struct btrfs_key key;
234 int ins_len = mod < 0 ? -1 : 0;
235 int cow = mod != 0;
236
237 key.objectid = dir;
238 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
239 key.offset = objectid;
240
241 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
242 if (ret < 0)
243 return ERR_PTR(ret);
244 if (ret > 0)
245 return ERR_PTR(-ENOENT);
246 return btrfs_match_dir_item_name(root, path, name, name_len);
247}
248
249struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
250 struct btrfs_root *root,
251 struct btrfs_path *path, u64 dir,
252 const char *name, u16 name_len,
253 int mod)
254{
255 int ret;
256 struct btrfs_key key;
257 int ins_len = mod < 0 ? -1 : 0;
258 int cow = mod != 0;
259 struct btrfs_key found_key;
260 struct extent_buffer *leaf;
261
262 key.objectid = dir;
263 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
264 key.offset = btrfs_name_hash(name, name_len);
265 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
266 if (ret < 0)
267 return ERR_PTR(ret);
268 if (ret > 0) {
269 if (path->slots[0] == 0)
270 return NULL;
271 path->slots[0]--;
272 }
273
274 leaf = path->nodes[0];
275 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
276
277 if (found_key.objectid != dir ||
278 btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
279 found_key.offset != key.offset)
280 return NULL;
281
282 return btrfs_match_dir_item_name(root, path, name, name_len);
283}
284
285struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
286 struct btrfs_path *path,
287 const char *name, int name_len)
288{
289 struct btrfs_dir_item *dir_item;
290 unsigned long name_ptr;
291 u32 total_len;
292 u32 cur = 0;
293 u32 this_len;
294 struct extent_buffer *leaf;
295
296 leaf = path->nodes[0];
297 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
298 total_len = btrfs_item_size_nr(leaf, path->slots[0]);
299	while (cur < total_len) {
300 this_len = sizeof(*dir_item) +
301 btrfs_dir_name_len(leaf, dir_item) +
302 btrfs_dir_data_len(leaf, dir_item);
303 name_ptr = (unsigned long)(dir_item + 1);
304
305 if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
306 memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
307 return dir_item;
308
309 cur += this_len;
310 dir_item = (struct btrfs_dir_item *)((char *)dir_item +
311 this_len);
312 }
313 return NULL;
314}
315
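/*
 * Hash collisions pack several dir items into a single btree item; a
 * sketch of the layout the loop above walks (sizes illustrative):
 *
 *	[hdr|name0|data0][hdr|name1|data1]...
 *
 * Each [..] group is this_len bytes, cur is the byte offset into the
 * item, and only an exact name_len plus memcmp match is returned.
 */
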
316int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
317 struct btrfs_root *root,
318 struct btrfs_path *path,
319 struct btrfs_dir_item *di)
320{
321
322 struct extent_buffer *leaf;
323 u32 sub_item_len;
324 u32 item_len;
325 int ret = 0;
326
327 leaf = path->nodes[0];
328 sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
329 btrfs_dir_data_len(leaf, di);
330 item_len = btrfs_item_size_nr(leaf, path->slots[0]);
331 if (sub_item_len == item_len) {
332 ret = btrfs_del_item(trans, root, path);
333 } else {
334		/* shift the remaining sub-items down over the deleted one */
335 unsigned long ptr = (unsigned long)di;
336 unsigned long start;
337
338 start = btrfs_item_ptr_offset(leaf, path->slots[0]);
339 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
340 item_len - (ptr + sub_item_len - start));
341 ret = btrfs_truncate_item(trans, root, path,
342 item_len - sub_item_len, 1);
343 }
344	return ret;
345}
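/*
 * Deletion sketch for the shared-item case above (contents
 * illustrative): deleting B from an item packing three names
 *
 *	before:	[A][B][C]
 *	after:	[A][C]
 *
 * memmove_extent_buffer() slides C over B, then btrfs_truncate_item()
 * shrinks the item by sub_item_len.
 */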
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 000000000000..d35ca6a3f513
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,2056 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/fs.h>
21#include <linux/blkdev.h>
22#include <linux/scatterlist.h>
23#include <linux/swap.h>
24#include <linux/radix-tree.h>
25#include <linux/writeback.h>
26#include <linux/buffer_head.h> /* for block_sync_page */
27#include <linux/workqueue.h>
28#include <linux/kthread.h>
29#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
30# include <linux/freezer.h>
31#else
32# include <linux/sched.h>
33#endif
34#include "crc32c.h"
35#include "ctree.h"
36#include "disk-io.h"
37#include "transaction.h"
38#include "btrfs_inode.h"
39#include "volumes.h"
40#include "print-tree.h"
41#include "async-thread.h"
42#include "locking.h"
43#include "ref-cache.h"
44#include "tree-log.h"
45
46#if 0
47static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
48{
49 if (extent_buffer_blocknr(buf) != btrfs_header_blocknr(buf)) {
50 printk(KERN_CRIT "buf blocknr(buf) is %llu, header is %llu\n",
51 (unsigned long long)extent_buffer_blocknr(buf),
52 (unsigned long long)btrfs_header_blocknr(buf));
53 return 1;
54 }
55 return 0;
56}
57#endif
58
59static struct extent_io_ops btree_extent_io_ops;
60static void end_workqueue_fn(struct btrfs_work *work);
61
62struct end_io_wq {
63 struct bio *bio;
64 bio_end_io_t *end_io;
65 void *private;
66 struct btrfs_fs_info *info;
67 int error;
68 int metadata;
69 struct list_head list;
70 struct btrfs_work work;
71};
72
73struct async_submit_bio {
74 struct inode *inode;
75 struct bio *bio;
76 struct list_head list;
77 extent_submit_bio_hook_t *submit_bio_hook;
78 int rw;
79 int mirror_num;
80 struct btrfs_work work;
81};
82
83struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
84 size_t page_offset, u64 start, u64 len,
85 int create)
86{
87 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
88 struct extent_map *em;
89 int ret;
90
91 spin_lock(&em_tree->lock);
92 em = lookup_extent_mapping(em_tree, start, len);
93 if (em) {
94 em->bdev =
95 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
96 spin_unlock(&em_tree->lock);
97 goto out;
98 }
99 spin_unlock(&em_tree->lock);
100
101 em = alloc_extent_map(GFP_NOFS);
102 if (!em) {
103 em = ERR_PTR(-ENOMEM);
104 goto out;
105 }
106 em->start = 0;
107 em->len = (u64)-1;
108 em->block_start = 0;
109 em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
110
111 spin_lock(&em_tree->lock);
112 ret = add_extent_mapping(em_tree, em);
113 if (ret == -EEXIST) {
114 u64 failed_start = em->start;
115 u64 failed_len = em->len;
116
117 printk("failed to insert %Lu %Lu -> %Lu into tree\n",
118 em->start, em->len, em->block_start);
119 free_extent_map(em);
120 em = lookup_extent_mapping(em_tree, start, len);
121 if (em) {
122 printk("after failing, found %Lu %Lu %Lu\n",
123 em->start, em->len, em->block_start);
124 ret = 0;
125 } else {
126 em = lookup_extent_mapping(em_tree, failed_start,
127 failed_len);
128 if (em) {
129 printk("double failure lookup gives us "
130 "%Lu %Lu -> %Lu\n", em->start,
131 em->len, em->block_start);
132 free_extent_map(em);
133 }
134 ret = -EIO;
135 }
136 } else if (ret) {
137 free_extent_map(em);
138 em = NULL;
139 }
140 spin_unlock(&em_tree->lock);
141
142 if (ret)
143 em = ERR_PTR(ret);
144out:
145 return em;
146}
147
148u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
149{
150 return btrfs_crc32c(seed, data, len);
151}
152
153void btrfs_csum_final(u32 crc, char *result)
154{
155 *(__le32 *)result = ~cpu_to_le32(crc);
156}
157
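/*
 * The two helpers above form a small two-step checksum API; a hedged
 * sketch of a caller summing an in-memory buffer (buf and len are
 * assumptions, not names from this file):
 *
 *	char result[BTRFS_CRC32_SIZE];
 *	u32 crc = ~(u32)0;
 *
 *	crc = btrfs_csum_data(root, buf, crc, len);
 *	btrfs_csum_final(crc, result);
 *
 * csum_tree_block() below is exactly this pattern, applied piecewise
 * to the mapped pages of an extent buffer.
 */
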
158static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
159 int verify)
160{
161 char result[BTRFS_CRC32_SIZE];
162 unsigned long len;
163 unsigned long cur_len;
164 unsigned long offset = BTRFS_CSUM_SIZE;
165 char *map_token = NULL;
166 char *kaddr;
167 unsigned long map_start;
168 unsigned long map_len;
169 int err;
170 u32 crc = ~(u32)0;
171
172 len = buf->len - offset;
173	while (len > 0) {
174 err = map_private_extent_buffer(buf, offset, 32,
175 &map_token, &kaddr,
176 &map_start, &map_len, KM_USER0);
177 if (err) {
178 printk("failed to map extent buffer! %lu\n",
179 offset);
180 return 1;
181 }
182 cur_len = min(len, map_len - (offset - map_start));
183 crc = btrfs_csum_data(root, kaddr + offset - map_start,
184 crc, cur_len);
185 len -= cur_len;
186 offset += cur_len;
187 unmap_extent_buffer(buf, map_token, KM_USER0);
188 }
189 btrfs_csum_final(crc, result);
190
191 if (verify) {
192 /* FIXME, this is not good */
193 if (memcmp_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE)) {
194 u32 val;
195 u32 found = 0;
196 memcpy(&found, result, BTRFS_CRC32_SIZE);
197
198 read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE);
199 printk("btrfs: %s checksum verify failed on %llu "
200 "wanted %X found %X level %d\n",
201 root->fs_info->sb->s_id,
202 buf->start, val, found, btrfs_header_level(buf));
203 return 1;
204 }
205 } else {
206 write_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE);
207 }
208 return 0;
209}
210
211static int verify_parent_transid(struct extent_io_tree *io_tree,
212 struct extent_buffer *eb, u64 parent_transid)
213{
214 int ret;
215
216 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
217 return 0;
218
219 lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
220 if (extent_buffer_uptodate(io_tree, eb) &&
221 btrfs_header_generation(eb) == parent_transid) {
222 ret = 0;
223 goto out;
224 }
225 printk("parent transid verify failed on %llu wanted %llu found %llu\n",
226 (unsigned long long)eb->start,
227 (unsigned long long)parent_transid,
228 (unsigned long long)btrfs_header_generation(eb));
229 ret = 1;
230 clear_extent_buffer_uptodate(io_tree, eb);
231out:
232 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
233 GFP_NOFS);
234 return ret;
235
236}
237
238static int btree_read_extent_buffer_pages(struct btrfs_root *root,
239 struct extent_buffer *eb,
240 u64 start, u64 parent_transid)
241{
242 struct extent_io_tree *io_tree;
243 int ret;
244 int num_copies = 0;
245 int mirror_num = 0;
246
247 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
248 while (1) {
249 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
250 btree_get_extent, mirror_num);
251 if (!ret &&
252 !verify_parent_transid(io_tree, eb, parent_transid))
253 return ret;
254		printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror_num);
255 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
256 eb->start, eb->len);
257 if (num_copies == 1)
258 return ret;
259
260 mirror_num++;
261 if (mirror_num > num_copies)
262 return ret;
263 }
264 return -EIO;
265}
266
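/*
 * Retry sketch for the loop above: mirror_num 0 lets the lower layers
 * pick any copy; after a read or transid failure the loop asks
 * btrfs_num_copies() how many replicas exist and walks mirrors
 * 1..num_copies explicitly before giving up.
 */
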
267int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
268{
269 struct extent_io_tree *tree;
270 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
271 u64 found_start;
272 int found_level;
273 unsigned long len;
274 struct extent_buffer *eb;
275 int ret;
276
277 tree = &BTRFS_I(page->mapping->host)->io_tree;
278
279 if (page->private == EXTENT_PAGE_PRIVATE)
280 goto out;
281 if (!page->private)
282 goto out;
283 len = page->private >> 2;
284 if (len == 0) {
285 WARN_ON(1);
286 }
287 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
288 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
289 btrfs_header_generation(eb));
290 BUG_ON(ret);
291 found_start = btrfs_header_bytenr(eb);
292 if (found_start != start) {
293 printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
294 start, found_start, len);
295 WARN_ON(1);
296 goto err;
297 }
298 if (eb->first_page != page) {
299 printk("bad first page %lu %lu\n", eb->first_page->index,
300 page->index);
301 WARN_ON(1);
302 goto err;
303 }
304 if (!PageUptodate(page)) {
305 printk("csum not up to date page %lu\n", page->index);
306 WARN_ON(1);
307 goto err;
308 }
309 found_level = btrfs_header_level(eb);
310
311 csum_tree_block(root, eb, 0);
312err:
313 free_extent_buffer(eb);
314out:
315 return 0;
316}
317
318int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
319 struct extent_state *state)
320{
321 struct extent_io_tree *tree;
322 u64 found_start;
323 int found_level;
324 unsigned long len;
325 struct extent_buffer *eb;
326 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
327 int ret = 0;
328
329 tree = &BTRFS_I(page->mapping->host)->io_tree;
330 if (page->private == EXTENT_PAGE_PRIVATE)
331 goto out;
332 if (!page->private)
333 goto out;
334 len = page->private >> 2;
335 if (len == 0) {
336 WARN_ON(1);
337 }
338 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
339
340 found_start = btrfs_header_bytenr(eb);
341 if (found_start != start) {
342 printk("bad tree block start %llu %llu\n",
343 (unsigned long long)found_start,
344 (unsigned long long)eb->start);
345 ret = -EIO;
346 goto err;
347 }
348 if (eb->first_page != page) {
349 printk("bad first page %lu %lu\n", eb->first_page->index,
350 page->index);
351 WARN_ON(1);
352 ret = -EIO;
353 goto err;
354 }
355 if (memcmp_extent_buffer(eb, root->fs_info->fsid,
356 (unsigned long)btrfs_header_fsid(eb),
357 BTRFS_FSID_SIZE)) {
358 printk("bad fsid on block %Lu\n", eb->start);
359 ret = -EIO;
360 goto err;
361 }
362 found_level = btrfs_header_level(eb);
363
364 ret = csum_tree_block(root, eb, 1);
365 if (ret)
366 ret = -EIO;
367
368 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
369 end = eb->start + end - 1;
370err:
371 free_extent_buffer(eb);
372out:
373 return ret;
374}
375
376#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
377static void end_workqueue_bio(struct bio *bio, int err)
378#else
379static int end_workqueue_bio(struct bio *bio,
380 unsigned int bytes_done, int err)
381#endif
382{
383 struct end_io_wq *end_io_wq = bio->bi_private;
384 struct btrfs_fs_info *fs_info;
385
386#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
387 if (bio->bi_size)
388 return 1;
389#endif
390
391 fs_info = end_io_wq->info;
392 end_io_wq->error = err;
393 end_io_wq->work.func = end_workqueue_fn;
394 end_io_wq->work.flags = 0;
395 if (bio->bi_rw & (1 << BIO_RW))
396 btrfs_queue_worker(&fs_info->endio_write_workers,
397 &end_io_wq->work);
398 else
399 btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
400
401#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
402 return 0;
403#endif
404}
405
406int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
407 int metadata)
408{
409 struct end_io_wq *end_io_wq;
410 end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
411 if (!end_io_wq)
412 return -ENOMEM;
413
414 end_io_wq->private = bio->bi_private;
415 end_io_wq->end_io = bio->bi_end_io;
416 end_io_wq->info = info;
417 end_io_wq->error = 0;
418 end_io_wq->bio = bio;
419 end_io_wq->metadata = metadata;
420
421 bio->bi_private = end_io_wq;
422 bio->bi_end_io = end_workqueue_bio;
423 return 0;
424}
425
426unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
427{
428 unsigned long limit = min_t(unsigned long,
429 info->workers.max_workers,
430 info->fs_devices->open_devices);
431 return 256 * limit;
432}
433
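/*
 * Worked example for the limit above (the numbers are assumptions):
 * with max_workers == 8 and 4 open devices,
 *
 *	limit = 256 * min(8, 4) = 1024
 *
 * async bios may be in flight before btrfs_congested_async() reports
 * congestion.
 */
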
434int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
435{
436 return atomic_read(&info->nr_async_bios) >
437 btrfs_async_submit_limit(info);
438}
439
440static void run_one_async_submit(struct btrfs_work *work)
441{
442 struct btrfs_fs_info *fs_info;
443 struct async_submit_bio *async;
444 int limit;
445
446 async = container_of(work, struct async_submit_bio, work);
447 fs_info = BTRFS_I(async->inode)->root->fs_info;
448
449 limit = btrfs_async_submit_limit(fs_info);
450 limit = limit * 2 / 3;
451
452 atomic_dec(&fs_info->nr_async_submits);
453
454 if (atomic_read(&fs_info->nr_async_submits) < limit &&
455 waitqueue_active(&fs_info->async_submit_wait))
456 wake_up(&fs_info->async_submit_wait);
457
458 async->submit_bio_hook(async->inode, async->rw, async->bio,
459 async->mirror_num);
460 kfree(async);
461}
462
463int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
464 int rw, struct bio *bio, int mirror_num,
465 extent_submit_bio_hook_t *submit_bio_hook)
466{
467 struct async_submit_bio *async;
468 int limit = btrfs_async_submit_limit(fs_info);
469
470 async = kmalloc(sizeof(*async), GFP_NOFS);
471 if (!async)
472 return -ENOMEM;
473
474 async->inode = inode;
475 async->rw = rw;
476 async->bio = bio;
477 async->mirror_num = mirror_num;
478 async->submit_bio_hook = submit_bio_hook;
479 async->work.func = run_one_async_submit;
480 async->work.flags = 0;
481 atomic_inc(&fs_info->nr_async_submits);
482 btrfs_queue_worker(&fs_info->workers, &async->work);
483
484 if (atomic_read(&fs_info->nr_async_submits) > limit) {
485 wait_event_timeout(fs_info->async_submit_wait,
486 (atomic_read(&fs_info->nr_async_submits) < limit),
487 HZ/10);
488
489 wait_event_timeout(fs_info->async_submit_wait,
490 (atomic_read(&fs_info->nr_async_bios) < limit),
491 HZ/10);
492 }
493 return 0;
494}
495
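/*
 * Throttling sketch for the helper above: a submitter that pushes
 * nr_async_submits past the limit sleeps on async_submit_wait in
 * HZ/10 (~100ms) slices; run_one_async_submit() wakes it once the
 * queue drains below two thirds of the limit.
 */
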
496static int btree_csum_one_bio(struct bio *bio)
497{
498 struct bio_vec *bvec = bio->bi_io_vec;
499 int bio_index = 0;
500 struct btrfs_root *root;
501
502 WARN_ON(bio->bi_vcnt <= 0);
503	while (bio_index < bio->bi_vcnt) {
504 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
505 csum_dirty_buffer(root, bvec->bv_page);
506 bio_index++;
507 bvec++;
508 }
509 return 0;
510}
511
512static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
513 int mirror_num)
514{
515 struct btrfs_root *root = BTRFS_I(inode)->root;
516 u64 offset;
517 int ret;
518
519 offset = bio->bi_sector << 9;
520
521 /*
522 * when we're called for a write, we're already in the async
523 * submission context. Just jump into btrfs_map_bio
524 */
525 if (rw & (1 << BIO_RW)) {
526 btree_csum_one_bio(bio);
527 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
528 mirror_num, 1);
529 }
530
531 /*
532 * called for a read, do the setup so that checksum validation
533 * can happen in the async kernel threads
534 */
535 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
536 BUG_ON(ret);
537
538 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
539}
540
541static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
542 int mirror_num)
543{
544 /*
545 * kthread helpers are used to submit writes so that checksumming
546 * can happen in parallel across all CPUs
547 */
548 if (!(rw & (1 << BIO_RW))) {
549 return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
550 }
551 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
552 inode, rw, bio, mirror_num,
553 __btree_submit_bio_hook);
554}
555
556static int btree_writepage(struct page *page, struct writeback_control *wbc)
557{
558 struct extent_io_tree *tree;
559 tree = &BTRFS_I(page->mapping->host)->io_tree;
560
561 if (current->flags & PF_MEMALLOC) {
562 redirty_page_for_writepage(wbc, page);
563 unlock_page(page);
564 return 0;
565 }
566 return extent_write_full_page(tree, page, btree_get_extent, wbc);
567}
568
569static int btree_writepages(struct address_space *mapping,
570 struct writeback_control *wbc)
571{
572 struct extent_io_tree *tree;
573 tree = &BTRFS_I(mapping->host)->io_tree;
574 if (wbc->sync_mode == WB_SYNC_NONE) {
575 u64 num_dirty;
576 u64 start = 0;
577 unsigned long thresh = 8 * 1024 * 1024;
578
579 if (wbc->for_kupdate)
580 return 0;
581
582 num_dirty = count_range_bits(tree, &start, (u64)-1,
583 thresh, EXTENT_DIRTY);
584 if (num_dirty < thresh) {
585 return 0;
586 }
587 }
588 return extent_writepages(tree, mapping, btree_get_extent, wbc);
589}
590
591int btree_readpage(struct file *file, struct page *page)
592{
593 struct extent_io_tree *tree;
594 tree = &BTRFS_I(page->mapping->host)->io_tree;
595 return extent_read_full_page(tree, page, btree_get_extent);
596}
597
598static int btree_releasepage(struct page *page, gfp_t gfp_flags)
599{
600 struct extent_io_tree *tree;
601 struct extent_map_tree *map;
602 int ret;
603
604 if (PageWriteback(page) || PageDirty(page))
605 return 0;
606
607 tree = &BTRFS_I(page->mapping->host)->io_tree;
608 map = &BTRFS_I(page->mapping->host)->extent_tree;
609
610 ret = try_release_extent_state(map, tree, page, gfp_flags);
611 if (!ret) {
612 return 0;
613 }
614
615 ret = try_release_extent_buffer(tree, page);
616 if (ret == 1) {
617 ClearPagePrivate(page);
618 set_page_private(page, 0);
619 page_cache_release(page);
620 }
621
622 return ret;
623}
624
625static void btree_invalidatepage(struct page *page, unsigned long offset)
626{
627 struct extent_io_tree *tree;
628 tree = &BTRFS_I(page->mapping->host)->io_tree;
629 extent_invalidatepage(tree, page, offset);
630 btree_releasepage(page, GFP_NOFS);
631 if (PagePrivate(page)) {
632 printk("warning page private not zero on page %Lu\n",
633 page_offset(page));
634 ClearPagePrivate(page);
635 set_page_private(page, 0);
636 page_cache_release(page);
637 }
638}
639
640#if 0
641static int btree_writepage(struct page *page, struct writeback_control *wbc)
642{
643 struct buffer_head *bh;
644 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
645 struct buffer_head *head;
646 if (!page_has_buffers(page)) {
647 create_empty_buffers(page, root->fs_info->sb->s_blocksize,
648 (1 << BH_Dirty)|(1 << BH_Uptodate));
649 }
650 head = page_buffers(page);
651 bh = head;
652 do {
653 if (buffer_dirty(bh))
654 csum_tree_block(root, bh, 0);
655 bh = bh->b_this_page;
656 } while (bh != head);
657 return block_write_full_page(page, btree_get_block, wbc);
658}
659#endif
660
661static struct address_space_operations btree_aops = {
662 .readpage = btree_readpage,
663 .writepage = btree_writepage,
664 .writepages = btree_writepages,
665 .releasepage = btree_releasepage,
666 .invalidatepage = btree_invalidatepage,
667 .sync_page = block_sync_page,
668};
669
670int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
671 u64 parent_transid)
672{
673 struct extent_buffer *buf = NULL;
674 struct inode *btree_inode = root->fs_info->btree_inode;
675 int ret = 0;
676
677 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
678 if (!buf)
679 return 0;
680 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
681 buf, 0, 0, btree_get_extent, 0);
682 free_extent_buffer(buf);
683 return ret;
684}
685
686struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
687 u64 bytenr, u32 blocksize)
688{
689 struct inode *btree_inode = root->fs_info->btree_inode;
690 struct extent_buffer *eb;
691 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
692 bytenr, blocksize, GFP_NOFS);
693 return eb;
694}
695
696struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
697 u64 bytenr, u32 blocksize)
698{
699 struct inode *btree_inode = root->fs_info->btree_inode;
700 struct extent_buffer *eb;
701
702 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
703 bytenr, blocksize, NULL, GFP_NOFS);
704 return eb;
705}
706
707
708int btrfs_write_tree_block(struct extent_buffer *buf)
709{
710 return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
711 buf->start + buf->len - 1, WB_SYNC_NONE);
712}
713
714int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
715{
716 return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
717 buf->start, buf->start + buf->len -1);
718}
719
720struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
721 u32 blocksize, u64 parent_transid)
722{
723 struct extent_buffer *buf = NULL;
724 struct inode *btree_inode = root->fs_info->btree_inode;
725 struct extent_io_tree *io_tree;
726 int ret;
727
728 io_tree = &BTRFS_I(btree_inode)->io_tree;
729
730 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
731 if (!buf)
732 return NULL;
733
734 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
735
736 if (ret == 0) {
737 buf->flags |= EXTENT_UPTODATE;
738 } else {
739 WARN_ON(1);
740 }
741 return buf;
742
743}
744
745int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
746 struct extent_buffer *buf)
747{
748 struct inode *btree_inode = root->fs_info->btree_inode;
749 if (btrfs_header_generation(buf) ==
750 root->fs_info->running_transaction->transid) {
751 WARN_ON(!btrfs_tree_locked(buf));
752 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
753 buf);
754 }
755 return 0;
756}
757
758static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
759 u32 stripesize, struct btrfs_root *root,
760 struct btrfs_fs_info *fs_info,
761 u64 objectid)
762{
763 root->node = NULL;
764 root->inode = NULL;
765 root->commit_root = NULL;
766 root->ref_tree = NULL;
767 root->sectorsize = sectorsize;
768 root->nodesize = nodesize;
769 root->leafsize = leafsize;
770 root->stripesize = stripesize;
771 root->ref_cows = 0;
772 root->track_dirty = 0;
773
774 root->fs_info = fs_info;
775 root->objectid = objectid;
776 root->last_trans = 0;
777 root->highest_inode = 0;
778 root->last_inode_alloc = 0;
779 root->name = NULL;
780 root->in_sysfs = 0;
781
782 INIT_LIST_HEAD(&root->dirty_list);
783 INIT_LIST_HEAD(&root->orphan_list);
784 INIT_LIST_HEAD(&root->dead_list);
785 spin_lock_init(&root->node_lock);
786 spin_lock_init(&root->list_lock);
787 mutex_init(&root->objectid_mutex);
788 mutex_init(&root->log_mutex);
789 extent_io_tree_init(&root->dirty_log_pages,
790 fs_info->btree_inode->i_mapping, GFP_NOFS);
791
792 btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
793 root->ref_tree = &root->ref_tree_struct;
794
795 memset(&root->root_key, 0, sizeof(root->root_key));
796 memset(&root->root_item, 0, sizeof(root->root_item));
797 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
798 memset(&root->root_kobj, 0, sizeof(root->root_kobj));
799 root->defrag_trans_start = fs_info->generation;
800 init_completion(&root->kobj_unregister);
801 root->defrag_running = 0;
802 root->defrag_level = 0;
803 root->root_key.objectid = objectid;
804 return 0;
805}
806
807static int find_and_setup_root(struct btrfs_root *tree_root,
808 struct btrfs_fs_info *fs_info,
809 u64 objectid,
810 struct btrfs_root *root)
811{
812 int ret;
813 u32 blocksize;
814
815 __setup_root(tree_root->nodesize, tree_root->leafsize,
816 tree_root->sectorsize, tree_root->stripesize,
817 root, fs_info, objectid);
818 ret = btrfs_find_last_root(tree_root, objectid,
819 &root->root_item, &root->root_key);
820 BUG_ON(ret);
821
822 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
823 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
824 blocksize, 0);
825 BUG_ON(!root->node);
826 return 0;
827}
828
829int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
830 struct btrfs_fs_info *fs_info)
831{
832 struct extent_buffer *eb;
833 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
834 u64 start = 0;
835 u64 end = 0;
836 int ret;
837
838 if (!log_root_tree)
839 return 0;
840
841	while (1) {
842 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
843 0, &start, &end, EXTENT_DIRTY);
844 if (ret)
845 break;
846
847 clear_extent_dirty(&log_root_tree->dirty_log_pages,
848 start, end, GFP_NOFS);
849 }
850 eb = fs_info->log_root_tree->node;
851
852 WARN_ON(btrfs_header_level(eb) != 0);
853 WARN_ON(btrfs_header_nritems(eb) != 0);
854
855 ret = btrfs_free_reserved_extent(fs_info->tree_root,
856 eb->start, eb->len);
857 BUG_ON(ret);
858
859 free_extent_buffer(eb);
860 kfree(fs_info->log_root_tree);
861 fs_info->log_root_tree = NULL;
862 return 0;
863}
864
865int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
866 struct btrfs_fs_info *fs_info)
867{
868 struct btrfs_root *root;
869 struct btrfs_root *tree_root = fs_info->tree_root;
870
871 root = kzalloc(sizeof(*root), GFP_NOFS);
872 if (!root)
873 return -ENOMEM;
874
875 __setup_root(tree_root->nodesize, tree_root->leafsize,
876 tree_root->sectorsize, tree_root->stripesize,
877 root, fs_info, BTRFS_TREE_LOG_OBJECTID);
878
879 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
880 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
881 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
882 root->ref_cows = 0;
883
884 root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
885 0, BTRFS_TREE_LOG_OBJECTID,
886 trans->transid, 0, 0, 0);
887
888 btrfs_set_header_nritems(root->node, 0);
889 btrfs_set_header_level(root->node, 0);
890 btrfs_set_header_bytenr(root->node, root->node->start);
891 btrfs_set_header_generation(root->node, trans->transid);
892 btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
893
894 write_extent_buffer(root->node, root->fs_info->fsid,
895 (unsigned long)btrfs_header_fsid(root->node),
896 BTRFS_FSID_SIZE);
897 btrfs_mark_buffer_dirty(root->node);
898 btrfs_tree_unlock(root->node);
899 fs_info->log_root_tree = root;
900 return 0;
901}
902
903struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
904 struct btrfs_key *location)
905{
906 struct btrfs_root *root;
907 struct btrfs_fs_info *fs_info = tree_root->fs_info;
908 struct btrfs_path *path;
909 struct extent_buffer *l;
910 u64 highest_inode;
911 u32 blocksize;
912 int ret = 0;
913
914 root = kzalloc(sizeof(*root), GFP_NOFS);
915 if (!root)
916 return ERR_PTR(-ENOMEM);
917 if (location->offset == (u64)-1) {
918 ret = find_and_setup_root(tree_root, fs_info,
919 location->objectid, root);
920 if (ret) {
921 kfree(root);
922 return ERR_PTR(ret);
923 }
924 goto insert;
925 }
926
927 __setup_root(tree_root->nodesize, tree_root->leafsize,
928 tree_root->sectorsize, tree_root->stripesize,
929 root, fs_info, location->objectid);
930
931 path = btrfs_alloc_path();
932 BUG_ON(!path);
933 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
934 if (ret != 0) {
935 if (ret > 0)
936 ret = -ENOENT;
937 goto out;
938 }
939 l = path->nodes[0];
940 read_extent_buffer(l, &root->root_item,
941 btrfs_item_ptr_offset(l, path->slots[0]),
942 sizeof(root->root_item));
943 memcpy(&root->root_key, location, sizeof(*location));
944 ret = 0;
945out:
946 btrfs_release_path(root, path);
947 btrfs_free_path(path);
948 if (ret) {
949 kfree(root);
950 return ERR_PTR(ret);
951 }
952 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
953 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
954 blocksize, 0);
955 BUG_ON(!root->node);
956insert:
957 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
958 root->ref_cows = 1;
959 ret = btrfs_find_highest_inode(root, &highest_inode);
960 if (ret == 0) {
961 root->highest_inode = highest_inode;
962 root->last_inode_alloc = highest_inode;
963 }
964 }
965 return root;
966}
967
968struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
969 u64 root_objectid)
970{
971 struct btrfs_root *root;
972
973 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
974 return fs_info->tree_root;
975 if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
976 return fs_info->extent_root;
977
978 root = radix_tree_lookup(&fs_info->fs_roots_radix,
979 (unsigned long)root_objectid);
980 return root;
981}
982
983struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
984 struct btrfs_key *location)
985{
986 struct btrfs_root *root;
987 int ret;
988
989 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
990 return fs_info->tree_root;
991 if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
992 return fs_info->extent_root;
993 if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
994 return fs_info->chunk_root;
995 if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
996 return fs_info->dev_root;
997
998 root = radix_tree_lookup(&fs_info->fs_roots_radix,
999 (unsigned long)location->objectid);
1000 if (root)
1001 return root;
1002
1003 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1004 if (IS_ERR(root))
1005 return root;
1006 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1007 (unsigned long)root->root_key.objectid,
1008 root);
1009 if (ret) {
1010 free_extent_buffer(root->node);
1011 kfree(root);
1012 return ERR_PTR(ret);
1013 }
1014 ret = btrfs_find_dead_roots(fs_info->tree_root,
1015 root->root_key.objectid, root);
1016 BUG_ON(ret);
1017
1018 return root;
1019}
1020
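/*
 * Usage sketch (hypothetical caller) for resolving a subvolume root;
 * the special objectids above short-circuit to the cached roots:
 *
 *	struct btrfs_key location;
 *
 *	location.objectid = root_objectid;
 *	btrfs_set_key_type(&location, BTRFS_ROOT_ITEM_KEY);
 *	location.offset = (u64)-1;
 *	root = btrfs_read_fs_root_no_name(fs_info, &location);
 *
 * offset == (u64)-1 takes the find_and_setup_root() shortcut in
 * btrfs_read_fs_root_no_radix() above.
 */
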
1021struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1022 struct btrfs_key *location,
1023 const char *name, int namelen)
1024{
1025 struct btrfs_root *root;
1026 int ret;
1027
1028 root = btrfs_read_fs_root_no_name(fs_info, location);
1029	if (IS_ERR(root))
1030		return root;
1031
1032 if (root->in_sysfs)
1033 return root;
1034
1035 ret = btrfs_set_root_name(root, name, namelen);
1036 if (ret) {
1037 free_extent_buffer(root->node);
1038 kfree(root);
1039 return ERR_PTR(ret);
1040 }
1041
1042 ret = btrfs_sysfs_add_root(root);
1043 if (ret) {
1044 free_extent_buffer(root->node);
1045 kfree(root->name);
1046 kfree(root);
1047 return ERR_PTR(ret);
1048 }
1049 root->in_sysfs = 1;
1050 return root;
1051}
1052#if 0
1053static int add_hasher(struct btrfs_fs_info *info, char *type) {
1054 struct btrfs_hasher *hasher;
1055
1056 hasher = kmalloc(sizeof(*hasher), GFP_NOFS);
1057 if (!hasher)
1058 return -ENOMEM;
1059 hasher->hash_tfm = crypto_alloc_hash(type, 0, CRYPTO_ALG_ASYNC);
1060 if (!hasher->hash_tfm) {
1061 kfree(hasher);
1062 return -EINVAL;
1063 }
1064 spin_lock(&info->hash_lock);
1065 list_add(&hasher->list, &info->hashers);
1066 spin_unlock(&info->hash_lock);
1067 return 0;
1068}
1069#endif
1070
1071static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1072{
1073 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1074 int ret = 0;
1075 struct list_head *cur;
1076 struct btrfs_device *device;
1077 struct backing_dev_info *bdi;
1078
1079 if ((bdi_bits & (1 << BDI_write_congested)) &&
1080 btrfs_congested_async(info, 0))
1081 return 1;
1082
1083 list_for_each(cur, &info->fs_devices->devices) {
1084 device = list_entry(cur, struct btrfs_device, dev_list);
1085 if (!device->bdev)
1086 continue;
1087 bdi = blk_get_backing_dev_info(device->bdev);
1088 if (bdi && bdi_congested(bdi, bdi_bits)) {
1089 ret = 1;
1090 break;
1091 }
1092 }
1093 return ret;
1094}
1095
1096/*
1097 * this unplugs every device on the box, and it is only used when page
1098 * is null
1099 */
1100static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1101{
1102 struct list_head *cur;
1103 struct btrfs_device *device;
1104 struct btrfs_fs_info *info;
1105
1106 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1107 list_for_each(cur, &info->fs_devices->devices) {
1108 device = list_entry(cur, struct btrfs_device, dev_list);
1109 bdi = blk_get_backing_dev_info(device->bdev);
1110 if (bdi->unplug_io_fn) {
1111 bdi->unplug_io_fn(bdi, page);
1112 }
1113 }
1114}
1115
1116void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1117{
1118 struct inode *inode;
1119 struct extent_map_tree *em_tree;
1120 struct extent_map *em;
1121 struct address_space *mapping;
1122 u64 offset;
1123
1124 /* the generic O_DIRECT read code does this */
1125 if (!page) {
1126 __unplug_io_fn(bdi, page);
1127 return;
1128 }
1129
1130 /*
1131 * page->mapping may change at any time. Get a consistent copy
1132 * and use that for everything below
1133 */
1134 smp_mb();
1135 mapping = page->mapping;
1136 if (!mapping)
1137 return;
1138
1139 inode = mapping->host;
1140 offset = page_offset(page);
1141
1142 em_tree = &BTRFS_I(inode)->extent_tree;
1143 spin_lock(&em_tree->lock);
1144 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1145 spin_unlock(&em_tree->lock);
1146 if (!em) {
1147 __unplug_io_fn(bdi, page);
1148 return;
1149 }
1150
1151 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1152 free_extent_map(em);
1153 __unplug_io_fn(bdi, page);
1154 return;
1155 }
1156 offset = offset - em->start;
1157 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1158 em->block_start + offset, page);
1159 free_extent_map(em);
1160}
1161
1162static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1163{
1164#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1165 bdi_init(bdi);
1166#endif
1167 bdi->ra_pages = default_backing_dev_info.ra_pages;
1168 bdi->state = 0;
1169 bdi->capabilities = default_backing_dev_info.capabilities;
1170 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1171 bdi->unplug_io_data = info;
1172 bdi->congested_fn = btrfs_congested_fn;
1173 bdi->congested_data = info;
1174 return 0;
1175}
1176
1177static int bio_ready_for_csum(struct bio *bio)
1178{
1179 u64 length = 0;
1180 u64 buf_len = 0;
1181 u64 start = 0;
1182 struct page *page;
1183 struct extent_io_tree *io_tree = NULL;
1184 struct btrfs_fs_info *info = NULL;
1185 struct bio_vec *bvec;
1186 int i;
1187 int ret;
1188
1189 bio_for_each_segment(bvec, bio, i) {
1190 page = bvec->bv_page;
1191 if (page->private == EXTENT_PAGE_PRIVATE) {
1192 length += bvec->bv_len;
1193 continue;
1194 }
1195 if (!page->private) {
1196 length += bvec->bv_len;
1197 continue;
1198 }
1199 length = bvec->bv_len;
1200 buf_len = page->private >> 2;
1201 start = page_offset(page) + bvec->bv_offset;
1202 io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1203 info = BTRFS_I(page->mapping->host)->root->fs_info;
1204 }
1205 /* are we fully contained in this bio? */
1206 if (buf_len <= length)
1207 return 1;
1208
1209 ret = extent_range_uptodate(io_tree, start + length,
1210 start + buf_len - 1);
1211 if (ret == 1)
1212 return ret;
1213 return ret;
1214}
1215
1216/*
1217 * called by the kthread helper functions to finally call the bio end_io
1218 * functions. This is where read checksum verification actually happens
1219 */
1220static void end_workqueue_fn(struct btrfs_work *work)
1221{
1222 struct bio *bio;
1223 struct end_io_wq *end_io_wq;
1224 struct btrfs_fs_info *fs_info;
1225 int error;
1226
1227 end_io_wq = container_of(work, struct end_io_wq, work);
1228 bio = end_io_wq->bio;
1229 fs_info = end_io_wq->info;
1230
1231 /* metadata bios are special because the whole tree block must
1232 * be checksummed at once. This makes sure the entire block is in
1233 * ram and up to date before trying to verify things. For
1234 * blocksize <= pagesize, it is basically a noop
1235 */
1236 if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
1237 btrfs_queue_worker(&fs_info->endio_workers,
1238 &end_io_wq->work);
1239 return;
1240 }
1241 error = end_io_wq->error;
1242 bio->bi_private = end_io_wq->private;
1243 bio->bi_end_io = end_io_wq->end_io;
1244 kfree(end_io_wq);
1245#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1246 bio_endio(bio, bio->bi_size, error);
1247#else
1248 bio_endio(bio, error);
1249#endif
1250}
1251
1252static int cleaner_kthread(void *arg)
1253{
1254 struct btrfs_root *root = arg;
1255
1256 do {
1257 smp_mb();
1258 if (root->fs_info->closing)
1259 break;
1260
1261 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1262 mutex_lock(&root->fs_info->cleaner_mutex);
1263 btrfs_clean_old_snapshots(root);
1264 mutex_unlock(&root->fs_info->cleaner_mutex);
1265
1266 if (freezing(current)) {
1267 refrigerator();
1268 } else {
1269 smp_mb();
1270 if (root->fs_info->closing)
1271 break;
1272 set_current_state(TASK_INTERRUPTIBLE);
1273 schedule();
1274 __set_current_state(TASK_RUNNING);
1275 }
1276 } while (!kthread_should_stop());
1277 return 0;
1278}
1279
1280static int transaction_kthread(void *arg)
1281{
1282 struct btrfs_root *root = arg;
1283 struct btrfs_trans_handle *trans;
1284 struct btrfs_transaction *cur;
1285 unsigned long now;
1286 unsigned long delay;
1287 int ret;
1288
1289 do {
1290 smp_mb();
1291 if (root->fs_info->closing)
1292 break;
1293
1294 delay = HZ * 30;
1295 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1296 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1297
1298 if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
1299 printk("btrfs: total reference cache size %Lu\n",
1300 root->fs_info->total_ref_cache_size);
1301 }
1302
1303 mutex_lock(&root->fs_info->trans_mutex);
1304 cur = root->fs_info->running_transaction;
1305 if (!cur) {
1306 mutex_unlock(&root->fs_info->trans_mutex);
1307 goto sleep;
1308 }
1309
1310 now = get_seconds();
1311 if (now < cur->start_time || now - cur->start_time < 30) {
1312 mutex_unlock(&root->fs_info->trans_mutex);
1313 delay = HZ * 5;
1314 goto sleep;
1315 }
1316 mutex_unlock(&root->fs_info->trans_mutex);
1317 trans = btrfs_start_transaction(root, 1);
1318 ret = btrfs_commit_transaction(trans, root);
1319sleep:
1320 wake_up_process(root->fs_info->cleaner_kthread);
1321 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
1322
1323 if (freezing(current)) {
1324 refrigerator();
1325 } else {
1326 if (root->fs_info->closing)
1327 break;
1328 set_current_state(TASK_INTERRUPTIBLE);
1329 schedule_timeout(delay);
1330 __set_current_state(TASK_RUNNING);
1331 }
1332 } while (!kthread_should_stop());
1333 return 0;
1334}
1335
1336struct btrfs_root *open_ctree(struct super_block *sb,
1337 struct btrfs_fs_devices *fs_devices,
1338 char *options)
1339{
1340 u32 sectorsize;
1341 u32 nodesize;
1342 u32 leafsize;
1343 u32 blocksize;
1344 u32 stripesize;
1345 struct buffer_head *bh;
1346 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
1347 GFP_NOFS);
1348 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
1349 GFP_NOFS);
1350 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1351 GFP_NOFS);
1352 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1353 GFP_NOFS);
1354 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
1355 GFP_NOFS);
1356 struct btrfs_root *log_tree_root;
1357
1358 int ret;
1359 int err = -EINVAL;
1360
1361 struct btrfs_super_block *disk_super;
1362
1363	if (!extent_root || !tree_root || !fs_info ||
	    !chunk_root || !dev_root) {
1364		err = -ENOMEM;
1365		goto fail;
1366	}
1367 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
1368 INIT_LIST_HEAD(&fs_info->trans_list);
1369 INIT_LIST_HEAD(&fs_info->dead_roots);
1370 INIT_LIST_HEAD(&fs_info->hashers);
1371 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1372 spin_lock_init(&fs_info->hash_lock);
1373 spin_lock_init(&fs_info->delalloc_lock);
1374 spin_lock_init(&fs_info->new_trans_lock);
1375 spin_lock_init(&fs_info->ref_cache_lock);
1376
1377 init_completion(&fs_info->kobj_unregister);
1378 fs_info->tree_root = tree_root;
1379 fs_info->extent_root = extent_root;
1380 fs_info->chunk_root = chunk_root;
1381 fs_info->dev_root = dev_root;
1382 fs_info->fs_devices = fs_devices;
1383 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1384 INIT_LIST_HEAD(&fs_info->space_info);
1385 btrfs_mapping_init(&fs_info->mapping_tree);
1386 atomic_set(&fs_info->nr_async_submits, 0);
1387 atomic_set(&fs_info->nr_async_bios, 0);
1388 atomic_set(&fs_info->throttles, 0);
1389 atomic_set(&fs_info->throttle_gen, 0);
1390 fs_info->sb = sb;
1391 fs_info->max_extent = (u64)-1;
1392 fs_info->max_inline = 8192 * 1024;
1393 setup_bdi(fs_info, &fs_info->bdi);
1394 fs_info->btree_inode = new_inode(sb);
1395 fs_info->btree_inode->i_ino = 1;
1396 fs_info->btree_inode->i_nlink = 1;
1397 fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
1398
1399 INIT_LIST_HEAD(&fs_info->ordered_extents);
1400 spin_lock_init(&fs_info->ordered_extent_lock);
1401
1402 sb->s_blocksize = 4096;
1403 sb->s_blocksize_bits = blksize_bits(4096);
1404
1405 /*
1406	 * we set the i_size on the btree inode to the max possible offset.
1407 * the real end of the address space is determined by all of
1408 * the devices in the system
1409 */
1410 fs_info->btree_inode->i_size = OFFSET_MAX;
1411 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
1412 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
1413
1414 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1415 fs_info->btree_inode->i_mapping,
1416 GFP_NOFS);
1417 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
1418 GFP_NOFS);
1419
1420 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1421
1422 spin_lock_init(&fs_info->block_group_cache_lock);
1423 fs_info->block_group_cache_tree.rb_node = NULL;
1424
1425 extent_io_tree_init(&fs_info->pinned_extents,
1426 fs_info->btree_inode->i_mapping, GFP_NOFS);
1427 extent_io_tree_init(&fs_info->pending_del,
1428 fs_info->btree_inode->i_mapping, GFP_NOFS);
1429 extent_io_tree_init(&fs_info->extent_ins,
1430 fs_info->btree_inode->i_mapping, GFP_NOFS);
1431 fs_info->do_barriers = 1;
1432
1433 BTRFS_I(fs_info->btree_inode)->root = tree_root;
1434 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1435 sizeof(struct btrfs_key));
1436 insert_inode_hash(fs_info->btree_inode);
1437
1438 mutex_init(&fs_info->trans_mutex);
1439 mutex_init(&fs_info->tree_log_mutex);
1440 mutex_init(&fs_info->drop_mutex);
1441 mutex_init(&fs_info->alloc_mutex);
1442 mutex_init(&fs_info->chunk_mutex);
1443 mutex_init(&fs_info->transaction_kthread_mutex);
1444 mutex_init(&fs_info->cleaner_mutex);
1445 mutex_init(&fs_info->volume_mutex);
1446 init_waitqueue_head(&fs_info->transaction_throttle);
1447 init_waitqueue_head(&fs_info->transaction_wait);
1448 init_waitqueue_head(&fs_info->async_submit_wait);
1449 init_waitqueue_head(&fs_info->tree_log_wait);
1450 atomic_set(&fs_info->tree_log_commit, 0);
1451 atomic_set(&fs_info->tree_log_writers, 0);
1452 fs_info->tree_log_transid = 0;
1453
1454#if 0
1455 ret = add_hasher(fs_info, "crc32c");
1456 if (ret) {
1457 printk("btrfs: failed hash setup, modprobe cryptomgr?\n");
1458 err = -ENOMEM;
1459 goto fail_iput;
1460 }
1461#endif
1462 __setup_root(4096, 4096, 4096, 4096, tree_root,
1463 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1464
1465
1466 bh = __bread(fs_devices->latest_bdev,
1467 BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
1468 if (!bh)
1469 goto fail_iput;
1470
1471 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1472 brelse(bh);
1473
1474 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
1475
1476 disk_super = &fs_info->super_copy;
1477 if (!btrfs_super_root(disk_super))
1478 goto fail_sb_buffer;
1479
1480 err = btrfs_parse_options(tree_root, options);
1481 if (err)
1482 goto fail_sb_buffer;
1483
1484 /*
1485 * we need to start all the end_io workers up front because the
1486 * queue work function gets called at interrupt time, and so it
1487 * cannot dynamically grow.
1488 */
1489 btrfs_init_workers(&fs_info->workers, "worker",
1490 fs_info->thread_pool_size);
1491 btrfs_init_workers(&fs_info->submit_workers, "submit",
1492 min_t(u64, fs_devices->num_devices,
1493 fs_info->thread_pool_size));
1494
1495 /* a higher idle thresh on the submit workers makes it much more
1496	 * likely that bios will be sent down in a sane order to the
1497 * devices
1498 */
1499 fs_info->submit_workers.idle_thresh = 64;
1500
1501 /* fs_info->workers is responsible for checksumming file data
1502 * blocks and metadata. Using a larger idle thresh allows each
1503 * worker thread to operate on things in roughly the order they
1504 * were sent by the writeback daemons, improving overall locality
1505 * of the IO going down the pipe.
1506 */
1507 fs_info->workers.idle_thresh = 128;
1508
1509 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
1510 btrfs_init_workers(&fs_info->endio_workers, "endio",
1511 fs_info->thread_pool_size);
1512 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1513 fs_info->thread_pool_size);
1514
1515 /*
1516 * endios are largely parallel and should have a very
1517 * low idle thresh
1518 */
1519 fs_info->endio_workers.idle_thresh = 4;
1520 fs_info->endio_write_workers.idle_thresh = 64;
1521
1522 btrfs_start_workers(&fs_info->workers, 1);
1523 btrfs_start_workers(&fs_info->submit_workers, 1);
1524 btrfs_start_workers(&fs_info->fixup_workers, 1);
1525 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
1526 btrfs_start_workers(&fs_info->endio_write_workers,
1527 fs_info->thread_pool_size);
1528
1529 err = -EINVAL;
1530 if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
1531 printk("Btrfs: wanted %llu devices, but found %llu\n",
1532 (unsigned long long)btrfs_super_num_devices(disk_super),
1533 (unsigned long long)fs_devices->open_devices);
1534 if (btrfs_test_opt(tree_root, DEGRADED))
1535 printk("continuing in degraded mode\n");
1536 else {
1537 goto fail_sb_buffer;
1538 }
1539 }
1540
1541 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1542
1543 nodesize = btrfs_super_nodesize(disk_super);
1544 leafsize = btrfs_super_leafsize(disk_super);
1545 sectorsize = btrfs_super_sectorsize(disk_super);
1546 stripesize = btrfs_super_stripesize(disk_super);
1547 tree_root->nodesize = nodesize;
1548 tree_root->leafsize = leafsize;
1549 tree_root->sectorsize = sectorsize;
1550 tree_root->stripesize = stripesize;
1551
1552 sb->s_blocksize = sectorsize;
1553 sb->s_blocksize_bits = blksize_bits(sectorsize);
1554
1555 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
1556 sizeof(disk_super->magic))) {
1557 printk("btrfs: valid FS not found on %s\n", sb->s_id);
1558 goto fail_sb_buffer;
1559 }
1560
1561 mutex_lock(&fs_info->chunk_mutex);
1562 ret = btrfs_read_sys_array(tree_root);
1563 mutex_unlock(&fs_info->chunk_mutex);
1564 if (ret) {
1565 printk("btrfs: failed to read the system array on %s\n",
1566 sb->s_id);
1567 goto fail_sys_array;
1568 }
1569
1570 blocksize = btrfs_level_size(tree_root,
1571 btrfs_super_chunk_root_level(disk_super));
1572
1573 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1574 chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
1575
1576 chunk_root->node = read_tree_block(chunk_root,
1577 btrfs_super_chunk_root(disk_super),
1578 blocksize, 0);
1579 BUG_ON(!chunk_root->node);
1580
1581 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
1582 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
1583 BTRFS_UUID_SIZE);
1584
1585 mutex_lock(&fs_info->chunk_mutex);
1586 ret = btrfs_read_chunk_tree(chunk_root);
1587 mutex_unlock(&fs_info->chunk_mutex);
1588 BUG_ON(ret);
1589
1590 btrfs_close_extra_devices(fs_devices);
1591
1592 blocksize = btrfs_level_size(tree_root,
1593 btrfs_super_root_level(disk_super));
1594
1595
1596 tree_root->node = read_tree_block(tree_root,
1597 btrfs_super_root(disk_super),
1598 blocksize, 0);
1599 if (!tree_root->node)
1600 goto fail_sb_buffer;
1601
1602
1603 ret = find_and_setup_root(tree_root, fs_info,
1604 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1605 if (ret)
1606 goto fail_tree_root;
1607 extent_root->track_dirty = 1;
1608
1609 ret = find_and_setup_root(tree_root, fs_info,
1610 BTRFS_DEV_TREE_OBJECTID, dev_root);
1611 dev_root->track_dirty = 1;
1612
1613 if (ret)
1614 goto fail_extent_root;
1615
1616 btrfs_read_block_groups(extent_root);
1617
1618 fs_info->generation = btrfs_super_generation(disk_super) + 1;
1619 fs_info->data_alloc_profile = (u64)-1;
1620 fs_info->metadata_alloc_profile = (u64)-1;
1621 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1622 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1623 "btrfs-cleaner");
1624 if (!fs_info->cleaner_kthread)
1625 goto fail_extent_root;
1626
1627 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1628 tree_root,
1629 "btrfs-transaction");
1630 if (!fs_info->transaction_kthread)
1631 goto fail_cleaner;
1632
1633 if (btrfs_super_log_root(disk_super) != 0) {
1634 u32 blocksize;
1635 u64 bytenr = btrfs_super_log_root(disk_super);
1636
1637 blocksize =
1638 btrfs_level_size(tree_root,
1639 btrfs_super_log_root_level(disk_super));
1640
1641 log_tree_root = kzalloc(sizeof(struct btrfs_root),
1642 GFP_NOFS);
1643
1644 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1645 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
1646
1647 log_tree_root->node = read_tree_block(tree_root, bytenr,
1648 blocksize, 0);
1649 ret = btrfs_recover_log_trees(log_tree_root);
1650 BUG_ON(ret);
1651 }
1652 fs_info->last_trans_committed = btrfs_super_generation(disk_super);
1653 return tree_root;
1654
1655fail_cleaner:
1656 kthread_stop(fs_info->cleaner_kthread);
1657fail_extent_root:
1658 free_extent_buffer(extent_root->node);
1659fail_tree_root:
1660 free_extent_buffer(tree_root->node);
1661fail_sys_array:
1662fail_sb_buffer:
1663 btrfs_stop_workers(&fs_info->fixup_workers);
1664 btrfs_stop_workers(&fs_info->workers);
1665 btrfs_stop_workers(&fs_info->endio_workers);
1666 btrfs_stop_workers(&fs_info->endio_write_workers);
1667 btrfs_stop_workers(&fs_info->submit_workers);
1668fail_iput:
1669 iput(fs_info->btree_inode);
1670fail:
1671 btrfs_close_devices(fs_info->fs_devices);
1672 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1673
1674 kfree(extent_root);
1675 kfree(tree_root);
1676#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1677 bdi_destroy(&fs_info->bdi);
1678#endif
1679 kfree(fs_info);
1680 return ERR_PTR(err);
1681}
1682
1683static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
1684{
1685 char b[BDEVNAME_SIZE];
1686
1687 if (uptodate) {
1688 set_buffer_uptodate(bh);
1689 } else {
1690 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
1691 printk(KERN_WARNING "lost page write due to "
1692 "I/O error on %s\n",
1693 bdevname(bh->b_bdev, b));
1694 }
1695		/* note, we don't set_buffer_write_io_error because we have
1696 * our own ways of dealing with the IO errors
1697 */
1698 clear_buffer_uptodate(bh);
1699 }
1700 unlock_buffer(bh);
1701 put_bh(bh);
1702}
1703
1704int write_all_supers(struct btrfs_root *root)
1705{
1706 struct list_head *cur;
1707 struct list_head *head = &root->fs_info->fs_devices->devices;
1708 struct btrfs_device *dev;
1709 struct btrfs_super_block *sb;
1710 struct btrfs_dev_item *dev_item;
1711 struct buffer_head *bh;
1712 int ret;
1713 int do_barriers;
1714 int max_errors;
1715 int total_errors = 0;
1716 u32 crc;
1717 u64 flags;
1718
1719 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1720 do_barriers = !btrfs_test_opt(root, NOBARRIER);
1721
1722 sb = &root->fs_info->super_for_commit;
1723 dev_item = &sb->dev_item;
1724 list_for_each(cur, head) {
1725 dev = list_entry(cur, struct btrfs_device, dev_list);
1726 if (!dev->bdev) {
1727 total_errors++;
1728 continue;
1729 }
1730 if (!dev->in_fs_metadata)
1731 continue;
1732
1733 btrfs_set_stack_device_type(dev_item, dev->type);
1734 btrfs_set_stack_device_id(dev_item, dev->devid);
1735 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
1736 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
1737 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
1738 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
1739 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
1740 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
1741 flags = btrfs_super_flags(sb);
1742 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
1743
1744
1745 crc = ~(u32)0;
1746 crc = btrfs_csum_data(root, (char *)sb + BTRFS_CSUM_SIZE, crc,
1747 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
1748 btrfs_csum_final(crc, sb->csum);
1749
1750 bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET / 4096,
1751 BTRFS_SUPER_INFO_SIZE);
1752
1753 memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
1754 dev->pending_io = bh;
1755
1756 get_bh(bh);
1757 set_buffer_uptodate(bh);
1758 lock_buffer(bh);
1759 bh->b_end_io = btrfs_end_buffer_write_sync;
1760
1761 if (do_barriers && dev->barriers) {
1762 ret = submit_bh(WRITE_BARRIER, bh);
1763 if (ret == -EOPNOTSUPP) {
1764 printk("btrfs: disabling barriers on dev %s\n",
1765 dev->name);
1766 set_buffer_uptodate(bh);
1767 dev->barriers = 0;
1768 get_bh(bh);
1769 lock_buffer(bh);
1770 ret = submit_bh(WRITE, bh);
1771 }
1772 } else {
1773 ret = submit_bh(WRITE, bh);
1774 }
1775 if (ret)
1776 total_errors++;
1777 }
1778 if (total_errors > max_errors) {
1779 printk("btrfs: %d errors while writing supers\n", total_errors);
1780 BUG();
1781 }
1782 total_errors = 0;
1783
1784 list_for_each(cur, head) {
1785 dev = list_entry(cur, struct btrfs_device, dev_list);
1786 if (!dev->bdev)
1787 continue;
1788 if (!dev->in_fs_metadata)
1789 continue;
1790
1791 BUG_ON(!dev->pending_io);
1792 bh = dev->pending_io;
1793 wait_on_buffer(bh);
1794 if (!buffer_uptodate(dev->pending_io)) {
1795 if (do_barriers && dev->barriers) {
1796 printk("btrfs: disabling barriers on dev %s\n",
1797 dev->name);
1798 set_buffer_uptodate(bh);
1799 get_bh(bh);
1800 lock_buffer(bh);
1801 dev->barriers = 0;
1802 ret = submit_bh(WRITE, bh);
1803 BUG_ON(ret);
1804 wait_on_buffer(bh);
1805 if (!buffer_uptodate(bh))
1806 total_errors++;
1807 } else {
1808 total_errors++;
1809 }
1810
1811 }
1812 dev->pending_io = NULL;
1813 brelse(bh);
1814 }
1815 if (total_errors > max_errors) {
1816 printk("btrfs: %d errors while writing supers\n", total_errors);
1817 BUG();
1818 }
1819 return 0;
1820}
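Two details in write_all_supers() are easy to misread: __getblk() takes a block number rather than a byte offset, and max_errors tolerates every device but one failing. A hedged sketch of the addressing arithmetic (super_block_nr() is a hypothetical helper, not part of the code above; constants come from disk-io.h later in this patch):

/*
 * BTRFS_SUPER_INFO_OFFSET is 16K (see disk-io.h), so with the 4096-byte
 * block size used above the super block copy sits in logical block
 * 16384 / 4096 = 4 on every device.  And since max_errors is
 * num_devices - 1, the BUG() only fires when *no* copy reached disk.
 */
static inline sector_t super_block_nr(void)
{
	return BTRFS_SUPER_INFO_OFFSET / 4096;	/* == block 4 */
}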
1821
1822int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
1823 *root)
1824{
1825 int ret;
1826
1827 ret = write_all_supers(root);
1828 return ret;
1829}
1830
1831int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
1832{
1833 radix_tree_delete(&fs_info->fs_roots_radix,
1834 (unsigned long)root->root_key.objectid);
1835 if (root->in_sysfs)
1836 btrfs_sysfs_del_root(root);
1837 if (root->inode)
1838 iput(root->inode);
1839 if (root->node)
1840 free_extent_buffer(root->node);
1841 if (root->commit_root)
1842 free_extent_buffer(root->commit_root);
1843 if (root->name)
1844 kfree(root->name);
1845 kfree(root);
1846 return 0;
1847}
1848
1849static int del_fs_roots(struct btrfs_fs_info *fs_info)
1850{
1851 int ret;
1852 struct btrfs_root *gang[8];
1853 int i;
1854
1855 while(1) {
1856 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
1857 (void **)gang, 0,
1858 ARRAY_SIZE(gang));
1859 if (!ret)
1860 break;
1861 for (i = 0; i < ret; i++)
1862 btrfs_free_fs_root(fs_info, gang[i]);
1863 }
1864 return 0;
1865}
1866
1867int close_ctree(struct btrfs_root *root)
1868{
1869 int ret;
1870 struct btrfs_trans_handle *trans;
1871 struct btrfs_fs_info *fs_info = root->fs_info;
1872
1873 fs_info->closing = 1;
1874 smp_mb();
1875
1876 kthread_stop(root->fs_info->transaction_kthread);
1877 kthread_stop(root->fs_info->cleaner_kthread);
1878
1879 btrfs_clean_old_snapshots(root);
1880 trans = btrfs_start_transaction(root, 1);
1881 ret = btrfs_commit_transaction(trans, root);
1882 /* run commit again to drop the original snapshot */
1883 trans = btrfs_start_transaction(root, 1);
1884 btrfs_commit_transaction(trans, root);
1885 ret = btrfs_write_and_wait_transaction(NULL, root);
1886 BUG_ON(ret);
1887
1888 write_ctree_super(NULL, root);
1889
1890 if (fs_info->delalloc_bytes) {
1891 printk("btrfs: at unmount delalloc count %Lu\n",
1892 fs_info->delalloc_bytes);
1893 }
1894 if (fs_info->total_ref_cache_size) {
1895 printk("btrfs: at umount reference cache size %Lu\n",
1896 fs_info->total_ref_cache_size);
1897 }
1898
1899 if (fs_info->extent_root->node)
1900 free_extent_buffer(fs_info->extent_root->node);
1901
1902 if (fs_info->tree_root->node)
1903 free_extent_buffer(fs_info->tree_root->node);
1904
1905	if (root->fs_info->chunk_root->node)
1906	free_extent_buffer(root->fs_info->chunk_root->node);
1907
1908	if (root->fs_info->dev_root->node)
1909	free_extent_buffer(root->fs_info->dev_root->node);
1910
1911 btrfs_free_block_groups(root->fs_info);
1912 fs_info->closing = 2;
1913 del_fs_roots(fs_info);
1914
1915 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
1916
1917 truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
1918
1919 btrfs_stop_workers(&fs_info->fixup_workers);
1920 btrfs_stop_workers(&fs_info->workers);
1921 btrfs_stop_workers(&fs_info->endio_workers);
1922 btrfs_stop_workers(&fs_info->endio_write_workers);
1923 btrfs_stop_workers(&fs_info->submit_workers);
1924
1925 iput(fs_info->btree_inode);
1926#if 0
1927 while(!list_empty(&fs_info->hashers)) {
1928 struct btrfs_hasher *hasher;
1929 hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
1930 hashers);
1931 list_del(&hasher->hashers);
1932 crypto_free_hash(&fs_info->hash_tfm);
1933 kfree(hasher);
1934 }
1935#endif
1936 btrfs_close_devices(fs_info->fs_devices);
1937 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1938
1939#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1940 bdi_destroy(&fs_info->bdi);
1941#endif
1942
1943 kfree(fs_info->extent_root);
1944 kfree(fs_info->tree_root);
1945 kfree(fs_info->chunk_root);
1946 kfree(fs_info->dev_root);
1947 return 0;
1948}
1949
1950int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
1951{
1952 int ret;
1953 struct inode *btree_inode = buf->first_page->mapping->host;
1954
1955 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
1956 if (!ret)
1957 return ret;
1958
1959 ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
1960 parent_transid);
1961 return !ret;
1962}
1963
1964int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
1965{
1966 struct inode *btree_inode = buf->first_page->mapping->host;
1967 return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
1968 buf);
1969}
1970
1971void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
1972{
1973 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
1974 u64 transid = btrfs_header_generation(buf);
1975 struct inode *btree_inode = root->fs_info->btree_inode;
1976
1977 WARN_ON(!btrfs_tree_locked(buf));
1978 if (transid != root->fs_info->generation) {
1979 printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
1980 (unsigned long long)buf->start,
1981 transid, root->fs_info->generation);
1982 WARN_ON(1);
1983 }
1984 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
1985}
1986
1987void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
1988{
1989 /*
1990 * looks as though older kernels can get into trouble with
1991	 * this code; they end up stuck in balance_dirty_pages forever
1992 */
1993 struct extent_io_tree *tree;
1994 u64 num_dirty;
1995 u64 start = 0;
1996 unsigned long thresh = 96 * 1024 * 1024;
1997 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
1998
1999 if (current_is_pdflush() || current->flags & PF_MEMALLOC)
2000 return;
2001
2002 num_dirty = count_range_bits(tree, &start, (u64)-1,
2003 thresh, EXTENT_DIRTY);
2004 if (num_dirty > thresh) {
2005 balance_dirty_pages_ratelimited_nr(
2006 root->fs_info->btree_inode->i_mapping, 1);
2007 }
2008 return;
2009}
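For scale, the throttle above only engages once the btree inode holds roughly 96 MiB of dirty extents; a worked reading of the check (assuming count_range_bits() caps its count at the max_bytes argument it is given, as in extent_io.c):

/*
 * thresh = 96 * 1024 * 1024 = 100663296 bytes.  count_range_bits()
 * stops counting once it reaches 'thresh', but it counts whole dirty
 * extents, so num_dirty can land above the cap; only in that case is
 * balance_dirty_pages_ratelimited_nr() poked, and only for one page.
 */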
2010
2011int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2012{
2013 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2014 int ret;
2015 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2016 if (ret == 0) {
2017 buf->flags |= EXTENT_UPTODATE;
2018 }
2019 return ret;
2020}
2021
2022int btree_lock_page_hook(struct page *page)
2023{
2024 struct inode *inode = page->mapping->host;
2025 struct btrfs_root *root = BTRFS_I(inode)->root;
2026 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2027 struct extent_buffer *eb;
2028 unsigned long len;
2029 u64 bytenr = page_offset(page);
2030
2031 if (page->private == EXTENT_PAGE_PRIVATE)
2032 goto out;
2033
2034 len = page->private >> 2;
2035 eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
2036 if (!eb)
2037 goto out;
2038
2039 btrfs_tree_lock(eb);
2040 spin_lock(&root->fs_info->hash_lock);
2041 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2042 spin_unlock(&root->fs_info->hash_lock);
2043 btrfs_tree_unlock(eb);
2044 free_extent_buffer(eb);
2045out:
2046 lock_page(page);
2047 return 0;
2048}
2049
2050static struct extent_io_ops btree_extent_io_ops = {
2051 .write_cache_pages_lock_hook = btree_lock_page_hook,
2052 .readpage_end_io_hook = btree_readpage_end_io_hook,
2053 .submit_bio_hook = btree_submit_bio_hook,
2054 /* note we're sharing with inode.c for the merge bio hook */
2055 .merge_bio_hook = btrfs_merge_bio_hook,
2056};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 000000000000..f84f5058dbbb
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,84 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __DISKIO__
20#define __DISKIO__
21
22#define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
23#define BTRFS_SUPER_INFO_SIZE 4096
24struct btrfs_device;
25struct btrfs_fs_devices;
26
27struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
28 u32 blocksize, u64 parent_transid);
29int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
30 u64 parent_transid);
31struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
32 u64 bytenr, u32 blocksize);
33int clean_tree_block(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root, struct extent_buffer *buf);
35struct btrfs_root *open_ctree(struct super_block *sb,
36 struct btrfs_fs_devices *fs_devices,
37 char *options);
38int close_ctree(struct btrfs_root *root);
39int write_ctree_super(struct btrfs_trans_handle *trans,
40 struct btrfs_root *root);
41struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
42 u64 bytenr, u32 blocksize);
43struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
44 u64 root_objectid);
45struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
46 struct btrfs_key *location,
47 const char *name, int namelen);
48struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
49 struct btrfs_key *location);
50struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
51 struct btrfs_key *location);
52int btrfs_insert_dev_radix(struct btrfs_root *root,
53 struct block_device *bdev,
54 u64 device_id,
55 u64 block_start,
56 u64 num_blocks);
57void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
58int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
59void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
60int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
61int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
62int wait_on_tree_block_writeback(struct btrfs_root *root,
63 struct extent_buffer *buf);
64int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
65u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
66void btrfs_csum_final(u32 crc, char *result);
67int btrfs_open_device(struct btrfs_device *dev);
68int btrfs_verify_block_csum(struct btrfs_root *root,
69 struct extent_buffer *buf);
70int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
71 int metadata);
72int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
73 int rw, struct bio *bio, int mirror_num,
74 extent_submit_bio_hook_t *submit_bio_hook);
75int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
76unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
77int btrfs_write_tree_block(struct extent_buffer *buf);
78int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
79int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
80 struct btrfs_fs_info *fs_info);
81int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
82 struct btrfs_fs_info *fs_info);
83int btree_lock_page_hook(struct page *page);
84#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
new file mode 100644
index 000000000000..2b357a6d2407
--- /dev/null
+++ b/fs/btrfs/export.c
@@ -0,0 +1,207 @@
1#include <linux/fs.h>
2#include <linux/types.h>
3#include "ctree.h"
4#include "disk-io.h"
5#include "btrfs_inode.h"
6#include "print-tree.h"
7#include "export.h"
8#include "compat.h"
9
10#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
11#define FILEID_BTRFS_WITHOUT_PARENT 0x4d
12#define FILEID_BTRFS_WITH_PARENT 0x4e
13#define FILEID_BTRFS_WITH_PARENT_ROOT 0x4f
14#endif
15
16#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, parent_objectid)/4)
17#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, parent_root_objectid)/4)
18#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid)/4)
19
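The divisions by 4 convert byte offsets into the u32-word lengths that the exportfs handle interface works in; assuming the packed struct btrfs_fid from export.h below, the arithmetic works out as:

/*
 * offsetof(struct btrfs_fid, parent_objectid)      =  8 + 8 + 4 = 20 bytes ->  5 words
 * offsetof(struct btrfs_fid, parent_root_objectid) = 20 + 8 + 4 = 32 bytes ->  8 words
 * sizeof(struct btrfs_fid)                         = 32 + 8     = 40 bytes -> 10 words
 */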
20static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
21 int connectable)
22{
23 struct btrfs_fid *fid = (struct btrfs_fid *)fh;
24 struct inode *inode = dentry->d_inode;
25 int len = *max_len;
26 int type;
27
28 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
29 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
30 return 255;
31
32 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
33 type = FILEID_BTRFS_WITHOUT_PARENT;
34
35 fid->objectid = BTRFS_I(inode)->location.objectid;
36 fid->root_objectid = BTRFS_I(inode)->root->objectid;
37 fid->gen = inode->i_generation;
38
39 if (connectable && !S_ISDIR(inode->i_mode)) {
40 struct inode *parent;
41 u64 parent_root_id;
42
43 spin_lock(&dentry->d_lock);
44
45 parent = dentry->d_parent->d_inode;
46 fid->parent_objectid = BTRFS_I(parent)->location.objectid;
47 fid->parent_gen = parent->i_generation;
48 parent_root_id = BTRFS_I(parent)->root->objectid;
49
50 spin_unlock(&dentry->d_lock);
51
52 if (parent_root_id != fid->root_objectid) {
53 fid->parent_root_objectid = parent_root_id;
54 len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
55 type = FILEID_BTRFS_WITH_PARENT_ROOT;
56 } else {
57 len = BTRFS_FID_SIZE_CONNECTABLE;
58 type = FILEID_BTRFS_WITH_PARENT;
59 }
60 }
61
62 *max_len = len;
63 return type;
64}
65
66static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
67 u64 root_objectid, u32 generation)
68{
69 struct btrfs_root *root;
70 struct inode *inode;
71 struct btrfs_key key;
72
73 key.objectid = root_objectid;
74 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
75 key.offset = (u64)-1;
76
77 root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
78 if (IS_ERR(root))
79 return ERR_CAST(root);
80
81 key.objectid = objectid;
82 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
83 key.offset = 0;
84
85 inode = btrfs_iget(sb, &key, root, NULL);
86 if (IS_ERR(inode))
87	return ERR_CAST(inode);
88
89 if (generation != inode->i_generation) {
90 iput(inode);
91 return ERR_PTR(-ESTALE);
92 }
93
94 return d_obtain_alias(inode);
95}
96
97static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
98 int fh_len, int fh_type)
99{
100 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
101 u64 objectid, root_objectid;
102 u32 generation;
103
104 if (fh_type == FILEID_BTRFS_WITH_PARENT) {
105 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE)
106 return NULL;
107 root_objectid = fid->root_objectid;
108 } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
109 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
110 return NULL;
111 root_objectid = fid->parent_root_objectid;
112 } else
113 return NULL;
114
115 objectid = fid->parent_objectid;
116 generation = fid->parent_gen;
117
118 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
119}
120
121static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
122 int fh_len, int fh_type)
123{
124 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
125 u64 objectid, root_objectid;
126 u32 generation;
127
128 if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
129 fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
130 (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
131 fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
132 (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
133 fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
134 return NULL;
135
136 objectid = fid->objectid;
137 root_objectid = fid->root_objectid;
138 generation = fid->gen;
139
140 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
141}
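Combined with the word counts worked out above, the guard in btrfs_fh_to_dentry() accepts exactly three (fh_type, fh_len) pairings and rejects everything else:

/*
 * FILEID_BTRFS_WITHOUT_PARENT   + BTRFS_FID_SIZE_NON_CONNECTABLE  (5 words)
 * FILEID_BTRFS_WITH_PARENT      + BTRFS_FID_SIZE_CONNECTABLE      (8 words)
 * FILEID_BTRFS_WITH_PARENT_ROOT + BTRFS_FID_SIZE_CONNECTABLE_ROOT (10 words)
 */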
142
143static struct dentry *btrfs_get_parent(struct dentry *child)
144{
145 struct inode *dir = child->d_inode;
146 struct btrfs_root *root = BTRFS_I(dir)->root;
147 struct btrfs_key key;
148 struct btrfs_path *path;
149 struct extent_buffer *leaf;
150 int slot;
151 u64 objectid;
152 int ret;
153
154 path = btrfs_alloc_path();
155
156 key.objectid = dir->i_ino;
157 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
158 key.offset = (u64)-1;
159
160 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
161 if (ret < 0) {
162 /* Error */
163 btrfs_free_path(path);
164 return ERR_PTR(ret);
165 }
166 leaf = path->nodes[0];
167 slot = path->slots[0];
168 if (ret) {
169 /* btrfs_search_slot() returns the slot where we'd want to
170 insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
171 The _real_ backref, telling us what the parent inode
172 _actually_ is, will be in the slot _before_ the one
173 that btrfs_search_slot() returns. */
174 if (!slot) {
175 /* Unless there is _no_ key in the tree before... */
176 btrfs_free_path(path);
177 return ERR_PTR(-EIO);
178 }
179 slot--;
180 }
181
182 btrfs_item_key_to_cpu(leaf, &key, slot);
183 btrfs_free_path(path);
184
185 if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
186 return ERR_PTR(-EINVAL);
187
188 objectid = key.offset;
189
190 /* If we are already at the root of a subvol, return the real root */
191 if (objectid == dir->i_ino)
192 return dget(dir->i_sb->s_root);
193
194 /* Build a new key for the inode item */
195 key.objectid = objectid;
196 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
197 key.offset = 0;
198
199 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
200}
201
202const struct export_operations btrfs_export_ops = {
203 .encode_fh = btrfs_encode_fh,
204 .fh_to_dentry = btrfs_fh_to_dentry,
205 .fh_to_parent = btrfs_fh_to_parent,
206 .get_parent = btrfs_get_parent,
207};
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
new file mode 100644
index 000000000000..074348a95841
--- /dev/null
+++ b/fs/btrfs/export.h
@@ -0,0 +1,19 @@
1#ifndef BTRFS_EXPORT_H
2#define BTRFS_EXPORT_H
3
4#include <linux/exportfs.h>
5
6extern const struct export_operations btrfs_export_ops;
7
8struct btrfs_fid {
9 u64 objectid;
10 u64 root_objectid;
11 u32 gen;
12
13 u64 parent_objectid;
14 u32 parent_gen;
15
16 u64 parent_root_objectid;
17} __attribute__ ((packed));
18
19#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
new file mode 100644
index 000000000000..5258923d621f
--- /dev/null
+++ b/fs/btrfs/extent-tree.c
@@ -0,0 +1,4034 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/pagemap.h>
20#include <linux/writeback.h>
21#include <linux/blkdev.h>
22#include "hash.h"
23#include "crc32c.h"
24#include "ctree.h"
25#include "disk-io.h"
26#include "print-tree.h"
27#include "transaction.h"
28#include "volumes.h"
29#include "locking.h"
30#include "ref-cache.h"
31
32#define PENDING_EXTENT_INSERT 0
33#define PENDING_EXTENT_DELETE 1
34#define PENDING_BACKREF_UPDATE 2
35
36struct pending_extent_op {
37 int type;
38 u64 bytenr;
39 u64 num_bytes;
40 u64 parent;
41 u64 orig_parent;
42 u64 generation;
43 u64 orig_generation;
44 int level;
45};
46
47static int finish_current_insert(struct btrfs_trans_handle *trans, struct
48 btrfs_root *extent_root);
49static int del_pending_extents(struct btrfs_trans_handle *trans, struct
50 btrfs_root *extent_root);
51static struct btrfs_block_group_cache *
52__btrfs_find_block_group(struct btrfs_root *root,
53 struct btrfs_block_group_cache *hint,
54 u64 search_start, int data, int owner);
55
56void maybe_lock_mutex(struct btrfs_root *root)
57{
58 if (root != root->fs_info->extent_root &&
59 root != root->fs_info->chunk_root &&
60 root != root->fs_info->dev_root) {
61 mutex_lock(&root->fs_info->alloc_mutex);
62 }
63}
64
65void maybe_unlock_mutex(struct btrfs_root *root)
66{
67 if (root != root->fs_info->extent_root &&
68 root != root->fs_info->chunk_root &&
69 root != root->fs_info->dev_root) {
70 mutex_unlock(&root->fs_info->alloc_mutex);
71 }
72}
73
74static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
75{
76 return (cache->flags & bits) == bits;
77}
78
79/*
80 * this adds the block group to the fs_info rb tree for the block group
81 * cache
82 */
83int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
84 struct btrfs_block_group_cache *block_group)
85{
86 struct rb_node **p;
87 struct rb_node *parent = NULL;
88 struct btrfs_block_group_cache *cache;
89
90 spin_lock(&info->block_group_cache_lock);
91 p = &info->block_group_cache_tree.rb_node;
92
93 while (*p) {
94 parent = *p;
95 cache = rb_entry(parent, struct btrfs_block_group_cache,
96 cache_node);
97 if (block_group->key.objectid < cache->key.objectid) {
98 p = &(*p)->rb_left;
99 } else if (block_group->key.objectid > cache->key.objectid) {
100 p = &(*p)->rb_right;
101 } else {
102 spin_unlock(&info->block_group_cache_lock);
103 return -EEXIST;
104 }
105 }
106
107 rb_link_node(&block_group->cache_node, parent, p);
108 rb_insert_color(&block_group->cache_node,
109 &info->block_group_cache_tree);
110 spin_unlock(&info->block_group_cache_lock);
111
112 return 0;
113}
114
115/*
116 * This will return the block group at or after bytenr if contains is 0, else
117 * it will return the block group that contains the bytenr
118 */
119static struct btrfs_block_group_cache *
120block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
121 int contains)
122{
123 struct btrfs_block_group_cache *cache, *ret = NULL;
124 struct rb_node *n;
125 u64 end, start;
126
127 spin_lock(&info->block_group_cache_lock);
128 n = info->block_group_cache_tree.rb_node;
129
130 while (n) {
131 cache = rb_entry(n, struct btrfs_block_group_cache,
132 cache_node);
133 end = cache->key.objectid + cache->key.offset - 1;
134 start = cache->key.objectid;
135
136 if (bytenr < start) {
137 if (!contains && (!ret || start < ret->key.objectid))
138 ret = cache;
139 n = n->rb_left;
140 } else if (bytenr > start) {
141 if (contains && bytenr <= end) {
142 ret = cache;
143 break;
144 }
145 n = n->rb_right;
146 } else {
147 ret = cache;
148 break;
149 }
150 }
151 spin_unlock(&info->block_group_cache_lock);
152
153 return ret;
154}
155
156/*
157 * this is only called by cache_block_group.  Since we could have freed
158 * extents, we need to check the pinned_extents tree for any that can't be
159 * used yet, as their free space is only released when the transaction commits.
160 */
161static int add_new_free_space(struct btrfs_block_group_cache *block_group,
162 struct btrfs_fs_info *info, u64 start, u64 end)
163{
164 u64 extent_start, extent_end, size;
165 int ret;
166
167 while (start < end) {
168 ret = find_first_extent_bit(&info->pinned_extents, start,
169 &extent_start, &extent_end,
170 EXTENT_DIRTY);
171 if (ret)
172 break;
173
174 if (extent_start == start) {
175 start = extent_end + 1;
176 } else if (extent_start > start && extent_start < end) {
177 size = extent_start - start;
178 ret = btrfs_add_free_space(block_group, start, size);
179 BUG_ON(ret);
180 start = extent_end + 1;
181 } else {
182 break;
183 }
184 }
185
186 if (start < end) {
187 size = end - start;
188 ret = btrfs_add_free_space(block_group, start, size);
189 BUG_ON(ret);
190 }
191
192 return 0;
193}
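A worked example of the carve-out above, with made-up numbers:

/*
 * add_new_free_space(bg, info, 0, 100) while [40, 60] is still pinned:
 * the loop finds that pinned extent, records [0, 40) as free space,
 * and resumes at 61; the tail of the function then records [61, 100).
 * Nothing inside the pinned range reaches the free space cache until
 * the transaction that pinned it commits.
 */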
194
195static int cache_block_group(struct btrfs_root *root,
196 struct btrfs_block_group_cache *block_group)
197{
198 struct btrfs_path *path;
199 int ret = 0;
200 struct btrfs_key key;
201 struct extent_buffer *leaf;
202 int slot;
203 u64 last = 0;
204 u64 first_free;
205 int found = 0;
206
207 if (!block_group)
208 return 0;
209
210 root = root->fs_info->extent_root;
211
212 if (block_group->cached)
213 return 0;
214
215 path = btrfs_alloc_path();
216 if (!path)
217 return -ENOMEM;
218
219 path->reada = 2;
220 /*
221 * we get into deadlocks with paths held by callers of this function.
222 * since the alloc_mutex is protecting things right now, just
223 * skip the locking here
224 */
225 path->skip_locking = 1;
226 first_free = max_t(u64, block_group->key.objectid,
227 BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
228 key.objectid = block_group->key.objectid;
229 key.offset = 0;
230 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
231 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
232 if (ret < 0)
233 goto err;
234 ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
235 if (ret < 0)
236 goto err;
237 if (ret == 0) {
238 leaf = path->nodes[0];
239 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
240 if (key.objectid + key.offset > first_free)
241 first_free = key.objectid + key.offset;
242 }
243 while(1) {
244 leaf = path->nodes[0];
245 slot = path->slots[0];
246 if (slot >= btrfs_header_nritems(leaf)) {
247 ret = btrfs_next_leaf(root, path);
248 if (ret < 0)
249 goto err;
250 if (ret == 0)
251 continue;
252 else
253 break;
254 }
255 btrfs_item_key_to_cpu(leaf, &key, slot);
256 if (key.objectid < block_group->key.objectid)
257 goto next;
258
259 if (key.objectid >= block_group->key.objectid +
260 block_group->key.offset)
261 break;
262
263 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
264 if (!found) {
265 last = first_free;
266 found = 1;
267 }
268
269 add_new_free_space(block_group, root->fs_info, last,
270 key.objectid);
271
272 last = key.objectid + key.offset;
273 }
274next:
275 path->slots[0]++;
276 }
277
278 if (!found)
279 last = first_free;
280
281 add_new_free_space(block_group, root->fs_info, last,
282 block_group->key.objectid +
283 block_group->key.offset);
284
285 block_group->cached = 1;
286 ret = 0;
287err:
288 btrfs_free_path(path);
289 return ret;
290}
291
292/*
293 * return the block group that starts at or after bytenr
294 */
295struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
296 btrfs_fs_info *info,
297 u64 bytenr)
298{
299 struct btrfs_block_group_cache *cache;
300
301 cache = block_group_cache_tree_search(info, bytenr, 0);
302
303 return cache;
304}
305
306/*
307 * return the block group that contains the given bytenr
308 */
309struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
310 btrfs_fs_info *info,
311 u64 bytenr)
312{
313 struct btrfs_block_group_cache *cache;
314
315 cache = block_group_cache_tree_search(info, bytenr, 1);
316
317 return cache;
318}
319
320static int noinline find_free_space(struct btrfs_root *root,
321 struct btrfs_block_group_cache **cache_ret,
322 u64 *start_ret, u64 num, int data)
323{
324 int ret;
325 struct btrfs_block_group_cache *cache = *cache_ret;
326 struct btrfs_free_space *info = NULL;
327 u64 last;
328 u64 total_fs_bytes;
329 u64 search_start = *start_ret;
330
331 WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
332 total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
333
334 if (!cache)
335 goto out;
336
337 last = max(search_start, cache->key.objectid);
338
339again:
340 ret = cache_block_group(root, cache);
341 if (ret)
342 goto out;
343
344 if (cache->ro || !block_group_bits(cache, data))
345 goto new_group;
346
347 info = btrfs_find_free_space(cache, last, num);
348 if (info) {
349 *start_ret = info->offset;
350 return 0;
351 }
352
353new_group:
354 last = cache->key.objectid + cache->key.offset;
355
356 cache = btrfs_lookup_first_block_group(root->fs_info, last);
357 if (!cache || cache->key.objectid >= total_fs_bytes)
358 goto out;
359
360 *cache_ret = cache;
361 goto again;
362
363out:
364 return -ENOSPC;
365}
366
367static u64 div_factor(u64 num, int factor)
368{
369 if (factor == 10)
370 return num;
371 num *= factor;
372 do_div(num, 10);
373 return num;
374}
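div_factor() scales by factor/10 using integer math, for example:

/*
 * div_factor(1073741824, 9):
 *   1073741824 * 9 = 9663676416, and 9663676416 / 10 = 966367641,
 * i.e. 90% of 1 GiB rounded down.  factor == 10 is passed through
 * untouched, so full-size checks cost nothing.
 */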
375
376static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
377 u64 flags)
378{
379 struct list_head *head = &info->space_info;
380 struct list_head *cur;
381 struct btrfs_space_info *found;
382 list_for_each(cur, head) {
383 found = list_entry(cur, struct btrfs_space_info, list);
384 if (found->flags == flags)
385 return found;
386 }
387 return NULL;
388
389}
390
391static struct btrfs_block_group_cache *
392__btrfs_find_block_group(struct btrfs_root *root,
393 struct btrfs_block_group_cache *hint,
394 u64 search_start, int data, int owner)
395{
396 struct btrfs_block_group_cache *cache;
397 struct btrfs_block_group_cache *found_group = NULL;
398 struct btrfs_fs_info *info = root->fs_info;
399 struct btrfs_space_info *sinfo;
400 u64 used;
401 u64 last = 0;
402 u64 free_check;
403 int full_search = 0;
404 int factor = 10;
405 int wrapped = 0;
406
407 if (data & BTRFS_BLOCK_GROUP_METADATA)
408 factor = 9;
409
410 if (search_start) {
411 struct btrfs_block_group_cache *shint;
412 shint = btrfs_lookup_first_block_group(info, search_start);
413 if (shint && block_group_bits(shint, data) && !shint->ro) {
414 spin_lock(&shint->lock);
415 used = btrfs_block_group_used(&shint->item);
416 if (used + shint->pinned <
417 div_factor(shint->key.offset, factor)) {
418 spin_unlock(&shint->lock);
419 return shint;
420 }
421 spin_unlock(&shint->lock);
422 }
423 }
424 if (hint && !hint->ro && block_group_bits(hint, data)) {
425 spin_lock(&hint->lock);
426 used = btrfs_block_group_used(&hint->item);
427 if (used + hint->pinned <
428 div_factor(hint->key.offset, factor)) {
429 spin_unlock(&hint->lock);
430 return hint;
431 }
432 spin_unlock(&hint->lock);
433 last = hint->key.objectid + hint->key.offset;
434 } else {
435 if (hint)
436 last = max(hint->key.objectid, search_start);
437 else
438 last = search_start;
439 }
440 sinfo = __find_space_info(root->fs_info, data);
441 if (!sinfo)
442 goto found;
443again:
444 while(1) {
445 struct list_head *l;
446
447 cache = NULL;
448
449 spin_lock(&sinfo->lock);
450 list_for_each(l, &sinfo->block_groups) {
451 struct btrfs_block_group_cache *entry;
452 entry = list_entry(l, struct btrfs_block_group_cache,
453 list);
454 if ((entry->key.objectid >= last) &&
455 (!cache || (entry->key.objectid <
456 cache->key.objectid)))
457 cache = entry;
458 }
459 spin_unlock(&sinfo->lock);
460
461 if (!cache)
462 break;
463
464 spin_lock(&cache->lock);
465 last = cache->key.objectid + cache->key.offset;
466 used = btrfs_block_group_used(&cache->item);
467
468 if (!cache->ro && block_group_bits(cache, data)) {
469 free_check = div_factor(cache->key.offset, factor);
470 if (used + cache->pinned < free_check) {
471 found_group = cache;
472 spin_unlock(&cache->lock);
473 goto found;
474 }
475 }
476 spin_unlock(&cache->lock);
477 cond_resched();
478 }
479 if (!wrapped) {
480 last = search_start;
481 wrapped = 1;
482 goto again;
483 }
484 if (!full_search && factor < 10) {
485 last = search_start;
486 full_search = 1;
487 factor = 10;
488 goto again;
489 }
490found:
491 return found_group;
492}
493
494struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
495 struct btrfs_block_group_cache
496 *hint, u64 search_start,
497 int data, int owner)
498{
499
500 struct btrfs_block_group_cache *ret;
501 ret = __btrfs_find_block_group(root, hint, search_start, data, owner);
502 return ret;
503}
504
505/* simple helper to search for an existing extent at a given offset */
506int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
507{
508 int ret;
509 struct btrfs_key key;
510 struct btrfs_path *path;
511
512 path = btrfs_alloc_path();
513 BUG_ON(!path);
514 maybe_lock_mutex(root);
515 key.objectid = start;
516 key.offset = len;
517 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
518 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
519 0, 0);
520 maybe_unlock_mutex(root);
521 btrfs_free_path(path);
522 return ret;
523}
524
525/*
526 * Back reference rules. Back refs have three main goals:
527 *
528 * 1) differentiate between all holders of references to an extent so that
529 * when a reference is dropped we can make sure it was a valid reference
530 * before freeing the extent.
531 *
532 * 2) Provide enough information to quickly find the holders of an extent
533 * if we notice a given block is corrupted or bad.
534 *
535 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
536 * maintenance. This is actually the same as #2, but with a slightly
537 * different use case.
538 *
539 * File extents can be referenced by:
540 *
541 * - multiple snapshots, subvolumes, or different generations in one subvol
542 * - different files inside a single subvolume
543 * - different offsets inside a file (bookend extents in file.c)
544 *
545 * The extent ref structure has fields for:
546 *
547 * - Objectid of the subvolume root
548 * - Generation number of the tree holding the reference
549 * - objectid of the file holding the reference
550 * - offset in the file corresponding to the key holding the reference
551 * - number of references held by the parent node (always 1 for tree blocks)
552 *
553 * A btree leaf may hold multiple references to a file extent. In most
554 * cases, these references are from the same file and the corresponding
555 * offsets inside the file are close together. So the inode objectid and
556 * file offset are just hints; they suggest where in the btree the
557 * references can be found and when we can stop searching.
558 *
559 * When a file extent is allocated, the fields are filled in:
560 * (root_key.objectid, trans->transid, inode objectid, offset in file, 1)
561 *
562 * When a leaf is cow'd, new references are added for every file extent found
563 * in the leaf. It looks similar to the create case, but trans->transid will
564 * be different when the block is cow'd.
565 *
566 * (root_key.objectid, trans->transid, inode objectid, offset in file,
567 * number of references in the leaf)
568 *
569 * Because inode objectid and offset in file are just hints, they are not
570 * used when backrefs are deleted. When a file extent is removed either
571 * during snapshot deletion or file truncation, we find the corresponding
572 * back reference and check the following fields:
573 *
574 * (btrfs_header_owner(leaf), btrfs_header_generation(leaf))
575 *
576 * Btree extents can be referenced by:
577 *
578 * - Different subvolumes
579 * - Different generations of the same subvolume
580 *
581 * When a tree block is created, back references are inserted:
582 *
583 * (root->root_key.objectid, trans->transid, level, 0, 1)
584 *
585 * When a tree block is cow'd, new back references are added for all the
586 * blocks it points to. If the tree block isn't in a reference counted root,
587 * the old back references are removed. These new back references are of
588 * the form (trans->transid will have increased since creation):
589 *
590 * (root->root_key.objectid, trans->transid, level, 0, 1)
591 *
592 * When a backref is being deleted, the following fields are checked:
593 *
594 * if backref was for a tree root:
595 * (btrfs_header_owner(itself), btrfs_header_generation(itself))
596 * else
597 * (btrfs_header_owner(parent), btrfs_header_generation(parent))
598 *
599 * Back reference key composition:
600 *
601 * The key objectid corresponds to the first byte in the extent, the key
602 * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
603 * byte of the parent extent. If an extent is a tree root, the key offset
604 * is set to the key objectid.
605 */
606
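To make the key composition concrete, a minimal sketch of the search key that lookup_extent_backref() below assembles (names match that function):

	struct btrfs_key key;

	key.objectid = bytenr;		/* first byte of the extent itself */
	key.type = BTRFS_EXTENT_REF_KEY;
	key.offset = parent;		/* first byte of the referencing block;
					 * equal to bytenr for a tree root */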
607static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
608 struct btrfs_root *root,
609 struct btrfs_path *path, u64 bytenr,
610 u64 parent, u64 ref_root,
611 u64 ref_generation, int del)
612{
613 struct btrfs_key key;
614 struct btrfs_extent_ref *ref;
615 struct extent_buffer *leaf;
616 int ret;
617
618 key.objectid = bytenr;
619 key.type = BTRFS_EXTENT_REF_KEY;
620 key.offset = parent;
621
622 ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
623 if (ret < 0)
624 goto out;
625 if (ret > 0) {
626 ret = -ENOENT;
627 goto out;
628 }
629
630 leaf = path->nodes[0];
631 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
632 if (btrfs_ref_root(leaf, ref) != ref_root ||
633 btrfs_ref_generation(leaf, ref) != ref_generation) {
634 ret = -EIO;
635 WARN_ON(1);
636 goto out;
637 }
638 ret = 0;
639out:
640 return ret;
641}
642
643static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
644 struct btrfs_root *root,
645 struct btrfs_path *path,
646 u64 bytenr, u64 parent,
647 u64 ref_root, u64 ref_generation,
648 u64 owner_objectid, u64 owner_offset)
649{
650 struct btrfs_key key;
651 struct extent_buffer *leaf;
652 struct btrfs_extent_ref *ref;
653 u32 num_refs;
654 int ret;
655
656 key.objectid = bytenr;
657 key.type = BTRFS_EXTENT_REF_KEY;
658 key.offset = parent;
659
660 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
661 if (ret == 0) {
662 leaf = path->nodes[0];
663 ref = btrfs_item_ptr(leaf, path->slots[0],
664 struct btrfs_extent_ref);
665 btrfs_set_ref_root(leaf, ref, ref_root);
666 btrfs_set_ref_generation(leaf, ref, ref_generation);
667 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
668 btrfs_set_ref_offset(leaf, ref, owner_offset);
669 btrfs_set_ref_num_refs(leaf, ref, 1);
670 } else if (ret == -EEXIST) {
671 u64 existing_owner;
672 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
673 leaf = path->nodes[0];
674 ref = btrfs_item_ptr(leaf, path->slots[0],
675 struct btrfs_extent_ref);
676 if (btrfs_ref_root(leaf, ref) != ref_root ||
677 btrfs_ref_generation(leaf, ref) != ref_generation) {
678 ret = -EIO;
679 WARN_ON(1);
680 goto out;
681 }
682
683 num_refs = btrfs_ref_num_refs(leaf, ref);
684 BUG_ON(num_refs == 0);
685 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
686
687 existing_owner = btrfs_ref_objectid(leaf, ref);
688 if (existing_owner == owner_objectid &&
689 btrfs_ref_offset(leaf, ref) > owner_offset) {
690 btrfs_set_ref_offset(leaf, ref, owner_offset);
691 } else if (existing_owner != owner_objectid &&
692 existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
693 btrfs_set_ref_objectid(leaf, ref,
694 BTRFS_MULTIPLE_OBJECTIDS);
695 btrfs_set_ref_offset(leaf, ref, 0);
696 }
697 ret = 0;
698 } else {
699 goto out;
700 }
701 btrfs_mark_buffer_dirty(path->nodes[0]);
702out:
703 btrfs_release_path(root, path);
704 return ret;
705}
706
707static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
708 struct btrfs_root *root,
709 struct btrfs_path *path)
710{
711 struct extent_buffer *leaf;
712 struct btrfs_extent_ref *ref;
713 u32 num_refs;
714 int ret = 0;
715
716 leaf = path->nodes[0];
717 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
718 num_refs = btrfs_ref_num_refs(leaf, ref);
719 BUG_ON(num_refs == 0);
720 num_refs -= 1;
721 if (num_refs == 0) {
722 ret = btrfs_del_item(trans, root, path);
723 } else {
724 btrfs_set_ref_num_refs(leaf, ref, num_refs);
725 btrfs_mark_buffer_dirty(leaf);
726 }
727 btrfs_release_path(root, path);
728 return ret;
729}
730
731static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
732 struct btrfs_root *root, u64 bytenr,
733 u64 orig_parent, u64 parent,
734 u64 orig_root, u64 ref_root,
735 u64 orig_generation, u64 ref_generation,
736 u64 owner_objectid, u64 owner_offset)
737{
738 int ret;
739 struct btrfs_root *extent_root = root->fs_info->extent_root;
740 struct btrfs_path *path;
741
742 if (root == root->fs_info->extent_root) {
743 struct pending_extent_op *extent_op;
744 u64 num_bytes;
745
746 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
747 num_bytes = btrfs_level_size(root, (int)owner_objectid);
748 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
749 bytenr + num_bytes - 1, EXTENT_LOCKED, 0)) {
750 u64 priv;
751 ret = get_state_private(&root->fs_info->extent_ins,
752 bytenr, &priv);
753 BUG_ON(ret);
754 extent_op = (struct pending_extent_op *)
755 (unsigned long)priv;
756 BUG_ON(extent_op->parent != orig_parent);
757 BUG_ON(extent_op->generation != orig_generation);
758 extent_op->parent = parent;
759 extent_op->generation = ref_generation;
760 } else {
761 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
762 BUG_ON(!extent_op);
763
764 extent_op->type = PENDING_BACKREF_UPDATE;
765 extent_op->bytenr = bytenr;
766 extent_op->num_bytes = num_bytes;
767 extent_op->parent = parent;
768 extent_op->orig_parent = orig_parent;
769 extent_op->generation = ref_generation;
770 extent_op->orig_generation = orig_generation;
771 extent_op->level = (int)owner_objectid;
772
773 set_extent_bits(&root->fs_info->extent_ins,
774 bytenr, bytenr + num_bytes - 1,
775 EXTENT_LOCKED, GFP_NOFS);
776 set_state_private(&root->fs_info->extent_ins,
777 bytenr, (unsigned long)extent_op);
778 }
779 return 0;
780 }
781
782 path = btrfs_alloc_path();
783 if (!path)
784 return -ENOMEM;
785 ret = lookup_extent_backref(trans, extent_root, path,
786 bytenr, orig_parent, orig_root,
787 orig_generation, 1);
788 if (ret)
789 goto out;
790 ret = remove_extent_backref(trans, extent_root, path);
791 if (ret)
792 goto out;
793 ret = insert_extent_backref(trans, extent_root, path, bytenr,
794 parent, ref_root, ref_generation,
795 owner_objectid, owner_offset);
796 BUG_ON(ret);
797 finish_current_insert(trans, extent_root);
798 del_pending_extents(trans, extent_root);
799out:
800 btrfs_free_path(path);
801 return ret;
802}
803
804int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
805 struct btrfs_root *root, u64 bytenr,
806 u64 orig_parent, u64 parent,
807 u64 ref_root, u64 ref_generation,
808 u64 owner_objectid, u64 owner_offset)
809{
810 int ret;
811 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
812 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
813 return 0;
814 maybe_lock_mutex(root);
815 ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
816 parent, ref_root, ref_root,
817 ref_generation, ref_generation,
818 owner_objectid, owner_offset);
819 maybe_unlock_mutex(root);
820 return ret;
821}
822
823static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
824 struct btrfs_root *root, u64 bytenr,
825 u64 orig_parent, u64 parent,
826 u64 orig_root, u64 ref_root,
827 u64 orig_generation, u64 ref_generation,
828 u64 owner_objectid, u64 owner_offset)
829{
830 struct btrfs_path *path;
831 int ret;
832 struct btrfs_key key;
833 struct extent_buffer *l;
834 struct btrfs_extent_item *item;
835 u32 refs;
836
837 path = btrfs_alloc_path();
838 if (!path)
839 return -ENOMEM;
840
841 path->reada = 1;
842 key.objectid = bytenr;
843 key.type = BTRFS_EXTENT_ITEM_KEY;
844 key.offset = (u64)-1;
845
846 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
847 0, 1);
848 if (ret < 0)
849 return ret;
850 BUG_ON(ret == 0 || path->slots[0] == 0);
851
852 path->slots[0]--;
853 l = path->nodes[0];
854
855 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
856 BUG_ON(key.objectid != bytenr);
857 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
858
859 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
860 refs = btrfs_extent_refs(l, item);
861 btrfs_set_extent_refs(l, item, refs + 1);
862 btrfs_mark_buffer_dirty(path->nodes[0]);
863
864 btrfs_release_path(root->fs_info->extent_root, path);
865
866 path->reada = 1;
867 ret = insert_extent_backref(trans, root->fs_info->extent_root,
868 path, bytenr, parent,
869 ref_root, ref_generation,
870 owner_objectid, owner_offset);
871 BUG_ON(ret);
872 finish_current_insert(trans, root->fs_info->extent_root);
873 del_pending_extents(trans, root->fs_info->extent_root);
874
875 btrfs_free_path(path);
876 return 0;
877}
878
879int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
880 struct btrfs_root *root,
881 u64 bytenr, u64 num_bytes, u64 parent,
882 u64 ref_root, u64 ref_generation,
883 u64 owner_objectid, u64 owner_offset)
884{
885 int ret;
886 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
887 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
888 return 0;
889 maybe_lock_mutex(root);
890 ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
891 0, ref_root, 0, ref_generation,
892 owner_objectid, owner_offset);
893 maybe_unlock_mutex(root);
894 return ret;
895}
896
897int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
898 struct btrfs_root *root)
899{
900 finish_current_insert(trans, root->fs_info->extent_root);
901 del_pending_extents(trans, root->fs_info->extent_root);
902 return 0;
903}
904
905int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
906 struct btrfs_root *root, u64 bytenr,
907 u64 num_bytes, u32 *refs)
908{
909 struct btrfs_path *path;
910 int ret;
911 struct btrfs_key key;
912 struct extent_buffer *l;
913 struct btrfs_extent_item *item;
914
915 WARN_ON(num_bytes < root->sectorsize);
916 path = btrfs_alloc_path();
917 path->reada = 1;
918 key.objectid = bytenr;
919 key.offset = num_bytes;
920 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
921 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
922 0, 0);
923 if (ret < 0)
924 goto out;
925 if (ret != 0) {
926 btrfs_print_leaf(root, path->nodes[0]);
927 printk("failed to find block number %Lu\n", bytenr);
928 BUG();
929 }
930 l = path->nodes[0];
931 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
932 *refs = btrfs_extent_refs(l, item);
933out:
934 btrfs_free_path(path);
935	return ret;
936}
937
938static int get_reference_status(struct btrfs_root *root, u64 bytenr,
939 u64 parent_gen, u64 ref_objectid,
940 u64 *min_generation, u32 *ref_count)
941{
942 struct btrfs_root *extent_root = root->fs_info->extent_root;
943 struct btrfs_path *path;
944 struct extent_buffer *leaf;
945 struct btrfs_extent_ref *ref_item;
946 struct btrfs_key key;
947 struct btrfs_key found_key;
948 u64 root_objectid = root->root_key.objectid;
949 u64 ref_generation;
950 u32 nritems;
951 int ret;
952
953 key.objectid = bytenr;
954 key.offset = (u64)-1;
955 key.type = BTRFS_EXTENT_ITEM_KEY;
956
957 path = btrfs_alloc_path();
958 mutex_lock(&root->fs_info->alloc_mutex);
959 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
960 if (ret < 0)
961 goto out;
962 BUG_ON(ret == 0);
963	if (path->slots[0] == 0)
964 goto out;
965
966 path->slots[0]--;
967 leaf = path->nodes[0];
968 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
969
970 if (found_key.objectid != bytenr ||
971 found_key.type != BTRFS_EXTENT_ITEM_KEY) {
972 ret = 1;
973 goto out;
974 }
975
976 *ref_count = 0;
977 *min_generation = (u64)-1;
978
979 while (1) {
980 leaf = path->nodes[0];
981 nritems = btrfs_header_nritems(leaf);
982 if (path->slots[0] >= nritems) {
983 ret = btrfs_next_leaf(extent_root, path);
984 if (ret < 0)
985 goto out;
986 if (ret == 0)
987 continue;
988 break;
989 }
990 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
991 if (found_key.objectid != bytenr)
992 break;
993
994 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
995 path->slots[0]++;
996 continue;
997 }
998
999 ref_item = btrfs_item_ptr(leaf, path->slots[0],
1000 struct btrfs_extent_ref);
1001 ref_generation = btrfs_ref_generation(leaf, ref_item);
1002 /*
1003 * For (parent_gen > 0 && parent_gen > ref_generation):
1004 *
1005 * we reach here through the oldest root, therefore
1006	 * all other references from the same snapshot should have
1007 * a larger generation.
1008 */
1009 if ((root_objectid != btrfs_ref_root(leaf, ref_item)) ||
1010 (parent_gen > 0 && parent_gen > ref_generation) ||
1011 (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
1012 ref_objectid != btrfs_ref_objectid(leaf, ref_item))) {
1013 *ref_count = 2;
1014 break;
1015 }
1016
1017 *ref_count = 1;
1018 if (*min_generation > ref_generation)
1019 *min_generation = ref_generation;
1020
1021 path->slots[0]++;
1022 }
1023 ret = 0;
1024out:
1025 mutex_unlock(&root->fs_info->alloc_mutex);
1026 btrfs_free_path(path);
1027 return ret;
1028}
1029
1030int btrfs_cross_ref_exists(struct btrfs_trans_handle *trans,
1031 struct btrfs_root *root,
1032 struct btrfs_key *key, u64 bytenr)
1033{
1034 struct btrfs_root *old_root;
1035 struct btrfs_path *path = NULL;
1036 struct extent_buffer *eb;
1037 struct btrfs_file_extent_item *item;
1038 u64 ref_generation;
1039 u64 min_generation;
1040 u64 extent_start;
1041 u32 ref_count;
1042 int level;
1043 int ret;
1044
1045 BUG_ON(trans == NULL);
1046 BUG_ON(key->type != BTRFS_EXTENT_DATA_KEY);
1047 ret = get_reference_status(root, bytenr, 0, key->objectid,
1048 &min_generation, &ref_count);
1049 if (ret)
1050 return ret;
1051
1052 if (ref_count != 1)
1053 return 1;
1054
1055 old_root = root->dirty_root->root;
1056 ref_generation = old_root->root_key.offset;
1057
1058 /* all references are created in running transaction */
1059 if (min_generation > ref_generation) {
1060 ret = 0;
1061 goto out;
1062 }
1063
1064 path = btrfs_alloc_path();
1065 if (!path) {
1066 ret = -ENOMEM;
1067 goto out;
1068 }
1069
1070 path->skip_locking = 1;
1071 /* if no item found, the extent is referenced by other snapshot */
1072 ret = btrfs_search_slot(NULL, old_root, key, path, 0, 0);
1073 if (ret)
1074 goto out;
1075
1076 eb = path->nodes[0];
1077 item = btrfs_item_ptr(eb, path->slots[0],
1078 struct btrfs_file_extent_item);
1079 if (btrfs_file_extent_type(eb, item) != BTRFS_FILE_EXTENT_REG ||
1080 btrfs_file_extent_disk_bytenr(eb, item) != bytenr) {
1081 ret = 1;
1082 goto out;
1083 }
1084
1085 for (level = BTRFS_MAX_LEVEL - 1; level >= -1; level--) {
1086 if (level >= 0) {
1087 eb = path->nodes[level];
1088 if (!eb)
1089 continue;
1090 extent_start = eb->start;
1091 } else
1092 extent_start = bytenr;
1093
1094 ret = get_reference_status(root, extent_start, ref_generation,
1095 0, &min_generation, &ref_count);
1096 if (ret)
1097 goto out;
1098
1099 if (ref_count != 1) {
1100 ret = 1;
1101 goto out;
1102 }
1103 if (level >= 0)
1104 ref_generation = btrfs_header_generation(eb);
1105 }
1106 ret = 0;
1107out:
1108 if (path)
1109 btrfs_free_path(path);
1110 return ret;
1111}
1112
1113int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1114 struct extent_buffer *buf, u32 nr_extents)
1115{
1116 u32 nritems;
1117 struct btrfs_key key;
1118 struct btrfs_file_extent_item *fi;
1119 int i;
1120 int level;
1121 int ret = 0;
1122
1123 if (!root->ref_cows)
1124 return 0;
1125
1126 level = btrfs_header_level(buf);
1127 nritems = btrfs_header_nritems(buf);
1128
1129 if (level == 0) {
1130 struct btrfs_leaf_ref *ref;
1131 struct btrfs_extent_info *info;
1132
1133 ref = btrfs_alloc_leaf_ref(root, nr_extents);
1134 if (!ref) {
1135 ret = -ENOMEM;
1136 goto out;
1137 }
1138
1139 ref->root_gen = root->root_key.offset;
1140 ref->bytenr = buf->start;
1141 ref->owner = btrfs_header_owner(buf);
1142 ref->generation = btrfs_header_generation(buf);
1143 ref->nritems = nr_extents;
1144 info = ref->extents;
1145
1146 for (i = 0; nr_extents > 0 && i < nritems; i++) {
1147 u64 disk_bytenr;
1148 btrfs_item_key_to_cpu(buf, &key, i);
1149 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1150 continue;
1151 fi = btrfs_item_ptr(buf, i,
1152 struct btrfs_file_extent_item);
1153 if (btrfs_file_extent_type(buf, fi) ==
1154 BTRFS_FILE_EXTENT_INLINE)
1155 continue;
1156 disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1157 if (disk_bytenr == 0)
1158 continue;
1159
1160 info->bytenr = disk_bytenr;
1161 info->num_bytes =
1162 btrfs_file_extent_disk_num_bytes(buf, fi);
1163 info->objectid = key.objectid;
1164 info->offset = key.offset;
1165 info++;
1166 }
1167
1168 BUG_ON(!root->ref_tree);
1169 ret = btrfs_add_leaf_ref(root, ref);
1170 WARN_ON(ret);
1171 btrfs_free_leaf_ref(root, ref);
1172 }
1173out:
1174 return ret;
1175}
1176
1177int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1178 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1179 u32 *nr_extents)
1180{
1181 u64 bytenr;
1182 u64 ref_root;
1183 u64 orig_root;
1184 u64 ref_generation;
1185 u64 orig_generation;
1186 u32 nritems;
1187 u32 nr_file_extents = 0;
1188 struct btrfs_key key;
1189 struct btrfs_file_extent_item *fi;
1190 int i;
1191 int level;
1192 int ret = 0;
1193 int faili = 0;
1194 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1195 u64, u64, u64, u64, u64, u64, u64, u64, u64);
1196
1197 ref_root = btrfs_header_owner(buf);
1198 ref_generation = btrfs_header_generation(buf);
1199 orig_root = btrfs_header_owner(orig_buf);
1200 orig_generation = btrfs_header_generation(orig_buf);
1201
1202 nritems = btrfs_header_nritems(buf);
1203 level = btrfs_header_level(buf);
1204
1205 if (root->ref_cows) {
1206 process_func = __btrfs_inc_extent_ref;
1207 } else {
1208 if (level == 0 &&
1209 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1210 goto out;
1211 if (level != 0 &&
1212 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1213 goto out;
1214 process_func = __btrfs_update_extent_ref;
1215 }
1216
1217 for (i = 0; i < nritems; i++) {
1218 cond_resched();
1219 if (level == 0) {
1220 btrfs_item_key_to_cpu(buf, &key, i);
1221 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1222 continue;
1223 fi = btrfs_item_ptr(buf, i,
1224 struct btrfs_file_extent_item);
1225 if (btrfs_file_extent_type(buf, fi) ==
1226 BTRFS_FILE_EXTENT_INLINE)
1227 continue;
1228 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1229 if (bytenr == 0)
1230 continue;
1231
1232 nr_file_extents++;
1233
1234 maybe_lock_mutex(root);
1235 ret = process_func(trans, root, bytenr,
1236 orig_buf->start, buf->start,
1237 orig_root, ref_root,
1238 orig_generation, ref_generation,
1239 key.objectid, key.offset);
1240 maybe_unlock_mutex(root);
1241
1242 if (ret) {
1243 faili = i;
1244 WARN_ON(1);
1245 goto fail;
1246 }
1247 } else {
1248 bytenr = btrfs_node_blockptr(buf, i);
1249 maybe_lock_mutex(root);
1250 ret = process_func(trans, root, bytenr,
1251 orig_buf->start, buf->start,
1252 orig_root, ref_root,
1253 orig_generation, ref_generation,
1254 level - 1, 0);
1255 maybe_unlock_mutex(root);
1256 if (ret) {
1257 faili = i;
1258 WARN_ON(1);
1259 goto fail;
1260 }
1261 }
1262 }
1263out:
1264 if (nr_extents) {
1265 if (level == 0)
1266 *nr_extents = nr_file_extents;
1267 else
1268 *nr_extents = nritems;
1269 }
1270 return 0;
1271fail:
1272 WARN_ON(1);
1273 return ret;
1274}
1275
1276int btrfs_update_ref(struct btrfs_trans_handle *trans,
1277 struct btrfs_root *root, struct extent_buffer *orig_buf,
1278 struct extent_buffer *buf, int start_slot, int nr)
1279
1280{
1281 u64 bytenr;
1282 u64 ref_root;
1283 u64 orig_root;
1284 u64 ref_generation;
1285 u64 orig_generation;
1286 struct btrfs_key key;
1287 struct btrfs_file_extent_item *fi;
1288 int i;
1289 int ret;
1290 int slot;
1291 int level;
1292
1293 BUG_ON(start_slot < 0);
1294 BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1295
1296 ref_root = btrfs_header_owner(buf);
1297 ref_generation = btrfs_header_generation(buf);
1298 orig_root = btrfs_header_owner(orig_buf);
1299 orig_generation = btrfs_header_generation(orig_buf);
1300 level = btrfs_header_level(buf);
1301
1302 if (!root->ref_cows) {
1303 if (level == 0 &&
1304 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1305 return 0;
1306 if (level != 0 &&
1307 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1308 return 0;
1309 }
1310
1311 for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1312 cond_resched();
1313 if (level == 0) {
1314 btrfs_item_key_to_cpu(buf, &key, slot);
1315 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1316 continue;
1317 fi = btrfs_item_ptr(buf, slot,
1318 struct btrfs_file_extent_item);
1319 if (btrfs_file_extent_type(buf, fi) ==
1320 BTRFS_FILE_EXTENT_INLINE)
1321 continue;
1322 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1323 if (bytenr == 0)
1324 continue;
1325 maybe_lock_mutex(root);
1326 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1327 orig_buf->start, buf->start,
1328 orig_root, ref_root,
1329 orig_generation, ref_generation,
1330 key.objectid, key.offset);
1331 maybe_unlock_mutex(root);
1332 if (ret)
1333 goto fail;
1334 } else {
1335 bytenr = btrfs_node_blockptr(buf, slot);
1336 maybe_lock_mutex(root);
1337 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1338 orig_buf->start, buf->start,
1339 orig_root, ref_root,
1340 orig_generation, ref_generation,
1341 level - 1, 0);
1342 maybe_unlock_mutex(root);
1343 if (ret)
1344 goto fail;
1345 }
1346 }
1347 return 0;
1348fail:
1349 WARN_ON(1);
1350 	return ret;
1351}
1352
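/*
 * write the in-memory copy of a block group item back into the
 * extent tree, then push out any pending extent inserts and deletes
 */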
1353static int write_one_cache_group(struct btrfs_trans_handle *trans,
1354 struct btrfs_root *root,
1355 struct btrfs_path *path,
1356 struct btrfs_block_group_cache *cache)
1357{
1358 int ret;
1359 int pending_ret;
1360 struct btrfs_root *extent_root = root->fs_info->extent_root;
1361 unsigned long bi;
1362 struct extent_buffer *leaf;
1363
1364 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1365 if (ret < 0)
1366 goto fail;
1367 BUG_ON(ret);
1368
1369 leaf = path->nodes[0];
1370 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1371 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1372 btrfs_mark_buffer_dirty(leaf);
1373 btrfs_release_path(extent_root, path);
1374fail:
1375 finish_current_insert(trans, extent_root);
1376 pending_ret = del_pending_extents(trans, extent_root);
1377 if (ret)
1378 return ret;
1379 if (pending_ret)
1380 return pending_ret;
1381 return 0;
1382
1383}
1384
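/*
 * scan the block group cache rb-tree and write out every group that
 * is still marked dirty.  Groups that fail to write stay dirty so a
 * later pass can retry them.
 */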
1385int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1386 struct btrfs_root *root)
1387{
1388 struct btrfs_block_group_cache *cache, *entry;
1389 struct rb_node *n;
1390 int err = 0;
1391 int werr = 0;
1392 struct btrfs_path *path;
1393 u64 last = 0;
1394
1395 path = btrfs_alloc_path();
1396 if (!path)
1397 return -ENOMEM;
1398
1399 mutex_lock(&root->fs_info->alloc_mutex);
1400 while(1) {
1401 cache = NULL;
1402 spin_lock(&root->fs_info->block_group_cache_lock);
1403 for (n = rb_first(&root->fs_info->block_group_cache_tree);
1404 n; n = rb_next(n)) {
1405 entry = rb_entry(n, struct btrfs_block_group_cache,
1406 cache_node);
1407 if (entry->dirty) {
1408 cache = entry;
1409 break;
1410 }
1411 }
1412 spin_unlock(&root->fs_info->block_group_cache_lock);
1413
1414 if (!cache)
1415 break;
1416
1417 last += cache->key.offset;
1418
1419 err = write_one_cache_group(trans, root,
1420 path, cache);
1421 /*
1422 * if we fail to write the cache group, we want
1423 * to keep it marked dirty in hopes that a later
1424 * write will work
1425 */
1426 if (err) {
1427 werr = err;
1428 continue;
1429 }
1430
1431 cache->dirty = 0;
1432 }
1433 btrfs_free_path(path);
1434 mutex_unlock(&root->fs_info->alloc_mutex);
1435 return werr;
1436}
1437
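/*
 * fold the new usage into the space_info matching @flags, allocating
 * and initializing a fresh space_info if none exists yet
 */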
1438static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1439 u64 total_bytes, u64 bytes_used,
1440 struct btrfs_space_info **space_info)
1441{
1442 struct btrfs_space_info *found;
1443
1444 found = __find_space_info(info, flags);
1445 if (found) {
1446 found->total_bytes += total_bytes;
1447 found->bytes_used += bytes_used;
1448 found->full = 0;
1449 *space_info = found;
1450 return 0;
1451 }
1452 found = kmalloc(sizeof(*found), GFP_NOFS);
1453 if (!found)
1454 return -ENOMEM;
1455
1456 list_add(&found->list, &info->space_info);
1457 INIT_LIST_HEAD(&found->block_groups);
1458 spin_lock_init(&found->lock);
1459 found->flags = flags;
1460 found->total_bytes = total_bytes;
1461 found->bytes_used = bytes_used;
1462 found->bytes_pinned = 0;
1463 found->full = 0;
1464 found->force_alloc = 0;
1465 *space_info = found;
1466 return 0;
1467}
1468
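/*
 * record which raid/dup profiles are in use for data, metadata and
 * system chunks so later allocations can pick matching bits
 */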
1469static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1470{
1471 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1472 BTRFS_BLOCK_GROUP_RAID1 |
1473 BTRFS_BLOCK_GROUP_RAID10 |
1474 BTRFS_BLOCK_GROUP_DUP);
1475 if (extra_flags) {
1476 if (flags & BTRFS_BLOCK_GROUP_DATA)
1477 fs_info->avail_data_alloc_bits |= extra_flags;
1478 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1479 fs_info->avail_metadata_alloc_bits |= extra_flags;
1480 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1481 fs_info->avail_system_alloc_bits |= extra_flags;
1482 }
1483}
1484
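/*
 * strip allocation profile bits that can't be honored: raid1/raid0
 * need more than one device, raid10 needs at least four, and
 * redundant combinations (dup with raid1/10, raid0 with any
 * redundancy) are reduced to a single profile
 */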
1485static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1486{
1487 u64 num_devices = root->fs_info->fs_devices->num_devices;
1488
1489 if (num_devices == 1)
1490 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1491 if (num_devices < 4)
1492 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1493
1494 if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1495 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1496 BTRFS_BLOCK_GROUP_RAID10))) {
1497 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1498 }
1499
1500 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1501 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1502 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1503 }
1504
1505 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1506 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1507 (flags & BTRFS_BLOCK_GROUP_RAID10) |
1508 (flags & BTRFS_BLOCK_GROUP_DUP)))
1509 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1510 return flags;
1511}
1512
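/*
 * allocate a new chunk for the given profile if we are forced to, or
 * if used + pinned + the pending allocation would push the space_info
 * past roughly 60% full (div_factor(total_bytes, 6))
 */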
1513static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1514 struct btrfs_root *extent_root, u64 alloc_bytes,
1515 u64 flags, int force)
1516{
1517 struct btrfs_space_info *space_info;
1518 u64 thresh;
1519 u64 start;
1520 u64 num_bytes;
1521 int ret = 0;
1522
1523 flags = reduce_alloc_profile(extent_root, flags);
1524
1525 space_info = __find_space_info(extent_root->fs_info, flags);
1526 if (!space_info) {
1527 ret = update_space_info(extent_root->fs_info, flags,
1528 0, 0, &space_info);
1529 BUG_ON(ret);
1530 }
1531 BUG_ON(!space_info);
1532
1533 if (space_info->force_alloc) {
1534 force = 1;
1535 space_info->force_alloc = 0;
1536 }
1537 if (space_info->full)
1538 goto out;
1539
1540 thresh = div_factor(space_info->total_bytes, 6);
1541 if (!force &&
1542 (space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
1543 thresh)
1544 goto out;
1545
1546 mutex_lock(&extent_root->fs_info->chunk_mutex);
1547 ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
1548 if (ret == -ENOSPC) {
1549 		printk(KERN_INFO "space info full %Lu\n", flags);
1550 space_info->full = 1;
1551 goto out_unlock;
1552 }
1553 BUG_ON(ret);
1554
1555 ret = btrfs_make_block_group(trans, extent_root, 0, flags,
1556 BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
1557 BUG_ON(ret);
1558
1559out_unlock:
1560 mutex_unlock(&extent_root->fs_info->chunk_mutex);
1561out:
1562 return ret;
1563}
1564
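/*
 * adjust the used-byte counts for every block group covering
 * [bytenr, bytenr + num_bytes): add on allocation, subtract on free,
 * and optionally return freed ranges to the free space cache
 */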
1565static int update_block_group(struct btrfs_trans_handle *trans,
1566 struct btrfs_root *root,
1567 u64 bytenr, u64 num_bytes, int alloc,
1568 int mark_free)
1569{
1570 struct btrfs_block_group_cache *cache;
1571 struct btrfs_fs_info *info = root->fs_info;
1572 u64 total = num_bytes;
1573 u64 old_val;
1574 u64 byte_in_group;
1575
1576 WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
1577 while(total) {
1578 cache = btrfs_lookup_block_group(info, bytenr);
1579 if (!cache) {
1580 return -1;
1581 }
1582 byte_in_group = bytenr - cache->key.objectid;
1583 WARN_ON(byte_in_group > cache->key.offset);
1584
1585 spin_lock(&cache->lock);
1586 cache->dirty = 1;
1587 old_val = btrfs_block_group_used(&cache->item);
1588 num_bytes = min(total, cache->key.offset - byte_in_group);
1589 if (alloc) {
1590 old_val += num_bytes;
1591 cache->space_info->bytes_used += num_bytes;
1592 btrfs_set_block_group_used(&cache->item, old_val);
1593 spin_unlock(&cache->lock);
1594 } else {
1595 old_val -= num_bytes;
1596 cache->space_info->bytes_used -= num_bytes;
1597 btrfs_set_block_group_used(&cache->item, old_val);
1598 spin_unlock(&cache->lock);
1599 if (mark_free) {
1600 int ret;
1601 ret = btrfs_add_free_space(cache, bytenr,
1602 num_bytes);
1603 if (ret)
1604 return -1;
1605 }
1606 }
1607 total -= num_bytes;
1608 bytenr += num_bytes;
1609 }
1610 return 0;
1611}
1612
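/* logical start of the block group that covers (or follows) search_start */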
1613static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1614{
1615 struct btrfs_block_group_cache *cache;
1616
1617 cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
1618 if (!cache)
1619 return 0;
1620
1621 return cache->key.objectid;
1622}
1623
1624
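/*
 * pin or unpin a byte range: update the pinned_extents tree and keep
 * the per-block-group and per-space-info pinned counters in sync
 */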
1625int btrfs_update_pinned_extents(struct btrfs_root *root,
1626 u64 bytenr, u64 num, int pin)
1627{
1628 u64 len;
1629 struct btrfs_block_group_cache *cache;
1630 struct btrfs_fs_info *fs_info = root->fs_info;
1631
1632 WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
1633 if (pin) {
1634 set_extent_dirty(&fs_info->pinned_extents,
1635 bytenr, bytenr + num - 1, GFP_NOFS);
1636 } else {
1637 clear_extent_dirty(&fs_info->pinned_extents,
1638 bytenr, bytenr + num - 1, GFP_NOFS);
1639 }
1640 while (num > 0) {
1641 cache = btrfs_lookup_block_group(fs_info, bytenr);
1642 if (!cache) {
1643 u64 first = first_logical_byte(root, bytenr);
1644 WARN_ON(first < bytenr);
1645 len = min(first - bytenr, num);
1646 } else {
1647 len = min(num, cache->key.offset -
1648 (bytenr - cache->key.objectid));
1649 }
1650 if (pin) {
1651 if (cache) {
1652 spin_lock(&cache->lock);
1653 cache->pinned += len;
1654 cache->space_info->bytes_pinned += len;
1655 spin_unlock(&cache->lock);
1656 }
1657 fs_info->total_pinned += len;
1658 } else {
1659 if (cache) {
1660 spin_lock(&cache->lock);
1661 cache->pinned -= len;
1662 cache->space_info->bytes_pinned -= len;
1663 spin_unlock(&cache->lock);
1664 }
1665 fs_info->total_pinned -= len;
1666 }
1667 bytenr += len;
1668 num -= len;
1669 }
1670 return 0;
1671}
1672
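/*
 * mirror the currently pinned extents into @copy so a later pass
 * (see btrfs_finish_extent_commit below) can unpin exactly this set
 * once the commit finishes
 */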
1673int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
1674{
1675 u64 last = 0;
1676 u64 start;
1677 u64 end;
1678 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
1679 int ret;
1680
1681 while(1) {
1682 ret = find_first_extent_bit(pinned_extents, last,
1683 &start, &end, EXTENT_DIRTY);
1684 if (ret)
1685 break;
1686 set_extent_dirty(copy, start, end, GFP_NOFS);
1687 last = end + 1;
1688 }
1689 return 0;
1690}
1691
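/*
 * called after a transaction commits: unpin every extent recorded in
 * @unpin and return the space to each block group's free space cache
 */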
1692int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1693 struct btrfs_root *root,
1694 struct extent_io_tree *unpin)
1695{
1696 u64 start;
1697 u64 end;
1698 int ret;
1699 struct btrfs_block_group_cache *cache;
1700
1701 mutex_lock(&root->fs_info->alloc_mutex);
1702 while(1) {
1703 ret = find_first_extent_bit(unpin, 0, &start, &end,
1704 EXTENT_DIRTY);
1705 if (ret)
1706 break;
1707 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
1708 clear_extent_dirty(unpin, start, end, GFP_NOFS);
1709 cache = btrfs_lookup_block_group(root->fs_info, start);
1710 if (cache->cached)
1711 btrfs_add_free_space(cache, start, end - start + 1);
1712 if (need_resched()) {
1713 mutex_unlock(&root->fs_info->alloc_mutex);
1714 cond_resched();
1715 mutex_lock(&root->fs_info->alloc_mutex);
1716 }
1717 }
1718 mutex_unlock(&root->fs_info->alloc_mutex);
1719 return 0;
1720}
1721
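/*
 * process the pending extent operations queued in extent_ins:
 * PENDING_EXTENT_INSERT adds the extent item plus its backref,
 * PENDING_BACKREF_UPDATE rekeys an existing backref to the new
 * parent and refreshes its generation
 */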
1722static int finish_current_insert(struct btrfs_trans_handle *trans,
1723 struct btrfs_root *extent_root)
1724{
1725 u64 start;
1726 u64 end;
1727 u64 priv;
1728 struct btrfs_fs_info *info = extent_root->fs_info;
1729 struct btrfs_path *path;
1730 struct btrfs_extent_ref *ref;
1731 struct pending_extent_op *extent_op;
1732 struct btrfs_key key;
1733 struct btrfs_extent_item extent_item;
1734 int ret;
1735 int err = 0;
1736
1737 WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
1738 btrfs_set_stack_extent_refs(&extent_item, 1);
1739 path = btrfs_alloc_path();
1740
1741 while(1) {
1742 ret = find_first_extent_bit(&info->extent_ins, 0, &start,
1743 &end, EXTENT_LOCKED);
1744 if (ret)
1745 break;
1746
1747 ret = get_state_private(&info->extent_ins, start, &priv);
1748 BUG_ON(ret);
1749 extent_op = (struct pending_extent_op *)(unsigned long)priv;
1750
1751 if (extent_op->type == PENDING_EXTENT_INSERT) {
1752 key.objectid = start;
1753 key.offset = end + 1 - start;
1754 key.type = BTRFS_EXTENT_ITEM_KEY;
1755 err = btrfs_insert_item(trans, extent_root, &key,
1756 &extent_item, sizeof(extent_item));
1757 BUG_ON(err);
1758
1759 clear_extent_bits(&info->extent_ins, start, end,
1760 EXTENT_LOCKED, GFP_NOFS);
1761
1762 err = insert_extent_backref(trans, extent_root, path,
1763 start, extent_op->parent,
1764 extent_root->root_key.objectid,
1765 extent_op->generation,
1766 extent_op->level, 0);
1767 BUG_ON(err);
1768 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
1769 err = lookup_extent_backref(trans, extent_root, path,
1770 start, extent_op->orig_parent,
1771 extent_root->root_key.objectid,
1772 extent_op->orig_generation, 0);
1773 BUG_ON(err);
1774
1775 clear_extent_bits(&info->extent_ins, start, end,
1776 EXTENT_LOCKED, GFP_NOFS);
1777
1778 key.objectid = start;
1779 key.offset = extent_op->parent;
1780 key.type = BTRFS_EXTENT_REF_KEY;
1781 err = btrfs_set_item_key_safe(trans, extent_root, path,
1782 &key);
1783 BUG_ON(err);
1784 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
1785 struct btrfs_extent_ref);
1786 btrfs_set_ref_generation(path->nodes[0], ref,
1787 extent_op->generation);
1788 btrfs_mark_buffer_dirty(path->nodes[0]);
1789 btrfs_release_path(extent_root, path);
1790 } else {
1791 BUG_ON(1);
1792 }
1793 kfree(extent_op);
1794
1795 if (need_resched()) {
1796 mutex_unlock(&extent_root->fs_info->alloc_mutex);
1797 cond_resched();
1798 mutex_lock(&extent_root->fs_info->alloc_mutex);
1799 }
1800 }
1801 btrfs_free_path(path);
1802 return 0;
1803}
1804
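/*
 * pin a byte range so it can't be reused before the transaction
 * commits.  An unwritten metadata block from the current transaction
 * (outside the tree log) is cleaned and handed back for immediate
 * reuse instead; in that case 1 is returned.
 */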
1805static int pin_down_bytes(struct btrfs_trans_handle *trans,
1806 struct btrfs_root *root,
1807 u64 bytenr, u64 num_bytes, int is_data)
1808{
1809 int err = 0;
1810 struct extent_buffer *buf;
1811
1812 WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
1813 if (is_data)
1814 goto pinit;
1815
1816 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
1817 if (!buf)
1818 goto pinit;
1819
1820 /* we can reuse a block if it hasn't been written
1821 * and it is from this transaction. We can't
1822 * reuse anything from the tree log root because
1823 * it has tiny sub-transactions.
1824 */
1825 if (btrfs_buffer_uptodate(buf, 0) &&
1826 btrfs_try_tree_lock(buf)) {
1827 u64 header_owner = btrfs_header_owner(buf);
1828 u64 header_transid = btrfs_header_generation(buf);
1829 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
1830 header_transid == trans->transid &&
1831 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
1832 clean_tree_block(NULL, root, buf);
1833 btrfs_tree_unlock(buf);
1834 free_extent_buffer(buf);
1835 return 1;
1836 }
1837 btrfs_tree_unlock(buf);
1838 }
1839 free_extent_buffer(buf);
1840pinit:
1841 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
1842
1843 BUG_ON(err < 0);
1844 return 0;
1845}
1846
1847/*
1848 * remove an extent from the root, returns 0 on success
1849 */
1850static int __free_extent(struct btrfs_trans_handle *trans,
1851 struct btrfs_root *root,
1852 u64 bytenr, u64 num_bytes, u64 parent,
1853 u64 root_objectid, u64 ref_generation,
1854 u64 owner_objectid, u64 owner_offset,
1855 int pin, int mark_free)
1856{
1857 struct btrfs_path *path;
1858 struct btrfs_key key;
1859 struct btrfs_fs_info *info = root->fs_info;
1860 struct btrfs_root *extent_root = info->extent_root;
1861 struct extent_buffer *leaf;
1862 int ret;
1863 int extent_slot = 0;
1864 int found_extent = 0;
1865 int num_to_del = 1;
1866 struct btrfs_extent_item *ei;
1867 u32 refs;
1868
1869 WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
1870 key.objectid = bytenr;
1871 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1872 key.offset = num_bytes;
1873 path = btrfs_alloc_path();
1874 if (!path)
1875 return -ENOMEM;
1876
1877 path->reada = 1;
1878 ret = lookup_extent_backref(trans, extent_root, path, bytenr, parent,
1879 root_objectid, ref_generation, 1);
1880 if (ret == 0) {
1881 struct btrfs_key found_key;
1882 extent_slot = path->slots[0];
1883 while(extent_slot > 0) {
1884 extent_slot--;
1885 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1886 extent_slot);
1887 if (found_key.objectid != bytenr)
1888 break;
1889 if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
1890 found_key.offset == num_bytes) {
1891 found_extent = 1;
1892 break;
1893 }
1894 if (path->slots[0] - extent_slot > 5)
1895 break;
1896 }
1897 if (!found_extent) {
1898 ret = remove_extent_backref(trans, extent_root, path);
1899 BUG_ON(ret);
1900 btrfs_release_path(extent_root, path);
1901 ret = btrfs_search_slot(trans, extent_root,
1902 &key, path, -1, 1);
1903 BUG_ON(ret);
1904 extent_slot = path->slots[0];
1905 }
1906 } else {
1907 btrfs_print_leaf(extent_root, path->nodes[0]);
1908 WARN_ON(1);
1909 		printk(KERN_ERR "Unable to find ref byte nr %Lu root %Lu "
1910 		       "gen %Lu owner %Lu offset %Lu\n", bytenr,
1911 root_objectid, ref_generation, owner_objectid,
1912 owner_offset);
1913 }
1914
1915 leaf = path->nodes[0];
1916 ei = btrfs_item_ptr(leaf, extent_slot,
1917 struct btrfs_extent_item);
1918 refs = btrfs_extent_refs(leaf, ei);
1919 BUG_ON(refs == 0);
1920 refs -= 1;
1921 btrfs_set_extent_refs(leaf, ei, refs);
1922
1923 btrfs_mark_buffer_dirty(leaf);
1924
1925 if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
1926 struct btrfs_extent_ref *ref;
1927 ref = btrfs_item_ptr(leaf, path->slots[0],
1928 struct btrfs_extent_ref);
1929 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
1930 /* if the back ref and the extent are next to each other
1931 * they get deleted below in one shot
1932 */
1933 path->slots[0] = extent_slot;
1934 num_to_del = 2;
1935 } else if (found_extent) {
1936 /* otherwise delete the extent back ref */
1937 ret = remove_extent_backref(trans, extent_root, path);
1938 BUG_ON(ret);
1939 /* if refs are 0, we need to setup the path for deletion */
1940 if (refs == 0) {
1941 btrfs_release_path(extent_root, path);
1942 ret = btrfs_search_slot(trans, extent_root, &key, path,
1943 -1, 1);
1944 BUG_ON(ret);
1945 }
1946 }
1947
1948 if (refs == 0) {
1949 u64 super_used;
1950 u64 root_used;
1951#ifdef BIO_RW_DISCARD
1952 u64 map_length = num_bytes;
1953 struct btrfs_multi_bio *multi = NULL;
1954#endif
1955
1956 if (pin) {
1957 ret = pin_down_bytes(trans, root, bytenr, num_bytes,
1958 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
1959 if (ret > 0)
1960 mark_free = 1;
1961 BUG_ON(ret < 0);
1962 }
1963
1964 /* block accounting for super block */
1965 spin_lock_irq(&info->delalloc_lock);
1966 super_used = btrfs_super_bytes_used(&info->super_copy);
1967 btrfs_set_super_bytes_used(&info->super_copy,
1968 super_used - num_bytes);
1969 spin_unlock_irq(&info->delalloc_lock);
1970
1971 /* block accounting for root item */
1972 root_used = btrfs_root_used(&root->root_item);
1973 btrfs_set_root_used(&root->root_item,
1974 root_used - num_bytes);
1975 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
1976 num_to_del);
1977 BUG_ON(ret);
1978 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
1979 mark_free);
1980 BUG_ON(ret);
1981
1982#ifdef BIO_RW_DISCARD
1983 /* Tell the block device(s) that the sectors can be discarded */
1984 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
1985 bytenr, &map_length, &multi, 0);
1986 if (!ret) {
1987 struct btrfs_bio_stripe *stripe = multi->stripes;
1988 int i;
1989
1990 if (map_length > num_bytes)
1991 map_length = num_bytes;
1992
1993 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1994 blkdev_issue_discard(stripe->dev->bdev,
1995 stripe->physical >> 9,
1996 map_length >> 9);
1997 }
1998 kfree(multi);
1999 }
2000#endif
2001 }
2002 btrfs_free_path(path);
2003 finish_current_insert(trans, extent_root);
2004 return ret;
2005}
2006
2007/*
2008  * find all the extents marked as pending deletion and remove
2009  * them from the extent tree
2010 */
2011static int del_pending_extents(struct btrfs_trans_handle *trans, struct
2012 btrfs_root *extent_root)
2013{
2014 int ret;
2015 int err = 0;
2016 int mark_free = 0;
2017 u64 start;
2018 u64 end;
2019 u64 priv;
2020 struct extent_io_tree *pending_del;
2021 struct extent_io_tree *extent_ins;
2022 struct pending_extent_op *extent_op;
2023
2024 WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
2025 extent_ins = &extent_root->fs_info->extent_ins;
2026 pending_del = &extent_root->fs_info->pending_del;
2027
2028 while(1) {
2029 ret = find_first_extent_bit(pending_del, 0, &start, &end,
2030 EXTENT_LOCKED);
2031 if (ret)
2032 break;
2033
2034 ret = get_state_private(pending_del, start, &priv);
2035 BUG_ON(ret);
2036 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2037
2038 clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
2039 GFP_NOFS);
2040
2041 ret = pin_down_bytes(trans, extent_root, start,
2042 end + 1 - start, 0);
2043 mark_free = ret > 0;
2044 if (!test_range_bit(extent_ins, start, end,
2045 EXTENT_LOCKED, 0)) {
2046free_extent:
2047 ret = __free_extent(trans, extent_root,
2048 start, end + 1 - start,
2049 extent_op->orig_parent,
2050 extent_root->root_key.objectid,
2051 extent_op->orig_generation,
2052 extent_op->level, 0, 0, mark_free);
2053 kfree(extent_op);
2054 } else {
2055 kfree(extent_op);
2056 ret = get_state_private(extent_ins, start, &priv);
2057 BUG_ON(ret);
2058 extent_op = (struct pending_extent_op *)
2059 (unsigned long)priv;
2060
2061 clear_extent_bits(extent_ins, start, end,
2062 EXTENT_LOCKED, GFP_NOFS);
2063
2064 if (extent_op->type == PENDING_BACKREF_UPDATE)
2065 goto free_extent;
2066
2067 ret = update_block_group(trans, extent_root, start,
2068 end + 1 - start, 0, mark_free);
2069 BUG_ON(ret);
2070 kfree(extent_op);
2071 }
2072 if (ret)
2073 err = ret;
2074
2075 if (need_resched()) {
2076 mutex_unlock(&extent_root->fs_info->alloc_mutex);
2077 cond_resched();
2078 mutex_lock(&extent_root->fs_info->alloc_mutex);
2079 }
2080 }
2081 return err;
2082}
2083
2084/*
2085 * remove an extent from the root, returns 0 on success
2086 */
2087static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2088 struct btrfs_root *root,
2089 u64 bytenr, u64 num_bytes, u64 parent,
2090 u64 root_objectid, u64 ref_generation,
2091 u64 owner_objectid, u64 owner_offset, int pin)
2092{
2093 struct btrfs_root *extent_root = root->fs_info->extent_root;
2094 int pending_ret;
2095 int ret;
2096
2097 WARN_ON(num_bytes < root->sectorsize);
2098 if (root == extent_root) {
2099 struct pending_extent_op *extent_op;
2100
2101 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2102 BUG_ON(!extent_op);
2103
2104 extent_op->type = PENDING_EXTENT_DELETE;
2105 extent_op->bytenr = bytenr;
2106 extent_op->num_bytes = num_bytes;
2107 extent_op->parent = parent;
2108 extent_op->orig_parent = parent;
2109 extent_op->generation = ref_generation;
2110 extent_op->orig_generation = ref_generation;
2111 extent_op->level = (int)owner_objectid;
2112
2113 set_extent_bits(&root->fs_info->pending_del,
2114 bytenr, bytenr + num_bytes - 1,
2115 EXTENT_LOCKED, GFP_NOFS);
2116 set_state_private(&root->fs_info->pending_del,
2117 bytenr, (unsigned long)extent_op);
2118 return 0;
2119 }
2120 /* if metadata always pin */
2121 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2122 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2123 struct btrfs_block_group_cache *cache;
2124
2125 /* btrfs_free_reserved_extent */
2126 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2127 BUG_ON(!cache);
2128 btrfs_add_free_space(cache, bytenr, num_bytes);
2129 return 0;
2130 }
2131 pin = 1;
2132 }
2133
2134 /* if data pin when any transaction has committed this */
2135 if (ref_generation != trans->transid)
2136 pin = 1;
2137
2138 ret = __free_extent(trans, root, bytenr, num_bytes, parent,
2139 root_objectid, ref_generation, owner_objectid,
2140 owner_offset, pin, pin == 0);
2141
2142 finish_current_insert(trans, root->fs_info->extent_root);
2143 pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
2144 return ret ? ret : pending_ret;
2145}
2146
2147int btrfs_free_extent(struct btrfs_trans_handle *trans,
2148 struct btrfs_root *root,
2149 u64 bytenr, u64 num_bytes, u64 parent,
2150 u64 root_objectid, u64 ref_generation,
2151 u64 owner_objectid, u64 owner_offset, int pin)
2152{
2153 int ret;
2154
2155 maybe_lock_mutex(root);
2156 ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
2157 root_objectid, ref_generation,
2158 owner_objectid, owner_offset, pin);
2159 maybe_unlock_mutex(root);
2160 return ret;
2161}
2162
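/* round val up to the next stripesize boundary */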
2163static u64 stripe_align(struct btrfs_root *root, u64 val)
2164{
2165 u64 mask = ((u64)root->stripesize - 1);
2166 u64 ret = (val + mask) & ~mask;
2167 return ret;
2168}
2169
2170/*
2171  * walks the btree of allocated extents and finds a hole of a given size.
2172 * The key ins is changed to record the hole:
2173 * ins->objectid == block start
2174 * ins->flags = BTRFS_EXTENT_ITEM_KEY
2175 * ins->offset == number of blocks
2176 * Any available blocks before search_start are skipped.
2177 */
2178static int noinline find_free_extent(struct btrfs_trans_handle *trans,
2179 struct btrfs_root *orig_root,
2180 u64 num_bytes, u64 empty_size,
2181 u64 search_start, u64 search_end,
2182 u64 hint_byte, struct btrfs_key *ins,
2183 u64 exclude_start, u64 exclude_nr,
2184 int data)
2185{
2186 int ret;
2187 u64 orig_search_start;
2188 	struct btrfs_root *root = orig_root->fs_info->extent_root;
2189 struct btrfs_fs_info *info = root->fs_info;
2190 u64 total_needed = num_bytes;
2191 u64 *last_ptr = NULL;
2192 struct btrfs_block_group_cache *block_group;
2193 int chunk_alloc_done = 0;
2194 int empty_cluster = 2 * 1024 * 1024;
2195 int allowed_chunk_alloc = 0;
2196
2197 WARN_ON(num_bytes < root->sectorsize);
2198 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2199
2200 if (orig_root->ref_cows || empty_size)
2201 allowed_chunk_alloc = 1;
2202
2203 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2204 last_ptr = &root->fs_info->last_alloc;
2205 empty_cluster = 256 * 1024;
2206 }
2207
2208 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
2209 last_ptr = &root->fs_info->last_data_alloc;
2210
2211 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2212 last_ptr = &root->fs_info->last_log_alloc;
2213 		if (*last_ptr == 0 && root->fs_info->last_alloc) {
2214 *last_ptr = root->fs_info->last_alloc + empty_cluster;
2215 }
2216 }
2217
2218 if (last_ptr) {
2219 if (*last_ptr)
2220 hint_byte = *last_ptr;
2221 else
2222 empty_size += empty_cluster;
2223 }
2224
2225 search_start = max(search_start, first_logical_byte(root, 0));
2226 orig_search_start = search_start;
2227
2228 if (search_end == (u64)-1)
2229 search_end = btrfs_super_total_bytes(&info->super_copy);
2230
2231 search_start = max(search_start, hint_byte);
2232 total_needed += empty_size;
2233
2234new_group:
2235 block_group = btrfs_lookup_block_group(info, search_start);
2236
2237 /*
2238 	 * Ok this looks a little tricky, but it's really simple. First, if we
2239 	 * didn't find a block group, obviously we want to start over.
2240 	 * Secondly, if the block group we found does not match the type we
2241 	 * need, and we have a last_ptr and it's not 0, chances are the last
2242 	 * allocation we made was at the end of the block group, so let's go
2243 	 * ahead and skip looking through the rest of the block groups and
2244 * start at the beginning. This helps with metadata allocations,
2245 * since you are likely to have a bunch of data block groups to search
2246 * through first before you realize that you need to start over, so go
2247 * ahead and start over and save the time.
2248 */
2249 if (!block_group || (!block_group_bits(block_group, data) &&
2250 last_ptr && *last_ptr)) {
2251 if (search_start != orig_search_start) {
2252 if (last_ptr && *last_ptr)
2253 *last_ptr = 0;
2254 search_start = orig_search_start;
2255 goto new_group;
2256 } else if (!chunk_alloc_done && allowed_chunk_alloc) {
2257 ret = do_chunk_alloc(trans, root,
2258 num_bytes + 2 * 1024 * 1024,
2259 data, 1);
2260 			if (ret < 0)
2261 				goto error;
2266 BUG_ON(ret);
2267 chunk_alloc_done = 1;
2268 search_start = orig_search_start;
2269 goto new_group;
2270 } else {
2271 ret = -ENOSPC;
2272 goto error;
2273 }
2274 }
2275
2276 /*
2277 	 * this is going to search through all of the existing block groups it
2278 * can find, so if we don't find something we need to see if we can
2279 * allocate what we need.
2280 */
2281 ret = find_free_space(root, &block_group, &search_start,
2282 total_needed, data);
2283 if (ret == -ENOSPC) {
2284 /*
2285 * instead of allocating, start at the original search start
2286 		 * and see if there is something to be found; if not, then we
2287 * allocate
2288 */
2289 if (search_start != orig_search_start) {
2290 if (last_ptr && *last_ptr) {
2291 *last_ptr = 0;
2292 total_needed += empty_cluster;
2293 }
2294 search_start = orig_search_start;
2295 goto new_group;
2296 }
2297
2298 /*
2299 * we've already allocated, we're pretty screwed
2300 */
2301 if (chunk_alloc_done) {
2302 goto error;
2303 } else if (!allowed_chunk_alloc && block_group &&
2304 block_group_bits(block_group, data)) {
2305 block_group->space_info->force_alloc = 1;
2306 goto error;
2307 } else if (!allowed_chunk_alloc) {
2308 goto error;
2309 }
2310
2311 ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024,
2312 data, 1);
2313 if (ret < 0)
2314 goto error;
2315
2316 BUG_ON(ret);
2317 chunk_alloc_done = 1;
2318 if (block_group)
2319 search_start = block_group->key.objectid +
2320 block_group->key.offset;
2321 else
2322 search_start = orig_search_start;
2323 goto new_group;
2324 }
2325
2326 if (ret)
2327 goto error;
2328
2329 search_start = stripe_align(root, search_start);
2330 ins->objectid = search_start;
2331 ins->offset = num_bytes;
2332
2333 if (ins->objectid + num_bytes >= search_end) {
2334 search_start = orig_search_start;
2335 if (chunk_alloc_done) {
2336 ret = -ENOSPC;
2337 goto error;
2338 }
2339 goto new_group;
2340 }
2341
2342 if (ins->objectid + num_bytes >
2343 block_group->key.objectid + block_group->key.offset) {
2344 if (search_start == orig_search_start && chunk_alloc_done) {
2345 ret = -ENOSPC;
2346 goto error;
2347 }
2348 search_start = block_group->key.objectid +
2349 block_group->key.offset;
2350 goto new_group;
2351 }
2352
2353 if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start &&
2354 ins->objectid < exclude_start + exclude_nr)) {
2355 search_start = exclude_start + exclude_nr;
2356 goto new_group;
2357 }
2358
2359 if (!(data & BTRFS_BLOCK_GROUP_DATA))
2360 trans->block_group = block_group;
2361
2362 ins->offset = num_bytes;
2363 if (last_ptr) {
2364 *last_ptr = ins->objectid + ins->offset;
2365 if (*last_ptr ==
2366 btrfs_super_total_bytes(&root->fs_info->super_copy))
2367 *last_ptr = 0;
2368 }
2369
2370 ret = 0;
2371error:
2372 return ret;
2373}
2374
2375static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
2376{
2377 struct btrfs_block_group_cache *cache;
2378 struct list_head *l;
2379
2380 printk(KERN_INFO "space_info has %Lu free, is %sfull\n",
2381 info->total_bytes - info->bytes_used - info->bytes_pinned,
2382 (info->full) ? "" : "not ");
2383
2384 spin_lock(&info->lock);
2385 list_for_each(l, &info->block_groups) {
2386 cache = list_entry(l, struct btrfs_block_group_cache, list);
2387 spin_lock(&cache->lock);
2388 printk(KERN_INFO "block group %Lu has %Lu bytes, %Lu used "
2389 "%Lu pinned\n",
2390 cache->key.objectid, cache->key.offset,
2391 btrfs_block_group_used(&cache->item), cache->pinned);
2392 btrfs_dump_free_space(cache, bytes);
2393 spin_unlock(&cache->lock);
2394 }
2395 spin_unlock(&info->lock);
2396}
2397static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2398 struct btrfs_root *root,
2399 u64 num_bytes, u64 min_alloc_size,
2400 u64 empty_size, u64 hint_byte,
2401 u64 search_end, struct btrfs_key *ins,
2402 u64 data)
2403{
2404 int ret;
2405 u64 search_start = 0;
2406 u64 alloc_profile;
2407 struct btrfs_fs_info *info = root->fs_info;
2408 struct btrfs_block_group_cache *cache;
2409
2410 if (data) {
2411 alloc_profile = info->avail_data_alloc_bits &
2412 info->data_alloc_profile;
2413 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2414 } else if (root == root->fs_info->chunk_root) {
2415 alloc_profile = info->avail_system_alloc_bits &
2416 info->system_alloc_profile;
2417 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2418 } else {
2419 alloc_profile = info->avail_metadata_alloc_bits &
2420 info->metadata_alloc_profile;
2421 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2422 }
2423again:
2424 data = reduce_alloc_profile(root, data);
2425 /*
2426 * the only place that sets empty_size is btrfs_realloc_node, which
2427 * is not called recursively on allocations
2428 */
2429 if (empty_size || root->ref_cows) {
2430 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
2431 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2432 2 * 1024 * 1024,
2433 BTRFS_BLOCK_GROUP_METADATA |
2434 (info->metadata_alloc_profile &
2435 info->avail_metadata_alloc_bits), 0);
2436 }
2437 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2438 num_bytes + 2 * 1024 * 1024, data, 0);
2439 }
2440
2441 WARN_ON(num_bytes < root->sectorsize);
2442 ret = find_free_extent(trans, root, num_bytes, empty_size,
2443 search_start, search_end, hint_byte, ins,
2444 trans->alloc_exclude_start,
2445 trans->alloc_exclude_nr, data);
2446
2447 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
2448 num_bytes = num_bytes >> 1;
2449 		num_bytes = num_bytes & ~((u64)root->sectorsize - 1);
2450 num_bytes = max(num_bytes, min_alloc_size);
2451 do_chunk_alloc(trans, root->fs_info->extent_root,
2452 num_bytes, data, 1);
2453 goto again;
2454 }
2455 if (ret) {
2456 struct btrfs_space_info *sinfo;
2457
2458 sinfo = __find_space_info(root->fs_info, data);
2459 printk("allocation failed flags %Lu, wanted %Lu\n",
2460 data, num_bytes);
2461 dump_space_info(sinfo, num_bytes);
2462 BUG();
2463 }
2464 cache = btrfs_lookup_block_group(root->fs_info, ins->objectid);
2465 if (!cache) {
2466 printk(KERN_ERR "Unable to find block group for %Lu\n", ins->objectid);
2467 return -ENOSPC;
2468 }
2469
2470 ret = btrfs_remove_free_space(cache, ins->objectid, ins->offset);
2471
2472 return ret;
2473}
2474
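/*
 * give a reserved-but-unused extent back to its block group's
 * free space cache
 */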
2475int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
2476{
2477 struct btrfs_block_group_cache *cache;
2478
2479 maybe_lock_mutex(root);
2480 cache = btrfs_lookup_block_group(root->fs_info, start);
2481 if (!cache) {
2482 printk(KERN_ERR "Unable to find block group for %Lu\n", start);
2483 maybe_unlock_mutex(root);
2484 return -ENOSPC;
2485 }
2486 btrfs_add_free_space(cache, start, len);
2487 maybe_unlock_mutex(root);
2488 return 0;
2489}
2490
2491int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2492 struct btrfs_root *root,
2493 u64 num_bytes, u64 min_alloc_size,
2494 u64 empty_size, u64 hint_byte,
2495 u64 search_end, struct btrfs_key *ins,
2496 u64 data)
2497{
2498 int ret;
2499 maybe_lock_mutex(root);
2500 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
2501 empty_size, hint_byte, search_end, ins,
2502 data);
2503 maybe_unlock_mutex(root);
2504 return ret;
2505}
2506
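/*
 * do the on-disk accounting for a reserved extent: bump the super
 * and root item byte counts, then insert the extent item and its
 * first backref (or queue a pending insert when allocating from the
 * extent root itself)
 */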
2507static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
2508 struct btrfs_root *root, u64 parent,
2509 u64 root_objectid, u64 ref_generation,
2510 u64 owner, u64 owner_offset,
2511 struct btrfs_key *ins)
2512{
2513 int ret;
2514 int pending_ret;
2515 u64 super_used;
2516 u64 root_used;
2517 u64 num_bytes = ins->offset;
2518 u32 sizes[2];
2519 struct btrfs_fs_info *info = root->fs_info;
2520 struct btrfs_root *extent_root = info->extent_root;
2521 struct btrfs_extent_item *extent_item;
2522 struct btrfs_extent_ref *ref;
2523 struct btrfs_path *path;
2524 struct btrfs_key keys[2];
2525
2526 if (parent == 0)
2527 parent = ins->objectid;
2528
2529 /* block accounting for super block */
2530 spin_lock_irq(&info->delalloc_lock);
2531 super_used = btrfs_super_bytes_used(&info->super_copy);
2532 btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
2533 spin_unlock_irq(&info->delalloc_lock);
2534
2535 /* block accounting for root item */
2536 root_used = btrfs_root_used(&root->root_item);
2537 btrfs_set_root_used(&root->root_item, root_used + num_bytes);
2538
2539 if (root == extent_root) {
2540 struct pending_extent_op *extent_op;
2541
2542 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2543 BUG_ON(!extent_op);
2544
2545 extent_op->type = PENDING_EXTENT_INSERT;
2546 extent_op->bytenr = ins->objectid;
2547 extent_op->num_bytes = ins->offset;
2548 extent_op->parent = parent;
2549 extent_op->orig_parent = 0;
2550 extent_op->generation = ref_generation;
2551 extent_op->orig_generation = 0;
2552 extent_op->level = (int)owner;
2553
2554 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
2555 ins->objectid + ins->offset - 1,
2556 EXTENT_LOCKED, GFP_NOFS);
2557 set_state_private(&root->fs_info->extent_ins,
2558 ins->objectid, (unsigned long)extent_op);
2559 goto update_block;
2560 }
2561
2562 memcpy(&keys[0], ins, sizeof(*ins));
2563 keys[1].objectid = ins->objectid;
2564 keys[1].type = BTRFS_EXTENT_REF_KEY;
2565 keys[1].offset = parent;
2566 sizes[0] = sizeof(*extent_item);
2567 sizes[1] = sizeof(*ref);
2568
2569 path = btrfs_alloc_path();
2570 BUG_ON(!path);
2571
2572 ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
2573 sizes, 2);
2574 BUG_ON(ret);
2575
2576 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2577 struct btrfs_extent_item);
2578 btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
2579 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
2580 struct btrfs_extent_ref);
2581
2582 btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
2583 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
2584 btrfs_set_ref_objectid(path->nodes[0], ref, owner);
2585 btrfs_set_ref_offset(path->nodes[0], ref, owner_offset);
2586 btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
2587
2588 btrfs_mark_buffer_dirty(path->nodes[0]);
2589
2590 trans->alloc_exclude_start = 0;
2591 trans->alloc_exclude_nr = 0;
2592 btrfs_free_path(path);
2593 finish_current_insert(trans, extent_root);
2594 pending_ret = del_pending_extents(trans, extent_root);
2595
2596 if (ret)
2597 goto out;
2598 if (pending_ret) {
2599 ret = pending_ret;
2600 goto out;
2601 }
2602
2603update_block:
2604 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
2605 if (ret) {
2606 printk("update block group failed for %Lu %Lu\n",
2607 ins->objectid, ins->offset);
2608 BUG();
2609 }
2610out:
2611 return ret;
2612}
2613
2614int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
2615 struct btrfs_root *root, u64 parent,
2616 u64 root_objectid, u64 ref_generation,
2617 u64 owner, u64 owner_offset,
2618 struct btrfs_key *ins)
2619{
2620 int ret;
2621
2622 if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
2623 return 0;
2624 maybe_lock_mutex(root);
2625 ret = __btrfs_alloc_reserved_extent(trans, root, parent,
2626 root_objectid, ref_generation,
2627 owner, owner_offset, ins);
2628 maybe_unlock_mutex(root);
2629 return ret;
2630}
2631
2632/*
2633 * this is used by the tree logging recovery code. It records that
2634 * an extent has been allocated and makes sure to clear the free
2635 * space cache bits as well
2636 */
2637int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
2638 struct btrfs_root *root, u64 parent,
2639 u64 root_objectid, u64 ref_generation,
2640 u64 owner, u64 owner_offset,
2641 struct btrfs_key *ins)
2642{
2643 int ret;
2644 struct btrfs_block_group_cache *block_group;
2645
2646 maybe_lock_mutex(root);
2647 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
2648 cache_block_group(root, block_group);
2649
2650 ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset);
2651 BUG_ON(ret);
2652 ret = __btrfs_alloc_reserved_extent(trans, root, parent,
2653 root_objectid, ref_generation,
2654 owner, owner_offset, ins);
2655 maybe_unlock_mutex(root);
2656 return ret;
2657}
2658
2659/*
2660  * finds a free extent and does all the dirty work required for allocation;
2661  * returns the key for the allocated extent through ins (callers that
2662  * want a tree buffer use btrfs_alloc_free_block below).
2663 *
2664 * returns 0 if everything worked, non-zero otherwise.
2665 */
2666int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
2667 struct btrfs_root *root,
2668 u64 num_bytes, u64 parent, u64 min_alloc_size,
2669 u64 root_objectid, u64 ref_generation,
2670 u64 owner_objectid, u64 owner_offset,
2671 u64 empty_size, u64 hint_byte,
2672 u64 search_end, struct btrfs_key *ins, u64 data)
2673{
2674 int ret;
2675
2676 maybe_lock_mutex(root);
2677
2678 ret = __btrfs_reserve_extent(trans, root, num_bytes,
2679 min_alloc_size, empty_size, hint_byte,
2680 search_end, ins, data);
2681 BUG_ON(ret);
2682 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
2683 ret = __btrfs_alloc_reserved_extent(trans, root, parent,
2684 root_objectid, ref_generation,
2685 owner_objectid, owner_offset, ins);
2686 BUG_ON(ret);
2687
2688 }
2689 maybe_unlock_mutex(root);
2690 return ret;
2691}
2692
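/*
 * set up a freshly allocated tree block: stamp the transid, lock the
 * buffer, run clean_tree_block and mark it dirty in the right tree
 * (dirty_log_pages for the log root, the transaction's dirty_pages
 * otherwise)
 */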
2693struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
2694 struct btrfs_root *root,
2695 u64 bytenr, u32 blocksize)
2696{
2697 struct extent_buffer *buf;
2698
2699 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
2700 if (!buf)
2701 return ERR_PTR(-ENOMEM);
2702 btrfs_set_header_generation(buf, trans->transid);
2703 btrfs_tree_lock(buf);
2704 clean_tree_block(trans, root, buf);
2705 btrfs_set_buffer_uptodate(buf);
2706 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2707 set_extent_dirty(&root->dirty_log_pages, buf->start,
2708 buf->start + buf->len - 1, GFP_NOFS);
2709 } else {
2710 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
2711 buf->start + buf->len - 1, GFP_NOFS);
2712 }
2713 trans->blocks_used++;
2714 return buf;
2715}
2716
2717/*
2718 * helper function to allocate a block for a given tree
2719  * returns the tree buffer or an ERR_PTR on failure.
2720 */
2721struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
2722 struct btrfs_root *root,
2723 u32 blocksize, u64 parent,
2724 u64 root_objectid,
2725 u64 ref_generation,
2726 int level,
2727 u64 hint,
2728 u64 empty_size)
2729{
2730 struct btrfs_key ins;
2731 int ret;
2732 struct extent_buffer *buf;
2733
2734 ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
2735 root_objectid, ref_generation, level, 0,
2736 empty_size, hint, (u64)-1, &ins, 0);
2737 if (ret) {
2738 BUG_ON(ret > 0);
2739 return ERR_PTR(ret);
2740 }
2741
2742 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
2743 return buf;
2744}
2745
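/*
 * drop one reference on the disk extent behind every regular
 * (non-inline) file extent item stored in this leaf
 */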
2746int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
2747 struct btrfs_root *root, struct extent_buffer *leaf)
2748{
2749 u64 leaf_owner;
2750 u64 leaf_generation;
2751 struct btrfs_key key;
2752 struct btrfs_file_extent_item *fi;
2753 int i;
2754 int nritems;
2755 int ret;
2756
2757 BUG_ON(!btrfs_is_leaf(leaf));
2758 nritems = btrfs_header_nritems(leaf);
2759 leaf_owner = btrfs_header_owner(leaf);
2760 leaf_generation = btrfs_header_generation(leaf);
2761
2762 for (i = 0; i < nritems; i++) {
2763 u64 disk_bytenr;
2764 cond_resched();
2765
2766 btrfs_item_key_to_cpu(leaf, &key, i);
2767 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2768 continue;
2769 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2770 if (btrfs_file_extent_type(leaf, fi) ==
2771 BTRFS_FILE_EXTENT_INLINE)
2772 continue;
2773 /*
2774 * FIXME make sure to insert a trans record that
2775 * repeats the snapshot del on crash
2776 */
2777 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2778 if (disk_bytenr == 0)
2779 continue;
2780
2781 mutex_lock(&root->fs_info->alloc_mutex);
2782 ret = __btrfs_free_extent(trans, root, disk_bytenr,
2783 btrfs_file_extent_disk_num_bytes(leaf, fi),
2784 leaf->start, leaf_owner, leaf_generation,
2785 key.objectid, key.offset, 0);
2786 mutex_unlock(&root->fs_info->alloc_mutex);
2787 BUG_ON(ret);
2788
2789 atomic_inc(&root->fs_info->throttle_gen);
2790 wake_up(&root->fs_info->transaction_throttle);
2791 cond_resched();
2792 }
2793 return 0;
2794}
2795
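/*
 * same as btrfs_drop_leaf_ref, but driven from a cached
 * btrfs_leaf_ref so the leaf itself doesn't have to be read
 */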
2796static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
2797 struct btrfs_root *root,
2798 struct btrfs_leaf_ref *ref)
2799{
2800 int i;
2801 int ret;
2802 struct btrfs_extent_info *info = ref->extents;
2803
2804 for (i = 0; i < ref->nritems; i++) {
2805 mutex_lock(&root->fs_info->alloc_mutex);
2806 ret = __btrfs_free_extent(trans, root, info->bytenr,
2807 info->num_bytes, ref->bytenr,
2808 ref->owner, ref->generation,
2809 info->objectid, info->offset, 0);
2810 mutex_unlock(&root->fs_info->alloc_mutex);
2811
2812 atomic_inc(&root->fs_info->throttle_gen);
2813 wake_up(&root->fs_info->transaction_throttle);
2814 cond_resched();
2815
2816 BUG_ON(ret);
2817 info++;
2818 }
2819
2820 return 0;
2821}
2822
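/*
 * snapshot-deletion helper: look up the ref count on an extent and
 * throttle with cond_resched() as we go
 */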
2823int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
2824 u32 *refs)
2825{
2826 int ret;
2827
2828 ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
2829 BUG_ON(ret);
2830
2831 #if 0 /* some debugging code in case we see problems here */
2832 /* if the refs count is one, it won't get increased again. But
2833 * if the ref count is > 1, someone may be decreasing it at
2834 * the same time we are.
2835 */
2836 if (*refs != 1) {
2837 struct extent_buffer *eb = NULL;
2838 eb = btrfs_find_create_tree_block(root, start, len);
2839 if (eb)
2840 btrfs_tree_lock(eb);
2841
2842 mutex_lock(&root->fs_info->alloc_mutex);
2843 ret = lookup_extent_ref(NULL, root, start, len, refs);
2844 BUG_ON(ret);
2845 mutex_unlock(&root->fs_info->alloc_mutex);
2846
2847 if (eb) {
2848 btrfs_tree_unlock(eb);
2849 free_extent_buffer(eb);
2850 }
2851 if (*refs == 1) {
2852 printk("block %llu went down to one during drop_snap\n",
2853 (unsigned long long)start);
2854 }
2855
2856 }
2857#endif
2858
2859 cond_resched();
2860 return ret;
2861}
2862
2863/*
2864  * helper function for drop_snapshot; this walks down the tree dropping ref
2865 * counts as it goes.
2866 */
2867static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
2868 struct btrfs_root *root,
2869 struct btrfs_path *path, int *level)
2870{
2871 u64 root_owner;
2872 u64 root_gen;
2873 u64 bytenr;
2874 u64 ptr_gen;
2875 struct extent_buffer *next;
2876 struct extent_buffer *cur;
2877 struct extent_buffer *parent;
2878 struct btrfs_leaf_ref *ref;
2879 u32 blocksize;
2880 int ret;
2881 u32 refs;
2882
2883 WARN_ON(*level < 0);
2884 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2885 ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
2886 path->nodes[*level]->len, &refs);
2887 BUG_ON(ret);
2888 if (refs > 1)
2889 goto out;
2890
2891 /*
2892 * walk down to the last node level and free all the leaves
2893 */
2894 while(*level >= 0) {
2895 WARN_ON(*level < 0);
2896 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2897 cur = path->nodes[*level];
2898
2899 if (btrfs_header_level(cur) != *level)
2900 WARN_ON(1);
2901
2902 if (path->slots[*level] >=
2903 btrfs_header_nritems(cur))
2904 break;
2905 if (*level == 0) {
2906 ret = btrfs_drop_leaf_ref(trans, root, cur);
2907 BUG_ON(ret);
2908 break;
2909 }
2910 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2911 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2912 blocksize = btrfs_level_size(root, *level - 1);
2913
2914 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
2915 BUG_ON(ret);
2916 if (refs != 1) {
2917 parent = path->nodes[*level];
2918 root_owner = btrfs_header_owner(parent);
2919 root_gen = btrfs_header_generation(parent);
2920 path->slots[*level]++;
2921
2922 mutex_lock(&root->fs_info->alloc_mutex);
2923 ret = __btrfs_free_extent(trans, root, bytenr,
2924 blocksize, parent->start,
2925 root_owner, root_gen, 0, 0, 1);
2926 BUG_ON(ret);
2927 mutex_unlock(&root->fs_info->alloc_mutex);
2928
2929 atomic_inc(&root->fs_info->throttle_gen);
2930 wake_up(&root->fs_info->transaction_throttle);
2931 cond_resched();
2932
2933 continue;
2934 }
2935 /*
2936 * at this point, we have a single ref, and since the
2937 		 * only place referencing this extent is a dead root,
2938 		 * the reference count should never go higher.
2939 * So, we don't need to check it again
2940 */
2941 if (*level == 1) {
2942 ref = btrfs_lookup_leaf_ref(root, bytenr);
2943 if (ref) {
2944 ret = cache_drop_leaf_ref(trans, root, ref);
2945 BUG_ON(ret);
2946 btrfs_remove_leaf_ref(root, ref);
2947 btrfs_free_leaf_ref(root, ref);
2948 *level = 0;
2949 break;
2950 }
2951 if (printk_ratelimit())
2952 printk("leaf ref miss for bytenr %llu\n",
2953 (unsigned long long)bytenr);
2954 }
2955 next = btrfs_find_tree_block(root, bytenr, blocksize);
2956 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
2957 free_extent_buffer(next);
2958
2959 next = read_tree_block(root, bytenr, blocksize,
2960 ptr_gen);
2961 cond_resched();
2962#if 0
2963 /*
2964 * this is a debugging check and can go away
2965 * the ref should never go all the way down to 1
2966 * at this point
2967 */
2968 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
2969 &refs);
2970 BUG_ON(ret);
2971 WARN_ON(refs != 1);
2972#endif
2973 }
2974 WARN_ON(*level <= 0);
2975 if (path->nodes[*level-1])
2976 free_extent_buffer(path->nodes[*level-1]);
2977 path->nodes[*level-1] = next;
2978 *level = btrfs_header_level(next);
2979 path->slots[*level] = 0;
2980 cond_resched();
2981 }
2982out:
2983 WARN_ON(*level < 0);
2984 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2985
2986 if (path->nodes[*level] == root->node) {
2987 parent = path->nodes[*level];
2988 bytenr = path->nodes[*level]->start;
2989 } else {
2990 parent = path->nodes[*level + 1];
2991 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
2992 }
2993
2994 blocksize = btrfs_level_size(root, *level);
2995 root_owner = btrfs_header_owner(parent);
2996 root_gen = btrfs_header_generation(parent);
2997
2998 mutex_lock(&root->fs_info->alloc_mutex);
2999 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3000 parent->start, root_owner, root_gen,
3001 0, 0, 1);
3002 mutex_unlock(&root->fs_info->alloc_mutex);
3003 free_extent_buffer(path->nodes[*level]);
3004 path->nodes[*level] = NULL;
3005 *level += 1;
3006 BUG_ON(ret);
3007
3008 cond_resched();
3009 return 0;
3010}
3011
3012/*
3013 * helper for dropping snapshots. This walks back up the tree in the path
3014 * to find the first node higher up where we haven't yet gone through
3015 * all the slots
3016 */
3017static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
3018 struct btrfs_root *root,
3019 struct btrfs_path *path, int *level)
3020{
3021 u64 root_owner;
3022 u64 root_gen;
3023 struct btrfs_root_item *root_item = &root->root_item;
3024 int i;
3025 int slot;
3026 int ret;
3027
3028 for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
3029 slot = path->slots[i];
3030 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3031 struct extent_buffer *node;
3032 struct btrfs_disk_key disk_key;
3033 node = path->nodes[i];
3034 path->slots[i]++;
3035 *level = i;
3036 WARN_ON(*level == 0);
3037 btrfs_node_key(node, &disk_key, path->slots[i]);
3038 memcpy(&root_item->drop_progress,
3039 &disk_key, sizeof(disk_key));
3040 root_item->drop_level = i;
3041 return 0;
3042 } else {
3043 struct extent_buffer *parent;
3044 if (path->nodes[*level] == root->node)
3045 parent = path->nodes[*level];
3046 else
3047 parent = path->nodes[*level + 1];
3048
3049 root_owner = btrfs_header_owner(parent);
3050 root_gen = btrfs_header_generation(parent);
3051 ret = btrfs_free_extent(trans, root,
3052 path->nodes[*level]->start,
3053 path->nodes[*level]->len,
3054 parent->start,
3055 root_owner, root_gen, 0, 0, 1);
3056 BUG_ON(ret);
3057 free_extent_buffer(path->nodes[*level]);
3058 path->nodes[*level] = NULL;
3059 *level = i + 1;
3060 }
3061 }
3062 return 1;
3063}
3064
3065/*
3066  * drop the reference count on the tree rooted at 'root'. This traverses
3067 * the tree freeing any blocks that have a ref count of zero after being
3068 * decremented.
3069 */
3070int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3071 *root)
3072{
3073 int ret = 0;
3074 int wret;
3075 int level;
3076 struct btrfs_path *path;
3077 int i;
3078 int orig_level;
3079 struct btrfs_root_item *root_item = &root->root_item;
3080
3081 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3082 path = btrfs_alloc_path();
3083 BUG_ON(!path);
3084
3085 level = btrfs_header_level(root->node);
3086 orig_level = level;
3087 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3088 path->nodes[level] = root->node;
3089 extent_buffer_get(root->node);
3090 path->slots[level] = 0;
3091 } else {
3092 struct btrfs_key key;
3093 struct btrfs_disk_key found_key;
3094 struct extent_buffer *node;
3095
3096 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
3097 level = root_item->drop_level;
3098 path->lowest_level = level;
3099 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3100 if (wret < 0) {
3101 ret = wret;
3102 goto out;
3103 }
3104 node = path->nodes[level];
3105 btrfs_node_key(node, &found_key, path->slots[level]);
3106 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
3107 sizeof(found_key)));
3108 /*
3109 * unlock our path, this is safe because only this
3110 * function is allowed to delete this snapshot
3111 */
3112 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3113 if (path->nodes[i] && path->locks[i]) {
3114 path->locks[i] = 0;
3115 btrfs_tree_unlock(path->nodes[i]);
3116 }
3117 }
3118 }
3119 while(1) {
3120 wret = walk_down_tree(trans, root, path, &level);
3121 if (wret > 0)
3122 break;
3123 if (wret < 0)
3124 ret = wret;
3125
3126 wret = walk_up_tree(trans, root, path, &level);
3127 if (wret > 0)
3128 break;
3129 if (wret < 0)
3130 ret = wret;
3131 if (trans->transaction->in_commit) {
3132 ret = -EAGAIN;
3133 break;
3134 }
3135 atomic_inc(&root->fs_info->throttle_gen);
3136 wake_up(&root->fs_info->transaction_throttle);
3137 }
3138 for (i = 0; i <= orig_level; i++) {
3139 if (path->nodes[i]) {
3140 free_extent_buffer(path->nodes[i]);
3141 path->nodes[i] = NULL;
3142 }
3143 }
3144out:
3145 btrfs_free_path(path);
3146 return ret;
3147}
3148
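/*
 * tear down the in-memory block group cache, freeing each group's
 * free space cache entries along the way (called when the fs is
 * being shut down)
 */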
3149int btrfs_free_block_groups(struct btrfs_fs_info *info)
3150{
3151 struct btrfs_block_group_cache *block_group;
3152 struct rb_node *n;
3153
3154 mutex_lock(&info->alloc_mutex);
3155 spin_lock(&info->block_group_cache_lock);
3156 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
3157 block_group = rb_entry(n, struct btrfs_block_group_cache,
3158 cache_node);
3159
3160 btrfs_remove_free_space_cache(block_group);
3161 rb_erase(&block_group->cache_node,
3162 &info->block_group_cache_tree);
3163 spin_lock(&block_group->space_info->lock);
3164 list_del(&block_group->list);
3165 spin_unlock(&block_group->space_info->lock);
3166 kfree(block_group);
3167 }
3168 spin_unlock(&info->block_group_cache_lock);
3169 mutex_unlock(&info->alloc_mutex);
3170 return 0;
3171}
3172
3173static unsigned long calc_ra(unsigned long start, unsigned long last,
3174 unsigned long nr)
3175{
3176 return min(last, start + nr - 1);
3177}
3178
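/*
 * pull each page of the range into the page cache, mark it
 * delalloc-dirty, and start writeback so the data is rewritten
 * (and thus relocated) through the normal allocation paths
 */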
3179static int noinline relocate_inode_pages(struct inode *inode, u64 start,
3180 u64 len)
3181{
3182 u64 page_start;
3183 u64 page_end;
3184 unsigned long last_index;
3185 unsigned long i;
3186 struct page *page;
3187 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3188 struct file_ra_state *ra;
3189 unsigned long total_read = 0;
3190 unsigned long ra_pages;
3191 struct btrfs_ordered_extent *ordered;
3192 struct btrfs_trans_handle *trans;
3193
3194 ra = kzalloc(sizeof(*ra), GFP_NOFS);
3195
3196 mutex_lock(&inode->i_mutex);
3197 i = start >> PAGE_CACHE_SHIFT;
3198 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
3199
3200 ra_pages = BTRFS_I(inode)->root->fs_info->bdi.ra_pages;
3201
3202 file_ra_state_init(ra, inode->i_mapping);
3203
3204 for (; i <= last_index; i++) {
3205 if (total_read % ra_pages == 0) {
3206 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
3207 calc_ra(i, last_index, ra_pages));
3208 }
3209 total_read++;
3210again:
3211 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
3212 goto truncate_racing;
3213 page = grab_cache_page(inode->i_mapping, i);
3214 if (!page) {
3215 goto out_unlock;
3216 }
3217 if (!PageUptodate(page)) {
3218 btrfs_readpage(NULL, page);
3219 lock_page(page);
3220 if (!PageUptodate(page)) {
3221 unlock_page(page);
3222 page_cache_release(page);
3223 goto out_unlock;
3224 }
3225 }
3226 wait_on_page_writeback(page);
3227
3228 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
3229 page_end = page_start + PAGE_CACHE_SIZE - 1;
3230 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3231
3232 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3233 if (ordered) {
3234 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3235 unlock_page(page);
3236 page_cache_release(page);
3237 btrfs_start_ordered_extent(inode, ordered, 1);
3238 btrfs_put_ordered_extent(ordered);
3239 goto again;
3240 }
3241 set_page_extent_mapped(page);
3242
3243 /*
3244 * make sure page_mkwrite is called for this page if userland
3245 * wants to change it from mmap
3246 */
3247 clear_page_dirty_for_io(page);
3248
3249 btrfs_set_extent_delalloc(inode, page_start, page_end);
3250 set_page_dirty(page);
3251
3252 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3253 unlock_page(page);
3254 page_cache_release(page);
3255 }
3256
3257out_unlock:
3258 /* we have to start the IO in order to get the ordered extents
3259	 * instantiated. This allows the relocation code to wait
3260 * for all the ordered extents to hit the disk.
3261 *
3262 * Otherwise, it would constantly loop over the same extents
3263 * because the old ones don't get deleted until the IO is
3264 * started
3265 */
3266 btrfs_fdatawrite_range(inode->i_mapping, start, start + len - 1,
3267 WB_SYNC_NONE);
3268 kfree(ra);
3269 trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
3270 if (trans) {
3271 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
3272 mark_inode_dirty(inode);
3273 }
3274 mutex_unlock(&inode->i_mutex);
3275 return 0;
3276
3277truncate_racing:
3278 vmtruncate(inode, inode->i_size);
3279 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
3280 total_read);
3281 goto out_unlock;
3282}
3283
3284/*
3285 * The back references tell us which tree holds a ref on a block,
3286 * but it is possible for the tree root field in the reference to
3287 * reflect the original root before a snapshot was made. In this
3288 * case we should search through all the children of a given root
3289 * to find potential holders of references on a block.
3290 *
3291 * Instead, we do something a little less fancy and just search
3292 * all the roots for a given key/block combination.
3293 */
3294static int find_root_for_ref(struct btrfs_root *root,
3295 struct btrfs_path *path,
3296 struct btrfs_key *key0,
3297 int level,
3298 int file_key,
3299 struct btrfs_root **found_root,
3300 u64 bytenr)
3301{
3302 struct btrfs_key root_location;
3303 struct btrfs_root *cur_root = *found_root;
3304 struct btrfs_file_extent_item *file_extent;
3305 u64 root_search_start = BTRFS_FS_TREE_OBJECTID;
3306 u64 found_bytenr;
3307 int ret;
3308
3309 root_location.offset = (u64)-1;
3310 root_location.type = BTRFS_ROOT_ITEM_KEY;
3311 path->lowest_level = level;
3312 path->reada = 0;
3313	while (1) {
3314 ret = btrfs_search_slot(NULL, cur_root, key0, path, 0, 0);
3315 found_bytenr = 0;
3316 if (ret == 0 && file_key) {
3317 struct extent_buffer *leaf = path->nodes[0];
3318 file_extent = btrfs_item_ptr(leaf, path->slots[0],
3319 struct btrfs_file_extent_item);
3320 if (btrfs_file_extent_type(leaf, file_extent) ==
3321 BTRFS_FILE_EXTENT_REG) {
3322 found_bytenr =
3323 btrfs_file_extent_disk_bytenr(leaf,
3324 file_extent);
3325 }
3326 } else if (!file_key) {
3327 if (path->nodes[level])
3328 found_bytenr = path->nodes[level]->start;
3329 }
3330
3331 btrfs_release_path(cur_root, path);
3332
3333 if (found_bytenr == bytenr) {
3334 *found_root = cur_root;
3335 ret = 0;
3336 goto out;
3337 }
3338 ret = btrfs_search_root(root->fs_info->tree_root,
3339 root_search_start, &root_search_start);
3340 if (ret)
3341 break;
3342
3343 root_location.objectid = root_search_start;
3344 cur_root = btrfs_read_fs_root_no_name(root->fs_info,
3345 &root_location);
3346 if (!cur_root) {
3347 ret = 1;
3348 break;
3349 }
3350 }
3351out:
3352 path->lowest_level = 0;
3353 return ret;
3354}
3355
3356/*
3357 * note, this releases the path
3358 */
3359static int noinline relocate_one_reference(struct btrfs_root *extent_root,
3360 struct btrfs_path *path,
3361 struct btrfs_key *extent_key,
3362 u64 *last_file_objectid,
3363 u64 *last_file_offset,
3364 u64 *last_file_root,
3365 u64 last_extent)
3366{
3367 struct inode *inode;
3368 struct btrfs_root *found_root;
3369 struct btrfs_key root_location;
3370 struct btrfs_key found_key;
3371 struct btrfs_extent_ref *ref;
3372 u64 ref_root;
3373 u64 ref_gen;
3374 u64 ref_objectid;
3375 u64 ref_offset;
3376 int ret;
3377 int level;
3378
3379 WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
3380
3381 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
3382 struct btrfs_extent_ref);
3383 ref_root = btrfs_ref_root(path->nodes[0], ref);
3384 ref_gen = btrfs_ref_generation(path->nodes[0], ref);
3385 ref_objectid = btrfs_ref_objectid(path->nodes[0], ref);
3386 ref_offset = btrfs_ref_offset(path->nodes[0], ref);
3387 btrfs_release_path(extent_root, path);
3388
3389 root_location.objectid = ref_root;
3390 if (ref_gen == 0)
3391 root_location.offset = 0;
3392 else
3393 root_location.offset = (u64)-1;
3394 root_location.type = BTRFS_ROOT_ITEM_KEY;
3395
3396 found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
3397 &root_location);
3398 BUG_ON(!found_root);
3399 mutex_unlock(&extent_root->fs_info->alloc_mutex);
3400
3401 if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
3402 found_key.objectid = ref_objectid;
3403 found_key.type = BTRFS_EXTENT_DATA_KEY;
3404 found_key.offset = ref_offset;
3405 level = 0;
3406
3407 if (last_extent == extent_key->objectid &&
3408 *last_file_objectid == ref_objectid &&
3409 *last_file_offset == ref_offset &&
3410 *last_file_root == ref_root)
3411 goto out;
3412
3413 ret = find_root_for_ref(extent_root, path, &found_key,
3414 level, 1, &found_root,
3415 extent_key->objectid);
3416
3417 if (ret)
3418 goto out;
3419
3420 if (last_extent == extent_key->objectid &&
3421 *last_file_objectid == ref_objectid &&
3422 *last_file_offset == ref_offset &&
3423 *last_file_root == ref_root)
3424 goto out;
3425
3426 inode = btrfs_iget_locked(extent_root->fs_info->sb,
3427 ref_objectid, found_root);
3428 if (inode->i_state & I_NEW) {
3429 /* the inode and parent dir are two different roots */
3430 BTRFS_I(inode)->root = found_root;
3431 BTRFS_I(inode)->location.objectid = ref_objectid;
3432 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
3433 BTRFS_I(inode)->location.offset = 0;
3434 btrfs_read_locked_inode(inode);
3435 unlock_new_inode(inode);
3436
3437 }
3438 /* this can happen if the reference is not against
3439 * the latest version of the tree root
3440 */
3441 if (is_bad_inode(inode))
3442 goto out;
3443
3444 *last_file_objectid = inode->i_ino;
3445 *last_file_root = found_root->root_key.objectid;
3446 *last_file_offset = ref_offset;
3447
3448 relocate_inode_pages(inode, ref_offset, extent_key->offset);
3449 iput(inode);
3450 } else {
3451 struct btrfs_trans_handle *trans;
3452 struct extent_buffer *eb;
3453 int needs_lock = 0;
3454
3455 eb = read_tree_block(found_root, extent_key->objectid,
3456 extent_key->offset, 0);
3457 btrfs_tree_lock(eb);
3458 level = btrfs_header_level(eb);
3459
3460 if (level == 0)
3461 btrfs_item_key_to_cpu(eb, &found_key, 0);
3462 else
3463 btrfs_node_key_to_cpu(eb, &found_key, 0);
3464
3465 btrfs_tree_unlock(eb);
3466 free_extent_buffer(eb);
3467
3468 ret = find_root_for_ref(extent_root, path, &found_key,
3469 level, 0, &found_root,
3470 extent_key->objectid);
3471
3472 if (ret)
3473 goto out;
3474
3475 /*
3476 * right here almost anything could happen to our key,
3477 * but that's ok. The cow below will either relocate it
3478 * or someone else will have relocated it. Either way,
3479 * it is in a different spot than it was before and
3480 * we're happy.
3481 */
3482
3483 trans = btrfs_start_transaction(found_root, 1);
3484
3485 if (found_root == extent_root->fs_info->extent_root ||
3486 found_root == extent_root->fs_info->chunk_root ||
3487 found_root == extent_root->fs_info->dev_root) {
3488 needs_lock = 1;
3489 mutex_lock(&extent_root->fs_info->alloc_mutex);
3490 }
3491
3492 path->lowest_level = level;
3493 path->reada = 2;
3494 ret = btrfs_search_slot(trans, found_root, &found_key, path,
3495 0, 1);
3496 path->lowest_level = 0;
3497 btrfs_release_path(found_root, path);
3498
3499 if (found_root == found_root->fs_info->extent_root)
3500 btrfs_extent_post_op(trans, found_root);
3501 if (needs_lock)
3502 mutex_unlock(&extent_root->fs_info->alloc_mutex);
3503
3504 btrfs_end_transaction(trans, found_root);
3505
3506 }
3507out:
3508 mutex_lock(&extent_root->fs_info->alloc_mutex);
3509 return 0;
3510}
3511
3512static int noinline del_extent_zero(struct btrfs_root *extent_root,
3513 struct btrfs_path *path,
3514 struct btrfs_key *extent_key)
3515{
3516 int ret;
3517 struct btrfs_trans_handle *trans;
3518
3519 trans = btrfs_start_transaction(extent_root, 1);
3520 ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
3521 if (ret > 0) {
3522 ret = -EIO;
3523 goto out;
3524 }
3525 if (ret < 0)
3526 goto out;
3527 ret = btrfs_del_item(trans, extent_root, path);
3528out:
3529 btrfs_end_transaction(trans, extent_root);
3530 return ret;
3531}
3532
3533static int noinline relocate_one_extent(struct btrfs_root *extent_root,
3534 struct btrfs_path *path,
3535 struct btrfs_key *extent_key)
3536{
3537 struct btrfs_key key;
3538 struct btrfs_key found_key;
3539 struct extent_buffer *leaf;
3540 u64 last_file_objectid = 0;
3541 u64 last_file_root = 0;
3542 u64 last_file_offset = (u64)-1;
3543 u64 last_extent = 0;
3544 u32 nritems;
3545 u32 item_size;
3546 int ret = 0;
3547
3548 if (extent_key->objectid == 0) {
3549 ret = del_extent_zero(extent_root, path, extent_key);
3550 goto out;
3551 }
3552 key.objectid = extent_key->objectid;
3553 key.type = BTRFS_EXTENT_REF_KEY;
3554 key.offset = 0;
3555
3556	while (1) {
3557 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3558
3559 if (ret < 0)
3560 goto out;
3561
3562 ret = 0;
3563 leaf = path->nodes[0];
3564 nritems = btrfs_header_nritems(leaf);
3565 if (path->slots[0] == nritems) {
3566 ret = btrfs_next_leaf(extent_root, path);
3567 if (ret > 0) {
3568 ret = 0;
3569 goto out;
3570 }
3571 if (ret < 0)
3572 goto out;
3573 leaf = path->nodes[0];
3574 }
3575
3576 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3577 if (found_key.objectid != extent_key->objectid) {
3578 break;
3579 }
3580
3581 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
3582 break;
3583 }
3584
3585 key.offset = found_key.offset + 1;
3586 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3587
3588 ret = relocate_one_reference(extent_root, path, extent_key,
3589 &last_file_objectid,
3590 &last_file_offset,
3591 &last_file_root, last_extent);
3592 if (ret)
3593 goto out;
3594 last_extent = extent_key->objectid;
3595 }
3596 ret = 0;
3597out:
3598 btrfs_release_path(extent_root, path);
3599 return ret;
3600}
3601
3602static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
3603{
3604 u64 num_devices;
3605 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
3606 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
3607
3608 num_devices = root->fs_info->fs_devices->num_devices;
3609 if (num_devices == 1) {
3610 stripped |= BTRFS_BLOCK_GROUP_DUP;
3611 stripped = flags & ~stripped;
3612
3613 /* turn raid0 into single device chunks */
3614 if (flags & BTRFS_BLOCK_GROUP_RAID0)
3615 return stripped;
3616
3617 /* turn mirroring into duplication */
3618 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3619 BTRFS_BLOCK_GROUP_RAID10))
3620 return stripped | BTRFS_BLOCK_GROUP_DUP;
3621 return flags;
3622 } else {
3623 /* they already had raid on here, just return */
3624 if (flags & stripped)
3625 return flags;
3626
3627 stripped |= BTRFS_BLOCK_GROUP_DUP;
3628 stripped = flags & ~stripped;
3629
3630 /* switch duplicated blocks with raid1 */
3631 if (flags & BTRFS_BLOCK_GROUP_DUP)
3632 return stripped | BTRFS_BLOCK_GROUP_RAID1;
3633
3634 /* turn single device chunks into raid0 */
3635 return stripped | BTRFS_BLOCK_GROUP_RAID0;
3636 }
3637 return flags;
3638}
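/*
 * A minimal sketch (illustration only, not in the original source) of
 * what update_block_group_flags() above produces.  example_profiles()
 * is a hypothetical caller; the BTRFS_BLOCK_GROUP_* flags are the ones
 * this file already uses.
 */
static u64 example_profiles(struct btrfs_root *root)
{
	/*
	 * with num_devices == 1, mirroring collapses to duplication and
	 * this evaluates to BTRFS_BLOCK_GROUP_DUP; with more devices
	 * the RAID1 flag comes back unchanged
	 */
	return update_block_group_flags(root, BTRFS_BLOCK_GROUP_RAID1);
}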
3639
3640int __alloc_chunk_for_shrink(struct btrfs_root *root,
3641 struct btrfs_block_group_cache *shrink_block_group,
3642 int force)
3643{
3644 struct btrfs_trans_handle *trans;
3645 u64 new_alloc_flags;
3646 u64 calc;
3647
3648 spin_lock(&shrink_block_group->lock);
3649 if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
3650 spin_unlock(&shrink_block_group->lock);
3651 mutex_unlock(&root->fs_info->alloc_mutex);
3652
3653 trans = btrfs_start_transaction(root, 1);
3654 mutex_lock(&root->fs_info->alloc_mutex);
3655 spin_lock(&shrink_block_group->lock);
3656
3657 new_alloc_flags = update_block_group_flags(root,
3658 shrink_block_group->flags);
3659 if (new_alloc_flags != shrink_block_group->flags) {
3660 calc =
3661 btrfs_block_group_used(&shrink_block_group->item);
3662 } else {
3663 calc = shrink_block_group->key.offset;
3664 }
3665 spin_unlock(&shrink_block_group->lock);
3666
3667 do_chunk_alloc(trans, root->fs_info->extent_root,
3668 calc + 2 * 1024 * 1024, new_alloc_flags, force);
3669
3670 mutex_unlock(&root->fs_info->alloc_mutex);
3671 btrfs_end_transaction(trans, root);
3672 mutex_lock(&root->fs_info->alloc_mutex);
3673 } else
3674 spin_unlock(&shrink_block_group->lock);
3675 return 0;
3676}
3677
3678int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
3679{
3680 struct btrfs_trans_handle *trans;
3681 struct btrfs_root *tree_root = root->fs_info->tree_root;
3682 struct btrfs_path *path;
3683 u64 cur_byte;
3684 u64 total_found;
3685 u64 shrink_last_byte;
3686 struct btrfs_block_group_cache *shrink_block_group;
3687 struct btrfs_key key;
3688 struct btrfs_key found_key;
3689 struct extent_buffer *leaf;
3690 u32 nritems;
3691 int ret;
3692 int progress;
3693
3694 mutex_lock(&root->fs_info->alloc_mutex);
3695 shrink_block_group = btrfs_lookup_block_group(root->fs_info,
3696 shrink_start);
3697 BUG_ON(!shrink_block_group);
3698
3699 shrink_last_byte = shrink_block_group->key.objectid +
3700 shrink_block_group->key.offset;
3701
3702 shrink_block_group->space_info->total_bytes -=
3703 shrink_block_group->key.offset;
3704 path = btrfs_alloc_path();
3705 root = root->fs_info->extent_root;
3706 path->reada = 2;
3707
3708 printk("btrfs relocating block group %llu flags %llu\n",
3709 (unsigned long long)shrink_start,
3710 (unsigned long long)shrink_block_group->flags);
3711
3712 __alloc_chunk_for_shrink(root, shrink_block_group, 1);
3713
3714again:
3715
3716 shrink_block_group->ro = 1;
3717
3718 total_found = 0;
3719 progress = 0;
3720 key.objectid = shrink_start;
3721 key.offset = 0;
3722 key.type = 0;
3723 cur_byte = key.objectid;
3724
3725 mutex_unlock(&root->fs_info->alloc_mutex);
3726
3727 btrfs_start_delalloc_inodes(root);
3728 btrfs_wait_ordered_extents(tree_root, 0);
3729
3730 mutex_lock(&root->fs_info->alloc_mutex);
3731
3732 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3733 if (ret < 0)
3734 goto out;
3735
3736 ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
3737 if (ret < 0)
3738 goto out;
3739
3740 if (ret == 0) {
3741 leaf = path->nodes[0];
3742 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3743 if (found_key.objectid + found_key.offset > shrink_start &&
3744 found_key.objectid < shrink_last_byte) {
3745 cur_byte = found_key.objectid;
3746 key.objectid = cur_byte;
3747 }
3748 }
3749 btrfs_release_path(root, path);
3750
3751	while (1) {
3752 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3753 if (ret < 0)
3754 goto out;
3755
3756next:
3757 leaf = path->nodes[0];
3758 nritems = btrfs_header_nritems(leaf);
3759 if (path->slots[0] >= nritems) {
3760 ret = btrfs_next_leaf(root, path);
3761 if (ret < 0)
3762 goto out;
3763 if (ret == 1) {
3764 ret = 0;
3765 break;
3766 }
3767 leaf = path->nodes[0];
3768 nritems = btrfs_header_nritems(leaf);
3769 }
3770
3771 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3772
3773 if (found_key.objectid >= shrink_last_byte)
3774 break;
3775
3776 if (progress && need_resched()) {
3777 memcpy(&key, &found_key, sizeof(key));
3778 cond_resched();
3779 btrfs_release_path(root, path);
3780 btrfs_search_slot(NULL, root, &key, path, 0, 0);
3781 progress = 0;
3782 goto next;
3783 }
3784 progress = 1;
3785
3786 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_ITEM_KEY ||
3787 found_key.objectid + found_key.offset <= cur_byte) {
3788 memcpy(&key, &found_key, sizeof(key));
3789 key.offset++;
3790 path->slots[0]++;
3791 goto next;
3792 }
3793
3794 total_found++;
3795 cur_byte = found_key.objectid + found_key.offset;
3796 key.objectid = cur_byte;
3797 btrfs_release_path(root, path);
3798 ret = relocate_one_extent(root, path, &found_key);
3799 __alloc_chunk_for_shrink(root, shrink_block_group, 0);
3800 }
3801
3802 btrfs_release_path(root, path);
3803
3804 if (total_found > 0) {
3805 printk("btrfs relocate found %llu last extent was %llu\n",
3806 (unsigned long long)total_found,
3807 (unsigned long long)found_key.objectid);
3808 mutex_unlock(&root->fs_info->alloc_mutex);
3809 trans = btrfs_start_transaction(tree_root, 1);
3810 btrfs_commit_transaction(trans, tree_root);
3811
3812 btrfs_clean_old_snapshots(tree_root);
3813
3814 btrfs_start_delalloc_inodes(root);
3815 btrfs_wait_ordered_extents(tree_root, 0);
3816
3817 trans = btrfs_start_transaction(tree_root, 1);
3818 btrfs_commit_transaction(trans, tree_root);
3819 mutex_lock(&root->fs_info->alloc_mutex);
3820 goto again;
3821 }
3822
3823 /*
3824 * we've freed all the extents, now remove the block
3825 * group item from the tree
3826 */
3827 mutex_unlock(&root->fs_info->alloc_mutex);
3828
3829 trans = btrfs_start_transaction(root, 1);
3830
3831 mutex_lock(&root->fs_info->alloc_mutex);
3832 memcpy(&key, &shrink_block_group->key, sizeof(key));
3833
3834 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3835 if (ret > 0)
3836 ret = -EIO;
3837 if (ret < 0) {
3838 btrfs_end_transaction(trans, root);
3839 goto out;
3840 }
3841
3842 spin_lock(&root->fs_info->block_group_cache_lock);
3843 rb_erase(&shrink_block_group->cache_node,
3844 &root->fs_info->block_group_cache_tree);
3845 spin_unlock(&root->fs_info->block_group_cache_lock);
3846
3847 ret = btrfs_remove_free_space(shrink_block_group, key.objectid,
3848 key.offset);
3849 if (ret) {
3850 btrfs_end_transaction(trans, root);
3851 goto out;
3852 }
3853 /*
3854 memset(shrink_block_group, 0, sizeof(*shrink_block_group));
3855 kfree(shrink_block_group);
3856 */
3857
3858 btrfs_del_item(trans, root, path);
3859 btrfs_release_path(root, path);
3860 mutex_unlock(&root->fs_info->alloc_mutex);
3861 btrfs_commit_transaction(trans, root);
3862
3863 mutex_lock(&root->fs_info->alloc_mutex);
3864
3865 /* the code to unpin extents might set a few bits in the free
3866 * space cache for this range again
3867 */
3868 /* XXX? */
3869 ret = btrfs_remove_free_space(shrink_block_group, key.objectid,
3870 key.offset);
3871out:
3872 btrfs_free_path(path);
3873 mutex_unlock(&root->fs_info->alloc_mutex);
3874 return ret;
3875}
3876
3877int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
3878 struct btrfs_key *key)
3879{
3880 int ret = 0;
3881 struct btrfs_key found_key;
3882 struct extent_buffer *leaf;
3883 int slot;
3884
3885 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
3886 if (ret < 0)
3887 goto out;
3888
3889	while (1) {
3890 slot = path->slots[0];
3891 leaf = path->nodes[0];
3892 if (slot >= btrfs_header_nritems(leaf)) {
3893 ret = btrfs_next_leaf(root, path);
3894 if (ret == 0)
3895 continue;
3896 if (ret < 0)
3897 goto out;
3898 break;
3899 }
3900 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3901
3902 if (found_key.objectid >= key->objectid &&
3903 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
3904 ret = 0;
3905 goto out;
3906 }
3907 path->slots[0]++;
3908 }
3909 ret = -ENOENT;
3910out:
3911 return ret;
3912}
3913
3914int btrfs_read_block_groups(struct btrfs_root *root)
3915{
3916 struct btrfs_path *path;
3917 int ret;
3918 struct btrfs_block_group_cache *cache;
3919 struct btrfs_fs_info *info = root->fs_info;
3920 struct btrfs_space_info *space_info;
3921 struct btrfs_key key;
3922 struct btrfs_key found_key;
3923 struct extent_buffer *leaf;
3924
3925 root = info->extent_root;
3926 key.objectid = 0;
3927 key.offset = 0;
3928 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
3929 path = btrfs_alloc_path();
3930 if (!path)
3931 return -ENOMEM;
3932
3933 mutex_lock(&root->fs_info->alloc_mutex);
3934	while (1) {
3935 ret = find_first_block_group(root, path, &key);
3936 if (ret > 0) {
3937 ret = 0;
3938 goto error;
3939 }
3940 if (ret != 0)
3941 goto error;
3942
3943 leaf = path->nodes[0];
3944 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3945 cache = kzalloc(sizeof(*cache), GFP_NOFS);
3946 if (!cache) {
3947 ret = -ENOMEM;
3948 break;
3949 }
3950
3951 spin_lock_init(&cache->lock);
3952 INIT_LIST_HEAD(&cache->list);
3953 read_extent_buffer(leaf, &cache->item,
3954 btrfs_item_ptr_offset(leaf, path->slots[0]),
3955 sizeof(cache->item));
3956 memcpy(&cache->key, &found_key, sizeof(found_key));
3957
3958 key.objectid = found_key.objectid + found_key.offset;
3959 btrfs_release_path(root, path);
3960 cache->flags = btrfs_block_group_flags(&cache->item);
3961
3962 ret = update_space_info(info, cache->flags, found_key.offset,
3963 btrfs_block_group_used(&cache->item),
3964 &space_info);
3965 BUG_ON(ret);
3966 cache->space_info = space_info;
3967 spin_lock(&space_info->lock);
3968 list_add(&cache->list, &space_info->block_groups);
3969 spin_unlock(&space_info->lock);
3970
3971 ret = btrfs_add_block_group_cache(root->fs_info, cache);
3972 BUG_ON(ret);
3973
3974 if (key.objectid >=
3975 btrfs_super_total_bytes(&info->super_copy))
3976 break;
3977 }
3978 ret = 0;
3979error:
3980 btrfs_free_path(path);
3981 mutex_unlock(&root->fs_info->alloc_mutex);
3982 return ret;
3983}
3984
3985int btrfs_make_block_group(struct btrfs_trans_handle *trans,
3986 struct btrfs_root *root, u64 bytes_used,
3987 u64 type, u64 chunk_objectid, u64 chunk_offset,
3988 u64 size)
3989{
3990 int ret;
3991 struct btrfs_root *extent_root;
3992 struct btrfs_block_group_cache *cache;
3993
3994 WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
3995 extent_root = root->fs_info->extent_root;
3996
3997 root->fs_info->last_trans_new_blockgroup = trans->transid;
3998
3999 cache = kzalloc(sizeof(*cache), GFP_NOFS);
4000 if (!cache)
4001 return -ENOMEM;
4002
4003 cache->key.objectid = chunk_offset;
4004 cache->key.offset = size;
4005 spin_lock_init(&cache->lock);
4006 INIT_LIST_HEAD(&cache->list);
4007 btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
4008
4009 btrfs_set_block_group_used(&cache->item, bytes_used);
4010 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
4011 cache->flags = type;
4012 btrfs_set_block_group_flags(&cache->item, type);
4013
4014 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
4015 &cache->space_info);
4016 BUG_ON(ret);
4017 spin_lock(&cache->space_info->lock);
4018 list_add(&cache->list, &cache->space_info->block_groups);
4019 spin_unlock(&cache->space_info->lock);
4020
4021 ret = btrfs_add_block_group_cache(root->fs_info, cache);
4022 BUG_ON(ret);
4023
4024 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
4025 sizeof(cache->item));
4026 BUG_ON(ret);
4027
4028 finish_current_insert(trans, extent_root);
4029 ret = del_pending_extents(trans, extent_root);
4030 BUG_ON(ret);
4031 set_avail_alloc_bits(extent_root->fs_info, type);
4032
4033 return 0;
4034}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 000000000000..58ad25838a41
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,3441 @@
1#include <linux/bitops.h>
2#include <linux/slab.h>
3#include <linux/bio.h>
4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h>
7#include <linux/page-flags.h>
8#include <linux/module.h>
9#include <linux/spinlock.h>
10#include <linux/blkdev.h>
11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h>
14#include <linux/pagevec.h>
15#include "extent_io.h"
16#include "extent_map.h"
17#include "compat.h"
18#include "ctree.h"
19#include "btrfs_inode.h"
20
21/* temporary define until extent_map moves out of btrfs */
22struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
23 unsigned long extra_flags,
24 void (*ctor)(void *, struct kmem_cache *,
25 unsigned long));
26
27static struct kmem_cache *extent_state_cache;
28static struct kmem_cache *extent_buffer_cache;
29
30static LIST_HEAD(buffers);
31static LIST_HEAD(states);
32
33#ifdef LEAK_DEBUG
34static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
35#endif
36
37#define BUFFER_LRU_MAX 64
38
39struct tree_entry {
40 u64 start;
41 u64 end;
42 struct rb_node rb_node;
43};
44
45struct extent_page_data {
46 struct bio *bio;
47 struct extent_io_tree *tree;
48 get_extent_t *get_extent;
49};
50
51int __init extent_io_init(void)
52{
53 extent_state_cache = btrfs_cache_create("extent_state",
54 sizeof(struct extent_state), 0,
55 NULL);
56 if (!extent_state_cache)
57 return -ENOMEM;
58
59 extent_buffer_cache = btrfs_cache_create("extent_buffers",
60 sizeof(struct extent_buffer), 0,
61 NULL);
62 if (!extent_buffer_cache)
63 goto free_state_cache;
64 return 0;
65
66free_state_cache:
67 kmem_cache_destroy(extent_state_cache);
68 return -ENOMEM;
69}
70
71void extent_io_exit(void)
72{
73 struct extent_state *state;
74 struct extent_buffer *eb;
75
76 while (!list_empty(&states)) {
77 state = list_entry(states.next, struct extent_state, leak_list);
78 printk("state leak: start %Lu end %Lu state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs));
79 list_del(&state->leak_list);
80 kmem_cache_free(extent_state_cache, state);
81
82 }
83
84 while (!list_empty(&buffers)) {
85 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
86 printk("buffer leak start %Lu len %lu refs %d\n", eb->start, eb->len, atomic_read(&eb->refs));
87 list_del(&eb->leak_list);
88 kmem_cache_free(extent_buffer_cache, eb);
89 }
90 if (extent_state_cache)
91 kmem_cache_destroy(extent_state_cache);
92 if (extent_buffer_cache)
93 kmem_cache_destroy(extent_buffer_cache);
94}
95
96void extent_io_tree_init(struct extent_io_tree *tree,
97 struct address_space *mapping, gfp_t mask)
98{
99 tree->state.rb_node = NULL;
100 tree->buffer.rb_node = NULL;
101 tree->ops = NULL;
102 tree->dirty_bytes = 0;
103 spin_lock_init(&tree->lock);
104 spin_lock_init(&tree->buffer_lock);
105 tree->mapping = mapping;
106}
107EXPORT_SYMBOL(extent_io_tree_init);
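/*
 * Usage sketch (illustration only, not in the original source):
 * callers hand extent_io_tree_init() the address_space the tree will
 * shadow, the way btrfs wires up an inode's io_tree.
 */
static void example_tree_setup(struct extent_io_tree *tree,
			       struct inode *inode)
{
	extent_io_tree_init(tree, inode->i_mapping, GFP_NOFS);
}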
108
109struct extent_state *alloc_extent_state(gfp_t mask)
110{
111 struct extent_state *state;
112#ifdef LEAK_DEBUG
113 unsigned long flags;
114#endif
115
116 state = kmem_cache_alloc(extent_state_cache, mask);
117 if (!state)
118 return state;
119 state->state = 0;
120 state->private = 0;
121 state->tree = NULL;
122#ifdef LEAK_DEBUG
123 spin_lock_irqsave(&leak_lock, flags);
124 list_add(&state->leak_list, &states);
125 spin_unlock_irqrestore(&leak_lock, flags);
126#endif
127 atomic_set(&state->refs, 1);
128 init_waitqueue_head(&state->wq);
129 return state;
130}
131EXPORT_SYMBOL(alloc_extent_state);
132
133void free_extent_state(struct extent_state *state)
134{
135 if (!state)
136 return;
137 if (atomic_dec_and_test(&state->refs)) {
138#ifdef LEAK_DEBUG
139 unsigned long flags;
140#endif
141 WARN_ON(state->tree);
142#ifdef LEAK_DEBUG
143 spin_lock_irqsave(&leak_lock, flags);
144 list_del(&state->leak_list);
145 spin_unlock_irqrestore(&leak_lock, flags);
146#endif
147 kmem_cache_free(extent_state_cache, state);
148 }
149}
150EXPORT_SYMBOL(free_extent_state);
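/*
 * Refcounting sketch for the two helpers above (illustration only,
 * not in the original source): alloc_extent_state() hands back a
 * state with refs == 1, and free_extent_state() only returns it to
 * the cache once the last reference is dropped.
 */
static void example_state_refs(void)
{
	struct extent_state *state = alloc_extent_state(GFP_NOFS);

	if (!state)
		return;
	atomic_inc(&state->refs);	/* take a second reference */
	free_extent_state(state);	/* refs 2 -> 1, nothing freed */
	free_extent_state(state);	/* refs 1 -> 0, returned to cache */
}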
151
152static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
153 struct rb_node *node)
154{
155	struct rb_node **p = &root->rb_node;
156	struct rb_node *parent = NULL;
157	struct tree_entry *entry;
158
159	while (*p) {
160 parent = *p;
161 entry = rb_entry(parent, struct tree_entry, rb_node);
162
163 if (offset < entry->start)
164 p = &(*p)->rb_left;
165 else if (offset > entry->end)
166 p = &(*p)->rb_right;
167 else
168 return parent;
169 }
170
171 entry = rb_entry(node, struct tree_entry, rb_node);
172 rb_link_node(node, parent, p);
173 rb_insert_color(node, root);
174 return NULL;
175}
176
177static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
178 struct rb_node **prev_ret,
179 struct rb_node **next_ret)
180{
181 struct rb_root *root = &tree->state;
182	struct rb_node *n = root->rb_node;
183 struct rb_node *prev = NULL;
184 struct rb_node *orig_prev = NULL;
185 struct tree_entry *entry;
186 struct tree_entry *prev_entry = NULL;
187
188	while (n) {
189 entry = rb_entry(n, struct tree_entry, rb_node);
190 prev = n;
191 prev_entry = entry;
192
193 if (offset < entry->start)
194 n = n->rb_left;
195 else if (offset > entry->end)
196 n = n->rb_right;
197 else {
198 return n;
199 }
200 }
201
202 if (prev_ret) {
203 orig_prev = prev;
204		while (prev && offset > prev_entry->end) {
205 prev = rb_next(prev);
206 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
207 }
208 *prev_ret = prev;
209 prev = orig_prev;
210 }
211
212 if (next_ret) {
213 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
214		while (prev && offset < prev_entry->start) {
215 prev = rb_prev(prev);
216 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
217 }
218 *next_ret = prev;
219 }
220 return NULL;
221}
222
223static inline struct rb_node *tree_search(struct extent_io_tree *tree,
224 u64 offset)
225{
226 struct rb_node *prev = NULL;
227 struct rb_node *ret;
228
229 ret = __etree_search(tree, offset, &prev, NULL);
230 if (!ret) {
231 return prev;
232 }
233 return ret;
234}
235
236static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
237 u64 offset, struct rb_node *node)
238{
239 struct rb_root *root = &tree->buffer;
240	struct rb_node **p = &root->rb_node;
241	struct rb_node *parent = NULL;
242 struct extent_buffer *eb;
243
244	while (*p) {
245 parent = *p;
246 eb = rb_entry(parent, struct extent_buffer, rb_node);
247
248 if (offset < eb->start)
249 p = &(*p)->rb_left;
250 else if (offset > eb->start)
251 p = &(*p)->rb_right;
252 else
253 return eb;
254 }
255
256 rb_link_node(node, parent, p);
257 rb_insert_color(node, root);
258 return NULL;
259}
260
261static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
262 u64 offset)
263{
264 struct rb_root *root = &tree->buffer;
265	struct rb_node *n = root->rb_node;
266 struct extent_buffer *eb;
267
268	while (n) {
269 eb = rb_entry(n, struct extent_buffer, rb_node);
270 if (offset < eb->start)
271 n = n->rb_left;
272 else if (offset > eb->start)
273 n = n->rb_right;
274 else
275 return eb;
276 }
277 return NULL;
278}
279
280/*
281 * utility function to look for merge candidates inside a given range.
282 * Any extents with matching state are merged together into a single
283 * extent in the tree.  Extents with EXTENT_IOBITS in their state field
284 * are not merged because the end_io handlers need to be able to do
285 * operations on them without sleeping (or doing allocations/splits).
286 *
287 * This should be called with the tree lock held.
288 */
289static int merge_state(struct extent_io_tree *tree,
290 struct extent_state *state)
291{
292 struct extent_state *other;
293 struct rb_node *other_node;
294
295 if (state->state & EXTENT_IOBITS)
296 return 0;
297
298 other_node = rb_prev(&state->rb_node);
299 if (other_node) {
300 other = rb_entry(other_node, struct extent_state, rb_node);
301 if (other->end == state->start - 1 &&
302 other->state == state->state) {
303 state->start = other->start;
304 other->tree = NULL;
305 rb_erase(&other->rb_node, &tree->state);
306 free_extent_state(other);
307 }
308 }
309 other_node = rb_next(&state->rb_node);
310 if (other_node) {
311 other = rb_entry(other_node, struct extent_state, rb_node);
312 if (other->start == state->end + 1 &&
313 other->state == state->state) {
314 other->start = state->start;
315 state->tree = NULL;
316 rb_erase(&state->rb_node, &tree->state);
317 free_extent_state(state);
318 }
319 }
320 return 0;
321}
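/*
 * Worked example for merge_state() above (illustration only, not in
 * the original source): if the tree holds [0, 4095] and [4096, 8191]
 * with identical state bits and neither has EXTENT_IOBITS set, the
 * two records collapse into a single [0, 8191] extent_state and the
 * redundant one is freed.
 */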
322
323static void set_state_cb(struct extent_io_tree *tree,
324 struct extent_state *state,
325 unsigned long bits)
326{
327 if (tree->ops && tree->ops->set_bit_hook) {
328 tree->ops->set_bit_hook(tree->mapping->host, state->start,
329 state->end, state->state, bits);
330 }
331}
332
333static void clear_state_cb(struct extent_io_tree *tree,
334 struct extent_state *state,
335 unsigned long bits)
336{
337	if (tree->ops && tree->ops->clear_bit_hook) {
338 tree->ops->clear_bit_hook(tree->mapping->host, state->start,
339 state->end, state->state, bits);
340 }
341}
342
343/*
344 * insert an extent_state struct into the tree. 'bits' are set on the
345 * struct before it is inserted.
346 *
347 * This may return -EEXIST if the extent is already there, in which case the
348 * state struct is freed.
349 *
350 * The tree lock is not taken internally. This is a utility function and
351 * probably isn't what you want to call (see set/clear_extent_bit).
352 */
353static int insert_state(struct extent_io_tree *tree,
354 struct extent_state *state, u64 start, u64 end,
355 int bits)
356{
357 struct rb_node *node;
358
359 if (end < start) {
360 printk("end < start %Lu %Lu\n", end, start);
361 WARN_ON(1);
362 }
363 if (bits & EXTENT_DIRTY)
364 tree->dirty_bytes += end - start + 1;
365 set_state_cb(tree, state, bits);
366 state->state |= bits;
367 state->start = start;
368 state->end = end;
369 node = tree_insert(&tree->state, end, &state->rb_node);
370 if (node) {
371 struct extent_state *found;
372 found = rb_entry(node, struct extent_state, rb_node);
373 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
374 free_extent_state(state);
375 return -EEXIST;
376 }
377 state->tree = tree;
378 merge_state(tree, state);
379 return 0;
380}
381
382/*
383 * split a given extent state struct in two, inserting the preallocated
384 * struct 'prealloc' as the newly created second half. 'split' indicates an
385 * offset inside 'orig' where it should be split.
386 *
387 * Before calling, the tree has 'orig' at [orig->start, orig->end].
388 * After calling, there are two extent state structs in the tree:
389 *
390 * prealloc: [orig->start, split - 1]
391 * orig: [ split, orig->end ]
392 *
393 * The tree locks are not taken by this function. They need to be held
394 * by the caller.
395 */
396static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
397 struct extent_state *prealloc, u64 split)
398{
399 struct rb_node *node;
400 prealloc->start = orig->start;
401 prealloc->end = split - 1;
402 prealloc->state = orig->state;
403 orig->start = split;
404
405 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
406 if (node) {
407 struct extent_state *found;
408 found = rb_entry(node, struct extent_state, rb_node);
409 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
410 free_extent_state(prealloc);
411 return -EEXIST;
412 }
413 prealloc->tree = tree;
414 return 0;
415}
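/*
 * Worked example for split_state() above (illustration only, not in
 * the original source): with orig covering [0, 8191] and split ==
 * 4096, the tree afterwards holds prealloc as [0, 4095] and orig as
 * [4096, 8191], both carrying orig's original state bits.
 */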
416
417/*
418 * utility function to clear some bits in an extent state struct.
419 * it will optionally wake up any one waiting on this state (wake == 1), or
420 * forcibly remove the state from the tree (delete == 1).
421 *
422 * If no bits are set on the state struct after clearing things, the
423 * struct is freed and removed from the tree
424 */
425static int clear_state_bit(struct extent_io_tree *tree,
426 struct extent_state *state, int bits, int wake,
427 int delete)
428{
429 int ret = state->state & bits;
430
431 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
432 u64 range = state->end - state->start + 1;
433 WARN_ON(range > tree->dirty_bytes);
434 tree->dirty_bytes -= range;
435 }
436 clear_state_cb(tree, state, bits);
437 state->state &= ~bits;
438 if (wake)
439 wake_up(&state->wq);
440 if (delete || state->state == 0) {
441 if (state->tree) {
442 clear_state_cb(tree, state, state->state);
443 rb_erase(&state->rb_node, &tree->state);
444 state->tree = NULL;
445 free_extent_state(state);
446 } else {
447 WARN_ON(1);
448 }
449 } else {
450 merge_state(tree, state);
451 }
452 return ret;
453}
454
455/*
456 * clear some bits on a range in the tree. This may require splitting
457 * or inserting elements in the tree, so the gfp mask is used to
458 * indicate which allocations or sleeping are allowed.
459 *
460 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
461 * the given range from the tree regardless of state (ie for truncate).
462 *
463 * the range [start, end] is inclusive.
464 *
465 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
466 * bits were already set, or zero if none of the bits were already set.
467 */
468int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
469 int bits, int wake, int delete, gfp_t mask)
470{
471 struct extent_state *state;
472 struct extent_state *prealloc = NULL;
473 struct rb_node *node;
474 unsigned long flags;
475 int err;
476 int set = 0;
477
478again:
479 if (!prealloc && (mask & __GFP_WAIT)) {
480 prealloc = alloc_extent_state(mask);
481 if (!prealloc)
482 return -ENOMEM;
483 }
484
485 spin_lock_irqsave(&tree->lock, flags);
486 /*
487 * this search will find the extents that end after
488 * our range starts
489 */
490 node = tree_search(tree, start);
491 if (!node)
492 goto out;
493 state = rb_entry(node, struct extent_state, rb_node);
494 if (state->start > end)
495 goto out;
496 WARN_ON(state->end < start);
497
498 /*
499 * | ---- desired range ---- |
500 * | state | or
501 * | ------------- state -------------- |
502 *
503 * We need to split the extent we found, and may flip
504 * bits on second half.
505 *
506 * If the extent we found extends past our range, we
507 * just split and search again. It'll get split again
508 * the next time though.
509 *
510 * If the extent we found is inside our range, we clear
511 * the desired bit on it.
512 */
513
514 if (state->start < start) {
515 if (!prealloc)
516 prealloc = alloc_extent_state(GFP_ATOMIC);
517 err = split_state(tree, state, prealloc, start);
518 BUG_ON(err == -EEXIST);
519 prealloc = NULL;
520 if (err)
521 goto out;
522 if (state->end <= end) {
523 start = state->end + 1;
524 set |= clear_state_bit(tree, state, bits,
525 wake, delete);
526 } else {
527 start = state->start;
528 }
529 goto search_again;
530 }
531 /*
532 * | ---- desired range ---- |
533 * | state |
534 * We need to split the extent, and clear the bit
535 * on the first half
536 */
537 if (state->start <= end && state->end > end) {
538 if (!prealloc)
539 prealloc = alloc_extent_state(GFP_ATOMIC);
540 err = split_state(tree, state, prealloc, end + 1);
541 BUG_ON(err == -EEXIST);
542
543 if (wake)
544 wake_up(&state->wq);
545 set |= clear_state_bit(tree, prealloc, bits,
546 wake, delete);
547 prealloc = NULL;
548 goto out;
549 }
550
551 start = state->end + 1;
552 set |= clear_state_bit(tree, state, bits, wake, delete);
553 goto search_again;
554
555out:
556 spin_unlock_irqrestore(&tree->lock, flags);
557 if (prealloc)
558 free_extent_state(prealloc);
559
560 return set;
561
562search_again:
563 if (start > end)
564 goto out;
565 spin_unlock_irqrestore(&tree->lock, flags);
566 if (mask & __GFP_WAIT)
567 cond_resched();
568 goto again;
569}
570EXPORT_SYMBOL(clear_extent_bit);
571
572static int wait_on_state(struct extent_io_tree *tree,
573 struct extent_state *state)
574{
575 DEFINE_WAIT(wait);
576 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
577 spin_unlock_irq(&tree->lock);
578 schedule();
579 spin_lock_irq(&tree->lock);
580 finish_wait(&state->wq, &wait);
581 return 0;
582}
583
584/*
585 * waits for one or more bits to clear on a range in the state tree.
586 * The range [start, end] is inclusive.
587 * The tree lock is taken by this function
588 */
589int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
590{
591 struct extent_state *state;
592 struct rb_node *node;
593
594 spin_lock_irq(&tree->lock);
595again:
596 while (1) {
597 /*
598 * this search will find all the extents that end after
599 * our range starts
600 */
601 node = tree_search(tree, start);
602 if (!node)
603 break;
604
605 state = rb_entry(node, struct extent_state, rb_node);
606
607 if (state->start > end)
608 goto out;
609
610 if (state->state & bits) {
611 start = state->start;
612 atomic_inc(&state->refs);
613 wait_on_state(tree, state);
614 free_extent_state(state);
615 goto again;
616 }
617 start = state->end + 1;
618
619 if (start > end)
620 break;
621
622 if (need_resched()) {
623 spin_unlock_irq(&tree->lock);
624 cond_resched();
625 spin_lock_irq(&tree->lock);
626 }
627 }
628out:
629 spin_unlock_irq(&tree->lock);
630 return 0;
631}
632EXPORT_SYMBOL(wait_extent_bit);
633
634static void set_state_bits(struct extent_io_tree *tree,
635 struct extent_state *state,
636 int bits)
637{
638 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
639 u64 range = state->end - state->start + 1;
640 tree->dirty_bytes += range;
641 }
642 set_state_cb(tree, state, bits);
643 state->state |= bits;
644}
645
646/*
647 * set some bits on a range in the tree. This may require allocations
648 * or sleeping, so the gfp mask is used to indicate what is allowed.
649 *
650 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
651 * range already has the desired bits set. The start of the existing
652 * range is returned in failed_start in this case.
653 *
654 * [start, end] is inclusive
655 * This takes the tree lock.
656 */
657int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
658 int exclusive, u64 *failed_start, gfp_t mask)
659{
660 struct extent_state *state;
661 struct extent_state *prealloc = NULL;
662 struct rb_node *node;
663 unsigned long flags;
664 int err = 0;
665 int set;
666 u64 last_start;
667 u64 last_end;
668again:
669 if (!prealloc && (mask & __GFP_WAIT)) {
670 prealloc = alloc_extent_state(mask);
671 if (!prealloc)
672 return -ENOMEM;
673 }
674
675 spin_lock_irqsave(&tree->lock, flags);
676 /*
677 * this search will find all the extents that end after
678 * our range starts.
679 */
680 node = tree_search(tree, start);
681 if (!node) {
682 err = insert_state(tree, prealloc, start, end, bits);
683 prealloc = NULL;
684 BUG_ON(err == -EEXIST);
685 goto out;
686 }
687
688 state = rb_entry(node, struct extent_state, rb_node);
689 last_start = state->start;
690 last_end = state->end;
691
692 /*
693 * | ---- desired range ---- |
694 * | state |
695 *
696 * Just lock what we found and keep going
697 */
698 if (state->start == start && state->end <= end) {
699 set = state->state & bits;
700 if (set && exclusive) {
701 *failed_start = state->start;
702 err = -EEXIST;
703 goto out;
704 }
705 set_state_bits(tree, state, bits);
706 start = state->end + 1;
707 merge_state(tree, state);
708 goto search_again;
709 }
710
711 /*
712 * | ---- desired range ---- |
713 * | state |
714 * or
715 * | ------------- state -------------- |
716 *
717 * We need to split the extent we found, and may flip bits on
718 * second half.
719 *
720 * If the extent we found extends past our
721 * range, we just split and search again. It'll get split
722 * again the next time though.
723 *
724 * If the extent we found is inside our range, we set the
725 * desired bit on it.
726 */
727 if (state->start < start) {
728 set = state->state & bits;
729 if (exclusive && set) {
730 *failed_start = start;
731 err = -EEXIST;
732 goto out;
733 }
734 err = split_state(tree, state, prealloc, start);
735 BUG_ON(err == -EEXIST);
736 prealloc = NULL;
737 if (err)
738 goto out;
739 if (state->end <= end) {
740 set_state_bits(tree, state, bits);
741 start = state->end + 1;
742 merge_state(tree, state);
743 } else {
744 start = state->start;
745 }
746 goto search_again;
747 }
748 /*
749 * | ---- desired range ---- |
750 * | state | or | state |
751 *
752 * There's a hole, we need to insert something in it and
753 * ignore the extent we found.
754 */
755 if (state->start > start) {
756 u64 this_end;
757 if (end < last_start)
758 this_end = end;
759 else
760			this_end = last_start - 1;
761 err = insert_state(tree, prealloc, start, this_end,
762 bits);
763 prealloc = NULL;
764 BUG_ON(err == -EEXIST);
765 if (err)
766 goto out;
767 start = this_end + 1;
768 goto search_again;
769 }
770 /*
771 * | ---- desired range ---- |
772 * | state |
773 * We need to split the extent, and set the bit
774 * on the first half
775 */
776 if (state->start <= end && state->end > end) {
777 set = state->state & bits;
778 if (exclusive && set) {
779 *failed_start = start;
780 err = -EEXIST;
781 goto out;
782 }
783 err = split_state(tree, state, prealloc, end + 1);
784 BUG_ON(err == -EEXIST);
785
786 set_state_bits(tree, prealloc, bits);
787 merge_state(tree, prealloc);
788 prealloc = NULL;
789 goto out;
790 }
791
792 goto search_again;
793
794out:
795 spin_unlock_irqrestore(&tree->lock, flags);
796 if (prealloc)
797 free_extent_state(prealloc);
798
799 return err;
800
801search_again:
802 if (start > end)
803 goto out;
804 spin_unlock_irqrestore(&tree->lock, flags);
805 if (mask & __GFP_WAIT)
806 cond_resched();
807 goto again;
808}
809EXPORT_SYMBOL(set_extent_bit);
810
811/* wrappers around set/clear extent bit */
812int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
813 gfp_t mask)
814{
815 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
816 mask);
817}
818EXPORT_SYMBOL(set_extent_dirty);
819
820int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
821 gfp_t mask)
822{
823 return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
824}
825EXPORT_SYMBOL(set_extent_ordered);
826
827int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
828 int bits, gfp_t mask)
829{
830 return set_extent_bit(tree, start, end, bits, 0, NULL,
831 mask);
832}
833EXPORT_SYMBOL(set_extent_bits);
834
835int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
836 int bits, gfp_t mask)
837{
838 return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
839}
840EXPORT_SYMBOL(clear_extent_bits);
841
842int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
843 gfp_t mask)
844{
845 return set_extent_bit(tree, start, end,
846 EXTENT_DELALLOC | EXTENT_DIRTY,
847 0, NULL, mask);
848}
849EXPORT_SYMBOL(set_extent_delalloc);
850
851int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
852 gfp_t mask)
853{
854 return clear_extent_bit(tree, start, end,
855 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
856}
857EXPORT_SYMBOL(clear_extent_dirty);
858
859int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
860 gfp_t mask)
861{
862 return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
863}
864EXPORT_SYMBOL(clear_extent_ordered);
865
866int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
867 gfp_t mask)
868{
869 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
870 mask);
871}
872EXPORT_SYMBOL(set_extent_new);
873
874int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
875 gfp_t mask)
876{
877 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
878}
879EXPORT_SYMBOL(clear_extent_new);
880
881int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
882 gfp_t mask)
883{
884 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
885 mask);
886}
887EXPORT_SYMBOL(set_extent_uptodate);
888
889int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
890 gfp_t mask)
891{
892 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
893}
894EXPORT_SYMBOL(clear_extent_uptodate);
895
896int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
897 gfp_t mask)
898{
899 return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
900 0, NULL, mask);
901}
902EXPORT_SYMBOL(set_extent_writeback);
903
904int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
905 gfp_t mask)
906{
907 return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
908}
909EXPORT_SYMBOL(clear_extent_writeback);
910
911int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
912{
913 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
914}
915EXPORT_SYMBOL(wait_on_extent_writeback);
916
917int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
918{
919 int err;
920 u64 failed_start;
921 while (1) {
922 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
923 &failed_start, mask);
924 if (err == -EEXIST && (mask & __GFP_WAIT)) {
925 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
926 start = failed_start;
927 } else {
928 break;
929 }
930 WARN_ON(start > end);
931 }
932 return err;
933}
934EXPORT_SYMBOL(lock_extent);
935
936int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
937 gfp_t mask)
938{
939 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
940}
941EXPORT_SYMBOL(unlock_extent);
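/*
 * Usage sketch for the lock_extent()/unlock_extent() pair above
 * (illustration only, not in the original source), locking the byte
 * range that backs a single page the way relocate_inode_pages() does.
 */
static void example_lock_one_page(struct extent_io_tree *tree,
				  struct page *page)
{
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;

	lock_extent(tree, start, end, GFP_NOFS);
	/* ... the range is now exclusively held ... */
	unlock_extent(tree, start, end, GFP_NOFS);
}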
942
943/*
944 * helper function to set pages and extents in the tree dirty
945 */
946int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
947{
948 unsigned long index = start >> PAGE_CACHE_SHIFT;
949 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
950 struct page *page;
951
952 while (index <= end_index) {
953 page = find_get_page(tree->mapping, index);
954 BUG_ON(!page);
955 __set_page_dirty_nobuffers(page);
956 page_cache_release(page);
957 index++;
958 }
959 set_extent_dirty(tree, start, end, GFP_NOFS);
960 return 0;
961}
962EXPORT_SYMBOL(set_range_dirty);
963
964/*
965 * helper function to set both pages and extents in the tree writeback
966 */
967int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
968{
969 unsigned long index = start >> PAGE_CACHE_SHIFT;
970 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
971 struct page *page;
972
973 while (index <= end_index) {
974 page = find_get_page(tree->mapping, index);
975 BUG_ON(!page);
976 set_page_writeback(page);
977 page_cache_release(page);
978 index++;
979 }
980 set_extent_writeback(tree, start, end, GFP_NOFS);
981 return 0;
982}
983EXPORT_SYMBOL(set_range_writeback);
984
985int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
986 u64 *start_ret, u64 *end_ret, int bits)
987{
988 struct rb_node *node;
989 struct extent_state *state;
990 int ret = 1;
991
992 spin_lock_irq(&tree->lock);
993 /*
994 * this search will find all the extents that end after
995 * our range starts.
996 */
997 node = tree_search(tree, start);
998 if (!node) {
999 goto out;
1000 }
1001
1002	while (1) {
1003 state = rb_entry(node, struct extent_state, rb_node);
1004 if (state->end >= start && (state->state & bits)) {
1005 *start_ret = state->start;
1006 *end_ret = state->end;
1007 ret = 0;
1008 break;
1009 }
1010 node = rb_next(node);
1011 if (!node)
1012 break;
1013 }
1014out:
1015 spin_unlock_irq(&tree->lock);
1016 return ret;
1017}
1018EXPORT_SYMBOL(find_first_extent_bit);
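/*
 * Iteration sketch (illustration only, not in the original source):
 * find_first_extent_bit() returns 0 and fills *start_ret/*end_ret when
 * a matching range exists, so walking every dirty range looks like:
 */
static void example_walk_dirty(struct extent_io_tree *tree)
{
	u64 start = 0;
	u64 found_start;
	u64 found_end;

	while (find_first_extent_bit(tree, start, &found_start,
				     &found_end, EXTENT_DIRTY) == 0) {
		/* ... process [found_start, found_end] ... */
		start = found_end + 1;
	}
}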
1019
1020struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1021 u64 start, int bits)
1022{
1023 struct rb_node *node;
1024 struct extent_state *state;
1025
1026 /*
1027 * this search will find all the extents that end after
1028 * our range starts.
1029 */
1030 node = tree_search(tree, start);
1031 if (!node) {
1032 goto out;
1033 }
1034
1035	while (1) {
1036 state = rb_entry(node, struct extent_state, rb_node);
1037 if (state->end >= start && (state->state & bits)) {
1038 return state;
1039 }
1040 node = rb_next(node);
1041 if (!node)
1042 break;
1043 }
1044out:
1045 return NULL;
1046}
1047EXPORT_SYMBOL(find_first_extent_bit_state);
1048
1049u64 find_lock_delalloc_range(struct extent_io_tree *tree,
1050 u64 *start, u64 *end, u64 max_bytes)
1051{
1052 struct rb_node *node;
1053 struct extent_state *state;
1054 u64 cur_start = *start;
1055 u64 found = 0;
1056 u64 total_bytes = 0;
1057
1058 spin_lock_irq(&tree->lock);
1059 /*
1060 * this search will find all the extents that end after
1061 * our range starts.
1062 */
1063search_again:
1064 node = tree_search(tree, cur_start);
1065 if (!node) {
1066 if (!found)
1067 *end = (u64)-1;
1068 goto out;
1069 }
1070
1071	while (1) {
1072 state = rb_entry(node, struct extent_state, rb_node);
1073 if (found && state->start != cur_start) {
1074 goto out;
1075 }
1076 if (!(state->state & EXTENT_DELALLOC)) {
1077 if (!found)
1078 *end = state->end;
1079 goto out;
1080 }
1081 if (!found) {
1082 struct extent_state *prev_state;
1083 struct rb_node *prev_node = node;
1084			while (1) {
1085 prev_node = rb_prev(prev_node);
1086 if (!prev_node)
1087 break;
1088 prev_state = rb_entry(prev_node,
1089 struct extent_state,
1090 rb_node);
1091 if (!(prev_state->state & EXTENT_DELALLOC))
1092 break;
1093 state = prev_state;
1094 node = prev_node;
1095 }
1096 }
1097 if (state->state & EXTENT_LOCKED) {
1098 DEFINE_WAIT(wait);
1099 atomic_inc(&state->refs);
1100 prepare_to_wait(&state->wq, &wait,
1101 TASK_UNINTERRUPTIBLE);
1102 spin_unlock_irq(&tree->lock);
1103 schedule();
1104 spin_lock_irq(&tree->lock);
1105 finish_wait(&state->wq, &wait);
1106 free_extent_state(state);
1107 goto search_again;
1108 }
1109 set_state_cb(tree, state, EXTENT_LOCKED);
1110 state->state |= EXTENT_LOCKED;
1111 if (!found)
1112 *start = state->start;
1113 found++;
1114 *end = state->end;
1115 cur_start = state->end + 1;
1116 node = rb_next(node);
1117 if (!node)
1118 break;
1119 total_bytes += state->end - state->start + 1;
1120 if (total_bytes >= max_bytes)
1121 break;
1122 }
1123out:
1124 spin_unlock_irq(&tree->lock);
1125 return found;
1126}
1127
1128u64 count_range_bits(struct extent_io_tree *tree,
1129 u64 *start, u64 search_end, u64 max_bytes,
1130 unsigned long bits)
1131{
1132 struct rb_node *node;
1133 struct extent_state *state;
1134 u64 cur_start = *start;
1135 u64 total_bytes = 0;
1136 int found = 0;
1137
1138 if (search_end <= cur_start) {
1139 printk("search_end %Lu start %Lu\n", search_end, cur_start);
1140 WARN_ON(1);
1141 return 0;
1142 }
1143
1144 spin_lock_irq(&tree->lock);
1145 if (cur_start == 0 && bits == EXTENT_DIRTY) {
1146 total_bytes = tree->dirty_bytes;
1147 goto out;
1148 }
1149 /*
1150 * this search will find all the extents that end after
1151 * our range starts.
1152 */
1153 node = tree_search(tree, cur_start);
1154 if (!node) {
1155 goto out;
1156 }
1157
1158	while (1) {
1159 state = rb_entry(node, struct extent_state, rb_node);
1160 if (state->start > search_end)
1161 break;
1162 if (state->end >= cur_start && (state->state & bits)) {
1163 total_bytes += min(search_end, state->end) + 1 -
1164 max(cur_start, state->start);
1165 if (total_bytes >= max_bytes)
1166 break;
1167 if (!found) {
1168 *start = state->start;
1169 found = 1;
1170 }
1171 }
1172 node = rb_next(node);
1173 if (!node)
1174 break;
1175 }
1176out:
1177 spin_unlock_irq(&tree->lock);
1178 return total_bytes;
1179}
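/*
 * Usage sketch (illustration only, not in the original source): asking
 * for EXTENT_DIRTY over the whole tree from offset 0 hits the fast
 * path above and simply reports tree->dirty_bytes.
 */
static u64 example_count_dirty(struct extent_io_tree *tree)
{
	u64 start = 0;

	return count_range_bits(tree, &start, (u64)-1, (u64)-1,
				EXTENT_DIRTY);
}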
1180/*
1181 * helper function to lock both pages and extents in the tree.
1182 * pages must be locked first.
1183 */
1184int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1185{
1186 unsigned long index = start >> PAGE_CACHE_SHIFT;
1187 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1188 struct page *page;
1189 int err;
1190
1191 while (index <= end_index) {
1192 page = grab_cache_page(tree->mapping, index);
1193 if (!page) {
1194 err = -ENOMEM;
1195 goto failed;
1196 }
1197 if (IS_ERR(page)) {
1198 err = PTR_ERR(page);
1199 goto failed;
1200 }
1201 index++;
1202 }
1203 lock_extent(tree, start, end, GFP_NOFS);
1204 return 0;
1205
1206failed:
1207 /*
1208 * we failed above in getting the page at 'index', so we undo here
1209 * up to but not including the page at 'index'
1210 */
1211 end_index = index;
1212 index = start >> PAGE_CACHE_SHIFT;
1213 while (index < end_index) {
1214 page = find_get_page(tree->mapping, index);
1215 unlock_page(page);
1216 page_cache_release(page);
1217 index++;
1218 }
1219 return err;
1220}
1221EXPORT_SYMBOL(lock_range);
1222
1223/*
1224 * helper function to unlock both pages and extents in the tree.
1225 */
1226int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1227{
1228 unsigned long index = start >> PAGE_CACHE_SHIFT;
1229 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1230 struct page *page;
1231
1232 while (index <= end_index) {
1233 page = find_get_page(tree->mapping, index);
1234 unlock_page(page);
1235 page_cache_release(page);
1236 index++;
1237 }
1238 unlock_extent(tree, start, end, GFP_NOFS);
1239 return 0;
1240}
1241EXPORT_SYMBOL(unlock_range);
1242
1243int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1244{
1245 struct rb_node *node;
1246 struct extent_state *state;
1247 int ret = 0;
1248
1249 spin_lock_irq(&tree->lock);
1250 /*
1251 * this search will find all the extents that end after
1252 * our range starts.
1253 */
1254 node = tree_search(tree, start);
1255 if (!node) {
1256 ret = -ENOENT;
1257 goto out;
1258 }
1259 state = rb_entry(node, struct extent_state, rb_node);
1260 if (state->start != start) {
1261 ret = -ENOENT;
1262 goto out;
1263 }
1264 state->private = private;
1265out:
1266 spin_unlock_irq(&tree->lock);
1267 return ret;
1268}
1269
1270int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1271{
1272 struct rb_node *node;
1273 struct extent_state *state;
1274 int ret = 0;
1275
1276 spin_lock_irq(&tree->lock);
1277 /*
1278 * this search will find all the extents that end after
1279 * our range starts.
1280 */
1281 node = tree_search(tree, start);
1282 if (!node) {
1283 ret = -ENOENT;
1284 goto out;
1285 }
1286 state = rb_entry(node, struct extent_state, rb_node);
1287 if (state->start != start) {
1288 ret = -ENOENT;
1289 goto out;
1290 }
1291 *private = state->private;
1292out:
1293 spin_unlock_irq(&tree->lock);
1294 return ret;
1295}
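
btrfs uses these private words to stash small per-extent values (checksums, for instance) keyed by the extent's start offset. A hedged sketch of the round trip — the helper name and stored value are made up, and both calls insist that a state record begin exactly at 'start':

	static int stash_and_check(struct extent_io_tree *tree, u64 start, u64 val)
	{
		u64 stored;

		/* fails with -ENOENT unless a state record begins at 'start' */
		if (set_state_private(tree, start, val))
			return -ENOENT;
		if (get_state_private(tree, start, &stored))
			return -ENOENT;
		return stored == val ? 0 : -EIO;
	}
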
1296
1297/*
1298 * searches a range in the state tree for a given mask.
1299  * If 'filled' == 1, this returns 1 only if the entire range is covered
1300  * by extents that all have the bits set.  Otherwise, 1 is returned if
1301  * any bit in the range is found set.
1302 */
1303int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1304 int bits, int filled)
1305{
1306 struct extent_state *state = NULL;
1307 struct rb_node *node;
1308 int bitset = 0;
1309 unsigned long flags;
1310
1311 spin_lock_irqsave(&tree->lock, flags);
1312 node = tree_search(tree, start);
1313 while (node && start <= end) {
1314 state = rb_entry(node, struct extent_state, rb_node);
1315
1316 if (filled && state->start > start) {
1317 bitset = 0;
1318 break;
1319 }
1320
1321 if (state->start > end)
1322 break;
1323
1324 if (state->state & bits) {
1325 bitset = 1;
1326 if (!filled)
1327 break;
1328 } else if (filled) {
1329 bitset = 0;
1330 break;
1331 }
1332 start = state->end + 1;
1333 if (start > end)
1334 break;
1335 node = rb_next(node);
1336 if (!node) {
1337 if (filled)
1338 bitset = 0;
1339 break;
1340 }
1341 }
1342 spin_unlock_irqrestore(&tree->lock, flags);
1343 return bitset;
1344}
1345EXPORT_SYMBOL(test_range_bit);
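
Concretely, 'filled' flips the semantics from any-byte to every-byte. A sketch against a hypothetical tree (the ranges are made up):

	int any, all;

	/* hypothetical tree: the bit is set on [0, 4095] and [8192, 12287] only */
	any = test_range_bit(tree, 0, 12287, EXTENT_UPTODATE, 0); /* 1: some byte has it */
	all = test_range_bit(tree, 0, 12287, EXTENT_UPTODATE, 1); /* 0: the gap at [4096, 8191] breaks 'all' */

The three check_page_* helpers just below are the idiomatic callers of both forms.
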
1346
1347/*
1348 * helper function to set a given page up to date if all the
1349 * extents in the tree for that page are up to date
1350 */
1351static int check_page_uptodate(struct extent_io_tree *tree,
1352 struct page *page)
1353{
1354 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1355 u64 end = start + PAGE_CACHE_SIZE - 1;
1356 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1357 SetPageUptodate(page);
1358 return 0;
1359}
1360
1361/*
1362 * helper function to unlock a page if all the extents in the tree
1363 * for that page are unlocked
1364 */
1365static int check_page_locked(struct extent_io_tree *tree,
1366 struct page *page)
1367{
1368 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1369 u64 end = start + PAGE_CACHE_SIZE - 1;
1370 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1371 unlock_page(page);
1372 return 0;
1373}
1374
1375/*
1376 * helper function to end page writeback if all the extents
1377 * in the tree for that page are done with writeback
1378 */
1379static int check_page_writeback(struct extent_io_tree *tree,
1380 struct page *page)
1381{
1382 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1383 u64 end = start + PAGE_CACHE_SIZE - 1;
1384 if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1385 end_page_writeback(page);
1386 return 0;
1387}
1388
1389/* lots and lots of room for performance fixes in the end_bio funcs */
1390
1391/*
1392 * after a writepage IO is done, we need to:
1393 * clear the uptodate bits on error
1394 * clear the writeback bits in the extent tree for this IO
1395 * end_page_writeback if the page has no more pending IO
1396 *
1397 * Scheduling is not allowed, so the extent state tree is expected
1398 * to have one and only one object corresponding to this IO.
1399 */
1400#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1401static void end_bio_extent_writepage(struct bio *bio, int err)
1402#else
1403static int end_bio_extent_writepage(struct bio *bio,
1404 unsigned int bytes_done, int err)
1405#endif
1406{
1407 int uptodate = err == 0;
1408 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1409 struct extent_io_tree *tree;
1410 u64 start;
1411 u64 end;
1412 int whole_page;
1413 int ret;
1414
1415#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1416 if (bio->bi_size)
1417 return 1;
1418#endif
1419 do {
1420 struct page *page = bvec->bv_page;
1421 tree = &BTRFS_I(page->mapping->host)->io_tree;
1422
1423 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1424 bvec->bv_offset;
1425 end = start + bvec->bv_len - 1;
1426
1427 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1428 whole_page = 1;
1429 else
1430 whole_page = 0;
1431
1432 if (--bvec >= bio->bi_io_vec)
1433 prefetchw(&bvec->bv_page->flags);
1434 if (tree->ops && tree->ops->writepage_end_io_hook) {
1435 ret = tree->ops->writepage_end_io_hook(page, start,
1436 end, NULL, uptodate);
1437 if (ret)
1438 uptodate = 0;
1439 }
1440
1441 if (!uptodate && tree->ops &&
1442 tree->ops->writepage_io_failed_hook) {
1443 ret = tree->ops->writepage_io_failed_hook(bio, page,
1444 start, end, NULL);
1445 if (ret == 0) {
1446 uptodate = (err == 0);
1447 continue;
1448 }
1449 }
1450
1451 if (!uptodate) {
1452 clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1453 ClearPageUptodate(page);
1454 SetPageError(page);
1455 }
1456
1457 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1458
1459 if (whole_page)
1460 end_page_writeback(page);
1461 else
1462 check_page_writeback(tree, page);
1463 } while (bvec >= bio->bi_io_vec);
1464 bio_put(bio);
1465#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1466 return 0;
1467#endif
1468}
1469
1470/*
1471 * after a readpage IO is done, we need to:
1472 * clear the uptodate bits on error
1473 * set the uptodate bits if things worked
1474 * set the page up to date if all extents in the tree are uptodate
1475 * clear the lock bit in the extent tree
1476 * unlock the page if there are no other extents locked for it
1477 *
1478 * Scheduling is not allowed, so the extent state tree is expected
1479 * to have one and only one object corresponding to this IO.
1480 */
1481#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1482static void end_bio_extent_readpage(struct bio *bio, int err)
1483#else
1484static int end_bio_extent_readpage(struct bio *bio,
1485 unsigned int bytes_done, int err)
1486#endif
1487{
1488 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1489 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1490 struct extent_io_tree *tree;
1491 u64 start;
1492 u64 end;
1493 int whole_page;
1494 int ret;
1495
1496#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1497 if (bio->bi_size)
1498 return 1;
1499#endif
1500
1501 do {
1502 struct page *page = bvec->bv_page;
1503 tree = &BTRFS_I(page->mapping->host)->io_tree;
1504
1505 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1506 bvec->bv_offset;
1507 end = start + bvec->bv_len - 1;
1508
1509 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1510 whole_page = 1;
1511 else
1512 whole_page = 0;
1513
1514 if (--bvec >= bio->bi_io_vec)
1515 prefetchw(&bvec->bv_page->flags);
1516
1517 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1518 ret = tree->ops->readpage_end_io_hook(page, start, end,
1519 NULL);
1520 if (ret)
1521 uptodate = 0;
1522 }
1523 if (!uptodate && tree->ops &&
1524 tree->ops->readpage_io_failed_hook) {
1525 ret = tree->ops->readpage_io_failed_hook(bio, page,
1526 start, end, NULL);
1527 if (ret == 0) {
1528 uptodate =
1529 test_bit(BIO_UPTODATE, &bio->bi_flags);
1530 continue;
1531 }
1532 }
1533
1534 if (uptodate)
1535 set_extent_uptodate(tree, start, end,
1536 GFP_ATOMIC);
1537 unlock_extent(tree, start, end, GFP_ATOMIC);
1538
1539 if (whole_page) {
1540 if (uptodate) {
1541 SetPageUptodate(page);
1542 } else {
1543 ClearPageUptodate(page);
1544 SetPageError(page);
1545 }
1546 unlock_page(page);
1547 } else {
1548 if (uptodate) {
1549 check_page_uptodate(tree, page);
1550 } else {
1551 ClearPageUptodate(page);
1552 SetPageError(page);
1553 }
1554 check_page_locked(tree, page);
1555 }
1556 } while (bvec >= bio->bi_io_vec);
1557
1558 bio_put(bio);
1559#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1560 return 0;
1561#endif
1562}
1563
1564/*
1565 * IO done from prepare_write is pretty simple, we just unlock
1566 * the structs in the extent tree when done, and set the uptodate bits
1567 * as appropriate.
1568 */
1569#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1570static void end_bio_extent_preparewrite(struct bio *bio, int err)
1571#else
1572static int end_bio_extent_preparewrite(struct bio *bio,
1573 unsigned int bytes_done, int err)
1574#endif
1575{
1576 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1577 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1578 struct extent_io_tree *tree;
1579 u64 start;
1580 u64 end;
1581
1582#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1583 if (bio->bi_size)
1584 return 1;
1585#endif
1586
1587 do {
1588 struct page *page = bvec->bv_page;
1589 tree = &BTRFS_I(page->mapping->host)->io_tree;
1590
1591 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1592 bvec->bv_offset;
1593 end = start + bvec->bv_len - 1;
1594
1595 if (--bvec >= bio->bi_io_vec)
1596 prefetchw(&bvec->bv_page->flags);
1597
1598 if (uptodate) {
1599 set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1600 } else {
1601 ClearPageUptodate(page);
1602 SetPageError(page);
1603 }
1604
1605 unlock_extent(tree, start, end, GFP_ATOMIC);
1606
1607 } while (bvec >= bio->bi_io_vec);
1608
1609 bio_put(bio);
1610#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1611 return 0;
1612#endif
1613}
1614
1615static struct bio *
1616extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1617 gfp_t gfp_flags)
1618{
1619 struct bio *bio;
1620
1621 bio = bio_alloc(gfp_flags, nr_vecs);
1622
/* callers in the memory reclaim path fall back to progressively
 * smaller bios rather than failing outright */
1623 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1624 while (!bio && (nr_vecs /= 2))
1625 bio = bio_alloc(gfp_flags, nr_vecs);
1626 }
1627
1628 if (bio) {
1629 bio->bi_size = 0;
1630 bio->bi_bdev = bdev;
1631 bio->bi_sector = first_sector;
1632 }
1633 return bio;
1634}
1635
1636static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
1637{
1638 int ret = 0;
1639 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1640 struct page *page = bvec->bv_page;
1641 struct extent_io_tree *tree = bio->bi_private;
1642 struct rb_node *node;
1643 struct extent_state *state;
1644 u64 start;
1645 u64 end;
1646
1647 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1648 end = start + bvec->bv_len - 1;
1649
1650 spin_lock_irq(&tree->lock);
1651 node = __etree_search(tree, start, NULL, NULL);
1652 BUG_ON(!node);
1653 state = rb_entry(node, struct extent_state, rb_node);
1654 while(state->end < end) {
1655 node = rb_next(node);
1656 state = rb_entry(node, struct extent_state, rb_node);
1657 }
1658 BUG_ON(state->end != end);
1659 spin_unlock_irq(&tree->lock);
1660
1661 bio->bi_private = NULL;
1662
1663 bio_get(bio);
1664
1665 if (tree->ops && tree->ops->submit_bio_hook)
1666 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1667 mirror_num);
1668 else
1669 submit_bio(rw, bio);
1670 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1671 ret = -EOPNOTSUPP;
1672 bio_put(bio);
1673 return ret;
1674}
1675
1676static int submit_extent_page(int rw, struct extent_io_tree *tree,
1677 struct page *page, sector_t sector,
1678 size_t size, unsigned long offset,
1679 struct block_device *bdev,
1680 struct bio **bio_ret,
1681 unsigned long max_pages,
1682 bio_end_io_t end_io_func,
1683 int mirror_num)
1684{
1685 int ret = 0;
1686 struct bio *bio;
1687 int nr;
1688
1689 if (bio_ret && *bio_ret) {
1690 bio = *bio_ret;
1691 if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
1692 (tree->ops && tree->ops->merge_bio_hook &&
1693 tree->ops->merge_bio_hook(page, offset, size, bio)) ||
1694 bio_add_page(bio, page, size, offset) < size) {
1695 ret = submit_one_bio(rw, bio, mirror_num);
1696 bio = NULL;
1697 } else {
1698 return 0;
1699 }
1700 }
1701 nr = bio_get_nr_vecs(bdev);
1702 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1703 if (!bio) {
1704 printk(KERN_ERR "failed to allocate bio nr %d\n", nr);
return -ENOMEM;
1705 }
1706
1707
1708 bio_add_page(bio, page, size, offset);
1709 bio->bi_end_io = end_io_func;
1710 bio->bi_private = tree;
1711
1712 if (bio_ret) {
1713 *bio_ret = bio;
1714 } else {
1715 ret = submit_one_bio(rw, bio, mirror_num);
1716 }
1717
1718 return ret;
1719}
1720
1721void set_page_extent_mapped(struct page *page)
1722{
1723 if (!PagePrivate(page)) {
1724 SetPagePrivate(page);
1725 page_cache_get(page);
1726 set_page_private(page, EXTENT_PAGE_PRIVATE);
1727 }
1728}
1729
1730void set_page_extent_head(struct page *page, unsigned long len)
1731{
/* len is shifted up two bits so the low bits of page->private
 * stay free for the EXTENT_PAGE_PRIVATE flag values */
1732 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1733}
1734
1735/*
1736  * basic readpage implementation.  Locked extent state structs are inserted
1737  * into the tree; they are removed when the IO is done (by the end_io
1738  * handlers)
1739 */
1740static int __extent_read_full_page(struct extent_io_tree *tree,
1741 struct page *page,
1742 get_extent_t *get_extent,
1743 struct bio **bio, int mirror_num)
1744{
1745 struct inode *inode = page->mapping->host;
1746 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1747 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1748 u64 end;
1749 u64 cur = start;
1750 u64 extent_offset;
1751 u64 last_byte = i_size_read(inode);
1752 u64 block_start;
1753 u64 cur_end;
1754 sector_t sector;
1755 struct extent_map *em;
1756 struct block_device *bdev;
1757 int ret;
1758 int nr = 0;
1759 size_t page_offset = 0;
1760 size_t iosize;
1761 size_t blocksize = inode->i_sb->s_blocksize;
1762
1763 set_page_extent_mapped(page);
1764
1765 end = page_end;
1766 lock_extent(tree, start, end, GFP_NOFS);
1767
1768 while (cur <= end) {
1769 if (cur >= last_byte) {
1770 char *userpage;
1771 iosize = PAGE_CACHE_SIZE - page_offset;
1772 userpage = kmap_atomic(page, KM_USER0);
1773 memset(userpage + page_offset, 0, iosize);
1774 flush_dcache_page(page);
1775 kunmap_atomic(userpage, KM_USER0);
1776 set_extent_uptodate(tree, cur, cur + iosize - 1,
1777 GFP_NOFS);
1778 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1779 break;
1780 }
1781 em = get_extent(inode, page, page_offset, cur,
1782 end - cur + 1, 0);
1783 if (IS_ERR(em) || !em) {
1784 SetPageError(page);
1785 unlock_extent(tree, cur, end, GFP_NOFS);
1786 break;
1787 }
1788 extent_offset = cur - em->start;
1789 if (extent_map_end(em) <= cur) {
1790 printk("bad mapping em [%Lu %Lu] cur %Lu\n",
       em->start, extent_map_end(em), cur);
1791 }
1792 BUG_ON(extent_map_end(em) <= cur);
1793 if (end < cur) {
1794 printk("bad mapping: end %Lu before cur %Lu\n", end, cur);
1795 }
1796 BUG_ON(end < cur);
1797
1798 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1799 cur_end = min(extent_map_end(em) - 1, end);
1800 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1801 sector = (em->block_start + extent_offset) >> 9;
1802 bdev = em->bdev;
1803 block_start = em->block_start;
1804 free_extent_map(em);
1805 em = NULL;
1806
1807 /* we've found a hole, just zero and go on */
1808 if (block_start == EXTENT_MAP_HOLE) {
1809 char *userpage;
1810 userpage = kmap_atomic(page, KM_USER0);
1811 memset(userpage + page_offset, 0, iosize);
1812 flush_dcache_page(page);
1813 kunmap_atomic(userpage, KM_USER0);
1814
1815 set_extent_uptodate(tree, cur, cur + iosize - 1,
1816 GFP_NOFS);
1817 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1818 cur = cur + iosize;
1819 page_offset += iosize;
1820 continue;
1821 }
1822 /* the get_extent function already copied into the page */
1823 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1824 check_page_uptodate(tree, page);
1825 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1826 cur = cur + iosize;
1827 page_offset += iosize;
1828 continue;
1829 }
1830 /* we have an inline extent but it didn't get marked up
1831 * to date. Error out
1832 */
1833 if (block_start == EXTENT_MAP_INLINE) {
1834 SetPageError(page);
1835 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1836 cur = cur + iosize;
1837 page_offset += iosize;
1838 continue;
1839 }
1840
1841 ret = 0;
1842 if (tree->ops && tree->ops->readpage_io_hook) {
1843 ret = tree->ops->readpage_io_hook(page, cur,
1844 cur + iosize - 1);
1845 }
1846 if (!ret) {
1847 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
1848 pnr -= page->index;
1849 ret = submit_extent_page(READ, tree, page,
1850 sector, iosize, page_offset,
1851 bdev, bio, pnr,
1852 end_bio_extent_readpage, mirror_num);
1853 nr++;
1854 }
1855 if (ret)
1856 SetPageError(page);
1857 cur = cur + iosize;
1858 page_offset += iosize;
1859 }
1860 if (!nr) {
1861 if (!PageError(page))
1862 SetPageUptodate(page);
1863 unlock_page(page);
1864 }
1865 return 0;
1866}
1867
1868int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
1869 get_extent_t *get_extent)
1870{
1871 struct bio *bio = NULL;
1872 int ret;
1873
1874 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
1875 if (bio)
1876 submit_one_bio(READ, bio, 0);
1877 return ret;
1878}
1879EXPORT_SYMBOL(extent_read_full_page);
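
This function is designed to sit directly behind an address_space readpage operation; btrfs wires it up roughly as below, where my_get_extent stands in for the filesystem's real get_extent callback:

	static int my_readpage(struct file *file, struct page *page)
	{
		struct extent_io_tree *tree;

		tree = &BTRFS_I(page->mapping->host)->io_tree;
		return extent_read_full_page(tree, page, my_get_extent);
	}
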
1880
1881/*
1882 * the writepage semantics are similar to regular writepage. extent
1883 * records are inserted to lock ranges in the tree, and as dirty areas
1884 * are found, they are marked writeback. Then the lock bits are removed
1885 * and the end_io handler clears the writeback ranges
1886 */
1887static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1888 void *data)
1889{
1890 struct inode *inode = page->mapping->host;
1891 struct extent_page_data *epd = data;
1892 struct extent_io_tree *tree = epd->tree;
1893 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1894 u64 delalloc_start;
1895 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1896 u64 end;
1897 u64 cur = start;
1898 u64 extent_offset;
1899 u64 last_byte = i_size_read(inode);
1900 u64 block_start;
1901 u64 iosize;
1902 u64 unlock_start;
1903 sector_t sector;
1904 struct extent_map *em;
1905 struct block_device *bdev;
1906 int ret;
1907 int nr = 0;
1908 size_t pg_offset = 0;
1909 size_t blocksize;
1910 loff_t i_size = i_size_read(inode);
1911 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1912 u64 nr_delalloc;
1913 u64 delalloc_end;
1914
1915 WARN_ON(!PageLocked(page));
1916 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
1917 if (page->index > end_index ||
1918 (page->index == end_index && !pg_offset)) {
1919 page->mapping->a_ops->invalidatepage(page, 0);
1920 unlock_page(page);
1921 return 0;
1922 }
1923
1924 if (page->index == end_index) {
1925 char *userpage;
1926
1927 userpage = kmap_atomic(page, KM_USER0);
1928 memset(userpage + pg_offset, 0,
1929 PAGE_CACHE_SIZE - pg_offset);
1930 kunmap_atomic(userpage, KM_USER0);
1931 flush_dcache_page(page);
1932 }
1933 pg_offset = 0;
1934
1935 set_page_extent_mapped(page);
1936
1937 delalloc_start = start;
1938 delalloc_end = 0;
1939 while(delalloc_end < page_end) {
1940 nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
1941 &delalloc_end,
1942 128 * 1024 * 1024);
1943 if (nr_delalloc == 0) {
1944 delalloc_start = delalloc_end + 1;
1945 continue;
1946 }
1947 tree->ops->fill_delalloc(inode, delalloc_start,
1948 delalloc_end);
1949 clear_extent_bit(tree, delalloc_start,
1950 delalloc_end,
1951 EXTENT_LOCKED | EXTENT_DELALLOC,
1952 1, 0, GFP_NOFS);
1953 delalloc_start = delalloc_end + 1;
1954 }
1955 lock_extent(tree, start, page_end, GFP_NOFS);
1956 unlock_start = start;
1957
1958 if (tree->ops && tree->ops->writepage_start_hook) {
1959 ret = tree->ops->writepage_start_hook(page, start, page_end);
1960 if (ret == -EAGAIN) {
1961 unlock_extent(tree, start, page_end, GFP_NOFS);
1962 redirty_page_for_writepage(wbc, page);
1963 unlock_page(page);
1964 return 0;
1965 }
1966 }
1967
1968 end = page_end;
1969 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1970 printk("found delalloc bits after lock_extent\n");
1971 }
1972
1973 if (last_byte <= start) {
1974 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1975 unlock_extent(tree, start, page_end, GFP_NOFS);
1976 if (tree->ops && tree->ops->writepage_end_io_hook)
1977 tree->ops->writepage_end_io_hook(page, start,
1978 page_end, NULL, 1);
1979 unlock_start = page_end + 1;
1980 goto done;
1981 }
1982
1983 set_extent_uptodate(tree, start, page_end, GFP_NOFS);
1984 blocksize = inode->i_sb->s_blocksize;
1985
1986 while (cur <= end) {
1987 if (cur >= last_byte) {
1988 clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
1989 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
1990 if (tree->ops && tree->ops->writepage_end_io_hook)
1991 tree->ops->writepage_end_io_hook(page, cur,
1992 page_end, NULL, 1);
1993 unlock_start = page_end + 1;
1994 break;
1995 }
1996 em = epd->get_extent(inode, page, pg_offset, cur,
1997 end - cur + 1, 1);
1998 if (IS_ERR(em) || !em) {
1999 SetPageError(page);
2000 break;
2001 }
2002
2003 extent_offset = cur - em->start;
2004 BUG_ON(extent_map_end(em) <= cur);
2005 BUG_ON(end < cur);
2006 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2007 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2008 sector = (em->block_start + extent_offset) >> 9;
2009 bdev = em->bdev;
2010 block_start = em->block_start;
2011 free_extent_map(em);
2012 em = NULL;
2013
2014 if (block_start == EXTENT_MAP_HOLE ||
2015 block_start == EXTENT_MAP_INLINE) {
2016 clear_extent_dirty(tree, cur,
2017 cur + iosize - 1, GFP_NOFS);
2018
2019 unlock_extent(tree, unlock_start, cur + iosize -1,
2020 GFP_NOFS);
2021
2022 if (tree->ops && tree->ops->writepage_end_io_hook)
2023 tree->ops->writepage_end_io_hook(page, cur,
2024 cur + iosize - 1,
2025 NULL, 1);
2026 cur = cur + iosize;
2027 pg_offset += iosize;
2028 unlock_start = cur;
2029 continue;
2030 }
2031
2032 /* leave this out until we have a page_mkwrite call */
2033 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2034 EXTENT_DIRTY, 0)) {
2035 cur = cur + iosize;
2036 pg_offset += iosize;
2037 continue;
2038 }
2039 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2040 if (tree->ops && tree->ops->writepage_io_hook) {
2041 ret = tree->ops->writepage_io_hook(page, cur,
2042 cur + iosize - 1);
2043 } else {
2044 ret = 0;
2045 }
2046 if (ret) {
2047 SetPageError(page);
2048 } else {
2049 unsigned long max_nr = end_index + 1;
2050
2051 set_range_writeback(tree, cur, cur + iosize - 1);
2052 if (!PageWriteback(page)) {
2053 printk("warning page %lu not writeback, "
2054 "cur %llu end %llu\n", page->index,
2055 (unsigned long long)cur,
2056 (unsigned long long)end);
2057 }
2058
2059 ret = submit_extent_page(WRITE, tree, page, sector,
2060 iosize, pg_offset, bdev,
2061 &epd->bio, max_nr,
2062 end_bio_extent_writepage, 0);
2063 if (ret)
2064 SetPageError(page);
2065 }
2066 cur = cur + iosize;
2067 pg_offset += iosize;
2068 nr++;
2069 }
2070done:
2071 if (nr == 0) {
2072 /* make sure the mapping tag for page dirty gets cleared */
2073 set_page_writeback(page);
2074 end_page_writeback(page);
2075 }
2076 if (unlock_start <= page_end)
2077 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2078 unlock_page(page);
2079 return 0;
2080}
2081
2082#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
2083/* Taken directly from 2.6.23 with a mod for a lockpage hook */
2084typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
2085 void *data);
2086#endif
2087
2088/**
2089 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2090 * @mapping: address space structure to write
2091 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2092 * @writepage: function called for each page
2093 * @data: data passed to writepage function
2094 *
2095 * If a page is already under I/O, write_cache_pages() skips it, even
2096 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2097 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2098 * and msync() need to guarantee that all the data which was dirty at the time
2099 * the call was made get new I/O started against them. If wbc->sync_mode is
2100 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2101 * existing IO to complete.
2102 */
2103int extent_write_cache_pages(struct extent_io_tree *tree,
2104 struct address_space *mapping,
2105 struct writeback_control *wbc,
2106 writepage_t writepage, void *data)
2107{
2108 struct backing_dev_info *bdi = mapping->backing_dev_info;
2109 int ret = 0;
2110 int done = 0;
2111 struct pagevec pvec;
2112 int nr_pages;
2113 pgoff_t index;
2114 pgoff_t end; /* Inclusive */
2115 int scanned = 0;
2116 int range_whole = 0;
2117
2118 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2119 wbc->encountered_congestion = 1;
2120 return 0;
2121 }
2122
2123 pagevec_init(&pvec, 0);
2124 if (wbc->range_cyclic) {
2125 index = mapping->writeback_index; /* Start from prev offset */
2126 end = -1;
2127 } else {
2128 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2129 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2130 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2131 range_whole = 1;
2132 scanned = 1;
2133 }
2134retry:
2135 while (!done && (index <= end) &&
2136 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2137 PAGECACHE_TAG_DIRTY,
2138 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1))) {
2139 unsigned i;
2140
2141 scanned = 1;
2142 for (i = 0; i < nr_pages; i++) {
2143 struct page *page = pvec.pages[i];
2144
2145 /*
2146 * At this point we hold neither mapping->tree_lock nor
2147 * lock on the page itself: the page may be truncated or
2148 * invalidated (changing page->mapping to NULL), or even
2149 * swizzled back from swapper_space to tmpfs file
2150 * mapping
2151 */
2152 if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2153 tree->ops->write_cache_pages_lock_hook(page);
2154 else
2155 lock_page(page);
2156
2157 if (unlikely(page->mapping != mapping)) {
2158 unlock_page(page);
2159 continue;
2160 }
2161
2162 if (!wbc->range_cyclic && page->index > end) {
2163 done = 1;
2164 unlock_page(page);
2165 continue;
2166 }
2167
2168 if (wbc->sync_mode != WB_SYNC_NONE)
2169 wait_on_page_writeback(page);
2170
2171 if (PageWriteback(page) ||
2172 !clear_page_dirty_for_io(page)) {
2173 unlock_page(page);
2174 continue;
2175 }
2176
2177 ret = (*writepage)(page, wbc, data);
2178
2179 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2180 unlock_page(page);
2181 ret = 0;
2182 }
2183 if (ret || (--(wbc->nr_to_write) <= 0))
2184 done = 1;
2185 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2186 wbc->encountered_congestion = 1;
2187 done = 1;
2188 }
2189 }
2190 pagevec_release(&pvec);
2191 cond_resched();
2192 }
2193 if (!scanned && !done) {
2194 /*
2195 * We hit the last page and there is more work to be done: wrap
2196 * back to the start of the file
2197 */
2198 scanned = 1;
2199 index = 0;
2200 goto retry;
2201 }
2202 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2203 mapping->writeback_index = index;
2204#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
2205 if (wbc->range_cont)
2206 wbc->range_start = (loff_t)index << PAGE_CACHE_SHIFT;
2207#endif
2208 return ret;
2209}
2210EXPORT_SYMBOL(extent_write_cache_pages);
2211
2212int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2213 get_extent_t *get_extent,
2214 struct writeback_control *wbc)
2215{
2216 int ret;
2217 struct address_space *mapping = page->mapping;
2218 struct extent_page_data epd = {
2219 .bio = NULL,
2220 .tree = tree,
2221 .get_extent = get_extent,
2222 };
2223 struct writeback_control wbc_writepages = {
2224 .bdi = wbc->bdi,
2225 .sync_mode = WB_SYNC_NONE,
2226 .older_than_this = NULL,
2227 .nr_to_write = 64,
2228 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2229 .range_end = (loff_t)-1,
2230 };
2231
2232
2233 ret = __extent_writepage(page, wbc, &epd);
2234
2235 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2236 __extent_writepage, &epd);
2237 if (epd.bio) {
2238 submit_one_bio(WRITE, epd.bio, 0);
2239 }
2240 return ret;
2241}
2242EXPORT_SYMBOL(extent_write_full_page);
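
The mirror image on the write side; a sketch of a writepage address_space op built on this helper, again assuming a hypothetical my_get_extent callback:

	static int my_writepage(struct page *page, struct writeback_control *wbc)
	{
		struct extent_io_tree *tree;

		tree = &BTRFS_I(page->mapping->host)->io_tree;
		return extent_write_full_page(tree, page, my_get_extent, wbc);
	}
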
2243
2244
2245int extent_writepages(struct extent_io_tree *tree,
2246 struct address_space *mapping,
2247 get_extent_t *get_extent,
2248 struct writeback_control *wbc)
2249{
2250 int ret = 0;
2251 struct extent_page_data epd = {
2252 .bio = NULL,
2253 .tree = tree,
2254 .get_extent = get_extent,
2255 };
2256
2257 ret = extent_write_cache_pages(tree, mapping, wbc,
2258 __extent_writepage, &epd);
2259 if (epd.bio) {
2260 submit_one_bio(WRITE, epd.bio, 0);
2261 }
2262 return ret;
2263}
2264EXPORT_SYMBOL(extent_writepages);
2265
2266int extent_readpages(struct extent_io_tree *tree,
2267 struct address_space *mapping,
2268 struct list_head *pages, unsigned nr_pages,
2269 get_extent_t get_extent)
2270{
2271 struct bio *bio = NULL;
2272 unsigned page_idx;
2273 struct pagevec pvec;
2274
2275 pagevec_init(&pvec, 0);
2276 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2277 struct page *page = list_entry(pages->prev, struct page, lru);
2278
2279 prefetchw(&page->flags);
2280 list_del(&page->lru);
2281 /*
2282 * what we want to do here is call add_to_page_cache_lru,
2283 * but that isn't exported, so we reproduce it here
2284 */
2285 if (!add_to_page_cache(page, mapping,
2286 page->index, GFP_KERNEL)) {
2287
2288 /* open coding of lru_cache_add, also not exported */
2289 page_cache_get(page);
2290 if (!pagevec_add(&pvec, page))
2291 __pagevec_lru_add(&pvec);
2292 __extent_read_full_page(tree, page, get_extent,
2293 &bio, 0);
2294 }
2295 page_cache_release(page);
2296 }
2297 if (pagevec_count(&pvec))
2298 __pagevec_lru_add(&pvec);
2299 BUG_ON(!list_empty(pages));
2300 if (bio)
2301 submit_one_bio(READ, bio, 0);
2302 return 0;
2303}
2304EXPORT_SYMBOL(extent_readpages);
2305
2306/*
2307  * basic invalidatepage code; it waits on any locked or writeback
2308 * ranges corresponding to the page, and then deletes any extent state
2309 * records from the tree
2310 */
2311int extent_invalidatepage(struct extent_io_tree *tree,
2312 struct page *page, unsigned long offset)
2313{
2314 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2315 u64 end = start + PAGE_CACHE_SIZE - 1;
2316 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2317
2318 start += (offset + blocksize - 1) & ~(blocksize - 1);
2319 if (start > end)
2320 return 0;
2321
2322 lock_extent(tree, start, end, GFP_NOFS);
2323 wait_on_extent_writeback(tree, start, end);
2324 clear_extent_bit(tree, start, end,
2325 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2326 1, 1, GFP_NOFS);
2327 return 0;
2328}
2329EXPORT_SYMBOL(extent_invalidatepage);
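
As with the read and write paths, this is meant to back the invalidatepage address_space op directly; a minimal sketch (wrapper name made up):

	static void my_invalidatepage(struct page *page, unsigned long offset)
	{
		struct extent_io_tree *tree;

		tree = &BTRFS_I(page->mapping->host)->io_tree;
		extent_invalidatepage(tree, page, offset);
	}
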
2330
2331/*
2332  * simple commit_write call; the page is marked dirty and i_size is
2333  * extended if the write ends past the current end of file
2334 */
2335int extent_commit_write(struct extent_io_tree *tree,
2336 struct inode *inode, struct page *page,
2337 unsigned from, unsigned to)
2338{
2339 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2340
2341 set_page_extent_mapped(page);
2342 set_page_dirty(page);
2343
2344 if (pos > inode->i_size) {
2345 i_size_write(inode, pos);
2346 mark_inode_dirty(inode);
2347 }
2348 return 0;
2349}
2350EXPORT_SYMBOL(extent_commit_write);
2351
2352int extent_prepare_write(struct extent_io_tree *tree,
2353 struct inode *inode, struct page *page,
2354 unsigned from, unsigned to, get_extent_t *get_extent)
2355{
2356 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2357 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2358 u64 block_start;
2359 u64 orig_block_start;
2360 u64 block_end;
2361 u64 cur_end;
2362 struct extent_map *em;
2363 unsigned blocksize = 1 << inode->i_blkbits;
2364 size_t page_offset = 0;
2365 size_t block_off_start;
2366 size_t block_off_end;
2367 int err = 0;
2368 int iocount = 0;
2369 int ret = 0;
2370 int isnew;
2371
2372 set_page_extent_mapped(page);
2373
2374 block_start = (page_start + from) & ~((u64)blocksize - 1);
2375 block_end = (page_start + to - 1) | (blocksize - 1);
2376 orig_block_start = block_start;
2377
2378 lock_extent(tree, page_start, page_end, GFP_NOFS);
2379 while(block_start <= block_end) {
2380 em = get_extent(inode, page, page_offset, block_start,
2381 block_end - block_start + 1, 1);
2382 if (IS_ERR(em) || !em) {
err = em ? PTR_ERR(em) : -EIO;
2383 goto err;
2384 }
2385 cur_end = min(block_end, extent_map_end(em) - 1);
2386 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2387 block_off_end = block_off_start + blocksize;
2388 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2389
2390 if (!PageUptodate(page) && isnew &&
2391 (block_off_end > to || block_off_start < from)) {
2392 void *kaddr;
2393
2394 kaddr = kmap_atomic(page, KM_USER0);
2395 if (block_off_end > to)
2396 memset(kaddr + to, 0, block_off_end - to);
2397 if (block_off_start < from)
2398 memset(kaddr + block_off_start, 0,
2399 from - block_off_start);
2400 flush_dcache_page(page);
2401 kunmap_atomic(kaddr, KM_USER0);
2402 }
2403 if ((em->block_start != EXTENT_MAP_HOLE &&
2404 em->block_start != EXTENT_MAP_INLINE) &&
2405 !isnew && !PageUptodate(page) &&
2406 (block_off_end > to || block_off_start < from) &&
2407 !test_range_bit(tree, block_start, cur_end,
2408 EXTENT_UPTODATE, 1)) {
2409 u64 sector;
2410 u64 extent_offset = block_start - em->start;
2411 size_t iosize;
2412 sector = (em->block_start + extent_offset) >> 9;
2413 iosize = (cur_end - block_start + blocksize) &
2414 ~((u64)blocksize - 1);
2415 /*
2416 * we've already got the extent locked, but we
2417 * need to split the state such that our end_bio
2418 * handler can clear the lock.
2419 */
2420 set_extent_bit(tree, block_start,
2421 block_start + iosize - 1,
2422 EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2423 ret = submit_extent_page(READ, tree, page,
2424 sector, iosize, page_offset, em->bdev,
2425 NULL, 1,
2426 end_bio_extent_preparewrite, 0);
2427 iocount++;
2428 block_start = block_start + iosize;
2429 } else {
2430 set_extent_uptodate(tree, block_start, cur_end,
2431 GFP_NOFS);
2432 unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2433 block_start = cur_end + 1;
2434 }
2435 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2436 free_extent_map(em);
2437 }
2438 if (iocount) {
2439 wait_extent_bit(tree, orig_block_start,
2440 block_end, EXTENT_LOCKED);
2441 }
2442 check_page_uptodate(tree, page);
2443err:
2444 /* FIXME, zero out newly allocated blocks on error */
2445 return err;
2446}
2447EXPORT_SYMBOL(extent_prepare_write);
2448
2449/*
2450 * a helper for releasepage, this tests for areas of the page that
2451 * are locked or under IO and drops the related state bits if it is safe
2452 * to drop the page.
2453 */
2454int try_release_extent_state(struct extent_map_tree *map,
2455 struct extent_io_tree *tree, struct page *page,
2456 gfp_t mask)
2457{
2458 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2459 u64 end = start + PAGE_CACHE_SIZE - 1;
2460 int ret = 1;
2461
2462 if (test_range_bit(tree, start, end,
2463 EXTENT_IOBITS | EXTENT_ORDERED, 0))
2464 ret = 0;
2465 else {
2466 if ((mask & GFP_NOFS) == GFP_NOFS)
2467 mask = GFP_NOFS;
2468 clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2469 1, 1, mask);
2470 }
2471 return ret;
2472}
2473EXPORT_SYMBOL(try_release_extent_state);
2474
2475/*
2476 * a helper for releasepage. As long as there are no locked extents
2477 * in the range corresponding to the page, both state records and extent
2478 * map records are removed
2479 */
2480int try_release_extent_mapping(struct extent_map_tree *map,
2481 struct extent_io_tree *tree, struct page *page,
2482 gfp_t mask)
2483{
2484 struct extent_map *em;
2485 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2486 u64 end = start + PAGE_CACHE_SIZE - 1;
2487
2488 if ((mask & __GFP_WAIT) &&
2489 page->mapping->host->i_size > 16 * 1024 * 1024) {
2490 u64 len;
2491 while (start <= end) {
2492 len = end - start + 1;
2493 spin_lock(&map->lock);
2494 em = lookup_extent_mapping(map, start, len);
2495 if (!em || IS_ERR(em)) {
2496 spin_unlock(&map->lock);
2497 break;
2498 }
2499 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2500 em->start != start) {
2501 spin_unlock(&map->lock);
2502 free_extent_map(em);
2503 break;
2504 }
2505 if (!test_range_bit(tree, em->start,
2506 extent_map_end(em) - 1,
2507 EXTENT_LOCKED, 0)) {
2508 remove_extent_mapping(map, em);
2509 /* once for the rb tree */
2510 free_extent_map(em);
2511 }
2512 start = extent_map_end(em);
2513 spin_unlock(&map->lock);
2514
2515 /* once for us */
2516 free_extent_map(em);
2517 }
2518 }
2519 return try_release_extent_state(map, tree, page, mask);
2520}
2521EXPORT_SYMBOL(try_release_extent_mapping);
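
A releasepage implementation built on this helper looks roughly like the sketch below; it assumes the extent map tree lives next to the io tree in the btrfs inode, as elsewhere in this code:

	static int my_releasepage(struct page *page, gfp_t gfp_flags)
	{
		struct extent_io_tree *tree;
		struct extent_map_tree *map;

		tree = &BTRFS_I(page->mapping->host)->io_tree;
		map = &BTRFS_I(page->mapping->host)->extent_tree;
		return try_release_extent_mapping(map, tree, page, gfp_flags);
	}
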
2522
2523sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2524 get_extent_t *get_extent)
2525{
2526 struct inode *inode = mapping->host;
2527 u64 start = iblock << inode->i_blkbits;
2528 sector_t sector = 0;
2529 struct extent_map *em;
2530
2531 em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0);
2532 if (!em || IS_ERR(em))
2533 return 0;
2534
2535 if (em->block_start == EXTENT_MAP_INLINE ||
2536 em->block_start == EXTENT_MAP_HOLE)
2537 goto out;
2538
2539 sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2540out:
2541 free_extent_map(em);
2542 return sector;
2543}
2544
2545static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2546 unsigned long i)
2547{
2548 struct page *p;
2549 struct address_space *mapping;
2550
2551 if (i == 0)
2552 return eb->first_page;
2553 i += eb->start >> PAGE_CACHE_SHIFT;
2554 mapping = eb->first_page->mapping;
2555 if (!mapping)
2556 return NULL;
2557
2558 /*
2559 * extent_buffer_page is only called after pinning the page
2560 * by increasing the reference count. So we know the page must
2561 * be in the radix tree.
2562 */
2563#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
2564 rcu_read_lock();
2565#else
2566 read_lock_irq(&mapping->tree_lock);
2567#endif
2568 p = radix_tree_lookup(&mapping->page_tree, i);
2569
2570#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
2571 rcu_read_unlock();
2572#else
2573 read_unlock_irq(&mapping->tree_lock);
2574#endif
2575 return p;
2576}
2577
2578static inline unsigned long num_extent_pages(u64 start, u64 len)
2579{
2580 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2581 (start >> PAGE_CACHE_SHIFT);
2582}
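
The arithmetic counts every page the range touches, not just full pages. A quick worked case with 4k pages (values chosen arbitrarily):

	/* start = 6144 (halfway into page 1), len = 4096:
	 * (6144 + 4096 + 4095) >> 12 = 3,  6144 >> 12 = 1,
	 * so the buffer spans 2 pages (pages 1 and 2) */
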
2583
2584static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2585 u64 start,
2586 unsigned long len,
2587 gfp_t mask)
2588{
2589 struct extent_buffer *eb = NULL;
2590#ifdef LEAK_DEBUG
2591 unsigned long flags;
2592#endif
2593
2594 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
if (!eb)
	return NULL;
2595 eb->start = start;
2596 eb->len = len;
2597 mutex_init(&eb->mutex);
2598#ifdef LEAK_DEBUG
2599 spin_lock_irqsave(&leak_lock, flags);
2600 list_add(&eb->leak_list, &buffers);
2601 spin_unlock_irqrestore(&leak_lock, flags);
2602#endif
2603 atomic_set(&eb->refs, 1);
2604
2605 return eb;
2606}
2607
2608static void __free_extent_buffer(struct extent_buffer *eb)
2609{
2610#ifdef LEAK_DEBUG
2611 unsigned long flags;
2612 spin_lock_irqsave(&leak_lock, flags);
2613 list_del(&eb->leak_list);
2614 spin_unlock_irqrestore(&leak_lock, flags);
2615#endif
2616 kmem_cache_free(extent_buffer_cache, eb);
2617}
2618
2619struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2620 u64 start, unsigned long len,
2621 struct page *page0,
2622 gfp_t mask)
2623{
2624 unsigned long num_pages = num_extent_pages(start, len);
2625 unsigned long i;
2626 unsigned long index = start >> PAGE_CACHE_SHIFT;
2627 struct extent_buffer *eb;
2628 struct extent_buffer *exists = NULL;
2629 struct page *p;
2630 struct address_space *mapping = tree->mapping;
2631 int uptodate = 1;
2632
2633 spin_lock(&tree->buffer_lock);
2634 eb = buffer_search(tree, start);
2635 if (eb) {
2636 atomic_inc(&eb->refs);
2637 spin_unlock(&tree->buffer_lock);
2638 mark_page_accessed(eb->first_page);
2639 return eb;
2640 }
2641 spin_unlock(&tree->buffer_lock);
2642
2643 eb = __alloc_extent_buffer(tree, start, len, mask);
2644 if (!eb)
2645 return NULL;
2646
2647 if (page0) {
2648 eb->first_page = page0;
2649 i = 1;
2650 index++;
2651 page_cache_get(page0);
2652 mark_page_accessed(page0);
2653 set_page_extent_mapped(page0);
2654 set_page_extent_head(page0, len);
2655 uptodate = PageUptodate(page0);
2656 } else {
2657 i = 0;
2658 }
2659 for (; i < num_pages; i++, index++) {
2660 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2661 if (!p) {
2662 WARN_ON(1);
2663 goto free_eb;
2664 }
2665 set_page_extent_mapped(p);
2666 mark_page_accessed(p);
2667 if (i == 0) {
2668 eb->first_page = p;
2669 set_page_extent_head(p, len);
2670 } else {
2671 set_page_private(p, EXTENT_PAGE_PRIVATE);
2672 }
2673 if (!PageUptodate(p))
2674 uptodate = 0;
2675 unlock_page(p);
2676 }
2677 if (uptodate)
2678 eb->flags |= EXTENT_UPTODATE;
2679 eb->flags |= EXTENT_BUFFER_FILLED;
2680
2681 spin_lock(&tree->buffer_lock);
2682 exists = buffer_tree_insert(tree, start, &eb->rb_node);
2683 if (exists) {
2684 /* add one reference for the caller */
2685 atomic_inc(&exists->refs);
2686 spin_unlock(&tree->buffer_lock);
2687 goto free_eb;
2688 }
2689 spin_unlock(&tree->buffer_lock);
2690
2691 /* add one reference for the tree */
2692 atomic_inc(&eb->refs);
2693 return eb;
2694
2695free_eb:
2696 if (!atomic_dec_and_test(&eb->refs))
2697 return exists;
2698 for (index = 1; index < i; index++)
2699 page_cache_release(extent_buffer_page(eb, index));
2700 page_cache_release(extent_buffer_page(eb, 0));
2701 __free_extent_buffer(eb);
2702 return exists;
2703}
2704EXPORT_SYMBOL(alloc_extent_buffer);
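
The usual consumer pairs this with read_extent_buffer_pages (defined below) to materialize a metadata block. A hedged sketch: the wrapper name and my_get_extent are made up, bytenr and blocksize stand in for values the caller already knows:

	static struct extent_buffer *read_block_sketch(struct extent_io_tree *tree,
						       u64 bytenr, u32 blocksize)
	{
		struct extent_buffer *eb;

		eb = alloc_extent_buffer(tree, bytenr, blocksize, NULL, GFP_NOFS);
		if (!eb)
			return NULL;
		/* wait == 1: block until every page of the buffer is read */
		if (read_extent_buffer_pages(tree, eb, 0, 1, my_get_extent, 0)) {
			free_extent_buffer(eb);
			return NULL;
		}
		return eb;
	}
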
2705
2706struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
2707 u64 start, unsigned long len,
2708 gfp_t mask)
2709{
2710 struct extent_buffer *eb;
2711
2712 spin_lock(&tree->buffer_lock);
2713 eb = buffer_search(tree, start);
2714 if (eb)
2715 atomic_inc(&eb->refs);
2716 spin_unlock(&tree->buffer_lock);
2717
2718 if (eb)
2719 mark_page_accessed(eb->first_page);
2720
2721 return eb;
2722}
2723EXPORT_SYMBOL(find_extent_buffer);
2724
2725void free_extent_buffer(struct extent_buffer *eb)
2726{
2727 if (!eb)
2728 return;
2729
2730 if (!atomic_dec_and_test(&eb->refs))
2731 return;
2732
/* the tree holds the last reference; if the count reaches zero
 * here, a reference the tree still owned was dropped early */
2733 WARN_ON(1);
2734}
2735EXPORT_SYMBOL(free_extent_buffer);
2736
2737int clear_extent_buffer_dirty(struct extent_io_tree *tree,
2738 struct extent_buffer *eb)
2739{
2740 int set;
2741 unsigned long i;
2742 unsigned long num_pages;
2743 struct page *page;
2744
2745 u64 start = eb->start;
2746 u64 end = start + eb->len - 1;
2747
2748 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
2749 num_pages = num_extent_pages(eb->start, eb->len);
2750
2751 for (i = 0; i < num_pages; i++) {
2752 page = extent_buffer_page(eb, i);
2753 lock_page(page);
2754 if (i == 0)
2755 set_page_extent_head(page, eb->len);
2756 else
2757 set_page_private(page, EXTENT_PAGE_PRIVATE);
2758
2759 /*
2760 * if we're on the last page or the first page and the
2761 * block isn't aligned on a page boundary, do extra checks
2762 * to make sure we don't clean page that is partially dirty
2763 */
2764 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2765 ((i == num_pages - 1) &&
2766 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2767 start = (u64)page->index << PAGE_CACHE_SHIFT;
2768 end = start + PAGE_CACHE_SIZE - 1;
2769 if (test_range_bit(tree, start, end,
2770 EXTENT_DIRTY, 0)) {
2771 unlock_page(page);
2772 continue;
2773 }
2774 }
2775 clear_page_dirty_for_io(page);
2776#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
2777 spin_lock_irq(&page->mapping->tree_lock);
2778#else
2779 read_lock_irq(&page->mapping->tree_lock);
2780#endif
2781 if (!PageDirty(page)) {
2782 radix_tree_tag_clear(&page->mapping->page_tree,
2783 page_index(page),
2784 PAGECACHE_TAG_DIRTY);
2785 }
2786#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
2787 spin_unlock_irq(&page->mapping->tree_lock);
2788#else
2789 read_unlock_irq(&page->mapping->tree_lock);
2790#endif
2791 unlock_page(page);
2792 }
2793 return 0;
2794}
2795EXPORT_SYMBOL(clear_extent_buffer_dirty);
2796
2797int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
2798 struct extent_buffer *eb)
2799{
2800 return wait_on_extent_writeback(tree, eb->start,
2801 eb->start + eb->len - 1);
2802}
2803EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
2804
2805int set_extent_buffer_dirty(struct extent_io_tree *tree,
2806 struct extent_buffer *eb)
2807{
2808 unsigned long i;
2809 unsigned long num_pages;
2810
2811 num_pages = num_extent_pages(eb->start, eb->len);
2812 for (i = 0; i < num_pages; i++) {
2813 struct page *page = extent_buffer_page(eb, i);
2814 /* writepage may need to do something special for the
2815 * first page, we have to make sure page->private is
2816 * properly set. releasepage may drop page->private
2817 * on us if the page isn't already dirty.
2818 */
2819 lock_page(page);
2820 if (i == 0) {
2821 set_page_extent_head(page, eb->len);
2822 } else if (PagePrivate(page) &&
2823 page->private != EXTENT_PAGE_PRIVATE) {
2824 set_page_extent_mapped(page);
2825 }
2826 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
2827 set_extent_dirty(tree, page_offset(page),
2828 page_offset(page) + PAGE_CACHE_SIZE -1,
2829 GFP_NOFS);
2830 unlock_page(page);
2831 }
2832 return 0;
2833}
2834EXPORT_SYMBOL(set_extent_buffer_dirty);
2835
2836int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
2837 struct extent_buffer *eb)
2838{
2839 unsigned long i;
2840 struct page *page;
2841 unsigned long num_pages;
2842
2843 num_pages = num_extent_pages(eb->start, eb->len);
2844 eb->flags &= ~EXTENT_UPTODATE;
2845
2846 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2847 GFP_NOFS);
2848 for (i = 0; i < num_pages; i++) {
2849 page = extent_buffer_page(eb, i);
2850 if (page)
2851 ClearPageUptodate(page);
2852 }
2853 return 0;
2854}
2855
2856int set_extent_buffer_uptodate(struct extent_io_tree *tree,
2857 struct extent_buffer *eb)
2858{
2859 unsigned long i;
2860 struct page *page;
2861 unsigned long num_pages;
2862
2863 num_pages = num_extent_pages(eb->start, eb->len);
2864
2865 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2866 GFP_NOFS);
2867 for (i = 0; i < num_pages; i++) {
2868 page = extent_buffer_page(eb, i);
2869 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2870 ((i == num_pages - 1) &&
2871 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2872 check_page_uptodate(tree, page);
2873 continue;
2874 }
2875 SetPageUptodate(page);
2876 }
2877 return 0;
2878}
2879EXPORT_SYMBOL(set_extent_buffer_uptodate);
2880
2881int extent_range_uptodate(struct extent_io_tree *tree,
2882 u64 start, u64 end)
2883{
2884 struct page *page;
2885 int ret;
2886 int pg_uptodate = 1;
2887 int uptodate;
2888 unsigned long index;
2889
2890 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
2891 if (ret)
2892 return 1;
2893 while(start <= end) {
2894 index = start >> PAGE_CACHE_SHIFT;
2895 page = find_get_page(tree->mapping, index);
if (!page) {
	pg_uptodate = 0;
	break;
}
2896 uptodate = PageUptodate(page);
2897 page_cache_release(page);
2898 if (!uptodate) {
2899 pg_uptodate = 0;
2900 break;
2901 }
2902 start += PAGE_CACHE_SIZE;
2903 }
2904 return pg_uptodate;
2905}
2906
2907int extent_buffer_uptodate(struct extent_io_tree *tree,
2908 struct extent_buffer *eb)
2909{
2910 int ret = 0;
2911 unsigned long num_pages;
2912 unsigned long i;
2913 struct page *page;
2914 int pg_uptodate = 1;
2915
2916 if (eb->flags & EXTENT_UPTODATE)
2917 return 1;
2918
2919 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2920 EXTENT_UPTODATE, 1);
2921 if (ret)
2922 return ret;
2923
2924 num_pages = num_extent_pages(eb->start, eb->len);
2925 for (i = 0; i < num_pages; i++) {
2926 page = extent_buffer_page(eb, i);
2927 if (!PageUptodate(page)) {
2928 pg_uptodate = 0;
2929 break;
2930 }
2931 }
2932 return pg_uptodate;
2933}
2934EXPORT_SYMBOL(extent_buffer_uptodate);
2935
2936int read_extent_buffer_pages(struct extent_io_tree *tree,
2937 struct extent_buffer *eb,
2938 u64 start, int wait,
2939 get_extent_t *get_extent, int mirror_num)
2940{
2941 unsigned long i;
2942 unsigned long start_i;
2943 struct page *page;
2944 int err;
2945 int ret = 0;
2946 int locked_pages = 0;
2947 int all_uptodate = 1;
2948 int inc_all_pages = 0;
2949 unsigned long num_pages;
2950 struct bio *bio = NULL;
2951
2952 if (eb->flags & EXTENT_UPTODATE)
2953 return 0;
2954
2955 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2956 EXTENT_UPTODATE, 1)) {
2957 return 0;
2958 }
2959
2960 if (start) {
2961 WARN_ON(start < eb->start);
2962 start_i = (start >> PAGE_CACHE_SHIFT) -
2963 (eb->start >> PAGE_CACHE_SHIFT);
2964 } else {
2965 start_i = 0;
2966 }
2967
2968 num_pages = num_extent_pages(eb->start, eb->len);
2969 for (i = start_i; i < num_pages; i++) {
2970 page = extent_buffer_page(eb, i);
2971 if (!wait) {
2972 if (!trylock_page(page))
2973 goto unlock_exit;
2974 } else {
2975 lock_page(page);
2976 }
2977 locked_pages++;
2978 if (!PageUptodate(page)) {
2979 all_uptodate = 0;
2980 }
2981 }
2982 if (all_uptodate) {
2983 if (start_i == 0)
2984 eb->flags |= EXTENT_UPTODATE;
2985 if (ret) {
2986 printk("all up to date but ret is %d\n", ret);
2987 }
2988 goto unlock_exit;
2989 }
2990
2991 for (i = start_i; i < num_pages; i++) {
2992 page = extent_buffer_page(eb, i);
2993 if (inc_all_pages)
2994 page_cache_get(page);
2995 if (!PageUptodate(page)) {
2996 if (start_i == 0)
2997 inc_all_pages = 1;
2998 ClearPageError(page);
2999 err = __extent_read_full_page(tree, page,
3000 get_extent, &bio,
3001 mirror_num);
3002 if (err) {
3003 ret = err;
3004 printk("err %d from __extent_read_full_page\n", ret);
3005 }
3006 } else {
3007 unlock_page(page);
3008 }
3009 }
3010
3011 if (bio)
3012 submit_one_bio(READ, bio, mirror_num);
3013
3014 if (ret || !wait) {
3015 if (ret)
3016 printk("ret %d wait %d returning\n", ret, wait);
3017 return ret;
3018 }
3019 for (i = start_i; i < num_pages; i++) {
3020 page = extent_buffer_page(eb, i);
3021 wait_on_page_locked(page);
3022 if (!PageUptodate(page)) {
3023 printk("page not uptodate after wait_on_page_locked\n");
3024 ret = -EIO;
3025 }
3026 }
3027 if (!ret)
3028 eb->flags |= EXTENT_UPTODATE;
3029 return ret;
3030
3031unlock_exit:
3032 i = start_i;
3033 while(locked_pages > 0) {
3034 page = extent_buffer_page(eb, i);
3035 i++;
3036 unlock_page(page);
3037 locked_pages--;
3038 }
3039 return ret;
3040}
3041EXPORT_SYMBOL(read_extent_buffer_pages);
3042
3043void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3044 unsigned long start,
3045 unsigned long len)
3046{
3047 size_t cur;
3048 size_t offset;
3049 struct page *page;
3050 char *kaddr;
3051 char *dst = (char *)dstv;
3052 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3053 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3054
3055 WARN_ON(start > eb->len);
3056 WARN_ON(start + len > eb->start + eb->len);
3057
3058 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3059
3060 while(len > 0) {
3061 page = extent_buffer_page(eb, i);
3062
3063 cur = min(len, (PAGE_CACHE_SIZE - offset));
3064 kaddr = kmap_atomic(page, KM_USER1);
3065 memcpy(dst, kaddr + offset, cur);
3066 kunmap_atomic(kaddr, KM_USER1);
3067
3068 dst += cur;
3069 len -= cur;
3070 offset = 0;
3071 i++;
3072 }
3073}
3074EXPORT_SYMBOL(read_extent_buffer);
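
Because a field can straddle a page boundary, callers copy through a stack variable rather than dereferencing into the buffer. A sketch of pulling a little-endian u64 out at a known offset; the helper name is made up:

	static u64 eb_read_u64(struct extent_buffer *eb, unsigned long off)
	{
		__le64 v;

		read_extent_buffer(eb, &v, off, sizeof(v));
		return le64_to_cpu(v);
	}
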
3075
3076int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3077 unsigned long min_len, char **token, char **map,
3078 unsigned long *map_start,
3079 unsigned long *map_len, int km)
3080{
3081 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3082 char *kaddr;
3083 struct page *p;
3084 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3085 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3086 unsigned long end_i = (start_offset + start + min_len - 1) >>
3087 PAGE_CACHE_SHIFT;
3088
3089 if (i != end_i)
3090 return -EINVAL;
3091
3092 if (i == 0) {
3093 offset = start_offset;
3094 *map_start = 0;
3095 } else {
3096 offset = 0;
3097 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3098 }
3099 if (start + min_len > eb->len) {
3100 printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n",
        eb->start, eb->len, start, min_len);
3101 WARN_ON(1);
return -EINVAL;
3102 }
3103
3104 p = extent_buffer_page(eb, i);
3105 kaddr = kmap_atomic(p, km);
3106 *token = kaddr;
3107 *map = kaddr + offset;
3108 *map_len = PAGE_CACHE_SIZE - offset;
3109 return 0;
3110}
3111EXPORT_SYMBOL(map_private_extent_buffer);
3112
3113int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3114 unsigned long min_len,
3115 char **token, char **map,
3116 unsigned long *map_start,
3117 unsigned long *map_len, int km)
3118{
3119 int err;
3120 int save = 0;
3121 if (eb->map_token) {
3122 unmap_extent_buffer(eb, eb->map_token, km);
3123 eb->map_token = NULL;
3124 save = 1;
3125 }
3126 err = map_private_extent_buffer(eb, start, min_len, token, map,
3127 map_start, map_len, km);
3128 if (!err && save) {
3129 eb->map_token = *token;
3130 eb->kaddr = *map;
3131 eb->map_start = *map_start;
3132 eb->map_len = *map_len;
3133 }
3134 return err;
3135}
3136EXPORT_SYMBOL(map_extent_buffer);
3137
3138void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3139{
3140 kunmap_atomic(token, km);
3141}
3142EXPORT_SYMBOL(unmap_extent_buffer);
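
When the caller knows the bytes it wants cannot cross a page boundary, the map/unmap pair avoids the copy that read_extent_buffer would do. A sketch under that assumption (helper name made up; the memcpy into a local avoids an unaligned load):

	static int read_u64_mapped(struct extent_buffer *eb, unsigned long start,
				   u64 *dst)
	{
		char *token;
		char *kaddr;
		unsigned long map_start;
		unsigned long map_len;
		__le64 v;
		int err;

		/* only succeeds when all 8 bytes sit inside a single page */
		err = map_private_extent_buffer(eb, start, sizeof(v), &token,
						&kaddr, &map_start, &map_len,
						KM_USER0);
		if (err)
			return err;
		memcpy(&v, kaddr, sizeof(v));
		unmap_extent_buffer(eb, token, KM_USER0);
		*dst = le64_to_cpu(v);
		return 0;
	}
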
3143
3144int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3145 unsigned long start,
3146 unsigned long len)
3147{
3148 size_t cur;
3149 size_t offset;
3150 struct page *page;
3151 char *kaddr;
3152 char *ptr = (char *)ptrv;
3153 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3154 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3155 int ret = 0;
3156
3157 WARN_ON(start > eb->len);
3158 WARN_ON(start + len > eb->start + eb->len);
3159
3160 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3161
3162 while(len > 0) {
3163 page = extent_buffer_page(eb, i);
3164
3165 cur = min(len, (PAGE_CACHE_SIZE - offset));
3166
3167 kaddr = kmap_atomic(page, KM_USER0);
3168 ret = memcmp(ptr, kaddr + offset, cur);
3169 kunmap_atomic(kaddr, KM_USER0);
3170 if (ret)
3171 break;
3172
3173 ptr += cur;
3174 len -= cur;
3175 offset = 0;
3176 i++;
3177 }
3178 return ret;
3179}
3180EXPORT_SYMBOL(memcmp_extent_buffer);
3181
3182void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3183 unsigned long start, unsigned long len)
3184{
3185 size_t cur;
3186 size_t offset;
3187 struct page *page;
3188 char *kaddr;
3189 char *src = (char *)srcv;
3190 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3191 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3192
3193 WARN_ON(start > eb->len);
3194 WARN_ON(start + len > eb->start + eb->len);
3195
3196 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3197
3198 while(len > 0) {
3199 page = extent_buffer_page(eb, i);
3200 WARN_ON(!PageUptodate(page));
3201
3202 cur = min(len, PAGE_CACHE_SIZE - offset);
3203 kaddr = kmap_atomic(page, KM_USER1);
3204 memcpy(kaddr + offset, src, cur);
3205 kunmap_atomic(kaddr, KM_USER1);
3206
3207 src += cur;
3208 len -= cur;
3209 offset = 0;
3210 i++;
3211 }
3212}
3213EXPORT_SYMBOL(write_extent_buffer);
3214
3215void memset_extent_buffer(struct extent_buffer *eb, char c,
3216 unsigned long start, unsigned long len)
3217{
3218 size_t cur;
3219 size_t offset;
3220 struct page *page;
3221 char *kaddr;
3222 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3223 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3224
3225 WARN_ON(start > eb->len);
3226 WARN_ON(start + len > eb->start + eb->len);
3227
3228 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3229
3230 while(len > 0) {
3231 page = extent_buffer_page(eb, i);
3232 WARN_ON(!PageUptodate(page));
3233
3234 cur = min(len, PAGE_CACHE_SIZE - offset);
3235 kaddr = kmap_atomic(page, KM_USER0);
3236 memset(kaddr + offset, c, cur);
3237 kunmap_atomic(kaddr, KM_USER0);
3238
3239 len -= cur;
3240 offset = 0;
3241 i++;
3242 }
3243}
3244EXPORT_SYMBOL(memset_extent_buffer);
3245
3246void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3247 unsigned long dst_offset, unsigned long src_offset,
3248 unsigned long len)
3249{
3250 u64 dst_len = dst->len;
3251 size_t cur;
3252 size_t offset;
3253 struct page *page;
3254 char *kaddr;
3255 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3256 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3257
3258 WARN_ON(src->len != dst_len);
3259
3260 offset = (start_offset + dst_offset) &
3261 ((unsigned long)PAGE_CACHE_SIZE - 1);
3262
3263 while(len > 0) {
3264 page = extent_buffer_page(dst, i);
3265 WARN_ON(!PageUptodate(page));
3266
3267 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3268
3269 kaddr = kmap_atomic(page, KM_USER0);
3270 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3271 kunmap_atomic(kaddr, KM_USER0);
3272
3273 src_offset += cur;
3274 len -= cur;
3275 offset = 0;
3276 i++;
3277 }
3278}
3279EXPORT_SYMBOL(copy_extent_buffer);
3280
3281static void move_pages(struct page *dst_page, struct page *src_page,
3282 unsigned long dst_off, unsigned long src_off,
3283 unsigned long len)
3284{
3285 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3286 if (dst_page == src_page) {
3287 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3288 } else {
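		/*
		 * the pages are distinct here, so these two mapped ranges
		 * cannot overlap; the byte-by-byte backwards copy below just
		 * mirrors the tail-first walk done by memmove_extent_buffer()
		 */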
3289 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3290 char *p = dst_kaddr + dst_off + len;
3291 char *s = src_kaddr + src_off + len;
3292
3293 while (len--)
3294 *--p = *--s;
3295
3296 kunmap_atomic(src_kaddr, KM_USER1);
3297 }
3298 kunmap_atomic(dst_kaddr, KM_USER0);
3299}
3300
3301static void copy_pages(struct page *dst_page, struct page *src_page,
3302 unsigned long dst_off, unsigned long src_off,
3303 unsigned long len)
3304{
3305 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3306 char *src_kaddr;
3307
3308 if (dst_page != src_page)
3309 src_kaddr = kmap_atomic(src_page, KM_USER1);
3310 else
3311 src_kaddr = dst_kaddr;
3312
3313 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3314 kunmap_atomic(dst_kaddr, KM_USER0);
3315 if (dst_page != src_page)
3316 kunmap_atomic(src_kaddr, KM_USER1);
3317}
3318
3319void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3320 unsigned long src_offset, unsigned long len)
3321{
3322 size_t cur;
3323 size_t dst_off_in_page;
3324 size_t src_off_in_page;
3325 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3326 unsigned long dst_i;
3327 unsigned long src_i;
3328
3329 if (src_offset + len > dst->len) {
3330		printk("memcpy bogus src_offset %lu move len %lu len %lu\n",
3331 src_offset, len, dst->len);
3332 BUG_ON(1);
3333 }
3334 if (dst_offset + len > dst->len) {
3335		printk("memcpy bogus dst_offset %lu move len %lu len %lu\n",
3336 dst_offset, len, dst->len);
3337 BUG_ON(1);
3338 }
3339
3340 while(len > 0) {
3341 dst_off_in_page = (start_offset + dst_offset) &
3342 ((unsigned long)PAGE_CACHE_SIZE - 1);
3343 src_off_in_page = (start_offset + src_offset) &
3344 ((unsigned long)PAGE_CACHE_SIZE - 1);
3345
3346 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3347 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3348
3349 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3350 src_off_in_page));
3351 cur = min_t(unsigned long, cur,
3352 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3353
3354 copy_pages(extent_buffer_page(dst, dst_i),
3355 extent_buffer_page(dst, src_i),
3356 dst_off_in_page, src_off_in_page, cur);
3357
3358 src_offset += cur;
3359 dst_offset += cur;
3360 len -= cur;
3361 }
3362}
3363EXPORT_SYMBOL(memcpy_extent_buffer);
3364
3365void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3366 unsigned long src_offset, unsigned long len)
3367{
3368 size_t cur;
3369 size_t dst_off_in_page;
3370 size_t src_off_in_page;
3371 unsigned long dst_end = dst_offset + len - 1;
3372 unsigned long src_end = src_offset + len - 1;
3373 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3374 unsigned long dst_i;
3375 unsigned long src_i;
3376
3377 if (src_offset + len > dst->len) {
3378 printk("memmove bogus src_offset %lu move len %lu len %lu\n",
3379 src_offset, len, dst->len);
3380 BUG_ON(1);
3381 }
3382 if (dst_offset + len > dst->len) {
3383 printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
3384 dst_offset, len, dst->len);
3385 BUG_ON(1);
3386 }
3387 if (dst_offset < src_offset) {
3388 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3389 return;
3390 }
3391 while(len > 0) {
3392 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3393 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3394
3395 dst_off_in_page = (start_offset + dst_end) &
3396 ((unsigned long)PAGE_CACHE_SIZE - 1);
3397 src_off_in_page = (start_offset + src_end) &
3398 ((unsigned long)PAGE_CACHE_SIZE - 1);
3399
3400 cur = min_t(unsigned long, len, src_off_in_page + 1);
3401 cur = min(cur, dst_off_in_page + 1);
3402 move_pages(extent_buffer_page(dst, dst_i),
3403 extent_buffer_page(dst, src_i),
3404 dst_off_in_page - cur + 1,
3405 src_off_in_page - cur + 1, cur);
3406
3407 dst_end -= cur;
3408 src_end -= cur;
3409 len -= cur;
3410 }
3411}
3412EXPORT_SYMBOL(memmove_extent_buffer);
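/*
 * A usage sketch (offsets assumed for illustration): shifting a region
 * to the right inside one buffer must go through the memmove variant,
 * since source and destination overlap:
 *
 *	memmove_extent_buffer(leaf, src_offset + size, src_offset, len);
 *
 * With dst_offset > src_offset the ranges are walked from the tail in
 * page-safe chunks; with dst_offset < src_offset the call is simply
 * forwarded to memcpy_extent_buffer() above.
 */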
3413
3414int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3415{
3416 u64 start = page_offset(page);
3417 struct extent_buffer *eb;
3418 int ret = 1;
3419 unsigned long i;
3420 unsigned long num_pages;
3421
3422 spin_lock(&tree->buffer_lock);
3423 eb = buffer_search(tree, start);
3424 if (!eb)
3425 goto out;
3426
3427 if (atomic_read(&eb->refs) > 1) {
3428 ret = 0;
3429 goto out;
3430 }
3431 /* at this point we can safely release the extent buffer */
3432 num_pages = num_extent_pages(eb->start, eb->len);
3433 for (i = 0; i < num_pages; i++)
3434 page_cache_release(extent_buffer_page(eb, i));
3435 rb_erase(&eb->rb_node, &tree->buffer);
3436 __free_extent_buffer(eb);
3437out:
3438 spin_unlock(&tree->buffer_lock);
3439 return ret;
3440}
3441EXPORT_SYMBOL(try_release_extent_buffer);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 000000000000..3cb411a5f4d3
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,247 @@
1#ifndef __EXTENTIO__
2#define __EXTENTIO__
3
4#include <linux/rbtree.h>
5
6/* bits for the extent state */
7#define EXTENT_DIRTY 1
8#define EXTENT_WRITEBACK (1 << 1)
9#define EXTENT_UPTODATE (1 << 2)
10#define EXTENT_LOCKED (1 << 3)
11#define EXTENT_NEW (1 << 4)
12#define EXTENT_DELALLOC (1 << 5)
13#define EXTENT_DEFRAG (1 << 6)
14#define EXTENT_DEFRAG_DONE (1 << 7)
15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_ORDERED (1 << 9)
17#define EXTENT_ORDERED_METADATA (1 << 10)
18#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
19
20/*
21 * page->private values. Every page that is controlled by the extent
22 * map has page->private set to one of the values below.
23 */
24#define EXTENT_PAGE_PRIVATE 1
25#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
26
27struct extent_state;
28
29typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
30 struct bio *bio, int mirror_num);
31struct extent_io_ops {
32 int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
33 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
34 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
35 extent_submit_bio_hook_t *submit_bio_hook;
36 int (*merge_bio_hook)(struct page *page, unsigned long offset,
37 size_t size, struct bio *bio);
38 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
39 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
40 u64 start, u64 end,
41 struct extent_state *state);
42 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
43 u64 start, u64 end,
44 struct extent_state *state);
45 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
46 struct extent_state *state);
47 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
48 struct extent_state *state, int uptodate);
49 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
50 unsigned long old, unsigned long bits);
51 int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
52 unsigned long old, unsigned long bits);
53 int (*write_cache_pages_lock_hook)(struct page *page);
54};
55
56struct extent_io_tree {
57 struct rb_root state;
58 struct rb_root buffer;
59 struct address_space *mapping;
60 u64 dirty_bytes;
61 spinlock_t lock;
62 spinlock_t buffer_lock;
63 struct extent_io_ops *ops;
64};
65
66struct extent_state {
67 u64 start;
68 u64 end; /* inclusive */
69 struct rb_node rb_node;
70 struct extent_io_tree *tree;
71 wait_queue_head_t wq;
72 atomic_t refs;
73 unsigned long state;
74
75 /* for use by the FS */
76 u64 private;
77
78 struct list_head leak_list;
79};
80
81struct extent_buffer {
82 u64 start;
83 unsigned long len;
84 char *map_token;
85 char *kaddr;
86 unsigned long map_start;
87 unsigned long map_len;
88 struct page *first_page;
89 atomic_t refs;
90 int flags;
91 struct list_head leak_list;
92 struct rb_node rb_node;
93 struct mutex mutex;
94};
95
96struct extent_map_tree;
97
98static inline struct extent_state *extent_state_next(struct extent_state *state)
99{
100 struct rb_node *node;
101 node = rb_next(&state->rb_node);
102 if (!node)
103 return NULL;
104 return rb_entry(node, struct extent_state, rb_node);
105}
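/*
 * A minimal sketch (locking left to the caller) of walking a run of
 * extent states with the helper above; tree->lock must be held so the
 * rb_node links stay stable:
 *
 *	struct extent_state *state;
 *
 *	state = find_first_extent_bit_state(tree, start, bits);
 *	while (state && state->start <= end) {
 *		... inspect state->state, state->start, state->end ...
 *		state = extent_state_next(state);
 *	}
 */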
106
107typedef struct extent_map *(get_extent_t)(struct inode *inode,
108 struct page *page,
109 size_t page_offset,
110 u64 start, u64 len,
111 int create);
112
113void extent_io_tree_init(struct extent_io_tree *tree,
114 struct address_space *mapping, gfp_t mask);
115int try_release_extent_mapping(struct extent_map_tree *map,
116 struct extent_io_tree *tree, struct page *page,
117 gfp_t mask);
118int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
119int try_release_extent_state(struct extent_map_tree *map,
120 struct extent_io_tree *tree, struct page *page,
121 gfp_t mask);
122int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
123int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
124int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
125 get_extent_t *get_extent);
126int __init extent_io_init(void);
127void extent_io_exit(void);
128
129u64 count_range_bits(struct extent_io_tree *tree,
130 u64 *start, u64 search_end,
131 u64 max_bytes, unsigned long bits);
132
133int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
134 int bits, int filled);
135int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
136 int bits, gfp_t mask);
137int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
138 int bits, int wake, int delete, gfp_t mask);
139int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
140 int bits, gfp_t mask);
141int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
142 gfp_t mask);
143int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
144 gfp_t mask);
145int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
146 gfp_t mask);
147int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
148 gfp_t mask);
149int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
150 gfp_t mask);
151int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
152 u64 end, gfp_t mask);
153int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
154 gfp_t mask);
155int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
156 gfp_t mask);
157int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
158 u64 *start_ret, u64 *end_ret, int bits);
159struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
160 u64 start, int bits);
161int extent_invalidatepage(struct extent_io_tree *tree,
162 struct page *page, unsigned long offset);
163int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
164 get_extent_t *get_extent,
165 struct writeback_control *wbc);
166int extent_writepages(struct extent_io_tree *tree,
167 struct address_space *mapping,
168 get_extent_t *get_extent,
169 struct writeback_control *wbc);
170int extent_readpages(struct extent_io_tree *tree,
171 struct address_space *mapping,
172 struct list_head *pages, unsigned nr_pages,
173 get_extent_t get_extent);
174int extent_prepare_write(struct extent_io_tree *tree,
175 struct inode *inode, struct page *page,
176 unsigned from, unsigned to, get_extent_t *get_extent);
177int extent_commit_write(struct extent_io_tree *tree,
178 struct inode *inode, struct page *page,
179 unsigned from, unsigned to);
180sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
181 get_extent_t *get_extent);
182int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
183int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
184int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
185void set_page_extent_mapped(struct page *page);
186
187struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
188 u64 start, unsigned long len,
189 struct page *page0,
190 gfp_t mask);
191struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
192 u64 start, unsigned long len,
193 gfp_t mask);
194void free_extent_buffer(struct extent_buffer *eb);
195int read_extent_buffer_pages(struct extent_io_tree *tree,
196 struct extent_buffer *eb, u64 start, int wait,
197 get_extent_t *get_extent, int mirror_num);
198
199static inline void extent_buffer_get(struct extent_buffer *eb)
200{
201 atomic_inc(&eb->refs);
202}
203
204int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
205 unsigned long start,
206 unsigned long len);
207void read_extent_buffer(struct extent_buffer *eb, void *dst,
208 unsigned long start,
209 unsigned long len);
210void write_extent_buffer(struct extent_buffer *eb, const void *src,
211 unsigned long start, unsigned long len);
212void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
213 unsigned long dst_offset, unsigned long src_offset,
214 unsigned long len);
215void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
216 unsigned long src_offset, unsigned long len);
217void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
218 unsigned long src_offset, unsigned long len);
219void memset_extent_buffer(struct extent_buffer *eb, char c,
220 unsigned long start, unsigned long len);
221int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
222 struct extent_buffer *eb);
223int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
224int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
225int clear_extent_buffer_dirty(struct extent_io_tree *tree,
226 struct extent_buffer *eb);
227int set_extent_buffer_dirty(struct extent_io_tree *tree,
228 struct extent_buffer *eb);
229int set_extent_buffer_uptodate(struct extent_io_tree *tree,
230 struct extent_buffer *eb);
231int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
232 struct extent_buffer *eb);
233int extent_buffer_uptodate(struct extent_io_tree *tree,
234 struct extent_buffer *eb);
235int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
236 unsigned long min_len, char **token, char **map,
237 unsigned long *map_start,
238 unsigned long *map_len, int km);
239int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
240 unsigned long min_len, char **token, char **map,
241 unsigned long *map_start,
242 unsigned long *map_len, int km);
243void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
244int release_extent_buffer_tail_pages(struct extent_buffer *eb);
245int extent_range_uptodate(struct extent_io_tree *tree,
246 u64 start, u64 end);
247#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 000000000000..78ced11d18c7
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,332 @@
1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h>
4#include <linux/module.h>
5#include <linux/spinlock.h>
6#include <linux/version.h>
7#include <linux/hardirq.h>
8#include "extent_map.h"
9
10/* temporary define until extent_map moves out of btrfs */
11struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
12 unsigned long extra_flags,
13 void (*ctor)(void *, struct kmem_cache *,
14 unsigned long));
15
16static struct kmem_cache *extent_map_cache;
17
18int __init extent_map_init(void)
19{
20 extent_map_cache = btrfs_cache_create("extent_map",
21 sizeof(struct extent_map), 0,
22 NULL);
23 if (!extent_map_cache)
24 return -ENOMEM;
25 return 0;
26}
27
28void extent_map_exit(void)
29{
30 if (extent_map_cache)
31 kmem_cache_destroy(extent_map_cache);
32}
33
34/**
35 * extent_map_tree_init - initialize extent map tree
36 * @tree: tree to initialize
37 * @mask: flags for memory allocations during tree operations
38 *
39 * Initialize the extent tree @tree. Should be called for each new inode
40 * or other user of the extent_map interface.
41 */
42void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
43{
44 tree->map.rb_node = NULL;
45 spin_lock_init(&tree->lock);
46}
47EXPORT_SYMBOL(extent_map_tree_init);
48
49/**
50 * alloc_extent_map - allocate new extent map structure
51 * @mask: memory allocation flags
52 *
53 * Allocate a new extent_map structure. The new structure is
54 * returned with a reference count of one and needs to be
55 * freed using free_extent_map()
56 */
57struct extent_map *alloc_extent_map(gfp_t mask)
58{
59 struct extent_map *em;
60 em = kmem_cache_alloc(extent_map_cache, mask);
61 if (!em || IS_ERR(em))
62 return em;
63 em->in_tree = 0;
64 em->flags = 0;
65 atomic_set(&em->refs, 1);
66 return em;
67}
68EXPORT_SYMBOL(alloc_extent_map);
69
70/**
71 * free_extent_map - drop reference count of an extent_map
72 * @em: extent map being released
73 *
74 * Drops the reference count on @em by one and frees the structure
75 * if the reference count hits zero.
76 */
77void free_extent_map(struct extent_map *em)
78{
79 if (!em)
80 return;
81 WARN_ON(atomic_read(&em->refs) == 0);
82 if (atomic_dec_and_test(&em->refs)) {
83 WARN_ON(em->in_tree);
84 kmem_cache_free(extent_map_cache, em);
85 }
86}
87EXPORT_SYMBOL(free_extent_map);
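/*
 * A minimal lifecycle sketch (error handling trimmed): the allocator
 * above hands back one reference and every lookup_extent_mapping() hit
 * takes another, so each user drops its own:
 *
 *	struct extent_map *em = alloc_extent_map(GFP_NOFS);
 *	if (!em)
 *		return -ENOMEM;
 *	em->start = start;
 *	em->len = len;
 *	em->block_start = block_start;
 *	...
 *	free_extent_map(em);
 */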
88
89static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
90 struct rb_node *node)
91{
92 struct rb_node ** p = &root->rb_node;
93 struct rb_node * parent = NULL;
94 struct extent_map *entry;
95
96 while(*p) {
97 parent = *p;
98 entry = rb_entry(parent, struct extent_map, rb_node);
99
100 WARN_ON(!entry->in_tree);
101
102 if (offset < entry->start)
103 p = &(*p)->rb_left;
104 else if (offset >= extent_map_end(entry))
105 p = &(*p)->rb_right;
106 else
107 return parent;
108 }
109
110 entry = rb_entry(node, struct extent_map, rb_node);
111 entry->in_tree = 1;
112 rb_link_node(node, parent, p);
113 rb_insert_color(node, root);
114 return NULL;
115}
116
117static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
118 struct rb_node **prev_ret,
119 struct rb_node **next_ret)
120{
121 struct rb_node * n = root->rb_node;
122 struct rb_node *prev = NULL;
123 struct rb_node *orig_prev = NULL;
124 struct extent_map *entry;
125 struct extent_map *prev_entry = NULL;
126
127 while(n) {
128 entry = rb_entry(n, struct extent_map, rb_node);
129 prev = n;
130 prev_entry = entry;
131
132 WARN_ON(!entry->in_tree);
133
134 if (offset < entry->start)
135 n = n->rb_left;
136 else if (offset >= extent_map_end(entry))
137 n = n->rb_right;
138 else
139 return n;
140 }
141
142 if (prev_ret) {
143 orig_prev = prev;
144 while(prev && offset >= extent_map_end(prev_entry)) {
145 prev = rb_next(prev);
146 prev_entry = rb_entry(prev, struct extent_map, rb_node);
147 }
148 *prev_ret = prev;
149 prev = orig_prev;
150 }
151
152 if (next_ret) {
153 prev_entry = rb_entry(prev, struct extent_map, rb_node);
154 while(prev && offset < prev_entry->start) {
155 prev = rb_prev(prev);
156 prev_entry = rb_entry(prev, struct extent_map, rb_node);
157 }
158 *next_ret = prev;
159 }
160 return NULL;
161}
162
163static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
164{
165 struct rb_node *prev;
166 struct rb_node *ret;
167 ret = __tree_search(root, offset, &prev, NULL);
168 if (!ret)
169 return prev;
170 return ret;
171}
172
173static int mergable_maps(struct extent_map *prev, struct extent_map *next)
174{
175 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
176 return 0;
177
178 if (extent_map_end(prev) == next->start &&
179 prev->flags == next->flags &&
180 prev->bdev == next->bdev &&
181 ((next->block_start == EXTENT_MAP_HOLE &&
182 prev->block_start == EXTENT_MAP_HOLE) ||
183 (next->block_start == EXTENT_MAP_INLINE &&
184 prev->block_start == EXTENT_MAP_INLINE) ||
185 (next->block_start == EXTENT_MAP_DELALLOC &&
186 prev->block_start == EXTENT_MAP_DELALLOC) ||
187 (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
188 next->block_start == extent_map_block_end(prev)))) {
189 return 1;
190 }
191 return 0;
192}
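/*
 * For example (sizes assumed), two physically contiguous mappings
 *
 *	prev: start 0,    len 4096, block_start 1048576
 *	next: start 4096, len 4096, block_start 1052672
 *
 * satisfy extent_map_end(prev) == next->start and
 * next->block_start == extent_map_block_end(prev), so they merge into
 * one map covering [0, 8192), provided neither is pinned and the flags
 * and bdev match.
 */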
193
194/**
195 * add_extent_mapping - add new extent map to the extent tree
196 * @tree: tree to insert new map in
197 * @em: map to insert
198 *
199 * Insert @em into @tree or perform a simple forward/backward merge with
200 * existing mappings. The extent_map struct passed in will be inserted
201 * into the tree directly, with an additional reference taken, or a
202 * reference dropped if the merge attempt was successful.
203 */
204int add_extent_mapping(struct extent_map_tree *tree,
205 struct extent_map *em)
206{
207 int ret = 0;
208 struct extent_map *merge = NULL;
209 struct rb_node *rb;
210 struct extent_map *exist;
211
212 exist = lookup_extent_mapping(tree, em->start, em->len);
213 if (exist) {
214 free_extent_map(exist);
215 ret = -EEXIST;
216 goto out;
217 }
218 assert_spin_locked(&tree->lock);
219 rb = tree_insert(&tree->map, em->start, &em->rb_node);
220 if (rb) {
221 ret = -EEXIST;
222 free_extent_map(merge);
223 goto out;
224 }
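	/* the tree now holds its own reference on em, on top of the caller's */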
225 atomic_inc(&em->refs);
226 if (em->start != 0) {
227 rb = rb_prev(&em->rb_node);
228 if (rb)
229 merge = rb_entry(rb, struct extent_map, rb_node);
230 if (rb && mergable_maps(merge, em)) {
231 em->start = merge->start;
232 em->len += merge->len;
233 em->block_start = merge->block_start;
234 merge->in_tree = 0;
235 rb_erase(&merge->rb_node, &tree->map);
236 free_extent_map(merge);
237 }
238 }
239 rb = rb_next(&em->rb_node);
240 if (rb)
241 merge = rb_entry(rb, struct extent_map, rb_node);
242 if (rb && mergable_maps(em, merge)) {
243 em->len += merge->len;
244 rb_erase(&merge->rb_node, &tree->map);
245 merge->in_tree = 0;
246 free_extent_map(merge);
247 }
248out:
249 return ret;
250}
251EXPORT_SYMBOL(add_extent_mapping);
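/*
 * A usage sketch (the map fields are assumed to be filled in by the
 * caller): tree->lock must be held across the insert, as the assert in
 * the function checks:
 *
 *	spin_lock(&tree->lock);
 *	ret = add_extent_mapping(tree, em);
 *	spin_unlock(&tree->lock);
 *	if (ret == -EEXIST) {
 *		... an overlapping mapping already exists ...
 *	}
 */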
252
253static u64 range_end(u64 start, u64 len)
254{
255 if (start + len < start)
256 return (u64)-1;
257 return start + len;
258}
259
260/**
261 * lookup_extent_mapping - lookup extent_map
262 * @tree: tree to lookup in
263 * @start: byte offset to start the search
264 * @len: length of the lookup range
265 *
266 * Find and return the first extent_map struct in @tree that intersects the
267 * [start, start + len) range. There may be additional objects in the tree
268 * that intersect, so check the object returned carefully to make sure that no
269 * additional lookups are needed.
270 */
271struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
272 u64 start, u64 len)
273{
274 struct extent_map *em;
275 struct rb_node *rb_node;
276 struct rb_node *prev = NULL;
277 struct rb_node *next = NULL;
278 u64 end = range_end(start, len);
279
280 assert_spin_locked(&tree->lock);
281 rb_node = __tree_search(&tree->map, start, &prev, &next);
282 if (!rb_node && prev) {
283 em = rb_entry(prev, struct extent_map, rb_node);
284 if (end > em->start && start < extent_map_end(em))
285 goto found;
286 }
287 if (!rb_node && next) {
288 em = rb_entry(next, struct extent_map, rb_node);
289 if (end > em->start && start < extent_map_end(em))
290 goto found;
291 }
292 if (!rb_node) {
293 em = NULL;
294 goto out;
295 }
296 if (IS_ERR(rb_node)) {
297 em = ERR_PTR(PTR_ERR(rb_node));
298 goto out;
299 }
300 em = rb_entry(rb_node, struct extent_map, rb_node);
301 if (end > em->start && start < extent_map_end(em))
302 goto found;
303
304 em = NULL;
305 goto out;
306
307found:
308 atomic_inc(&em->refs);
309out:
310 return em;
311}
312EXPORT_SYMBOL(lookup_extent_mapping);
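/*
 * A usage sketch: the lookup takes a reference on the map it returns,
 * so the caller drops it when done, and tree->lock covers the search:
 *
 *	spin_lock(&tree->lock);
 *	em = lookup_extent_mapping(tree, start, len);
 *	spin_unlock(&tree->lock);
 *	if (em) {
 *		... use em->start, em->len, em->block_start ...
 *		free_extent_map(em);
 *	}
 */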
313
314/**
315 * remove_extent_mapping - removes an extent_map from the extent tree
316 * @tree: extent tree to remove from
317 * @em: extent map being removed
318 *
319 * Removes @em from @tree. No reference counts are dropped, and no checks
320 * are done to see if the range is in use.
321 */
322int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
323{
324 int ret = 0;
325
326 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
327 assert_spin_locked(&tree->lock);
328 rb_erase(&em->rb_node, &tree->map);
329 em->in_tree = 0;
330 return ret;
331}
332EXPORT_SYMBOL(remove_extent_mapping);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 000000000000..26ac6fe0b268
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,57 @@
1#ifndef __EXTENTMAP__
2#define __EXTENTMAP__
3
4#include <linux/rbtree.h>
5
6#define EXTENT_MAP_LAST_BYTE (u64)-4
7#define EXTENT_MAP_HOLE (u64)-3
8#define EXTENT_MAP_INLINE (u64)-2
9#define EXTENT_MAP_DELALLOC (u64)-1
10
11/* bits for the flags field */
12#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
13
14struct extent_map {
15 struct rb_node rb_node;
16
17 /* all of these are in bytes */
18 u64 start;
19 u64 len;
20 u64 block_start;
21 unsigned long flags;
22 struct block_device *bdev;
23 atomic_t refs;
24 int in_tree;
25};
26
27struct extent_map_tree {
28 struct rb_root map;
29 spinlock_t lock;
30};
31
32static inline u64 extent_map_end(struct extent_map *em)
33{
34 if (em->start + em->len < em->start)
35 return (u64)-1;
36 return em->start + em->len;
37}
38
39static inline u64 extent_map_block_end(struct extent_map *em)
40{
41 if (em->block_start + em->len < em->block_start)
42 return (u64)-1;
43 return em->block_start + em->len;
44}
45
46void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
47struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
48 u64 start, u64 len);
49int add_extent_mapping(struct extent_map_tree *tree,
50 struct extent_map *em);
51int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
52
53struct extent_map *alloc_extent_map(gfp_t mask);
54void free_extent_map(struct extent_map *em);
55int __init extent_map_init(void);
56void extent_map_exit(void);
57#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
new file mode 100644
index 000000000000..6dbe88b9d7d4
--- /dev/null
+++ b/fs/btrfs/file-item.c
@@ -0,0 +1,512 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/bio.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include "ctree.h"
23#include "disk-io.h"
24#include "transaction.h"
25#include "print-tree.h"
26
27#define MAX_CSUM_ITEMS(r) ((((BTRFS_LEAF_DATA_SIZE(r) - \
28 sizeof(struct btrfs_item) * 2) / \
29 BTRFS_CRC32_SIZE) - 1))
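/*
 * Rough arithmetic, for illustration only (header and item sizes
 * assumed): BTRFS_LEAF_DATA_SIZE() is the leaf size minus the leaf
 * header, so with 4096 byte leaves MAX_CSUM_ITEMS comes out near
 *
 *	(4096 - ~100 - 2 * sizeof(struct btrfs_item)) / BTRFS_CRC32_SIZE - 1
 *
 * or roughly a thousand crc32s, letting one csum item cover a few
 * megabytes of 4K sectors.
 */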
30int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root,
32 u64 objectid, u64 pos,
33 u64 disk_offset, u64 disk_num_bytes,
34 u64 num_bytes, u64 offset)
35{
36 int ret = 0;
37 struct btrfs_file_extent_item *item;
38 struct btrfs_key file_key;
39 struct btrfs_path *path;
40 struct extent_buffer *leaf;
41
42 path = btrfs_alloc_path();
43 BUG_ON(!path);
44 file_key.objectid = objectid;
45 file_key.offset = pos;
46 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
47
48 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
49 sizeof(*item));
50 if (ret < 0)
51 goto out;
52 BUG_ON(ret);
53 leaf = path->nodes[0];
54 item = btrfs_item_ptr(leaf, path->slots[0],
55 struct btrfs_file_extent_item);
56 btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
57 btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
58 btrfs_set_file_extent_offset(leaf, item, offset);
59 btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
60 btrfs_set_file_extent_generation(leaf, item, trans->transid);
61 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
62 btrfs_mark_buffer_dirty(leaf);
63out:
64 btrfs_free_path(path);
65 return ret;
66}
67
68struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
69 struct btrfs_root *root,
70 struct btrfs_path *path,
71 u64 objectid, u64 offset,
72 int cow)
73{
74 int ret;
75 struct btrfs_key file_key;
76 struct btrfs_key found_key;
77 struct btrfs_csum_item *item;
78 struct extent_buffer *leaf;
79 u64 csum_offset = 0;
80 int csums_in_item;
81
82 file_key.objectid = objectid;
83 file_key.offset = offset;
84 btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
85 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
86 if (ret < 0)
87 goto fail;
88 leaf = path->nodes[0];
89 if (ret > 0) {
90 ret = 1;
91 if (path->slots[0] == 0)
92 goto fail;
93 path->slots[0]--;
94 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
95 if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
96 found_key.objectid != objectid) {
97 goto fail;
98 }
99 csum_offset = (offset - found_key.offset) >>
100 root->fs_info->sb->s_blocksize_bits;
101 csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
102 csums_in_item /= BTRFS_CRC32_SIZE;
103
104 if (csum_offset >= csums_in_item) {
105 ret = -EFBIG;
106 goto fail;
107 }
108 }
109 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
110 item = (struct btrfs_csum_item *)((unsigned char *)item +
111 csum_offset * BTRFS_CRC32_SIZE);
112 return item;
113fail:
114 if (ret > 0)
115 ret = -ENOENT;
116 return ERR_PTR(ret);
117}
118
119
120int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
121 struct btrfs_root *root,
122 struct btrfs_path *path, u64 objectid,
123 u64 offset, int mod)
124{
125 int ret;
126 struct btrfs_key file_key;
127 int ins_len = mod < 0 ? -1 : 0;
128 int cow = mod != 0;
129
130 file_key.objectid = objectid;
131 file_key.offset = offset;
132 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
133 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
134 return ret;
135}
136
137int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
138 struct bio *bio)
139{
140 u32 sum;
141 struct bio_vec *bvec = bio->bi_io_vec;
142 int bio_index = 0;
143 u64 offset;
144 u64 item_start_offset = 0;
145 u64 item_last_offset = 0;
146 u32 diff;
147 int ret;
148 struct btrfs_path *path;
149 struct btrfs_csum_item *item = NULL;
150 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
151
152	path = btrfs_alloc_path();
	BUG_ON(!path);
153 if (bio->bi_size > PAGE_CACHE_SIZE * 8)
154 path->reada = 2;
155
156 WARN_ON(bio->bi_vcnt <= 0);
157
158 while(bio_index < bio->bi_vcnt) {
159 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
160 ret = btrfs_find_ordered_sum(inode, offset, &sum);
161 if (ret == 0)
162 goto found;
163
164 if (!item || offset < item_start_offset ||
165 offset >= item_last_offset) {
166 struct btrfs_key found_key;
167 u32 item_size;
168
169 if (item)
170 btrfs_release_path(root, path);
171 item = btrfs_lookup_csum(NULL, root, path,
172 inode->i_ino, offset, 0);
173 if (IS_ERR(item)) {
174 ret = PTR_ERR(item);
175 if (ret == -ENOENT || ret == -EFBIG)
176 ret = 0;
177 sum = 0;
178 printk("no csum found for inode %lu start "
179 "%llu\n", inode->i_ino,
180 (unsigned long long)offset);
181 item = NULL;
182 goto found;
183 }
184 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
185 path->slots[0]);
186
187 item_start_offset = found_key.offset;
188 item_size = btrfs_item_size_nr(path->nodes[0],
189 path->slots[0]);
190 item_last_offset = item_start_offset +
191 (item_size / BTRFS_CRC32_SIZE) *
192 root->sectorsize;
193 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
194 struct btrfs_csum_item);
195 }
196 /*
197 * this byte range must be able to fit inside
198 * a single leaf so it will also fit inside a u32
199 */
200 diff = offset - item_start_offset;
201 diff = diff / root->sectorsize;
202 diff = diff * BTRFS_CRC32_SIZE;
203
204 read_extent_buffer(path->nodes[0], &sum,
205 ((unsigned long)item) + diff,
206 BTRFS_CRC32_SIZE);
207found:
208 set_state_private(io_tree, offset, sum);
209 bio_index++;
210 bvec++;
211 }
212 btrfs_free_path(path);
213 return 0;
214}
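/*
 * A worked example of the offset math above (4K sectors and 4 byte
 * crc32s assumed): for a bio byte at file offset 0x6000 checked against
 * a csum item starting at offset 0x4000,
 *
 *	diff = (0x6000 - 0x4000) / 4096 * BTRFS_CRC32_SIZE = 8
 *
 * so the checksum is read 8 bytes into the item, the third 4-byte slot.
 */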
215
216int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
217 struct bio *bio)
218{
219 struct btrfs_ordered_sum *sums;
220 struct btrfs_sector_sum *sector_sum;
221 struct btrfs_ordered_extent *ordered;
222 char *data;
223 struct bio_vec *bvec = bio->bi_io_vec;
224 int bio_index = 0;
225 unsigned long total_bytes = 0;
226 unsigned long this_sum_bytes = 0;
227 u64 offset;
228
229 WARN_ON(bio->bi_vcnt <= 0);
230 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
231 if (!sums)
232 return -ENOMEM;
233
234 sector_sum = sums->sums;
235 sums->file_offset = page_offset(bvec->bv_page) + bvec->bv_offset;
236 sums->len = bio->bi_size;
237 INIT_LIST_HEAD(&sums->list);
238 ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
239 BUG_ON(!ordered);
240
241 while(bio_index < bio->bi_vcnt) {
242 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
243 if (offset >= ordered->file_offset + ordered->len ||
244 offset < ordered->file_offset) {
245 unsigned long bytes_left;
246 sums->len = this_sum_bytes;
247 this_sum_bytes = 0;
248 btrfs_add_ordered_sum(inode, ordered, sums);
249 btrfs_put_ordered_extent(ordered);
250
251 bytes_left = bio->bi_size - total_bytes;
252
253 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
254 GFP_NOFS);
255 BUG_ON(!sums);
256 sector_sum = sums->sums;
257 sums->len = bytes_left;
258 sums->file_offset = offset;
259 ordered = btrfs_lookup_ordered_extent(inode,
260 sums->file_offset);
261 BUG_ON(!ordered);
262 }
263
264 data = kmap_atomic(bvec->bv_page, KM_USER0);
265 sector_sum->sum = ~(u32)0;
266 sector_sum->sum = btrfs_csum_data(root,
267 data + bvec->bv_offset,
268 sector_sum->sum,
269 bvec->bv_len);
270 kunmap_atomic(data, KM_USER0);
271 btrfs_csum_final(sector_sum->sum,
272 (char *)&sector_sum->sum);
273 sector_sum->offset = page_offset(bvec->bv_page) +
274 bvec->bv_offset;
275
276 sector_sum++;
277 bio_index++;
278 total_bytes += bvec->bv_len;
279 this_sum_bytes += bvec->bv_len;
280 bvec++;
281 }
282 this_sum_bytes = 0;
283 btrfs_add_ordered_sum(inode, ordered, sums);
284 btrfs_put_ordered_extent(ordered);
285 return 0;
286}
287
288int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
289 struct btrfs_root *root, struct inode *inode,
290 struct btrfs_ordered_sum *sums)
291{
292 u64 objectid = inode->i_ino;
293 u64 offset;
294 int ret;
295 struct btrfs_key file_key;
296 struct btrfs_key found_key;
297 u64 next_offset;
298 u64 total_bytes = 0;
299 int found_next;
300 struct btrfs_path *path;
301 struct btrfs_csum_item *item;
302 struct btrfs_csum_item *item_end;
303 struct extent_buffer *leaf = NULL;
304 u64 csum_offset;
305 struct btrfs_sector_sum *sector_sum;
306 u32 nritems;
307 u32 ins_size;
308 char *eb_map;
309 char *eb_token;
310 unsigned long map_len;
311 unsigned long map_start;
312
313 path = btrfs_alloc_path();
314 BUG_ON(!path);
315 sector_sum = sums->sums;
316again:
317 next_offset = (u64)-1;
318 found_next = 0;
319 offset = sector_sum->offset;
320 file_key.objectid = objectid;
321 file_key.offset = offset;
322 btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
323
324 mutex_lock(&BTRFS_I(inode)->csum_mutex);
325 item = btrfs_lookup_csum(trans, root, path, objectid, offset, 1);
326 if (!IS_ERR(item)) {
327 leaf = path->nodes[0];
328 ret = 0;
329 goto found;
330 }
331 ret = PTR_ERR(item);
332 if (ret == -EFBIG) {
333 u32 item_size;
334 /* we found one, but it isn't big enough yet */
335 leaf = path->nodes[0];
336 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
337 if ((item_size / BTRFS_CRC32_SIZE) >= MAX_CSUM_ITEMS(root)) {
338 /* already at max size, make a new one */
339 goto insert;
340 }
341 } else {
342 int slot = path->slots[0] + 1;
343 /* we didn't find a csum item, insert one */
344 nritems = btrfs_header_nritems(path->nodes[0]);
345 if (path->slots[0] >= nritems - 1) {
346 ret = btrfs_next_leaf(root, path);
347 if (ret == 1)
348 found_next = 1;
349 if (ret != 0)
350 goto insert;
351 slot = 0;
352 }
353 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
354 if (found_key.objectid != objectid ||
355 found_key.type != BTRFS_CSUM_ITEM_KEY) {
356 found_next = 1;
357 goto insert;
358 }
359 next_offset = found_key.offset;
360 found_next = 1;
361 goto insert;
362 }
363
364 /*
365 * at this point, we know the tree has an item, but it isn't big
366 * enough yet to put our csum in. Grow it
367 */
368 btrfs_release_path(root, path);
369 ret = btrfs_search_slot(trans, root, &file_key, path,
370 BTRFS_CRC32_SIZE, 1);
371 if (ret < 0)
372 goto fail_unlock;
373 if (ret == 0) {
374 BUG();
375 }
376 if (path->slots[0] == 0) {
377 goto insert;
378 }
379 path->slots[0]--;
380 leaf = path->nodes[0];
381 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
382 csum_offset = (offset - found_key.offset) >>
383 root->fs_info->sb->s_blocksize_bits;
384 if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
385 found_key.objectid != objectid ||
386 csum_offset >= MAX_CSUM_ITEMS(root)) {
387 goto insert;
388 }
389 if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
390 BTRFS_CRC32_SIZE) {
391 u32 diff = (csum_offset + 1) * BTRFS_CRC32_SIZE;
392 diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
393 if (diff != BTRFS_CRC32_SIZE)
394 goto insert;
395 ret = btrfs_extend_item(trans, root, path, diff);
396 BUG_ON(ret);
397 goto csum;
398 }
399
400insert:
401 btrfs_release_path(root, path);
402 csum_offset = 0;
403 if (found_next) {
404 u64 tmp = min((u64)i_size_read(inode), next_offset);
405 tmp -= offset & ~((u64)root->sectorsize -1);
406 tmp >>= root->fs_info->sb->s_blocksize_bits;
407 tmp = max((u64)1, tmp);
408 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root));
409 ins_size = BTRFS_CRC32_SIZE * tmp;
410 } else {
411 ins_size = BTRFS_CRC32_SIZE;
412 }
413 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
414 ins_size);
415 if (ret < 0)
416 goto fail_unlock;
417 if (ret != 0) {
418 WARN_ON(1);
419 goto fail_unlock;
420 }
421csum:
422 leaf = path->nodes[0];
423 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
424 ret = 0;
425 item = (struct btrfs_csum_item *)((unsigned char *)item +
426 csum_offset * BTRFS_CRC32_SIZE);
427found:
428 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
429 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
430 btrfs_item_size_nr(leaf, path->slots[0]));
431 eb_token = NULL;
432 mutex_unlock(&BTRFS_I(inode)->csum_mutex);
433 cond_resched();
434next_sector:
435
436 if (!eb_token ||
437 (unsigned long)item + BTRFS_CRC32_SIZE >= map_start + map_len) {
438 int err;
439
440 if (eb_token)
441 unmap_extent_buffer(leaf, eb_token, KM_USER1);
442 eb_token = NULL;
443 err = map_private_extent_buffer(leaf, (unsigned long)item,
444 BTRFS_CRC32_SIZE,
445 &eb_token, &eb_map,
446 &map_start, &map_len, KM_USER1);
447 if (err)
448 eb_token = NULL;
449 }
450 if (eb_token) {
451 memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
452 &sector_sum->sum, BTRFS_CRC32_SIZE);
453 } else {
454 write_extent_buffer(leaf, &sector_sum->sum,
455 (unsigned long)item, BTRFS_CRC32_SIZE);
456 }
457
458 total_bytes += root->sectorsize;
459 sector_sum++;
460 if (total_bytes < sums->len) {
461 item = (struct btrfs_csum_item *)((char *)item +
462 BTRFS_CRC32_SIZE);
463 if (item < item_end && offset + PAGE_CACHE_SIZE ==
464 sector_sum->offset) {
465 offset = sector_sum->offset;
466 goto next_sector;
467 }
468 }
469 if (eb_token) {
470 unmap_extent_buffer(leaf, eb_token, KM_USER1);
471 eb_token = NULL;
472 }
473 btrfs_mark_buffer_dirty(path->nodes[0]);
474 cond_resched();
475 if (total_bytes < sums->len) {
476 btrfs_release_path(root, path);
477 goto again;
478 }
479out:
480 btrfs_free_path(path);
481 return ret;
482
483fail_unlock:
484 mutex_unlock(&BTRFS_I(inode)->csum_mutex);
485 goto out;
486}
487
488int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
489 struct btrfs_root *root, struct btrfs_path *path,
490 u64 isize)
491{
492 struct btrfs_key key;
493 struct extent_buffer *leaf = path->nodes[0];
494 int slot = path->slots[0];
495 int ret;
496 u32 new_item_size;
497 u64 new_item_span;
498 u64 blocks;
499
500 btrfs_item_key_to_cpu(leaf, &key, slot);
501 if (isize <= key.offset)
502 return 0;
503 new_item_span = isize - key.offset;
504 blocks = (new_item_span + root->sectorsize - 1) >>
505 root->fs_info->sb->s_blocksize_bits;
506 new_item_size = blocks * BTRFS_CRC32_SIZE;
507 if (new_item_size >= btrfs_item_size_nr(leaf, slot))
508 return 0;
509 ret = btrfs_truncate_item(trans, root, path, new_item_size, 1);
510 BUG_ON(ret);
511 return ret;
512}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
new file mode 100644
index 000000000000..48a702d41c8c
--- /dev/null
+++ b/fs/btrfs/file.c
@@ -0,0 +1,1133 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/string.h>
25#include <linux/smp_lock.h>
26#include <linux/backing-dev.h>
27#include <linux/mpage.h>
28#include <linux/swap.h>
29#include <linux/writeback.h>
30#include <linux/statfs.h>
31#include <linux/compat.h>
32#include <linux/version.h>
33#include "ctree.h"
34#include "disk-io.h"
35#include "transaction.h"
36#include "btrfs_inode.h"
37#include "ioctl.h"
38#include "print-tree.h"
39#include "tree-log.h"
40#include "locking.h"
41#include "compat.h"
42
43
44static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
45 int write_bytes,
46 struct page **prepared_pages,
47 const char __user * buf)
48{
49 long page_fault = 0;
50 int i;
51 int offset = pos & (PAGE_CACHE_SIZE - 1);
52
53 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
54 size_t count = min_t(size_t,
55 PAGE_CACHE_SIZE - offset, write_bytes);
56 struct page *page = prepared_pages[i];
57 fault_in_pages_readable(buf, count);
58
59 /* Copy data from userspace to the current page */
60 kmap(page);
61 page_fault = __copy_from_user(page_address(page) + offset,
62 buf, count);
63 /* Flush processor's dcache for this page */
64 flush_dcache_page(page);
65 kunmap(page);
66 buf += count;
67 write_bytes -= count;
68
69 if (page_fault)
70 break;
71 }
72 return page_fault ? -EFAULT : 0;
73}
74
75static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
76{
77 size_t i;
78 for (i = 0; i < num_pages; i++) {
79 if (!pages[i])
80 break;
81 ClearPageChecked(pages[i]);
82 unlock_page(pages[i]);
83 mark_page_accessed(pages[i]);
84 page_cache_release(pages[i]);
85 }
86}
87
88static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
89 struct btrfs_root *root, struct inode *inode,
90 u64 offset, size_t size,
91 struct page **pages, size_t page_offset,
92 int num_pages)
93{
94 struct btrfs_key key;
95 struct btrfs_path *path;
96 struct extent_buffer *leaf;
97 char *kaddr;
98 unsigned long ptr;
99 struct btrfs_file_extent_item *ei;
100 struct page *page;
101 u32 datasize;
102 int err = 0;
103 int ret;
104 int i;
105 ssize_t cur_size;
106
107 path = btrfs_alloc_path();
108 if (!path)
109 return -ENOMEM;
110
111 btrfs_set_trans_block_group(trans, inode);
112
113 key.objectid = inode->i_ino;
114 key.offset = offset;
115 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
116
117 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
118 if (ret < 0) {
119 err = ret;
120 goto fail;
121 }
122 if (ret == 1) {
123 struct btrfs_key found_key;
124
125 if (path->slots[0] == 0)
126 goto insert;
127
128 path->slots[0]--;
129 leaf = path->nodes[0];
130 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
131
132 if (found_key.objectid != inode->i_ino)
133 goto insert;
134
135 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
136 goto insert;
137 ei = btrfs_item_ptr(leaf, path->slots[0],
138 struct btrfs_file_extent_item);
139
140 if (btrfs_file_extent_type(leaf, ei) !=
141 BTRFS_FILE_EXTENT_INLINE) {
142 goto insert;
143 }
144 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
145 ret = 0;
146 }
147 if (ret == 0) {
148 u32 found_size;
149 u64 found_end;
150
151 leaf = path->nodes[0];
152 ei = btrfs_item_ptr(leaf, path->slots[0],
153 struct btrfs_file_extent_item);
154
155 if (btrfs_file_extent_type(leaf, ei) !=
156 BTRFS_FILE_EXTENT_INLINE) {
157 err = ret;
158 btrfs_print_leaf(root, leaf);
159 printk("found wasn't inline offset %Lu inode %lu\n",
160 offset, inode->i_ino);
161 goto fail;
162 }
163 found_size = btrfs_file_extent_inline_len(leaf,
164 btrfs_item_nr(leaf, path->slots[0]));
165 found_end = key.offset + found_size;
166
167 if (found_end < offset + size) {
168 btrfs_release_path(root, path);
169 ret = btrfs_search_slot(trans, root, &key, path,
170 offset + size - found_end, 1);
171 BUG_ON(ret != 0);
172
173 ret = btrfs_extend_item(trans, root, path,
174 offset + size - found_end);
175 if (ret) {
176 err = ret;
177 goto fail;
178 }
179 leaf = path->nodes[0];
180 ei = btrfs_item_ptr(leaf, path->slots[0],
181 struct btrfs_file_extent_item);
182 inode->i_blocks += (offset + size - found_end) >> 9;
183 }
184 if (found_end < offset) {
185 ptr = btrfs_file_extent_inline_start(ei) + found_size;
186 memset_extent_buffer(leaf, 0, ptr, offset - found_end);
187 }
188 } else {
189insert:
190 btrfs_release_path(root, path);
191 datasize = offset + size - key.offset;
192 inode->i_blocks += datasize >> 9;
193 datasize = btrfs_file_extent_calc_inline_size(datasize);
194 ret = btrfs_insert_empty_item(trans, root, path, &key,
195 datasize);
196 if (ret) {
197 err = ret;
198 printk("got bad ret %d\n", ret);
199 goto fail;
200 }
201 leaf = path->nodes[0];
202 ei = btrfs_item_ptr(leaf, path->slots[0],
203 struct btrfs_file_extent_item);
204 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
205 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
206 }
207 ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
208
209 cur_size = size;
210 i = 0;
211 while (size > 0) {
212 page = pages[i];
213 kaddr = kmap_atomic(page, KM_USER0);
214 cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
215 write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
216 kunmap_atomic(kaddr, KM_USER0);
217 page_offset = 0;
218 ptr += cur_size;
219 size -= cur_size;
220 if (i >= num_pages) {
221 printk("i %d num_pages %d\n", i, num_pages);
222 }
223 i++;
224 }
225 btrfs_mark_buffer_dirty(leaf);
226fail:
227 btrfs_free_path(path);
228 return err;
229}
230
231static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
232 struct btrfs_root *root,
233 struct file *file,
234 struct page **pages,
235 size_t num_pages,
236 loff_t pos,
237 size_t write_bytes)
238{
239 int err = 0;
240 int i;
241 struct inode *inode = fdentry(file)->d_inode;
242 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
243 u64 hint_byte;
244 u64 num_bytes;
245 u64 start_pos;
246 u64 end_of_last_block;
247 u64 end_pos = pos + write_bytes;
248 u64 inline_size;
249 int did_inline = 0;
250 loff_t isize = i_size_read(inode);
251
252 start_pos = pos & ~((u64)root->sectorsize - 1);
253 num_bytes = (write_bytes + pos - start_pos +
254 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
255
256 end_of_last_block = start_pos + num_bytes - 1;
257
258 lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
259 trans = btrfs_join_transaction(root, 1);
260 if (!trans) {
261 err = -ENOMEM;
262 goto out_unlock;
263 }
264 btrfs_set_trans_block_group(trans, inode);
265 hint_byte = 0;
266
267 if ((end_of_last_block & 4095) == 0) {
268 printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block);
269 }
270 set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
271
272 /* FIXME...EIEIO, ENOSPC and more */
273 /* insert any holes we need to create */
274 if (isize < start_pos) {
275 u64 last_pos_in_file;
276 u64 hole_size;
277 u64 mask = root->sectorsize - 1;
278 last_pos_in_file = (isize + mask) & ~mask;
279 hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
280 if (hole_size > 0) {
281 btrfs_wait_ordered_range(inode, last_pos_in_file,
282 last_pos_in_file + hole_size);
283 mutex_lock(&BTRFS_I(inode)->extent_mutex);
284 err = btrfs_drop_extents(trans, root, inode,
285 last_pos_in_file,
286 last_pos_in_file + hole_size,
287 last_pos_in_file,
288 &hint_byte);
289 if (err)
290 goto failed;
291
292 err = btrfs_insert_file_extent(trans, root,
293 inode->i_ino,
294 last_pos_in_file,
295 0, 0, hole_size, 0);
296 btrfs_drop_extent_cache(inode, last_pos_in_file,
297 last_pos_in_file + hole_size -1);
298 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
299 btrfs_check_file(root, inode);
300 }
301 if (err)
302 goto failed;
303 }
304
305 /*
306 * either allocate an extent for the new bytes or setup the key
307 * to show we are doing inline data in the extent
308 */
309 inline_size = end_pos;
310 if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
311 inline_size > root->fs_info->max_inline ||
312 (inline_size & (root->sectorsize -1)) == 0 ||
313 inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
314 /* check for reserved extents on each page, we don't want
315 * to reset the delalloc bit on things that already have
316 * extents reserved.
317 */
318 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
319 for (i = 0; i < num_pages; i++) {
320 struct page *p = pages[i];
321 SetPageUptodate(p);
322 ClearPageChecked(p);
323 set_page_dirty(p);
324 }
325 } else {
326 u64 aligned_end;
327 /* step one, delete the existing extents in this range */
328 aligned_end = (pos + write_bytes + root->sectorsize - 1) &
329 ~((u64)root->sectorsize - 1);
330 mutex_lock(&BTRFS_I(inode)->extent_mutex);
331 err = btrfs_drop_extents(trans, root, inode, start_pos,
332 aligned_end, aligned_end, &hint_byte);
333 if (err)
334 goto failed;
335 if (isize > inline_size)
336 inline_size = min_t(u64, isize, aligned_end);
337 inline_size -= start_pos;
338 err = insert_inline_extent(trans, root, inode, start_pos,
339 inline_size, pages, 0, num_pages);
340 btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1);
341 BUG_ON(err);
342 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
343
344 /*
345	 * an ugly way to do all the proper accounting around
346	 * the page bits and mapping tags
347 */
348 set_page_writeback(pages[0]);
349 end_page_writeback(pages[0]);
350 did_inline = 1;
351 }
352 if (end_pos > isize) {
353 i_size_write(inode, end_pos);
354 if (did_inline)
355 BTRFS_I(inode)->disk_i_size = end_pos;
356 btrfs_update_inode(trans, root, inode);
357 }
358failed:
359 err = btrfs_end_transaction(trans, root);
360out_unlock:
361 unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
362 return err;
363}
364
365int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
366{
367 struct extent_map *em;
368 struct extent_map *split = NULL;
369 struct extent_map *split2 = NULL;
370 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
371 u64 len = end - start + 1;
372 int ret;
373 int testend = 1;
374
375 WARN_ON(end < start);
376 if (end == (u64)-1) {
377 len = (u64)-1;
378 testend = 0;
379 }
380 while(1) {
381 if (!split)
382 split = alloc_extent_map(GFP_NOFS);
383 if (!split2)
384 split2 = alloc_extent_map(GFP_NOFS);
385
386 spin_lock(&em_tree->lock);
387 em = lookup_extent_mapping(em_tree, start, len);
388 if (!em) {
389 spin_unlock(&em_tree->lock);
390 break;
391 }
392 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
393 remove_extent_mapping(em_tree, em);
394
395 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
396 em->start < start) {
397 split->start = em->start;
398 split->len = start - em->start;
399 split->block_start = em->block_start;
400 split->bdev = em->bdev;
401 split->flags = em->flags;
402 ret = add_extent_mapping(em_tree, split);
403 BUG_ON(ret);
404 free_extent_map(split);
405 split = split2;
406 split2 = NULL;
407 }
408 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
409 testend && em->start + em->len > start + len) {
410 u64 diff = start + len - em->start;
411
412 split->start = start + len;
413 split->len = em->start + em->len - (start + len);
414 split->bdev = em->bdev;
415 split->flags = em->flags;
416
417 split->block_start = em->block_start + diff;
418
419 ret = add_extent_mapping(em_tree, split);
420 BUG_ON(ret);
421 free_extent_map(split);
422 split = NULL;
423 }
424 spin_unlock(&em_tree->lock);
425
426 /* once for us */
427 free_extent_map(em);
428	/* once for the tree */
429 free_extent_map(em);
430 }
431 if (split)
432 free_extent_map(split);
433 if (split2)
434 free_extent_map(split2);
435 return 0;
436}
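/*
 * The splits above, pictured (a cached map overlapping the dropped
 * range on both sides is assumed):
 *
 *	before:  [ em->start ............................... em_end )
 *	drop:              [ start ............. start+len )
 *	after:   [ front )                                  [ tail  )
 *
 * The front piece keeps em->block_start, the tail piece starts at
 * em->block_start plus the distance to start + len, and the original
 * map itself is dropped from the tree and freed.
 */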
437
438int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
439{
440 return 0;
441#if 0
442 struct btrfs_path *path;
443 struct btrfs_key found_key;
444 struct extent_buffer *leaf;
445 struct btrfs_file_extent_item *extent;
446 u64 last_offset = 0;
447 int nritems;
448 int slot;
449 int found_type;
450 int ret;
451 int err = 0;
452 u64 extent_end = 0;
453
454 path = btrfs_alloc_path();
455 ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
456 last_offset, 0);
457 while(1) {
458 nritems = btrfs_header_nritems(path->nodes[0]);
459 if (path->slots[0] >= nritems) {
460 ret = btrfs_next_leaf(root, path);
461 if (ret)
462 goto out;
463 nritems = btrfs_header_nritems(path->nodes[0]);
464 }
465 slot = path->slots[0];
466 leaf = path->nodes[0];
467 btrfs_item_key_to_cpu(leaf, &found_key, slot);
468 if (found_key.objectid != inode->i_ino)
469 break;
470 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
471 goto out;
472
473 if (found_key.offset < last_offset) {
474 WARN_ON(1);
475 btrfs_print_leaf(root, leaf);
476 printk("inode %lu found offset %Lu expected %Lu\n",
477 inode->i_ino, found_key.offset, last_offset);
478 err = 1;
479 goto out;
480 }
481 extent = btrfs_item_ptr(leaf, slot,
482 struct btrfs_file_extent_item);
483 found_type = btrfs_file_extent_type(leaf, extent);
484 if (found_type == BTRFS_FILE_EXTENT_REG) {
485 extent_end = found_key.offset +
486 btrfs_file_extent_num_bytes(leaf, extent);
487 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
488 struct btrfs_item *item;
489 item = btrfs_item_nr(leaf, slot);
490 extent_end = found_key.offset +
491 btrfs_file_extent_inline_len(leaf, item);
492 extent_end = (extent_end + root->sectorsize - 1) &
493 ~((u64)root->sectorsize -1 );
494 }
495 last_offset = extent_end;
496 path->slots[0]++;
497 }
498 if (0 && last_offset < inode->i_size) {
499 WARN_ON(1);
500 btrfs_print_leaf(root, leaf);
501 printk("inode %lu found offset %Lu size %Lu\n", inode->i_ino,
502 last_offset, inode->i_size);
503 err = 1;
504
505 }
506out:
507 btrfs_free_path(path);
508 return err;
509#endif
510}
511
512/*
513 * this is very complex, but the basic idea is to drop all extents
514 * in the range start - end. hint_byte is filled in with a disk byte
515 * offset that would be a good hint to the block allocator for this file.
516 *
517 * If an extent intersects the range but is not entirely inside the range
518 * it is either truncated or split. Anything entirely inside the range
519 * is deleted from the tree.
520 */
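/*
 * Pictured (regular, non-inline extents assumed):
 *
 *	range to drop:       [ start .................. end )
 *	fully inside:              [ deleted ]
 *	head overlap:     [ kept | truncated )
 *	tail overlap:                        ( split | bookend kept ]
 *	spans the range:  [ kept | dropped ................ | kept ]
 */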
521int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
522 struct btrfs_root *root, struct inode *inode,
523 u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
524{
525 u64 extent_end = 0;
526 u64 search_start = start;
527 u64 leaf_start;
528 u64 root_gen;
529 u64 root_owner;
530 struct extent_buffer *leaf;
531 struct btrfs_file_extent_item *extent;
532 struct btrfs_path *path;
533 struct btrfs_key key;
534 struct btrfs_file_extent_item old;
535 int keep;
536 int slot;
537 int bookend;
538 int found_type;
539 int found_extent;
540 int found_inline;
541 int recow;
542 int ret;
543
544 btrfs_drop_extent_cache(inode, start, end - 1);
545
546 path = btrfs_alloc_path();
547 if (!path)
548 return -ENOMEM;
549 while(1) {
550 recow = 0;
551 btrfs_release_path(root, path);
552 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
553 search_start, -1);
554 if (ret < 0)
555 goto out;
556 if (ret > 0) {
557 if (path->slots[0] == 0) {
558 ret = 0;
559 goto out;
560 }
561 path->slots[0]--;
562 }
563next_slot:
564 keep = 0;
565 bookend = 0;
566 found_extent = 0;
567 found_inline = 0;
568 leaf_start = 0;
569 root_gen = 0;
570 root_owner = 0;
571 extent = NULL;
572 leaf = path->nodes[0];
573 slot = path->slots[0];
574 ret = 0;
575 btrfs_item_key_to_cpu(leaf, &key, slot);
576 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
577 key.offset >= end) {
578 goto out;
579 }
580 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
581 key.objectid != inode->i_ino) {
582 goto out;
583 }
584 if (recow) {
585 search_start = key.offset;
586 continue;
587 }
588 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
589 extent = btrfs_item_ptr(leaf, slot,
590 struct btrfs_file_extent_item);
591 found_type = btrfs_file_extent_type(leaf, extent);
592 if (found_type == BTRFS_FILE_EXTENT_REG) {
593 extent_end =
594 btrfs_file_extent_disk_bytenr(leaf,
595 extent);
596 if (extent_end)
597 *hint_byte = extent_end;
598
599 extent_end = key.offset +
600 btrfs_file_extent_num_bytes(leaf, extent);
601 found_extent = 1;
602 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
603 struct btrfs_item *item;
604 item = btrfs_item_nr(leaf, slot);
605 found_inline = 1;
606 extent_end = key.offset +
607 btrfs_file_extent_inline_len(leaf, item);
608 }
609 } else {
610 extent_end = search_start;
611 }
612
613 /* we found nothing we can drop */
614 if ((!found_extent && !found_inline) ||
615 search_start >= extent_end) {
616 int nextret;
617 u32 nritems;
618 nritems = btrfs_header_nritems(leaf);
619 if (slot >= nritems - 1) {
620 nextret = btrfs_next_leaf(root, path);
621 if (nextret)
622 goto out;
623 recow = 1;
624 } else {
625 path->slots[0]++;
626 }
627 goto next_slot;
628 }
629
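		/*
		 * Annotation: inline extents can end at arbitrary byte
		 * offsets, so round the next search position up to a sector
		 * boundary; e.g. with a 4096-byte sectorsize, extent_end =
		 * 5000 gives search_start = 8192.
		 */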
630 if (found_inline) {
631 u64 mask = root->sectorsize - 1;
632 search_start = (extent_end + mask) & ~mask;
633 } else
634 search_start = extent_end;
635 if (end <= extent_end && start >= key.offset && found_inline) {
636 *hint_byte = EXTENT_MAP_INLINE;
637 goto out;
638 }
639
640 if (found_extent) {
641 read_extent_buffer(leaf, &old, (unsigned long)extent,
642 sizeof(old));
643 root_gen = btrfs_header_generation(leaf);
644 root_owner = btrfs_header_owner(leaf);
645 leaf_start = leaf->start;
646 }
647
648 if (end < extent_end && end >= key.offset) {
649 bookend = 1;
650 if (found_inline && start <= key.offset)
651 keep = 1;
652 }
653 /* truncate existing extent */
654 if (start > key.offset) {
655 u64 new_num;
656 u64 old_num;
657 keep = 1;
658 WARN_ON(start & (root->sectorsize - 1));
659 if (found_extent) {
660 new_num = start - key.offset;
661 old_num = btrfs_file_extent_num_bytes(leaf,
662 extent);
663 *hint_byte =
664 btrfs_file_extent_disk_bytenr(leaf,
665 extent);
666 if (btrfs_file_extent_disk_bytenr(leaf,
667 extent)) {
668 dec_i_blocks(inode, old_num - new_num);
669 }
670 btrfs_set_file_extent_num_bytes(leaf, extent,
671 new_num);
672 btrfs_mark_buffer_dirty(leaf);
673 } else if (key.offset < inline_limit &&
674 (end > extent_end) &&
675 (inline_limit < extent_end)) {
676 u32 new_size;
677 new_size = btrfs_file_extent_calc_inline_size(
678 inline_limit - key.offset);
679 dec_i_blocks(inode, (extent_end - key.offset) -
680 (inline_limit - key.offset));
681 btrfs_truncate_item(trans, root, path,
682 new_size, 1);
683 }
684 }
685 /* delete the entire extent */
686 if (!keep) {
687 ret = btrfs_del_item(trans, root, path);
688 /* TODO update progress marker and return */
689 BUG_ON(ret);
690 extent = NULL;
691 btrfs_release_path(root, path);
692 /* the extent will be freed later */
693 }
694 if (bookend && found_inline && start <= key.offset) {
695 u32 new_size;
696 new_size = btrfs_file_extent_calc_inline_size(
697 extent_end - end);
698 dec_i_blocks(inode, (extent_end - key.offset) -
699 (extent_end - end));
700 ret = btrfs_truncate_item(trans, root, path,
701 new_size, 0);
702 BUG_ON(ret);
703 }
704 /* create bookend, splitting the extent in two */
705 if (bookend && found_extent) {
706 u64 disk_bytenr;
707 struct btrfs_key ins;
708 ins.objectid = inode->i_ino;
709 ins.offset = end;
710 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
711 btrfs_release_path(root, path);
712 ret = btrfs_insert_empty_item(trans, root, path, &ins,
713 sizeof(*extent));
714 BUG_ON(ret);
715
716 leaf = path->nodes[0];
717 extent = btrfs_item_ptr(leaf, path->slots[0],
718 struct btrfs_file_extent_item);
719 write_extent_buffer(leaf, &old,
720 (unsigned long)extent, sizeof(old));
721
722 btrfs_set_file_extent_offset(leaf, extent,
723 le64_to_cpu(old.offset) + end - key.offset);
724 WARN_ON(le64_to_cpu(old.num_bytes) <
725 (extent_end - end));
726 btrfs_set_file_extent_num_bytes(leaf, extent,
727 extent_end - end);
728 btrfs_set_file_extent_type(leaf, extent,
729 BTRFS_FILE_EXTENT_REG);
730
731 btrfs_mark_buffer_dirty(path->nodes[0]);
732
733 disk_bytenr = le64_to_cpu(old.disk_bytenr);
734 if (disk_bytenr != 0) {
735 ret = btrfs_inc_extent_ref(trans, root,
736 disk_bytenr,
737 le64_to_cpu(old.disk_num_bytes),
738 leaf->start,
739 root->root_key.objectid,
740 trans->transid,
741 ins.objectid, ins.offset);
742 BUG_ON(ret);
743 }
744 btrfs_release_path(root, path);
745 if (disk_bytenr != 0) {
746 inode->i_blocks +=
747 btrfs_file_extent_num_bytes(leaf,
748 extent) >> 9;
749 }
750 }
751
752 if (found_extent && !keep) {
753 u64 disk_bytenr = le64_to_cpu(old.disk_bytenr);
754
755 if (disk_bytenr != 0) {
756 dec_i_blocks(inode, le64_to_cpu(old.num_bytes));
757 ret = btrfs_free_extent(trans, root,
758 disk_bytenr,
759 le64_to_cpu(old.disk_num_bytes),
760 leaf_start, root_owner,
761 root_gen, key.objectid,
762 key.offset, 0);
763 BUG_ON(ret);
764 *hint_byte = disk_bytenr;
765 }
766 }
767
768 if (search_start >= end) {
769 ret = 0;
770 goto out;
771 }
772 }
773out:
774 btrfs_free_path(path);
775 btrfs_check_file(root, inode);
776 return ret;
777}
778
779/*
780 * this gets pages into the page cache and locks them down
781 */
782static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
783 struct page **pages, size_t num_pages,
784 loff_t pos, unsigned long first_index,
785 unsigned long last_index, size_t write_bytes)
786{
787 int i;
788 unsigned long index = pos >> PAGE_CACHE_SHIFT;
789 struct inode *inode = fdentry(file)->d_inode;
790 int err = 0;
791 u64 start_pos;
792 u64 last_pos;
793
794 start_pos = pos & ~((u64)root->sectorsize - 1);
795 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
796
797 memset(pages, 0, num_pages * sizeof(struct page *));
798again:
799 for (i = 0; i < num_pages; i++) {
800 pages[i] = grab_cache_page(inode->i_mapping, index + i);
801 if (!pages[i]) {
802 err = -ENOMEM;
803 BUG_ON(1);
804 }
805 wait_on_page_writeback(pages[i]);
806 }
807 if (start_pos < inode->i_size) {
808 struct btrfs_ordered_extent *ordered;
809 lock_extent(&BTRFS_I(inode)->io_tree,
810 start_pos, last_pos - 1, GFP_NOFS);
811		ordered = btrfs_lookup_first_ordered_extent(inode, last_pos - 1);
812 if (ordered &&
813 ordered->file_offset + ordered->len > start_pos &&
814 ordered->file_offset < last_pos) {
815 btrfs_put_ordered_extent(ordered);
816 unlock_extent(&BTRFS_I(inode)->io_tree,
817 start_pos, last_pos - 1, GFP_NOFS);
818 for (i = 0; i < num_pages; i++) {
819 unlock_page(pages[i]);
820 page_cache_release(pages[i]);
821 }
822 btrfs_wait_ordered_range(inode, start_pos,
823 last_pos - start_pos);
824 goto again;
825 }
826 if (ordered)
827 btrfs_put_ordered_extent(ordered);
828
829 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
830 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
831 GFP_NOFS);
832 unlock_extent(&BTRFS_I(inode)->io_tree,
833 start_pos, last_pos - 1, GFP_NOFS);
834 }
835 for (i = 0; i < num_pages; i++) {
836 clear_page_dirty_for_io(pages[i]);
837 set_page_extent_mapped(pages[i]);
838 WARN_ON(!PageLocked(pages[i]));
839 }
840 return 0;
841}
842
843static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
844 size_t count, loff_t *ppos)
845{
846 loff_t pos;
847 loff_t start_pos;
848 ssize_t num_written = 0;
849 ssize_t err = 0;
850 int ret = 0;
851 struct inode *inode = fdentry(file)->d_inode;
852 struct btrfs_root *root = BTRFS_I(inode)->root;
853 struct page **pages = NULL;
854 int nrptrs;
855 struct page *pinned[2];
856 unsigned long first_index;
857 unsigned long last_index;
858
859 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
860 PAGE_CACHE_SIZE / (sizeof(struct page *)));
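	/*
	 * Annotation: nrptrs caps how many pages one loop iteration handles
	 * at a page worth of pointers; e.g. with 4k pages and 8-byte
	 * pointers that is 512 pages, i.e. up to 2MB copied per pass.
	 */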
861 pinned[0] = NULL;
862 pinned[1] = NULL;
863
864 pos = *ppos;
865 start_pos = pos;
866
867 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
868 current->backing_dev_info = inode->i_mapping->backing_dev_info;
869 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
870 if (err)
871 goto out_nolock;
872 if (count == 0)
873 goto out_nolock;
874#ifdef REMOVE_SUID_PATH
875 err = remove_suid(&file->f_path);
876#else
877# if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
878 err = file_remove_suid(file);
879# else
880 err = remove_suid(fdentry(file));
881# endif
882#endif
883 if (err)
884 goto out_nolock;
885 file_update_time(file);
886
887	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
	if (!pages) {
		err = -ENOMEM;
		goto out_nolock;
	}
888
889 mutex_lock(&inode->i_mutex);
890 first_index = pos >> PAGE_CACHE_SHIFT;
891 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
892
893 /*
894 * if this is a nodatasum mount, force summing off for the inode
895 * all the time. That way a later mount with summing on won't
896 * get confused
897 */
898 if (btrfs_test_opt(root, NODATASUM))
899 btrfs_set_flag(inode, NODATASUM);
900
901 /*
902 * there are lots of better ways to do this, but this code
903 * makes sure the first and last page in the file range are
904 * up to date and ready for cow
905 */
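	/*
	 * Annotation: e.g. a 50-byte write at offset 100 dirties only part
	 * of the first page, so that page must be read (or already be
	 * uptodate) before the copy, or the untouched bytes 0-99 would be
	 * written back as garbage.
	 */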
906 if ((pos & (PAGE_CACHE_SIZE - 1))) {
907 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
908 if (!PageUptodate(pinned[0])) {
909 ret = btrfs_readpage(NULL, pinned[0]);
910 BUG_ON(ret);
911 wait_on_page_locked(pinned[0]);
912 } else {
913 unlock_page(pinned[0]);
914 }
915 }
916 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
917 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
918 if (!PageUptodate(pinned[1])) {
919 ret = btrfs_readpage(NULL, pinned[1]);
920 BUG_ON(ret);
921 wait_on_page_locked(pinned[1]);
922 } else {
923 unlock_page(pinned[1]);
924 }
925 }
926
927	while (count > 0) {
928 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
929 size_t write_bytes = min(count, nrptrs *
930 (size_t)PAGE_CACHE_SIZE -
931 offset);
932 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
933 PAGE_CACHE_SHIFT;
934
935 WARN_ON(num_pages > nrptrs);
936		memset(pages, 0, num_pages * sizeof(struct page *));
937
938 ret = btrfs_check_free_space(root, write_bytes, 0);
939 if (ret)
940 goto out;
941
942 ret = prepare_pages(root, file, pages, num_pages,
943 pos, first_index, last_index,
944 write_bytes);
945 if (ret)
946 goto out;
947
948 ret = btrfs_copy_from_user(pos, num_pages,
949 write_bytes, pages, buf);
950 if (ret) {
951 btrfs_drop_pages(pages, num_pages);
952 goto out;
953 }
954
955 ret = dirty_and_release_pages(NULL, root, file, pages,
956 num_pages, pos, write_bytes);
957 btrfs_drop_pages(pages, num_pages);
958 if (ret)
959 goto out;
960
961 buf += write_bytes;
962 count -= write_bytes;
963 pos += write_bytes;
964 num_written += write_bytes;
965
966 balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
967 if (num_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
968 btrfs_btree_balance_dirty(root, 1);
969 btrfs_throttle(root);
970 cond_resched();
971 }
972out:
973 mutex_unlock(&inode->i_mutex);
974
975out_nolock:
976 kfree(pages);
977 if (pinned[0])
978 page_cache_release(pinned[0]);
979 if (pinned[1])
980 page_cache_release(pinned[1]);
981 *ppos = pos;
982
983 if (num_written > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
984 struct btrfs_trans_handle *trans;
985
986 err = btrfs_fdatawrite_range(inode->i_mapping, start_pos,
987					   start_pos + num_written - 1,
988 WB_SYNC_NONE);
989 if (err < 0)
990 num_written = err;
991
992 err = btrfs_wait_on_page_writeback_range(inode->i_mapping,
993 start_pos, start_pos + num_written - 1);
994 if (err < 0)
995 num_written = err;
996
997 trans = btrfs_start_transaction(root, 1);
998 ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
999 if (ret == 0) {
1000 btrfs_sync_log(trans, root);
1001 btrfs_end_transaction(trans, root);
1002 } else {
1003 btrfs_commit_transaction(trans, root);
1004 }
1005 } else if (num_written > 0 && (file->f_flags & O_DIRECT)) {
1006#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
1007 do_sync_file_range(file, start_pos,
1008 start_pos + num_written - 1,
1009 SYNC_FILE_RANGE_WRITE |
1010 SYNC_FILE_RANGE_WAIT_AFTER);
1011#else
1012 do_sync_mapping_range(inode->i_mapping, start_pos,
1013 start_pos + num_written - 1,
1014 SYNC_FILE_RANGE_WRITE |
1015 SYNC_FILE_RANGE_WAIT_AFTER);
1016#endif
1017 invalidate_mapping_pages(inode->i_mapping,
1018 start_pos >> PAGE_CACHE_SHIFT,
1019 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1020 }
1021 current->backing_dev_info = NULL;
1022 return num_written ? num_written : err;
1023}
1024
1025 int btrfs_release_file(struct inode *inode, struct file *filp)
1026{
1027 if (filp->private_data)
1028 btrfs_ioctl_trans_end(filp);
1029 return 0;
1030}
1031
1032int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1033{
1034 struct inode *inode = dentry->d_inode;
1035 struct btrfs_root *root = BTRFS_I(inode)->root;
1036 int ret = 0;
1037 struct btrfs_trans_handle *trans;
1038
1039 /*
1040 * check the transaction that last modified this inode
1041	 * and see if it's already been committed
1042 */
1043 if (!BTRFS_I(inode)->last_trans)
1044 goto out;
1045
1046 mutex_lock(&root->fs_info->trans_mutex);
1047 if (BTRFS_I(inode)->last_trans <=
1048 root->fs_info->last_trans_committed) {
1049 BTRFS_I(inode)->last_trans = 0;
1050 mutex_unlock(&root->fs_info->trans_mutex);
1051 goto out;
1052 }
1053 mutex_unlock(&root->fs_info->trans_mutex);
1054
1055 root->fs_info->tree_log_batch++;
1056 filemap_fdatawait(inode->i_mapping);
1057 root->fs_info->tree_log_batch++;
1058
1059 /*
1060	 * ok, we haven't committed the transaction yet, let's do a commit
1061 */
1062 if (file->private_data)
1063 btrfs_ioctl_trans_end(file);
1064
1065 trans = btrfs_start_transaction(root, 1);
1066 if (!trans) {
1067 ret = -ENOMEM;
1068 goto out;
1069 }
1070
1071 ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
1072	if (ret < 0) {
1073		btrfs_end_transaction(trans, root);
1074		goto out;
	}
1075
1076 /* we've logged all the items and now have a consistent
1077 * version of the file in the log. It is possible that
1078 * someone will come in and modify the file, but that's
1079 * fine because the log is consistent on disk, and we
1080 * have references to all of the file's extents
1081 *
1082 * It is possible that someone will come in and log the
1083 * file again, but that will end up using the synchronization
1084 * inside btrfs_sync_log to keep things safe.
1085 */
1086 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
1087
1088 if (ret > 0) {
1089 ret = btrfs_commit_transaction(trans, root);
1090 } else {
1091 btrfs_sync_log(trans, root);
1092 ret = btrfs_end_transaction(trans, root);
1093 }
1094 mutex_lock(&file->f_dentry->d_inode->i_mutex);
1095out:
1096	return ret > 0 ? -EIO : ret;
1097}
1098
1099static struct vm_operations_struct btrfs_file_vm_ops = {
1100#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
1101 .nopage = filemap_nopage,
1102 .populate = filemap_populate,
1103#else
1104 .fault = filemap_fault,
1105#endif
1106 .page_mkwrite = btrfs_page_mkwrite,
1107};
1108
1109static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1110{
1111 vma->vm_ops = &btrfs_file_vm_ops;
1112 file_accessed(filp);
1113 return 0;
1114}
1115
1116struct file_operations btrfs_file_operations = {
1117 .llseek = generic_file_llseek,
1118 .read = do_sync_read,
1119 .aio_read = generic_file_aio_read,
1120 .splice_read = generic_file_splice_read,
1121#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
1122 .sendfile = generic_file_sendfile,
1123#endif
1124 .write = btrfs_file_write,
1125 .mmap = btrfs_file_mmap,
1126 .open = generic_file_open,
1127 .release = btrfs_release_file,
1128 .fsync = btrfs_sync_file,
1129 .unlocked_ioctl = btrfs_ioctl,
1130#ifdef CONFIG_COMPAT
1131 .compat_ioctl = btrfs_ioctl,
1132#endif
1133};
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 000000000000..01c26e8ae555
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,415 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21
22static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node)
24{
25 struct rb_node **p = &root->rb_node;
26 struct rb_node *parent = NULL;
27 struct btrfs_free_space *info;
28
29 while (*p) {
30 parent = *p;
31 info = rb_entry(parent, struct btrfs_free_space, offset_index);
32
33 if (offset < info->offset)
34 p = &(*p)->rb_left;
35 else if (offset > info->offset)
36 p = &(*p)->rb_right;
37 else
38 return -EEXIST;
39 }
40
41 rb_link_node(node, parent, p);
42 rb_insert_color(node, root);
43
44 return 0;
45}
46
47static int tree_insert_bytes(struct rb_root *root, u64 bytes,
48 struct rb_node *node)
49{
50 struct rb_node **p = &root->rb_node;
51 struct rb_node *parent = NULL;
52 struct btrfs_free_space *info;
53
54 while (*p) {
55 parent = *p;
56 info = rb_entry(parent, struct btrfs_free_space, bytes_index);
57
58 if (bytes < info->bytes)
59 p = &(*p)->rb_left;
60 else
61 p = &(*p)->rb_right;
62 }
63
64 rb_link_node(node, parent, p);
65 rb_insert_color(node, root);
66
67 return 0;
68}
69
70/*
 71 * searches the tree for the given offset. If contains is set, we return
 72 * the free space that contains the given offset. Otherwise, we return
 73 * the free space that starts at or after the given offset and is at
 74 * least bytes long.
75 */
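/*
 * Annotation (example): with free space entries [0, 8k) and [16k, 20k),
 * searching for offset 18k with contains set returns the [16k, 20k)
 * entry because it covers 18k; with contains clear and bytes == 4k,
 * searching for offset 10k also returns [16k, 20k), the first entry at
 * or after 10k that is large enough.
 */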
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes,
78 int contains)
79{
80 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL;
82
83 while (n) {
84 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85
86 if (offset < entry->offset) {
87 if (!contains &&
88 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes))
90 ret = entry;
91 n = n->rb_left;
92 } else if (offset > entry->offset) {
93 if (contains &&
94 (entry->offset + entry->bytes - 1) >= offset) {
95 ret = entry;
96 break;
97 }
98 n = n->rb_right;
99 } else {
100 if (bytes > entry->bytes) {
101 n = n->rb_right;
102 continue;
103 }
104 ret = entry;
105 break;
106 }
107 }
108
109 return ret;
110}
111
112/*
113 * return a chunk at least bytes in size, as close to offset as we can get.
114 */
115static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
116 u64 offset, u64 bytes)
117{
118 struct rb_node *n = root->rb_node;
119 struct btrfs_free_space *entry, *ret = NULL;
120
121 while (n) {
122 entry = rb_entry(n, struct btrfs_free_space, bytes_index);
123
124 if (bytes < entry->bytes) {
125 /*
126 * We prefer to get a hole size as close to the size we
127 * are asking for so we don't take small slivers out of
128 * huge holes, but we also want to get as close to the
129 * offset as possible so we don't have a whole lot of
130 * fragmentation.
131 */
132 if (offset <= entry->offset) {
133 if (!ret)
134 ret = entry;
135 else if (entry->bytes < ret->bytes)
136 ret = entry;
137 else if (entry->offset < ret->offset)
138 ret = entry;
139 }
140 n = n->rb_left;
141 } else if (bytes > entry->bytes) {
142 n = n->rb_right;
143 } else {
144 /*
145 * Ok we may have multiple chunks of the wanted size,
146 * so we don't want to take the first one we find, we
147 * want to take the one closest to our given offset, so
148 * keep searching just in case theres a better match.
149 */
150 n = n->rb_right;
151 if (offset > entry->offset)
152 continue;
153 else if (!ret || entry->offset < ret->offset)
154 ret = entry;
155 }
156 }
157
158 return ret;
159}
160
161static void unlink_free_space(struct btrfs_block_group_cache *block_group,
162 struct btrfs_free_space *info)
163{
164 rb_erase(&info->offset_index, &block_group->free_space_offset);
165 rb_erase(&info->bytes_index, &block_group->free_space_bytes);
166}
167
168static int link_free_space(struct btrfs_block_group_cache *block_group,
169 struct btrfs_free_space *info)
170{
171 int ret = 0;
172
173
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index);
176 if (ret)
177 return ret;
178
179 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
180 &info->bytes_index);
181 if (ret)
182 return ret;
183
184 return ret;
185}
186
187int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes)
189{
190 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0;
195
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info)
198 return -ENOMEM;
199
200 /*
201	 * first we want to see if there is free space adjacent to the range we
202	 * are adding; if there is, remove that struct and add a new one to
203	 * cover the entire range
204 */
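	/*
	 * Annotation (example): with existing entries [0, 4k) and
	 * [8k, 12k), adding [4k, 8k) merges all three into one [0, 12k)
	 * entry; right_info is the entry starting at offset + bytes and
	 * left_info is the entry ending at offset.
	 */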
205 spin_lock(&block_group->lock);
206
207 right_info = tree_search_offset(&block_group->free_space_offset,
208 offset+bytes, 0, 1);
209 left_info = tree_search_offset(&block_group->free_space_offset,
210 offset-1, 0, 1);
211
212 if (right_info && right_info->offset == offset+bytes) {
213 unlink_free_space(block_group, right_info);
214 info = right_info;
215 info->offset = offset;
216 info->bytes += bytes;
217 } else if (right_info && right_info->offset != offset+bytes) {
218 printk(KERN_ERR "adding space in the middle of an existing "
219 "free space area. existing: offset=%Lu, bytes=%Lu. "
220 "new: offset=%Lu, bytes=%Lu\n", right_info->offset,
221 right_info->bytes, offset, bytes);
222 BUG();
223 }
224
225 if (left_info) {
226 unlink_free_space(block_group, left_info);
227
228 if (unlikely((left_info->offset + left_info->bytes) !=
229 offset)) {
230 printk(KERN_ERR "free space to the left of new free "
231 "space isn't quite right. existing: offset=%Lu,"
232 " bytes=%Lu. new: offset=%Lu, bytes=%Lu\n",
233 left_info->offset, left_info->bytes, offset,
234 bytes);
235 BUG();
236 }
237
238 if (info) {
239 info->offset = left_info->offset;
240 info->bytes += left_info->bytes;
241 kfree(left_info);
242 } else {
243 info = left_info;
244 info->bytes += bytes;
245 }
246 }
247
248 if (info) {
249 ret = link_free_space(block_group, info);
250 if (!ret)
251 info = NULL;
252 goto out;
253 }
254
255 info = alloc_info;
256 alloc_info = NULL;
257 info->offset = offset;
258 info->bytes = bytes;
259
260 ret = link_free_space(block_group, info);
261 if (ret)
262 kfree(info);
263out:
264 spin_unlock(&block_group->lock);
265 if (ret) {
266		printk(KERN_ERR "btrfs: unable to add free space: %d\n", ret);
267 if (ret == -EEXIST)
268 BUG();
269 }
270
271	/* kfree(NULL) is a no-op, so no check is needed */
272	kfree(alloc_info);
273
274 return ret;
275}
276
277int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
278 u64 offset, u64 bytes)
279{
280 struct btrfs_free_space *info;
281 int ret = 0;
282
283 spin_lock(&block_group->lock);
284 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
285 1);
286
287 if (info && info->offset == offset) {
288 if (info->bytes < bytes) {
289			printk(KERN_ERR "Found free space at %Lu, size %Lu, "
290			       "trying to use %Lu\n",
291 info->offset, info->bytes, bytes);
292 WARN_ON(1);
293 ret = -EINVAL;
294 goto out;
295 }
296
297 unlink_free_space(block_group, info);
298
299 if (info->bytes == bytes) {
300 kfree(info);
301 goto out;
302 }
303
304 info->offset += bytes;
305 info->bytes -= bytes;
306
307 ret = link_free_space(block_group, info);
308 BUG_ON(ret);
309 } else {
310 WARN_ON(1);
311 }
312out:
313 spin_unlock(&block_group->lock);
314 return ret;
315}
316
317void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
318 u64 bytes)
319{
320 struct btrfs_free_space *info;
321 struct rb_node *n;
322 int count = 0;
323
324 for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
325 info = rb_entry(n, struct btrfs_free_space, offset_index);
326 if (info->bytes >= bytes)
327 count++;
330 }
331	printk(KERN_INFO "%d blocks of free space at or bigger than %Lu bytes\n",
332	       count, bytes);
333}
334
335u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
336{
337 struct btrfs_free_space *info;
338 struct rb_node *n;
339 u64 ret = 0;
340
341 for (n = rb_first(&block_group->free_space_offset); n;
342 n = rb_next(n)) {
343 info = rb_entry(n, struct btrfs_free_space, offset_index);
344 ret += info->bytes;
345 }
346
347 return ret;
348}
349
350void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
351{
352 struct btrfs_free_space *info;
353 struct rb_node *node;
354
355 spin_lock(&block_group->lock);
356 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
357 info = rb_entry(node, struct btrfs_free_space, bytes_index);
358 unlink_free_space(block_group, info);
359 kfree(info);
360 if (need_resched()) {
361 spin_unlock(&block_group->lock);
362 cond_resched();
363 spin_lock(&block_group->lock);
364 }
365 }
366 spin_unlock(&block_group->lock);
367}
368
369 struct btrfs_free_space *
370 btrfs_find_free_space_offset(struct btrfs_block_group_cache *block_group,
371			      u64 offset,
372			      u64 bytes)
373{
374 struct btrfs_free_space *ret;
375
376 spin_lock(&block_group->lock);
377 ret = tree_search_offset(&block_group->free_space_offset, offset,
378 bytes, 0);
379 spin_unlock(&block_group->lock);
380
381 return ret;
382}
383
384 struct btrfs_free_space *
385 btrfs_find_free_space_bytes(struct btrfs_block_group_cache *block_group,
386			     u64 offset,
387			     u64 bytes)
388{
389 struct btrfs_free_space *ret;
390
391 spin_lock(&block_group->lock);
392
393 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
394 spin_unlock(&block_group->lock);
395
396 return ret;
397}
398
399 struct btrfs_free_space *
400 btrfs_find_free_space(struct btrfs_block_group_cache *block_group,
401		       u64 offset, u64 bytes)
402{
403 struct btrfs_free_space *ret;
404
405 spin_lock(&block_group->lock);
406 ret = tree_search_offset(&block_group->free_space_offset, offset,
407 bytes, 0);
408 if (!ret)
409 ret = tree_search_bytes(&block_group->free_space_bytes,
410 offset, bytes);
411
412 spin_unlock(&block_group->lock);
413
414 return ret;
415}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
new file mode 100644
index 000000000000..2a020b276768
--- /dev/null
+++ b/fs/btrfs/hash.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __HASH__
20#define __HASH__
21
22#include "crc32c.h"
23static inline u64 btrfs_name_hash(const char *name, int len)
24{
25 return btrfs_crc32c((u32)~1, name, len);
26}
27#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
new file mode 100644
index 000000000000..d93451c66ba1
--- /dev/null
+++ b/fs/btrfs/inode-item.c
@@ -0,0 +1,206 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
 23 int find_name_in_backref(struct btrfs_path *path, const char *name,
24 int name_len, struct btrfs_inode_ref **ref_ret)
25{
26 struct extent_buffer *leaf;
27 struct btrfs_inode_ref *ref;
28 unsigned long ptr;
29 unsigned long name_ptr;
30 u32 item_size;
31 u32 cur_offset = 0;
32 int len;
33
34 leaf = path->nodes[0];
35 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
36 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
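	/*
	 * Annotation: an INODE_REF item packs one or more
	 * (struct btrfs_inode_ref, name bytes) pairs back to back, one per
	 * link on this inode from the same parent directory; walk them by
	 * striding sizeof(*ref) + name_len through the item.
	 */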
37 while (cur_offset < item_size) {
38 ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
39 len = btrfs_inode_ref_name_len(leaf, ref);
40 name_ptr = (unsigned long)(ref + 1);
41 cur_offset += len + sizeof(*ref);
42 if (len != name_len)
43 continue;
44 if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
45 *ref_ret = ref;
46 return 1;
47 }
48 }
49 return 0;
50}
51
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root,
54 const char *name, int name_len,
55 u64 inode_objectid, u64 ref_objectid, u64 *index)
56{
57 struct btrfs_path *path;
58 struct btrfs_key key;
59 struct btrfs_inode_ref *ref;
60 struct extent_buffer *leaf;
61 unsigned long ptr;
62 unsigned long item_start;
63 u32 item_size;
64 u32 sub_item_len;
65 int ret;
66 int del_len = name_len + sizeof(*ref);
67
68 key.objectid = inode_objectid;
69 key.offset = ref_objectid;
70 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
71
72 path = btrfs_alloc_path();
73 if (!path)
74 return -ENOMEM;
75
76 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
77 if (ret > 0) {
78 ret = -ENOENT;
79 goto out;
80 } else if (ret < 0) {
81 goto out;
82 }
83 if (!find_name_in_backref(path, name, name_len, &ref)) {
84 ret = -ENOENT;
85 goto out;
86 }
87 leaf = path->nodes[0];
88 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
89
90 if (index)
91 *index = btrfs_inode_ref_index(leaf, ref);
92
93 if (del_len == item_size) {
94 ret = btrfs_del_item(trans, root, path);
95 goto out;
96 }
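	/*
	 * Annotation: only one of several packed refs is being removed, so
	 * slide the tail of the item down over the departing (ref, name)
	 * pair and then shrink the item by that many bytes.
	 */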
97 ptr = (unsigned long)ref;
98 sub_item_len = name_len + sizeof(*ref);
99 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
100 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
101 item_size - (ptr + sub_item_len - item_start));
102 ret = btrfs_truncate_item(trans, root, path,
103 item_size - sub_item_len, 1);
104 BUG_ON(ret);
105out:
106 btrfs_free_path(path);
107 return ret;
108}
109
110int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
111 struct btrfs_root *root,
112 const char *name, int name_len,
113 u64 inode_objectid, u64 ref_objectid, u64 index)
114{
115 struct btrfs_path *path;
116 struct btrfs_key key;
117 struct btrfs_inode_ref *ref;
118 unsigned long ptr;
119 int ret;
120 int ins_len = name_len + sizeof(*ref);
121
122 key.objectid = inode_objectid;
123 key.offset = ref_objectid;
124 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
125
126 path = btrfs_alloc_path();
127 if (!path)
128 return -ENOMEM;
129
130 ret = btrfs_insert_empty_item(trans, root, path, &key,
131 ins_len);
132 if (ret == -EEXIST) {
133 u32 old_size;
134
135 if (find_name_in_backref(path, name, name_len, &ref))
136 goto out;
137
138 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
139 ret = btrfs_extend_item(trans, root, path, ins_len);
140 BUG_ON(ret);
141 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
142 struct btrfs_inode_ref);
143 ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
144 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
145 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
146 ptr = (unsigned long)(ref + 1);
147 ret = 0;
148 } else if (ret < 0) {
149 goto out;
150 } else {
151 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
152 struct btrfs_inode_ref);
153 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
154 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
155 ptr = (unsigned long)(ref + 1);
156 }
157 write_extent_buffer(path->nodes[0], name, ptr, name_len);
158 btrfs_mark_buffer_dirty(path->nodes[0]);
159
160out:
161 btrfs_free_path(path);
162 return ret;
163}
164
165int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
166 struct btrfs_root *root,
167 struct btrfs_path *path, u64 objectid)
168{
169 struct btrfs_key key;
170 int ret;
171 key.objectid = objectid;
172 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
173 key.offset = 0;
174
175 ret = btrfs_insert_empty_item(trans, root, path, &key,
176 sizeof(struct btrfs_inode_item));
177 if (ret == 0 && objectid > root->highest_inode)
178 root->highest_inode = objectid;
179 return ret;
180}
181
182int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
183 *root, struct btrfs_path *path,
184 struct btrfs_key *location, int mod)
185{
186 int ins_len = mod < 0 ? -1 : 0;
187 int cow = mod != 0;
188 int ret;
189 int slot;
190 struct extent_buffer *leaf;
191 struct btrfs_key found_key;
192
193 ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
194 if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
195 location->offset == (u64)-1 && path->slots[0] != 0) {
196 slot = path->slots[0] - 1;
197 leaf = path->nodes[0];
198 btrfs_item_key_to_cpu(leaf, &found_key, slot);
199 if (found_key.objectid == location->objectid &&
200 btrfs_key_type(&found_key) == btrfs_key_type(location)) {
201 path->slots[0]--;
202 return 0;
203 }
204 }
205 return ret;
206}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
new file mode 100644
index 000000000000..cd6171c2da42
--- /dev/null
+++ b/fs/btrfs/inode-map.c
@@ -0,0 +1,141 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
24{
25 struct btrfs_path *path;
26 int ret;
27 struct extent_buffer *l;
28 struct btrfs_key search_key;
29 struct btrfs_key found_key;
30 int slot;
31
32 path = btrfs_alloc_path();
33 BUG_ON(!path);
34
35 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
36 search_key.type = -1;
37 search_key.offset = (u64)-1;
38 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
39 if (ret < 0)
40 goto error;
41 BUG_ON(ret == 0);
42 if (path->slots[0] > 0) {
43 slot = path->slots[0] - 1;
44 l = path->nodes[0];
45 btrfs_item_key_to_cpu(l, &found_key, slot);
46 *objectid = found_key.objectid;
47 } else {
48 *objectid = BTRFS_FIRST_FREE_OBJECTID;
49 }
50 ret = 0;
51error:
52 btrfs_free_path(path);
53 return ret;
54}
55
56/*
 57 * walks the btree of allocated inodes and finds a hole.
58 */
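/*
 * Annotation (example): if inodes 256, 257 and 260 already exist, a
 * search starting from dirid 256 walks past 257, sees the gap before
 * 260 and returns 258 as the next free objectid.
 */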
59int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root,
61 u64 dirid, u64 *objectid)
62{
63 struct btrfs_path *path;
64 struct btrfs_key key;
65 int ret;
66 int slot = 0;
67 u64 last_ino = 0;
68 int start_found;
69 struct extent_buffer *l;
70 struct btrfs_key search_key;
71 u64 search_start = dirid;
72
73 mutex_lock(&root->objectid_mutex);
74 if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
75 root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
76 *objectid = ++root->last_inode_alloc;
77 mutex_unlock(&root->objectid_mutex);
78 return 0;
79 }
80 path = btrfs_alloc_path();
81 BUG_ON(!path);
82 search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
83 search_key.objectid = search_start;
84 search_key.type = 0;
85 search_key.offset = 0;
86
87 btrfs_init_path(path);
88 start_found = 0;
89 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
90 if (ret < 0)
91 goto error;
92
93 while (1) {
94 l = path->nodes[0];
95 slot = path->slots[0];
96 if (slot >= btrfs_header_nritems(l)) {
97 ret = btrfs_next_leaf(root, path);
98 if (ret == 0)
99 continue;
100 if (ret < 0)
101 goto error;
102 if (!start_found) {
103 *objectid = search_start;
104 start_found = 1;
105 goto found;
106 }
107 *objectid = last_ino > search_start ?
108 last_ino : search_start;
109 goto found;
110 }
111 btrfs_item_key_to_cpu(l, &key, slot);
112 if (key.objectid >= search_start) {
113 if (start_found) {
114 if (last_ino < search_start)
115 last_ino = search_start;
116 if (key.objectid > last_ino) {
117 *objectid = last_ino;
118 goto found;
119 }
120 }
121 }
122 if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
123 break;
124 start_found = 1;
125 last_ino = key.objectid + 1;
126 path->slots[0]++;
127 }
128	/* FIXME: return -ENOSPC instead of crashing */
129 BUG_ON(1);
130found:
131 btrfs_release_path(root, path);
132 btrfs_free_path(path);
133 BUG_ON(*objectid < search_start);
134 mutex_unlock(&root->objectid_mutex);
135 return 0;
136error:
137 btrfs_release_path(root, path);
138 btrfs_free_path(path);
139 mutex_unlock(&root->objectid_mutex);
140 return ret;
141}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
new file mode 100644
index 000000000000..adb169d739ce
--- /dev/null
+++ b/fs/btrfs/inode.c
@@ -0,0 +1,3774 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
40#include "ctree.h"
41#include "disk-io.h"
42#include "transaction.h"
43#include "btrfs_inode.h"
44#include "ioctl.h"
45#include "print-tree.h"
46#include "volumes.h"
47#include "ordered-data.h"
48#include "xattr.h"
49#include "compat.h"
50#include "tree-log.h"
51
52struct btrfs_iget_args {
53 u64 ino;
54 struct btrfs_root *root;
55};
56
57static struct inode_operations btrfs_dir_inode_operations;
58static struct inode_operations btrfs_symlink_inode_operations;
59static struct inode_operations btrfs_dir_ro_inode_operations;
60static struct inode_operations btrfs_special_inode_operations;
61static struct inode_operations btrfs_file_inode_operations;
62static struct address_space_operations btrfs_aops;
63static struct address_space_operations btrfs_symlink_aops;
64static struct file_operations btrfs_dir_file_operations;
65static struct extent_io_ops btrfs_extent_io_ops;
66
67static struct kmem_cache *btrfs_inode_cachep;
68struct kmem_cache *btrfs_trans_handle_cachep;
69struct kmem_cache *btrfs_transaction_cachep;
70struct kmem_cache *btrfs_bit_radix_cachep;
71struct kmem_cache *btrfs_path_cachep;
72
73#define S_SHIFT 12
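/*
 * Annotation: the S_IFMT file type bits occupy the top nibble of
 * i_mode, so shifting by 12 turns the mode into a small table index;
 * e.g. S_IFDIR (0040000) >> 12 == 4 and S_IFREG (0100000) >> 12 == 8.
 */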
74static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
75 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
76 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
77 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
78 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
79 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
80 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
81 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
82};
83
84static void btrfs_truncate(struct inode *inode);
85
86int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
87 int for_del)
88{
89 u64 total;
90 u64 used;
91 u64 thresh;
92 unsigned long flags;
93 int ret = 0;
94
95 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
96 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
97 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
98 if (for_del)
99 thresh = total * 90;
100 else
101 thresh = total * 85;
102
103 do_div(thresh, 100);
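	/*
	 * Annotation: e.g. on a 100GB filesystem, deletions are refused only
	 * once used + delalloc bytes pass 90GB, while other allocations are
	 * refused past 85GB, leaving headroom so deletes can still make
	 * progress toward freeing space.
	 */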
104
105 if (used + root->fs_info->delalloc_bytes + num_required > thresh)
106 ret = -ENOSPC;
107 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
108 return ret;
109}
110
111static int cow_file_range(struct inode *inode, u64 start, u64 end)
112{
113 struct btrfs_root *root = BTRFS_I(inode)->root;
114 struct btrfs_trans_handle *trans;
115 u64 alloc_hint = 0;
116 u64 num_bytes;
117 u64 cur_alloc_size;
118 u64 blocksize = root->sectorsize;
119 u64 orig_num_bytes;
120 struct btrfs_key ins;
121 struct extent_map *em;
122 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
123 int ret = 0;
124
125 trans = btrfs_join_transaction(root, 1);
126 BUG_ON(!trans);
127 btrfs_set_trans_block_group(trans, inode);
128
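	/*
	 * Annotation: end is inclusive, so this rounds the write length up
	 * to a block multiple; e.g. start = 0, end = 5000, blocksize = 4096
	 * gives num_bytes = (5000 + 4096) & ~4095 = 8192.
	 */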
129 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
130 num_bytes = max(blocksize, num_bytes);
131 orig_num_bytes = num_bytes;
132
133 if (alloc_hint == EXTENT_MAP_INLINE)
134 goto out;
135
136 BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
137 mutex_lock(&BTRFS_I(inode)->extent_mutex);
138 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1);
139 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
140
141	while (num_bytes > 0) {
142 cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
143 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
144 root->sectorsize, 0, alloc_hint,
145 (u64)-1, &ins, 1);
146 if (ret) {
147 WARN_ON(1);
148 goto out;
149 }
150 em = alloc_extent_map(GFP_NOFS);
151 em->start = start;
152 em->len = ins.offset;
153 em->block_start = ins.objectid;
154 em->bdev = root->fs_info->fs_devices->latest_bdev;
155 mutex_lock(&BTRFS_I(inode)->extent_mutex);
156 set_bit(EXTENT_FLAG_PINNED, &em->flags);
157		while (1) {
158 spin_lock(&em_tree->lock);
159 ret = add_extent_mapping(em_tree, em);
160 spin_unlock(&em_tree->lock);
161 if (ret != -EEXIST) {
162 free_extent_map(em);
163 break;
164 }
165 btrfs_drop_extent_cache(inode, start,
166 start + ins.offset - 1);
167 }
168 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
169
170 cur_alloc_size = ins.offset;
171 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
172 ins.offset, 0);
173 BUG_ON(ret);
174 if (num_bytes < cur_alloc_size) {
175 printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
176 cur_alloc_size);
177 break;
178 }
179 num_bytes -= cur_alloc_size;
180 alloc_hint = ins.objectid + ins.offset;
181 start += cur_alloc_size;
182 }
183out:
184 btrfs_end_transaction(trans, root);
185 return ret;
186}
187
188static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
189{
190 u64 extent_start;
191 u64 extent_end;
192 u64 bytenr;
193 u64 loops = 0;
194 u64 total_fs_bytes;
195 struct btrfs_root *root = BTRFS_I(inode)->root;
196 struct btrfs_block_group_cache *block_group;
197 struct btrfs_trans_handle *trans;
198 struct extent_buffer *leaf;
199 int found_type;
200 struct btrfs_path *path;
201 struct btrfs_file_extent_item *item;
202 int ret;
203 int err = 0;
204 struct btrfs_key found_key;
205
206 total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
207 path = btrfs_alloc_path();
208 BUG_ON(!path);
209 trans = btrfs_join_transaction(root, 1);
210 BUG_ON(!trans);
211again:
212 ret = btrfs_lookup_file_extent(NULL, root, path,
213 inode->i_ino, start, 0);
214 if (ret < 0) {
215 err = ret;
216 goto out;
217 }
218
219 if (ret != 0) {
220 if (path->slots[0] == 0)
221 goto not_found;
222 path->slots[0]--;
223 }
224
225 leaf = path->nodes[0];
226 item = btrfs_item_ptr(leaf, path->slots[0],
227 struct btrfs_file_extent_item);
228
229 /* are we inside the extent that was found? */
230 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
231 found_type = btrfs_key_type(&found_key);
232 if (found_key.objectid != inode->i_ino ||
233 found_type != BTRFS_EXTENT_DATA_KEY)
234 goto not_found;
235
236 found_type = btrfs_file_extent_type(leaf, item);
237 extent_start = found_key.offset;
238 if (found_type == BTRFS_FILE_EXTENT_REG) {
239 u64 extent_num_bytes;
240
241 extent_num_bytes = btrfs_file_extent_num_bytes(leaf, item);
242 extent_end = extent_start + extent_num_bytes;
243 err = 0;
244
245 if (loops && start != extent_start)
246 goto not_found;
247
248 if (start < extent_start || start >= extent_end)
249 goto not_found;
250
251 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
252 if (bytenr == 0)
253 goto not_found;
254
255 if (btrfs_cross_ref_exists(trans, root, &found_key, bytenr))
256 goto not_found;
257 /*
258 * we may be called by the resizer, make sure we're inside
259 * the limits of the FS
260 */
261 block_group = btrfs_lookup_block_group(root->fs_info,
262 bytenr);
263 if (!block_group || block_group->ro)
264 goto not_found;
265
266 bytenr += btrfs_file_extent_offset(leaf, item);
267 extent_num_bytes = min(end + 1, extent_end) - start;
268 ret = btrfs_add_ordered_extent(inode, start, bytenr,
269 extent_num_bytes, 1);
270 if (ret) {
271 err = ret;
272 goto out;
273 }
274
275 btrfs_release_path(root, path);
276 start = extent_end;
277 if (start <= end) {
278 loops++;
279 goto again;
280 }
281 } else {
282not_found:
283 btrfs_end_transaction(trans, root);
284 btrfs_free_path(path);
285 return cow_file_range(inode, start, end);
286 }
287out:
288 WARN_ON(err);
289 btrfs_end_transaction(trans, root);
290 btrfs_free_path(path);
291 return err;
292}
293
294static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
295{
296 struct btrfs_root *root = BTRFS_I(inode)->root;
297 int ret;
298
299 if (btrfs_test_opt(root, NODATACOW) ||
300 btrfs_test_flag(inode, NODATACOW))
301 ret = run_delalloc_nocow(inode, start, end);
302 else
303 ret = cow_file_range(inode, start, end);
304
305 return ret;
306}
307
308int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
309 unsigned long old, unsigned long bits)
310{
311 unsigned long flags;
312 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
313 struct btrfs_root *root = BTRFS_I(inode)->root;
314 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
315 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
316 root->fs_info->delalloc_bytes += end - start + 1;
317 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
318 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
319 &root->fs_info->delalloc_inodes);
320 }
321 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
322 }
323 return 0;
324}
325
326int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
327 unsigned long old, unsigned long bits)
328{
329 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
330 struct btrfs_root *root = BTRFS_I(inode)->root;
331 unsigned long flags;
332
333 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
334 if (end - start + 1 > root->fs_info->delalloc_bytes) {
335 printk("warning: delalloc account %Lu %Lu\n",
336 end - start + 1, root->fs_info->delalloc_bytes);
337 root->fs_info->delalloc_bytes = 0;
338 BTRFS_I(inode)->delalloc_bytes = 0;
339 } else {
340 root->fs_info->delalloc_bytes -= end - start + 1;
341 BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
342 }
343 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
344 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
345 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
346 }
347 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
348 }
349 return 0;
350}
351
352int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
353 size_t size, struct bio *bio)
354{
355 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
356 struct btrfs_mapping_tree *map_tree;
357 u64 logical = bio->bi_sector << 9;
358 u64 length = 0;
359 u64 map_length;
360 int ret;
361
362 length = bio->bi_size;
363 map_tree = &root->fs_info->mapping_tree;
364 map_length = length;
365 ret = btrfs_map_block(map_tree, READ, logical,
366 &map_length, NULL, 0);
367
368 if (map_length < length + size) {
369 return 1;
370 }
371 return 0;
372}
373
374int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
375 int mirror_num)
376{
377 struct btrfs_root *root = BTRFS_I(inode)->root;
378 int ret = 0;
379
380 ret = btrfs_csum_one_bio(root, inode, bio);
381 BUG_ON(ret);
382
383 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
384}
385
386int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
387 int mirror_num)
388{
389 struct btrfs_root *root = BTRFS_I(inode)->root;
390 int ret = 0;
391
392 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
393 BUG_ON(ret);
394
395 if (btrfs_test_opt(root, NODATASUM) ||
396 btrfs_test_flag(inode, NODATASUM)) {
397 goto mapit;
398 }
399
400 if (!(rw & (1 << BIO_RW))) {
401 btrfs_lookup_bio_sums(root, inode, bio);
402 goto mapit;
403 }
404 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
405 inode, rw, bio, mirror_num,
406 __btrfs_submit_bio_hook);
407mapit:
408 return btrfs_map_bio(root, rw, bio, mirror_num, 0);
409}
410
411static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
412 struct inode *inode, u64 file_offset,
413 struct list_head *list)
414{
415 struct list_head *cur;
416 struct btrfs_ordered_sum *sum;
417
418 btrfs_set_trans_block_group(trans, inode);
419 list_for_each(cur, list) {
420 sum = list_entry(cur, struct btrfs_ordered_sum, list);
421 btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
422 inode, sum);
423 }
424 return 0;
425}
426
427int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
428{
429 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
430 GFP_NOFS);
431}
432
433struct btrfs_writepage_fixup {
434 struct page *page;
435 struct btrfs_work work;
436};
437
438/* see btrfs_writepage_start_hook for details on why this is required */
439void btrfs_writepage_fixup_worker(struct btrfs_work *work)
440{
441 struct btrfs_writepage_fixup *fixup;
442 struct btrfs_ordered_extent *ordered;
443 struct page *page;
444 struct inode *inode;
445 u64 page_start;
446 u64 page_end;
447
448 fixup = container_of(work, struct btrfs_writepage_fixup, work);
449 page = fixup->page;
450again:
451 lock_page(page);
452 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
453 ClearPageChecked(page);
454 goto out_page;
455 }
456
457 inode = page->mapping->host;
458 page_start = page_offset(page);
459 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
460
461 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
462
463 /* already ordered? We're done */
464 if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
465 EXTENT_ORDERED, 0)) {
466 goto out;
467 }
468
469 ordered = btrfs_lookup_ordered_extent(inode, page_start);
470 if (ordered) {
471 unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
472 page_end, GFP_NOFS);
473 unlock_page(page);
474 btrfs_start_ordered_extent(inode, ordered, 1);
475 goto again;
476 }
477
478 btrfs_set_extent_delalloc(inode, page_start, page_end);
479 ClearPageChecked(page);
480out:
481 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
482out_page:
483 unlock_page(page);
484 page_cache_release(page);
485}
486
487/*
488 * There are a few paths in the higher layers of the kernel that directly
489 * set the page dirty bit without asking the filesystem if it is a
490 * good idea. This causes problems because we want to make sure COW
491 * properly happens and the data=ordered rules are followed.
492 *
493 * In our case any range that doesn't have the EXTENT_ORDERED bit set
494 * hasn't been properly set up for IO. We kick off an async process
495 * to fix it up. The async helper will wait for ordered extents, set
496 * the delalloc bit and make it safe to write the page.
497 */
498int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
499{
500 struct inode *inode = page->mapping->host;
501 struct btrfs_writepage_fixup *fixup;
502 struct btrfs_root *root = BTRFS_I(inode)->root;
503 int ret;
504
505 ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
506 EXTENT_ORDERED, 0);
507 if (ret)
508 return 0;
509
510 if (PageChecked(page))
511 return -EAGAIN;
512
513 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
514 if (!fixup)
515 return -EAGAIN;
516
517 SetPageChecked(page);
518 page_cache_get(page);
519 fixup->work.func = btrfs_writepage_fixup_worker;
520 fixup->page = page;
521 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
522 return -EAGAIN;
523}
524
525static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
526{
527 struct btrfs_root *root = BTRFS_I(inode)->root;
528 struct btrfs_trans_handle *trans;
529 struct btrfs_ordered_extent *ordered_extent;
530 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
531 struct btrfs_file_extent_item *extent_item;
532 struct btrfs_path *path = NULL;
533 struct extent_buffer *leaf;
534 u64 alloc_hint = 0;
535 struct list_head list;
536 struct btrfs_key ins;
537 int ret;
538
539 ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
540 if (!ret)
541 return 0;
542
543 trans = btrfs_join_transaction(root, 1);
544
545 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
546 BUG_ON(!ordered_extent);
547 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
548 goto nocow;
549
550 path = btrfs_alloc_path();
551 BUG_ON(!path);
552
553 lock_extent(io_tree, ordered_extent->file_offset,
554 ordered_extent->file_offset + ordered_extent->len - 1,
555 GFP_NOFS);
556
557 INIT_LIST_HEAD(&list);
558
559 mutex_lock(&BTRFS_I(inode)->extent_mutex);
560
561 ret = btrfs_drop_extents(trans, root, inode,
562 ordered_extent->file_offset,
563 ordered_extent->file_offset +
564 ordered_extent->len,
565 ordered_extent->file_offset, &alloc_hint);
566 BUG_ON(ret);
567
568 ins.objectid = inode->i_ino;
569 ins.offset = ordered_extent->file_offset;
570 ins.type = BTRFS_EXTENT_DATA_KEY;
571 ret = btrfs_insert_empty_item(trans, root, path, &ins,
572 sizeof(*extent_item));
573 BUG_ON(ret);
574 leaf = path->nodes[0];
575 extent_item = btrfs_item_ptr(leaf, path->slots[0],
576 struct btrfs_file_extent_item);
577 btrfs_set_file_extent_generation(leaf, extent_item, trans->transid);
578 btrfs_set_file_extent_type(leaf, extent_item, BTRFS_FILE_EXTENT_REG);
579 btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
580 ordered_extent->start);
581 btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
582 ordered_extent->len);
583 btrfs_set_file_extent_offset(leaf, extent_item, 0);
584 btrfs_set_file_extent_num_bytes(leaf, extent_item,
585 ordered_extent->len);
586 btrfs_mark_buffer_dirty(leaf);
587
588 btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
589 ordered_extent->file_offset +
590 ordered_extent->len - 1);
591 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
592
593 ins.objectid = ordered_extent->start;
594 ins.offset = ordered_extent->len;
595 ins.type = BTRFS_EXTENT_ITEM_KEY;
596 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
597 root->root_key.objectid,
598 trans->transid, inode->i_ino,
599 ordered_extent->file_offset, &ins);
600 BUG_ON(ret);
601 btrfs_release_path(root, path);
602
603 inode->i_blocks += ordered_extent->len >> 9;
604 unlock_extent(io_tree, ordered_extent->file_offset,
605 ordered_extent->file_offset + ordered_extent->len - 1,
606 GFP_NOFS);
607nocow:
608 add_pending_csums(trans, inode, ordered_extent->file_offset,
609 &ordered_extent->list);
610
611 mutex_lock(&BTRFS_I(inode)->extent_mutex);
612 btrfs_ordered_update_i_size(inode, ordered_extent);
613 btrfs_update_inode(trans, root, inode);
614 btrfs_remove_ordered_extent(inode, ordered_extent);
615 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
616
617 /* once for us */
618 btrfs_put_ordered_extent(ordered_extent);
619 /* once for the tree */
620 btrfs_put_ordered_extent(ordered_extent);
621
622 btrfs_end_transaction(trans, root);
623 if (path)
624 btrfs_free_path(path);
625 return 0;
626}
627
628int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
629 struct extent_state *state, int uptodate)
630{
631 return btrfs_finish_ordered_io(page->mapping->host, start, end);
632}
633
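/*
 * when reads fail, either with EIO or a bad checksum, we retry them
 * using other mirrors of the block.  An io_failure_record tracks the
 * range and the last mirror tried; its address is stuffed into the
 * io_failure_tree as the extent state private value.
 */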
634struct io_failure_record {
635 struct page *page;
636 u64 start;
637 u64 len;
638 u64 logical;
639 int last_mirror;
640};
641
642int btrfs_io_failed_hook(struct bio *failed_bio,
643 struct page *page, u64 start, u64 end,
644 struct extent_state *state)
645{
646 struct io_failure_record *failrec = NULL;
647 u64 private;
648 struct extent_map *em;
649 struct inode *inode = page->mapping->host;
650 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
651 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
652 struct bio *bio;
653 int num_copies;
654 int ret;
655 int rw;
656 u64 logical;
657
658 ret = get_state_private(failure_tree, start, &private);
659 if (ret) {
660 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
661 if (!failrec)
662 return -ENOMEM;
663 failrec->start = start;
664 failrec->len = end - start + 1;
665 failrec->last_mirror = 0;
666
667 spin_lock(&em_tree->lock);
668 em = lookup_extent_mapping(em_tree, start, failrec->len);
669		if (em && (em->start > start || em->start + em->len < start)) {
670 free_extent_map(em);
671 em = NULL;
672 }
673 spin_unlock(&em_tree->lock);
674
675 if (!em || IS_ERR(em)) {
676 kfree(failrec);
677 return -EIO;
678 }
679 logical = start - em->start;
680 logical = em->block_start + logical;
681 failrec->logical = logical;
682 free_extent_map(em);
683 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
684 EXTENT_DIRTY, GFP_NOFS);
685 set_state_private(failure_tree, start,
686 (u64)(unsigned long)failrec);
687 } else {
688 failrec = (struct io_failure_record *)(unsigned long)private;
689 }
690 num_copies = btrfs_num_copies(
691 &BTRFS_I(inode)->root->fs_info->mapping_tree,
692 failrec->logical, failrec->len);
693 failrec->last_mirror++;
694 if (!state) {
695 spin_lock_irq(&BTRFS_I(inode)->io_tree.lock);
696 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
697 failrec->start,
698 EXTENT_LOCKED);
699 if (state && state->start != failrec->start)
700 state = NULL;
701 spin_unlock_irq(&BTRFS_I(inode)->io_tree.lock);
702 }
703 if (!state || failrec->last_mirror > num_copies) {
704 set_state_private(failure_tree, failrec->start, 0);
705 clear_extent_bits(failure_tree, failrec->start,
706 failrec->start + failrec->len - 1,
707 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
708 kfree(failrec);
709 return -EIO;
710 }
711 bio = bio_alloc(GFP_NOFS, 1);
712 bio->bi_private = state;
713 bio->bi_end_io = failed_bio->bi_end_io;
714 bio->bi_sector = failrec->logical >> 9;
715 bio->bi_bdev = failed_bio->bi_bdev;
716 bio->bi_size = 0;
717 bio_add_page(bio, page, failrec->len, start - page_offset(page));
718 if (failed_bio->bi_rw & (1 << BIO_RW))
719 rw = WRITE;
720 else
721 rw = READ;
722
723 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
724 failrec->last_mirror);
725 return 0;
726}
727
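/*
 * a read in this range has now succeeded, so drop any failure record
 * we previously stuffed into the io_failure_tree for it.
 */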
728int btrfs_clean_io_failures(struct inode *inode, u64 start)
729{
730 u64 private;
731 u64 private_failure;
732 struct io_failure_record *failure;
733 int ret;
734
735 private = 0;
736 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
737 (u64)-1, 1, EXTENT_DIRTY)) {
738 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
739 start, &private_failure);
740 if (ret == 0) {
741 failure = (struct io_failure_record *)(unsigned long)
742 private_failure;
743 set_state_private(&BTRFS_I(inode)->io_failure_tree,
744 failure->start, 0);
745 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
746 failure->start,
747 failure->start + failure->len - 1,
748 EXTENT_DIRTY | EXTENT_LOCKED,
749 GFP_NOFS);
750 kfree(failure);
751 }
752 }
753 return 0;
754}
755
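/*
 * called from the end_io path as each read completes.  The data
 * checksum is verified against the csum stored in the extent state
 * private; on a mismatch the page is poisoned and -EIO is returned.
 */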
756int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
757 struct extent_state *state)
758{
759 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
760 struct inode *inode = page->mapping->host;
761 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
762 char *kaddr;
763 u64 private = ~(u32)0;
764 int ret;
765 struct btrfs_root *root = BTRFS_I(inode)->root;
766 u32 csum = ~(u32)0;
767 unsigned long flags;
768
769 if (btrfs_test_opt(root, NODATASUM) ||
770 btrfs_test_flag(inode, NODATASUM))
771 return 0;
772 if (state && state->start == start) {
773 private = state->private;
774 ret = 0;
775 } else {
776 ret = get_state_private(io_tree, start, &private);
777 }
778 local_irq_save(flags);
779 kaddr = kmap_atomic(page, KM_IRQ0);
780	if (ret)
781		goto zeroit;
783 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1);
784 btrfs_csum_final(csum, (char *)&csum);
785	if (csum != private)
786		goto zeroit;
788 kunmap_atomic(kaddr, KM_IRQ0);
789 local_irq_restore(flags);
790
791 /* if the io failure tree for this inode is non-empty,
792 * check to see if we've recovered from a failed IO
793 */
794 btrfs_clean_io_failures(inode, start);
795 return 0;
796
797zeroit:
798	printk(KERN_WARNING "btrfs csum failed ino %lu off %llu csum %u "
799	       "private %llu\n", page->mapping->host->i_ino,
800	       (unsigned long long)start, csum, (unsigned long long)private);
801 memset(kaddr + offset, 1, end - start + 1);
802 flush_dcache_page(page);
803 kunmap_atomic(kaddr, KM_IRQ0);
804 local_irq_restore(flags);
805 if (private == 0)
806 return 0;
807 return -EIO;
808}
809
810/*
811 * This creates an orphan entry for the given inode in case something goes
812 * wrong in the middle of an unlink/truncate.
813 */
814int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
815{
816 struct btrfs_root *root = BTRFS_I(inode)->root;
817 int ret = 0;
818
819 spin_lock(&root->list_lock);
820
821 /* already on the orphan list, we're good */
822 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
823 spin_unlock(&root->list_lock);
824 return 0;
825 }
826
827 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
828
829 spin_unlock(&root->list_lock);
830
831 /*
832 * insert an orphan item to track this unlinked/truncated file
833 */
834 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
835
836 return ret;
837}
838
839/*
840 * We have done the truncate/delete so we can go ahead and remove the orphan
841 * item for this particular inode.
842 */
843int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
844{
845 struct btrfs_root *root = BTRFS_I(inode)->root;
846 int ret = 0;
847
848 spin_lock(&root->list_lock);
849
850 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
851 spin_unlock(&root->list_lock);
852 return 0;
853 }
854
855 list_del_init(&BTRFS_I(inode)->i_orphan);
856 if (!trans) {
857 spin_unlock(&root->list_lock);
858 return 0;
859 }
860
861 spin_unlock(&root->list_lock);
862
863 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
864
865 return ret;
866}
867
868/*
869 * this cleans up any orphans that may be left on the list from the last use
870 * of this root.
871 */
872void btrfs_orphan_cleanup(struct btrfs_root *root)
873{
874 struct btrfs_path *path;
875 struct extent_buffer *leaf;
877 struct btrfs_key key, found_key;
878 struct btrfs_trans_handle *trans;
879 struct inode *inode;
880 int ret = 0, nr_unlink = 0, nr_truncate = 0;
881
882 /* don't do orphan cleanup if the fs is readonly. */
883 if (root->inode->i_sb->s_flags & MS_RDONLY)
884 return;
885
886 path = btrfs_alloc_path();
887 if (!path)
888 return;
889 path->reada = -1;
890
891 key.objectid = BTRFS_ORPHAN_OBJECTID;
892 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
893 key.offset = (u64)-1;
894
895 trans = btrfs_start_transaction(root, 1);
896 btrfs_set_trans_block_group(trans, root->inode);
897
898 while (1) {
899 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
900 if (ret < 0) {
901			printk(KERN_ERR "Error searching slot for orphan: %d\n",
902			       ret);
903 break;
904 }
905
906 /*
907		 * ret == 0 means we found what we were searching for, which is
908		 * weird, but possible, so only screw with the path if we didn't
909		 * find the key and see if we have stuff that matches
910 */
911 if (ret > 0) {
912 if (path->slots[0] == 0)
913 break;
914 path->slots[0]--;
915 }
916
917		/* pull out the key */
918		leaf = path->nodes[0];
919		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
921
922 /* make sure the item matches what we want */
923 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
924 break;
925 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
926 break;
927
928 /* release the path since we're done with it */
929 btrfs_release_path(root, path);
930
931 /*
932 * this is where we are basically btrfs_lookup, without the
933 * crossing root thing. we store the inode number in the
934 * offset of the orphan item.
935 */
936 inode = btrfs_iget_locked(root->inode->i_sb,
937 found_key.offset, root);
938 if (!inode)
939 break;
940
941 if (inode->i_state & I_NEW) {
942 BTRFS_I(inode)->root = root;
943
944 /* have to set the location manually */
945 BTRFS_I(inode)->location.objectid = inode->i_ino;
946 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
947 BTRFS_I(inode)->location.offset = 0;
948
949 btrfs_read_locked_inode(inode);
950 unlock_new_inode(inode);
951 }
952
953 /*
954 * add this inode to the orphan list so btrfs_orphan_del does
955 * the proper thing when we hit it
956 */
957 spin_lock(&root->list_lock);
958 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
959 spin_unlock(&root->list_lock);
960
961 /*
962 * if this is a bad inode, means we actually succeeded in
963 * removing the inode, but not the orphan record, which means
964 * we need to manually delete the orphan since iput will just
965 * do a destroy_inode
966 */
967 if (is_bad_inode(inode)) {
968 btrfs_orphan_del(trans, inode);
969 iput(inode);
970 continue;
971 }
972
973		/* if we have links, this was a truncate, let's do that */
974 if (inode->i_nlink) {
975 nr_truncate++;
976 btrfs_truncate(inode);
977 } else {
978 nr_unlink++;
979 }
980
981 /* this will do delete_inode and everything for us */
982 iput(inode);
983 }
984
985 if (nr_unlink)
986 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
987 if (nr_truncate)
988 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
989
990 btrfs_free_path(path);
991 btrfs_end_transaction(trans, root);
992}
993
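/*
 * read an inode item from the btree into the VFS inode, and pick the
 * address space and inode operations based on the file type.
 */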
994void btrfs_read_locked_inode(struct inode *inode)
995{
996 struct btrfs_path *path;
997 struct extent_buffer *leaf;
998 struct btrfs_inode_item *inode_item;
999 struct btrfs_timespec *tspec;
1000 struct btrfs_root *root = BTRFS_I(inode)->root;
1001 struct btrfs_key location;
1002 u64 alloc_group_block;
1003 u32 rdev;
1004 int ret;
1005
1006 path = btrfs_alloc_path();
1007 BUG_ON(!path);
1008 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
1009
1010 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
1011 if (ret)
1012 goto make_bad;
1013
1014 leaf = path->nodes[0];
1015 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1016 struct btrfs_inode_item);
1017
1018 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
1019 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
1020 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
1021 inode->i_gid = btrfs_inode_gid(leaf, inode_item);
1022 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
1023
1024 tspec = btrfs_inode_atime(inode_item);
1025 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1026 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1027
1028 tspec = btrfs_inode_mtime(inode_item);
1029 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1030 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1031
1032 tspec = btrfs_inode_ctime(inode_item);
1033 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1034 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1035
1036 inode->i_blocks = btrfs_inode_nblocks(leaf, inode_item);
1037 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
1038 inode->i_generation = BTRFS_I(inode)->generation;
1039 inode->i_rdev = 0;
1040 rdev = btrfs_inode_rdev(leaf, inode_item);
1041
1042 BTRFS_I(inode)->index_cnt = (u64)-1;
1043
1044 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
1045 BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
1046 alloc_group_block);
1047 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
1048 if (!BTRFS_I(inode)->block_group) {
1049 BTRFS_I(inode)->block_group = btrfs_find_block_group(root,
1050 NULL, 0,
1051 BTRFS_BLOCK_GROUP_METADATA, 0);
1052 }
1053 btrfs_free_path(path);
1054 inode_item = NULL;
1055
1056 switch (inode->i_mode & S_IFMT) {
1057 case S_IFREG:
1058 inode->i_mapping->a_ops = &btrfs_aops;
1059 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
1060 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
1061 inode->i_fop = &btrfs_file_operations;
1062 inode->i_op = &btrfs_file_inode_operations;
1063 break;
1064 case S_IFDIR:
1065 inode->i_fop = &btrfs_dir_file_operations;
1066 if (root == root->fs_info->tree_root)
1067 inode->i_op = &btrfs_dir_ro_inode_operations;
1068 else
1069 inode->i_op = &btrfs_dir_inode_operations;
1070 break;
1071 case S_IFLNK:
1072 inode->i_op = &btrfs_symlink_inode_operations;
1073 inode->i_mapping->a_ops = &btrfs_symlink_aops;
1074 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
1075 break;
1076 default:
1077 init_special_inode(inode, inode->i_mode, rdev);
1078 break;
1079 }
1080 return;
1081
1082make_bad:
1083 btrfs_free_path(path);
1084 make_bad_inode(inode);
1085}
1086
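/*
 * copy everything from the in-memory VFS inode into the btree
 * inode item.
 */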
1087static void fill_inode_item(struct btrfs_trans_handle *trans,
1088 struct extent_buffer *leaf,
1089 struct btrfs_inode_item *item,
1090 struct inode *inode)
1091{
1092 btrfs_set_inode_uid(leaf, item, inode->i_uid);
1093 btrfs_set_inode_gid(leaf, item, inode->i_gid);
1094 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
1095 btrfs_set_inode_mode(leaf, item, inode->i_mode);
1096 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
1097
1098 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
1099 inode->i_atime.tv_sec);
1100 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
1101 inode->i_atime.tv_nsec);
1102
1103 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
1104 inode->i_mtime.tv_sec);
1105 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
1106 inode->i_mtime.tv_nsec);
1107
1108 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
1109 inode->i_ctime.tv_sec);
1110 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
1111 inode->i_ctime.tv_nsec);
1112
1113 btrfs_set_inode_nblocks(leaf, item, inode->i_blocks);
1114 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
1115 btrfs_set_inode_transid(leaf, item, trans->transid);
1116 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
1117 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
1118 btrfs_set_inode_block_group(leaf, item,
1119 BTRFS_I(inode)->block_group->key.objectid);
1120}
1121
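/*
 * copy the in-memory inode back out to its inode item in the btree
 * and mark the leaf dirty.
 */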
1122noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
1123 struct btrfs_root *root,
1124 struct inode *inode)
1125{
1126 struct btrfs_inode_item *inode_item;
1127 struct btrfs_path *path;
1128 struct extent_buffer *leaf;
1129 int ret;
1130
1131 path = btrfs_alloc_path();
1132 BUG_ON(!path);
1133 ret = btrfs_lookup_inode(trans, root, path,
1134 &BTRFS_I(inode)->location, 1);
1135 if (ret) {
1136 if (ret > 0)
1137 ret = -ENOENT;
1138 goto failed;
1139 }
1140
1141 leaf = path->nodes[0];
1142 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1143 struct btrfs_inode_item);
1144
1145 fill_inode_item(trans, leaf, inode_item, inode);
1146 btrfs_mark_buffer_dirty(leaf);
1147 btrfs_set_inode_last_trans(trans, inode);
1148 ret = 0;
1149failed:
1150 btrfs_free_path(path);
1151 return ret;
1152}
1153
1154
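/*
 * helper that removes one name from a directory: it deletes the dir
 * item and the matching dir index entry, drops the inode ref, keeps
 * the tree log in sync and updates both the directory and the inode.
 */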
1155int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
1156 struct btrfs_root *root,
1157 struct inode *dir, struct inode *inode,
1158 const char *name, int name_len)
1159{
1160 struct btrfs_path *path;
1161 int ret = 0;
1162 struct extent_buffer *leaf;
1163 struct btrfs_dir_item *di;
1164 struct btrfs_key key;
1165 u64 index;
1166
1167 path = btrfs_alloc_path();
1168 if (!path) {
1169 ret = -ENOMEM;
1170 goto err;
1171 }
1172
1173 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
1174 name, name_len, -1);
1175 if (IS_ERR(di)) {
1176 ret = PTR_ERR(di);
1177 goto err;
1178 }
1179 if (!di) {
1180 ret = -ENOENT;
1181 goto err;
1182 }
1183 leaf = path->nodes[0];
1184 btrfs_dir_item_key_to_cpu(leaf, di, &key);
1185 ret = btrfs_delete_one_dir_name(trans, root, path, di);
1186 if (ret)
1187 goto err;
1188 btrfs_release_path(root, path);
1189
1190 ret = btrfs_del_inode_ref(trans, root, name, name_len,
1191 inode->i_ino,
1192 dir->i_ino, &index);
1193 if (ret) {
1194		printk(KERN_INFO "failed to delete reference to %.*s, "
1195 "inode %lu parent %lu\n", name_len, name,
1196 inode->i_ino, dir->i_ino);
1197 goto err;
1198 }
1199
1200 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
1201 index, name, name_len, -1);
1202 if (IS_ERR(di)) {
1203 ret = PTR_ERR(di);
1204 goto err;
1205 }
1206 if (!di) {
1207 ret = -ENOENT;
1208 goto err;
1209 }
1210 ret = btrfs_delete_one_dir_name(trans, root, path, di);
1211 btrfs_release_path(root, path);
1212
1213 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
1214 inode, dir->i_ino);
1215 BUG_ON(ret != 0 && ret != -ENOENT);
1216 if (ret != -ENOENT)
1217 BTRFS_I(dir)->log_dirty_trans = trans->transid;
1218
1219 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
1220 dir, index);
1221 BUG_ON(ret);
1222err:
1223 btrfs_free_path(path);
1224 if (ret)
1225 goto out;
1226
1227 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
1228 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
1229 btrfs_update_inode(trans, root, dir);
1230 btrfs_drop_nlink(inode);
1231 ret = btrfs_update_inode(trans, root, inode);
1232 dir->i_sb->s_dirt = 1;
1233out:
1234 return ret;
1235}
1236
1237static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
1238{
1239 struct btrfs_root *root;
1240 struct btrfs_trans_handle *trans;
1241 struct inode *inode = dentry->d_inode;
1242 int ret;
1243 unsigned long nr = 0;
1244
1245 root = BTRFS_I(dir)->root;
1246
1247 ret = btrfs_check_free_space(root, 1, 1);
1248 if (ret)
1249 goto fail;
1250
1251 trans = btrfs_start_transaction(root, 1);
1252
1253 btrfs_set_trans_block_group(trans, dir);
1254 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
1255 dentry->d_name.name, dentry->d_name.len);
1256
1257 if (inode->i_nlink == 0)
1258 ret = btrfs_orphan_add(trans, inode);
1259
1260 nr = trans->blocks_used;
1261
1262 btrfs_end_transaction_throttle(trans, root);
1263fail:
1264 btrfs_btree_balance_dirty(root, nr);
1265 return ret;
1266}
1267
1268static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
1269{
1270 struct inode *inode = dentry->d_inode;
1271 int err = 0;
1272 int ret;
1273 struct btrfs_root *root = BTRFS_I(dir)->root;
1274 struct btrfs_trans_handle *trans;
1275 unsigned long nr = 0;
1276
1277	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
1278		return -ENOTEMPTY;
1280
1281 ret = btrfs_check_free_space(root, 1, 1);
1282 if (ret)
1283 goto fail;
1284
1285 trans = btrfs_start_transaction(root, 1);
1286 btrfs_set_trans_block_group(trans, dir);
1287
1288 err = btrfs_orphan_add(trans, inode);
1289 if (err)
1290 goto fail_trans;
1291
1292 /* now the directory is empty */
1293 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
1294 dentry->d_name.name, dentry->d_name.len);
1295	if (!err)
1296		btrfs_i_size_write(inode, 0);
1298
1299fail_trans:
1300 nr = trans->blocks_used;
1301 ret = btrfs_end_transaction_throttle(trans, root);
1302fail:
1303 btrfs_btree_balance_dirty(root, nr);
1304
1305 if (ret && !err)
1306 err = ret;
1307 return err;
1308}
1309
1310/*
1311 * this can truncate away extent items, csum items and directory items.
1312 * It starts at a high offset and removes keys until it can't find
1313 * any higher than i_size.
1314 *
1315 * csum items that cross the new i_size are truncated to the new size
1316 * as well.
1317 *
1318 * min_type is the minimum key type to truncate down to. If set to 0, this
1319 * will kill all the items on this inode, including the INODE_ITEM_KEY.
1320 */
1321noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
1322 struct btrfs_root *root,
1323 struct inode *inode,
1324 u64 new_size, u32 min_type)
1325{
1326 int ret;
1327 struct btrfs_path *path;
1328 struct btrfs_key key;
1329 struct btrfs_key found_key;
1330 u32 found_type;
1331 struct extent_buffer *leaf;
1332 struct btrfs_file_extent_item *fi;
1333 u64 extent_start = 0;
1334 u64 extent_num_bytes = 0;
1335 u64 item_end = 0;
1336 u64 root_gen = 0;
1337 u64 root_owner = 0;
1338 int found_extent;
1339 int del_item;
1340 int pending_del_nr = 0;
1341 int pending_del_slot = 0;
1342 int extent_type = -1;
1343 u64 mask = root->sectorsize - 1;
1344
1345 if (root->ref_cows)
1346 btrfs_drop_extent_cache(inode,
1347 new_size & (~mask), (u64)-1);
1348	path = btrfs_alloc_path();
1349	BUG_ON(!path);
1350	path->reada = -1;
1351
1352 /* FIXME, add redo link to tree so we don't leak on crash */
1353 key.objectid = inode->i_ino;
1354 key.offset = (u64)-1;
1355 key.type = (u8)-1;
1356
1357 btrfs_init_path(path);
1358search_again:
1359 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1360 if (ret < 0) {
1361 goto error;
1362 }
1363 if (ret > 0) {
1364 /* there are no items in the tree for us to truncate, we're
1365 * done
1366 */
1367 if (path->slots[0] == 0) {
1368 ret = 0;
1369 goto error;
1370 }
1371 path->slots[0]--;
1372 }
1373
1374	while (1) {
1375 fi = NULL;
1376 leaf = path->nodes[0];
1377 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1378 found_type = btrfs_key_type(&found_key);
1379
1380 if (found_key.objectid != inode->i_ino)
1381 break;
1382
1383 if (found_type < min_type)
1384 break;
1385
1386 item_end = found_key.offset;
1387 if (found_type == BTRFS_EXTENT_DATA_KEY) {
1388 fi = btrfs_item_ptr(leaf, path->slots[0],
1389 struct btrfs_file_extent_item);
1390 extent_type = btrfs_file_extent_type(leaf, fi);
1391 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
1392 item_end +=
1393 btrfs_file_extent_num_bytes(leaf, fi);
1394			} else {
1395 struct btrfs_item *item = btrfs_item_nr(leaf,
1396 path->slots[0]);
1397 item_end += btrfs_file_extent_inline_len(leaf,
1398 item);
1399 }
1400 item_end--;
1401 }
1402 if (found_type == BTRFS_CSUM_ITEM_KEY) {
1403 ret = btrfs_csum_truncate(trans, root, path,
1404 new_size);
1405 BUG_ON(ret);
1406 }
1407 if (item_end < new_size) {
1408 if (found_type == BTRFS_DIR_ITEM_KEY) {
1409 found_type = BTRFS_INODE_ITEM_KEY;
1410 } else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
1411 found_type = BTRFS_CSUM_ITEM_KEY;
1412 } else if (found_type == BTRFS_EXTENT_DATA_KEY) {
1413 found_type = BTRFS_XATTR_ITEM_KEY;
1414 } else if (found_type == BTRFS_XATTR_ITEM_KEY) {
1415 found_type = BTRFS_INODE_REF_KEY;
1416 } else if (found_type) {
1417 found_type--;
1418 } else {
1419 break;
1420 }
1421 btrfs_set_key_type(&key, found_type);
1422 goto next;
1423 }
1424 if (found_key.offset >= new_size)
1425 del_item = 1;
1426 else
1427 del_item = 0;
1428 found_extent = 0;
1429
1430 /* FIXME, shrink the extent if the ref count is only 1 */
1431 if (found_type != BTRFS_EXTENT_DATA_KEY)
1432 goto delete;
1433
1434 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
1435 u64 num_dec;
1436 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
1437 if (!del_item) {
1438 u64 orig_num_bytes =
1439 btrfs_file_extent_num_bytes(leaf, fi);
1440 extent_num_bytes = new_size -
1441 found_key.offset + root->sectorsize - 1;
1442 extent_num_bytes = extent_num_bytes &
1443 ~((u64)root->sectorsize - 1);
1444 btrfs_set_file_extent_num_bytes(leaf, fi,
1445 extent_num_bytes);
1446 num_dec = (orig_num_bytes -
1447 extent_num_bytes);
1448 if (root->ref_cows && extent_start != 0)
1449 dec_i_blocks(inode, num_dec);
1450 btrfs_mark_buffer_dirty(leaf);
1451 } else {
1452 extent_num_bytes =
1453 btrfs_file_extent_disk_num_bytes(leaf,
1454 fi);
1455 /* FIXME blocksize != 4096 */
1456 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
1457 if (extent_start != 0) {
1458 found_extent = 1;
1459 if (root->ref_cows)
1460 dec_i_blocks(inode, num_dec);
1461 }
1462 root_gen = btrfs_header_generation(leaf);
1463 root_owner = btrfs_header_owner(leaf);
1464 }
1465 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1466 if (!del_item) {
1467 u32 size = new_size - found_key.offset;
1468
1469 if (root->ref_cows) {
1470 dec_i_blocks(inode, item_end + 1 -
1471 found_key.offset - size);
1472 }
1473 size =
1474 btrfs_file_extent_calc_inline_size(size);
1475 ret = btrfs_truncate_item(trans, root, path,
1476 size, 1);
1477 BUG_ON(ret);
1478 } else if (root->ref_cows) {
1479 dec_i_blocks(inode, item_end + 1 -
1480 found_key.offset);
1481 }
1482 }
1483delete:
1484 if (del_item) {
1485 if (!pending_del_nr) {
1486 /* no pending yet, add ourselves */
1487 pending_del_slot = path->slots[0];
1488 pending_del_nr = 1;
1489 } else if (pending_del_nr &&
1490 path->slots[0] + 1 == pending_del_slot) {
1491 /* hop on the pending chunk */
1492 pending_del_nr++;
1493 pending_del_slot = path->slots[0];
1494 } else {
1495				printk(KERN_ERR "bad pending slot %d pending_del_nr %d "
				       "pending_del_slot %d\n", path->slots[0],
				       pending_del_nr, pending_del_slot);
1496 }
1497 } else {
1498 break;
1499 }
1500 if (found_extent) {
1501 ret = btrfs_free_extent(trans, root, extent_start,
1502 extent_num_bytes,
1503 leaf->start, root_owner,
1504 root_gen, inode->i_ino,
1505 found_key.offset, 0);
1506 BUG_ON(ret);
1507 }
1508next:
1509 if (path->slots[0] == 0) {
1510 if (pending_del_nr)
1511 goto del_pending;
1512 btrfs_release_path(root, path);
1513 goto search_again;
1514 }
1515
1516 path->slots[0]--;
1517 if (pending_del_nr &&
1518 path->slots[0] + 1 != pending_del_slot) {
1519 struct btrfs_key debug;
1520del_pending:
1521 btrfs_item_key_to_cpu(path->nodes[0], &debug,
1522 pending_del_slot);
1523 ret = btrfs_del_items(trans, root, path,
1524 pending_del_slot,
1525 pending_del_nr);
1526 BUG_ON(ret);
1527 pending_del_nr = 0;
1528 btrfs_release_path(root, path);
1529 goto search_again;
1530 }
1531 }
1532 ret = 0;
1533error:
1534 if (pending_del_nr) {
1535 ret = btrfs_del_items(trans, root, path, pending_del_slot,
1536 pending_del_nr);
1537 }
1538 btrfs_free_path(path);
1539 inode->i_sb->s_dirt = 1;
1540 return ret;
1541}
1542
1543/*
1544 * taken from block_truncate_page, but does cow as it zeros out
1545 * any bytes left in the last page in the file.
1546 */
1547static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
1548{
1549 struct inode *inode = mapping->host;
1550 struct btrfs_root *root = BTRFS_I(inode)->root;
1551 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1552 struct btrfs_ordered_extent *ordered;
1553 char *kaddr;
1554 u32 blocksize = root->sectorsize;
1555 pgoff_t index = from >> PAGE_CACHE_SHIFT;
1556 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1557 struct page *page;
1558 int ret = 0;
1559 u64 page_start;
1560 u64 page_end;
1561
1562 if ((offset & (blocksize - 1)) == 0)
1563 goto out;
1564
1565 ret = -ENOMEM;
1566again:
1567 page = grab_cache_page(mapping, index);
1568 if (!page)
1569 goto out;
1570
1571 page_start = page_offset(page);
1572 page_end = page_start + PAGE_CACHE_SIZE - 1;
1573
1574 if (!PageUptodate(page)) {
1575 ret = btrfs_readpage(NULL, page);
1576 lock_page(page);
1577 if (page->mapping != mapping) {
1578 unlock_page(page);
1579 page_cache_release(page);
1580 goto again;
1581 }
1582 if (!PageUptodate(page)) {
1583 ret = -EIO;
1584 goto out_unlock;
1585 }
1586 }
1587 wait_on_page_writeback(page);
1588
1589 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
1590 set_page_extent_mapped(page);
1591
1592 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1593 if (ordered) {
1594 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
1595 unlock_page(page);
1596 page_cache_release(page);
1597 btrfs_start_ordered_extent(inode, ordered, 1);
1598 btrfs_put_ordered_extent(ordered);
1599 goto again;
1600 }
1601
1602 btrfs_set_extent_delalloc(inode, page_start, page_end);
1603 ret = 0;
1604 if (offset != PAGE_CACHE_SIZE) {
1605 kaddr = kmap(page);
1606 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
1607 flush_dcache_page(page);
1608 kunmap(page);
1609 }
1610 ClearPageChecked(page);
1611 set_page_dirty(page);
1612 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
1613
1614out_unlock:
1615 unlock_page(page);
1616 page_cache_release(page);
1617out:
1618 return ret;
1619}
1620
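/*
 * setattr handler.  Truncates that extend a regular file have to zero
 * the tail of the last page and insert a hole extent covering the new
 * range before i_size is updated; everything else falls through to
 * inode_setattr.
 */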
1621static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
1622{
1623 struct inode *inode = dentry->d_inode;
1624 int err;
1625
1626 err = inode_change_ok(inode, attr);
1627 if (err)
1628 return err;
1629
1630 if (S_ISREG(inode->i_mode) &&
1631 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
1632 struct btrfs_trans_handle *trans;
1633 struct btrfs_root *root = BTRFS_I(inode)->root;
1634 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1635
1636 u64 mask = root->sectorsize - 1;
1637 u64 hole_start = (inode->i_size + mask) & ~mask;
1638 u64 block_end = (attr->ia_size + mask) & ~mask;
1639 u64 hole_size;
1640 u64 alloc_hint = 0;
1641
1642 if (attr->ia_size <= hole_start)
1643 goto out;
1644
1645 err = btrfs_check_free_space(root, 1, 0);
1646 if (err)
1647 goto fail;
1648
1649 btrfs_truncate_page(inode->i_mapping, inode->i_size);
1650
1651 hole_size = block_end - hole_start;
1652		while (1) {
1653 struct btrfs_ordered_extent *ordered;
1654 btrfs_wait_ordered_range(inode, hole_start, hole_size);
1655
1656 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
1657 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
1658 if (ordered) {
1659 unlock_extent(io_tree, hole_start,
1660 block_end - 1, GFP_NOFS);
1661 btrfs_put_ordered_extent(ordered);
1662 } else {
1663 break;
1664 }
1665 }
1666
1667 trans = btrfs_start_transaction(root, 1);
1668 btrfs_set_trans_block_group(trans, inode);
1669 mutex_lock(&BTRFS_I(inode)->extent_mutex);
1670 err = btrfs_drop_extents(trans, root, inode,
1671 hole_start, block_end, hole_start,
1672 &alloc_hint);
1673
1674 if (alloc_hint != EXTENT_MAP_INLINE) {
1675 err = btrfs_insert_file_extent(trans, root,
1676 inode->i_ino,
1677 hole_start, 0, 0,
1678 hole_size, 0);
1679 btrfs_drop_extent_cache(inode, hole_start,
1680 (u64)-1);
1681 btrfs_check_file(root, inode);
1682 }
1683 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
1684 btrfs_end_transaction(trans, root);
1685 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
1686 if (err)
1687 return err;
1688 }
1689out:
1690 err = inode_setattr(inode, attr);
1691
1692	if (!err && (attr->ia_valid & ATTR_MODE))
1693 err = btrfs_acl_chmod(inode);
1694fail:
1695 return err;
1696}
1697
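/*
 * called when the inode is finally being freed: wait for any pending
 * ordered IO, truncate away every item the inode owns and remove its
 * orphan record.
 */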
1698void btrfs_delete_inode(struct inode *inode)
1699{
1700 struct btrfs_trans_handle *trans;
1701 struct btrfs_root *root = BTRFS_I(inode)->root;
1702 unsigned long nr;
1703 int ret;
1704
1705 truncate_inode_pages(&inode->i_data, 0);
1706 if (is_bad_inode(inode)) {
1707 btrfs_orphan_del(NULL, inode);
1708 goto no_delete;
1709 }
1710 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1711
1712 btrfs_i_size_write(inode, 0);
1713 trans = btrfs_start_transaction(root, 1);
1714
1715 btrfs_set_trans_block_group(trans, inode);
1716 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
1717 if (ret) {
1718 btrfs_orphan_del(NULL, inode);
1719 goto no_delete_lock;
1720 }
1721
1722 btrfs_orphan_del(trans, inode);
1723
1724 nr = trans->blocks_used;
1725 clear_inode(inode);
1726
1727 btrfs_end_transaction(trans, root);
1728 btrfs_btree_balance_dirty(root, nr);
1729 return;
1730
1731no_delete_lock:
1732 nr = trans->blocks_used;
1733 btrfs_end_transaction(trans, root);
1734 btrfs_btree_balance_dirty(root, nr);
1735no_delete:
1736 clear_inode(inode);
1737}
1738
1739/*
1740 * this returns the key found in the dir entry in the location pointer.
1741 * If no dir entries were found, location->objectid is 0.
1742 */
1743static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
1744 struct btrfs_key *location)
1745{
1746 const char *name = dentry->d_name.name;
1747 int namelen = dentry->d_name.len;
1748 struct btrfs_dir_item *di;
1749 struct btrfs_path *path;
1750 struct btrfs_root *root = BTRFS_I(dir)->root;
1751 int ret = 0;
1752
1753 path = btrfs_alloc_path();
1754 BUG_ON(!path);
1755
1756 di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
1757 namelen, 0);
1758 if (IS_ERR(di))
1759 ret = PTR_ERR(di);
1760	if (!di || IS_ERR(di))
1761		goto out_err;
1763 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
1764out:
1765 btrfs_free_path(path);
1766 return ret;
1767out_err:
1768 location->objectid = 0;
1769 goto out;
1770}
1771
1772/*
1773 * when we hit a tree root in a directory, the btrfs part of the inode
1774 * needs to be changed to reflect the root directory of the tree root. This
1775 * is kind of like crossing a mount point.
1776 */
1777static int fixup_tree_root_location(struct btrfs_root *root,
1778 struct btrfs_key *location,
1779 struct btrfs_root **sub_root,
1780 struct dentry *dentry)
1781{
1782 struct btrfs_root_item *ri;
1783
1784 if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
1785 return 0;
1786 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
1787 return 0;
1788
1789 *sub_root = btrfs_read_fs_root(root->fs_info, location,
1790 dentry->d_name.name,
1791 dentry->d_name.len);
1792 if (IS_ERR(*sub_root))
1793 return PTR_ERR(*sub_root);
1794
1795 ri = &(*sub_root)->root_item;
1796 location->objectid = btrfs_root_dirid(ri);
1797 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
1798 location->offset = 0;
1799
1800 return 0;
1801}
1802
1803static noinline void init_btrfs_i(struct inode *inode)
1804{
1805 struct btrfs_inode *bi = BTRFS_I(inode);
1806
1807 bi->i_acl = NULL;
1808 bi->i_default_acl = NULL;
1809
1810 bi->generation = 0;
1811 bi->last_trans = 0;
1812 bi->logged_trans = 0;
1813 bi->delalloc_bytes = 0;
1814 bi->disk_i_size = 0;
1815 bi->flags = 0;
1816 bi->index_cnt = (u64)-1;
1817 bi->log_dirty_trans = 0;
1818 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
1819 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
1820 inode->i_mapping, GFP_NOFS);
1821 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
1822 inode->i_mapping, GFP_NOFS);
1823 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
1824 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
1825 mutex_init(&BTRFS_I(inode)->csum_mutex);
1826 mutex_init(&BTRFS_I(inode)->extent_mutex);
1827 mutex_init(&BTRFS_I(inode)->log_mutex);
1828}
1829
1830static int btrfs_init_locked_inode(struct inode *inode, void *p)
1831{
1832 struct btrfs_iget_args *args = p;
1833 inode->i_ino = args->ino;
1834 init_btrfs_i(inode);
1835 BTRFS_I(inode)->root = args->root;
1836 return 0;
1837}
1838
1839static int btrfs_find_actor(struct inode *inode, void *opaque)
1840{
1841 struct btrfs_iget_args *args = opaque;
1842 return (args->ino == inode->i_ino &&
1843 args->root == BTRFS_I(inode)->root);
1844}
1845
1846struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
1847 struct btrfs_root *root)
1848{
1849 struct inode *inode;
1850 struct btrfs_iget_args args;
1851 args.ino = objectid;
1852 args.root = root;
1853
1854 inode = iget5_locked(s, objectid, btrfs_find_actor,
1855 btrfs_init_locked_inode,
1856 (void *)&args);
1857 return inode;
1858}
1859
1860/* Get an inode object given its location and corresponding root.
1861 * Returns in *is_new if the inode was read from disk
1862 */
1863struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
1864 struct btrfs_root *root, int *is_new)
1865{
1866 struct inode *inode;
1867
1868 inode = btrfs_iget_locked(s, location->objectid, root);
1869 if (!inode)
1870 return ERR_PTR(-EACCES);
1871
1872 if (inode->i_state & I_NEW) {
1873 BTRFS_I(inode)->root = root;
1874 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
1875 btrfs_read_locked_inode(inode);
1876 unlock_new_inode(inode);
1877 if (is_new)
1878 *is_new = 1;
1879 } else {
1880 if (is_new)
1881 *is_new = 0;
1882 }
1883
1884 return inode;
1885}
1886
1887static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
1888 struct nameidata *nd)
1889{
1890	struct inode *inode;
1891 struct btrfs_inode *bi = BTRFS_I(dir);
1892 struct btrfs_root *root = bi->root;
1893 struct btrfs_root *sub_root = root;
1894 struct btrfs_key location;
1895 int ret, new, do_orphan = 0;
1896
1897 if (dentry->d_name.len > BTRFS_NAME_LEN)
1898 return ERR_PTR(-ENAMETOOLONG);
1899
1900 ret = btrfs_inode_by_name(dir, dentry, &location);
1901
1902 if (ret < 0)
1903 return ERR_PTR(ret);
1904
1905 inode = NULL;
1906 if (location.objectid) {
1907 ret = fixup_tree_root_location(root, &location, &sub_root,
1908 dentry);
1909 if (ret < 0)
1910 return ERR_PTR(ret);
1911 if (ret > 0)
1912 return ERR_PTR(-ENOENT);
1913 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
1914 if (IS_ERR(inode))
1915 return ERR_CAST(inode);
1916
1917 /* the inode and parent dir are two different roots */
1918 if (new && root != sub_root) {
1919 igrab(inode);
1920 sub_root->inode = inode;
1921 do_orphan = 1;
1922 }
1923 }
1924
1925 if (unlikely(do_orphan))
1926 btrfs_orphan_cleanup(sub_root);
1927
1928 return d_splice_alias(inode, dentry);
1929}
1930
1931static unsigned char btrfs_filetype_table[] = {
1932 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
1933};
1934
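/*
 * readdir: walk the dir index keys (dir item keys for the tree root)
 * for this inode starting at f_pos and hand each name to filldir.
 */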
1935static int btrfs_real_readdir(struct file *filp, void *dirent,
1936 filldir_t filldir)
1937{
1938 struct inode *inode = filp->f_dentry->d_inode;
1939 struct btrfs_root *root = BTRFS_I(inode)->root;
1940 struct btrfs_item *item;
1941 struct btrfs_dir_item *di;
1942 struct btrfs_key key;
1943 struct btrfs_key found_key;
1944 struct btrfs_path *path;
1945 int ret;
1946 u32 nritems;
1947 struct extent_buffer *leaf;
1948 int slot;
1949 int advance;
1950 unsigned char d_type;
1951 int over = 0;
1952 u32 di_cur;
1953 u32 di_total;
1954 u32 di_len;
1955 int key_type = BTRFS_DIR_INDEX_KEY;
1956 char tmp_name[32];
1957 char *name_ptr;
1958 int name_len;
1959
1960 /* FIXME, use a real flag for deciding about the key type */
1961 if (root->fs_info->tree_root == root)
1962 key_type = BTRFS_DIR_ITEM_KEY;
1963
1964 /* special case for "." */
1965 if (filp->f_pos == 0) {
1966 over = filldir(dirent, ".", 1,
1967 1, inode->i_ino,
1968 DT_DIR);
1969 if (over)
1970 return 0;
1971 filp->f_pos = 1;
1972 }
1973 /* special case for .., just use the back ref */
1974 if (filp->f_pos == 1) {
1975 u64 pino = parent_ino(filp->f_path.dentry);
1976 over = filldir(dirent, "..", 2,
1977 2, pino, DT_DIR);
1978 if (over)
1979 return 0;
1980 filp->f_pos = 2;
1981 }
1982
1983	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
1984	path->reada = 2;
1985
1986 btrfs_set_key_type(&key, key_type);
1987 key.offset = filp->f_pos;
1988 key.objectid = inode->i_ino;
1989
1990 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1991 if (ret < 0)
1992 goto err;
1993 advance = 0;
1994
1995 while (1) {
1996 leaf = path->nodes[0];
1997 nritems = btrfs_header_nritems(leaf);
1998 slot = path->slots[0];
1999 if (advance || slot >= nritems) {
2000 if (slot >= nritems - 1) {
2001 ret = btrfs_next_leaf(root, path);
2002 if (ret)
2003 break;
2004 leaf = path->nodes[0];
2005 nritems = btrfs_header_nritems(leaf);
2006 slot = path->slots[0];
2007 } else {
2008 slot++;
2009 path->slots[0]++;
2010 }
2011 }
2012 advance = 1;
2013 item = btrfs_item_nr(leaf, slot);
2014 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2015
2016 if (found_key.objectid != key.objectid)
2017 break;
2018 if (btrfs_key_type(&found_key) != key_type)
2019 break;
2020 if (found_key.offset < filp->f_pos)
2021 continue;
2022
2023 filp->f_pos = found_key.offset;
2024
2025 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
2026 di_cur = 0;
2027 di_total = btrfs_item_size(leaf, item);
2028
2029 while (di_cur < di_total) {
2030 struct btrfs_key location;
2031
2032 name_len = btrfs_dir_name_len(leaf, di);
2033 if (name_len <= sizeof(tmp_name)) {
2034 name_ptr = tmp_name;
2035 } else {
2036 name_ptr = kmalloc(name_len, GFP_NOFS);
2037 if (!name_ptr) {
2038 ret = -ENOMEM;
2039 goto err;
2040 }
2041 }
2042 read_extent_buffer(leaf, name_ptr,
2043 (unsigned long)(di + 1), name_len);
2044
2045 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
2046 btrfs_dir_item_key_to_cpu(leaf, di, &location);
2047 over = filldir(dirent, name_ptr, name_len,
2048 found_key.offset, location.objectid,
2049 d_type);
2050
2051 if (name_ptr != tmp_name)
2052 kfree(name_ptr);
2053
2054 if (over)
2055 goto nopos;
2056
2057 di_len = btrfs_dir_name_len(leaf, di) +
2058 btrfs_dir_data_len(leaf, di) + sizeof(*di);
2059 di_cur += di_len;
2060 di = (struct btrfs_dir_item *)((char *)di + di_len);
2061 }
2062 }
2063
2064 /* Reached end of directory/root. Bump pos past the last item. */
2065 if (key_type == BTRFS_DIR_INDEX_KEY)
2066 filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
2067 else
2068 filp->f_pos++;
2069nopos:
2070 ret = 0;
2071err:
2072 btrfs_free_path(path);
2073 return ret;
2074}
2075
2076/* Kernels earlier than 2.6.28 still have the NFS deadlock where nfsd
2077 will call the file system's ->lookup() method from within its
2078 filldir callback, which in turn was called from the file system's
2079   ->readdir() method, and this will deadlock for many file systems. */
2080#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
2081
2082struct nfshack_dirent {
2083 u64 ino;
2084 loff_t offset;
2085 int namlen;
2086 unsigned int d_type;
2087 char name[];
2088};
2089
2090struct nfshack_readdir {
2091 char *dirent;
2092 size_t used;
2093 int full;
2094};
2095
2098static int btrfs_nfshack_filldir(void *__buf, const char *name, int namlen,
2099 loff_t offset, u64 ino, unsigned int d_type)
2100{
2101 struct nfshack_readdir *buf = __buf;
2102 struct nfshack_dirent *de = (void *)(buf->dirent + buf->used);
2103 unsigned int reclen;
2104
2105 reclen = ALIGN(sizeof(struct nfshack_dirent) + namlen, sizeof(u64));
2106 if (buf->used + reclen > PAGE_SIZE) {
2107 buf->full = 1;
2108 return -EINVAL;
2109 }
2110
2111 de->namlen = namlen;
2112 de->offset = offset;
2113 de->ino = ino;
2114 de->d_type = d_type;
2115 memcpy(de->name, name, namlen);
2116 buf->used += reclen;
2117
2118 return 0;
2119}
2120
2121static int btrfs_nfshack_readdir(struct file *file, void *dirent,
2122 filldir_t filldir)
2123{
2124 struct nfshack_readdir buf;
2125 struct nfshack_dirent *de;
2126 int err;
2127 int size;
2128 loff_t offset;
2129
2130 buf.dirent = (void *)__get_free_page(GFP_KERNEL);
2131 if (!buf.dirent)
2132 return -ENOMEM;
2133
2134 offset = file->f_pos;
2135
2136 do {
2137 unsigned int reclen;
2138
2139 buf.used = 0;
2140 buf.full = 0;
2141 err = btrfs_real_readdir(file, &buf, btrfs_nfshack_filldir);
2142 if (err)
2143 break;
2144
2145 size = buf.used;
2146
2147 if (!size)
2148 break;
2149
2150 de = (struct nfshack_dirent *)buf.dirent;
2151 while (size > 0) {
2152 offset = de->offset;
2153
2154 if (filldir(dirent, de->name, de->namlen, de->offset,
2155 de->ino, de->d_type))
2156 goto done;
2157 offset = file->f_pos;
2158
2159 reclen = ALIGN(sizeof(*de) + de->namlen,
2160 sizeof(u64));
2161 size -= reclen;
2162 de = (struct nfshack_dirent *)((char *)de + reclen);
2163 }
2164 } while (buf.full);
2165
2166 done:
2167 free_page((unsigned long)buf.dirent);
2168 file->f_pos = offset;
2169
2170 return err;
2171}
2172#endif
2173
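/*
 * btrfs metadata only goes to disk via transaction commits, so a
 * synchronous ->write_inode just joins and commits the running
 * transaction.
 */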
2174int btrfs_write_inode(struct inode *inode, int wait)
2175{
2176 struct btrfs_root *root = BTRFS_I(inode)->root;
2177 struct btrfs_trans_handle *trans;
2178 int ret = 0;
2179
2180 if (root->fs_info->closing > 1)
2181 return 0;
2182
2183 if (wait) {
2184 trans = btrfs_join_transaction(root, 1);
2185 btrfs_set_trans_block_group(trans, inode);
2186 ret = btrfs_commit_transaction(trans, root);
2187 }
2188 return ret;
2189}
2190
2191/*
2192 * This is somewhat expensive, updating the tree every time the
2193 * inode changes. But, it is most likely to find the inode in cache.
2194 * FIXME, needs more benchmarking...there are no reasons other than performance
2195 * to keep or drop this code.
2196 */
2197void btrfs_dirty_inode(struct inode *inode)
2198{
2199 struct btrfs_root *root = BTRFS_I(inode)->root;
2200 struct btrfs_trans_handle *trans;
2201
2202 trans = btrfs_join_transaction(root, 1);
2203 btrfs_set_trans_block_group(trans, inode);
2204 btrfs_update_inode(trans, root, inode);
2205 btrfs_end_transaction(trans, root);
2206}
2207
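/*
 * find the highest existing dir index key for this directory so new
 * entries get sequential index numbers; the next value is cached in
 * index_cnt.
 */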
2208static int btrfs_set_inode_index_count(struct inode *inode)
2209{
2210 struct btrfs_root *root = BTRFS_I(inode)->root;
2211 struct btrfs_key key, found_key;
2212 struct btrfs_path *path;
2213 struct extent_buffer *leaf;
2214 int ret;
2215
2216 key.objectid = inode->i_ino;
2217 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
2218 key.offset = (u64)-1;
2219
2220 path = btrfs_alloc_path();
2221 if (!path)
2222 return -ENOMEM;
2223
2224 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2225 if (ret < 0)
2226 goto out;
2227 /* FIXME: we should be able to handle this */
2228 if (ret == 0)
2229 goto out;
2230 ret = 0;
2231
2232 /*
2233 * MAGIC NUMBER EXPLANATION:
2234	 * we search a directory based on f_pos, and '.' and '..' have
2235	 * f_pos of 0 and 1 respectively, so everybody else has to start
2236	 * at 2
2237 */
2238 if (path->slots[0] == 0) {
2239 BTRFS_I(inode)->index_cnt = 2;
2240 goto out;
2241 }
2242
2243 path->slots[0]--;
2244
2245 leaf = path->nodes[0];
2246 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2247
2248 if (found_key.objectid != inode->i_ino ||
2249 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
2250 BTRFS_I(inode)->index_cnt = 2;
2251 goto out;
2252 }
2253
2254 BTRFS_I(inode)->index_cnt = found_key.offset + 1;
2255out:
2256 btrfs_free_path(path);
2257 return ret;
2258}
2259
2260static int btrfs_set_inode_index(struct inode *dir, struct inode *inode,
2261 u64 *index)
2262{
2263 int ret = 0;
2264
2265 if (BTRFS_I(dir)->index_cnt == (u64)-1) {
2266 ret = btrfs_set_inode_index_count(dir);
2267		if (ret)
2268			return ret;
2270 }
2271
2272 *index = BTRFS_I(dir)->index_cnt;
2273 BTRFS_I(dir)->index_cnt++;
2274
2275 return ret;
2276}
2277
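/*
 * allocate a new VFS inode and insert its inode item and inode ref
 * (the backref to the parent directory) into the tree in one
 * insertion.
 */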
2278static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
2279 struct btrfs_root *root,
2280 struct inode *dir,
2281 const char *name, int name_len,
2282 u64 ref_objectid,
2283 u64 objectid,
2284 struct btrfs_block_group_cache *group,
2285 int mode, u64 *index)
2286{
2287 struct inode *inode;
2288 struct btrfs_inode_item *inode_item;
2289 struct btrfs_block_group_cache *new_inode_group;
2290 struct btrfs_key *location;
2291 struct btrfs_path *path;
2292 struct btrfs_inode_ref *ref;
2293 struct btrfs_key key[2];
2294 u32 sizes[2];
2295 unsigned long ptr;
2296 int ret;
2297 int owner;
2298
2299 path = btrfs_alloc_path();
2300 BUG_ON(!path);
2301
2302	inode = new_inode(root->fs_info->sb);
2303	if (!inode) {
		btrfs_free_path(path);
2304		return ERR_PTR(-ENOMEM);
	}
2305
2306	if (dir) {
2307		ret = btrfs_set_inode_index(dir, inode, index);
2308		if (ret) {
			btrfs_free_path(path);
			iput(inode);
2309			return ERR_PTR(ret);
		}
2310	}
2311 /*
2312 * index_cnt is ignored for everything but a dir,
2313	 * btrfs_set_inode_index_count has an explanation for the magic
2314 * number
2315 */
2316 init_btrfs_i(inode);
2317 BTRFS_I(inode)->index_cnt = 2;
2318 BTRFS_I(inode)->root = root;
2319 BTRFS_I(inode)->generation = trans->transid;
2320
2321 if (mode & S_IFDIR)
2322 owner = 0;
2323 else
2324 owner = 1;
2325 new_inode_group = btrfs_find_block_group(root, group, 0,
2326 BTRFS_BLOCK_GROUP_METADATA, owner);
2327 if (!new_inode_group) {
2328		printk(KERN_ERR "btrfs: find_block_group failed\n");
2329 new_inode_group = group;
2330 }
2331 BTRFS_I(inode)->block_group = new_inode_group;
2332
2333 key[0].objectid = objectid;
2334 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
2335 key[0].offset = 0;
2336
2337 key[1].objectid = objectid;
2338 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
2339 key[1].offset = ref_objectid;
2340
2341 sizes[0] = sizeof(struct btrfs_inode_item);
2342 sizes[1] = name_len + sizeof(*ref);
2343
2344 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
2345 if (ret != 0)
2346 goto fail;
2347
2348 if (objectid > root->highest_inode)
2349 root->highest_inode = objectid;
2350
2351 inode->i_uid = current->fsuid;
2352 inode->i_gid = current->fsgid;
2353 inode->i_mode = mode;
2354 inode->i_ino = objectid;
2355 inode->i_blocks = 0;
2356 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2357 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2358 struct btrfs_inode_item);
2359 fill_inode_item(trans, path->nodes[0], inode_item, inode);
2360
2361 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
2362 struct btrfs_inode_ref);
2363 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
2364 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
2365 ptr = (unsigned long)(ref + 1);
2366 write_extent_buffer(path->nodes[0], name, ptr, name_len);
2367
2368 btrfs_mark_buffer_dirty(path->nodes[0]);
2369 btrfs_free_path(path);
2370
2371 location = &BTRFS_I(inode)->location;
2372 location->objectid = objectid;
2373 location->offset = 0;
2374 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
2375
2376 insert_inode_hash(inode);
2377 return inode;
2378fail:
2379 if (dir)
2380 BTRFS_I(dir)->index_cnt--;
2381 btrfs_free_path(path);
2382 return ERR_PTR(ret);
2383}
2384
2385static inline u8 btrfs_inode_type(struct inode *inode)
2386{
2387 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
2388}
2389
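/*
 * link an inode into a directory: insert the dir item (and, if asked,
 * the inode backref) and grow the directory size by twice the name
 * length, once for the dir item and once for the dir index entry.
 */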
2390int btrfs_add_link(struct btrfs_trans_handle *trans,
2391 struct inode *parent_inode, struct inode *inode,
2392 const char *name, int name_len, int add_backref, u64 index)
2393{
2394 int ret;
2395 struct btrfs_key key;
2396 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
2397
2398 key.objectid = inode->i_ino;
2399 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
2400 key.offset = 0;
2401
2402 ret = btrfs_insert_dir_item(trans, root, name, name_len,
2403 parent_inode->i_ino,
2404 &key, btrfs_inode_type(inode),
2405 index);
2406 if (ret == 0) {
2407 if (add_backref) {
2408 ret = btrfs_insert_inode_ref(trans, root,
2409 name, name_len,
2410 inode->i_ino,
2411 parent_inode->i_ino,
2412 index);
2413 }
2414 btrfs_i_size_write(parent_inode, parent_inode->i_size +
2415 name_len * 2);
2416 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
2417 ret = btrfs_update_inode(trans, root, parent_inode);
2418 }
2419 return ret;
2420}
2421
2422static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
2423 struct dentry *dentry, struct inode *inode,
2424 int backref, u64 index)
2425{
2426 int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
2427 inode, dentry->d_name.name,
2428 dentry->d_name.len, backref, index);
2429 if (!err) {
2430 d_instantiate(dentry, inode);
2431 return 0;
2432 }
2433 if (err > 0)
2434 err = -EEXIST;
2435 return err;
2436}
2437
2438static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
2439 int mode, dev_t rdev)
2440{
2441 struct btrfs_trans_handle *trans;
2442 struct btrfs_root *root = BTRFS_I(dir)->root;
2443 struct inode *inode = NULL;
2444 int err;
2445 int drop_inode = 0;
2446 u64 objectid;
2447 unsigned long nr = 0;
2448 u64 index = 0;
2449
2450 if (!new_valid_dev(rdev))
2451 return -EINVAL;
2452
2453 err = btrfs_check_free_space(root, 1, 0);
2454 if (err)
2455 goto fail;
2456
2457 trans = btrfs_start_transaction(root, 1);
2458 btrfs_set_trans_block_group(trans, dir);
2459
2460 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
2461 if (err) {
2462 err = -ENOSPC;
2463 goto out_unlock;
2464 }
2465
2466 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
2467 dentry->d_name.len,
2468 dentry->d_parent->d_inode->i_ino, objectid,
2469 BTRFS_I(dir)->block_group, mode, &index);
2470 err = PTR_ERR(inode);
2471 if (IS_ERR(inode))
2472 goto out_unlock;
2473
2474 err = btrfs_init_acl(inode, dir);
2475 if (err) {
2476 drop_inode = 1;
2477 goto out_unlock;
2478 }
2479
2480 btrfs_set_trans_block_group(trans, inode);
2481 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
2482 if (err)
2483 drop_inode = 1;
2484 else {
2485 inode->i_op = &btrfs_special_inode_operations;
2486 init_special_inode(inode, inode->i_mode, rdev);
2487 btrfs_update_inode(trans, root, inode);
2488 }
2489 dir->i_sb->s_dirt = 1;
2490 btrfs_update_inode_block_group(trans, inode);
2491 btrfs_update_inode_block_group(trans, dir);
2492out_unlock:
2493 nr = trans->blocks_used;
2494 btrfs_end_transaction_throttle(trans, root);
2495fail:
2496 if (drop_inode) {
2497 inode_dec_link_count(inode);
2498 iput(inode);
2499 }
2500 btrfs_btree_balance_dirty(root, nr);
2501 return err;
2502}
2503
2504static int btrfs_create(struct inode *dir, struct dentry *dentry,
2505 int mode, struct nameidata *nd)
2506{
2507 struct btrfs_trans_handle *trans;
2508 struct btrfs_root *root = BTRFS_I(dir)->root;
2509 struct inode *inode = NULL;
2510 int err;
2511 int drop_inode = 0;
2512 unsigned long nr = 0;
2513 u64 objectid;
2514 u64 index = 0;
2515
2516 err = btrfs_check_free_space(root, 1, 0);
2517 if (err)
2518 goto fail;
2519 trans = btrfs_start_transaction(root, 1);
2520 btrfs_set_trans_block_group(trans, dir);
2521
2522 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
2523 if (err) {
2524 err = -ENOSPC;
2525 goto out_unlock;
2526 }
2527
2528 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
2529 dentry->d_name.len,
2530 dentry->d_parent->d_inode->i_ino,
2531 objectid, BTRFS_I(dir)->block_group, mode,
2532 &index);
2533 err = PTR_ERR(inode);
2534 if (IS_ERR(inode))
2535 goto out_unlock;
2536
2537 err = btrfs_init_acl(inode, dir);
2538 if (err) {
2539 drop_inode = 1;
2540 goto out_unlock;
2541 }
2542
2543 btrfs_set_trans_block_group(trans, inode);
2544 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
2545 if (err)
2546 drop_inode = 1;
2547 else {
2548 inode->i_mapping->a_ops = &btrfs_aops;
2549 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2550 inode->i_fop = &btrfs_file_operations;
2551 inode->i_op = &btrfs_file_inode_operations;
2552 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
2553 }
2554 dir->i_sb->s_dirt = 1;
2555 btrfs_update_inode_block_group(trans, inode);
2556 btrfs_update_inode_block_group(trans, dir);
2557out_unlock:
2558 nr = trans->blocks_used;
2559 btrfs_end_transaction_throttle(trans, root);
2560fail:
2561 if (drop_inode) {
2562 inode_dec_link_count(inode);
2563 iput(inode);
2564 }
2565 btrfs_btree_balance_dirty(root, nr);
2566 return err;
2567}
2568
2569static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
2570 struct dentry *dentry)
2571{
2572 struct btrfs_trans_handle *trans;
2573 struct btrfs_root *root = BTRFS_I(dir)->root;
2574 struct inode *inode = old_dentry->d_inode;
2575 u64 index;
2576 unsigned long nr = 0;
2577 int err;
2578 int drop_inode = 0;
2579
2580 if (inode->i_nlink == 0)
2581 return -ENOENT;
2582
2583	err = btrfs_check_free_space(root, 1, 0);
2584	if (err)
2585		goto fail;
2586	err = btrfs_set_inode_index(dir, inode, &index);
2587	if (err)
2588		goto fail;
2589	btrfs_inc_nlink(inode);
2590
2591 trans = btrfs_start_transaction(root, 1);
2592
2593 btrfs_set_trans_block_group(trans, dir);
2594 atomic_inc(&inode->i_count);
2595
2596 err = btrfs_add_nondir(trans, dentry, inode, 1, index);
2597
2598 if (err)
2599 drop_inode = 1;
2600
2601 dir->i_sb->s_dirt = 1;
2602 btrfs_update_inode_block_group(trans, dir);
2603 err = btrfs_update_inode(trans, root, inode);
2604
2605 if (err)
2606 drop_inode = 1;
2607
2608 nr = trans->blocks_used;
2609 btrfs_end_transaction_throttle(trans, root);
2610fail:
2611 if (drop_inode) {
2612 inode_dec_link_count(inode);
2613 iput(inode);
2614 }
2615 btrfs_btree_balance_dirty(root, nr);
2616 return err;
2617}
2618
2619static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2620{
2621 struct inode *inode = NULL;
2622 struct btrfs_trans_handle *trans;
2623 struct btrfs_root *root = BTRFS_I(dir)->root;
2624 int err = 0;
2625 int drop_on_err = 0;
2626 u64 objectid = 0;
2627 u64 index = 0;
2628 unsigned long nr = 1;
2629
2630 err = btrfs_check_free_space(root, 1, 0);
2631 if (err)
2632 goto out_unlock;
2633
2634	trans = btrfs_start_transaction(root, 1);
2635	if (IS_ERR(trans)) {
2636		err = PTR_ERR(trans);
2637		goto out_unlock;
2638	}
2639
2640	btrfs_set_trans_block_group(trans, dir);
2641
2642 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
2643 if (err) {
2644 err = -ENOSPC;
2645 goto out_unlock;
2646 }
2647
2648 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
2649 dentry->d_name.len,
2650 dentry->d_parent->d_inode->i_ino, objectid,
2651 BTRFS_I(dir)->block_group, S_IFDIR | mode,
2652 &index);
2653 if (IS_ERR(inode)) {
2654 err = PTR_ERR(inode);
2655 goto out_fail;
2656 }
2657
2658 drop_on_err = 1;
2659
2660 err = btrfs_init_acl(inode, dir);
2661 if (err)
2662 goto out_fail;
2663
2664 inode->i_op = &btrfs_dir_inode_operations;
2665 inode->i_fop = &btrfs_dir_file_operations;
2666 btrfs_set_trans_block_group(trans, inode);
2667
2668 btrfs_i_size_write(inode, 0);
2669 err = btrfs_update_inode(trans, root, inode);
2670 if (err)
2671 goto out_fail;
2672
2673 err = btrfs_add_link(trans, dentry->d_parent->d_inode,
2674 inode, dentry->d_name.name,
2675 dentry->d_name.len, 0, index);
2676 if (err)
2677 goto out_fail;
2678
2679 d_instantiate(dentry, inode);
2680 drop_on_err = 0;
2681 dir->i_sb->s_dirt = 1;
2682 btrfs_update_inode_block_group(trans, inode);
2683 btrfs_update_inode_block_group(trans, dir);
2684
2685out_fail:
2686 nr = trans->blocks_used;
2687 btrfs_end_transaction_throttle(trans, root);
2688
2689out_unlock:
2690 if (drop_on_err)
2691 iput(inode);
2692 btrfs_btree_balance_dirty(root, nr);
2693 return err;
2694}
2695
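/*
 * helper for btrfs_get_extent: an overlapping mapping already sits in the
 * tree, so shrink [em] to just the range the caller asked for and insert that
 */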
2696static int merge_extent_mapping(struct extent_map_tree *em_tree,
2697 struct extent_map *existing,
2698 struct extent_map *em,
2699 u64 map_start, u64 map_len)
2700{
2701 u64 start_diff;
2702
2703 BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
2704 start_diff = map_start - em->start;
2705 em->start = map_start;
2706 em->len = map_len;
2707 if (em->block_start < EXTENT_MAP_LAST_BYTE)
2708 em->block_start += start_diff;
2709 return add_extent_mapping(em_tree, em);
2710}
2711
2712struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2713 size_t pg_offset, u64 start, u64 len,
2714 int create)
2715{
2716 int ret;
2717 int err = 0;
2718 u64 bytenr;
2719 u64 extent_start = 0;
2720 u64 extent_end = 0;
2721 u64 objectid = inode->i_ino;
2722 u32 found_type;
2723 struct btrfs_path *path = NULL;
2724 struct btrfs_root *root = BTRFS_I(inode)->root;
2725 struct btrfs_file_extent_item *item;
2726 struct extent_buffer *leaf;
2727 struct btrfs_key found_key;
2728 struct extent_map *em = NULL;
2729 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2730 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2731 struct btrfs_trans_handle *trans = NULL;
2732
2733again:
2734 spin_lock(&em_tree->lock);
2735 em = lookup_extent_mapping(em_tree, start, len);
2736 if (em)
2737 em->bdev = root->fs_info->fs_devices->latest_bdev;
2738 spin_unlock(&em_tree->lock);
2739
2740 if (em) {
2741 if (em->start > start || em->start + em->len <= start)
2742 free_extent_map(em);
2743 else if (em->block_start == EXTENT_MAP_INLINE && page)
2744 free_extent_map(em);
2745 else
2746 goto out;
2747 }
2748 em = alloc_extent_map(GFP_NOFS);
2749 if (!em) {
2750 err = -ENOMEM;
2751 goto out;
2752 }
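	/* sentinel values until we find out what actually covers this range */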
2753 em->bdev = root->fs_info->fs_devices->latest_bdev;
2754 em->start = EXTENT_MAP_HOLE;
2755 em->len = (u64)-1;
2756
2757 if (!path) {
2758 path = btrfs_alloc_path();
2759 BUG_ON(!path);
2760 }
2761
2762 ret = btrfs_lookup_file_extent(trans, root, path,
2763 objectid, start, trans != NULL);
2764 if (ret < 0) {
2765 err = ret;
2766 goto out;
2767 }
2768
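	/*
	 * ret > 0 means the exact key wasn't found; back up one slot so we
	 * can check whether the previous extent item covers our range
	 */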
2769 if (ret != 0) {
2770 if (path->slots[0] == 0)
2771 goto not_found;
2772 path->slots[0]--;
2773 }
2774
2775 leaf = path->nodes[0];
2776 item = btrfs_item_ptr(leaf, path->slots[0],
2777 struct btrfs_file_extent_item);
2778 /* are we inside the extent that was found? */
2779 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2780 found_type = btrfs_key_type(&found_key);
2781 if (found_key.objectid != objectid ||
2782 found_type != BTRFS_EXTENT_DATA_KEY) {
2783 goto not_found;
2784 }
2785
2786 found_type = btrfs_file_extent_type(leaf, item);
2787 extent_start = found_key.offset;
2788 if (found_type == BTRFS_FILE_EXTENT_REG) {
2789 extent_end = extent_start +
2790 btrfs_file_extent_num_bytes(leaf, item);
2791 err = 0;
2792 if (start < extent_start || start >= extent_end) {
2793 em->start = start;
2794 if (start < extent_start) {
2795 if (start + len <= extent_start)
2796 goto not_found;
2797 em->len = extent_end - extent_start;
2798 } else {
2799 em->len = len;
2800 }
2801 goto not_found_em;
2802 }
2803 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
2804 if (bytenr == 0) {
2805 em->start = extent_start;
2806 em->len = extent_end - extent_start;
2807 em->block_start = EXTENT_MAP_HOLE;
2808 goto insert;
2809 }
2810 bytenr += btrfs_file_extent_offset(leaf, item);
2811 em->block_start = bytenr;
2812 em->start = extent_start;
2813 em->len = extent_end - extent_start;
2814 goto insert;
2815 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
2816 u64 page_start;
2817 unsigned long ptr;
2818 char *map;
2819 size_t size;
2820 size_t extent_offset;
2821 size_t copy_size;
2822
2823 size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf,
2824 path->slots[0]));
2825 extent_end = (extent_start + size + root->sectorsize - 1) &
2826 ~((u64)root->sectorsize - 1);
2827 if (start < extent_start || start >= extent_end) {
2828 em->start = start;
2829 if (start < extent_start) {
2830 if (start + len <= extent_start)
2831 goto not_found;
2832 em->len = extent_end - extent_start;
2833 } else {
2834 em->len = len;
2835 }
2836 goto not_found_em;
2837 }
2838 em->block_start = EXTENT_MAP_INLINE;
2839
2840 if (!page) {
2841 em->start = extent_start;
2842 em->len = size;
2843 goto out;
2844 }
2845
2846 page_start = page_offset(page) + pg_offset;
2847 extent_offset = page_start - extent_start;
2848 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
2849 size - extent_offset);
2850 em->start = extent_start + extent_offset;
2851 em->len = (copy_size + root->sectorsize - 1) &
2852 ~((u64)root->sectorsize - 1);
2853 map = kmap(page);
2854 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
2855 if (create == 0 && !PageUptodate(page)) {
2856 read_extent_buffer(leaf, map + pg_offset, ptr,
2857 copy_size);
2858 flush_dcache_page(page);
2859 } else if (create && PageUptodate(page)) {
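			/*
			 * modifying an inline extent needs a transaction; join
			 * one, drop our state and retry the lookup from the top
			 */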
2860 if (!trans) {
2861 kunmap(page);
2862 free_extent_map(em);
2863 em = NULL;
2864 btrfs_release_path(root, path);
2865 trans = btrfs_join_transaction(root, 1);
2866 goto again;
2867 }
2868 write_extent_buffer(leaf, map + pg_offset, ptr,
2869 copy_size);
2870 btrfs_mark_buffer_dirty(leaf);
2871 }
2872 kunmap(page);
2873 set_extent_uptodate(io_tree, em->start,
2874 extent_map_end(em) - 1, GFP_NOFS);
2875 goto insert;
2876 } else {
2877		printk("unknown found_type %d\n", found_type);
2878 WARN_ON(1);
2879 }
2880not_found:
2881 em->start = start;
2882 em->len = len;
2883not_found_em:
2884 em->block_start = EXTENT_MAP_HOLE;
2885insert:
2886 btrfs_release_path(root, path);
2887 if (em->start > start || extent_map_end(em) <= start) {
2888 printk("bad extent! em: [%Lu %Lu] passed [%Lu %Lu]\n", em->start, em->len, start, len);
2889 err = -EIO;
2890 goto out;
2891 }
2892
2893 err = 0;
2894 spin_lock(&em_tree->lock);
2895 ret = add_extent_mapping(em_tree, em);
2896 /* it is possible that someone inserted the extent into the tree
2897 * while we had the lock dropped. It is also possible that
2898 * an overlapping map exists in the tree
2899 */
2900 if (ret == -EEXIST) {
2901 struct extent_map *existing;
2902
2903 ret = 0;
2904
2905 existing = lookup_extent_mapping(em_tree, start, len);
2906 if (existing && (existing->start > start ||
2907 existing->start + existing->len <= start)) {
2908 free_extent_map(existing);
2909 existing = NULL;
2910 }
2911 if (!existing) {
2912 existing = lookup_extent_mapping(em_tree, em->start,
2913 em->len);
2914 if (existing) {
2915 err = merge_extent_mapping(em_tree, existing,
2916 em, start,
2917 root->sectorsize);
2918 free_extent_map(existing);
2919 if (err) {
2920 free_extent_map(em);
2921 em = NULL;
2922 }
2923 } else {
2924 err = -EIO;
2925				printk("failed to insert %Lu %Lu\n",
2926				       start, len);
2927 free_extent_map(em);
2928 em = NULL;
2929 }
2930 } else {
2931 free_extent_map(em);
2932 em = existing;
2933 err = 0;
2934 }
2935 }
2936 spin_unlock(&em_tree->lock);
2937out:
2938 if (path)
2939 btrfs_free_path(path);
2940 if (trans) {
2941 ret = btrfs_end_transaction(trans, root);
2942 if (!err) {
2943 err = ret;
2944 }
2945 }
2946 if (err) {
2947 free_extent_map(em);
2948 WARN_ON(1);
2949 return ERR_PTR(err);
2950 }
2951 return em;
2952}
2953
2954#if 0 /* waiting for O_DIRECT reads */
2955static int btrfs_get_block(struct inode *inode, sector_t iblock,
2956 struct buffer_head *bh_result, int create)
2957{
2958 struct extent_map *em;
2959 u64 start = (u64)iblock << inode->i_blkbits;
2960 struct btrfs_multi_bio *multi = NULL;
2961 struct btrfs_root *root = BTRFS_I(inode)->root;
2962 u64 len;
2963 u64 logical;
2964 u64 map_length;
2965 int ret = 0;
2966
2967 em = btrfs_get_extent(inode, NULL, 0, start, bh_result->b_size, 0);
2968
2969 if (!em || IS_ERR(em))
2970 goto out;
2971
2972 if (em->start > start || em->start + em->len <= start) {
2973 goto out;
2974 }
2975
2976 if (em->block_start == EXTENT_MAP_INLINE) {
2977 ret = -EINVAL;
2978 goto out;
2979 }
2980
2981 len = em->start + em->len - start;
2982 len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size)));
2983
2984 if (em->block_start == EXTENT_MAP_HOLE ||
2985 em->block_start == EXTENT_MAP_DELALLOC) {
2986 bh_result->b_size = len;
2987 goto out;
2988 }
2989
2990 logical = start - em->start;
2991 logical = em->block_start + logical;
2992
2993 map_length = len;
2994 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
2995 logical, &map_length, &multi, 0);
2996 BUG_ON(ret);
2997 bh_result->b_blocknr = multi->stripes[0].physical >> inode->i_blkbits;
2998 bh_result->b_size = min(map_length, len);
2999
3000 bh_result->b_bdev = multi->stripes[0].dev->bdev;
3001 set_buffer_mapped(bh_result);
3002 kfree(multi);
3003out:
3004 free_extent_map(em);
3005 return ret;
3006}
3007#endif
3008
3009static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
3010 const struct iovec *iov, loff_t offset,
3011 unsigned long nr_segs)
3012{
3013 return -EINVAL;
3014#if 0
3015 struct file *file = iocb->ki_filp;
3016 struct inode *inode = file->f_mapping->host;
3017
3018 if (rw == WRITE)
3019 return -EINVAL;
3020
3021 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3022 offset, nr_segs, btrfs_get_block, NULL);
3023#endif
3024}
3025
3026static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
3027{
3028 return extent_bmap(mapping, iblock, btrfs_get_extent);
3029}
3030
3031int btrfs_readpage(struct file *file, struct page *page)
3032{
3033 struct extent_io_tree *tree;
3034 tree = &BTRFS_I(page->mapping->host)->io_tree;
3035 return extent_read_full_page(tree, page, btrfs_get_extent);
3036}
3037
3038static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
3039{
3040 struct extent_io_tree *tree;
3041
3043 if (current->flags & PF_MEMALLOC) {
3044 redirty_page_for_writepage(wbc, page);
3045 unlock_page(page);
3046 return 0;
3047 }
3048 tree = &BTRFS_I(page->mapping->host)->io_tree;
3049 return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
3050}
3051
3052int btrfs_writepages(struct address_space *mapping,
3053 struct writeback_control *wbc)
3054{
3055 struct extent_io_tree *tree;
3056 tree = &BTRFS_I(mapping->host)->io_tree;
3057 return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
3058}
3059
3060static int
3061btrfs_readpages(struct file *file, struct address_space *mapping,
3062 struct list_head *pages, unsigned nr_pages)
3063{
3064 struct extent_io_tree *tree;
3065 tree = &BTRFS_I(mapping->host)->io_tree;
3066 return extent_readpages(tree, mapping, pages, nr_pages,
3067 btrfs_get_extent);
3068}

3069static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
3070{
3071 struct extent_io_tree *tree;
3072 struct extent_map_tree *map;
3073 int ret;
3074
3075 tree = &BTRFS_I(page->mapping->host)->io_tree;
3076 map = &BTRFS_I(page->mapping->host)->extent_tree;
3077 ret = try_release_extent_mapping(map, tree, page, gfp_flags);
3078 if (ret == 1) {
3079 ClearPagePrivate(page);
3080 set_page_private(page, 0);
3081 page_cache_release(page);
3082 }
3083 return ret;
3084}
3085
3086static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
3087{
3088 if (PageWriteback(page) || PageDirty(page))
3089 return 0;
3090 return __btrfs_releasepage(page, gfp_flags);
3091}
3092
3093static void btrfs_invalidatepage(struct page *page, unsigned long offset)
3094{
3095 struct extent_io_tree *tree;
3096 struct btrfs_ordered_extent *ordered;
3097 u64 page_start = page_offset(page);
3098 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
3099
3100 wait_on_page_writeback(page);
3101 tree = &BTRFS_I(page->mapping->host)->io_tree;
3102 if (offset) {
3103 btrfs_releasepage(page, GFP_NOFS);
3104 return;
3105 }
3106
3107 lock_extent(tree, page_start, page_end, GFP_NOFS);
3108 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
3109 page_offset(page));
3110 if (ordered) {
3111 /*
3112 * IO on this page will never be started, so we need
3113 * to account for any ordered extents now
3114 */
3115 clear_extent_bit(tree, page_start, page_end,
3116 EXTENT_DIRTY | EXTENT_DELALLOC |
3117 EXTENT_LOCKED, 1, 0, GFP_NOFS);
3118 btrfs_finish_ordered_io(page->mapping->host,
3119 page_start, page_end);
3120 btrfs_put_ordered_extent(ordered);
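		/* clearing EXTENT_LOCKED above dropped our lock; take it again */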
3121 lock_extent(tree, page_start, page_end, GFP_NOFS);
3122 }
3123 clear_extent_bit(tree, page_start, page_end,
3124 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
3125 EXTENT_ORDERED,
3126 1, 1, GFP_NOFS);
3127 __btrfs_releasepage(page, GFP_NOFS);
3128
3129 ClearPageChecked(page);
3130 if (PagePrivate(page)) {
3131 ClearPagePrivate(page);
3132 set_page_private(page, 0);
3133 page_cache_release(page);
3134 }
3135}
3136
3137/*
3138 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
3139 * called from a page fault handler when a page is first dirtied. Hence we must
3140 * be careful to check for EOF conditions here. We set the page up correctly
3141 * for a written page which means we get ENOSPC checking when writing into
3142 * holes and correct delalloc and unwritten extent mapping on filesystems that
3143 * support these features.
3144 *
3145 * We are not allowed to take the i_mutex here so we have to play games to
3146 * protect against truncate races as the page could now be beyond EOF. Because
3147 * vmtruncate() writes the inode size before removing pages, once we have the
3148 * page lock we can determine safely if the page is beyond EOF. If it is not
3149 * beyond EOF, then the page is guaranteed safe against truncation until we
3150 * unlock the page.
3151 */
3152int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
3153{
3154 struct inode *inode = fdentry(vma->vm_file)->d_inode;
3155 struct btrfs_root *root = BTRFS_I(inode)->root;
3156 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3157 struct btrfs_ordered_extent *ordered;
3158 char *kaddr;
3159 unsigned long zero_start;
3160 loff_t size;
3161 int ret;
3162 u64 page_start;
3163 u64 page_end;
3164
3165 ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
3166 if (ret)
3167 goto out;
3168
3169 ret = -EINVAL;
3170again:
3171 lock_page(page);
3172 size = i_size_read(inode);
3173 page_start = page_offset(page);
3174 page_end = page_start + PAGE_CACHE_SIZE - 1;
3175
3176 if ((page->mapping != inode->i_mapping) ||
3177 (page_start >= size)) {
3178 /* page got truncated out from underneath us */
3179 goto out_unlock;
3180 }
3181 wait_on_page_writeback(page);
3182
3183 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3184 set_page_extent_mapped(page);
3185
3186 /*
3187 * we can't set the delalloc bits if there are pending ordered
3188 * extents. Drop our locks and wait for them to finish
3189 */
3190 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3191 if (ordered) {
3192 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3193 unlock_page(page);
3194 btrfs_start_ordered_extent(inode, ordered, 1);
3195 btrfs_put_ordered_extent(ordered);
3196 goto again;
3197 }
3198
3199 btrfs_set_extent_delalloc(inode, page_start, page_end);
3200 ret = 0;
3201
3202 /* page is wholly or partially inside EOF */
3203 if (page_start + PAGE_CACHE_SIZE > size)
3204 zero_start = size & ~PAGE_CACHE_MASK;
3205 else
3206 zero_start = PAGE_CACHE_SIZE;
3207
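	/* zero the part of the page beyond EOF so stale data is never exposed */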
3208 if (zero_start != PAGE_CACHE_SIZE) {
3209 kaddr = kmap(page);
3210 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
3211 flush_dcache_page(page);
3212 kunmap(page);
3213 }
3214 ClearPageChecked(page);
3215 set_page_dirty(page);
3216 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3217
3218out_unlock:
3219 unlock_page(page);
3220out:
3221 return ret;
3222}
3223
3224static void btrfs_truncate(struct inode *inode)
3225{
3226 struct btrfs_root *root = BTRFS_I(inode)->root;
3227 int ret;
3228 struct btrfs_trans_handle *trans;
3229 unsigned long nr;
3230 u64 mask = root->sectorsize - 1;
3231
3232 if (!S_ISREG(inode->i_mode))
3233 return;
3234 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
3235 return;
3236
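	/* zero the partial tail block, then wait out ordered I/O past the new EOF */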
3237 btrfs_truncate_page(inode->i_mapping, inode->i_size);
3238 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
3239
3240 trans = btrfs_start_transaction(root, 1);
3241 btrfs_set_trans_block_group(trans, inode);
3242 btrfs_i_size_write(inode, inode->i_size);
3243
3244 ret = btrfs_orphan_add(trans, inode);
3245 if (ret)
3246 goto out;
3247 /* FIXME, add redo link to tree so we don't leak on crash */
3248 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
3249 BTRFS_EXTENT_DATA_KEY);
3250 btrfs_update_inode(trans, root, inode);
3251
3252 ret = btrfs_orphan_del(trans, inode);
3253 BUG_ON(ret);
3254
3255out:
3256 nr = trans->blocks_used;
3257 ret = btrfs_end_transaction_throttle(trans, root);
3258 BUG_ON(ret);
3259 btrfs_btree_balance_dirty(root, nr);
3260}
3261
3262/*
3263 * Invalidate a single dcache entry at the root of the filesystem.
3264 * Needed after creation of snapshot or subvolume.
3265 */
3266void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
3267 int namelen)
3268{
3269 struct dentry *alias, *entry;
3270 struct qstr qstr;
3271
3272 alias = d_find_alias(root->fs_info->sb->s_root->d_inode);
3273 if (alias) {
3274 qstr.name = name;
3275 qstr.len = namelen;
3276 /* change me if btrfs ever gets a d_hash operation */
3277 qstr.hash = full_name_hash(qstr.name, qstr.len);
3278 entry = d_lookup(alias, &qstr);
3279 dput(alias);
3280 if (entry) {
3281 d_invalidate(entry);
3282 dput(entry);
3283 }
3284 }
3285}
3286
3287int btrfs_create_subvol_root(struct btrfs_root *new_root,
3288 struct btrfs_trans_handle *trans, u64 new_dirid,
3289 struct btrfs_block_group_cache *block_group)
3290{
3291 struct inode *inode;
3292 u64 index = 0;
3293
3294 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
3295 new_dirid, block_group, S_IFDIR | 0700, &index);
3296 if (IS_ERR(inode))
3297 return PTR_ERR(inode);
3298 inode->i_op = &btrfs_dir_inode_operations;
3299 inode->i_fop = &btrfs_dir_file_operations;
3300 new_root->inode = inode;
3301
3302 inode->i_nlink = 1;
3303 btrfs_i_size_write(inode, 0);
3304
3305 return btrfs_update_inode(trans, new_root, inode);
3306}
3307
3308unsigned long btrfs_force_ra(struct address_space *mapping,
3309 struct file_ra_state *ra, struct file *file,
3310 pgoff_t offset, pgoff_t last_index)
3311{
3312 pgoff_t req_size = last_index - offset + 1;
3313
3314#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
3315 offset = page_cache_readahead(mapping, ra, file, offset, req_size);
3316 return offset;
3317#else
3318 page_cache_sync_readahead(mapping, ra, file, offset, req_size);
3319 return offset + req_size;
3320#endif
3321}
3322
3323struct inode *btrfs_alloc_inode(struct super_block *sb)
3324{
3325 struct btrfs_inode *ei;
3326
3327 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
3328 if (!ei)
3329 return NULL;
3330 ei->last_trans = 0;
3331 ei->logged_trans = 0;
3332 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
3333 ei->i_acl = BTRFS_ACL_NOT_CACHED;
3334 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
3335 INIT_LIST_HEAD(&ei->i_orphan);
3336 return &ei->vfs_inode;
3337}
3338
3339void btrfs_destroy_inode(struct inode *inode)
3340{
3341 struct btrfs_ordered_extent *ordered;
3342 WARN_ON(!list_empty(&inode->i_dentry));
3343 WARN_ON(inode->i_data.nrpages);
3344
3345 if (BTRFS_I(inode)->i_acl &&
3346 BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
3347 posix_acl_release(BTRFS_I(inode)->i_acl);
3348 if (BTRFS_I(inode)->i_default_acl &&
3349 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
3350 posix_acl_release(BTRFS_I(inode)->i_default_acl);
3351
3352 spin_lock(&BTRFS_I(inode)->root->list_lock);
3353 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
3354		printk(KERN_ERR "BTRFS: inode %lu still on the orphan"
3355		       " list\n", inode->i_ino);
3356 dump_stack();
3357 }
3358 spin_unlock(&BTRFS_I(inode)->root->list_lock);
3359
3360	while (1) {
3361 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
3362 if (!ordered)
3363 break;
3364 else {
3365 printk("found ordered extent %Lu %Lu\n",
3366 ordered->file_offset, ordered->len);
3367 btrfs_remove_ordered_extent(inode, ordered);
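			/* drop the ref from the lookup and the ref the tree held */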
3368 btrfs_put_ordered_extent(ordered);
3369 btrfs_put_ordered_extent(ordered);
3370 }
3371 }
3372 btrfs_drop_extent_cache(inode, 0, (u64)-1);
3373 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
3374}
3375
3376#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
3377static void init_once(void *foo)
3378#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
3379static void init_once(struct kmem_cache * cachep, void *foo)
3380#else
3381static void init_once(void * foo, struct kmem_cache * cachep,
3382 unsigned long flags)
3383#endif
3384{
3385 struct btrfs_inode *ei = (struct btrfs_inode *) foo;
3386
3387 inode_init_once(&ei->vfs_inode);
3388}
3389
3390void btrfs_destroy_cachep(void)
3391{
3392 if (btrfs_inode_cachep)
3393 kmem_cache_destroy(btrfs_inode_cachep);
3394 if (btrfs_trans_handle_cachep)
3395 kmem_cache_destroy(btrfs_trans_handle_cachep);
3396 if (btrfs_transaction_cachep)
3397 kmem_cache_destroy(btrfs_transaction_cachep);
3398 if (btrfs_bit_radix_cachep)
3399 kmem_cache_destroy(btrfs_bit_radix_cachep);
3400 if (btrfs_path_cachep)
3401 kmem_cache_destroy(btrfs_path_cachep);
3402}
3403
3404struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
3405 unsigned long extra_flags,
3406#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
3407 void (*ctor)(void *)
3408#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
3409 void (*ctor)(struct kmem_cache *, void *)
3410#else
3411 void (*ctor)(void *, struct kmem_cache *,
3412 unsigned long)
3413#endif
3414 )
3415{
3416 return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
3417 SLAB_MEM_SPREAD | extra_flags), ctor
3418#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
3419 ,NULL
3420#endif
3421 );
3422}
3423
3424int btrfs_init_cachep(void)
3425{
3426 btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
3427 sizeof(struct btrfs_inode),
3428 0, init_once);
3429 if (!btrfs_inode_cachep)
3430 goto fail;
3431 btrfs_trans_handle_cachep =
3432 btrfs_cache_create("btrfs_trans_handle_cache",
3433 sizeof(struct btrfs_trans_handle),
3434 0, NULL);
3435 if (!btrfs_trans_handle_cachep)
3436 goto fail;
3437 btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
3438 sizeof(struct btrfs_transaction),
3439 0, NULL);
3440 if (!btrfs_transaction_cachep)
3441 goto fail;
3442 btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
3443 sizeof(struct btrfs_path),
3444 0, NULL);
3445 if (!btrfs_path_cachep)
3446 goto fail;
3447 btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
3448 SLAB_DESTROY_BY_RCU, NULL);
3449 if (!btrfs_bit_radix_cachep)
3450 goto fail;
3451 return 0;
3452fail:
3453 btrfs_destroy_cachep();
3454 return -ENOMEM;
3455}
3456
3457static int btrfs_getattr(struct vfsmount *mnt,
3458 struct dentry *dentry, struct kstat *stat)
3459{
3460 struct inode *inode = dentry->d_inode;
3461 generic_fillattr(inode, stat);
3462 stat->blksize = PAGE_CACHE_SIZE;
3463 stat->blocks = inode->i_blocks + (BTRFS_I(inode)->delalloc_bytes >> 9);
3464 return 0;
3465}
3466
3467static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
3468			struct inode *new_dir, struct dentry *new_dentry)
3469{
3470 struct btrfs_trans_handle *trans;
3471 struct btrfs_root *root = BTRFS_I(old_dir)->root;
3472 struct inode *new_inode = new_dentry->d_inode;
3473 struct inode *old_inode = old_dentry->d_inode;
3474 struct timespec ctime = CURRENT_TIME;
3475 u64 index = 0;
3476 int ret;
3477
3478 if (S_ISDIR(old_inode->i_mode) && new_inode &&
3479 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
3480 return -ENOTEMPTY;
3481 }
3482
3483 ret = btrfs_check_free_space(root, 1, 0);
3484 if (ret)
3485 goto out_unlock;
3486
3487 trans = btrfs_start_transaction(root, 1);
3488
3489 btrfs_set_trans_block_group(trans, new_dir);
3490
3491 btrfs_inc_nlink(old_dentry->d_inode);
3492 old_dir->i_ctime = old_dir->i_mtime = ctime;
3493 new_dir->i_ctime = new_dir->i_mtime = ctime;
3494 old_inode->i_ctime = ctime;
3495
3496 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
3497 old_dentry->d_name.name,
3498 old_dentry->d_name.len);
3499 if (ret)
3500 goto out_fail;
3501
3502 if (new_inode) {
3503 new_inode->i_ctime = CURRENT_TIME;
3504 ret = btrfs_unlink_inode(trans, root, new_dir,
3505 new_dentry->d_inode,
3506 new_dentry->d_name.name,
3507 new_dentry->d_name.len);
3508 if (ret)
3509 goto out_fail;
3510 if (new_inode->i_nlink == 0) {
3511 ret = btrfs_orphan_add(trans, new_dentry->d_inode);
3512 if (ret)
3513 goto out_fail;
3514 }
3516 }
3517 ret = btrfs_set_inode_index(new_dir, old_inode, &index);
3518 if (ret)
3519 goto out_fail;
3520
3521 ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
3522 old_inode, new_dentry->d_name.name,
3523 new_dentry->d_name.len, 1, index);
3524 if (ret)
3525 goto out_fail;
3526
3527out_fail:
3528 btrfs_end_transaction_throttle(trans, root);
3529out_unlock:
3530 return ret;
3531}
3532
3533int btrfs_start_delalloc_inodes(struct btrfs_root *root)
3534{
3535 struct list_head *head = &root->fs_info->delalloc_inodes;
3536 struct btrfs_inode *binode;
3537 unsigned long flags;
3538
3539 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
3540	while (!list_empty(head)) {
3541 binode = list_entry(head->next, struct btrfs_inode,
3542 delalloc_inodes);
3543 atomic_inc(&binode->vfs_inode.i_count);
3544 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
3545 filemap_write_and_wait(binode->vfs_inode.i_mapping);
3546 iput(&binode->vfs_inode);
3547 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
3548 }
3549 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
3550 return 0;
3551}
3552
3553static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
3554 const char *symname)
3555{
3556 struct btrfs_trans_handle *trans;
3557 struct btrfs_root *root = BTRFS_I(dir)->root;
3558 struct btrfs_path *path;
3559 struct btrfs_key key;
3560 struct inode *inode = NULL;
3561 int err;
3562 int drop_inode = 0;
3563 u64 objectid;
3564	u64 index = 0;
3565 int name_len;
3566 int datasize;
3567 unsigned long ptr;
3568 struct btrfs_file_extent_item *ei;
3569 struct extent_buffer *leaf;
3570 unsigned long nr = 0;
3571
3572 name_len = strlen(symname) + 1;
3573 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
3574 return -ENAMETOOLONG;
3575
3576 err = btrfs_check_free_space(root, 1, 0);
3577 if (err)
3578 goto out_fail;
3579
3580 trans = btrfs_start_transaction(root, 1);
3581 btrfs_set_trans_block_group(trans, dir);
3582
3583 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3584 if (err) {
3585 err = -ENOSPC;
3586 goto out_unlock;
3587 }
3588
3589 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3590 dentry->d_name.len,
3591 dentry->d_parent->d_inode->i_ino, objectid,
3592 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
3593 &index);
3594 err = PTR_ERR(inode);
3595 if (IS_ERR(inode))
3596 goto out_unlock;
3597
3598 err = btrfs_init_acl(inode, dir);
3599 if (err) {
3600 drop_inode = 1;
3601 goto out_unlock;
3602 }
3603
3604 btrfs_set_trans_block_group(trans, inode);
3605 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3606 if (err)
3607 drop_inode = 1;
3608 else {
3609 inode->i_mapping->a_ops = &btrfs_aops;
3610 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3611 inode->i_fop = &btrfs_file_operations;
3612 inode->i_op = &btrfs_file_inode_operations;
3613 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3614 }
3615 dir->i_sb->s_dirt = 1;
3616 btrfs_update_inode_block_group(trans, inode);
3617 btrfs_update_inode_block_group(trans, dir);
3618 if (drop_inode)
3619 goto out_unlock;
3620
3621 path = btrfs_alloc_path();
3622 BUG_ON(!path);
3623 key.objectid = inode->i_ino;
3624 key.offset = 0;
3625 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
3626 datasize = btrfs_file_extent_calc_inline_size(name_len);
3627 err = btrfs_insert_empty_item(trans, root, path, &key,
3628 datasize);
3629 if (err) {
3630 drop_inode = 1;
3631 goto out_unlock;
3632 }
3633 leaf = path->nodes[0];
3634 ei = btrfs_item_ptr(leaf, path->slots[0],
3635 struct btrfs_file_extent_item);
3636 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
3637 btrfs_set_file_extent_type(leaf, ei,
3638 BTRFS_FILE_EXTENT_INLINE);
3639 ptr = btrfs_file_extent_inline_start(ei);
3640 write_extent_buffer(leaf, symname, ptr, name_len);
3641 btrfs_mark_buffer_dirty(leaf);
3642 btrfs_free_path(path);
3643
3644 inode->i_op = &btrfs_symlink_inode_operations;
3645 inode->i_mapping->a_ops = &btrfs_symlink_aops;
3646 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3647 btrfs_i_size_write(inode, name_len - 1);
3648 err = btrfs_update_inode(trans, root, inode);
3649 if (err)
3650 drop_inode = 1;
3651
3652out_unlock:
3653 nr = trans->blocks_used;
3654 btrfs_end_transaction_throttle(trans, root);
3655out_fail:
3656 if (drop_inode) {
3657 inode_dec_link_count(inode);
3658 iput(inode);
3659 }
3660 btrfs_btree_balance_dirty(root, nr);
3661 return err;
3662}
3663
3664static int btrfs_set_page_dirty(struct page *page)
3665{
3666 return __set_page_dirty_nobuffers(page);
3667}
3668
3669#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
3670static int btrfs_permission(struct inode *inode, int mask)
3671#else
3672static int btrfs_permission(struct inode *inode, int mask,
3673 struct nameidata *nd)
3674#endif
3675{
3676 if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
3677 return -EACCES;
3678 return generic_permission(inode, mask, btrfs_check_acl);
3679}
3680
3681static struct inode_operations btrfs_dir_inode_operations = {
3682 .lookup = btrfs_lookup,
3683 .create = btrfs_create,
3684 .unlink = btrfs_unlink,
3685 .link = btrfs_link,
3686 .mkdir = btrfs_mkdir,
3687 .rmdir = btrfs_rmdir,
3688 .rename = btrfs_rename,
3689 .symlink = btrfs_symlink,
3690 .setattr = btrfs_setattr,
3691 .mknod = btrfs_mknod,
3692 .setxattr = btrfs_setxattr,
3693 .getxattr = btrfs_getxattr,
3694 .listxattr = btrfs_listxattr,
3695 .removexattr = btrfs_removexattr,
3696 .permission = btrfs_permission,
3697};
3698static struct inode_operations btrfs_dir_ro_inode_operations = {
3699 .lookup = btrfs_lookup,
3700 .permission = btrfs_permission,
3701};
3702static struct file_operations btrfs_dir_file_operations = {
3703 .llseek = generic_file_llseek,
3704 .read = generic_read_dir,
3705#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
3706 .readdir = btrfs_nfshack_readdir,
3707#else /* NFSd readdir/lookup deadlock is fixed */
3708 .readdir = btrfs_real_readdir,
3709#endif
3710 .unlocked_ioctl = btrfs_ioctl,
3711#ifdef CONFIG_COMPAT
3712 .compat_ioctl = btrfs_ioctl,
3713#endif
3714 .release = btrfs_release_file,
3715 .fsync = btrfs_sync_file,
3716};
3717
3718static struct extent_io_ops btrfs_extent_io_ops = {
3719 .fill_delalloc = run_delalloc_range,
3720 .submit_bio_hook = btrfs_submit_bio_hook,
3721 .merge_bio_hook = btrfs_merge_bio_hook,
3722 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
3723 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
3724 .writepage_start_hook = btrfs_writepage_start_hook,
3725 .readpage_io_failed_hook = btrfs_io_failed_hook,
3726 .set_bit_hook = btrfs_set_bit_hook,
3727 .clear_bit_hook = btrfs_clear_bit_hook,
3728};
3729
3730static struct address_space_operations btrfs_aops = {
3731 .readpage = btrfs_readpage,
3732 .writepage = btrfs_writepage,
3733 .writepages = btrfs_writepages,
3734 .readpages = btrfs_readpages,
3735 .sync_page = block_sync_page,
3736 .bmap = btrfs_bmap,
3737 .direct_IO = btrfs_direct_IO,
3738 .invalidatepage = btrfs_invalidatepage,
3739 .releasepage = btrfs_releasepage,
3740 .set_page_dirty = btrfs_set_page_dirty,
3741};
3742
3743static struct address_space_operations btrfs_symlink_aops = {
3744 .readpage = btrfs_readpage,
3745 .writepage = btrfs_writepage,
3746 .invalidatepage = btrfs_invalidatepage,
3747 .releasepage = btrfs_releasepage,
3748};
3749
3750static struct inode_operations btrfs_file_inode_operations = {
3751 .truncate = btrfs_truncate,
3752 .getattr = btrfs_getattr,
3753 .setattr = btrfs_setattr,
3754 .setxattr = btrfs_setxattr,
3755 .getxattr = btrfs_getxattr,
3756 .listxattr = btrfs_listxattr,
3757 .removexattr = btrfs_removexattr,
3758 .permission = btrfs_permission,
3759};
3760static struct inode_operations btrfs_special_inode_operations = {
3761 .getattr = btrfs_getattr,
3762 .setattr = btrfs_setattr,
3763 .permission = btrfs_permission,
3764 .setxattr = btrfs_setxattr,
3765 .getxattr = btrfs_getxattr,
3766 .listxattr = btrfs_listxattr,
3767 .removexattr = btrfs_removexattr,
3768};
3769static struct inode_operations btrfs_symlink_inode_operations = {
3770 .readlink = generic_readlink,
3771 .follow_link = page_follow_link_light,
3772 .put_link = page_put_link,
3773 .permission = btrfs_permission,
3774};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
new file mode 100644
index 000000000000..4c6e0c15754d
--- /dev/null
+++ b/fs/btrfs/ioctl.c
@@ -0,0 +1,790 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h>
39#include <linux/vmalloc.h>
40#include "ctree.h"
41#include "disk-io.h"
42#include "transaction.h"
43#include "btrfs_inode.h"
44#include "ioctl.h"
45#include "print-tree.h"
46#include "volumes.h"
47#include "locking.h"
48
51static noinline int create_subvol(struct btrfs_root *root, char *name,
52 int namelen)
53{
54 struct btrfs_trans_handle *trans;
55 struct btrfs_key key;
56 struct btrfs_root_item root_item;
57 struct btrfs_inode_item *inode_item;
58 struct extent_buffer *leaf;
59 struct btrfs_root *new_root = root;
60 struct inode *dir;
61 int ret;
62 int err;
63 u64 objectid;
64 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
65 unsigned long nr = 1;
66
67 ret = btrfs_check_free_space(root, 1, 0);
68 if (ret)
69 goto fail_commit;
70
71 trans = btrfs_start_transaction(root, 1);
72 BUG_ON(!trans);
73
74 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
75 0, &objectid);
76 if (ret)
77 goto fail;
78
79 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
80 objectid, trans->transid, 0, 0, 0);
81 if (IS_ERR(leaf)) {
82 ret = PTR_ERR(leaf);
83 goto fail;
84 }
85
86 btrfs_set_header_nritems(leaf, 0);
87 btrfs_set_header_level(leaf, 0);
88 btrfs_set_header_bytenr(leaf, leaf->start);
89 btrfs_set_header_generation(leaf, trans->transid);
90 btrfs_set_header_owner(leaf, objectid);
91
92 write_extent_buffer(leaf, root->fs_info->fsid,
93 (unsigned long)btrfs_header_fsid(leaf),
94 BTRFS_FSID_SIZE);
95 btrfs_mark_buffer_dirty(leaf);
96
97 inode_item = &root_item.inode;
98 memset(inode_item, 0, sizeof(*inode_item));
99 inode_item->generation = cpu_to_le64(1);
100 inode_item->size = cpu_to_le64(3);
101 inode_item->nlink = cpu_to_le32(1);
102 inode_item->nblocks = cpu_to_le64(1);
103 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
104
105 btrfs_set_root_bytenr(&root_item, leaf->start);
106 btrfs_set_root_level(&root_item, 0);
107 btrfs_set_root_refs(&root_item, 1);
108 btrfs_set_root_used(&root_item, 0);
109
110 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
111 root_item.drop_level = 0;
112
113 btrfs_tree_unlock(leaf);
114 free_extent_buffer(leaf);
115 leaf = NULL;
116
117 btrfs_set_root_dirid(&root_item, new_dirid);
118
119 key.objectid = objectid;
120 key.offset = 1;
121 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
122 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
123 &root_item);
124 if (ret)
125 goto fail;
126
127 /*
128 * insert the directory item
129 */
130 key.offset = (u64)-1;
131 dir = root->fs_info->sb->s_root->d_inode;
132 ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
133 name, namelen, dir->i_ino, &key,
134 BTRFS_FT_DIR, 0);
135 if (ret)
136 goto fail;
137
138 ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
139 name, namelen, objectid,
140 root->fs_info->sb->s_root->d_inode->i_ino, 0);
141 if (ret)
142 goto fail;
143
144 ret = btrfs_commit_transaction(trans, root);
145 if (ret)
146 goto fail_commit;
147
148 new_root = btrfs_read_fs_root(root->fs_info, &key, name, namelen);
149 BUG_ON(!new_root);
150
151 trans = btrfs_start_transaction(new_root, 1);
152 BUG_ON(!trans);
153
154 ret = btrfs_create_subvol_root(new_root, trans, new_dirid,
155 BTRFS_I(dir)->block_group);
156 if (ret)
157 goto fail;
158
159 /* Invalidate existing dcache entry for new subvolume. */
160 btrfs_invalidate_dcache_root(root, name, namelen);
161
162fail:
163 nr = trans->blocks_used;
164 err = btrfs_commit_transaction(trans, new_root);
165 if (err && !ret)
166 ret = err;
167fail_commit:
168 btrfs_btree_balance_dirty(root, nr);
169 return ret;
170}
171
172static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
173{
174 struct btrfs_pending_snapshot *pending_snapshot;
175 struct btrfs_trans_handle *trans;
176 int ret;
177 int err;
178 unsigned long nr = 0;
179
180 if (!root->ref_cows)
181 return -EINVAL;
182
183 ret = btrfs_check_free_space(root, 1, 0);
184 if (ret)
185 goto fail_unlock;
186
187 pending_snapshot = kmalloc(sizeof(*pending_snapshot), GFP_NOFS);
188 if (!pending_snapshot) {
189 ret = -ENOMEM;
190 goto fail_unlock;
191 }
192 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
193 if (!pending_snapshot->name) {
194 ret = -ENOMEM;
195 kfree(pending_snapshot);
196 goto fail_unlock;
197 }
198 memcpy(pending_snapshot->name, name, namelen);
199 pending_snapshot->name[namelen] = '\0';
200 trans = btrfs_start_transaction(root, 1);
201 BUG_ON(!trans);
202 pending_snapshot->root = root;
203 list_add(&pending_snapshot->list,
204 &trans->transaction->pending_snapshots);
205 ret = btrfs_update_inode(trans, root, root->inode);
206	err = btrfs_commit_transaction(trans, root);
	if (err && !ret)
		ret = err;
207
208fail_unlock:
209 btrfs_btree_balance_dirty(root, nr);
210 return ret;
211}
212
213int btrfs_defrag_file(struct file *file)
214{
215 struct inode *inode = fdentry(file)->d_inode;
216 struct btrfs_root *root = BTRFS_I(inode)->root;
217 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
218 struct btrfs_ordered_extent *ordered;
219 struct page *page;
220 unsigned long last_index;
221 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
222 unsigned long total_read = 0;
223 u64 page_start;
224 u64 page_end;
225 unsigned long i;
226 int ret;
227
228 ret = btrfs_check_free_space(root, inode->i_size, 0);
229 if (ret)
230 return -ENOSPC;
231
232 mutex_lock(&inode->i_mutex);
233 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
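	/* walk every page, starting readahead one ra_pages window at a time */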
234 for (i = 0; i <= last_index; i++) {
235 if (total_read % ra_pages == 0) {
236 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
237 min(last_index, i + ra_pages - 1));
238 }
239 total_read++;
240again:
241 page = grab_cache_page(inode->i_mapping, i);
242 if (!page)
243 goto out_unlock;
244 if (!PageUptodate(page)) {
245 btrfs_readpage(NULL, page);
246 lock_page(page);
247 if (!PageUptodate(page)) {
248 unlock_page(page);
249 page_cache_release(page);
250 goto out_unlock;
251 }
252 }
253
254 wait_on_page_writeback(page);
255
256 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
257 page_end = page_start + PAGE_CACHE_SIZE - 1;
258 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
259
260 ordered = btrfs_lookup_ordered_extent(inode, page_start);
261 if (ordered) {
262 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
263 unlock_page(page);
264 page_cache_release(page);
265 btrfs_start_ordered_extent(inode, ordered, 1);
266 btrfs_put_ordered_extent(ordered);
267 goto again;
268 }
269 set_page_extent_mapped(page);
270
271 /*
272 * this makes sure page_mkwrite is called on the
273 * page if it is dirtied again later
274 */
275 clear_page_dirty_for_io(page);
276
277 btrfs_set_extent_delalloc(inode, page_start, page_end);
278
279 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
280 set_page_dirty(page);
281 unlock_page(page);
282 page_cache_release(page);
283 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
284 }
285
286out_unlock:
287 mutex_unlock(&inode->i_mutex);
288 return 0;
289}
290
291/*
292 * Called inside transaction, so use GFP_NOFS
293 */
294
295static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
296{
297 u64 new_size;
298 u64 old_size;
299 u64 devid = 1;
300 struct btrfs_ioctl_vol_args *vol_args;
301 struct btrfs_trans_handle *trans;
302 struct btrfs_device *device = NULL;
303 char *sizestr;
304 char *devstr = NULL;
305 int ret = 0;
306 int namelen;
307 int mod = 0;
308
309 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
310
311 if (!vol_args)
312 return -ENOMEM;
313
314 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
315 ret = -EFAULT;
316 goto out;
317 }
318
319 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
320 namelen = strlen(vol_args->name);
321
322 mutex_lock(&root->fs_info->volume_mutex);
323 sizestr = vol_args->name;
324 devstr = strchr(sizestr, ':');
325 if (devstr) {
326 char *end;
327 sizestr = devstr + 1;
328 *devstr = '\0';
329 devstr = vol_args->name;
330 devid = simple_strtoull(devstr, &end, 10);
331 printk(KERN_INFO "resizing devid %llu\n", devid);
332 }
333 device = btrfs_find_device(root, devid, NULL);
334 if (!device) {
335 printk(KERN_INFO "resizer unable to find device %llu\n", devid);
336 ret = -EINVAL;
337 goto out_unlock;
338 }
339	if (!strcmp(sizestr, "max")) {
340		new_size = device->bdev->bd_inode->i_size;
341	} else {
342 if (sizestr[0] == '-') {
343 mod = -1;
344 sizestr++;
345 } else if (sizestr[0] == '+') {
346 mod = 1;
347 sizestr++;
348 }
349 new_size = btrfs_parse_size(sizestr);
350 if (new_size == 0) {
351 ret = -EINVAL;
352 goto out_unlock;
353 }
354 }
355
356 old_size = device->total_bytes;
357
358 if (mod < 0) {
359 if (new_size > old_size) {
360 ret = -EINVAL;
361 goto out_unlock;
362 }
363 new_size = old_size - new_size;
364 } else if (mod > 0) {
365 new_size = old_size + new_size;
366 }
367
368 if (new_size < 256 * 1024 * 1024) {
369 ret = -EINVAL;
370 goto out_unlock;
371 }
372 if (new_size > device->bdev->bd_inode->i_size) {
373 ret = -EFBIG;
374 goto out_unlock;
375 }
376
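	/* round the new size down to a multiple of the sector size */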
377 do_div(new_size, root->sectorsize);
378 new_size *= root->sectorsize;
379
380 printk(KERN_INFO "new size for %s is %llu\n",
381 device->name, (unsigned long long)new_size);
382
383 if (new_size > old_size) {
384 trans = btrfs_start_transaction(root, 1);
385 ret = btrfs_grow_device(trans, device, new_size);
386 btrfs_commit_transaction(trans, root);
387 } else {
388 ret = btrfs_shrink_device(device, new_size);
389 }
390
391out_unlock:
392 mutex_unlock(&root->fs_info->volume_mutex);
393out:
394 kfree(vol_args);
395 return ret;
396}
397
398static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root,
399 void __user *arg)
400{
401 struct btrfs_ioctl_vol_args *vol_args;
402 struct btrfs_dir_item *di;
403 struct btrfs_path *path;
404 u64 root_dirid;
405 int namelen;
406 int ret;
407
408 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
409
410 if (!vol_args)
411 return -ENOMEM;
412
413 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
414 ret = -EFAULT;
415 goto out;
416 }
417
418 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
419 namelen = strlen(vol_args->name);
420 if (strchr(vol_args->name, '/')) {
421 ret = -EINVAL;
422 goto out;
423 }
424
425 path = btrfs_alloc_path();
426 if (!path) {
427 ret = -ENOMEM;
428 goto out;
429 }
430
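	/* fail if a dir item with this name already exists in the tree root */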
431	root_dirid = root->fs_info->sb->s_root->d_inode->i_ino;
432 di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
433 path, root_dirid,
434 vol_args->name, namelen, 0);
435 btrfs_free_path(path);
436
437 if (di && !IS_ERR(di)) {
438 ret = -EEXIST;
439 goto out;
440 }
441
442 if (IS_ERR(di)) {
443 ret = PTR_ERR(di);
444 goto out;
445 }
446
447 mutex_lock(&root->fs_info->drop_mutex);
448 if (root == root->fs_info->tree_root)
449 ret = create_subvol(root, vol_args->name, namelen);
450 else
451 ret = create_snapshot(root, vol_args->name, namelen);
452 mutex_unlock(&root->fs_info->drop_mutex);
453out:
454 kfree(vol_args);
455 return ret;
456}
457
458static int btrfs_ioctl_defrag(struct file *file)
459{
460 struct inode *inode = fdentry(file)->d_inode;
461 struct btrfs_root *root = BTRFS_I(inode)->root;
462
463 switch (inode->i_mode & S_IFMT) {
464 case S_IFDIR:
465 btrfs_defrag_root(root, 0);
466 btrfs_defrag_root(root->fs_info->extent_root, 0);
467 break;
468 case S_IFREG:
469 btrfs_defrag_file(file);
470 break;
471 }
472
473 return 0;
474}
475
476long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
477{
478 struct btrfs_ioctl_vol_args *vol_args;
479 int ret;
480
481 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
482
483 if (!vol_args)
484 return -ENOMEM;
485
486 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
487 ret = -EFAULT;
488 goto out;
489 }
490 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
491 ret = btrfs_init_new_device(root, vol_args->name);
492
493out:
494 kfree(vol_args);
495 return ret;
496}
497
498long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
499{
500 struct btrfs_ioctl_vol_args *vol_args;
501 int ret;
502
503 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
504
505 if (!vol_args)
506 return -ENOMEM;
507
508 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
509 ret = -EFAULT;
510 goto out;
511 }
512 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
513 ret = btrfs_rm_device(root, vol_args->name);
514
515out:
516 kfree(vol_args);
517 return ret;
518}
519
520long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
521{
522 struct inode *inode = fdentry(file)->d_inode;
523 struct btrfs_root *root = BTRFS_I(inode)->root;
524 struct file *src_file;
525 struct inode *src;
526 struct btrfs_trans_handle *trans;
527 struct btrfs_path *path;
528 struct extent_buffer *leaf;
529 char *buf;
530 struct btrfs_key key;
531 u32 nritems;
532 int slot;
533 int ret;
534
535 src_file = fget(src_fd);
536 if (!src_file)
537 return -EBADF;
538 src = src_file->f_dentry->d_inode;
539
540 ret = -EISDIR;
541 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
542 goto out_fput;
543
544 ret = -EXDEV;
545 if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
546 goto out_fput;
547
548 ret = -ENOMEM;
549 buf = vmalloc(btrfs_level_size(root, 0));
550 if (!buf)
551 goto out_fput;
552
553 path = btrfs_alloc_path();
554 if (!path) {
555 vfree(buf);
556 goto out_fput;
557 }
558 path->reada = 2;
559
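	/* always lock the lower-addressed inode first to avoid ABBA deadlocks */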
560 if (inode < src) {
561 mutex_lock(&inode->i_mutex);
562 mutex_lock(&src->i_mutex);
563 } else {
564 mutex_lock(&src->i_mutex);
565 mutex_lock(&inode->i_mutex);
566 }
567
568 ret = -ENOTEMPTY;
569 if (inode->i_size)
570 goto out_unlock;
571
572 /* do any pending delalloc/csum calc on src, one way or
573 another, and lock file content */
574 while (1) {
575 struct btrfs_ordered_extent *ordered;
576 lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
577		ordered = btrfs_lookup_first_ordered_extent(src, (u64)-1);
578 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
579 break;
580 unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
581 if (ordered)
582 btrfs_put_ordered_extent(ordered);
583 btrfs_wait_ordered_range(src, 0, (u64)-1);
584 }
585
586 trans = btrfs_start_transaction(root, 1);
587 BUG_ON(!trans);
588
589 key.objectid = src->i_ino;
590 key.type = BTRFS_EXTENT_DATA_KEY;
591 key.offset = 0;
592
593 while (1) {
594 /*
595 * note the key will change type as we walk through the
596 * tree.
597 */
598 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
599 if (ret < 0)
600 goto out;
601
602 nritems = btrfs_header_nritems(path->nodes[0]);
603 if (path->slots[0] >= nritems) {
604 ret = btrfs_next_leaf(root, path);
605 if (ret < 0)
606 goto out;
607 if (ret > 0)
608 break;
609 nritems = btrfs_header_nritems(path->nodes[0]);
610 }
611 leaf = path->nodes[0];
612 slot = path->slots[0];
613
614 btrfs_item_key_to_cpu(leaf, &key, slot);
615 if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
616 key.objectid != src->i_ino)
617 break;
618
619 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY ||
620 btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
621 u32 size;
622 struct btrfs_key new_key;
623
624 size = btrfs_item_size_nr(leaf, slot);
625 read_extent_buffer(leaf, buf,
626 btrfs_item_ptr_offset(leaf, slot),
627 size);
628 btrfs_release_path(root, path);
629
630 memcpy(&new_key, &key, sizeof(new_key));
631 new_key.objectid = inode->i_ino;
632 ret = btrfs_insert_empty_item(trans, root, path,
633 &new_key, size);
634 if (ret)
635 goto out;
636
637 leaf = path->nodes[0];
638 slot = path->slots[0];
639 write_extent_buffer(leaf, buf,
640 btrfs_item_ptr_offset(leaf, slot),
641 size);
642 btrfs_mark_buffer_dirty(leaf);
643 }
644
645 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
646 struct btrfs_file_extent_item *extent;
647 int found_type;
648
649 extent = btrfs_item_ptr(leaf, slot,
650 struct btrfs_file_extent_item);
651 found_type = btrfs_file_extent_type(leaf, extent);
652 if (found_type == BTRFS_FILE_EXTENT_REG) {
653 u64 ds = btrfs_file_extent_disk_bytenr(leaf,
654 extent);
655 u64 dl = btrfs_file_extent_disk_num_bytes(leaf,
656 extent);
657 /* ds == 0 means there's a hole */
658 if (ds != 0) {
659 ret = btrfs_inc_extent_ref(trans, root,
660 ds, dl, leaf->start,
661 root->root_key.objectid,
662 trans->transid,
663 inode->i_ino, key.offset);
664 BUG_ON(ret);
665 }
666 }
667 }
668 btrfs_release_path(root, path);
669 key.offset++;
670 }
671 ret = 0;
672out:
673 btrfs_release_path(root, path);
674 if (ret == 0) {
675 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
676 inode->i_blocks = src->i_blocks;
677 btrfs_i_size_write(inode, src->i_size);
678 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
679 ret = btrfs_update_inode(trans, root, inode);
680 }
681 btrfs_end_transaction(trans, root);
682 unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
683 if (ret)
684 vmtruncate(inode, 0);
685out_unlock:
686 mutex_unlock(&src->i_mutex);
687 mutex_unlock(&inode->i_mutex);
688 vfree(buf);
689 btrfs_free_path(path);
690out_fput:
691 fput(src_file);
692 return ret;
693}
694
695/*
696 * there are many ways the trans_start and trans_end ioctls can lead
697 * to deadlocks. They should only be used by applications that
698 * basically own the machine, and have a very in-depth understanding
699 * of all the possible deadlocks and enospc problems.
700 */
701long btrfs_ioctl_trans_start(struct file *file)
702{
703 struct inode *inode = fdentry(file)->d_inode;
704 struct btrfs_root *root = BTRFS_I(inode)->root;
705 struct btrfs_trans_handle *trans;
706 int ret = 0;
707
708 if (!capable(CAP_SYS_ADMIN))
709 return -EPERM;
710
711 if (file->private_data) {
712 ret = -EINPROGRESS;
713 goto out;
714 }
715
716 mutex_lock(&root->fs_info->trans_mutex);
717 root->fs_info->open_ioctl_trans++;
718 mutex_unlock(&root->fs_info->trans_mutex);
719
720 trans = btrfs_start_ioctl_transaction(root, 0);
721 if (trans)
722 file->private_data = trans;
723 else
724 ret = -ENOMEM;
726out:
727 return ret;
728}
729
730/*
731 * there are many ways the trans_start and trans_end ioctls can lead
732 * to deadlocks. They should only be used by applications that
733 * basically own the machine, and have a very in-depth understanding
734 * of all the possible deadlocks and enospc problems.
735 */
736long btrfs_ioctl_trans_end(struct file *file)
737{
738 struct inode *inode = fdentry(file)->d_inode;
739 struct btrfs_root *root = BTRFS_I(inode)->root;
740 struct btrfs_trans_handle *trans;
741 int ret = 0;
742
743 trans = file->private_data;
744 if (!trans) {
745 ret = -EINVAL;
746 goto out;
747 }
748 btrfs_end_transaction(trans, root);
749 file->private_data = NULL;
750
751 mutex_lock(&root->fs_info->trans_mutex);
752 root->fs_info->open_ioctl_trans--;
753 mutex_unlock(&root->fs_info->trans_mutex);
754
755out:
756 return ret;
757}
758
759long btrfs_ioctl(struct file *file, unsigned int
760 cmd, unsigned long arg)
761{
762 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
763
764 switch (cmd) {
765 case BTRFS_IOC_SNAP_CREATE:
766 return btrfs_ioctl_snap_create(root, (void __user *)arg);
767 case BTRFS_IOC_DEFRAG:
768 return btrfs_ioctl_defrag(file);
769 case BTRFS_IOC_RESIZE:
770 return btrfs_ioctl_resize(root, (void __user *)arg);
771 case BTRFS_IOC_ADD_DEV:
772 return btrfs_ioctl_add_dev(root, (void __user *)arg);
773 case BTRFS_IOC_RM_DEV:
774 return btrfs_ioctl_rm_dev(root, (void __user *)arg);
775 case BTRFS_IOC_BALANCE:
776 return btrfs_balance(root->fs_info->dev_root);
777 case BTRFS_IOC_CLONE:
778 return btrfs_ioctl_clone(file, arg);
779 case BTRFS_IOC_TRANS_START:
780 return btrfs_ioctl_trans_start(file);
781 case BTRFS_IOC_TRANS_END:
782 return btrfs_ioctl_trans_end(file);
783 case BTRFS_IOC_SYNC:
784 btrfs_start_delalloc_inodes(root);
785 btrfs_sync_fs(file->f_dentry->d_sb, 1);
786 return 0;
787 }
788
789 return -ENOTTY;
790}
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
new file mode 100644
index 000000000000..85ed35a775b1
--- /dev/null
+++ b/fs/btrfs/ioctl.h
@@ -0,0 +1,55 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __IOCTL_
20#define __IOCTL_
21#include <linux/ioctl.h>
22
23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255
25#define BTRFS_PATH_NAME_MAX 4095
26
27struct btrfs_ioctl_vol_args {
28 char name[BTRFS_PATH_NAME_MAX + 1];
29};
30
31#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
32 struct btrfs_ioctl_vol_args)
33#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
34 struct btrfs_ioctl_vol_args)
35#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
36 struct btrfs_ioctl_vol_args)
37#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
38 struct btrfs_ioctl_vol_args)
39/* trans start and trans end are dangerous, and only for
40 * use by applications that know how to avoid the
41 * resulting deadlocks
42 */
43#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
44#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
45#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
46
47#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
48#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
49 struct btrfs_ioctl_vol_args)
50#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
51 struct btrfs_ioctl_vol_args)
52#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
53 struct btrfs_ioctl_vol_args)
54
55#endif
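To make the command encodings concrete, here is a small stand-alone sketch that creates a snapshot through BTRFS_IOC_SNAP_CREATE; the mount point and snapshot name are placeholders, and the struct and macro restate the definitions above so the example compiles on its own:

#include <fcntl.h>
#include <linux/ioctl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_PATH_NAME_MAX 4095

struct btrfs_ioctl_vol_args {
	char name[BTRFS_PATH_NAME_MAX + 1];
};

#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
				   struct btrfs_ioctl_vol_args)

int main(void)
{
	struct btrfs_ioctl_vol_args args;
	int fd = open("/mnt/btrfs", O_RDONLY);	/* placeholder mount point */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&args, 0, sizeof(args));
	strncpy(args.name, "snap1", BTRFS_PATH_NAME_MAX);	/* placeholder name */
	if (ioctl(fd, BTRFS_IOC_SNAP_CREATE, &args) < 0)
		perror("BTRFS_IOC_SNAP_CREATE");
	close(fd);
	return 0;
}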
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 000000000000..0cc314c10d66
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,74 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h>
21#include <linux/spinlock.h>
22#include <linux/page-flags.h>
23#include <asm/bug.h>
24#include "ctree.h"
25#include "extent_io.h"
26#include "locking.h"
27
28int btrfs_tree_lock(struct extent_buffer *eb)
29{
30 int i;
31
32 if (mutex_trylock(&eb->mutex))
33 return 0;
34 for (i = 0; i < 512; i++) {
35 cpu_relax();
36 if (mutex_trylock(&eb->mutex))
37 return 0;
38 }
39 cpu_relax();
40 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
41 return 0;
42}
43
44int btrfs_try_tree_lock(struct extent_buffer *eb)
45{
46 return mutex_trylock(&eb->mutex);
47}
48
49int btrfs_tree_unlock(struct extent_buffer *eb)
50{
51 mutex_unlock(&eb->mutex);
52 return 0;
53}
54
55int btrfs_tree_locked(struct extent_buffer *eb)
56{
57 return mutex_is_locked(&eb->mutex);
58}
59
60int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
61{
62 int i;
63 struct extent_buffer *eb;
64 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
65 eb = path->nodes[i];
66 if (!eb)
67 break;
68 smp_mb();
69 if (!list_empty(&eb->mutex.wait_list))
70 return 1;
71 }
72 return 0;
73}
74
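btrfs_tree_lock() above is an adaptive lock: it spins a bounded number of times on the assumption that tree locks protect short, CPU-bound critical sections, then falls back to a blocking acquire. A rough user-space analog, assuming pthreads, with sched_yield() standing in for cpu_relax():

#include <pthread.h>
#include <sched.h>

/* spin briefly hoping the holder releases soon, then block */
static void spin_then_lock(pthread_mutex_t *m)
{
	int i;

	for (i = 0; i < 512; i++) {
		if (pthread_mutex_trylock(m) == 0)
			return;
		sched_yield();	/* stand-in for cpu_relax() */
	}
	pthread_mutex_lock(m);
}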
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 000000000000..bc1faef12519
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_LOCKING_
20#define __BTRFS_LOCKING_
21
22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_tree_locked(struct extent_buffer *eb);
25int btrfs_try_tree_lock(struct extent_buffer *eb);
26int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
27#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 000000000000..da6d43eb41db
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,709 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/gfp.h>
20#include <linux/slab.h>
21#include <linux/blkdev.h>
22#include <linux/writeback.h>
23#include <linux/pagevec.h>
24#include "ctree.h"
25#include "transaction.h"
26#include "btrfs_inode.h"
27#include "extent_io.h"
28
29
30static u64 entry_end(struct btrfs_ordered_extent *entry)
31{
32 if (entry->file_offset + entry->len < entry->file_offset)
33 return (u64)-1;
34 return entry->file_offset + entry->len;
35}
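entry_end() returns the exclusive end of an extent, clamped to the largest u64 on overflow so an extent reaching the top of the file address space still compares sanely. A stand-alone model of the clamp:

#include <assert.h>
#include <stdint.h>

static uint64_t entry_end(uint64_t file_offset, uint64_t len)
{
	if (file_offset + len < file_offset)	/* u64 wrap-around */
		return UINT64_MAX;
	return file_offset + len;
}

int main(void)
{
	assert(entry_end(0, 4096) == 4096);
	assert(entry_end(UINT64_MAX - 1, 4096) == UINT64_MAX);
	return 0;
}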
36
37static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
38 struct rb_node *node)
39{
40 struct rb_node ** p = &root->rb_node;
41 struct rb_node * parent = NULL;
42 struct btrfs_ordered_extent *entry;
43
44 while(*p) {
45 parent = *p;
46 entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
47
48 if (file_offset < entry->file_offset)
49 p = &(*p)->rb_left;
50 else if (file_offset >= entry_end(entry))
51 p = &(*p)->rb_right;
52 else
53 return parent;
54 }
55
56 rb_link_node(node, parent, p);
57 rb_insert_color(node, root);
58 return NULL;
59}
60
61static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
62 struct rb_node **prev_ret)
63{
64 struct rb_node * n = root->rb_node;
65 struct rb_node *prev = NULL;
66 struct rb_node *test;
67 struct btrfs_ordered_extent *entry;
68 struct btrfs_ordered_extent *prev_entry = NULL;
69
70 while(n) {
71 entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
72 prev = n;
73 prev_entry = entry;
74
75 if (file_offset < entry->file_offset)
76 n = n->rb_left;
77 else if (file_offset >= entry_end(entry))
78 n = n->rb_right;
79 else
80 return n;
81 }
82 if (!prev_ret)
83 return NULL;
84
85 while(prev && file_offset >= entry_end(prev_entry)) {
86 test = rb_next(prev);
87 if (!test)
88 break;
89 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
90 rb_node);
91 if (file_offset < entry_end(prev_entry))
92 break;
93
94 prev = test;
95 }
96 if (prev)
97 prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
98 rb_node);
99 while(prev && file_offset < entry_end(prev_entry)) {
100 test = rb_prev(prev);
101 if (!test)
102 break;
103 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
104 rb_node);
105 prev = test;
106 }
107 *prev_ret = prev;
108 return NULL;
109}
110
111static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
112{
113 if (file_offset < entry->file_offset ||
114 entry->file_offset + entry->len <= file_offset)
115 return 0;
116 return 1;
117}
118
119static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
120 u64 file_offset)
121{
122 struct rb_root *root = &tree->tree;
123 struct rb_node *prev;
124 struct rb_node *ret;
125 struct btrfs_ordered_extent *entry;
126
127 if (tree->last) {
128 entry = rb_entry(tree->last, struct btrfs_ordered_extent,
129 rb_node);
130 if (offset_in_entry(entry, file_offset))
131 return tree->last;
132 }
133 ret = __tree_search(root, file_offset, &prev);
134 if (!ret)
135 ret = prev;
136 if (ret)
137 tree->last = ret;
138 return ret;
139}
140
141/* allocate and add a new ordered_extent into the per-inode tree.
142 * file_offset is the logical offset in the file
143 *
144 * start is the disk block number of an extent already reserved in the
145 * extent allocation tree
146 *
147 * len is the length of the extent
148 *
149 * This also sets the EXTENT_ORDERED bit on the range in the inode.
150 *
151 * The tree is given a single reference on the ordered extent that was
152 * inserted.
153 */
154int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
155 u64 start, u64 len, int nocow)
156{
157 struct btrfs_ordered_inode_tree *tree;
158 struct rb_node *node;
159 struct btrfs_ordered_extent *entry;
160
161 tree = &BTRFS_I(inode)->ordered_tree;
162 entry = kzalloc(sizeof(*entry), GFP_NOFS);
163 if (!entry)
164 return -ENOMEM;
165
166 mutex_lock(&tree->mutex);
167 entry->file_offset = file_offset;
168 entry->start = start;
169 entry->len = len;
170 entry->inode = inode;
171 if (nocow)
172 set_bit(BTRFS_ORDERED_NOCOW, &entry->flags);
173
174 /* one ref for the tree */
175 atomic_set(&entry->refs, 1);
176 init_waitqueue_head(&entry->wait);
177 INIT_LIST_HEAD(&entry->list);
178 INIT_LIST_HEAD(&entry->root_extent_list);
179
180 node = tree_insert(&tree->tree, file_offset,
181 &entry->rb_node);
182 if (node) {
183 printk(KERN_WARNING "btrfs: duplicate entry in add_ordered_extent\n");
184 BUG();
185 }
186 set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
187 entry_end(entry) - 1, GFP_NOFS);
188
189 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
190 list_add_tail(&entry->root_extent_list,
191 &BTRFS_I(inode)->root->fs_info->ordered_extents);
192 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
193
194 mutex_unlock(&tree->mutex);
196 return 0;
197}
198
199/*
200 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
201 * when an ordered extent is finished. If the list covers more than one
202 * ordered extent, it is split across multiple ordered extents.
203 */
204int btrfs_add_ordered_sum(struct inode *inode,
205 struct btrfs_ordered_extent *entry,
206 struct btrfs_ordered_sum *sum)
207{
208 struct btrfs_ordered_inode_tree *tree;
209
210 tree = &BTRFS_I(inode)->ordered_tree;
211 mutex_lock(&tree->mutex);
212 list_add_tail(&sum->list, &entry->list);
213 mutex_unlock(&tree->mutex);
214 return 0;
215}
216
217/*
218 * this is used to account for finished IO across a given range
219 * of the file. The IO should not span ordered extents. If
220 * a given ordered_extent is completely done, 1 is returned, otherwise
221 * 0.
222 *
223 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
224 * to make sure this function only returns 1 once for a given ordered extent.
225 */
226int btrfs_dec_test_ordered_pending(struct inode *inode,
227 u64 file_offset, u64 io_size)
228{
229 struct btrfs_ordered_inode_tree *tree;
230 struct rb_node *node;
231 struct btrfs_ordered_extent *entry;
232 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
233 int ret;
234
235 tree = &BTRFS_I(inode)->ordered_tree;
236 mutex_lock(&tree->mutex);
237 clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
238 GFP_NOFS);
239 node = tree_search(tree, file_offset);
240 if (!node) {
241 ret = 1;
242 goto out;
243 }
244
245 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
246 if (!offset_in_entry(entry, file_offset)) {
247 ret = 1;
248 goto out;
249 }
250
251 ret = test_range_bit(io_tree, entry->file_offset,
252 entry->file_offset + entry->len - 1,
253 EXTENT_ORDERED, 0);
254 if (ret == 0)
255 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
256out:
257 mutex_unlock(&tree->mutex);
258 return ret == 0;
259}
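A toy model of the accounting above, with a 32-bit mask standing in for the per-range EXTENT_ORDERED bits and a flag standing in for BTRFS_ORDERED_IO_DONE, showing why the function reports completion exactly once per extent:

#include <assert.h>
#include <stdint.h>

struct extent_state {
	uint32_t pending_sectors;	/* one bit per outstanding sector */
	int done_reported;
};

/* clear the finished range; report completion once, when nothing
 * in the extent is still pending */
static int dec_test_pending(struct extent_state *e, uint32_t done_mask)
{
	e->pending_sectors &= ~done_mask;
	if (e->pending_sectors)
		return 0;
	if (e->done_reported)
		return 0;
	e->done_reported = 1;
	return 1;
}

int main(void)
{
	struct extent_state e = { 0x3, 0 };

	assert(dec_test_pending(&e, 0x1) == 0);	/* half done */
	assert(dec_test_pending(&e, 0x2) == 1);	/* reported exactly once */
	assert(dec_test_pending(&e, 0x2) == 0);
	return 0;
}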
260
261/*
262 * used to drop a reference on an ordered extent. This will free
263 * the extent if the last reference is dropped
264 */
265int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
266{
267 struct list_head *cur;
268 struct btrfs_ordered_sum *sum;
269
270 if (atomic_dec_and_test(&entry->refs)) {
271 while(!list_empty(&entry->list)) {
272 cur = entry->list.next;
273 sum = list_entry(cur, struct btrfs_ordered_sum, list);
274 list_del(&sum->list);
275 kfree(sum);
276 }
277 kfree(entry);
278 }
279 return 0;
280}
281
282/*
283 * remove an ordered extent from the tree. No references are dropped
284 * but anyone waiting on this extent is woken up.
285 */
286int btrfs_remove_ordered_extent(struct inode *inode,
287 struct btrfs_ordered_extent *entry)
288{
289 struct btrfs_ordered_inode_tree *tree;
290 struct rb_node *node;
291
292 tree = &BTRFS_I(inode)->ordered_tree;
293 mutex_lock(&tree->mutex);
294 node = &entry->rb_node;
295 rb_erase(node, &tree->tree);
296 tree->last = NULL;
297 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
298
299 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
300 list_del_init(&entry->root_extent_list);
301 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
302
303 mutex_unlock(&tree->mutex);
304 wake_up(&entry->wait);
305 return 0;
306}
307
308int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
309{
310 struct list_head splice;
311 struct list_head *cur;
313 struct btrfs_ordered_extent *ordered;
314 struct inode *inode;
315
316 INIT_LIST_HEAD(&splice);
317
318 spin_lock(&root->fs_info->ordered_extent_lock);
319 list_splice_init(&root->fs_info->ordered_extents, &splice);
320 while (!list_empty(&splice)) {
321 cur = splice.next;
322 ordered = list_entry(cur, struct btrfs_ordered_extent,
323 root_extent_list);
324 if (nocow_only &&
325 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
326 list_move(&ordered->root_extent_list,
327 &root->fs_info->ordered_extents);
328 cond_resched_lock(&root->fs_info->ordered_extent_lock);
329 continue;
330 }
329
330 list_del_init(&ordered->root_extent_list);
331 atomic_inc(&ordered->refs);
332 inode = ordered->inode;
333
334 /*
335 * the inode can't go away until all the pages are gone
336 * and the pages won't go away while there is still
337 * an ordered extent and the ordered extent won't go
338 * away until it is off this list. So, we can safely
339 * increment i_count here and call iput later
340 */
341 atomic_inc(&inode->i_count);
342 spin_unlock(&root->fs_info->ordered_extent_lock);
343
344 btrfs_start_ordered_extent(inode, ordered, 1);
345 btrfs_put_ordered_extent(ordered);
346 iput(inode);
347
348 spin_lock(&root->fs_info->ordered_extent_lock);
349 }
350 list_splice_init(&splice, &root->fs_info->ordered_extents);
351 spin_unlock(&root->fs_info->ordered_extent_lock);
352 return 0;
353}
354
355/*
356 * Used to start IO or wait for a given ordered extent to finish.
357 *
358 * If wait is one, this effectively waits on page writeback for all the pages
359 * in the extent, and it waits on the io completion code to insert
360 * metadata into the btree corresponding to the extent
361 */
362void btrfs_start_ordered_extent(struct inode *inode,
363 struct btrfs_ordered_extent *entry,
364 int wait)
365{
366 u64 start = entry->file_offset;
367 u64 end = start + entry->len - 1;
368
369 /*
370 * pages in the range can be dirty, clean or writeback. We
371 * start IO on any dirty ones so the wait doesn't stall waiting
372 * for pdflush to find them
373 */
374 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
375 if (wait)
376 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
377 &entry->flags));
378}
379
380/*
381 * Used to wait on ordered extents across a large range of bytes.
382 */
383void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
384{
385 u64 end;
386 u64 orig_end;
387 u64 wait_end;
388 struct btrfs_ordered_extent *ordered;
389
390 if (start + len < start) {
391 orig_end = INT_LIMIT(loff_t);
392 } else {
393 orig_end = start + len - 1;
394 if (orig_end > INT_LIMIT(loff_t))
395 orig_end = INT_LIMIT(loff_t);
396 }
397 wait_end = orig_end;
398again:
399 /* start IO across the range first to instantiate any delalloc
400 * extents
401 */
402 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
403
404 btrfs_wait_on_page_writeback_range(inode->i_mapping,
405 start >> PAGE_CACHE_SHIFT,
406 orig_end >> PAGE_CACHE_SHIFT);
407
408 end = orig_end;
409 while(1) {
410 ordered = btrfs_lookup_first_ordered_extent(inode, end);
411 if (!ordered) {
412 break;
413 }
414 if (ordered->file_offset > orig_end) {
415 btrfs_put_ordered_extent(ordered);
416 break;
417 }
418 if (ordered->file_offset + ordered->len < start) {
419 btrfs_put_ordered_extent(ordered);
420 break;
421 }
422 btrfs_start_ordered_extent(inode, ordered, 1);
423 end = ordered->file_offset;
424 btrfs_put_ordered_extent(ordered);
425 if (end == 0 || end == start)
426 break;
427 end--;
428 }
429 if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
430 EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
431 printk(KERN_ERR "inode %lu still ordered or delalloc after wait "
432 "%llu %llu\n", inode->i_ino,
433 (unsigned long long)start,
434 (unsigned long long)orig_end);
435 goto again;
436 }
437}
438
439/*
440 * find an ordered extent corresponding to file_offset. return NULL if
441 * nothing is found, otherwise take a reference on the extent and return it
442 */
443struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
444 u64 file_offset)
445{
446 struct btrfs_ordered_inode_tree *tree;
447 struct rb_node *node;
448 struct btrfs_ordered_extent *entry = NULL;
449
450 tree = &BTRFS_I(inode)->ordered_tree;
451 mutex_lock(&tree->mutex);
452 node = tree_search(tree, file_offset);
453 if (!node)
454 goto out;
455
456 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
457 if (!offset_in_entry(entry, file_offset))
458 entry = NULL;
459 if (entry)
460 atomic_inc(&entry->refs);
461out:
462 mutex_unlock(&tree->mutex);
463 return entry;
464}
465
466/*
467 * lookup and return any extent before 'file_offset'. NULL is returned
468 * if none is found
469 */
470struct btrfs_ordered_extent *
471btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset)
472{
473 struct btrfs_ordered_inode_tree *tree;
474 struct rb_node *node;
475 struct btrfs_ordered_extent *entry = NULL;
476
477 tree = &BTRFS_I(inode)->ordered_tree;
478 mutex_lock(&tree->mutex);
479 node = tree_search(tree, file_offset);
480 if (!node)
481 goto out;
482
483 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
484 atomic_inc(&entry->refs);
485out:
486 mutex_unlock(&tree->mutex);
487 return entry;
488}
489
490/*
491 * After an extent is done, call this to conditionally update the on disk
492 * i_size. i_size is updated to cover any fully written part of the file.
493 */
494int btrfs_ordered_update_i_size(struct inode *inode,
495 struct btrfs_ordered_extent *ordered)
496{
497 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
498 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
499 u64 disk_i_size;
500 u64 new_i_size;
501 u64 i_size_test;
502 struct rb_node *node;
503 struct btrfs_ordered_extent *test;
504
505 mutex_lock(&tree->mutex);
506 disk_i_size = BTRFS_I(inode)->disk_i_size;
507
508 /*
509 * if the disk i_size is already at the inode->i_size, or
510 * this ordered extent is inside the disk i_size, we're done
511 */
512 if (disk_i_size >= inode->i_size ||
513 ordered->file_offset + ordered->len <= disk_i_size) {
514 goto out;
515 }
516
517 /*
518 * we can't update the disk_i_size if there are delalloc bytes
519 * between disk_i_size and this ordered extent
520 */
521 if (test_range_bit(io_tree, disk_i_size,
522 ordered->file_offset + ordered->len - 1,
523 EXTENT_DELALLOC, 0)) {
524 goto out;
525 }
526 /*
527 * walk backward from this ordered extent to disk_i_size.
528 * if we find an ordered extent then we can't update disk i_size
529 * yet
530 */
531 node = &ordered->rb_node;
532 while(1) {
533 node = rb_prev(node);
534 if (!node)
535 break;
536 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
537 if (test->file_offset + test->len <= disk_i_size)
538 break;
539 if (test->file_offset >= inode->i_size)
540 break;
541 if (test->file_offset >= disk_i_size)
542 goto out;
543 }
544 new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
545
546 /*
547 * at this point, we know we can safely update i_size to at least
548 * the offset from this ordered extent. But, we need to
549 * walk forward and see if ios from higher up in the file have
550 * finished.
551 */
552 node = rb_next(&ordered->rb_node);
553 i_size_test = 0;
554 if (node) {
555 /*
556 * do we have an area where IO might have finished
557 * between our ordered extent and the next one.
558 */
559 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
560 if (test->file_offset > entry_end(ordered)) {
561 i_size_test = test->file_offset;
562 }
563 } else {
564 i_size_test = i_size_read(inode);
565 }
566
567 /*
568 * i_size_test is the end of a region after this ordered
569 * extent where there are no ordered extents. As long as there
570 * are no delalloc bytes in this area, it is safe to update
571 * disk_i_size to the end of the region.
572 */
573 if (i_size_test > entry_end(ordered) &&
574 !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
575 EXTENT_DELALLOC, 0)) {
576 new_i_size = min_t(u64, i_size_test, i_size_read(inode));
577 }
578 BTRFS_I(inode)->disk_i_size = new_i_size;
579out:
580 mutex_unlock(&tree->mutex);
581 return 0;
582}
583
584/*
585 * search the ordered extents for one corresponding to 'offset' and
586 * try to find a checksum. This is used because we allow pages to
587 * be reclaimed before their checksum is actually put into the btree
588 */
589int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
590{
591 struct btrfs_ordered_sum *ordered_sum;
592 struct btrfs_sector_sum *sector_sums;
593 struct btrfs_ordered_extent *ordered;
594 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
595 struct list_head *cur;
596 unsigned long num_sectors;
597 unsigned long i;
598 u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
599 int ret = 1;
600
601 ordered = btrfs_lookup_ordered_extent(inode, offset);
602 if (!ordered)
603 return 1;
604
605 mutex_lock(&tree->mutex);
606 list_for_each_prev(cur, &ordered->list) {
607 ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
608 if (offset >= ordered_sum->file_offset) {
609 num_sectors = ordered_sum->len / sectorsize;
610 sector_sums = ordered_sum->sums;
611 for (i = 0; i < num_sectors; i++) {
612 if (sector_sums[i].offset == offset) {
613 *sum = sector_sums[i].sum;
614 ret = 0;
615 goto out;
616 }
617 }
618 }
619 }
620out:
621 mutex_unlock(&tree->mutex);
622 btrfs_put_ordered_extent(ordered);
623 return ret;
624}
625
626
627/**
628 * taken from mm/filemap.c because it isn't exported
629 *
630 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
631 * @mapping: address space structure to write
632 * @start: offset in bytes where the range starts
633 * @end: offset in bytes where the range ends (inclusive)
634 * @sync_mode: enable synchronous operation
635 *
636 * Start writeback against all of a mapping's dirty pages that lie
637 * within the byte offsets <start, end> inclusive.
638 *
639 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
640 * opposed to a regular memory cleansing writeback. The difference between
641 * these two operations is that if a dirty page/buffer is encountered, it must
642 * be waited upon, and not just skipped over.
643 */
644int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
645 loff_t end, int sync_mode)
646{
647 struct writeback_control wbc = {
648 .sync_mode = sync_mode,
649 .nr_to_write = mapping->nrpages * 2,
650 .range_start = start,
651 .range_end = end,
652 .for_writepages = 1,
653 };
654 return btrfs_writepages(mapping, &wbc);
655}
656
657/**
658 * taken from mm/filemap.c because it isn't exported
659 *
660 * wait_on_page_writeback_range - wait for writeback to complete
661 * @mapping: target address_space
662 * @start: beginning page index
663 * @end: ending page index
664 *
665 * Wait for writeback to complete against pages indexed by start->end
666 * inclusive
667 */
668int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
669 pgoff_t start, pgoff_t end)
670{
671 struct pagevec pvec;
672 int nr_pages;
673 int ret = 0;
674 pgoff_t index;
675
676 if (end < start)
677 return 0;
678
679 pagevec_init(&pvec, 0);
680 index = start;
681 while ((index <= end) &&
682 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
683 PAGECACHE_TAG_WRITEBACK,
684 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
685 unsigned i;
686
687 for (i = 0; i < nr_pages; i++) {
688 struct page *page = pvec.pages[i];
689
690 /* until radix tree lookup accepts end_index */
691 if (page->index > end)
692 continue;
693
694 wait_on_page_writeback(page);
695 if (PageError(page))
696 ret = -EIO;
697 }
698 pagevec_release(&pvec);
699 cond_resched();
700 }
701
702 /* Check for outstanding write errors */
703 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
704 ret = -ENOSPC;
705 if (test_and_clear_bit(AS_EIO, &mapping->flags))
706 ret = -EIO;
707
708 return ret;
709}
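These two helpers reimplement "start writeback on a byte range, then wait for it" because the mm versions were not exported. User space has a direct parallel in sync_file_range(2); a minimal Linux-specific sketch:

#define _GNU_SOURCE
#include <fcntl.h>

/* kick off writeback for [start, start + nbytes), then wait for it */
static int flush_range(int fd, off64_t start, off64_t nbytes)
{
	if (sync_file_range(fd, start, nbytes, SYNC_FILE_RANGE_WRITE) < 0)
		return -1;
	return sync_file_range(fd, start, nbytes,
			       SYNC_FILE_RANGE_WAIT_BEFORE |
			       SYNC_FILE_RANGE_WRITE |
			       SYNC_FILE_RANGE_WAIT_AFTER);
}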
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000000..fd45519f30a8
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,149 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_ORDERED_DATA__
20#define __BTRFS_ORDERED_DATA__
21
22/* one of these per inode */
23struct btrfs_ordered_inode_tree {
24 struct mutex mutex;
25 struct rb_root tree;
26 struct rb_node *last;
27};
28
29/*
30 * these are used to collect checksums done just before bios submission.
31 * They are attached via a list into the ordered extent, and
32 * checksum items are inserted into the tree after all the blocks in
33 * the ordered extent are on disk
34 */
35struct btrfs_sector_sum {
36 u64 offset;
37 u32 sum;
38};
39
40struct btrfs_ordered_sum {
41 u64 file_offset;
42 /*
43 * this is the length in bytes covered by the sums array below.
44 * But the sums array may not be contiguous in the file.
45 */
46 unsigned long len;
47 struct list_head list;
48 /* last field is a variable length array of btrfs_sector_sums */
49 struct btrfs_sector_sum sums[];
50};
51
52/*
53 * bits for the flags field:
54 *
55 * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
56 * It is used to make sure metadata is inserted into the tree only once
57 * per extent.
58 *
59 * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
60 * rbtree, just before waking any waiters. It is used to indicate the
61 * IO is done and any metadata is inserted into the tree.
62 */
63#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
64
65#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
66
67#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
68
69struct btrfs_ordered_extent {
70 /* logical offset in the file */
71 u64 file_offset;
72
73 /* disk byte number */
74 u64 start;
75
76 /* length of the extent in bytes */
77 u64 len;
78
79 /* flags (described above) */
80 unsigned long flags;
81
82 /* reference count */
83 atomic_t refs;
84
85 /* the inode we belong to */
86 struct inode *inode;
87
88 /* list of checksums for insertion when the extent io is done */
89 struct list_head list;
90
91 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
92 wait_queue_head_t wait;
93
94 /* our friendly rbtree entry */
95 struct rb_node rb_node;
96
97 /* a per root list of all the pending ordered extents */
98 struct list_head root_extent_list;
99};
100
101
102/*
103 * calculates the total size you need to allocate for an ordered sum
104 * structure spanning 'bytes' in the file
105 */
106static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
107 unsigned long bytes)
108{
109 unsigned long num_sectors = (bytes + root->sectorsize - 1) /
110 root->sectorsize;
111 num_sectors++;
112 return sizeof(struct btrfs_ordered_sum) +
113 num_sectors * sizeof(struct btrfs_sector_sum);
114}
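btrfs_ordered_sum_size() rounds the byte count up to whole sectors and reserves one btrfs_sector_sum slot beyond the rounded count. A stand-alone recomputation of just the slot arithmetic:

#include <assert.h>

static unsigned long sums_needed(unsigned long bytes, unsigned long sectorsize)
{
	unsigned long num_sectors = (bytes + sectorsize - 1) / sectorsize;

	return num_sectors + 1;	/* +1 mirrors the extra slot reserved above */
}

int main(void)
{
	assert(sums_needed(4096, 4096) == 2);	/* one sector plus the spare */
	assert(sums_needed(5000, 4096) == 3);	/* two sectors plus the spare */
	return 0;
}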
115
116static inline void
117btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
118{
119 mutex_init(&t->mutex);
120 t->tree.rb_node = NULL;
121 t->last = NULL;
122}
123
124int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
125int btrfs_remove_ordered_extent(struct inode *inode,
126 struct btrfs_ordered_extent *entry);
127int btrfs_dec_test_ordered_pending(struct inode *inode,
128 u64 file_offset, u64 io_size);
129int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
130 u64 start, u64 len, int nocow);
131int btrfs_add_ordered_sum(struct inode *inode,
132 struct btrfs_ordered_extent *entry,
133 struct btrfs_ordered_sum *sum);
134struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
135 u64 file_offset);
136void btrfs_start_ordered_extent(struct inode *inode,
137 struct btrfs_ordered_extent *entry, int wait);
138void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
139struct btrfs_ordered_extent *
140btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
141int btrfs_ordered_update_i_size(struct inode *inode,
142 struct btrfs_ordered_extent *ordered);
143int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);
144int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
145 pgoff_t start, pgoff_t end);
146int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
147 loff_t end, int sync_mode);
148int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
149#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 000000000000..3c0d52af4f80
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,67 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21
22int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root, u64 offset)
24{
25 struct btrfs_path *path;
26 struct btrfs_key key;
27 int ret = 0;
28
29 key.objectid = BTRFS_ORPHAN_OBJECTID;
30 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
31 key.offset = offset;
32
33 path = btrfs_alloc_path();
34 if (!path)
35 return -ENOMEM;
36
37 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
38
39 btrfs_free_path(path);
40 return ret;
41}
42
43int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
44 struct btrfs_root *root, u64 offset)
45{
46 struct btrfs_path *path;
47 struct btrfs_key key;
48 int ret = 0;
49
50 key.objectid = BTRFS_ORPHAN_OBJECTID;
51 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
52 key.offset = offset;
53
54 path = btrfs_alloc_path();
55 if (!path)
56 return -ENOMEM;
57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret)
60 goto out;
61
62 ret = btrfs_del_item(trans, root, path);
63
64out:
65 btrfs_free_path(path);
66 return ret;
67}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
new file mode 100644
index 000000000000..3577badfa5bc
--- /dev/null
+++ b/fs/btrfs/print-tree.c
@@ -0,0 +1,201 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "print-tree.h"
22
23static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
24{
25 int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
26 int i;
27 printk("\t\tchunk length %llu owner %llu type %llu num_stripes %d\n",
28 (unsigned long long)btrfs_chunk_length(eb, chunk),
29 (unsigned long long)btrfs_chunk_owner(eb, chunk),
30 (unsigned long long)btrfs_chunk_type(eb, chunk),
31 num_stripes);
32 for (i = 0 ; i < num_stripes ; i++) {
33 printk("\t\t\tstripe %d devid %llu offset %llu\n", i,
34 (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
35 (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
36 }
37}
38static void print_dev_item(struct extent_buffer *eb,
39 struct btrfs_dev_item *dev_item)
40{
41 printk("\t\tdev item devid %llu "
42 "total_bytes %llu bytes used %Lu\n",
43 (unsigned long long)btrfs_device_id(eb, dev_item),
44 (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
45 (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
46}
47void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
48{
49 int i;
50 u32 nr = btrfs_header_nritems(l);
51 struct btrfs_item *item;
52 struct btrfs_extent_item *ei;
53 struct btrfs_root_item *ri;
54 struct btrfs_dir_item *di;
55 struct btrfs_inode_item *ii;
56 struct btrfs_block_group_item *bi;
57 struct btrfs_file_extent_item *fi;
58 struct btrfs_key key;
59 struct btrfs_key found_key;
60 struct btrfs_extent_ref *ref;
61 struct btrfs_dev_extent *dev_extent;
62 u32 type;
63
64 printk("leaf %llu total ptrs %d free space %d\n",
65 (unsigned long long)btrfs_header_bytenr(l), nr,
66 btrfs_leaf_free_space(root, l));
67 for (i = 0 ; i < nr ; i++) {
68 item = btrfs_item_nr(l, i);
69 btrfs_item_key_to_cpu(l, &key, i);
70 type = btrfs_key_type(&key);
71 printk("\titem %d key (%llu %x %llu) itemoff %d itemsize %d\n",
72 i,
73 (unsigned long long)key.objectid, type,
74 (unsigned long long)key.offset,
75 btrfs_item_offset(l, item), btrfs_item_size(l, item));
76 switch (type) {
77 case BTRFS_INODE_ITEM_KEY:
78 ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
79 printk("\t\tinode generation %llu size %llu mode %o\n",
80 (unsigned long long)btrfs_inode_generation(l, ii),
81 (unsigned long long)btrfs_inode_size(l, ii),
82 btrfs_inode_mode(l, ii));
83 break;
84 case BTRFS_DIR_ITEM_KEY:
85 di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
86 btrfs_dir_item_key_to_cpu(l, di, &found_key);
87 printk("\t\tdir oid %llu type %u\n",
88 (unsigned long long)found_key.objectid,
89 btrfs_dir_type(l, di));
90 break;
91 case BTRFS_ROOT_ITEM_KEY:
92 ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
93 printk("\t\troot data bytenr %llu refs %u\n",
94 (unsigned long long)btrfs_disk_root_bytenr(l, ri),
95 btrfs_disk_root_refs(l, ri));
96 break;
97 case BTRFS_EXTENT_ITEM_KEY:
98 ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
99 printk("\t\textent data refs %u\n",
100 btrfs_extent_refs(l, ei));
101 break;
102 case BTRFS_EXTENT_REF_KEY:
103 ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
104 printk("\t\textent back ref root %llu gen %llu "
105 "owner %llu offset %llu num_refs %lu\n",
106 (unsigned long long)btrfs_ref_root(l, ref),
107 (unsigned long long)btrfs_ref_generation(l, ref),
108 (unsigned long long)btrfs_ref_objectid(l, ref),
109 (unsigned long long)btrfs_ref_offset(l, ref),
110 (unsigned long)btrfs_ref_num_refs(l, ref));
111 break;
112
113 case BTRFS_EXTENT_DATA_KEY:
114 fi = btrfs_item_ptr(l, i,
115 struct btrfs_file_extent_item);
116 if (btrfs_file_extent_type(l, fi) ==
117 BTRFS_FILE_EXTENT_INLINE) {
118 printk("\t\tinline extent data size %u\n",
119 btrfs_file_extent_inline_len(l, item));
120 break;
121 }
122 printk("\t\textent data disk bytenr %llu nr %llu\n",
123 (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
124 (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
125 printk("\t\textent data offset %llu nr %llu\n",
126 (unsigned long long)btrfs_file_extent_offset(l, fi),
127 (unsigned long long)btrfs_file_extent_num_bytes(l, fi));
128 break;
129 case BTRFS_BLOCK_GROUP_ITEM_KEY:
130 bi = btrfs_item_ptr(l, i,
131 struct btrfs_block_group_item);
132 printk("\t\tblock group used %llu\n",
133 (unsigned long long)btrfs_disk_block_group_used(l, bi));
134 break;
135 case BTRFS_CHUNK_ITEM_KEY:
136 print_chunk(l, btrfs_item_ptr(l, i, struct btrfs_chunk));
137 break;
138 case BTRFS_DEV_ITEM_KEY:
139 print_dev_item(l, btrfs_item_ptr(l, i,
140 struct btrfs_dev_item));
141 break;
142 case BTRFS_DEV_EXTENT_KEY:
143 dev_extent = btrfs_item_ptr(l, i,
144 struct btrfs_dev_extent);
145 printk("\t\tdev extent chunk_tree %llu\n"
146 "\t\tchunk objectid %llu chunk offset %llu "
147 "length %llu\n",
148 (unsigned long long)
149 btrfs_dev_extent_chunk_tree(l, dev_extent),
150 (unsigned long long)
151 btrfs_dev_extent_chunk_objectid(l, dev_extent),
152 (unsigned long long)
153 btrfs_dev_extent_chunk_offset(l, dev_extent),
154 (unsigned long long)
155 btrfs_dev_extent_length(l, dev_extent));
156 }
157 }
158}
159
160void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
161{
162 int i;
163 u32 nr;
163 struct btrfs_key key;
164 int level;
165
166 if (!c)
167 return;
168 nr = btrfs_header_nritems(c);
169 level = btrfs_header_level(c);
170 if (level == 0) {
171 btrfs_print_leaf(root, c);
172 return;
173 }
174 printk("node %llu level %d total ptrs %d free spc %u\n",
175 (unsigned long long)btrfs_header_bytenr(c),
176 btrfs_header_level(c), nr,
177 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
178 for (i = 0; i < nr; i++) {
179 btrfs_node_key_to_cpu(c, &key, i);
180 printk("\tkey %d (%llu %u %llu) block %llu\n",
181 i,
182 (unsigned long long)key.objectid,
183 key.type,
184 (unsigned long long)key.offset,
185 (unsigned long long)btrfs_node_blockptr(c, i));
186 }
187 for (i = 0; i < nr; i++) {
188 struct extent_buffer *next = read_tree_block(root,
189 btrfs_node_blockptr(c, i),
190 btrfs_level_size(root, level - 1),
191 btrfs_node_ptr_generation(c, i));
192 if (btrfs_is_leaf(next) &&
193 btrfs_header_level(c) != 1)
194 BUG();
195 if (btrfs_header_level(next) !=
196 btrfs_header_level(c) - 1)
197 BUG();
198 btrfs_print_tree(root, next);
199 free_extent_buffer(next);
200 }
201}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
new file mode 100644
index 000000000000..da75efe534d5
--- /dev/null
+++ b/fs/btrfs/print-tree.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __PRINT_TREE_
20#define __PRINT_TREE_
21void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
22void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
23#endif
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
new file mode 100644
index 000000000000..272b9890c982
--- /dev/null
+++ b/fs/btrfs/ref-cache.c
@@ -0,0 +1,187 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "ref-cache.h"
22#include "transaction.h"
23
24struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
25 int nr_extents)
26{
27 struct btrfs_leaf_ref *ref;
28 size_t size = btrfs_leaf_ref_size(nr_extents);
29
30 ref = kmalloc(size, GFP_NOFS);
31 if (ref) {
32 spin_lock(&root->fs_info->ref_cache_lock);
33 root->fs_info->total_ref_cache_size += size;
34 spin_unlock(&root->fs_info->ref_cache_lock);
35
36 memset(ref, 0, sizeof(*ref));
37 atomic_set(&ref->usage, 1);
38 INIT_LIST_HEAD(&ref->list);
39 }
40 return ref;
41}
42
43void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
44{
45 if (!ref)
46 return;
47 WARN_ON(atomic_read(&ref->usage) == 0);
48 if (atomic_dec_and_test(&ref->usage)) {
49 size_t size = btrfs_leaf_ref_size(ref->nritems);
50
51 BUG_ON(ref->in_tree);
52 kfree(ref);
53
54 spin_lock(&root->fs_info->ref_cache_lock);
55 root->fs_info->total_ref_cache_size -= size;
56 spin_unlock(&root->fs_info->ref_cache_lock);
57 }
58}
59
60static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
61 struct rb_node *node)
62{
63 struct rb_node ** p = &root->rb_node;
64 struct rb_node * parent = NULL;
65 struct btrfs_leaf_ref *entry;
66
67 while(*p) {
68 parent = *p;
69 entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
70 WARN_ON(!entry->in_tree);
71
72 if (bytenr < entry->bytenr)
73 p = &(*p)->rb_left;
74 else if (bytenr > entry->bytenr)
75 p = &(*p)->rb_right;
76 else
77 return parent;
78 }
79
80 entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
81 entry->in_tree = 1;
82 rb_link_node(node, parent, p);
83 rb_insert_color(node, root);
84 return NULL;
85}
86
87static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
88{
89 struct rb_node * n = root->rb_node;
90 struct btrfs_leaf_ref *entry;
91
92 while(n) {
93 entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
94 WARN_ON(!entry->in_tree);
95
96 if (bytenr < entry->bytenr)
97 n = n->rb_left;
98 else if (bytenr > entry->bytenr)
99 n = n->rb_right;
100 else
101 return n;
102 }
103 return NULL;
104}
105
106int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen)
107{
108 struct btrfs_leaf_ref *ref = NULL;
109 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
110
111 if (!tree)
112 return 0;
113
114 spin_lock(&tree->lock);
115 while(!list_empty(&tree->list)) {
116 ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
117 BUG_ON(!ref->in_tree);
118 if (ref->root_gen > max_root_gen)
119 break;
120
121 rb_erase(&ref->rb_node, &tree->root);
122 ref->in_tree = 0;
123 list_del_init(&ref->list);
124
125 spin_unlock(&tree->lock);
126 btrfs_free_leaf_ref(root, ref);
127 cond_resched();
128 spin_lock(&tree->lock);
129 }
130 spin_unlock(&tree->lock);
131 return 0;
132}
133
134struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
135 u64 bytenr)
136{
137 struct rb_node *rb;
138 struct btrfs_leaf_ref *ref = NULL;
139 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
140
141 if (!tree)
142 return NULL;
143
144 spin_lock(&tree->lock);
145 rb = tree_search(&tree->root, bytenr);
146 if (rb)
147 ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
148 if (ref)
149 atomic_inc(&ref->usage);
150 spin_unlock(&tree->lock);
151 return ref;
152}
153
154int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
155{
156 int ret = 0;
157 struct rb_node *rb;
158 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
159
160 spin_lock(&tree->lock);
161 rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
162 if (rb) {
163 ret = -EEXIST;
164 } else {
165 atomic_inc(&ref->usage);
166 list_add_tail(&ref->list, &tree->list);
167 }
168 spin_unlock(&tree->lock);
169 return ret;
170}
171
172int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
173{
174 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
175
176 BUG_ON(!ref->in_tree);
177 spin_lock(&tree->lock);
178
179 rb_erase(&ref->rb_node, &tree->root);
180 ref->in_tree = 0;
181 list_del_init(&ref->list);
182
183 spin_unlock(&tree->lock);
184
185 btrfs_free_leaf_ref(root, ref);
186 return 0;
187}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
new file mode 100644
index 000000000000..c361b321c0c3
--- /dev/null
+++ b/fs/btrfs/ref-cache.h
@@ -0,0 +1,71 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18#ifndef __REFCACHE__
19#define __REFCACHE__
20
21struct btrfs_extent_info {
22 u64 bytenr;
23 u64 num_bytes;
24 u64 objectid;
25 u64 offset;
26};
27
28struct btrfs_leaf_ref {
29 struct rb_node rb_node;
30 int in_tree;
31 atomic_t usage;
32
33 u64 root_gen;
34 u64 bytenr;
35 u64 owner;
36 u64 generation;
37 int nritems;
38
39 struct list_head list;
40 struct btrfs_extent_info extents[];
41};
42
43static inline size_t btrfs_leaf_ref_size(int nr_extents)
44{
45 return sizeof(struct btrfs_leaf_ref) +
46 sizeof(struct btrfs_extent_info) * nr_extents;
47}
48
49static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
50{
51 tree->root.rb_node = NULL;
52 INIT_LIST_HEAD(&tree->list);
53 spin_lock_init(&tree->lock);
54}
55
56static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
57{
58 return RB_EMPTY_ROOT(&tree->root);
59}
60
62struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
63 int nr_extents);
64void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
65struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
66 u64 bytenr);
67int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
68int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen);
69int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
70
71#endif
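btrfs_leaf_ref ends in a flexible array member, so btrfs_leaf_ref_size() lets the allocator grab the header and every extent slot in one allocation. A stand-alone model of the pattern, with malloc in place of kmalloc:

#include <stdlib.h>
#include <string.h>

struct extent_info {
	unsigned long long bytenr, num_bytes, objectid, offset;
};

struct leaf_ref {
	int nritems;
	struct extent_info extents[];	/* flexible array member */
};

static struct leaf_ref *alloc_leaf_ref(int nr_extents)
{
	/* header plus trailing array, sized in a single allocation */
	struct leaf_ref *ref = malloc(sizeof(*ref) +
				      nr_extents * sizeof(struct extent_info));

	if (ref) {
		memset(ref, 0, sizeof(*ref));	/* zero the header only */
		ref->nritems = nr_extents;
	}
	return ref;
}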
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
new file mode 100644
index 000000000000..e3984f902e71
--- /dev/null
+++ b/fs/btrfs/root-tree.c
@@ -0,0 +1,257 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "transaction.h"
21#include "disk-io.h"
22#include "print-tree.h"
23
24/*
25 * returns 0 on finding something, 1 when there are no more roots,
26 * and < 0 on error
27 */
28int btrfs_search_root(struct btrfs_root *root, u64 search_start,
29 u64 *found_objectid)
30{
31 struct btrfs_path *path;
32 struct btrfs_key search_key;
33 int ret;
34
35 root = root->fs_info->tree_root;
36 search_key.objectid = search_start;
37 search_key.type = (u8)-1;
38 search_key.offset = (u64)-1;
39
40 path = btrfs_alloc_path();
41 BUG_ON(!path);
42again:
43 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
44 if (ret < 0)
45 goto out;
46 if (ret == 0) {
47 ret = 1;
48 goto out;
49 }
50 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
51 ret = btrfs_next_leaf(root, path);
52 if (ret)
53 goto out;
54 }
55 btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
56 if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
57 search_key.offset++;
58 btrfs_release_path(root, path);
59 goto again;
60 }
61 ret = 0;
62 *found_objectid = search_key.objectid;
63
64out:
65 btrfs_free_path(path);
66 return ret;
67}
68
69int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
70 struct btrfs_root_item *item, struct btrfs_key *key)
71{
72 struct btrfs_path *path;
73 struct btrfs_key search_key;
74 struct btrfs_key found_key;
75 struct extent_buffer *l;
76 int ret;
77 int slot;
78
79 search_key.objectid = objectid;
80 search_key.type = (u8)-1;
81 search_key.offset = (u64)-1;
82
83 path = btrfs_alloc_path();
84 BUG_ON(!path);
85 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
86 if (ret < 0)
87 goto out;
88
89 BUG_ON(ret == 0);
90 l = path->nodes[0];
91 BUG_ON(path->slots[0] == 0);
92 slot = path->slots[0] - 1;
93 btrfs_item_key_to_cpu(l, &found_key, slot);
94 if (found_key.objectid != objectid) {
95 ret = 1;
96 goto out;
97 }
98 read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
99 sizeof(*item));
100 memcpy(key, &found_key, sizeof(found_key));
101 ret = 0;
102out:
103 btrfs_free_path(path);
104 return ret;
105}
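btrfs_find_last_root() uses a common btree idiom: search for the largest possible key with the wanted objectid (type and offset of all ones), then step back one slot to land on the newest matching item. The same trick over a sorted array, as a stand-alone sketch:

#include <assert.h>
#include <stdint.h>

/* index of the last key <= want, or -1 when every key is greater */
static int find_last_le(const uint64_t *keys, int n, uint64_t want)
{
	int lo = 0, hi = n;	/* invariant: keys[hi..] > want */

	while (lo < hi) {
		int mid = lo + (hi - lo) / 2;

		if (keys[mid] <= want)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo - 1;		/* step back one slot */
}

int main(void)
{
	uint64_t keys[] = { 5, 5, 7, 9 };

	assert(find_last_le(keys, 4, 8) == 2);
	assert(find_last_le(keys, 4, 4) == -1);
	return 0;
}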
106
107int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
108 *root, struct btrfs_key *key, struct btrfs_root_item
109 *item)
110{
111 struct btrfs_path *path;
112 struct extent_buffer *l;
113 int ret;
114 int slot;
115 unsigned long ptr;
116
117 path = btrfs_alloc_path();
118 BUG_ON(!path);
119 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
120 if (ret < 0)
121 goto out;
122
123 if (ret != 0) {
124 btrfs_print_leaf(root, path->nodes[0]);
125 printk(KERN_CRIT "unable to update root key %Lu %u %Lu\n",
126 key->objectid, key->type, key->offset);
127 BUG_ON(1);
128 }
129
130 l = path->nodes[0];
131 slot = path->slots[0];
132 ptr = btrfs_item_ptr_offset(l, slot);
133 write_extent_buffer(l, item, ptr, sizeof(*item));
134 btrfs_mark_buffer_dirty(path->nodes[0]);
135out:
136 btrfs_release_path(root, path);
137 btrfs_free_path(path);
138 return ret;
139}
140
141int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
142 *root, struct btrfs_key *key, struct btrfs_root_item
143 *item)
144{
145 int ret;
146 ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
147 return ret;
148}
149
150int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
151 struct btrfs_root *latest)
152{
153 struct btrfs_root *dead_root;
154 struct btrfs_item *item;
155 struct btrfs_root_item *ri;
156 struct btrfs_key key;
157 struct btrfs_key found_key;
158 struct btrfs_path *path;
159 int ret;
160 u32 nritems;
161 struct extent_buffer *leaf;
162 int slot;
163
164 key.objectid = objectid;
165 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
166 key.offset = 0;
167 path = btrfs_alloc_path();
168 if (!path)
169 return -ENOMEM;
170
171again:
172 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
173 if (ret < 0)
174 goto err;
175 while(1) {
176 leaf = path->nodes[0];
177 nritems = btrfs_header_nritems(leaf);
178 slot = path->slots[0];
179 if (slot >= nritems) {
180 ret = btrfs_next_leaf(root, path);
181 if (ret)
182 break;
183 leaf = path->nodes[0];
184 nritems = btrfs_header_nritems(leaf);
185 slot = path->slots[0];
186 }
187 item = btrfs_item_nr(leaf, slot);
188 btrfs_item_key_to_cpu(leaf, &key, slot);
189 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
190 goto next;
191
192 if (key.objectid < objectid)
193 goto next;
194
195 if (key.objectid > objectid)
196 break;
197
198 ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
199 if (btrfs_disk_root_refs(leaf, ri) != 0)
200 goto next;
201
202 memcpy(&found_key, &key, sizeof(key));
203 key.offset++;
204 btrfs_release_path(root, path);
205 dead_root =
206 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
207 &found_key);
208 if (IS_ERR(dead_root)) {
209 ret = PTR_ERR(dead_root);
210 goto err;
211 }
212
213 ret = btrfs_add_dead_root(dead_root, latest);
214 if (ret)
215 goto err;
216 goto again;
217next:
218 slot++;
219 path->slots[0]++;
220 }
221 ret = 0;
222err:
223 btrfs_free_path(path);
224 return ret;
225}
226
227int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
228 struct btrfs_key *key)
229{
230 struct btrfs_path *path;
231 int ret;
232 u32 refs;
233 struct btrfs_root_item *ri;
234 struct extent_buffer *leaf;
235
236 path = btrfs_alloc_path();
237 BUG_ON(!path);
238 ret = btrfs_search_slot(trans, root, key, path, -1, 1);
239 if (ret < 0)
240 goto out;
241 if (ret) {
242 btrfs_print_leaf(root, path->nodes[0]);
243 printk(KERN_CRIT "failed to del %Lu %u %Lu\n",
244 key->objectid, key->type, key->offset);
245 }
246 BUG_ON(ret != 0);
247 leaf = path->nodes[0];
248 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
249
250 refs = btrfs_disk_root_refs(leaf, ri);
251 BUG_ON(refs != 0);
252 ret = btrfs_del_item(trans, root, path);
253out:
254 btrfs_release_path(root, path);
255 btrfs_free_path(path);
256 return ret;
257}
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
new file mode 100644
index 000000000000..ad03a32d1116
--- /dev/null
+++ b/fs/btrfs/struct-funcs.c
@@ -0,0 +1,111 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/highmem.h>
20#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
21u##bits btrfs_##name(struct extent_buffer *eb, \
22 type *s) \
23{ \
24 unsigned long part_offset = (unsigned long)s; \
25 unsigned long offset = part_offset + offsetof(type, member); \
26 type *p; \
27 /* ugly, but we want the fast path here */ \
28 if (eb->map_token && offset >= eb->map_start && \
29 offset + sizeof(((type *)0)->member) <= eb->map_start + \
30 eb->map_len) { \
31 p = (type *)(eb->kaddr + part_offset - eb->map_start); \
32 return le##bits##_to_cpu(p->member); \
33 } \
34 { \
35 int err; \
36 char *map_token; \
37 char *kaddr; \
38 int unmap_on_exit = (eb->map_token == NULL); \
39 unsigned long map_start; \
40 unsigned long map_len; \
41 __le##bits res; \
42 err = map_extent_buffer(eb, offset, \
43 sizeof(((type *)0)->member), \
44 &map_token, &kaddr, \
45 &map_start, &map_len, KM_USER1); \
46 if (err) { \
47 read_eb_member(eb, s, type, member, &res); \
48 return le##bits##_to_cpu(res); \
49 } \
50 p = (type *)(kaddr + part_offset - map_start); \
51 res = le##bits##_to_cpu(p->member); \
52 if (unmap_on_exit) \
53 unmap_extent_buffer(eb, map_token, KM_USER1); \
54 return res; \
55 } \
56} \
57void btrfs_set_##name(struct extent_buffer *eb, \
58 type *s, u##bits val) \
59{ \
60 unsigned long part_offset = (unsigned long)s; \
61 unsigned long offset = part_offset + offsetof(type, member); \
62 type *p; \
63 /* ugly, but we want the fast path here */ \
64 if (eb->map_token && offset >= eb->map_start && \
65 offset + sizeof(((type *)0)->member) <= eb->map_start + \
66 eb->map_len) { \
67 p = (type *)(eb->kaddr + part_offset - eb->map_start); \
68 p->member = cpu_to_le##bits(val); \
69 return; \
70 } \
71 { \
72 int err; \
73 char *map_token; \
74 char *kaddr; \
75 int unmap_on_exit = (eb->map_token == NULL); \
76 unsigned long map_start; \
77 unsigned long map_len; \
78 err = map_extent_buffer(eb, offset, \
79 sizeof(((type *)0)->member), \
80 &map_token, &kaddr, \
81 &map_start, &map_len, KM_USER1); \
82 if (err) { \
83 val = cpu_to_le##bits(val); \
84 write_eb_member(eb, s, type, member, &val); \
85 return; \
86 } \
87 p = (type *)(kaddr + part_offset - map_start); \
88 p->member = cpu_to_le##bits(val); \
89 if (unmap_on_exit) \
90 unmap_extent_buffer(eb, map_token, KM_USER1); \
91 } \
92}
93
94#include "ctree.h"
95
96void btrfs_node_key(struct extent_buffer *eb,
97 struct btrfs_disk_key *disk_key, int nr)
98{
99 unsigned long ptr = btrfs_node_key_ptr_offset(nr);
100 if (eb->map_token && ptr >= eb->map_start &&
101 ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
102 memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
103 sizeof(*disk_key));
104 return;
105 } else if (eb->map_token) {
106 unmap_extent_buffer(eb, eb->map_token, KM_USER1);
107 eb->map_token = NULL;
108 }
109 read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
110 struct btrfs_key_ptr, key, disk_key);
111}
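
The unusual ordering in this file — define BTRFS_SETGET_FUNCS first, then pull in "ctree.h" — is a code-generation trick: the header declares the on-disk accessors through this macro, so pre-defining it here expands them all into real functions in this one compilation unit. Below is a minimal userspace sketch of the same idea; the names (struct disk_item, SETGET_FUNCS, get_item_size) and the byte-shifting helper are illustrative stand-ins, not btrfs code, and the kernel versions additionally cope with items that straddle extent_buffer page mappings.

#include <stdint.h>
#include <stdio.h>

/* a toy on-disk structure; fields are stored little-endian */
struct disk_item {
	uint32_t size;
};

/* portable little-endian load, standing in for le32_to_cpu() */
static uint32_t le32_load(const void *p)
{
	const uint8_t *b = p;
	return (uint32_t)b[0] | ((uint32_t)b[1] << 8) |
	       ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
}

/* one macro invocation stamps out one typed accessor */
#define SETGET_FUNCS(name, type, member)			\
static uint32_t get_##name(const type *s)			\
{								\
	return le32_load(&s->member);				\
}

SETGET_FUNCS(item_size, struct disk_item, size)

int main(void)
{
	struct disk_item it;
	uint8_t *raw = (uint8_t *)&it.size;

	raw[0] = 0x2a; raw[1] = raw[2] = raw[3] = 0;	/* 42, little-endian */
	printf("%u\n", get_item_size(&it));	/* prints 42 on any host */
	return 0;
}
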
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
new file mode 100644
index 000000000000..f7b3eac7ac6d
--- /dev/null
+++ b/fs/btrfs/super.c
@@ -0,0 +1,663 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/module.h>
21#include <linux/buffer_head.h>
22#include <linux/fs.h>
23#include <linux/pagemap.h>
24#include <linux/highmem.h>
25#include <linux/time.h>
26#include <linux/init.h>
27#include <linux/string.h>
28#include <linux/smp_lock.h>
29#include <linux/backing-dev.h>
30#include <linux/mount.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/parser.h>
37#include <linux/ctype.h>
38#include <linux/namei.h>
39#include <linux/miscdevice.h>
40#include "ctree.h"
41#include "disk-io.h"
42#include "transaction.h"
43#include "btrfs_inode.h"
44#include "ioctl.h"
45#include "print-tree.h"
46#include "xattr.h"
47#include "volumes.h"
48#include "version.h"
49#include "export.h"
50
51#define BTRFS_SUPER_MAGIC 0x9123683E
52
53static struct super_operations btrfs_super_ops;
54
55static void btrfs_put_super(struct super_block *sb)
56{
57 struct btrfs_root *root = btrfs_sb(sb);
58 struct btrfs_fs_info *fs = root->fs_info;
59 int ret;
60
61 ret = close_ctree(root);
62	if (ret) {
63		printk(KERN_ERR "btrfs: close_ctree returned %d\n", ret);
64	}
65 btrfs_sysfs_del_super(fs);
66 sb->s_fs_info = NULL;
67}
68
69enum {
70 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
71 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
72 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_err,
73};
74
75static match_table_t tokens = {
76 {Opt_degraded, "degraded"},
77 {Opt_subvol, "subvol=%s"},
78 {Opt_device, "device=%s"},
79 {Opt_nodatasum, "nodatasum"},
80 {Opt_nodatacow, "nodatacow"},
81 {Opt_nobarrier, "nobarrier"},
82 {Opt_max_extent, "max_extent=%s"},
83 {Opt_max_inline, "max_inline=%s"},
84 {Opt_alloc_start, "alloc_start=%s"},
85 {Opt_thread_pool, "thread_pool=%d"},
86 {Opt_ssd, "ssd"},
87 {Opt_noacl, "noacl"},
88 {Opt_err, NULL},
89};
90
91u64 btrfs_parse_size(char *str)
92{
93 u64 res;
94 int mult = 1;
95 char *end;
96 char last;
97
98 res = simple_strtoul(str, &end, 10);
99
100 last = end[0];
101 if (isalpha(last)) {
102 last = tolower(last);
103 switch (last) {
104		case 'g':
105			mult *= 1024;	/* fall through */
106		case 'm':
107			mult *= 1024;	/* fall through */
108		case 'k':
109			mult *= 1024;
110 }
111 res = res * mult;
112 }
113 return res;
114}
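
The switch above depends on deliberate case fall-through: a 'g' suffix multiplies by 1024 three times, 'm' twice, 'k' once. A self-contained userspace sketch of the same cascade (strtoull standing in for the kernel's simple_strtoul; parse_size is a hypothetical name):

#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t parse_size(const char *str)
{
	char *end;
	uint64_t res = strtoull(str, &end, 10);
	uint64_t mult = 1;

	switch (tolower((unsigned char)*end)) {
	case 'g':
		mult *= 1024;	/* fall through */
	case 'm':
		mult *= 1024;	/* fall through */
	case 'k':
		mult *= 1024;
	}
	return res * mult;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)parse_size("4k"));	/* 4096 */
	printf("%llu\n", (unsigned long long)parse_size("1g"));	/* 1073741824 */
	return 0;
}
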
115
116/*
117 * Regular mount options parser. Everything that is needed only when
118 * reading in a new superblock is parsed here.
119 */
120int btrfs_parse_options(struct btrfs_root *root, char *options)
121{
122 struct btrfs_fs_info *info = root->fs_info;
123 substring_t args[MAX_OPT_ARGS];
124 char *p, *num;
125 int intarg;
126
127 if (!options)
128 return 0;
129
130 /*
131 * strsep changes the string, duplicate it because parse_options
132 * gets called twice
133 */
134 options = kstrdup(options, GFP_NOFS);
135 if (!options)
136 return -ENOMEM;
137
138
139 while ((p = strsep(&options, ",")) != NULL) {
140 int token;
141 if (!*p)
142 continue;
143
144 token = match_token(p, tokens, args);
145 switch (token) {
146 case Opt_degraded:
147 printk(KERN_INFO "btrfs: allowing degraded mounts\n");
148 btrfs_set_opt(info->mount_opt, DEGRADED);
149 break;
150 case Opt_subvol:
151 case Opt_device:
152 /*
153 * These are parsed by btrfs_parse_early_options
154 * and can be happily ignored here.
155 */
156 break;
157 case Opt_nodatasum:
158			printk(KERN_INFO "btrfs: setting nodatasum\n");
159 btrfs_set_opt(info->mount_opt, NODATASUM);
160 break;
161 case Opt_nodatacow:
162 printk(KERN_INFO "btrfs: setting nodatacow\n");
163 btrfs_set_opt(info->mount_opt, NODATACOW);
164 btrfs_set_opt(info->mount_opt, NODATASUM);
165 break;
166 case Opt_ssd:
167 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
168 btrfs_set_opt(info->mount_opt, SSD);
169 break;
170 case Opt_nobarrier:
171 printk(KERN_INFO "btrfs: turning off barriers\n");
172 btrfs_set_opt(info->mount_opt, NOBARRIER);
173 break;
174 case Opt_thread_pool:
175 intarg = 0;
176 match_int(&args[0], &intarg);
177 if (intarg) {
178 info->thread_pool_size = intarg;
179 printk(KERN_INFO "btrfs: thread pool %d\n",
180 info->thread_pool_size);
181 }
182 break;
183 case Opt_max_extent:
184 num = match_strdup(&args[0]);
185 if (num) {
186 info->max_extent = btrfs_parse_size(num);
187 kfree(num);
188
189 info->max_extent = max_t(u64,
190 info->max_extent, root->sectorsize);
191 printk(KERN_INFO "btrfs: max_extent at %llu\n",
192 info->max_extent);
193 }
194 break;
195 case Opt_max_inline:
196 num = match_strdup(&args[0]);
197 if (num) {
198 info->max_inline = btrfs_parse_size(num);
199 kfree(num);
200
201 if (info->max_inline) {
202 info->max_inline = max_t(u64,
203 info->max_inline,
204 root->sectorsize);
205 }
206 printk(KERN_INFO "btrfs: max_inline at %llu\n",
207 info->max_inline);
208 }
209 break;
210 case Opt_alloc_start:
211 num = match_strdup(&args[0]);
212 if (num) {
213 info->alloc_start = btrfs_parse_size(num);
214 kfree(num);
215 printk(KERN_INFO
216 "btrfs: allocations start at %llu\n",
217 info->alloc_start);
218 }
219 break;
220 case Opt_noacl:
221 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
222 break;
223 default:
224 break;
225 }
226 }
227 kfree(options);
228 return 0;
229}
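
As the comment in the parser notes, strsep() writes a NUL over each delimiter it finds, which is why the options string is kstrdup()ed before tokenizing. A small userspace demonstration of the duplicate-then-tokenize pattern (the option string here is just an example):

#define _DEFAULT_SOURCE		/* for strdup()/strsep() on glibc */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char *dup = strdup("ssd,max_inline=4k,noacl");	/* private copy */
	char *cur = dup;
	char *p;

	while ((p = strsep(&cur, ",")) != NULL) {
		if (*p)			/* skip empty tokens such as ",," */
			printf("token: %s\n", p);
	}
	free(dup);			/* the copy gets chopped up; the
					 * original string stays untouched */
	return 0;
}
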
230
231/*
232 * Parse mount options that are required early in the mount process.
233 *
234 * All other options will be parsed much later in the mount process and
235 * only when we need to allocate a new super block.
236 */
237static int btrfs_parse_early_options(const char *options, int flags,
238 void *holder, char **subvol_name,
239 struct btrfs_fs_devices **fs_devices)
240{
241 substring_t args[MAX_OPT_ARGS];
242 char *opts, *p;
243 int error = 0;
244
245 if (!options)
246 goto out;
247
248 /*
249 * strsep changes the string, duplicate it because parse_options
250 * gets called twice
251 */
252 opts = kstrdup(options, GFP_KERNEL);
253 if (!opts)
254 return -ENOMEM;
255
256 while ((p = strsep(&opts, ",")) != NULL) {
257 int token;
258 if (!*p)
259 continue;
260
261 token = match_token(p, tokens, args);
262 switch (token) {
263 case Opt_subvol:
264 *subvol_name = match_strdup(&args[0]);
265 break;
266 case Opt_device:
267 error = btrfs_scan_one_device(match_strdup(&args[0]),
268 flags, holder, fs_devices);
269 if (error)
270 goto out_free_opts;
271 break;
272 default:
273 break;
274 }
275 }
276
277 out_free_opts:
278 kfree(opts);
279 out:
280 /*
281 * If no subvolume name is specified we use the default one. Allocate
282 * a copy of the string "default" here so that code later in the
283 * mount path doesn't care if it's the default volume or another one.
284 */
285 if (!*subvol_name) {
286 *subvol_name = kstrdup("default", GFP_KERNEL);
287 if (!*subvol_name)
288 return -ENOMEM;
289 }
290 return error;
291}
292
293static int btrfs_fill_super(struct super_block *sb,
294			    struct btrfs_fs_devices *fs_devices,
295			    void *data, int silent)
296{
297	struct inode *inode;
298	struct dentry *root_dentry;
299 struct btrfs_super_block *disk_super;
300 struct btrfs_root *tree_root;
301 struct btrfs_inode *bi;
302 int err;
303
304 sb->s_maxbytes = MAX_LFS_FILESIZE;
305 sb->s_magic = BTRFS_SUPER_MAGIC;
306 sb->s_op = &btrfs_super_ops;
307 sb->s_export_op = &btrfs_export_ops;
308 sb->s_xattr = btrfs_xattr_handlers;
309 sb->s_time_gran = 1;
310 sb->s_flags |= MS_POSIXACL;
311
312 tree_root = open_ctree(sb, fs_devices, (char *)data);
313
314 if (IS_ERR(tree_root)) {
315		printk(KERN_ERR "btrfs: open_ctree failed\n");
316 return PTR_ERR(tree_root);
317 }
318 sb->s_fs_info = tree_root;
319 disk_super = &tree_root->fs_info->super_copy;
320 inode = btrfs_iget_locked(sb, btrfs_super_root_dir(disk_super),
321 tree_root);
322	if (!inode) {		/* btrfs_iget_locked() can return NULL */
323		err = -ENOMEM;
324		goto fail_close;
325	}
326	bi = BTRFS_I(inode);
327	bi->location.objectid = inode->i_ino;
328	bi->location.offset = 0;
329	bi->root = tree_root;
330
331	btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
332
333 if (inode->i_state & I_NEW) {
334 btrfs_read_locked_inode(inode);
335 unlock_new_inode(inode);
336 }
337
338 root_dentry = d_alloc_root(inode);
339 if (!root_dentry) {
340 iput(inode);
341 err = -ENOMEM;
342 goto fail_close;
343 }
344
345	/* this also registers the superblock kobject in sysfs */
346 err = btrfs_sysfs_add_super(tree_root->fs_info);
347 if (err)
348 goto fail_close;
349
350 sb->s_root = root_dentry;
351
352#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
353 save_mount_options(sb, data);
354#endif
355
356 return 0;
357
358fail_close:
359 close_ctree(tree_root);
360 return err;
361}
362
363int btrfs_sync_fs(struct super_block *sb, int wait)
364{
365 struct btrfs_trans_handle *trans;
366 struct btrfs_root *root;
367 int ret;
368 root = btrfs_sb(sb);
369
370 sb->s_dirt = 0;
371 if (!wait) {
372 filemap_flush(root->fs_info->btree_inode->i_mapping);
373 return 0;
374 }
375 btrfs_clean_old_snapshots(root);
376 trans = btrfs_start_transaction(root, 1);
377 ret = btrfs_commit_transaction(trans, root);
378 sb->s_dirt = 0;
379 return ret;
380}
381
382static void btrfs_write_super(struct super_block *sb)
383{
384 sb->s_dirt = 0;
385}
386
387static int btrfs_test_super(struct super_block *s, void *data)
388{
389 struct btrfs_fs_devices *test_fs_devices = data;
390 struct btrfs_root *root = btrfs_sb(s);
391
392 return root->fs_info->fs_devices == test_fs_devices;
393}
394
395/*
396 * Find a superblock for the given device / mount point.
397 *
398 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
399 * for multiple device setup. Make sure to keep it in sync.
400 */
401static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
402 const char *dev_name, void *data, struct vfsmount *mnt)
403{
404 char *subvol_name = NULL;
405 struct block_device *bdev = NULL;
406 struct super_block *s;
407 struct dentry *root;
408 struct btrfs_fs_devices *fs_devices = NULL;
409 int error = 0;
410
411 error = btrfs_parse_early_options(data, flags, fs_type,
412 &subvol_name, &fs_devices);
413 if (error)
414 goto error;
415
416 error = btrfs_scan_one_device(dev_name, flags, fs_type, &fs_devices);
417 if (error)
418 goto error_free_subvol_name;
419
420 error = btrfs_open_devices(fs_devices, flags, fs_type);
421 if (error)
422 goto error_free_subvol_name;
423
424 bdev = fs_devices->latest_bdev;
425 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
426 if (IS_ERR(s))
427 goto error_s;
428
429 if (s->s_root) {
430 if ((flags ^ s->s_flags) & MS_RDONLY) {
431 up_write(&s->s_umount);
432 deactivate_super(s);
433 error = -EBUSY;
434 goto error_bdev;
435 }
436
437 } else {
438 char b[BDEVNAME_SIZE];
439
440 s->s_flags = flags;
441 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
442 error = btrfs_fill_super(s, fs_devices, data,
443 flags & MS_SILENT ? 1 : 0);
444 if (error) {
445 up_write(&s->s_umount);
446 deactivate_super(s);
447 goto error;
448 }
449
450 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
451 s->s_flags |= MS_ACTIVE;
452 }
453
454	if (!strcmp(subvol_name, ".")) {
455		root = dget(s->s_root);
456	} else {
457 mutex_lock(&s->s_root->d_inode->i_mutex);
458 root = lookup_one_len(subvol_name, s->s_root, strlen(subvol_name));
459 mutex_unlock(&s->s_root->d_inode->i_mutex);
460 if (IS_ERR(root)) {
461 up_write(&s->s_umount);
462 deactivate_super(s);
463 error = PTR_ERR(root);
464 goto error;
465 }
466 if (!root->d_inode) {
467 dput(root);
468 up_write(&s->s_umount);
469 deactivate_super(s);
470 error = -ENXIO;
471 goto error;
472 }
473 }
474
475 mnt->mnt_sb = s;
476 mnt->mnt_root = root;
477
478 kfree(subvol_name);
479 return 0;
480
481error_s:
482 error = PTR_ERR(s);
483error_bdev:
484 btrfs_close_devices(fs_devices);
485error_free_subvol_name:
486 kfree(subvol_name);
487error:
488 return error;
489}
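
The sget()/btrfs_test_super() pairing above is a find-or-create lookup: if a superblock whose fs_devices matches the set we just scanned is already mounted, it is reused; otherwise a fresh one is allocated and filled. A toy sketch of that contract (sget_like, struct super and the fixed-size table are hypothetical; the real sget() also handles locking and refcounts):

#include <stdio.h>

struct super { void *fs_devices; int active; };

static struct super supers[8];		/* stand-in for the global sb list */

static int test_super(struct super *s, void *data)
{
	return s->fs_devices == data;	/* same device set => same sb */
}

static struct super *sget_like(void *fs_devices,
			       int (*test)(struct super *, void *))
{
	for (int i = 0; i < 8; i++)
		if (supers[i].active && test(&supers[i], fs_devices))
			return &supers[i];	/* reuse: already mounted */
	for (int i = 0; i < 8; i++)
		if (!supers[i].active) {
			supers[i].active = 1;
			supers[i].fs_devices = fs_devices;
			return &supers[i];	/* fresh superblock */
		}
	return NULL;
}

int main(void)
{
	int devset;			/* any unique address will do */
	struct super *a = sget_like(&devset, test_super);
	struct super *b = sget_like(&devset, test_super);

	printf("%s\n", a == b ? "reused existing sb" : "allocated twice?!");
	return 0;
}
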
490
491static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
492{
493 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
494 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
495 int bits = dentry->d_sb->s_blocksize_bits;
496 __be32 *fsid = (__be32 *)root->fs_info->fsid;
497
498 buf->f_namelen = BTRFS_NAME_LEN;
499 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
500 buf->f_bfree = buf->f_blocks -
501 (btrfs_super_bytes_used(disk_super) >> bits);
502 buf->f_bavail = buf->f_bfree;
503 buf->f_bsize = dentry->d_sb->s_blocksize;
504 buf->f_type = BTRFS_SUPER_MAGIC;
505	/* We treat it as constant endianness (it doesn't matter _which_)
506	 * because we want the fsid to come out the same whether mounted
507	 * on a big-endian or little-endian host */
508 buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
509 buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
510 /* Mask in the root object ID too, to disambiguate subvols */
511 buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32;
512 buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid;
513
514 return 0;
515}
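
The fold above reads the 16-byte fsid as four fixed-endianness 32-bit words and XORs word 0 with word 2 and word 1 with word 3, so f_fsid comes out the same on little- and big-endian hosts; the subvolume's root objectid is then mixed in to disambiguate subvolumes. A userspace sketch of the word fold (be32_load and the sample bytes are illustrative):

#include <stdint.h>
#include <stdio.h>

static uint32_t be32_load(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

int main(void)
{
	uint8_t fsid[16] = {	/* example bytes; normally from the sb */
		0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
		0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10 };
	uint32_t val0 = be32_load(&fsid[0]) ^ be32_load(&fsid[8]);
	uint32_t val1 = be32_load(&fsid[4]) ^ be32_load(&fsid[12]);

	/* identical output regardless of host byte order */
	printf("f_fsid: %08x %08x\n", val0, val1);
	return 0;
}
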
516
517static struct file_system_type btrfs_fs_type = {
518 .owner = THIS_MODULE,
519 .name = "btrfs",
520 .get_sb = btrfs_get_sb,
521 .kill_sb = kill_anon_super,
522 .fs_flags = FS_REQUIRES_DEV,
523};
524
525static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
526 unsigned long arg)
527{
528 struct btrfs_ioctl_vol_args *vol;
529 struct btrfs_fs_devices *fs_devices;
530 int ret = 0;
531 int len;
532
533	vol = kmalloc(sizeof(*vol), GFP_KERNEL);
534	if (!vol)
535		return -ENOMEM;
536	if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
537		ret = -EFAULT;
538		goto out;
539	}
538 len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
539 switch (cmd) {
540 case BTRFS_IOC_SCAN_DEV:
541 ret = btrfs_scan_one_device(vol->name, MS_RDONLY,
542 &btrfs_fs_type, &fs_devices);
543 break;
544 }
545out:
546 kfree(vol);
547 return ret;
548}
549
550static void btrfs_write_super_lockfs(struct super_block *sb)
551{
552 struct btrfs_root *root = btrfs_sb(sb);
553 mutex_lock(&root->fs_info->transaction_kthread_mutex);
554 mutex_lock(&root->fs_info->cleaner_mutex);
555}
556
557static void btrfs_unlockfs(struct super_block *sb)
558{
559 struct btrfs_root *root = btrfs_sb(sb);
560 mutex_unlock(&root->fs_info->cleaner_mutex);
561 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
562}
563
564static struct super_operations btrfs_super_ops = {
565 .delete_inode = btrfs_delete_inode,
566 .put_super = btrfs_put_super,
567 .write_super = btrfs_write_super,
568 .sync_fs = btrfs_sync_fs,
569#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
570 .read_inode = btrfs_read_locked_inode,
571#else
572 .show_options = generic_show_options,
573#endif
574 .write_inode = btrfs_write_inode,
575 .dirty_inode = btrfs_dirty_inode,
576 .alloc_inode = btrfs_alloc_inode,
577 .destroy_inode = btrfs_destroy_inode,
578 .statfs = btrfs_statfs,
579 .write_super_lockfs = btrfs_write_super_lockfs,
580 .unlockfs = btrfs_unlockfs,
581};
582
583static const struct file_operations btrfs_ctl_fops = {
584 .unlocked_ioctl = btrfs_control_ioctl,
585 .compat_ioctl = btrfs_control_ioctl,
586 .owner = THIS_MODULE,
587};
588
589static struct miscdevice btrfs_misc = {
590 .minor = MISC_DYNAMIC_MINOR,
591 .name = "btrfs-control",
592 .fops = &btrfs_ctl_fops
593};
594
595static int btrfs_interface_init(void)
596{
597 return misc_register(&btrfs_misc);
598}
599
600void btrfs_interface_exit(void)
601{
602	if (misc_deregister(&btrfs_misc) < 0)
603		printk(KERN_ERR "btrfs: misc_deregister failed for control device\n");
604}
605
606static int __init init_btrfs_fs(void)
607{
608 int err;
609
610 err = btrfs_init_sysfs();
611 if (err)
612 return err;
613
614 err = btrfs_init_cachep();
615 if (err)
616 goto free_sysfs;
617
618 err = extent_io_init();
619 if (err)
620 goto free_cachep;
621
622 err = extent_map_init();
623 if (err)
624 goto free_extent_io;
625
626 err = btrfs_interface_init();
627 if (err)
628 goto free_extent_map;
629 err = register_filesystem(&btrfs_fs_type);
630 if (err)
631 goto unregister_ioctl;
632
633 printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
634 return 0;
635
636unregister_ioctl:
637 btrfs_interface_exit();
638free_extent_map:
639 extent_map_exit();
640free_extent_io:
641 extent_io_exit();
642free_cachep:
643 btrfs_destroy_cachep();
644free_sysfs:
645 btrfs_exit_sysfs();
646 return err;
647}
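
init_btrfs_fs() above is the kernel's standard goto-unwind ladder: each subsystem that initialized successfully has a matching label in the error path, and a failure jumps to the label that tears down exactly the ones already up, in reverse order. A minimal sketch of the shape (init_a/init_b and their teardown are hypothetical):

#include <stdio.h>

static int init_a(void) { puts("a up"); return 0; }
static int init_b(void) { puts("b up"); return -1; }	/* simulated failure */
static void exit_a(void) { puts("a down"); }

static int init_all(void)
{
	int err;

	err = init_a();
	if (err)
		return err;		/* nothing to unwind yet */
	err = init_b();
	if (err)
		goto free_a;		/* undo only what succeeded */
	return 0;

free_a:
	exit_a();
	return err;
}

int main(void)
{
	return init_all() ? 1 : 0;
}
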
648
649static void __exit exit_btrfs_fs(void)
650{
651 btrfs_destroy_cachep();
652 extent_map_exit();
653 extent_io_exit();
654 btrfs_interface_exit();
655 unregister_filesystem(&btrfs_fs_type);
656 btrfs_exit_sysfs();
657 btrfs_cleanup_fs_uuids();
658}
659
660module_init(init_btrfs_fs)
661module_exit(exit_btrfs_fs)
662
663MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
new file mode 100644
index 000000000000..61af5d8446e3
--- /dev/null
+++ b/fs/btrfs/sysfs.c
@@ -0,0 +1,301 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/spinlock.h>
22#include <linux/completion.h>
23#include <linux/buffer_head.h>
24#include <linux/module.h>
25#include <linux/kobject.h>
26
27#include "ctree.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
32static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
33{
34 return snprintf(buf, PAGE_SIZE, "%llu\n",
35 (unsigned long long)btrfs_root_used(&root->root_item));
36}
37
38static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
39{
40 return snprintf(buf, PAGE_SIZE, "%llu\n",
41 (unsigned long long)btrfs_root_limit(&root->root_item));
42}
43
44static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
45{
46
47 return snprintf(buf, PAGE_SIZE, "%llu\n",
48 (unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
49}
50
51static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
52{
53 return snprintf(buf, PAGE_SIZE, "%llu\n",
54 (unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
55}
56
57static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
58{
59 return snprintf(buf, PAGE_SIZE, "%llu\n",
60 (unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
61}
62
63/* this is for root attrs (subvols/snapshots) */
64struct btrfs_root_attr {
65 struct attribute attr;
66 ssize_t (*show)(struct btrfs_root *, char *);
67 ssize_t (*store)(struct btrfs_root *, const char *, size_t);
68};
69
70#define ROOT_ATTR(name, mode, show, store) \
71static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, show, store)
72
73ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL);
74ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL);
75
76static struct attribute *btrfs_root_attrs[] = {
77 &btrfs_root_attr_blocks_used.attr,
78 &btrfs_root_attr_block_limit.attr,
79 NULL,
80};
81
82/* this is for super attrs (actual full fs) */
83struct btrfs_super_attr {
84 struct attribute attr;
85 ssize_t (*show)(struct btrfs_fs_info *, char *);
86 ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
87};
88
89#define SUPER_ATTR(name, mode, show, store) \
90static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, show, store)
91
92SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL);
93SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL);
94SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL);
95
96static struct attribute *btrfs_super_attrs[] = {
97 &btrfs_super_attr_blocks_used.attr,
98 &btrfs_super_attr_total_blocks.attr,
99 &btrfs_super_attr_blocksize.attr,
100 NULL,
101};
102
103static ssize_t btrfs_super_attr_show(struct kobject *kobj,
104 struct attribute *attr, char *buf)
105{
106 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
107 super_kobj);
108 struct btrfs_super_attr *a = container_of(attr,
109 struct btrfs_super_attr,
110 attr);
111
112 return a->show ? a->show(fs, buf) : 0;
113}
114
115static ssize_t btrfs_super_attr_store(struct kobject *kobj,
116 struct attribute *attr,
117 const char *buf, size_t len)
118{
119 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
120 super_kobj);
121 struct btrfs_super_attr *a = container_of(attr,
122 struct btrfs_super_attr,
123 attr);
124
125 return a->store ? a->store(fs, buf, len) : 0;
126}
127
128static ssize_t btrfs_root_attr_show(struct kobject *kobj,
129 struct attribute *attr, char *buf)
130{
131 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
132 root_kobj);
133 struct btrfs_root_attr *a = container_of(attr,
134 struct btrfs_root_attr,
135 attr);
136
137 return a->show ? a->show(root, buf) : 0;
138}
139
140static ssize_t btrfs_root_attr_store(struct kobject *kobj,
141 struct attribute *attr,
142 const char *buf, size_t len)
143{
144 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
145 root_kobj);
146 struct btrfs_root_attr *a = container_of(attr,
147 struct btrfs_root_attr,
148 attr);
149 return a->store ? a->store(root, buf, len) : 0;
150}
151
152static void btrfs_super_release(struct kobject *kobj)
153{
154 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
155 super_kobj);
156 complete(&fs->kobj_unregister);
157}
158
159static void btrfs_root_release(struct kobject *kobj)
160{
161 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
162 root_kobj);
163 complete(&root->kobj_unregister);
164}
165
166static struct sysfs_ops btrfs_super_attr_ops = {
167 .show = btrfs_super_attr_show,
168 .store = btrfs_super_attr_store,
169};
170
171static struct sysfs_ops btrfs_root_attr_ops = {
172 .show = btrfs_root_attr_show,
173 .store = btrfs_root_attr_store,
174};
175
176static struct kobj_type btrfs_root_ktype = {
177 .default_attrs = btrfs_root_attrs,
178 .sysfs_ops = &btrfs_root_attr_ops,
179 .release = btrfs_root_release,
180};
181
182static struct kobj_type btrfs_super_ktype = {
183 .default_attrs = btrfs_super_attrs,
184 .sysfs_ops = &btrfs_super_attr_ops,
185 .release = btrfs_super_release,
186};
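
Every show/store callback above follows one dispatch pattern: container_of() maps the embedded kobject back to the btrfs_fs_info (or btrfs_root) that contains it, and maps the generic attribute to the wrapper holding the typed callback. A userspace reduction of that pattern (struct kobj and fs_info are toy stand-ins; the real container_of lives in linux/kernel.h):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct kobj { int refcount; };

struct fs_info {
	unsigned long long bytes_used;
	struct kobj super_kobj;		/* embedded, like btrfs_fs_info */
};

static void show(struct kobj *kobj)
{
	/* recover the containing object from the embedded member */
	struct fs_info *fs = container_of(kobj, struct fs_info, super_kobj);

	printf("%llu\n", fs->bytes_used);
}

int main(void)
{
	struct fs_info fs = { .bytes_used = 4096 };

	show(&fs.super_kobj);		/* prints 4096 */
	return 0;
}
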
187
188/* /sys/fs/btrfs/ entry */
189static struct kset *btrfs_kset;
190
191int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
192{
193 int error;
194 char *name;
195 char c;
196 int len = strlen(fs->sb->s_id) + 1;
197 int i;
198
199 name = kmalloc(len, GFP_NOFS);
200 if (!name) {
201 error = -ENOMEM;
202 goto fail;
203 }
204
205 for (i = 0; i < len; i++) {
206 c = fs->sb->s_id[i];
207 if (c == '/' || c == '\\')
208 c = '!';
209 name[i] = c;
210 }
211 name[len] = '\0';
212
213 fs->super_kobj.kset = btrfs_kset;
214 error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
215 NULL, "%s", name);
216 if (error)
217 goto fail;
218
219 kfree(name);
220 return 0;
221
222fail:
223 kfree(name);
224 printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
225 return error;
226}
227
228int btrfs_sysfs_add_root(struct btrfs_root *root)
229{
230 int error;
231
232 error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
233 &root->fs_info->super_kobj,
234 "%s", root->name);
235 if (error)
236 goto fail;
237
238 return 0;
239
240fail:
241 printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
242 return error;
243}
244
245void btrfs_sysfs_del_root(struct btrfs_root *root)
246{
247 kobject_put(&root->root_kobj);
248 wait_for_completion(&root->kobj_unregister);
249}
250
251void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
252{
253 kobject_put(&fs->super_kobj);
254 wait_for_completion(&fs->kobj_unregister);
255}
256
257int btrfs_init_sysfs(void)
258{
259 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
260 if (!btrfs_kset)
261 return -ENOMEM;
262 return 0;
263}
264
265void btrfs_exit_sysfs(void)
266{
267 kset_unregister(btrfs_kset);
268}
269
270#else
271
272int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
273{
274 return 0;
275}
276
277int btrfs_sysfs_add_root(struct btrfs_root *root)
278{
279 return 0;
280}
281
282void btrfs_sysfs_del_root(struct btrfs_root *root)
283{
284 return;
285}
286
287void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
288{
289 return;
290}
291
292int btrfs_init_sysfs(void)
293{
294 return 0;
295}
296
297void btrfs_exit_sysfs(void)
298{
299 return;
300}
301#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
new file mode 100644
index 000000000000..151b00d52593
--- /dev/null
+++ b/fs/btrfs/transaction.c
@@ -0,0 +1,950 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/sched.h>
21#include <linux/writeback.h>
22#include <linux/pagemap.h>
23#include "ctree.h"
24#include "disk-io.h"
25#include "transaction.h"
26#include "locking.h"
27#include "ref-cache.h"
28#include "tree-log.h"
29
30static int total_trans;
31extern struct kmem_cache *btrfs_trans_handle_cachep;
32extern struct kmem_cache *btrfs_transaction_cachep;
33
34#define BTRFS_ROOT_TRANS_TAG 0
35
36static noinline void put_transaction(struct btrfs_transaction *transaction)
37{
38 WARN_ON(transaction->use_count == 0);
39 transaction->use_count--;
40 if (transaction->use_count == 0) {
41 WARN_ON(total_trans == 0);
42 total_trans--;
43 list_del_init(&transaction->list);
44 memset(transaction, 0, sizeof(*transaction));
45 kmem_cache_free(btrfs_transaction_cachep, transaction);
46 }
47}
48
49static noinline int join_transaction(struct btrfs_root *root)
50{
51 struct btrfs_transaction *cur_trans;
52 cur_trans = root->fs_info->running_transaction;
53 if (!cur_trans) {
54 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
55 GFP_NOFS);
56 total_trans++;
57 BUG_ON(!cur_trans);
58 root->fs_info->generation++;
59 root->fs_info->last_alloc = 0;
60 root->fs_info->last_data_alloc = 0;
61 root->fs_info->last_log_alloc = 0;
62 cur_trans->num_writers = 1;
63 cur_trans->num_joined = 0;
64 cur_trans->transid = root->fs_info->generation;
65 init_waitqueue_head(&cur_trans->writer_wait);
66 init_waitqueue_head(&cur_trans->commit_wait);
67 cur_trans->in_commit = 0;
68 cur_trans->blocked = 0;
69 cur_trans->use_count = 1;
70 cur_trans->commit_done = 0;
71 cur_trans->start_time = get_seconds();
72 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
73 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
74 extent_io_tree_init(&cur_trans->dirty_pages,
75 root->fs_info->btree_inode->i_mapping,
76 GFP_NOFS);
77 spin_lock(&root->fs_info->new_trans_lock);
78 root->fs_info->running_transaction = cur_trans;
79 spin_unlock(&root->fs_info->new_trans_lock);
80 } else {
81 cur_trans->num_writers++;
82 cur_trans->num_joined++;
83 }
84
85 return 0;
86}
87
88noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
89{
90 struct btrfs_dirty_root *dirty;
91 u64 running_trans_id = root->fs_info->running_transaction->transid;
92 if (root->ref_cows && root->last_trans < running_trans_id) {
93 WARN_ON(root == root->fs_info->extent_root);
94 if (root->root_item.refs != 0) {
95 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
96 (unsigned long)root->root_key.objectid,
97 BTRFS_ROOT_TRANS_TAG);
98
99 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
100 BUG_ON(!dirty);
101 dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
102 BUG_ON(!dirty->root);
103 dirty->latest_root = root;
104 INIT_LIST_HEAD(&dirty->list);
105
106 root->commit_root = btrfs_root_node(root);
107
108 memcpy(dirty->root, root, sizeof(*root));
109 spin_lock_init(&dirty->root->node_lock);
110 spin_lock_init(&dirty->root->list_lock);
111 mutex_init(&dirty->root->objectid_mutex);
112 INIT_LIST_HEAD(&dirty->root->dead_list);
113 dirty->root->node = root->commit_root;
114 dirty->root->commit_root = NULL;
115
116 spin_lock(&root->list_lock);
117 list_add(&dirty->root->dead_list, &root->dead_list);
118 spin_unlock(&root->list_lock);
119
120 root->dirty_root = dirty;
121 } else {
122 WARN_ON(1);
123 }
124 root->last_trans = running_trans_id;
125 }
126 return 0;
127}
128
129static void wait_current_trans(struct btrfs_root *root)
130{
131 struct btrfs_transaction *cur_trans;
132
133 cur_trans = root->fs_info->running_transaction;
134 if (cur_trans && cur_trans->blocked) {
135 DEFINE_WAIT(wait);
136 cur_trans->use_count++;
137 while(1) {
138 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
139 TASK_UNINTERRUPTIBLE);
140 if (cur_trans->blocked) {
141 mutex_unlock(&root->fs_info->trans_mutex);
142 schedule();
143 mutex_lock(&root->fs_info->trans_mutex);
144 finish_wait(&root->fs_info->transaction_wait,
145 &wait);
146 } else {
147 finish_wait(&root->fs_info->transaction_wait,
148 &wait);
149 break;
150 }
151 }
152 put_transaction(cur_trans);
153 }
154}
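
wait_current_trans() above is the classic kernel sleep loop: publish the waiter with prepare_to_wait(), recheck the condition, drop the mutex around schedule(), and clean up with finish_wait(). The pthread analogue below shows the same recheck-under-lock shape; condition variables hide the prepare/finish bookkeeping (all names here are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t trans_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t transaction_wait = PTHREAD_COND_INITIALIZER;
static int blocked = 1;

static void wait_current_trans_analogue(void)
{
	pthread_mutex_lock(&trans_mutex);
	while (blocked)			/* always recheck after waking */
		pthread_cond_wait(&transaction_wait, &trans_mutex);
	pthread_mutex_unlock(&trans_mutex);
}

static void *unblocker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&trans_mutex);
	blocked = 0;			/* commit finished, let writers in */
	pthread_cond_broadcast(&transaction_wait);
	pthread_mutex_unlock(&trans_mutex);
	return NULL;
}

int main(void)				/* build with -lpthread */
{
	pthread_t t;

	pthread_create(&t, NULL, unblocker, NULL);
	wait_current_trans_analogue();
	pthread_join(t, NULL);
	puts("transaction unblocked");
	return 0;
}
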
155
156static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
157 int num_blocks, int wait)
158{
159 struct btrfs_trans_handle *h =
160 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
161 int ret;
162
163 mutex_lock(&root->fs_info->trans_mutex);
164 if (!root->fs_info->log_root_recovering &&
165 ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
166 wait_current_trans(root);
167 ret = join_transaction(root);
168 BUG_ON(ret);
169
170 btrfs_record_root_in_trans(root);
171 h->transid = root->fs_info->running_transaction->transid;
172 h->transaction = root->fs_info->running_transaction;
173 h->blocks_reserved = num_blocks;
174 h->blocks_used = 0;
175 h->block_group = NULL;
176 h->alloc_exclude_nr = 0;
177 h->alloc_exclude_start = 0;
178 root->fs_info->running_transaction->use_count++;
179 mutex_unlock(&root->fs_info->trans_mutex);
180 return h;
181}
182
183struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
184 int num_blocks)
185{
186 return start_transaction(root, num_blocks, 1);
187}
188
189struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
190						   int num_blocks)
191{
192	return start_transaction(root, num_blocks, 0);
193}
194
195struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
196							 int num_blocks)
197{
198	return start_transaction(r, num_blocks, 2);
199}
200
201static noinline int wait_for_commit(struct btrfs_root *root,
202 struct btrfs_transaction *commit)
203{
204 DEFINE_WAIT(wait);
205 mutex_lock(&root->fs_info->trans_mutex);
206 while(!commit->commit_done) {
207 prepare_to_wait(&commit->commit_wait, &wait,
208 TASK_UNINTERRUPTIBLE);
209 if (commit->commit_done)
210 break;
211 mutex_unlock(&root->fs_info->trans_mutex);
212 schedule();
213 mutex_lock(&root->fs_info->trans_mutex);
214 }
215 mutex_unlock(&root->fs_info->trans_mutex);
216 finish_wait(&commit->commit_wait, &wait);
217 return 0;
218}
219
220static void throttle_on_drops(struct btrfs_root *root)
221{
222 struct btrfs_fs_info *info = root->fs_info;
223 int harder_count = 0;
224
225harder:
226 if (atomic_read(&info->throttles)) {
227 DEFINE_WAIT(wait);
228 int thr;
229 thr = atomic_read(&info->throttle_gen);
230
231 do {
232 prepare_to_wait(&info->transaction_throttle,
233 &wait, TASK_UNINTERRUPTIBLE);
234 if (!atomic_read(&info->throttles)) {
235 finish_wait(&info->transaction_throttle, &wait);
236 break;
237 }
238 schedule();
239 finish_wait(&info->transaction_throttle, &wait);
240 } while (thr == atomic_read(&info->throttle_gen));
241 harder_count++;
242
243 if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
244 harder_count < 2)
245 goto harder;
246
247 if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
248 harder_count < 10)
249 goto harder;
250
251 if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
252 harder_count < 20)
253 goto harder;
254 }
255}
256
257void btrfs_throttle(struct btrfs_root *root)
258{
259 mutex_lock(&root->fs_info->trans_mutex);
260 if (!root->fs_info->open_ioctl_trans)
261 wait_current_trans(root);
262 mutex_unlock(&root->fs_info->trans_mutex);
263
264 throttle_on_drops(root);
265}
266
267static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
268 struct btrfs_root *root, int throttle)
269{
270 struct btrfs_transaction *cur_trans;
271 struct btrfs_fs_info *info = root->fs_info;
272
273 mutex_lock(&info->trans_mutex);
274 cur_trans = info->running_transaction;
275 WARN_ON(cur_trans != trans->transaction);
276 WARN_ON(cur_trans->num_writers < 1);
277 cur_trans->num_writers--;
278
279 if (waitqueue_active(&cur_trans->writer_wait))
280 wake_up(&cur_trans->writer_wait);
281 put_transaction(cur_trans);
282 mutex_unlock(&info->trans_mutex);
283 memset(trans, 0, sizeof(*trans));
284 kmem_cache_free(btrfs_trans_handle_cachep, trans);
285
286 if (throttle)
287 throttle_on_drops(root);
288
289 return 0;
290}
291
292int btrfs_end_transaction(struct btrfs_trans_handle *trans,
293 struct btrfs_root *root)
294{
295 return __btrfs_end_transaction(trans, root, 0);
296}
297
298int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
299 struct btrfs_root *root)
300{
301 return __btrfs_end_transaction(trans, root, 1);
302}
303
304
305int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
306 struct extent_io_tree *dirty_pages)
307{
308 int ret;
309 int err = 0;
310 int werr = 0;
311 struct page *page;
312 struct inode *btree_inode = root->fs_info->btree_inode;
313 u64 start = 0;
314 u64 end;
315 unsigned long index;
316
317 while(1) {
318 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
319 EXTENT_DIRTY);
320 if (ret)
321 break;
322 while(start <= end) {
323 cond_resched();
324
325 index = start >> PAGE_CACHE_SHIFT;
326 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
327 page = find_get_page(btree_inode->i_mapping, index);
328 if (!page)
329 continue;
330
331 btree_lock_page_hook(page);
332 if (!page->mapping) {
333 unlock_page(page);
334 page_cache_release(page);
335 continue;
336 }
337
338 if (PageWriteback(page)) {
339 if (PageDirty(page))
340 wait_on_page_writeback(page);
341 else {
342 unlock_page(page);
343 page_cache_release(page);
344 continue;
345 }
346 }
347 err = write_one_page(page, 0);
348 if (err)
349 werr = err;
350 page_cache_release(page);
351 }
352 }
353 while(1) {
354 ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
355 EXTENT_DIRTY);
356 if (ret)
357 break;
358
359 clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
360 while(start <= end) {
361 index = start >> PAGE_CACHE_SHIFT;
362 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
363 page = find_get_page(btree_inode->i_mapping, index);
364 if (!page)
365 continue;
366 if (PageDirty(page)) {
367 btree_lock_page_hook(page);
368 wait_on_page_writeback(page);
369 err = write_one_page(page, 0);
370 if (err)
371 werr = err;
372 }
373 wait_on_page_writeback(page);
374 page_cache_release(page);
375 cond_resched();
376 }
377 }
378 if (err)
379 werr = err;
380 return werr;
381}
382
383int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
384 struct btrfs_root *root)
385{
386 if (!trans || !trans->transaction) {
387 struct inode *btree_inode;
388 btree_inode = root->fs_info->btree_inode;
389 return filemap_write_and_wait(btree_inode->i_mapping);
390 }
391 return btrfs_write_and_wait_marked_extents(root,
392 &trans->transaction->dirty_pages);
393}
394
395static int update_cowonly_root(struct btrfs_trans_handle *trans,
396 struct btrfs_root *root)
397{
398 int ret;
399 u64 old_root_bytenr;
400 struct btrfs_root *tree_root = root->fs_info->tree_root;
401
402 btrfs_write_dirty_block_groups(trans, root);
403 while(1) {
404 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
405 if (old_root_bytenr == root->node->start)
406 break;
407 btrfs_set_root_bytenr(&root->root_item,
408 root->node->start);
409 btrfs_set_root_level(&root->root_item,
410 btrfs_header_level(root->node));
411 ret = btrfs_update_root(trans, tree_root,
412 &root->root_key,
413 &root->root_item);
414 BUG_ON(ret);
415 btrfs_write_dirty_block_groups(trans, root);
416 }
417 return 0;
418}
419
420int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
421 struct btrfs_root *root)
422{
423 struct btrfs_fs_info *fs_info = root->fs_info;
424 struct list_head *next;
425
426 while(!list_empty(&fs_info->dirty_cowonly_roots)) {
427 next = fs_info->dirty_cowonly_roots.next;
428 list_del_init(next);
429 root = list_entry(next, struct btrfs_root, dirty_list);
430 update_cowonly_root(trans, root);
431 }
432 return 0;
433}
434
435int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
436{
437 struct btrfs_dirty_root *dirty;
438
439 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
440 if (!dirty)
441 return -ENOMEM;
442 dirty->root = root;
443 dirty->latest_root = latest;
444
445 mutex_lock(&root->fs_info->trans_mutex);
446 list_add(&dirty->list, &latest->fs_info->dead_roots);
447 mutex_unlock(&root->fs_info->trans_mutex);
448 return 0;
449}
450
451static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
452 struct radix_tree_root *radix,
453 struct list_head *list)
454{
455 struct btrfs_dirty_root *dirty;
456 struct btrfs_root *gang[8];
457 struct btrfs_root *root;
458 int i;
459 int ret;
460 int err = 0;
461 u32 refs;
462
463 while(1) {
464 ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
465 ARRAY_SIZE(gang),
466 BTRFS_ROOT_TRANS_TAG);
467 if (ret == 0)
468 break;
469 for (i = 0; i < ret; i++) {
470 root = gang[i];
471 radix_tree_tag_clear(radix,
472 (unsigned long)root->root_key.objectid,
473 BTRFS_ROOT_TRANS_TAG);
474
475 BUG_ON(!root->ref_tree);
476 dirty = root->dirty_root;
477
478 btrfs_free_log(trans, root);
479
480 if (root->commit_root == root->node) {
481 WARN_ON(root->node->start !=
482 btrfs_root_bytenr(&root->root_item));
483
484 free_extent_buffer(root->commit_root);
485 root->commit_root = NULL;
486 root->dirty_root = NULL;
487
488 spin_lock(&root->list_lock);
489 list_del_init(&dirty->root->dead_list);
490 spin_unlock(&root->list_lock);
491
492 kfree(dirty->root);
493 kfree(dirty);
494
495 /* make sure to update the root on disk
496 * so we get any updates to the block used
497 * counts
498 */
499 err = btrfs_update_root(trans,
500 root->fs_info->tree_root,
501 &root->root_key,
502 &root->root_item);
503 continue;
504 }
505
506 memset(&root->root_item.drop_progress, 0,
507 sizeof(struct btrfs_disk_key));
508 root->root_item.drop_level = 0;
509 root->commit_root = NULL;
510 root->dirty_root = NULL;
511 root->root_key.offset = root->fs_info->generation;
512 btrfs_set_root_bytenr(&root->root_item,
513 root->node->start);
514 btrfs_set_root_level(&root->root_item,
515 btrfs_header_level(root->node));
516 err = btrfs_insert_root(trans, root->fs_info->tree_root,
517 &root->root_key,
518 &root->root_item);
519 if (err)
520 break;
521
522 refs = btrfs_root_refs(&dirty->root->root_item);
523 btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
524 err = btrfs_update_root(trans, root->fs_info->tree_root,
525 &dirty->root->root_key,
526 &dirty->root->root_item);
527
528 BUG_ON(err);
529 if (refs == 1) {
530 list_add(&dirty->list, list);
531 } else {
532 WARN_ON(1);
533 free_extent_buffer(dirty->root->node);
534 kfree(dirty->root);
535 kfree(dirty);
536 }
537 }
538 }
539 return err;
540}
541
542int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
543{
544 struct btrfs_fs_info *info = root->fs_info;
545 int ret;
546 struct btrfs_trans_handle *trans;
547 unsigned long nr;
548
549 smp_mb();
550 if (root->defrag_running)
551 return 0;
552 trans = btrfs_start_transaction(root, 1);
553 while (1) {
554 root->defrag_running = 1;
555 ret = btrfs_defrag_leaves(trans, root, cacheonly);
556 nr = trans->blocks_used;
557 btrfs_end_transaction(trans, root);
558 btrfs_btree_balance_dirty(info->tree_root, nr);
559 cond_resched();
560
561 trans = btrfs_start_transaction(root, 1);
562 if (root->fs_info->closing || ret != -EAGAIN)
563 break;
564 }
565 root->defrag_running = 0;
566 smp_mb();
567 btrfs_end_transaction(trans, root);
568 return 0;
569}
570
571static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
572 struct list_head *list)
573{
574 struct btrfs_dirty_root *dirty;
575 struct btrfs_trans_handle *trans;
576 unsigned long nr;
577 u64 num_bytes;
578 u64 bytes_used;
579 u64 max_useless;
580 int ret = 0;
581 int err;
582
583 while(!list_empty(list)) {
584 struct btrfs_root *root;
585
586 dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
587 list_del_init(&dirty->list);
588
589 num_bytes = btrfs_root_used(&dirty->root->root_item);
590 root = dirty->latest_root;
591 atomic_inc(&root->fs_info->throttles);
592
593 mutex_lock(&root->fs_info->drop_mutex);
594 while(1) {
595 trans = btrfs_start_transaction(tree_root, 1);
596 ret = btrfs_drop_snapshot(trans, dirty->root);
597 if (ret != -EAGAIN) {
598 break;
599 }
600
601 err = btrfs_update_root(trans,
602 tree_root,
603 &dirty->root->root_key,
604 &dirty->root->root_item);
605 if (err)
606 ret = err;
607 nr = trans->blocks_used;
608 ret = btrfs_end_transaction(trans, tree_root);
609 BUG_ON(ret);
610
611 mutex_unlock(&root->fs_info->drop_mutex);
612 btrfs_btree_balance_dirty(tree_root, nr);
613 cond_resched();
614 mutex_lock(&root->fs_info->drop_mutex);
615 }
616 BUG_ON(ret);
617 atomic_dec(&root->fs_info->throttles);
618 wake_up(&root->fs_info->transaction_throttle);
619
620 mutex_lock(&root->fs_info->alloc_mutex);
621 num_bytes -= btrfs_root_used(&dirty->root->root_item);
622 bytes_used = btrfs_root_used(&root->root_item);
623 if (num_bytes) {
624 btrfs_record_root_in_trans(root);
625 btrfs_set_root_used(&root->root_item,
626 bytes_used - num_bytes);
627 }
628 mutex_unlock(&root->fs_info->alloc_mutex);
629
630 ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
631 if (ret) {
632 BUG();
633 break;
634 }
635 mutex_unlock(&root->fs_info->drop_mutex);
636
637 spin_lock(&root->list_lock);
638 list_del_init(&dirty->root->dead_list);
639 if (!list_empty(&root->dead_list)) {
640 struct btrfs_root *oldest;
641 oldest = list_entry(root->dead_list.prev,
642 struct btrfs_root, dead_list);
643 max_useless = oldest->root_key.offset - 1;
644 } else {
645 max_useless = root->root_key.offset - 1;
646 }
647 spin_unlock(&root->list_lock);
648
649 nr = trans->blocks_used;
650 ret = btrfs_end_transaction(trans, tree_root);
651 BUG_ON(ret);
652
653 ret = btrfs_remove_leaf_refs(root, max_useless);
654 BUG_ON(ret);
655
656 free_extent_buffer(dirty->root->node);
657 kfree(dirty->root);
658 kfree(dirty);
659
660 btrfs_btree_balance_dirty(tree_root, nr);
661 cond_resched();
662 }
663 return ret;
664}
665
666static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
667 struct btrfs_fs_info *fs_info,
668 struct btrfs_pending_snapshot *pending)
669{
670 struct btrfs_key key;
671 struct btrfs_root_item *new_root_item;
672 struct btrfs_root *tree_root = fs_info->tree_root;
673 struct btrfs_root *root = pending->root;
674 struct extent_buffer *tmp;
675 struct extent_buffer *old;
676 int ret;
677 int namelen;
678 u64 objectid;
679
680 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
681 if (!new_root_item) {
682 ret = -ENOMEM;
683 goto fail;
684 }
685 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
686 if (ret)
687 goto fail;
688
689 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
690
691 key.objectid = objectid;
692 key.offset = 1;
693 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
694
695 old = btrfs_lock_root_node(root);
696 btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
697
698 btrfs_copy_root(trans, root, old, &tmp, objectid);
699 btrfs_tree_unlock(old);
700 free_extent_buffer(old);
701
702 btrfs_set_root_bytenr(new_root_item, tmp->start);
703 btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
704 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
705 new_root_item);
706 btrfs_tree_unlock(tmp);
707 free_extent_buffer(tmp);
708 if (ret)
709 goto fail;
710
711 /*
712 * insert the directory item
713 */
714 key.offset = (u64)-1;
715 namelen = strlen(pending->name);
716 ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
717 pending->name, namelen,
718 root->fs_info->sb->s_root->d_inode->i_ino,
719 &key, BTRFS_FT_DIR, 0);
720
721 if (ret)
722 goto fail;
723
724 ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
725				pending->name, namelen, objectid,
726 root->fs_info->sb->s_root->d_inode->i_ino, 0);
727
728 /* Invalidate existing dcache entry for new snapshot. */
729 btrfs_invalidate_dcache_root(root, pending->name, namelen);
730
731fail:
732 kfree(new_root_item);
733 return ret;
734}
735
736static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
737 struct btrfs_fs_info *fs_info)
738{
739 struct btrfs_pending_snapshot *pending;
740 struct list_head *head = &trans->transaction->pending_snapshots;
741 int ret;
742
743 while(!list_empty(head)) {
744 pending = list_entry(head->next,
745 struct btrfs_pending_snapshot, list);
746 ret = create_pending_snapshot(trans, fs_info, pending);
747 BUG_ON(ret);
748 list_del(&pending->list);
749 kfree(pending->name);
750 kfree(pending);
751 }
752 return 0;
753}
754
755int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
756 struct btrfs_root *root)
757{
758 unsigned long joined = 0;
759 unsigned long timeout = 1;
760 struct btrfs_transaction *cur_trans;
761 struct btrfs_transaction *prev_trans = NULL;
762 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
763 struct list_head dirty_fs_roots;
764 struct extent_io_tree *pinned_copy;
765 DEFINE_WAIT(wait);
766 int ret;
767
768 INIT_LIST_HEAD(&dirty_fs_roots);
769 mutex_lock(&root->fs_info->trans_mutex);
770 if (trans->transaction->in_commit) {
771 cur_trans = trans->transaction;
772 trans->transaction->use_count++;
773 mutex_unlock(&root->fs_info->trans_mutex);
774 btrfs_end_transaction(trans, root);
775
776 ret = wait_for_commit(root, cur_trans);
777 BUG_ON(ret);
778
779 mutex_lock(&root->fs_info->trans_mutex);
780 put_transaction(cur_trans);
781 mutex_unlock(&root->fs_info->trans_mutex);
782
783 return 0;
784 }
785
786 pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
787 if (!pinned_copy)
788 return -ENOMEM;
789
790 extent_io_tree_init(pinned_copy,
791 root->fs_info->btree_inode->i_mapping, GFP_NOFS);
792
793 trans->transaction->in_commit = 1;
794 trans->transaction->blocked = 1;
795 cur_trans = trans->transaction;
796 if (cur_trans->list.prev != &root->fs_info->trans_list) {
797 prev_trans = list_entry(cur_trans->list.prev,
798 struct btrfs_transaction, list);
799 if (!prev_trans->commit_done) {
800 prev_trans->use_count++;
801 mutex_unlock(&root->fs_info->trans_mutex);
802
803 wait_for_commit(root, prev_trans);
804
805 mutex_lock(&root->fs_info->trans_mutex);
806 put_transaction(prev_trans);
807 }
808 }
809
810 do {
811 int snap_pending = 0;
812 joined = cur_trans->num_joined;
813 if (!list_empty(&trans->transaction->pending_snapshots))
814 snap_pending = 1;
815
816 WARN_ON(cur_trans != trans->transaction);
817 prepare_to_wait(&cur_trans->writer_wait, &wait,
818 TASK_UNINTERRUPTIBLE);
819
820 if (cur_trans->num_writers > 1)
821 timeout = MAX_SCHEDULE_TIMEOUT;
822 else
823 timeout = 1;
824
825 mutex_unlock(&root->fs_info->trans_mutex);
826
827 if (snap_pending) {
828 ret = btrfs_wait_ordered_extents(root, 1);
829 BUG_ON(ret);
830 }
831
832 schedule_timeout(timeout);
833
834 mutex_lock(&root->fs_info->trans_mutex);
835 finish_wait(&cur_trans->writer_wait, &wait);
836 } while (cur_trans->num_writers > 1 ||
837 (cur_trans->num_joined != joined));
838
839 ret = create_pending_snapshots(trans, root->fs_info);
840 BUG_ON(ret);
841
842 WARN_ON(cur_trans != trans->transaction);
843
844 /* btrfs_commit_tree_roots is responsible for getting the
845 * various roots consistent with each other. Every pointer
846 * in the tree of tree roots has to point to the most up to date
847 * root for every subvolume and other tree. So, we have to keep
848 * the tree logging code from jumping in and changing any
849 * of the trees.
850 *
851 * At this point in the commit, there can't be any tree-log
852 * writers, but a little lower down we drop the trans mutex
853 * and let new people in. By holding the tree_log_mutex
854 * from now until after the super is written, we avoid races
855 * with the tree-log code.
856 */
857 mutex_lock(&root->fs_info->tree_log_mutex);
858
859 ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
860 &dirty_fs_roots);
861 BUG_ON(ret);
862
863	/* add_dirty_roots gets rid of all the tree log roots; it is now
864	 * safe to free the root of the tree log tree
865 */
866 btrfs_free_log_root_tree(trans, root->fs_info);
867
868 ret = btrfs_commit_tree_roots(trans, root);
869 BUG_ON(ret);
870
871 cur_trans = root->fs_info->running_transaction;
872 spin_lock(&root->fs_info->new_trans_lock);
873 root->fs_info->running_transaction = NULL;
874 spin_unlock(&root->fs_info->new_trans_lock);
875 btrfs_set_super_generation(&root->fs_info->super_copy,
876 cur_trans->transid);
877 btrfs_set_super_root(&root->fs_info->super_copy,
878 root->fs_info->tree_root->node->start);
879 btrfs_set_super_root_level(&root->fs_info->super_copy,
880 btrfs_header_level(root->fs_info->tree_root->node));
881
882 btrfs_set_super_chunk_root(&root->fs_info->super_copy,
883 chunk_root->node->start);
884 btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
885 btrfs_header_level(chunk_root->node));
886
887 if (!root->fs_info->log_root_recovering) {
888 btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
889 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
890 }
891
892 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
893 sizeof(root->fs_info->super_copy));
894
895 btrfs_copy_pinned(root, pinned_copy);
896
897 trans->transaction->blocked = 0;
898 wake_up(&root->fs_info->transaction_throttle);
899 wake_up(&root->fs_info->transaction_wait);
900
901 mutex_unlock(&root->fs_info->trans_mutex);
902 ret = btrfs_write_and_wait_transaction(trans, root);
903 BUG_ON(ret);
904 write_ctree_super(trans, root);
905
906 /*
907 * the super is written, we can safely allow the tree-loggers
908 * to go about their business
909 */
910 mutex_unlock(&root->fs_info->tree_log_mutex);
911
912 btrfs_finish_extent_commit(trans, root, pinned_copy);
913 mutex_lock(&root->fs_info->trans_mutex);
914
915 kfree(pinned_copy);
916
917 cur_trans->commit_done = 1;
918 root->fs_info->last_trans_committed = cur_trans->transid;
919 wake_up(&cur_trans->commit_wait);
920 put_transaction(cur_trans);
921 put_transaction(cur_trans);
922
923 list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
924 if (root->fs_info->closing)
925 list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
926
927 mutex_unlock(&root->fs_info->trans_mutex);
928 kmem_cache_free(btrfs_trans_handle_cachep, trans);
929
930 if (root->fs_info->closing) {
931 drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
932 }
933 return ret;
934}
935
936int btrfs_clean_old_snapshots(struct btrfs_root *root)
937{
938 struct list_head dirty_roots;
939 INIT_LIST_HEAD(&dirty_roots);
940again:
941 mutex_lock(&root->fs_info->trans_mutex);
942 list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
943 mutex_unlock(&root->fs_info->trans_mutex);
944
945 if (!list_empty(&dirty_roots)) {
946 drop_dirty_roots(root, &dirty_roots);
947 goto again;
948 }
949 return 0;
950}
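
btrfs_clean_old_snapshots() drains the dead-roots list by splicing it onto a private head under trans_mutex, doing the expensive drops without the lock, then looping in case new dead roots were queued meanwhile. A userspace sketch of that splice-and-retry shape (the singly linked node list and the mutex are illustrative stand-ins):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *shared_head;	/* stand-in for fs_info->dead_roots */

static void drain(void)
{
again:
	pthread_mutex_lock(&lock);
	struct node *priv = shared_head;	/* splice: steal the list */
	shared_head = NULL;
	pthread_mutex_unlock(&lock);

	if (priv) {
		while (priv) {			/* heavy work, lock dropped */
			struct node *n = priv;
			priv = priv->next;
			printf("dropping root %d\n", n->id);
			free(n);
		}
		goto again;			/* more may have been queued */
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));
		n->id = i;
		n->next = shared_head;
		shared_head = n;
	}
	drain();
	return 0;
}
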
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
new file mode 100644
index 000000000000..eef2cb7d7e78
--- /dev/null
+++ b/fs/btrfs/transaction.h
@@ -0,0 +1,104 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h"
22
23struct btrfs_transaction {
24 u64 transid;
25 unsigned long num_writers;
26 unsigned long num_joined;
27 int in_commit;
28 int use_count;
29 int commit_done;
30 int blocked;
31 struct list_head list;
32 struct extent_io_tree dirty_pages;
33 unsigned long start_time;
34 wait_queue_head_t writer_wait;
35 wait_queue_head_t commit_wait;
36 struct list_head pending_snapshots;
37};
38
39struct btrfs_trans_handle {
40 u64 transid;
41 unsigned long blocks_reserved;
42 unsigned long blocks_used;
43 struct btrfs_transaction *transaction;
44 struct btrfs_block_group_cache *block_group;
45 u64 alloc_exclude_start;
46 u64 alloc_exclude_nr;
47};
48
49struct btrfs_pending_snapshot {
50 struct btrfs_root *root;
51 char *name;
52 struct list_head list;
53};
54
55struct btrfs_dirty_root {
56 struct list_head list;
57 struct btrfs_root *root;
58 struct btrfs_root *latest_root;
59};
60
61static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
62 struct inode *inode)
63{
64 trans->block_group = BTRFS_I(inode)->block_group;
65}
66
67static inline void btrfs_update_inode_block_group(struct
68 btrfs_trans_handle *trans,
69 struct inode *inode)
70{
71 BTRFS_I(inode)->block_group = trans->block_group;
72}
73
74static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
75 struct inode *inode)
76{
77 BTRFS_I(inode)->last_trans = trans->transaction->transid;
78}
79
80int btrfs_end_transaction(struct btrfs_trans_handle *trans,
81 struct btrfs_root *root);
82struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
83 int num_blocks);
84struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
85 int num_blocks);
86struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
87 int num_blocks);
88int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
89 struct btrfs_root *root);
90int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
91 struct btrfs_root *root);
92
93int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest);
94int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
95int btrfs_clean_old_snapshots(struct btrfs_root *root);
96int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root);
98int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
99 struct btrfs_root *root);
100void btrfs_throttle(struct btrfs_root *root);
101int btrfs_record_root_in_trans(struct btrfs_root *root);
102int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
103 struct extent_io_tree *dirty_pages);
104#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 000000000000..b3bb5bbad76e
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,145 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "print-tree.h"
23#include "transaction.h"
24#include "locking.h"
25
26int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
27 struct btrfs_root *root, int cache_only)
28{
29 struct btrfs_path *path = NULL;
30 struct btrfs_key key;
31 int ret = 0;
32 int wret;
33 int level;
34 int orig_level;
35 int is_extent = 0;
36 int next_key_ret = 0;
37 u64 last_ret = 0;
38 u64 min_trans = 0;
39
40 if (cache_only)
41 goto out;
42
43 if (root->fs_info->extent_root == root) {
44 /*
45 * there's recursion here right now in the tree locking,
46 * we can't defrag the extent root without deadlock
47 */
48 goto out;
49 }
50
51 if (root->ref_cows == 0 && !is_extent)
52 goto out;
53
54 if (btrfs_test_opt(root, SSD))
55 goto out;
56
57 path = btrfs_alloc_path();
58 if (!path)
59 return -ENOMEM;
60
61 level = btrfs_header_level(root->node);
62 orig_level = level;
63
64 if (level == 0) {
65 goto out;
66 }
67 if (root->defrag_progress.objectid == 0) {
68 struct extent_buffer *root_node;
69 u32 nritems;
70
71 root_node = btrfs_lock_root_node(root);
72 nritems = btrfs_header_nritems(root_node);
73 root->defrag_max.objectid = 0;
74 /* from above we know this is not a leaf */
75 btrfs_node_key_to_cpu(root_node, &root->defrag_max,
76 nritems - 1);
77 btrfs_tree_unlock(root_node);
78 free_extent_buffer(root_node);
79 memset(&key, 0, sizeof(key));
80 } else {
81 memcpy(&key, &root->defrag_progress, sizeof(key));
82 }
83
84 path->keep_locks = 1;
85 if (cache_only)
86 min_trans = root->defrag_trans_start;
87
88 ret = btrfs_search_forward(root, &key, NULL, path,
89 cache_only, min_trans);
90 if (ret < 0)
91 goto out;
92 if (ret > 0) {
93 ret = 0;
94 goto out;
95 }
96 btrfs_release_path(root, path);
97 wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
98
99 if (wret < 0) {
100 ret = wret;
101 goto out;
102 }
103 if (!path->nodes[1]) {
104 ret = 0;
105 goto out;
106 }
107 path->slots[1] = btrfs_header_nritems(path->nodes[1]);
108 next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
109 min_trans);
110 ret = btrfs_realloc_node(trans, root,
111 path->nodes[1], 0,
112 cache_only, &last_ret,
113 &root->defrag_progress);
114 WARN_ON(ret && ret != -EAGAIN);
115 if (next_key_ret == 0) {
116 memcpy(&root->defrag_progress, &key, sizeof(key));
117 ret = -EAGAIN;
118 }
119
120 btrfs_release_path(root, path);
121 if (is_extent)
122 btrfs_extent_post_op(trans, root);
123out:
124 if (is_extent)
125 mutex_unlock(&root->fs_info->alloc_mutex);
126
127 if (path)
128 btrfs_free_path(path);
129 if (ret == -EAGAIN) {
130 if (root->defrag_max.objectid > root->defrag_progress.objectid)
131 goto done;
132 if (root->defrag_max.type > root->defrag_progress.type)
133 goto done;
134 if (root->defrag_max.offset > root->defrag_progress.offset)
135 goto done;
136 ret = 0;
137 }
138done:
139 if (ret != -EAGAIN) {
140 memset(&root->defrag_progress, 0,
141 sizeof(root->defrag_progress));
142 root->defrag_trans_start = trans->transid;
143 }
144 return ret;
145}
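
btrfs_defrag_leaves saves its position in root->defrag_progress and returns -EAGAIN when there is more to do; the btrfs_defrag_root declared in transaction.h is presumably the caller that keeps re-invoking it until the cursor passes defrag_max. A rough user-space sketch of that resumable-cursor protocol follows; the names are hypothetical.

    #include <errno.h>
    #include <stdio.h>

    struct cursor { unsigned long pos, max; };

    /* models btrfs_defrag_leaves: do one bounded chunk of work, record
       the position in the cursor, return -EAGAIN until it passes max */
    static int defrag_step(struct cursor *c)
    {
        if (c->pos >= c->max)
            return 0;      /* finished the range captured at the start */
        c->pos += 16;      /* defrag one chunk, remember the progress */
        return -EAGAIN;
    }

    int main(void)
    {
        struct cursor c = { .pos = 0, .max = 64 };
        int ret;

        /* the caller simply re-invokes until the cursor catches up */
        while ((ret = defrag_step(&c)) == -EAGAIN)
            printf("progress at %lu\n", c.pos);
        return ret;
    }

Because the cursor lives in the root, the work can also resume across transactions, which is why done-ness is judged against the defrag_max key snapshotted on the first pass.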
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 000000000000..88bbfd959f18
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,2892 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "transaction.h"
22#include "disk-io.h"
23#include "locking.h"
24#include "print-tree.h"
25#include "compat.h"
26
27/* magic values for the inode_only field in btrfs_log_inode:
28 *
29 * LOG_INODE_ALL means to log everything
30 * LOG_INODE_EXISTS means to log just enough to recreate the inode
31 * during log replay
32 */
33#define LOG_INODE_ALL 0
34#define LOG_INODE_EXISTS 1
35
36/*
37 * stages for the tree walking. The first
38 * stage (0) is to only pin down the blocks we find,
39 * the second stage (1) is to make sure that all the inodes
40 * we find in the log are created in the subvolume.
41 *
42 * The last stage is to deal with directories and links and extents
43 * and all the other fun semantics
44 */
45#define LOG_WALK_PIN_ONLY 0
46#define LOG_WALK_REPLAY_INODES 1
47#define LOG_WALK_REPLAY_ALL 2
48
49static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
50 struct btrfs_root *root, struct inode *inode,
51 int inode_only);
52
53/*
54 * tree logging is a special write ahead log used to make sure that
55 * fsyncs and O_SYNCs can happen without doing full tree commits.
56 *
57 * Full tree commits are expensive because they require commonly
58 * modified blocks to be recowed, creating many dirty pages in the
59 * extent tree and a 4x-6x higher write load than ext3.
60 *
61 * Instead of doing a tree commit on every fsync, we use the
62 * key ranges and transaction ids to find items for a given file or directory
63 * that have changed in this transaction. Those items are copied into
64 * a special tree (one per subvolume root), that tree is written to disk
65 * and then the fsync is considered complete.
66 *
67 * After a crash, items are copied out of the log-tree back into the
68 * subvolume tree. Any file data extents found are recorded in the extent
69 * allocation tree, and the log-tree is freed.
70 *
71 * The log tree is read three times: once to pin down all the extents it is
72 * using in ram, once to create all the inodes logged in the tree
73 * and once to do all the other items.
74 */
75
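The core trick described above, selecting only the items dirtied in the running transaction and copying them into a side tree, can be modeled in a few lines. The sketch below is illustrative user-space C, not btrfs code; transids and keys are plain integers.

    #include <stdio.h>

    struct item { unsigned long long key, transid; };

    /* copy only the items dirtied in the current transaction into the
       log, the way the fsync path copies changed items for one inode */
    static int log_changed_items(const struct item *subvol, int n,
                                 struct item *log, unsigned long long cur)
    {
        int copied = 0;
        for (int i = 0; i < n; i++)
            if (subvol[i].transid == cur)
                log[copied++] = subvol[i];
        return copied;
    }

    int main(void)
    {
        struct item subvol[] = {
            { 1, 5 }, { 2, 7 }, { 3, 7 },  /* keys 2 and 3 changed in trans 7 */
        };
        struct item log[3];
        int n = log_changed_items(subvol, 3, log, 7);

        printf("%d items would be written to the log tree\n", n);
        return 0;
    }

Writing out just that small side tree is what lets an fsync finish without recowing the commonly shared upper blocks of the subvolume tree.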
76/*
77 * btrfs_add_log_tree adds a new per-subvolume log tree into the
78 * tree of log tree roots. This must be called with a tree log transaction
79 * running (see start_log_trans).
80 */
81int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
82 struct btrfs_root *root)
83{
84 struct btrfs_key key;
85 struct btrfs_root_item root_item;
86 struct btrfs_inode_item *inode_item;
87 struct extent_buffer *leaf;
88 struct btrfs_root *new_root = root;
89 int ret;
90 u64 objectid = root->root_key.objectid;
91
92 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
93 BTRFS_TREE_LOG_OBJECTID,
94 trans->transid, 0, 0, 0);
95 if (IS_ERR(leaf)) {
96 ret = PTR_ERR(leaf);
97 return ret;
98 }
99
100 btrfs_set_header_nritems(leaf, 0);
101 btrfs_set_header_level(leaf, 0);
102 btrfs_set_header_bytenr(leaf, leaf->start);
103 btrfs_set_header_generation(leaf, trans->transid);
104 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
105
106 write_extent_buffer(leaf, root->fs_info->fsid,
107 (unsigned long)btrfs_header_fsid(leaf),
108 BTRFS_FSID_SIZE);
109 btrfs_mark_buffer_dirty(leaf);
110
111 inode_item = &root_item.inode;
112 memset(inode_item, 0, sizeof(*inode_item));
113 inode_item->generation = cpu_to_le64(1);
114 inode_item->size = cpu_to_le64(3);
115 inode_item->nlink = cpu_to_le32(1);
116 inode_item->nblocks = cpu_to_le64(1);
117 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
118
119 btrfs_set_root_bytenr(&root_item, leaf->start);
120 btrfs_set_root_level(&root_item, 0);
121 btrfs_set_root_refs(&root_item, 0);
122 btrfs_set_root_used(&root_item, 0);
123
124 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
125 root_item.drop_level = 0;
126
127 btrfs_tree_unlock(leaf);
128 free_extent_buffer(leaf);
129 leaf = NULL;
130
131 btrfs_set_root_dirid(&root_item, 0);
132
133 key.objectid = BTRFS_TREE_LOG_OBJECTID;
134 key.offset = objectid;
135 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
136 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
137 &root_item);
138 if (ret)
139 goto fail;
140
141 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
142 &key);
143 BUG_ON(!new_root);
144
145 WARN_ON(root->log_root);
146 root->log_root = new_root;
147
148 /*
149 * log trees do not get reference counted because they go away
150 * before a real commit is actually done. They do store pointers
151 * to file data extents, and those reference counts still get
152 * updated (along with back refs to the log tree).
153 */
154 new_root->ref_cows = 0;
155 new_root->last_trans = trans->transid;
156fail:
157 return ret;
158}
159
160/*
161 * start a sub transaction and set up the log tree.
162 * This increments the log tree writer count to make the people
163 * syncing the tree wait for us to finish
164 */
165static int start_log_trans(struct btrfs_trans_handle *trans,
166 struct btrfs_root *root)
167{
168 int ret;
169 mutex_lock(&root->fs_info->tree_log_mutex);
170 if (!root->fs_info->log_root_tree) {
171 ret = btrfs_init_log_root_tree(trans, root->fs_info);
172 BUG_ON(ret);
173 }
174 if (!root->log_root) {
175 ret = btrfs_add_log_tree(trans, root);
176 BUG_ON(ret);
177 }
178 atomic_inc(&root->fs_info->tree_log_writers);
179 root->fs_info->tree_log_batch++;
180 mutex_unlock(&root->fs_info->tree_log_mutex);
181 return 0;
182}
183
184/*
185 * returns 0 if there was a log transaction running and we were able
186 * to join, or returns -ENOENT if there were not transactions
187 * in progress
188 */
189static int join_running_log_trans(struct btrfs_root *root)
190{
191 int ret = -ENOENT;
192
193 smp_mb();
194 if (!root->log_root)
195 return -ENOENT;
196
197 mutex_lock(&root->fs_info->tree_log_mutex);
198 if (root->log_root) {
199 ret = 0;
200 atomic_inc(&root->fs_info->tree_log_writers);
201 root->fs_info->tree_log_batch++;
202 }
203 mutex_unlock(&root->fs_info->tree_log_mutex);
204 return ret;
205}
206
207/*
208 * indicate we're done making changes to the log tree
209 * and wake up anyone waiting to do a sync
210 */
211static int end_log_trans(struct btrfs_root *root)
212{
213 atomic_dec(&root->fs_info->tree_log_writers);
214 smp_mb();
215 if (waitqueue_active(&root->fs_info->tree_log_wait))
216 wake_up(&root->fs_info->tree_log_wait);
217 return 0;
218}
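
start_log_trans and join_running_log_trans bump tree_log_writers, end_log_trans drops it, and whoever flushes the log must wait for the count to hit zero before writing it out. A single-threaded user-space sketch of that handshake is below; the names are hypothetical, and the real code sleeps on tree_log_wait rather than spinning.

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int log_writers;   /* models fs_info->tree_log_writers */

    static void start_log_work(void) { atomic_fetch_add(&log_writers, 1); }
    static void end_log_work(void)   { atomic_fetch_sub(&log_writers, 1); }

    /* the syncer must not write the log while writers are still adding */
    static void sync_log(void)
    {
        while (atomic_load(&log_writers) > 0)
            ;   /* the kernel blocks on a waitqueue instead of spinning */
        printf("log quiesced, safe to write it out\n");
    }

    int main(void)
    {
        start_log_work();
        /* ... modify the log tree ... */
        end_log_work();
        sync_log();
        return 0;
    }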
219
220
221/*
222 * the walk control struct is used to pass state down the chain when
223 * processing the log tree. The stage field tells us which part
224 * of the log tree processing we are currently doing. The others
225 * are state fields used for that specific part
226 */
227struct walk_control {
228 /* should we free the extent on disk when done? This is used
229 * at transaction commit time while freeing a log tree
230 */
231 int free;
232
233 /* should we write out the extent buffer? This is used
234 * while flushing the log tree to disk during a sync
235 */
236 int write;
237
238 /* should we wait for the extent buffer io to finish? Also used
239 * while flushing the log tree to disk for a sync
240 */
241 int wait;
242
243 /* pin only walk, we record which extents on disk belong to the
244 * log trees
245 */
246 int pin;
247
248 /* what stage of the replay code we're currently in */
249 int stage;
250
251 /* the root we are currently replaying */
252 struct btrfs_root *replay_dest;
253
254 /* the trans handle for the current replay */
255 struct btrfs_trans_handle *trans;
256
257 /* the function that gets used to process blocks we find in the
258 * tree. Note the extent_buffer might not be up to date when it is
259 * passed in, and it must be checked or read if you need the data
260 * inside it
261 */
262 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
263 struct walk_control *wc, u64 gen);
264};
265
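The walk_control above parameterizes one generic log-tree walk so the same traversal can pin extents, write and wait on buffers at sync time, or free blocks at commit. A toy model of that flags-plus-callback pattern, with hypothetical names:

    #include <stdio.h>

    struct walk_ctl {
        int pin, write, wait;
        void (*process)(int block, struct walk_ctl *wc);
    };

    static void process_block(int block, struct walk_ctl *wc)
    {
        if (wc->pin)
            printf("pin block %d\n", block);
        if (wc->write)
            printf("write block %d\n", block);
        if (wc->wait)
            printf("wait on block %d\n", block);
    }

    static void walk(struct walk_ctl *wc)
    {
        for (int b = 0; b < 3; b++)   /* stand-in for the tree walk */
            wc->process(b, wc);
    }

    int main(void)
    {
        /* same walker, two jobs: sync flushes, replay-time pinning */
        struct walk_ctl sync_pass = { .write = 1, .wait = 1,
                                      .process = process_block };
        struct walk_ctl pin_pass  = { .pin = 1, .process = process_block };

        walk(&sync_pass);
        walk(&pin_pass);
        return 0;
    }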
266/*
267 * process_func used to pin down extents, write them or wait on them
268 */
269static int process_one_buffer(struct btrfs_root *log,
270 struct extent_buffer *eb,
271 struct walk_control *wc, u64 gen)
272{
273 if (wc->pin) {
274 mutex_lock(&log->fs_info->alloc_mutex);
275 btrfs_update_pinned_extents(log->fs_info->extent_root,
276 eb->start, eb->len, 1);
277 mutex_unlock(&log->fs_info->alloc_mutex);
278 }
279
280 if (btrfs_buffer_uptodate(eb, gen)) {
281 if (wc->write)
282 btrfs_write_tree_block(eb);
283 if (wc->wait)
284 btrfs_wait_tree_block_writeback(eb);
285 }
286 return 0;
287}
288
289/*
290 * Item overwrite used by replay and tree logging. eb, slot and key all refer
291 * to the src data we are copying out.
292 *
293 * root is the tree we are copying into, and path is a scratch
294 * path for use in this function (it should be released on entry and
295 * will be released on exit).
296 *
297 * If the key is already in the destination tree the existing item is
298 * overwritten. If the existing item isn't big enough, it is extended.
299 * If it is too large, it is truncated.
300 *
301 * If the key isn't in the destination yet, a new item is inserted.
302 */
303static noinline int overwrite_item(struct btrfs_trans_handle *trans,
304 struct btrfs_root *root,
305 struct btrfs_path *path,
306 struct extent_buffer *eb, int slot,
307 struct btrfs_key *key)
308{
309 int ret;
310 u32 item_size;
311 u64 saved_i_size = 0;
312 int save_old_i_size = 0;
313 unsigned long src_ptr;
314 unsigned long dst_ptr;
315 int overwrite_root = 0;
316
317 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
318 overwrite_root = 1;
319
320 item_size = btrfs_item_size_nr(eb, slot);
321 src_ptr = btrfs_item_ptr_offset(eb, slot);
322
323 /* look for the key in the destination tree */
324 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
325 if (ret == 0) {
326 char *src_copy;
327 char *dst_copy;
328 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
329 path->slots[0]);
330 if (dst_size != item_size)
331 goto insert;
332
333 if (item_size == 0) {
334 btrfs_release_path(root, path);
335 return 0;
336 }
337 dst_copy = kmalloc(item_size, GFP_NOFS);
338 src_copy = kmalloc(item_size, GFP_NOFS);
339
340 read_extent_buffer(eb, src_copy, src_ptr, item_size);
341
342 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
343 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
344 item_size);
345 ret = memcmp(dst_copy, src_copy, item_size);
346
347 kfree(dst_copy);
348 kfree(src_copy);
349 /*
350 * they have the same contents, just return, this saves
351 * us from cowing blocks in the destination tree and doing
352 * extra writes that may not have been done by a previous
353 * sync
354 */
355 if (ret == 0) {
356 btrfs_release_path(root, path);
357 return 0;
358 }
359
360 }
361insert:
362 btrfs_release_path(root, path);
363 /* try to insert the key into the destination tree */
364 ret = btrfs_insert_empty_item(trans, root, path,
365 key, item_size);
366
367 /* make sure any existing item is the correct size */
368 if (ret == -EEXIST) {
369 u32 found_size;
370 found_size = btrfs_item_size_nr(path->nodes[0],
371 path->slots[0]);
372 if (found_size > item_size) {
373 btrfs_truncate_item(trans, root, path, item_size, 1);
374 } else if (found_size < item_size) {
375 ret = btrfs_del_item(trans, root,
376 path);
377 BUG_ON(ret);
378
379 btrfs_release_path(root, path);
380 ret = btrfs_insert_empty_item(trans,
381 root, path, key, item_size);
382 BUG_ON(ret);
383 }
384 } else if (ret) {
385 BUG();
386 }
387 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
388 path->slots[0]);
389
390 /* don't overwrite an existing inode if the generation number
391 * was logged as zero. This is done when the tree logging code
392 * is just logging an inode to make sure it exists after recovery.
393 *
394 * Also, don't overwrite i_size on directories during replay.
395 * log replay inserts and removes directory items based on the
396 * state of the tree found in the subvolume, and i_size is modified
397 * as it goes
398 */
399 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
400 struct btrfs_inode_item *src_item;
401 struct btrfs_inode_item *dst_item;
402
403 src_item = (struct btrfs_inode_item *)src_ptr;
404 dst_item = (struct btrfs_inode_item *)dst_ptr;
405
406 if (btrfs_inode_generation(eb, src_item) == 0)
407 goto no_copy;
408
409 if (overwrite_root &&
410 S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
411 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
412 save_old_i_size = 1;
413 saved_i_size = btrfs_inode_size(path->nodes[0],
414 dst_item);
415 }
416 }
417
418 copy_extent_buffer(path->nodes[0], eb, dst_ptr,
419 src_ptr, item_size);
420
421 if (save_old_i_size) {
422 struct btrfs_inode_item *dst_item;
423 dst_item = (struct btrfs_inode_item *)dst_ptr;
424 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
425 }
426
427 /* make sure the generation is filled in */
428 if (key->type == BTRFS_INODE_ITEM_KEY) {
429 struct btrfs_inode_item *dst_item;
430 dst_item = (struct btrfs_inode_item *)dst_ptr;
431 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
432 btrfs_set_inode_generation(path->nodes[0], dst_item,
433 trans->transid);
434 }
435 }
436
437 if (overwrite_root &&
438 key->type == BTRFS_EXTENT_DATA_KEY) {
439 int extent_type;
440 struct btrfs_file_extent_item *fi;
441
442 fi = (struct btrfs_file_extent_item *)dst_ptr;
443 extent_type = btrfs_file_extent_type(path->nodes[0], fi);
444 if (extent_type == BTRFS_FILE_EXTENT_REG) {
445 struct btrfs_key ins;
446 ins.objectid = btrfs_file_extent_disk_bytenr(
447 path->nodes[0], fi);
448 ins.offset = btrfs_file_extent_disk_num_bytes(
449 path->nodes[0], fi);
450 ins.type = BTRFS_EXTENT_ITEM_KEY;
451
452 /*
453 * is this extent already allocated in the extent
454 * allocation tree? If so, just add a reference
455 */
456 ret = btrfs_lookup_extent(root, ins.objectid,
457 ins.offset);
458 if (ret == 0) {
459 ret = btrfs_inc_extent_ref(trans, root,
460 ins.objectid, ins.offset,
461 path->nodes[0]->start,
462 root->root_key.objectid,
463 trans->transid,
464 key->objectid, key->offset);
465 } else {
466 /*
467 * insert the extent pointer in the extent
468 * allocation tree
469 */
470 ret = btrfs_alloc_logged_extent(trans, root,
471 path->nodes[0]->start,
472 root->root_key.objectid,
473 trans->transid, key->objectid,
474 key->offset, &ins);
475 BUG_ON(ret);
476 }
477 }
478 }
479no_copy:
480 btrfs_mark_buffer_dirty(path->nodes[0]);
481 btrfs_release_path(root, path);
482 return 0;
483}
484
485/*
486 * simple helper to read an inode off the disk from a given root.
487 * This can only be called for subvolume roots and not for the log
488 */
489static noinline struct inode *read_one_inode(struct btrfs_root *root,
490 u64 objectid)
491{
492 struct inode *inode;
493 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
494 if (inode->i_state & I_NEW) {
495 BTRFS_I(inode)->root = root;
496 BTRFS_I(inode)->location.objectid = objectid;
497 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
498 BTRFS_I(inode)->location.offset = 0;
499 btrfs_read_locked_inode(inode);
500 unlock_new_inode(inode);
501
502 }
503 if (is_bad_inode(inode)) {
504 iput(inode);
505 inode = NULL;
506 }
507 return inode;
508}
509
510/* replays a single extent in 'eb' at 'slot' with 'key' into the
511 * subvolume 'root'. path is released on entry and should be released
512 * on exit.
513 *
514 * extents in the log tree have not been allocated out of the extent
515 * tree yet. So, this completes the allocation, taking a reference
516 * as required if the extent already exists or creating a new extent
517 * if it isn't in the extent allocation tree yet.
518 *
519 * The extent is inserted into the file, dropping any existing extents
520 * from the file that overlap the new one.
521 */
522static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
523 struct btrfs_root *root,
524 struct btrfs_path *path,
525 struct extent_buffer *eb, int slot,
526 struct btrfs_key *key)
527{
528 int found_type;
529 u64 mask = root->sectorsize - 1;
530 u64 extent_end;
531 u64 alloc_hint;
532 u64 start = key->offset;
533 struct btrfs_file_extent_item *item;
534 struct inode *inode = NULL;
535 unsigned long size;
536 int ret = 0;
537
538 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
539 found_type = btrfs_file_extent_type(eb, item);
540
541 if (found_type == BTRFS_FILE_EXTENT_REG)
542 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
543 else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
544 size = btrfs_file_extent_inline_len(eb,
545 btrfs_item_nr(eb, slot));
546 extent_end = (start + size + mask) & ~mask;
547 } else {
548 ret = 0;
549 goto out;
550 }
551
552 inode = read_one_inode(root, key->objectid);
553 if (!inode) {
554 ret = -EIO;
555 goto out;
556 }
557
558 /*
559 * first check to see if we already have this extent in the
560 * file. This must be done before the btrfs_drop_extents run
561 * so we don't try to drop this extent.
562 */
563 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
564 start, 0);
565
566 if (ret == 0 && found_type == BTRFS_FILE_EXTENT_REG) {
567 struct btrfs_file_extent_item cmp1;
568 struct btrfs_file_extent_item cmp2;
569 struct btrfs_file_extent_item *existing;
570 struct extent_buffer *leaf;
571
572 leaf = path->nodes[0];
573 existing = btrfs_item_ptr(leaf, path->slots[0],
574 struct btrfs_file_extent_item);
575
576 read_extent_buffer(eb, &cmp1, (unsigned long)item,
577 sizeof(cmp1));
578 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
579 sizeof(cmp2));
580
581 /*
582 * we already have a pointer to this exact extent,
583 * we don't have to do anything
584 */
585 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
586 btrfs_release_path(root, path);
587 goto out;
588 }
589 }
590 btrfs_release_path(root, path);
591
592 /* drop any overlapping extents */
593 ret = btrfs_drop_extents(trans, root, inode,
594 start, extent_end, start, &alloc_hint);
595 BUG_ON(ret);
596
597 /* insert the extent */
598 ret = overwrite_item(trans, root, path, eb, slot, key);
599 BUG_ON(ret);
600
601 /* btrfs_drop_extents changes i_blocks, update it here */
602 inode->i_blocks += (extent_end - start) >> 9;
603 btrfs_update_inode(trans, root, inode);
604out:
605 if (inode)
606 iput(inode);
607 return ret;
608}
609
610/*
611 * when cleaning up conflicts between the directory names in the
612 * subvolume, directory names in the log and directory names in the
613 * inode back references, we may have to unlink inodes from directories.
614 *
615 * This is a helper function to do the unlink of a specific directory
616 * item
617 */
618static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
619 struct btrfs_root *root,
620 struct btrfs_path *path,
621 struct inode *dir,
622 struct btrfs_dir_item *di)
623{
624 struct inode *inode;
625 char *name;
626 int name_len;
627 struct extent_buffer *leaf;
628 struct btrfs_key location;
629 int ret;
630
631 leaf = path->nodes[0];
632
633 btrfs_dir_item_key_to_cpu(leaf, di, &location);
634 name_len = btrfs_dir_name_len(leaf, di);
635 name = kmalloc(name_len, GFP_NOFS);
636 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
637 btrfs_release_path(root, path);
638
639 inode = read_one_inode(root, location.objectid);
640 BUG_ON(!inode);
641
642 btrfs_inc_nlink(inode);
643 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
644 kfree(name);
645
646 iput(inode);
647 return ret;
648}
649
650/*
651 * helper function to see if a given name and sequence number found
652 * in an inode back reference are already in a directory and correctly
653 * point to this inode
654 */
655static noinline int inode_in_dir(struct btrfs_root *root,
656 struct btrfs_path *path,
657 u64 dirid, u64 objectid, u64 index,
658 const char *name, int name_len)
659{
660 struct btrfs_dir_item *di;
661 struct btrfs_key location;
662 int match = 0;
663
664 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
665 index, name, name_len, 0);
666 if (di && !IS_ERR(di)) {
667 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
668 if (location.objectid != objectid)
669 goto out;
670 } else
671 goto out;
672 btrfs_release_path(root, path);
673
674 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
675 if (di && !IS_ERR(di)) {
676 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
677 if (location.objectid != objectid)
678 goto out;
679 } else
680 goto out;
681 match = 1;
682out:
683 btrfs_release_path(root, path);
684 return match;
685}
686
687/*
688 * helper function to check a log tree for a named back reference in
689 * an inode. This is used to decide if a back reference that is
690 * found in the subvolume conflicts with what we find in the log.
691 *
692 * inode backreferences may have multiple refs in a single item;
693 * during replay we process one reference at a time, and we don't
694 * want to delete valid links to a file from the subvolume if that
695 * link is also in the log.
696 */
697static noinline int backref_in_log(struct btrfs_root *log,
698 struct btrfs_key *key,
699 char *name, int namelen)
700{
701 struct btrfs_path *path;
702 struct btrfs_inode_ref *ref;
703 unsigned long ptr;
704 unsigned long ptr_end;
705 unsigned long name_ptr;
706 int found_name_len;
707 int item_size;
708 int ret;
709 int match = 0;
710
711 path = btrfs_alloc_path();
712 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
713 if (ret != 0)
714 goto out;
715
716 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
717 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
718 ptr_end = ptr + item_size;
719 while (ptr < ptr_end) {
720 ref = (struct btrfs_inode_ref *)ptr;
721 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
722 if (found_name_len == namelen) {
723 name_ptr = (unsigned long)(ref + 1);
724 ret = memcmp_extent_buffer(path->nodes[0], name,
725 name_ptr, namelen);
726 if (ret == 0) {
727 match = 1;
728 goto out;
729 }
730 }
731 ptr = (unsigned long)(ref + 1) + found_name_len;
732 }
733out:
734 btrfs_free_path(path);
735 return match;
736}
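
backref_in_log, and several replay helpers below, iterate a single tree item that packs multiple variable-length references end to end: read a fixed-size header, use its length field to step over the name, repeat until the item is exhausted. Here is a self-contained model of that traversal with a simplified two-field layout (the real btrfs_inode_ref also carries a 64-bit index):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* one packed entry: a 2-byte name length followed by the name bytes */
    static void walk_refs(const uint8_t *item, size_t item_size)
    {
        const uint8_t *ptr = item, *end = item + item_size;

        while (ptr < end) {
            uint16_t name_len;

            memcpy(&name_len, ptr, sizeof(name_len));
            ptr += sizeof(name_len);
            printf("back ref name: %.*s\n", name_len, (const char *)ptr);
            ptr += name_len;   /* step over the variable-length part */
        }
    }

    int main(void)
    {
        uint8_t item[16];
        size_t off = 0;
        uint16_t len;

        /* two names packed into one item: "a" and "dir" */
        len = 1; memcpy(item + off, &len, 2); off += 2;
        memcpy(item + off, "a", 1); off += 1;
        len = 3; memcpy(item + off, &len, 2); off += 2;
        memcpy(item + off, "dir", 3); off += 3;

        walk_refs(item, off);
        return 0;
    }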
737
738
739/*
740 * replay one inode back reference item found in the log tree.
741 * eb, slot and key refer to the buffer and key found in the log tree.
742 * root is the destination we are replaying into, and path is for temp
743 * use by this function. (it should be released on return).
744 */
745static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
746 struct btrfs_root *root,
747 struct btrfs_root *log,
748 struct btrfs_path *path,
749 struct extent_buffer *eb, int slot,
750 struct btrfs_key *key)
751{
752 struct inode *dir;
753 int ret;
754 struct btrfs_key location;
755 struct btrfs_inode_ref *ref;
756 struct btrfs_dir_item *di;
757 struct inode *inode;
758 char *name;
759 int namelen;
760 unsigned long ref_ptr;
761 unsigned long ref_end;
762
763 location.objectid = key->objectid;
764 location.type = BTRFS_INODE_ITEM_KEY;
765 location.offset = 0;
766
767 /*
768 * it is possible that we didn't log all the parent directories
769 * for a given inode. If we don't find the dir, just don't
770 * copy the back ref in. The link count fixup code will take
771 * care of the rest
772 */
773 dir = read_one_inode(root, key->offset);
774 if (!dir)
775 return -ENOENT;
776
777 inode = read_one_inode(root, key->objectid);
778 BUG_ON(!inode);
779
780 ref_ptr = btrfs_item_ptr_offset(eb, slot);
781 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
782
783again:
784 ref = (struct btrfs_inode_ref *)ref_ptr;
785
786 namelen = btrfs_inode_ref_name_len(eb, ref);
787 name = kmalloc(namelen, GFP_NOFS);
788 BUG_ON(!name);
789
790 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
791
792 /* if we already have a perfect match, we're done */
793 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
794 btrfs_inode_ref_index(eb, ref),
795 name, namelen)) {
796 goto out;
797 }
798
799 /*
800 * look for a conflicting back reference in the metadata.
801 * if we find one we have to unlink that name of the file
802 * before we add our new link. Later on, we overwrite any
803 * existing back reference, and we don't want to create
804 * dangling pointers in the directory.
805 */
806conflict_again:
807 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
808 if (ret == 0) {
809 char *victim_name;
810 int victim_name_len;
811 struct btrfs_inode_ref *victim_ref;
812 unsigned long ptr;
813 unsigned long ptr_end;
814 struct extent_buffer *leaf = path->nodes[0];
815
816 /* are we trying to overwrite a back ref for the root directory?
817 * If so, just jump out, we're done
818 */
819 if (key->objectid == key->offset)
820 goto out_nowrite;
821
822 /* check all the names in this back reference to see
823 * if they are in the log. if so, we allow them to stay
824 * otherwise they must be unlinked as a conflict
825 */
826 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
827 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
828 while (ptr < ptr_end) {
829 victim_ref = (struct btrfs_inode_ref *)ptr;
830 victim_name_len = btrfs_inode_ref_name_len(leaf,
831 victim_ref);
832 victim_name = kmalloc(victim_name_len, GFP_NOFS);
833 BUG_ON(!victim_name);
834
835 read_extent_buffer(leaf, victim_name,
836 (unsigned long)(victim_ref + 1),
837 victim_name_len);
838
839 if (!backref_in_log(log, key, victim_name,
840 victim_name_len)) {
841 btrfs_inc_nlink(inode);
842 btrfs_release_path(root, path);
843 ret = btrfs_unlink_inode(trans, root, dir,
844 inode, victim_name,
845 victim_name_len);
846 kfree(victim_name);
847 btrfs_release_path(root, path);
848 goto conflict_again;
849 }
850 kfree(victim_name);
851 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
852 }
853 BUG_ON(ret);
854 }
855 btrfs_release_path(root, path);
856
857 /* look for a conflicting sequence number */
858 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
859 btrfs_inode_ref_index(eb, ref),
860 name, namelen, 0);
861 if (di && !IS_ERR(di)) {
862 ret = drop_one_dir_item(trans, root, path, dir, di);
863 BUG_ON(ret);
864 }
865 btrfs_release_path(root, path);
866
867
868 /* look for a conflicting name */
869 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
870 name, namelen, 0);
871 if (di && !IS_ERR(di)) {
872 ret = drop_one_dir_item(trans, root, path, dir, di);
873 BUG_ON(ret);
874 }
875 btrfs_release_path(root, path);
876
877 /* insert our name */
878 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
879 btrfs_inode_ref_index(eb, ref));
880 BUG_ON(ret);
881
882 btrfs_update_inode(trans, root, inode);
883
884out:
885 ref_ptr = (unsigned long)(ref + 1) + namelen;
886 kfree(name);
887 if (ref_ptr < ref_end)
888 goto again;
889
890 /* finally write the back reference in the inode */
891 ret = overwrite_item(trans, root, path, eb, slot, key);
892 BUG_ON(ret);
893
894out_nowrite:
895 btrfs_release_path(root, path);
896 iput(dir);
897 iput(inode);
898 return 0;
899}
900
901/*
902 * replay one csum item from the log tree into the subvolume 'root'
903 * eb, slot and key all refer to the log tree
904 * path is for temp use by this function and should be released on return
905 *
906 * This copies the checksums out of the log tree and inserts them into
907 * the subvolume. Any existing checksums for this range in the file
908 * are overwritten, and new items are added where required.
909 *
910 * We keep this simple by reusing the btrfs_ordered_sum code from
911 * the data=ordered mode. This basically means making a copy
912 * of all the checksums in ram, which we have to do anyway for kmap
913 * rules.
914 *
915 * The copy is then sent down to btrfs_csum_file_blocks, which
916 * does all the hard work of finding existing items in the file
917 * or adding new ones.
918 */
919static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
920 struct btrfs_root *root,
921 struct btrfs_path *path,
922 struct extent_buffer *eb, int slot,
923 struct btrfs_key *key)
924{
925 int ret;
926 u32 item_size = btrfs_item_size_nr(eb, slot);
927 u64 cur_offset;
928 unsigned long file_bytes;
929 struct btrfs_ordered_sum *sums;
930 struct btrfs_sector_sum *sector_sum;
931 struct inode *inode;
932 unsigned long ptr;
933
934 file_bytes = (item_size / BTRFS_CRC32_SIZE) * root->sectorsize;
935 inode = read_one_inode(root, key->objectid);
936 if (!inode) {
937 return -EIO;
938 }
939
940 sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
941 if (!sums) {
942 iput(inode);
943 return -ENOMEM;
944 }
945
946 INIT_LIST_HEAD(&sums->list);
947 sums->len = file_bytes;
948 sums->file_offset = key->offset;
949
950 /*
951 * copy all the sums into the ordered sum struct
952 */
953 sector_sum = sums->sums;
954 cur_offset = key->offset;
955 ptr = btrfs_item_ptr_offset(eb, slot);
956 while (item_size > 0) {
957 sector_sum->offset = cur_offset;
958 read_extent_buffer(eb, &sector_sum->sum, ptr, BTRFS_CRC32_SIZE);
959 sector_sum++;
960 item_size -= BTRFS_CRC32_SIZE;
961 ptr += BTRFS_CRC32_SIZE;
962 cur_offset += root->sectorsize;
963 }
964
965 /* let btrfs_csum_file_blocks add them into the file */
966 ret = btrfs_csum_file_blocks(trans, root, inode, sums);
967 BUG_ON(ret);
968 kfree(sums);
969 iput(inode);
970
971 return 0;
972}
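A csum item is just N crc32s packed back to back, one per sector, with the item key's offset naming the file position of the first sector. A small sketch of the expansion replay_one_csum performs (hypothetical names, fixed 4K sectors):

    #include <stdint.h>
    #include <stdio.h>

    #define CRC_SIZE   4       /* models BTRFS_CRC32_SIZE */
    #define SECTORSIZE 4096

    /* expand a packed csum item into (file offset, checksum) pairs */
    static void expand_csums(uint64_t file_offset, const uint32_t *sums,
                             size_t item_size)
    {
        uint64_t off = file_offset;

        while (item_size > 0) {
            printf("offset %llu csum 0x%08x\n",
                   (unsigned long long)off, (unsigned)*sums++);
            item_size -= CRC_SIZE;
            off += SECTORSIZE;
        }
    }

    int main(void)
    {
        uint32_t sums[] = { 0xdeadbeef, 0xcafef00d }; /* two sectors' worth */

        expand_csums(65536, sums, sizeof(sums));
        return 0;
    }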
973/*
974 * There are a few corners where the link count of the file can't
975 * be properly maintained during replay. So, instead of adding
976 * lots of complexity to the log code, we just scan the backrefs
977 * for any file that has been through replay.
978 *
979 * The scan will update the link count on the inode to reflect the
980 * number of back refs found. If it goes down to zero, the iput
981 * will free the inode.
982 */
983static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
984 struct btrfs_root *root,
985 struct inode *inode)
986{
987 struct btrfs_path *path;
988 int ret;
989 struct btrfs_key key;
990 u64 nlink = 0;
991 unsigned long ptr;
992 unsigned long ptr_end;
993 int name_len;
994
995 key.objectid = inode->i_ino;
996 key.type = BTRFS_INODE_REF_KEY;
997 key.offset = (u64)-1;
998
999 path = btrfs_alloc_path();
1000
1001 while (1) {
1002 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1003 if (ret < 0)
1004 break;
1005 if (ret > 0) {
1006 if (path->slots[0] == 0)
1007 break;
1008 path->slots[0]--;
1009 }
1010 btrfs_item_key_to_cpu(path->nodes[0], &key,
1011 path->slots[0]);
1012 if (key.objectid != inode->i_ino ||
1013 key.type != BTRFS_INODE_REF_KEY)
1014 break;
1015 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1016 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1017 path->slots[0]);
1018 while (ptr < ptr_end) {
1019 struct btrfs_inode_ref *ref;
1020
1021 ref = (struct btrfs_inode_ref *)ptr;
1022 name_len = btrfs_inode_ref_name_len(path->nodes[0],
1023 ref);
1024 ptr = (unsigned long)(ref + 1) + name_len;
1025 nlink++;
1026 }
1027
1028 if (key.offset == 0)
1029 break;
1030 key.offset--;
1031 btrfs_release_path(root, path);
1032 }
1033 btrfs_free_path(path);
1034 if (nlink != inode->i_nlink) {
1035 inode->i_nlink = nlink;
1036 btrfs_update_inode(trans, root, inode);
1037 }
1038 BTRFS_I(inode)->index_cnt = (u64)-1;
1039
1040 return 0;
1041}
1042
1043static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1044 struct btrfs_root *root,
1045 struct btrfs_path *path)
1046{
1047 int ret;
1048 struct btrfs_key key;
1049 struct inode *inode;
1050
1051 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1052 key.type = BTRFS_ORPHAN_ITEM_KEY;
1053 key.offset = (u64)-1;
1054 while (1) {
1055 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1056 if (ret < 0)
1057 break;
1058
1059 if (ret == 1) {
1060 if (path->slots[0] == 0)
1061 break;
1062 path->slots[0]--;
1063 }
1064
1065 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1066 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1067 key.type != BTRFS_ORPHAN_ITEM_KEY)
1068 break;
1069
1070 ret = btrfs_del_item(trans, root, path);
1071 BUG_ON(ret);
1072
1073 btrfs_release_path(root, path);
1074 inode = read_one_inode(root, key.offset);
1075 BUG_ON(!inode);
1076
1077 ret = fixup_inode_link_count(trans, root, inode);
1078 BUG_ON(ret);
1079
1080 iput(inode);
1081
1082 if (key.offset == 0)
1083 break;
1084 key.offset--;
1085 }
1086 btrfs_release_path(root, path);
1087 return 0;
1088}
1089
1090
1091/*
1092 * record a given inode in the fixup dir so we can check its link
1093 * count when replay is done. The link count is incremented here
1094 * so the inode won't go away until we check it
1095 */
1096static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1097 struct btrfs_root *root,
1098 struct btrfs_path *path,
1099 u64 objectid)
1100{
1101 struct btrfs_key key;
1102 int ret = 0;
1103 struct inode *inode;
1104
1105 inode = read_one_inode(root, objectid);
1106 BUG_ON(!inode);
1107
1108 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1109 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1110 key.offset = objectid;
1111
1112 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1113
1114 btrfs_release_path(root, path);
1115 if (ret == 0) {
1116 btrfs_inc_nlink(inode);
1117 btrfs_update_inode(trans, root, inode);
1118 } else if (ret == -EEXIST) {
1119 ret = 0;
1120 } else {
1121 BUG();
1122 }
1123 iput(inode);
1124
1125 return ret;
1126}
1127
1128/*
1129 * when replaying the log for a directory, we only insert names
1130 * for inodes that actually exist. This means an fsync on a directory
1131 * does not implicitly fsync all the new files in it
1132 */
1133static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1134 struct btrfs_root *root,
1135 struct btrfs_path *path,
1136 u64 dirid, u64 index,
1137 char *name, int name_len, u8 type,
1138 struct btrfs_key *location)
1139{
1140 struct inode *inode;
1141 struct inode *dir;
1142 int ret;
1143
1144 inode = read_one_inode(root, location->objectid);
1145 if (!inode)
1146 return -ENOENT;
1147
1148 dir = read_one_inode(root, dirid);
1149 if (!dir) {
1150 iput(inode);
1151 return -EIO;
1152 }
1153 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1154
1155 /* FIXME, put inode into FIXUP list */
1156
1157 iput(inode);
1158 iput(dir);
1159 return ret;
1160}
1161
1162/*
1163 * take a single entry in a log directory item and replay it into
1164 * the subvolume.
1165 *
1166 * if a conflicting item exists in the subdirectory already,
1167 * the inode it points to is unlinked and put into the link count
1168 * fix up tree.
1169 *
1170 * If a name from the log points to a file or directory that does
1171 * not exist in the FS, it is skipped. fsyncs on directories
1172 * do not force down inodes inside that directory, just changes to the
1173 * names or unlinks in a directory.
1174 */
1175static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1176 struct btrfs_root *root,
1177 struct btrfs_path *path,
1178 struct extent_buffer *eb,
1179 struct btrfs_dir_item *di,
1180 struct btrfs_key *key)
1181{
1182 char *name;
1183 int name_len;
1184 struct btrfs_dir_item *dst_di;
1185 struct btrfs_key found_key;
1186 struct btrfs_key log_key;
1187 struct inode *dir;
1188 u8 log_type;
1189 int exists;
1190 int ret;
1191
1192 dir = read_one_inode(root, key->objectid);
1193 BUG_ON(!dir);
1194
1195 name_len = btrfs_dir_name_len(eb, di);
1196 name = kmalloc(name_len, GFP_NOFS);
1197 log_type = btrfs_dir_type(eb, di);
1198 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1199 name_len);
1200
1201 btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1202 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1203 if (exists == 0)
1204 exists = 1;
1205 else
1206 exists = 0;
1207 btrfs_release_path(root, path);
1208
1209 if (key->type == BTRFS_DIR_ITEM_KEY) {
1210 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1211 name, name_len, 1);
1212 }
1213 else if (key->type == BTRFS_DIR_INDEX_KEY) {
1214 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1215 key->objectid,
1216 key->offset, name,
1217 name_len, 1);
1218 } else {
1219 BUG();
1220 }
1221 if (!dst_di || IS_ERR(dst_di)) {
1222 /* we need a sequence number to insert, so we only
1223 * do inserts for the BTRFS_DIR_INDEX_KEY types
1224 */
1225 if (key->type != BTRFS_DIR_INDEX_KEY)
1226 goto out;
1227 goto insert;
1228 }
1229
1230 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1231 /* the existing item matches the logged item */
1232 if (found_key.objectid == log_key.objectid &&
1233 found_key.type == log_key.type &&
1234 found_key.offset == log_key.offset &&
1235 btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1236 goto out;
1237 }
1238
1239 /*
1240 * don't drop the conflicting directory entry if the inode
1241 * for the new entry doesn't exist
1242 */
1243 if (!exists)
1244 goto out;
1245
1246 ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1247 BUG_ON(ret);
1248
1249 if (key->type == BTRFS_DIR_INDEX_KEY)
1250 goto insert;
1251out:
1252 btrfs_release_path(root, path);
1253 kfree(name);
1254 iput(dir);
1255 return 0;
1256
1257insert:
1258 btrfs_release_path(root, path);
1259 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1260 name, name_len, log_type, &log_key);
1261
1262 if (ret && ret != -ENOENT)
1263 BUG();
1264 goto out;
1265}
1266
1267/*
1268 * find all the names in a directory item and reconcile them into
1269 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
1270 * one name in a directory item, but the same code gets used for
1271 * both directory index types
1272 */
1273static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1274 struct btrfs_root *root,
1275 struct btrfs_path *path,
1276 struct extent_buffer *eb, int slot,
1277 struct btrfs_key *key)
1278{
1279 int ret;
1280 u32 item_size = btrfs_item_size_nr(eb, slot);
1281 struct btrfs_dir_item *di;
1282 int name_len;
1283 unsigned long ptr;
1284 unsigned long ptr_end;
1285
1286 ptr = btrfs_item_ptr_offset(eb, slot);
1287 ptr_end = ptr + item_size;
1288 while (ptr < ptr_end) {
1289 di = (struct btrfs_dir_item *)ptr;
1290 name_len = btrfs_dir_name_len(eb, di);
1291 ret = replay_one_name(trans, root, path, eb, di, key);
1292 BUG_ON(ret);
1293 ptr = (unsigned long)(di + 1);
1294 ptr += name_len;
1295 }
1296 return 0;
1297}
1298
1299/*
1300 * directory replay has two parts. There are the standard directory
1301 * items in the log copied from the subvolume, and range items
1302 * created in the log while the subvolume was logged.
1303 *
1304 * The range items tell us which parts of the key space the log
1305 * is authoritative for. During replay, if a key in the subvolume
1306 * directory is in a logged range item, but not actually in the log,
1307 * that means it was deleted from the directory before the fsync
1308 * and should be removed.
1309 */
1310static noinline int find_dir_range(struct btrfs_root *root,
1311 struct btrfs_path *path,
1312 u64 dirid, int key_type,
1313 u64 *start_ret, u64 *end_ret)
1314{
1315 struct btrfs_key key;
1316 u64 found_end;
1317 struct btrfs_dir_log_item *item;
1318 int ret;
1319 int nritems;
1320
1321 if (*start_ret == (u64)-1)
1322 return 1;
1323
1324 key.objectid = dirid;
1325 key.type = key_type;
1326 key.offset = *start_ret;
1327
1328 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1329 if (ret < 0)
1330 goto out;
1331 if (ret > 0) {
1332 if (path->slots[0] == 0)
1333 goto out;
1334 path->slots[0]--;
1335 }
1336 if (ret != 0)
1337 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1338
1339 if (key.type != key_type || key.objectid != dirid) {
1340 ret = 1;
1341 goto next;
1342 }
1343 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1344 struct btrfs_dir_log_item);
1345 found_end = btrfs_dir_log_end(path->nodes[0], item);
1346
1347 if (*start_ret >= key.offset && *start_ret <= found_end) {
1348 ret = 0;
1349 *start_ret = key.offset;
1350 *end_ret = found_end;
1351 goto out;
1352 }
1353 ret = 1;
1354next:
1355 /* check the next slot in the tree to see if it is a valid item */
1356 nritems = btrfs_header_nritems(path->nodes[0]);
1357 if (path->slots[0] >= nritems) {
1358 ret = btrfs_next_leaf(root, path);
1359 if (ret)
1360 goto out;
1361 } else {
1362 path->slots[0]++;
1363 }
1364
1365 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1366
1367 if (key.type != key_type || key.objectid != dirid) {
1368 ret = 1;
1369 goto out;
1370 }
1371 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1372 struct btrfs_dir_log_item);
1373 found_end = btrfs_dir_log_end(path->nodes[0], item);
1374 *start_ret = key.offset;
1375 *end_ret = found_end;
1376 ret = 0;
1377out:
1378 btrfs_release_path(root, path);
1379 return ret;
1380}
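
The decision rule implemented by find_dir_range and its caller boils down to: a directory key that falls inside a range the log claims authority over, yet is missing from the log itself, must have been deleted before the fsync. An illustrative model of that rule:

    #include <stdbool.h>
    #include <stdio.h>

    struct range { unsigned long long start, end; };

    static bool in_ranges(const struct range *r, int n,
                          unsigned long long key)
    {
        for (int i = 0; i < n; i++)
            if (key >= r[i].start && key <= r[i].end)
                return true;
        return false;
    }

    /* deletion replay: inside a logged range but absent from the log
       means the name was removed from the directory before the fsync */
    static bool should_delete(const struct range *logged, int n,
                              bool in_log, unsigned long long key)
    {
        return in_ranges(logged, n, key) && !in_log;
    }

    int main(void)
    {
        struct range logged[] = { { 0, 100 } };

        printf("key 50, not in log -> delete? %d\n",
               should_delete(logged, 1, false, 50));
        printf("key 200, not in log -> delete? %d\n",  /* no authority */
               should_delete(logged, 1, false, 200));
        return 0;
    }

Keys outside every logged range are left alone: the log says nothing about them, so the subvolume's copy stands.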
1381
1382/*
1383 * this looks for a given directory item in the log. If the directory
1384 * item is not in the log, the item is removed and the inode it points
1385 * to is unlinked
1386 */
1387static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1388 struct btrfs_root *root,
1389 struct btrfs_root *log,
1390 struct btrfs_path *path,
1391 struct btrfs_path *log_path,
1392 struct inode *dir,
1393 struct btrfs_key *dir_key)
1394{
1395 int ret;
1396 struct extent_buffer *eb;
1397 int slot;
1398 u32 item_size;
1399 struct btrfs_dir_item *di;
1400 struct btrfs_dir_item *log_di;
1401 int name_len;
1402 unsigned long ptr;
1403 unsigned long ptr_end;
1404 char *name;
1405 struct inode *inode;
1406 struct btrfs_key location;
1407
1408again:
1409 eb = path->nodes[0];
1410 slot = path->slots[0];
1411 item_size = btrfs_item_size_nr(eb, slot);
1412 ptr = btrfs_item_ptr_offset(eb, slot);
1413 ptr_end = ptr + item_size;
1414 while (ptr < ptr_end) {
1415 di = (struct btrfs_dir_item *)ptr;
1416 name_len = btrfs_dir_name_len(eb, di);
1417 name = kmalloc(name_len, GFP_NOFS);
1418 if (!name) {
1419 ret = -ENOMEM;
1420 goto out;
1421 }
1422 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1423 name_len);
1424 log_di = NULL;
1425 if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
1426 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1427 dir_key->objectid,
1428 name, name_len, 0);
1429 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
1430 log_di = btrfs_lookup_dir_index_item(trans, log,
1431 log_path,
1432 dir_key->objectid,
1433 dir_key->offset,
1434 name, name_len, 0);
1435 }
1436 if (!log_di || IS_ERR(log_di)) {
1437 btrfs_dir_item_key_to_cpu(eb, di, &location);
1438 btrfs_release_path(root, path);
1439 btrfs_release_path(log, log_path);
1440 inode = read_one_inode(root, location.objectid);
1441 BUG_ON(!inode);
1442
1443 ret = link_to_fixup_dir(trans, root,
1444 path, location.objectid);
1445 BUG_ON(ret);
1446 btrfs_inc_nlink(inode);
1447 ret = btrfs_unlink_inode(trans, root, dir, inode,
1448 name, name_len);
1449 BUG_ON(ret);
1450 kfree(name);
1451 iput(inode);
1452
1453 /* there might still be more names under this key;
1454 * check and repeat if required
1455 */
1456 ret = btrfs_search_slot(NULL, root, dir_key, path,
1457 0, 0);
1458 if (ret == 0)
1459 goto again;
1460 ret = 0;
1461 goto out;
1462 }
1463 btrfs_release_path(log, log_path);
1464 kfree(name);
1465
1466 ptr = (unsigned long)(di + 1);
1467 ptr += name_len;
1468 }
1469 ret = 0;
1470out:
1471 btrfs_release_path(root, path);
1472 btrfs_release_path(log, log_path);
1473 return ret;
1474}
1475
1476/*
1477 * deletion replay happens before we copy any new directory items
1478 * out of the log or out of backreferences from inodes. It
1479 * scans the log to find ranges of keys that the log is authoritative for,
1480 * and then scans the directory to find items in those ranges that are
1481 * not present in the log.
1482 *
1483 * Anything we don't find in the log is unlinked and removed from the
1484 * directory.
1485 */
1486static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1487 struct btrfs_root *root,
1488 struct btrfs_root *log,
1489 struct btrfs_path *path,
1490 u64 dirid)
1491{
1492 u64 range_start;
1493 u64 range_end;
1494 int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1495 int ret = 0;
1496 struct btrfs_key dir_key;
1497 struct btrfs_key found_key;
1498 struct btrfs_path *log_path;
1499 struct inode *dir;
1500
1501 dir_key.objectid = dirid;
1502 dir_key.type = BTRFS_DIR_ITEM_KEY;
1503 log_path = btrfs_alloc_path();
1504 if (!log_path)
1505 return -ENOMEM;
1506
1507 dir = read_one_inode(root, dirid);
1508 /* it isn't an error if the inode isn't there, that can happen
1509 * because we replay the deletes before we copy in the inode item
1510 * from the log
1511 */
1512 if (!dir) {
1513 btrfs_free_path(log_path);
1514 return 0;
1515 }
1516again:
1517 range_start = 0;
1518 range_end = 0;
1519 while (1) {
1520 ret = find_dir_range(log, path, dirid, key_type,
1521 &range_start, &range_end);
1522 if (ret != 0)
1523 break;
1524
1525 dir_key.offset = range_start;
1526 while (1) {
1527 int nritems;
1528 ret = btrfs_search_slot(NULL, root, &dir_key, path,
1529 0, 0);
1530 if (ret < 0)
1531 goto out;
1532
1533 nritems = btrfs_header_nritems(path->nodes[0]);
1534 if (path->slots[0] >= nritems) {
1535 ret = btrfs_next_leaf(root, path);
1536 if (ret)
1537 break;
1538 }
1539 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1540 path->slots[0]);
1541 if (found_key.objectid != dirid ||
1542 found_key.type != dir_key.type)
1543 goto next_type;
1544
1545 if (found_key.offset > range_end)
1546 break;
1547
1548 ret = check_item_in_log(trans, root, log, path,
1549 log_path, dir, &found_key);
1550 BUG_ON(ret);
1551 if (found_key.offset == (u64)-1)
1552 break;
1553 dir_key.offset = found_key.offset + 1;
1554 }
1555 btrfs_release_path(root, path);
1556 if (range_end == (u64)-1)
1557 break;
1558 range_start = range_end + 1;
1559 }
1560
1561next_type:
1562 ret = 0;
1563 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1564 key_type = BTRFS_DIR_LOG_INDEX_KEY;
1565 dir_key.type = BTRFS_DIR_INDEX_KEY;
1566 btrfs_release_path(root, path);
1567 goto again;
1568 }
1569out:
1570 btrfs_release_path(root, path);
1571 btrfs_free_path(log_path);
1572 iput(dir);
1573 return ret;
1574}
1575
1576/*
1577 * the process_func used to replay items from the log tree. This
1578 * gets called in two different stages. The first stage just looks
1579 * for inodes and makes sure they are all copied into the subvolume.
1580 *
1581 * The second stage copies all the other item types from the log into
1582 * the subvolume. The two stage approach is slower, but gets rid of
1583 * lots of complexity around inodes referencing other inodes that exist
1584 * only in the log (references come from either directory items or inode
1585 * back refs).
1586 */
1587static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1588 struct walk_control *wc, u64 gen)
1589{
1590 int nritems;
1591 struct btrfs_path *path;
1592 struct btrfs_root *root = wc->replay_dest;
1593 struct btrfs_key key;
1594 u32 item_size;
1595 int level;
1596 int i;
1597 int ret;
1598
1599 btrfs_read_buffer(eb, gen);
1600
1601 level = btrfs_header_level(eb);
1602
1603 if (level != 0)
1604 return 0;
1605
1606 path = btrfs_alloc_path();
1607 BUG_ON(!path);
1608
1609 nritems = btrfs_header_nritems(eb);
1610 for (i = 0; i < nritems; i++) {
1611 btrfs_item_key_to_cpu(eb, &key, i);
1612 item_size = btrfs_item_size_nr(eb, i);
1613
1614 /* inode keys are done during the first stage */
1615 if (key.type == BTRFS_INODE_ITEM_KEY &&
1616 wc->stage == LOG_WALK_REPLAY_INODES) {
1617 struct inode *inode;
1618 struct btrfs_inode_item *inode_item;
1619 u32 mode;
1620
1621 inode_item = btrfs_item_ptr(eb, i,
1622 struct btrfs_inode_item);
1623 mode = btrfs_inode_mode(eb, inode_item);
1624 if (S_ISDIR(mode)) {
1625 ret = replay_dir_deletes(wc->trans,
1626 root, log, path, key.objectid);
1627 BUG_ON(ret);
1628 }
1629 ret = overwrite_item(wc->trans, root, path,
1630 eb, i, &key);
1631 BUG_ON(ret);
1632
1633 /* for regular files, truncate away
1634 * extents past the new EOF
1635 */
1636 if (S_ISREG(mode)) {
1637 inode = read_one_inode(root,
1638 key.objectid);
1639 BUG_ON(!inode);
1640
1641 ret = btrfs_truncate_inode_items(wc->trans,
1642 root, inode, inode->i_size,
1643 BTRFS_EXTENT_DATA_KEY);
1644 BUG_ON(ret);
1645 iput(inode);
1646 }
1647 ret = link_to_fixup_dir(wc->trans, root,
1648 path, key.objectid);
1649 BUG_ON(ret);
1650 }
1651 if (wc->stage < LOG_WALK_REPLAY_ALL)
1652 continue;
1653
1654 /* these keys are simply copied */
1655 if (key.type == BTRFS_XATTR_ITEM_KEY) {
1656 ret = overwrite_item(wc->trans, root, path,
1657 eb, i, &key);
1658 BUG_ON(ret);
1659 } else if (key.type == BTRFS_INODE_REF_KEY) {
1660 ret = add_inode_ref(wc->trans, root, log, path,
1661 eb, i, &key);
1662 BUG_ON(ret && ret != -ENOENT);
1663 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1664 ret = replay_one_extent(wc->trans, root, path,
1665 eb, i, &key);
1666 BUG_ON(ret);
1667 } else if (key.type == BTRFS_CSUM_ITEM_KEY) {
1668 ret = replay_one_csum(wc->trans, root, path,
1669 eb, i, &key);
1670 BUG_ON(ret);
1671 } else if (key.type == BTRFS_DIR_ITEM_KEY ||
1672 key.type == BTRFS_DIR_INDEX_KEY) {
1673 ret = replay_one_dir_item(wc->trans, root, path,
1674 eb, i, &key);
1675 BUG_ON(ret);
1676 }
1677 }
1678 btrfs_free_path(path);
1679 return 0;
1680}
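
replay_one_buffer runs over the same leaves once per stage, so its dispatch reduces to "create every inode first, then replay everything that references them". A toy model of that two-pass dispatch, with hypothetical names:

    #include <stdio.h>

    enum { WALK_REPLAY_INODES = 1, WALK_REPLAY_ALL = 2 };
    enum key_type { INODE_ITEM, INODE_REF, DIR_ITEM, EXTENT_DATA };

    /* stage 1 creates every inode first; stage 2 can then safely add
       the names and extents that reference them */
    static void replay_key(enum key_type type, int stage)
    {
        if (type == INODE_ITEM && stage == WALK_REPLAY_INODES)
            printf("create/overwrite inode\n");
        if (stage < WALK_REPLAY_ALL)
            return;                   /* everything else waits */
        switch (type) {
        case INODE_REF:   printf("add back reference\n"); break;
        case DIR_ITEM:    printf("replay directory name\n"); break;
        case EXTENT_DATA: printf("replay file extent\n"); break;
        default: break;
        }
    }

    int main(void)
    {
        enum key_type keys[] = { DIR_ITEM, INODE_ITEM, EXTENT_DATA };

        for (int stage = WALK_REPLAY_INODES; stage <= WALK_REPLAY_ALL;
             stage++)
            for (int i = 0; i < 3; i++)
                replay_key(keys[i], stage);
        return 0;
    }

Splitting the work this way avoids ordering problems where a directory name or back ref in the log points at an inode that only exists later in the same log.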
1681
1682static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
1683 struct btrfs_root *root,
1684 struct btrfs_path *path, int *level,
1685 struct walk_control *wc)
1686{
1687 u64 root_owner;
1688 u64 root_gen;
1689 u64 bytenr;
1690 u64 ptr_gen;
1691 struct extent_buffer *next;
1692 struct extent_buffer *cur;
1693 struct extent_buffer *parent;
1694 u32 blocksize;
1695 int ret = 0;
1696
1697 WARN_ON(*level < 0);
1698 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1699
1700 while (*level > 0) {
1701 WARN_ON(*level < 0);
1702 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1703 cur = path->nodes[*level];
1704
1705 if (btrfs_header_level(cur) != *level)
1706 WARN_ON(1);
1707
1708 if (path->slots[*level] >=
1709 btrfs_header_nritems(cur))
1710 break;
1711
1712 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1713 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
1714 blocksize = btrfs_level_size(root, *level - 1);
1715
1716 parent = path->nodes[*level];
1717 root_owner = btrfs_header_owner(parent);
1718 root_gen = btrfs_header_generation(parent);
1719
1720 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1721
1722 wc->process_func(root, next, wc, ptr_gen);
1723
1724 if (*level == 1) {
1725 path->slots[*level]++;
1726 if (wc->free) {
1727 btrfs_read_buffer(next, ptr_gen);
1728
1729 btrfs_tree_lock(next);
1730 clean_tree_block(trans, root, next);
1731 btrfs_wait_tree_block_writeback(next);
1732 btrfs_tree_unlock(next);
1733
1734 ret = btrfs_drop_leaf_ref(trans, root, next);
1735 BUG_ON(ret);
1736
1737 WARN_ON(root_owner !=
1738 BTRFS_TREE_LOG_OBJECTID);
1739 ret = btrfs_free_reserved_extent(root,
1740 bytenr, blocksize);
1741 BUG_ON(ret);
1742 }
1743 free_extent_buffer(next);
1744 continue;
1745 }
1746 btrfs_read_buffer(next, ptr_gen);
1747
1748 WARN_ON(*level <= 0);
1749 if (path->nodes[*level-1])
1750 free_extent_buffer(path->nodes[*level-1]);
1751 path->nodes[*level-1] = next;
1752 *level = btrfs_header_level(next);
1753 path->slots[*level] = 0;
1754 cond_resched();
1755 }
1756 WARN_ON(*level < 0);
1757 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1758
1759 if (path->nodes[*level] == root->node) {
1760 parent = path->nodes[*level];
1761 } else {
1762 parent = path->nodes[*level + 1];
1763 }
1764 bytenr = path->nodes[*level]->start;
1765
1766 blocksize = btrfs_level_size(root, *level);
1767 root_owner = btrfs_header_owner(parent);
1768 root_gen = btrfs_header_generation(parent);
1769
1770 wc->process_func(root, path->nodes[*level], wc,
1771 btrfs_header_generation(path->nodes[*level]));
1772
1773 if (wc->free) {
1774 next = path->nodes[*level];
1775 btrfs_tree_lock(next);
1776 clean_tree_block(trans, root, next);
1777 btrfs_wait_tree_block_writeback(next);
1778 btrfs_tree_unlock(next);
1779
1780 if (*level == 0) {
1781 ret = btrfs_drop_leaf_ref(trans, root, next);
1782 BUG_ON(ret);
1783 }
1784 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1785 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1786 BUG_ON(ret);
1787 }
1788 free_extent_buffer(path->nodes[*level]);
1789 path->nodes[*level] = NULL;
1790 *level += 1;
1791
1792 cond_resched();
1793 return 0;
1794}
1795
1796static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
1797 struct btrfs_root *root,
1798 struct btrfs_path *path, int *level,
1799 struct walk_control *wc)
1800{
1801 u64 root_owner;
1802 u64 root_gen;
1803 int i;
1804 int slot;
1805 int ret;
1806
1807 for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1808 slot = path->slots[i];
1809 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
1810 struct extent_buffer *node;
1811 node = path->nodes[i];
1812 path->slots[i]++;
1813 *level = i;
1814 WARN_ON(*level == 0);
1815 return 0;
1816 } else {
1817 struct extent_buffer *parent;
1818 if (path->nodes[*level] == root->node)
1819 parent = path->nodes[*level];
1820 else
1821 parent = path->nodes[*level + 1];
1822
1823 root_owner = btrfs_header_owner(parent);
1824 root_gen = btrfs_header_generation(parent);
1825 wc->process_func(root, path->nodes[*level], wc,
1826 btrfs_header_generation(path->nodes[*level]));
1827 if (wc->free) {
1828 struct extent_buffer *next;
1829
1830 next = path->nodes[*level];
1831
1832 btrfs_tree_lock(next);
1833 clean_tree_block(trans, root, next);
1834 btrfs_wait_tree_block_writeback(next);
1835 btrfs_tree_unlock(next);
1836
1837 if (*level == 0) {
1838 ret = btrfs_drop_leaf_ref(trans, root,
1839 next);
1840 BUG_ON(ret);
1841 }
1842
1843 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1844 ret = btrfs_free_reserved_extent(root,
1845 path->nodes[*level]->start,
1846 path->nodes[*level]->len);
1847 BUG_ON(ret);
1848 }
1849 free_extent_buffer(path->nodes[*level]);
1850 path->nodes[*level] = NULL;
1851 *level = i + 1;
1852 }
1853 }
1854 return 1;
1855}
1856
1857/*
1858 * walk the log tree rooted at 'log', calling wc->process_func on each
1859 * buffer. When wc->free is set, the blocks are also cleaned and their
1860 * reserved extents are freed after processing.
1861 */
1862static int walk_log_tree(struct btrfs_trans_handle *trans,
1863 struct btrfs_root *log, struct walk_control *wc)
1864{
1865 int ret = 0;
1866 int wret;
1867 int level;
1868 struct btrfs_path *path;
1869 int i;
1870 int orig_level;
1871
1872 path = btrfs_alloc_path();
1873 BUG_ON(!path);
1874
1875 level = btrfs_header_level(log->node);
1876 orig_level = level;
1877 path->nodes[level] = log->node;
1878 extent_buffer_get(log->node);
1879 path->slots[level] = 0;
1880
1881 while(1) {
1882 wret = walk_down_log_tree(trans, log, path, &level, wc);
1883 if (wret > 0)
1884 break;
1885 if (wret < 0)
1886 ret = wret;
1887
1888 wret = walk_up_log_tree(trans, log, path, &level, wc);
1889 if (wret > 0)
1890 break;
1891 if (wret < 0)
1892 ret = wret;
1893 }
1894
1895 /* was the root node processed? if not, catch it here */
1896 if (path->nodes[orig_level]) {
1897 wc->process_func(log, path->nodes[orig_level], wc,
1898 btrfs_header_generation(path->nodes[orig_level]));
1899 if (wc->free) {
1900 struct extent_buffer *next;
1901
1902 next = path->nodes[orig_level];
1903
1904 btrfs_tree_lock(next);
1905 clean_tree_block(trans, log, next);
1906 btrfs_wait_tree_block_writeback(next);
1907 btrfs_tree_unlock(next);
1908
1909 if (orig_level == 0) {
1910 ret = btrfs_drop_leaf_ref(trans, log,
1911 next);
1912 BUG_ON(ret);
1913 }
1914 WARN_ON(log->root_key.objectid !=
1915 BTRFS_TREE_LOG_OBJECTID);
1916 ret = btrfs_free_reserved_extent(log, next->start,
1917 next->len);
1918 BUG_ON(ret);
1919 }
1920 }
1921
1922 for (i = 0; i <= orig_level; i++) {
1923 if (path->nodes[i]) {
1924 free_extent_buffer(path->nodes[i]);
1925 path->nodes[i] = NULL;
1926 }
1927 }
1928 btrfs_free_path(path);
1929 if (wc->free)
1930 free_extent_buffer(log->node);
1931 return ret;
1932}
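/*
 * a sketch of how the three walkers above cooperate: walk_down_log_tree
 * descends one child at a time, pushing buffers into path->nodes[level],
 * while walk_up_log_tree pops finished nodes and advances the parent's
 * slot. Alternating them from walk_log_tree's loop yields an iterative
 * depth-first traversal. For a two level tree:
 *
 *   down: root slot 0 -> leaf A (processed), up: advance to root slot 1
 *   down: root slot 1 -> leaf B (processed), up: out of slots, return 1
 *
 * walk_log_tree then handles the root node itself before unwinding.
 */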
1933
1934int wait_log_commit(struct btrfs_root *log)
1935{
1936 DEFINE_WAIT(wait);
1937 u64 transid = log->fs_info->tree_log_transid;
1938
1939 do {
1940 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1941 TASK_UNINTERRUPTIBLE);
1942 mutex_unlock(&log->fs_info->tree_log_mutex);
1943 if (atomic_read(&log->fs_info->tree_log_commit))
1944 schedule();
1945 finish_wait(&log->fs_info->tree_log_wait, &wait);
1946 mutex_lock(&log->fs_info->tree_log_mutex);
1947 } while(transid == log->fs_info->tree_log_transid &&
1948 atomic_read(&log->fs_info->tree_log_commit));
1949 return 0;
1950}
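/*
 * the prepare_to_wait/schedule/finish_wait dance above is the standard
 * way to avoid missing a wakeup: the task goes onto tree_log_wait
 * before tree_log_commit is re-tested, so a wake_up racing with the
 * mutex_unlock still finds it on the queue. The shape to copy is:
 *
 *	prepare_to_wait(q, &wait, TASK_UNINTERRUPTIBLE);
 *	if (condition)          re-check only after queueing
 *		schedule();
 *	finish_wait(q, &wait);
 */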
1951
1952/*
1953 * btrfs_sync_log sends a given tree log down to the disk and
1954 * updates the super blocks to record it. When this call is done,
1955 * you know that any inodes previously logged are safely on disk
1956 */
1957int btrfs_sync_log(struct btrfs_trans_handle *trans,
1958 struct btrfs_root *root)
1959{
1960 int ret;
1961 unsigned long batch;
1962 struct btrfs_root *log = root->log_root;
1963
1964 mutex_lock(&log->fs_info->tree_log_mutex);
1965 if (atomic_read(&log->fs_info->tree_log_commit)) {
1966 wait_log_commit(log);
1967 goto out;
1968 }
1969 atomic_set(&log->fs_info->tree_log_commit, 1);
1970
1971 while(1) {
1972 batch = log->fs_info->tree_log_batch;
1973 mutex_unlock(&log->fs_info->tree_log_mutex);
1974 schedule_timeout_uninterruptible(1);
1975 mutex_lock(&log->fs_info->tree_log_mutex);
1976
1977 while(atomic_read(&log->fs_info->tree_log_writers)) {
1978 DEFINE_WAIT(wait);
1979 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1980 TASK_UNINTERRUPTIBLE);
1981 mutex_unlock(&log->fs_info->tree_log_mutex);
1982 if (atomic_read(&log->fs_info->tree_log_writers))
1983 schedule();
1984 mutex_lock(&log->fs_info->tree_log_mutex);
1985 finish_wait(&log->fs_info->tree_log_wait, &wait);
1986 }
1987 if (batch == log->fs_info->tree_log_batch)
1988 break;
1989 }
1990
1991 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1992 BUG_ON(ret);
1993 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
1994 &root->fs_info->log_root_tree->dirty_log_pages);
1995 BUG_ON(ret);
1996
1997 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1998 log->fs_info->log_root_tree->node->start);
1999 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
2000 btrfs_header_level(log->fs_info->log_root_tree->node));
2001
2002 write_ctree_super(trans, log->fs_info->tree_root);
2003 log->fs_info->tree_log_transid++;
2004 log->fs_info->tree_log_batch = 0;
2005 atomic_set(&log->fs_info->tree_log_commit, 0);
2006 smp_mb();
2007 if (waitqueue_active(&log->fs_info->tree_log_wait))
2008 wake_up(&log->fs_info->tree_log_wait);
2009out:
2010 mutex_unlock(&log->fs_info->tree_log_mutex);
2011 return 0;
2013}
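/*
 * a note on the batching loop above: it samples tree_log_batch, sleeps
 * a jiffy, waits out any active tree_log_writers, and repeats until a
 * full pass leaves the batch count unchanged. fsyncs that arrive while
 * the committing task waits are thereby absorbed into this same log
 * write instead of forcing one super update each.
 */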
2014
2015/* free all the extents used by the tree log. This should be called
2016 * at commit time of the full transaction
2017 */
2018int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2019{
2020 int ret;
2021 struct btrfs_root *log;
2023 u64 start;
2024 u64 end;
2025 struct walk_control wc = {
2026 .free = 1,
2027 .process_func = process_one_buffer
2028 };
2029
2030 if (!root->log_root)
2031 return 0;
2032
2033 log = root->log_root;
2034 ret = walk_log_tree(trans, log, &wc);
2035 BUG_ON(ret);
2036
2037 while(1) {
2038 ret = find_first_extent_bit(&log->dirty_log_pages,
2039 0, &start, &end, EXTENT_DIRTY);
2040 if (ret)
2041 break;
2042
2043 clear_extent_dirty(&log->dirty_log_pages,
2044 start, end, GFP_NOFS);
2045 }
2046
2048 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2049 &log->root_key);
2050 BUG_ON(ret);
2051 kfree(log);
2052 root->log_root = NULL;
2053 return 0;
2054}
2055
2056/*
2057 * helper function to update the item for a given subvolume's log root
2058 * in the tree of log roots
2059 */
2060static int update_log_root(struct btrfs_trans_handle *trans,
2061 struct btrfs_root *log)
2062{
2063 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2064 int ret;
2065
2066 if (log->node->start == bytenr)
2067 return 0;
2068
2069 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2070 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2071 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2072 &log->root_key, &log->root_item);
2073 BUG_ON(ret);
2074 return ret;
2075}
2076
2077/*
2078 * If both a file and directory are logged, and unlinks or renames are
2079 * mixed in, we have a few interesting corners:
2080 *
2081 * create file X in dir Y
2082 * link file X to X.link in dir Y
2083 * fsync file X
2084 * unlink file X but leave X.link
2085 * fsync dir Y
2086 *
2087 * After a crash we would expect only X.link to exist. But file X
2088 * didn't get fsync'd again so the log has back refs for X and X.link.
2089 *
2090 * We solve this by removing directory entries and inode backrefs from the
2091 * log when a file that was logged in the current transaction is
2092 * unlinked. Any later fsync will include the updated log entries, and
2093 * we'll be able to reconstruct the proper directory items from backrefs.
2094 *
2095 * This optimization allows us to avoid relogging the entire inode
2096 * or the entire directory.
2097 */
2098int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2099 struct btrfs_root *root,
2100 const char *name, int name_len,
2101 struct inode *dir, u64 index)
2102{
2103 struct btrfs_root *log;
2104 struct btrfs_dir_item *di;
2105 struct btrfs_path *path;
2106 int ret;
2107 int bytes_del = 0;
2108
2109 if (BTRFS_I(dir)->logged_trans < trans->transid)
2110 return 0;
2111
2112 ret = join_running_log_trans(root);
2113 if (ret)
2114 return 0;
2115
2116 mutex_lock(&BTRFS_I(dir)->log_mutex);
2117
2118 log = root->log_root;
2119 path = btrfs_alloc_path();
2120 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2121 name, name_len, -1);
2122 if (di && !IS_ERR(di)) {
2123 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2124 bytes_del += name_len;
2125 BUG_ON(ret);
2126 }
2127 btrfs_release_path(log, path);
2128 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2129 index, name, name_len, -1);
2130 if (di && !IS_ERR(di)) {
2131 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2132 bytes_del += name_len;
2133 BUG_ON(ret);
2134 }
2135
2136 /* update the directory size in the log to reflect the names
2137 * we have removed
2138 */
2139 if (bytes_del) {
2140 struct btrfs_key key;
2141
2142 key.objectid = dir->i_ino;
2143 key.offset = 0;
2144 key.type = BTRFS_INODE_ITEM_KEY;
2145 btrfs_release_path(log, path);
2146
2147 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2148 if (ret == 0) {
2149 struct btrfs_inode_item *item;
2150 u64 i_size;
2151
2152 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2153 struct btrfs_inode_item);
2154 i_size = btrfs_inode_size(path->nodes[0], item);
2155 if (i_size > bytes_del)
2156 i_size -= bytes_del;
2157 else
2158 i_size = 0;
2159 btrfs_set_inode_size(path->nodes[0], item, i_size);
2160 btrfs_mark_buffer_dirty(path->nodes[0]);
2161 } else
2162 ret = 0;
2163 btrfs_release_path(log, path);
2164 }
2165
2166 btrfs_free_path(path);
2167 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2168 end_log_trans(root);
2169
2170 return 0;
2171}
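/*
 * a note on the bytes_del accounting above: a btrfs directory's i_size
 * is the total length of the names it holds, counted once for the dir
 * item and once for the dir index. Unlinking "foo" therefore removes
 * two 3 byte names, and the logged i_size shrinks by 6 to match what
 * the subvolume tree will show once the unlink is replayed.
 */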
2172
2173/* see comments for btrfs_del_dir_entries_in_log */
2174int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2175 struct btrfs_root *root,
2176 const char *name, int name_len,
2177 struct inode *inode, u64 dirid)
2178{
2179 struct btrfs_root *log;
2180 u64 index;
2181 int ret;
2182
2183 if (BTRFS_I(inode)->logged_trans < trans->transid)
2184 return 0;
2185
2186 ret = join_running_log_trans(root);
2187 if (ret)
2188 return 0;
2189 log = root->log_root;
2190 mutex_lock(&BTRFS_I(inode)->log_mutex);
2191
2192 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2193 dirid, &index);
2194 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2195 end_log_trans(root);
2196
2197 return ret;
2198}
2199
2200/*
2201 * creates a range item in the log for 'dirid'. first_offset and
2202 * last_offset tell us which parts of the key space the log should
2203 * be considered authoritative for.
2204 */
2205static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2206 struct btrfs_root *log,
2207 struct btrfs_path *path,
2208 int key_type, u64 dirid,
2209 u64 first_offset, u64 last_offset)
2210{
2211 int ret;
2212 struct btrfs_key key;
2213 struct btrfs_dir_log_item *item;
2214
2215 key.objectid = dirid;
2216 key.offset = first_offset;
2217 if (key_type == BTRFS_DIR_ITEM_KEY)
2218 key.type = BTRFS_DIR_LOG_ITEM_KEY;
2219 else
2220 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2221 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2222 BUG_ON(ret);
2223
2224 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2225 struct btrfs_dir_log_item);
2226 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2227 btrfs_mark_buffer_dirty(path->nodes[0]);
2228 btrfs_release_path(log, path);
2229 return 0;
2230}
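/*
 * for example, a BTRFS_DIR_LOG_INDEX_KEY item with key offset 5 and
 * dir_log_end 40 declares the log authoritative for dir index keys in
 * [5, 40]: any index in that range found in the subvolume but missing
 * from the log tree must have been unlinked during the transaction, so
 * recovery can safely delete it.
 */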
2231
2232/*
2233 * log all the items included in the current transaction for a given
2234 * directory. This also creates the range items in the log tree required
2235 * to replay anything deleted before the fsync
2236 */
2237static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2238 struct btrfs_root *root, struct inode *inode,
2239 struct btrfs_path *path,
2240 struct btrfs_path *dst_path, int key_type,
2241 u64 min_offset, u64 *last_offset_ret)
2242{
2243 struct btrfs_key min_key;
2244 struct btrfs_key max_key;
2245 struct btrfs_root *log = root->log_root;
2246 struct extent_buffer *src;
2247 int ret;
2248 int i;
2249 int nritems;
2250 u64 first_offset = min_offset;
2251 u64 last_offset = (u64)-1;
2252
2253 log = root->log_root;
2254 max_key.objectid = inode->i_ino;
2255 max_key.offset = (u64)-1;
2256 max_key.type = key_type;
2257
2258 min_key.objectid = inode->i_ino;
2259 min_key.type = key_type;
2260 min_key.offset = min_offset;
2261
2262 path->keep_locks = 1;
2263
2264 ret = btrfs_search_forward(root, &min_key, &max_key,
2265 path, 0, trans->transid);
2266
2267 /*
2268 * we didn't find anything from this transaction, see if there
2269 * is anything at all
2270 */
2271 if (ret != 0 || min_key.objectid != inode->i_ino ||
2272 min_key.type != key_type) {
2273 min_key.objectid = inode->i_ino;
2274 min_key.type = key_type;
2275 min_key.offset = (u64)-1;
2276 btrfs_release_path(root, path);
2277 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2278 if (ret < 0) {
2279 btrfs_release_path(root, path);
2280 return ret;
2281 }
2282 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2283
2284 /* if ret == 0 there are items for this type,
2285 * create a range to tell us the last key of this type.
2286 * otherwise, there are no items in this directory after
2287 * *min_offset, and we create a range to indicate that.
2288 */
2289 if (ret == 0) {
2290 struct btrfs_key tmp;
2291 btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2292 path->slots[0]);
2293 if (key_type == tmp.type) {
2294 first_offset = max(min_offset, tmp.offset) + 1;
2295 }
2296 }
2297 goto done;
2298 }
2299
2300 /* go backward to find any previous key */
2301 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2302 if (ret == 0) {
2303 struct btrfs_key tmp;
2304 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2305 if (key_type == tmp.type) {
2306 first_offset = tmp.offset;
2307 ret = overwrite_item(trans, log, dst_path,
2308 path->nodes[0], path->slots[0],
2309 &tmp);
2310 }
2311 }
2312 btrfs_release_path(root, path);
2313
2314 /* find the first key from this transaction again */
2315 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2316 if (ret != 0) {
2317 WARN_ON(1);
2318 goto done;
2319 }
2320
2321 /*
2322 * we have a block from this transaction, log every item in it
2323 * from our directory
2324 */
2325 while(1) {
2326 struct btrfs_key tmp;
2327 src = path->nodes[0];
2328 nritems = btrfs_header_nritems(src);
2329 for (i = path->slots[0]; i < nritems; i++) {
2330 btrfs_item_key_to_cpu(src, &min_key, i);
2331
2332 if (min_key.objectid != inode->i_ino ||
2333 min_key.type != key_type)
2334 goto done;
2335 ret = overwrite_item(trans, log, dst_path, src, i,
2336 &min_key);
2337 BUG_ON(ret);
2338 }
2339 path->slots[0] = nritems;
2340
2341 /*
2342 * look ahead to the next item and see if it is also
2343 * from this directory and from this transaction
2344 */
2345 ret = btrfs_next_leaf(root, path);
2346 if (ret == 1) {
2347 last_offset = (u64)-1;
2348 goto done;
2349 }
2350 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2351 if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
2352 last_offset = (u64)-1;
2353 goto done;
2354 }
2355 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2356 ret = overwrite_item(trans, log, dst_path,
2357 path->nodes[0], path->slots[0],
2358 &tmp);
2359
2360 BUG_ON(ret);
2361 last_offset = tmp.offset;
2362 goto done;
2363 }
2364 }
2365done:
2366 *last_offset_ret = last_offset;
2367 btrfs_release_path(root, path);
2368 btrfs_release_path(log, dst_path);
2369
2370 /* insert the log range keys to indicate where the log is valid */
2371 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
2372 first_offset, last_offset);
2373 BUG_ON(ret);
2374 return 0;
2375}
2376
2377/*
2378 * logging directories is very similar to logging inodes. We find all the items
2379 * from the current transaction and write them to the log.
2380 *
2381 * The recovery code scans the directory in the subvolume, and if it finds a
2382 * key in the range logged that is not present in the log tree, then it means
2383 * that dir entry was unlinked during the transaction.
2384 *
2385 * In order for that scan to work, we must include one key smaller than
2386 * the smallest logged by this transaction and one key larger than the largest
2387 * key logged by this transaction.
2388 */
2389static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2390 struct btrfs_root *root, struct inode *inode,
2391 struct btrfs_path *path,
2392 struct btrfs_path *dst_path)
2393{
2394 u64 min_key;
2395 u64 max_key;
2396 int ret;
2397 int key_type = BTRFS_DIR_ITEM_KEY;
2398
2399again:
2400 min_key = 0;
2401 max_key = 0;
2402 while(1) {
2403 ret = log_dir_items(trans, root, inode, path,
2404 dst_path, key_type, min_key,
2405 &max_key);
2406 BUG_ON(ret);
2407 if (max_key == (u64)-1)
2408 break;
2409 min_key = max_key + 1;
2410 }
2411
2412 if (key_type == BTRFS_DIR_ITEM_KEY) {
2413 key_type = BTRFS_DIR_INDEX_KEY;
2414 goto again;
2415 }
2416 return 0;
2417}
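/*
 * the loop above tiles the whole key space with such ranges. A
 * directory logged in two chunks might end up with, say:
 *
 *   dir log item [0, 17]         offsets 0 through 17 are covered
 *   dir log item [18, (u64)-1]   everything from 18 up is covered
 *
 * and the same tiling is repeated for the index keys once key_type
 * flips to BTRFS_DIR_INDEX_KEY.
 */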
2418
2419/*
2420 * a helper function to drop items from the log before we relog an
2421 * inode. max_key_type indicates the highest item type to remove.
2422 * This cannot be run for file data extents because it does not
2423 * free the extents they point to.
2424 */
2425static int drop_objectid_items(struct btrfs_trans_handle *trans,
2426 struct btrfs_root *log,
2427 struct btrfs_path *path,
2428 u64 objectid, int max_key_type)
2429{
2430 int ret;
2431 struct btrfs_key key;
2432 struct btrfs_key found_key;
2433
2434 key.objectid = objectid;
2435 key.type = max_key_type;
2436 key.offset = (u64)-1;
2437
2438 while(1) {
2439 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2440
2441 if (ret != 1)
2442 break;
2443
2444 if (path->slots[0] == 0)
2445 break;
2446
2447 path->slots[0]--;
2448 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2449 path->slots[0]);
2450
2451 if (found_key.objectid != objectid)
2452 break;
2453
2454 ret = btrfs_del_item(trans, log, path);
2455 BUG_ON(ret);
2456 btrfs_release_path(log, path);
2457 }
2458 btrfs_release_path(log, path);
2459 return 0;
2460}
2461
2462static noinline int copy_items(struct btrfs_trans_handle *trans,
2463 struct btrfs_root *log,
2464 struct btrfs_path *dst_path,
2465 struct extent_buffer *src,
2466 int start_slot, int nr, int inode_only)
2467{
2468 unsigned long src_offset;
2469 unsigned long dst_offset;
2470 struct btrfs_file_extent_item *extent;
2471 struct btrfs_inode_item *inode_item;
2472 int ret;
2473 struct btrfs_key *ins_keys;
2474 u32 *ins_sizes;
2475 char *ins_data;
2476 int i;
2477
2478 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2479 nr * sizeof(u32), GFP_NOFS);
2480 ins_sizes = (u32 *)ins_data;
2481 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2482
2483 for (i = 0; i < nr; i++) {
2484 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
2485 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
2486 }
2487 ret = btrfs_insert_empty_items(trans, log, dst_path,
2488 ins_keys, ins_sizes, nr);
2489 BUG_ON(ret);
2490
2491 for (i = 0; i < nr; i++) {
2492 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2493 dst_path->slots[0]);
2494
2495 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2496
2497 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2498 src_offset, ins_sizes[i]);
2499
2500 if (inode_only == LOG_INODE_EXISTS &&
2501 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2502 inode_item = btrfs_item_ptr(dst_path->nodes[0],
2503 dst_path->slots[0],
2504 struct btrfs_inode_item);
2505 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
2506
2507 /* set the generation to zero so the recovery code
2508 * can tell the difference between a log entry made
2509 * just to say 'this inode exists' and one made
2510 * to say 'update this inode with these values'
2511 */
2512 btrfs_set_inode_generation(dst_path->nodes[0],
2513 inode_item, 0);
2514 }
2515 /* take a reference on file data extents so that truncates
2516 * or deletes of this inode don't have to relog the inode
2518 */
2519 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
2520 int found_type;
2521 extent = btrfs_item_ptr(src, start_slot + i,
2522 struct btrfs_file_extent_item);
2523
2524 found_type = btrfs_file_extent_type(src, extent);
2525 if (found_type == BTRFS_FILE_EXTENT_REG) {
2526 u64 ds = btrfs_file_extent_disk_bytenr(src,
2527 extent);
2528 u64 dl = btrfs_file_extent_disk_num_bytes(src,
2529 extent);
2530 /* ds == 0 is a hole */
2531 if (ds != 0) {
2532 ret = btrfs_inc_extent_ref(trans, log,
2533 ds, dl,
2534 dst_path->nodes[0]->start,
2535 BTRFS_TREE_LOG_OBJECTID,
2536 trans->transid,
2537 ins_keys[i].objectid,
2538 ins_keys[i].offset);
2539 BUG_ON(ret);
2540 }
2541 }
2542 }
2543 dst_path->slots[0]++;
2544 }
2545
2546 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2547 btrfs_release_path(log, dst_path);
2548 kfree(ins_data);
2549 return 0;
2550}
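/*
 * the extra references taken above make the log tree a part owner of
 * the file data extents it points to, just as the subvolume tree is.
 * If the file is truncated after the fsync, the extent stays pinned by
 * the log's reference until the log is freed, so the logged contents
 * remain replayable after a crash.
 */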
2551
2552/* log a single inode in the tree log.
2553 * At least one parent directory for this inode must exist in the tree
2554 * or be logged already.
2555 *
2556 * Any items from this inode changed by the current transaction are copied
2557 * to the log tree. An extra reference is taken on any extents in this
2558 * file, allowing us to avoid a whole pile of corner cases around logging
2559 * blocks that have been removed from the tree.
2560 *
2561 * See LOG_INODE_ALL and related defines for a description of what inode_only
2562 * does.
2563 *
2564 * This handles both files and directories.
2565 */
2566static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2567 struct btrfs_root *root, struct inode *inode,
2568 int inode_only)
2569{
2570 struct btrfs_path *path;
2571 struct btrfs_path *dst_path;
2572 struct btrfs_key min_key;
2573 struct btrfs_key max_key;
2574 struct btrfs_root *log = root->log_root;
2575 struct extent_buffer *src = NULL;
2576 u32 size;
2577 int ret;
2578 int nritems;
2579 int ins_start_slot = 0;
2580 int ins_nr;
2581
2583
2584 path = btrfs_alloc_path();
2585 dst_path = btrfs_alloc_path();
2586
2587 min_key.objectid = inode->i_ino;
2588 min_key.type = BTRFS_INODE_ITEM_KEY;
2589 min_key.offset = 0;
2590
2591 max_key.objectid = inode->i_ino;
2592 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2593 max_key.type = BTRFS_XATTR_ITEM_KEY;
2594 else
2595 max_key.type = (u8)-1;
2596 max_key.offset = (u64)-1;
2597
2598 /*
2599 * if this inode has already been logged and we're in inode_only
2600 * mode, we don't want to delete the things that have already
2601 * been written to the log.
2602 *
2603 * But, if the inode has been through an inode_only log,
2604 * the logged_trans field is not set. This allows us to catch
2605 * any new names for this inode in the backrefs by logging it
2606 * again
2607 */
2608 if (inode_only == LOG_INODE_EXISTS &&
2609 BTRFS_I(inode)->logged_trans == trans->transid) {
2610 btrfs_free_path(path);
2611 btrfs_free_path(dst_path);
2612 goto out;
2613 }
2614 mutex_lock(&BTRFS_I(inode)->log_mutex);
2615
2616 /*
2617 * a brute force approach to making sure we get the most up-to-date
2618 * copies of everything.
2619 */
2620 if (S_ISDIR(inode->i_mode)) {
2621 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
2622
2623 if (inode_only == LOG_INODE_EXISTS)
2624 max_key_type = BTRFS_XATTR_ITEM_KEY;
2625 ret = drop_objectid_items(trans, log, path,
2626 inode->i_ino, max_key_type);
2627 } else {
2628 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2629 }
2630 BUG_ON(ret);
2631 path->keep_locks = 1;
2632
2633 while(1) {
2634 ins_nr = 0;
2635 ret = btrfs_search_forward(root, &min_key, &max_key,
2636 path, 0, trans->transid);
2637 if (ret != 0)
2638 break;
2639again:
2640 /* note, ins_nr might be > 0 here, cleanup outside the loop */
2641 if (min_key.objectid != inode->i_ino)
2642 break;
2643 if (min_key.type > max_key.type)
2644 break;
2645
2646 src = path->nodes[0];
2647 size = btrfs_item_size_nr(src, path->slots[0]);
2648 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2649 ins_nr++;
2650 goto next_slot;
2651 } else if (!ins_nr) {
2652 ins_start_slot = path->slots[0];
2653 ins_nr = 1;
2654 goto next_slot;
2655 }
2656
2657 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2658 ins_nr, inode_only);
2659 BUG_ON(ret);
2660 ins_nr = 1;
2661 ins_start_slot = path->slots[0];
2662next_slot:
2663
2664 nritems = btrfs_header_nritems(path->nodes[0]);
2665 path->slots[0]++;
2666 if (path->slots[0] < nritems) {
2667 btrfs_item_key_to_cpu(path->nodes[0], &min_key,
2668 path->slots[0]);
2669 goto again;
2670 }
2671 if (ins_nr) {
2672 ret = copy_items(trans, log, dst_path, src,
2673 ins_start_slot,
2674 ins_nr, inode_only);
2675 BUG_ON(ret);
2676 ins_nr = 0;
2677 }
2678 btrfs_release_path(root, path);
2679
2680 if (min_key.offset < (u64)-1)
2681 min_key.offset++;
2682 else if (min_key.type < (u8)-1)
2683 min_key.type++;
2684 else if (min_key.objectid < (u64)-1)
2685 min_key.objectid++;
2686 else
2687 break;
2688 }
2689 if (ins_nr) {
2690 ret = copy_items(trans, log, dst_path, src,
2691 ins_start_slot,
2692 ins_nr, inode_only);
2693 BUG_ON(ret);
2694 ins_nr = 0;
2695 }
2696 WARN_ON(ins_nr);
2697 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2698 btrfs_release_path(root, path);
2699 btrfs_release_path(log, dst_path);
2700 BTRFS_I(inode)->log_dirty_trans = 0;
2701 ret = log_directory_changes(trans, root, inode, path, dst_path);
2702 BUG_ON(ret);
2703 }
2704 BTRFS_I(inode)->logged_trans = trans->transid;
2705 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2706
2707 btrfs_free_path(path);
2708 btrfs_free_path(dst_path);
2709
2710 mutex_lock(&root->fs_info->tree_log_mutex);
2711 ret = update_log_root(trans, log);
2712 BUG_ON(ret);
2713 mutex_unlock(&root->fs_info->tree_log_mutex);
2714out:
2715 return 0;
2716}
2717
2718int btrfs_log_inode(struct btrfs_trans_handle *trans,
2719 struct btrfs_root *root, struct inode *inode,
2720 int inode_only)
2721{
2722 int ret;
2723
2724 start_log_trans(trans, root);
2725 ret = __btrfs_log_inode(trans, root, inode, inode_only);
2726 end_log_trans(root);
2727 return ret;
2728}
2729
2730/*
2731 * helper function around btrfs_log_inode to make sure newly created
2732 * parent directories also end up in the log. A minimal, inode-and-backref
2733 * only log entry is written for any parent directory that is newer than
2734 * the last committed transaction
2735 */
2736int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2737 struct btrfs_root *root, struct dentry *dentry)
2738{
2739 int inode_only = LOG_INODE_ALL;
2740 struct super_block *sb;
2741 int ret;
2742
2743 start_log_trans(trans, root);
2744 sb = dentry->d_inode->i_sb;
2745 while(1) {
2746 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2747 inode_only);
2748 BUG_ON(ret);
2749 inode_only = LOG_INODE_EXISTS;
2750
2751 dentry = dentry->d_parent;
2752 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
2753 break;
2754
2755 if (BTRFS_I(dentry->d_inode)->generation <=
2756 root->fs_info->last_trans_committed)
2757 break;
2758 }
2759 end_log_trans(root);
2760 return 0;
2761}
2762
2763/*
2764 * it is not safe to log a dentry if the chunk root has added new
2765 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
2766 * If this returns 1, you must commit the transaction to safely get your
2767 * data on disk.
2768 */
2769int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2770 struct btrfs_root *root, struct dentry *dentry)
2771{
2772 u64 gen;
2773 gen = root->fs_info->last_trans_new_blockgroup;
2774 if (gen > root->fs_info->last_trans_committed)
2775 return 1;
2776 else
2777 return btrfs_log_dentry(trans, root, dentry);
2778}
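/*
 * a sketch of the expected fsync-time calling convention (the real
 * caller sits in file.c and may differ in detail):
 *
 *	ret = btrfs_log_dentry_safe(trans, root, dentry);
 *	if (ret == 0)
 *		btrfs_sync_log(trans, root);
 *	else
 *		btrfs_commit_transaction(trans, root);
 */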
2779
2780/*
2781 * should be called during mount to recover and replay any log trees
2782 * from the FS
2783 */
2784int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
2785{
2786 int ret;
2787 struct btrfs_path *path;
2788 struct btrfs_trans_handle *trans;
2789 struct btrfs_key key;
2790 struct btrfs_key found_key;
2791 struct btrfs_key tmp_key;
2792 struct btrfs_root *log;
2793 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
2794 u64 highest_inode;
2795 struct walk_control wc = {
2796 .process_func = process_one_buffer,
2797 .stage = 0,
2798 };
2799
2800 fs_info->log_root_recovering = 1;
2801 path = btrfs_alloc_path();
2802 BUG_ON(!path);
2803
2804 trans = btrfs_start_transaction(fs_info->tree_root, 1);
2805
2806 wc.trans = trans;
2807 wc.pin = 1;
2808
2809 walk_log_tree(trans, log_root_tree, &wc);
2810
2811again:
2812 key.objectid = BTRFS_TREE_LOG_OBJECTID;
2813 key.offset = (u64)-1;
2814 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2815
2816 while(1) {
2817 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
2818 if (ret < 0)
2819 break;
2820 if (ret > 0) {
2821 if (path->slots[0] == 0)
2822 break;
2823 path->slots[0]--;
2824 }
2825 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2826 path->slots[0]);
2827 btrfs_release_path(log_root_tree, path);
2828 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
2829 break;
2830
2831 log = btrfs_read_fs_root_no_radix(log_root_tree,
2832 &found_key);
2833 BUG_ON(!log);
2834
2836 tmp_key.objectid = found_key.offset;
2837 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
2838 tmp_key.offset = (u64)-1;
2839
2840 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
2841
2842 BUG_ON(!wc.replay_dest);
2843
2844 btrfs_record_root_in_trans(wc.replay_dest);
2845 ret = walk_log_tree(trans, log, &wc);
2846 BUG_ON(ret);
2847
2848 if (wc.stage == LOG_WALK_REPLAY_ALL) {
2849 ret = fixup_inode_link_counts(trans, wc.replay_dest,
2850 path);
2851 BUG_ON(ret);
2852 }
2853 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
2854 if (ret == 0) {
2855 wc.replay_dest->highest_inode = highest_inode;
2856 wc.replay_dest->last_inode_alloc = highest_inode;
2857 }
2858
2859 key.offset = found_key.offset - 1;
2860 free_extent_buffer(log->node);
2861 kfree(log);
2862
2863 if (found_key.offset == 0)
2864 break;
2865 }
2866 btrfs_release_path(log_root_tree, path);
2867
2868 /* step one is to pin it all, step two is to replay just inodes */
2869 if (wc.pin) {
2870 wc.pin = 0;
2871 wc.process_func = replay_one_buffer;
2872 wc.stage = LOG_WALK_REPLAY_INODES;
2873 goto again;
2874 }
2875 /* step three is to replay everything */
2876 if (wc.stage < LOG_WALK_REPLAY_ALL) {
2877 wc.stage++;
2878 goto again;
2879 }
2880
2881 btrfs_free_path(path);
2882
2883 free_extent_buffer(log_root_tree->node);
2884 log_root_tree->log_root = NULL;
2885 fs_info->log_root_recovering = 0;
2886
2887 /* step 4: commit the transaction, which also unpins the blocks */
2888 btrfs_commit_transaction(trans, fs_info->tree_root);
2889
2890 kfree(log_root_tree);
2891 return 0;
2892}
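/*
 * recovery runs the same tree walk three times, driven by wc:
 *
 *   pass 1: wc.pin = 1             pin every extent the logs reference
 *   pass 2: LOG_WALK_REPLAY_INODES replay inode items first, so sizes
 *                                  and modes exist before the rest
 *   pass 3: LOG_WALK_REPLAY_ALL    replay refs, xattrs, csums, extents
 *                                  and directory entries
 *
 * and the final transaction commit unpins the blocks again.
 */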
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 000000000000..b9409b32ed02
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __TREE_LOG_
20#define __TREE_LOG_
21
22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root,
35 const char *name, int name_len,
36 struct inode *dir, u64 index);
37int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root,
39 const char *name, int name_len,
40 struct inode *inode, u64 dirid);
41#endif
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
new file mode 100644
index 000000000000..0f57f24404d9
--- /dev/null
+++ b/fs/btrfs/version.sh
@@ -0,0 +1,43 @@
1#!/bin/bash
2#
3# determine-version -- report a useful version for releases
4#
5# Copyright 2008, Aron Griffis <agriffis@n01se.net>
6# Copyright 2008, Oracle
7# Released under the GNU GPLv2
8
9v="v0.16"
10
11# only use hg when this is an hg checkout and hg is actually installed
12if [ -d .hg ] && which hg > /dev/null; then
13 last=$(hg tags | grep -m1 -o '^v[0-9.]\+')
14
15 # now check if the repo has commits since then...
16 if [[ $(hg id -t) == $last || \
17 $(hg di -r "$last:." | awk '/^diff/{print $NF}' | sort -u) == .hgtags ]]
18 then
19 # check if it's dirty
20 if [[ $(hg id | cut -d' ' -f1) == *+ ]]; then
21 v=$last+
22 else
23 v=$last
24 fi
25 else
26 # includes dirty flag
27 v=$last+$(hg id -i)
28 fi
29fi
30
31echo "#ifndef __BUILD_VERSION" > .build-version.h
32echo "#define __BUILD_VERSION" >> .build-version.h
33echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h
34echo "#endif" >> .build-version.h
35
36diff -q version.h .build-version.h >& /dev/null
37
38if [ $? == 0 ]; then
39 rm .build-version.h
40 exit 0
41fi
42
43mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 000000000000..b9e5c2d82dde
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,2565 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/buffer_head.h>
21#include <linux/blkdev.h>
22#include <linux/random.h>
23#include <asm/div64.h>
24#include "ctree.h"
25#include "extent_map.h"
26#include "disk-io.h"
27#include "transaction.h"
28#include "print-tree.h"
29#include "volumes.h"
30#include "async-thread.h"
31
32struct map_lookup {
33 u64 type;
34 int io_align;
35 int io_width;
36 int stripe_len;
37 int sector_size;
38 int num_stripes;
39 int sub_stripes;
40 struct btrfs_bio_stripe stripes[];
41};
42
43#define map_lookup_size(n) (sizeof(struct map_lookup) + \
44 (sizeof(struct btrfs_bio_stripe) * (n)))
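/*
 * map_lookup ends in a flexible array member, so the allocation size
 * depends on the stripe count. A chunk striped across four devices
 * would be allocated roughly as:
 *
 *	map = kmalloc(map_lookup_size(4), GFP_NOFS);
 *
 * giving the header plus room for four btrfs_bio_stripe entries.
 */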
45
46static DEFINE_MUTEX(uuid_mutex);
47static LIST_HEAD(fs_uuids);
48
49void btrfs_lock_volumes(void)
50{
51 mutex_lock(&uuid_mutex);
52}
53
54void btrfs_unlock_volumes(void)
55{
56 mutex_unlock(&uuid_mutex);
57}
58
59static void lock_chunks(struct btrfs_root *root)
60{
61 mutex_lock(&root->fs_info->alloc_mutex);
62 mutex_lock(&root->fs_info->chunk_mutex);
63}
64
65static void unlock_chunks(struct btrfs_root *root)
66{
67 mutex_unlock(&root->fs_info->chunk_mutex);
68 mutex_unlock(&root->fs_info->alloc_mutex);
69}
70
71int btrfs_cleanup_fs_uuids(void)
72{
73 struct btrfs_fs_devices *fs_devices;
74 struct list_head *uuid_cur;
75 struct list_head *devices_cur;
76 struct btrfs_device *dev;
77
78 list_for_each(uuid_cur, &fs_uuids) {
79 fs_devices = list_entry(uuid_cur, struct btrfs_fs_devices,
80 list);
81 while(!list_empty(&fs_devices->devices)) {
82 devices_cur = fs_devices->devices.next;
83 dev = list_entry(devices_cur, struct btrfs_device,
84 dev_list);
85 if (dev->bdev) {
86 close_bdev_excl(dev->bdev);
87 fs_devices->open_devices--;
88 }
89 list_del(&dev->dev_list);
90 kfree(dev->name);
91 kfree(dev);
92 }
93 }
94 return 0;
95}
96
97static noinline struct btrfs_device *__find_device(struct list_head *head,
98 u64 devid, u8 *uuid)
99{
100 struct btrfs_device *dev;
101 struct list_head *cur;
102
103 list_for_each(cur, head) {
104 dev = list_entry(cur, struct btrfs_device, dev_list);
105 if (dev->devid == devid &&
106 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
107 return dev;
108 }
109 }
110 return NULL;
111}
112
113static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
114{
115 struct list_head *cur;
116 struct btrfs_fs_devices *fs_devices;
117
118 list_for_each(cur, &fs_uuids) {
119 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
120 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
121 return fs_devices;
122 }
123 return NULL;
124}
125
126/*
127 * we try to collect pending bios for a device so we don't get a large
128 * number of procs sending bios down to the same device. This greatly
129 * improves the scheduler's ability to collect and merge the bios.
130 *
131 * But, it also turns into a long list of bios to process and that is sure
132 * to eventually make the worker thread block. The solution here is to
133 * make some progress and then put this work struct back at the end of
134 * the list if the block device is congested. This way, multiple devices
135 * can make progress from a single worker thread.
136 */
137static int noinline run_scheduled_bios(struct btrfs_device *device)
138{
139 struct bio *pending;
140 struct backing_dev_info *bdi;
141 struct btrfs_fs_info *fs_info;
142 struct bio *tail;
143 struct bio *cur;
144 int again = 0;
145 unsigned long num_run = 0;
146 unsigned long limit;
147
148 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
149 fs_info = device->dev_root->fs_info;
150 limit = btrfs_async_submit_limit(fs_info);
151 limit = limit * 2 / 3;
152
153loop:
154 spin_lock(&device->io_lock);
155
156 /* take all the bios off the list at once and process them
157 * later on (without the lock held). But, remember the
158 * tail and other pointers so the bios can be properly reinserted
159 * into the list if we hit congestion
160 */
161 pending = device->pending_bios;
162 tail = device->pending_bio_tail;
163 WARN_ON(pending && !tail);
164 device->pending_bios = NULL;
165 device->pending_bio_tail = NULL;
166
167 /*
168 * if pending was null this time around, no bios need processing
169 * at all and we can stop. Otherwise it'll loop back up again
170 * and do an additional check so no bios are missed.
171 *
172 * device->running_pending is used to synchronize with the
173 * schedule_bio code.
174 */
175 if (pending) {
176 again = 1;
177 device->running_pending = 1;
178 } else {
179 again = 0;
180 device->running_pending = 0;
181 }
182 spin_unlock(&device->io_lock);
183
184 while(pending) {
185 cur = pending;
186 pending = pending->bi_next;
187 cur->bi_next = NULL;
188 atomic_dec(&fs_info->nr_async_bios);
189
190 if (atomic_read(&fs_info->nr_async_bios) < limit &&
191 waitqueue_active(&fs_info->async_submit_wait))
192 wake_up(&fs_info->async_submit_wait);
193
194 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
195 bio_get(cur);
196 submit_bio(cur->bi_rw, cur);
197 bio_put(cur);
198 num_run++;
199
200 /*
201 * we made progress, there is more work to do and the bdi
202 * is now congested. Back off and let other work structs
203 * run instead
204 */
205 if (pending && bdi_write_congested(bdi)) {
206 struct bio *old_head;
207
208 spin_lock(&device->io_lock);
209
210 old_head = device->pending_bios;
211 device->pending_bios = pending;
212 if (device->pending_bio_tail)
213 tail->bi_next = old_head;
214 else
215 device->pending_bio_tail = tail;
216
217 spin_unlock(&device->io_lock);
218 btrfs_requeue_work(&device->work);
219 goto done;
220 }
221 }
222 if (again)
223 goto loop;
224done:
225 return 0;
226}
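/*
 * note how the congestion path above re-splices the unprocessed tail:
 * the bios pulled off but never submitted go back on the front of
 * device->pending_bios, with whatever arrived meanwhile (old_head)
 * chained after our tail. Submission order is preserved while
 * btrfs_requeue_work gives other devices a turn.
 */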
227
228void pending_bios_fn(struct btrfs_work *work)
229{
230 struct btrfs_device *device;
231
232 device = container_of(work, struct btrfs_device, work);
233 run_scheduled_bios(device);
234}
235
236static noinline int device_list_add(const char *path,
237 struct btrfs_super_block *disk_super,
238 u64 devid, struct btrfs_fs_devices **fs_devices_ret)
239{
240 struct btrfs_device *device;
241 struct btrfs_fs_devices *fs_devices;
242 u64 found_transid = btrfs_super_generation(disk_super);
243
244 fs_devices = find_fsid(disk_super->fsid);
245 if (!fs_devices) {
246 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
247 if (!fs_devices)
248 return -ENOMEM;
249 INIT_LIST_HEAD(&fs_devices->devices);
250 INIT_LIST_HEAD(&fs_devices->alloc_list);
251 list_add(&fs_devices->list, &fs_uuids);
252 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
253 fs_devices->latest_devid = devid;
254 fs_devices->latest_trans = found_transid;
255 device = NULL;
256 } else {
257 device = __find_device(&fs_devices->devices, devid,
258 disk_super->dev_item.uuid);
259 }
260 if (!device) {
261 device = kzalloc(sizeof(*device), GFP_NOFS);
262 if (!device) {
263 /* we can safely leave the fs_devices entry around */
264 return -ENOMEM;
265 }
266 device->devid = devid;
267 device->work.func = pending_bios_fn;
268 memcpy(device->uuid, disk_super->dev_item.uuid,
269 BTRFS_UUID_SIZE);
270 device->barriers = 1;
271 spin_lock_init(&device->io_lock);
272 device->name = kstrdup(path, GFP_NOFS);
273 if (!device->name) {
274 kfree(device);
275 return -ENOMEM;
276 }
277 list_add(&device->dev_list, &fs_devices->devices);
278 list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
279 fs_devices->num_devices++;
280 }
281
282 if (found_transid > fs_devices->latest_trans) {
283 fs_devices->latest_devid = devid;
284 fs_devices->latest_trans = found_transid;
285 }
286 *fs_devices_ret = fs_devices;
287 return 0;
288}
289
290int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
291{
292 struct list_head *head = &fs_devices->devices;
293 struct list_head *cur;
294 struct btrfs_device *device;
295
296 mutex_lock(&uuid_mutex);
297again:
298 list_for_each(cur, head) {
299 device = list_entry(cur, struct btrfs_device, dev_list);
300 if (!device->in_fs_metadata) {
301 struct block_device *bdev;
302 list_del(&device->dev_list);
303 list_del(&device->dev_alloc_list);
304 fs_devices->num_devices--;
305 if (device->bdev) {
306 bdev = device->bdev;
307 fs_devices->open_devices--;
308 mutex_unlock(&uuid_mutex);
309 close_bdev_excl(bdev);
310 mutex_lock(&uuid_mutex);
311 }
312 kfree(device->name);
313 kfree(device);
314 goto again;
315 }
316 }
317 mutex_unlock(&uuid_mutex);
318 return 0;
319}
320
321int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
322{
323 struct list_head *head = &fs_devices->devices;
324 struct list_head *cur;
325 struct btrfs_device *device;
326
327 mutex_lock(&uuid_mutex);
328 list_for_each(cur, head) {
329 device = list_entry(cur, struct btrfs_device, dev_list);
330 if (device->bdev) {
331 close_bdev_excl(device->bdev);
332 fs_devices->open_devices--;
333 }
334 device->bdev = NULL;
335 device->in_fs_metadata = 0;
336 }
337 fs_devices->mounted = 0;
338 mutex_unlock(&uuid_mutex);
339 return 0;
340}
341
342int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
343 int flags, void *holder)
344{
345 struct block_device *bdev;
346 struct list_head *head = &fs_devices->devices;
347 struct list_head *cur;
348 struct btrfs_device *device;
349 struct block_device *latest_bdev = NULL;
350 struct buffer_head *bh;
351 struct btrfs_super_block *disk_super;
352 u64 latest_devid = 0;
353 u64 latest_transid = 0;
354 u64 transid;
355 u64 devid;
356 int ret = 0;
357
358 mutex_lock(&uuid_mutex);
359 if (fs_devices->mounted)
360 goto out;
361
362 list_for_each(cur, head) {
363 device = list_entry(cur, struct btrfs_device, dev_list);
364 if (device->bdev)
365 continue;
366
367 if (!device->name)
368 continue;
369
370 bdev = open_bdev_excl(device->name, flags, holder);
371
372 if (IS_ERR(bdev)) {
373 printk("open %s failed\n", device->name);
374 goto error;
375 }
376 set_blocksize(bdev, 4096);
377
378 bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
379 if (!bh)
380 goto error_close;
381
382 disk_super = (struct btrfs_super_block *)bh->b_data;
383 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
384 sizeof(disk_super->magic)))
385 goto error_brelse;
386
387 devid = le64_to_cpu(disk_super->dev_item.devid);
388 if (devid != device->devid)
389 goto error_brelse;
390
391 transid = btrfs_super_generation(disk_super);
392 if (!latest_transid || transid > latest_transid) {
393 latest_devid = devid;
394 latest_transid = transid;
395 latest_bdev = bdev;
396 }
397
398 device->bdev = bdev;
399 device->in_fs_metadata = 0;
400 fs_devices->open_devices++;
401 continue;
402
403error_brelse:
404 brelse(bh);
405error_close:
406 close_bdev_excl(bdev);
407error:
408 continue;
409 }
410 if (fs_devices->open_devices == 0) {
411 ret = -EIO;
412 goto out;
413 }
414 fs_devices->mounted = 1;
415 fs_devices->latest_bdev = latest_bdev;
416 fs_devices->latest_devid = latest_devid;
417 fs_devices->latest_trans = latest_transid;
418out:
419 mutex_unlock(&uuid_mutex);
420 return ret;
421}
422
423int btrfs_scan_one_device(const char *path, int flags, void *holder,
424 struct btrfs_fs_devices **fs_devices_ret)
425{
426 struct btrfs_super_block *disk_super;
427 struct block_device *bdev;
428 struct buffer_head *bh;
429 int ret;
430 u64 devid;
431 u64 transid;
432
433 mutex_lock(&uuid_mutex);
434
435 bdev = open_bdev_excl(path, flags, holder);
436
437 if (IS_ERR(bdev)) {
438 ret = PTR_ERR(bdev);
439 goto error;
440 }
441
442 ret = set_blocksize(bdev, 4096);
443 if (ret)
444 goto error_close;
445 bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
446 if (!bh) {
447 ret = -EIO;
448 goto error_close;
449 }
450 disk_super = (struct btrfs_super_block *)bh->b_data;
451 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
452 sizeof(disk_super->magic))) {
453 ret = -EINVAL;
454 goto error_brelse;
455 }
456 devid = le64_to_cpu(disk_super->dev_item.devid);
457 transid = btrfs_super_generation(disk_super);
458 if (disk_super->label[0])
459 printk("device label %s ", disk_super->label);
460 else {
461 /* FIXME, make a real uuid parser */
462 printk("device fsid %llx-%llx ",
463 *(unsigned long long *)disk_super->fsid,
464 *(unsigned long long *)(disk_super->fsid + 8));
465 }
466 printk("devid %Lu transid %Lu %s\n", devid, transid, path);
467 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
468
469error_brelse:
470 brelse(bh);
471error_close:
472 close_bdev_excl(bdev);
473error:
474 mutex_unlock(&uuid_mutex);
475 return ret;
476}
477
478/*
479 * this uses a pretty simple search; the expectation is that it is
480 * called very infrequently and that a given device has a small number
481 * of extents
482 */
483static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
484 struct btrfs_device *device,
485 struct btrfs_path *path,
486 u64 num_bytes, u64 *start)
487{
488 struct btrfs_key key;
489 struct btrfs_root *root = device->dev_root;
490 struct btrfs_dev_extent *dev_extent = NULL;
491 u64 hole_size = 0;
492 u64 last_byte = 0;
493 u64 search_start = 0;
494 u64 search_end = device->total_bytes;
495 int ret;
496 int slot = 0;
497 int start_found;
498 struct extent_buffer *l;
499
500 start_found = 0;
501 path->reada = 2;
502
503 /* FIXME use last free of some kind */
504
505 /* we don't want to overwrite the superblock on the drive,
506 * so we make sure to start at an offset of at least 1MB
507 */
508 search_start = max((u64)1024 * 1024, search_start);
509
510 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
511 search_start = max(root->fs_info->alloc_start, search_start);
512
513 key.objectid = device->devid;
514 key.offset = search_start;
515 key.type = BTRFS_DEV_EXTENT_KEY;
516 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
517 if (ret < 0)
518 goto error;
519 ret = btrfs_previous_item(root, path, 0, key.type);
520 if (ret < 0)
521 goto error;
522 l = path->nodes[0];
523 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
524 while (1) {
525 l = path->nodes[0];
526 slot = path->slots[0];
527 if (slot >= btrfs_header_nritems(l)) {
528 ret = btrfs_next_leaf(root, path);
529 if (ret == 0)
530 continue;
531 if (ret < 0)
532 goto error;
533no_more_items:
534 if (!start_found) {
535 if (search_start >= search_end) {
536 ret = -ENOSPC;
537 goto error;
538 }
539 *start = search_start;
540 start_found = 1;
541 goto check_pending;
542 }
543 *start = last_byte > search_start ?
544 last_byte : search_start;
545 if (search_end <= *start) {
546 ret = -ENOSPC;
547 goto error;
548 }
549 goto check_pending;
550 }
551 btrfs_item_key_to_cpu(l, &key, slot);
552
553 if (key.objectid < device->devid)
554 goto next;
555
556 if (key.objectid > device->devid)
557 goto no_more_items;
558
559 if (key.offset >= search_start && key.offset > last_byte &&
560 start_found) {
561 if (last_byte < search_start)
562 last_byte = search_start;
563 hole_size = key.offset - last_byte;
564 if (key.offset > last_byte &&
565 hole_size >= num_bytes) {
566 *start = last_byte;
567 goto check_pending;
568 }
569 }
570 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) {
571 goto next;
572 }
573
574 start_found = 1;
575 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
576 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
577next:
578 path->slots[0]++;
579 cond_resched();
580 }
581check_pending:
582 /* we have to make sure we didn't find an extent that has already
583 * been allocated by the map tree or the original allocation
584 */
585 btrfs_release_path(root, path);
586 BUG_ON(*start < search_start);
587
588 if (*start + num_bytes > search_end) {
589 ret = -ENOSPC;
590 goto error;
591 }
592 /* check for pending inserts here */
593 return 0;
594
595error:
596 btrfs_release_path(root, path);
597 return ret;
598}
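/*
 * a worked example of the hole search: with existing dev extents at
 * [1MB, 5MB) and [9MB, 12MB) and a 2MB request, the loop records
 * last_byte = 5MB after the first extent, then sees the next key at
 * 9MB, computes hole_size = 4MB >= 2MB, and returns *start = 5MB.
 */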
599
600int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
601 struct btrfs_device *device,
602 u64 start)
603{
604 int ret;
605 struct btrfs_path *path;
606 struct btrfs_root *root = device->dev_root;
607 struct btrfs_key key;
608 struct btrfs_key found_key;
609 struct extent_buffer *leaf = NULL;
610 struct btrfs_dev_extent *extent = NULL;
611
612 path = btrfs_alloc_path();
613 if (!path)
614 return -ENOMEM;
615
616 key.objectid = device->devid;
617 key.offset = start;
618 key.type = BTRFS_DEV_EXTENT_KEY;
619
620 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
621 if (ret > 0) {
622 ret = btrfs_previous_item(root, path, key.objectid,
623 BTRFS_DEV_EXTENT_KEY);
624 BUG_ON(ret);
625 leaf = path->nodes[0];
626 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
627 extent = btrfs_item_ptr(leaf, path->slots[0],
628 struct btrfs_dev_extent);
629 BUG_ON(found_key.offset > start || found_key.offset +
630 btrfs_dev_extent_length(leaf, extent) < start);
631 ret = 0;
632 } else if (ret == 0) {
633 leaf = path->nodes[0];
634 extent = btrfs_item_ptr(leaf, path->slots[0],
635 struct btrfs_dev_extent);
636 }
637 BUG_ON(ret);
638
639 if (device->bytes_used > 0)
640 device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
641 ret = btrfs_del_item(trans, root, path);
642 BUG_ON(ret);
643
644 btrfs_free_path(path);
645 return ret;
646}
647
648int noinline btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
649 struct btrfs_device *device,
650 u64 chunk_tree, u64 chunk_objectid,
651 u64 chunk_offset,
652 u64 num_bytes, u64 *start)
653{
654 int ret;
655 struct btrfs_path *path;
656 struct btrfs_root *root = device->dev_root;
657 struct btrfs_dev_extent *extent;
658 struct extent_buffer *leaf;
659 struct btrfs_key key;
660
661 WARN_ON(!device->in_fs_metadata);
662 path = btrfs_alloc_path();
663 if (!path)
664 return -ENOMEM;
665
666 ret = find_free_dev_extent(trans, device, path, num_bytes, start);
667 if (ret) {
668 goto err;
669 }
670
671 key.objectid = device->devid;
672 key.offset = *start;
673 key.type = BTRFS_DEV_EXTENT_KEY;
674 ret = btrfs_insert_empty_item(trans, root, path, &key,
675 sizeof(*extent));
676 BUG_ON(ret);
677
678 leaf = path->nodes[0];
679 extent = btrfs_item_ptr(leaf, path->slots[0],
680 struct btrfs_dev_extent);
681 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
682 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
683 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
684
685 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
686 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
687 BTRFS_UUID_SIZE);
688
689 btrfs_set_dev_extent_length(leaf, extent, num_bytes);
690 btrfs_mark_buffer_dirty(leaf);
691err:
692 btrfs_free_path(path);
693 return ret;
694}
695
696static noinline int find_next_chunk(struct btrfs_root *root,
697 u64 objectid, u64 *offset)
698{
699 struct btrfs_path *path;
700 int ret;
701 struct btrfs_key key;
702 struct btrfs_chunk *chunk;
703 struct btrfs_key found_key;
704
705 path = btrfs_alloc_path();
706 BUG_ON(!path);
707
708 key.objectid = objectid;
709 key.offset = (u64)-1;
710 key.type = BTRFS_CHUNK_ITEM_KEY;
711
712 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
713 if (ret < 0)
714 goto error;
715
716 BUG_ON(ret == 0);
717
718 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
719 if (ret) {
720 *offset = 0;
721 } else {
722 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
723 path->slots[0]);
724 if (found_key.objectid != objectid)
725 *offset = 0;
726 else {
727 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
728 struct btrfs_chunk);
729 *offset = found_key.offset +
730 btrfs_chunk_length(path->nodes[0], chunk);
731 }
732 }
733 ret = 0;
734error:
735 btrfs_free_path(path);
736 return ret;
737}
738
739static noinline int find_next_devid(struct btrfs_root *root,
740 struct btrfs_path *path, u64 *objectid)
741{
742 int ret;
743 struct btrfs_key key;
744 struct btrfs_key found_key;
745
746 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
747 key.type = BTRFS_DEV_ITEM_KEY;
748 key.offset = (u64)-1;
749
750 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
751 if (ret < 0)
752 goto error;
753
754 BUG_ON(ret == 0);
755
756 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
757 BTRFS_DEV_ITEM_KEY);
758 if (ret) {
759 *objectid = 1;
760 } else {
761 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
762 path->slots[0]);
763 *objectid = found_key.offset + 1;
764 }
765 ret = 0;
766error:
767 btrfs_release_path(root, path);
768 return ret;
769}
770
771/*
772 * the device information is stored in the chunk root
773 * the btrfs_device struct should be fully filled in
774 */
775int btrfs_add_device(struct btrfs_trans_handle *trans,
776 struct btrfs_root *root,
777 struct btrfs_device *device)
778{
779 int ret;
780 struct btrfs_path *path;
781 struct btrfs_dev_item *dev_item;
782 struct extent_buffer *leaf;
783 struct btrfs_key key;
784 unsigned long ptr;
785 u64 free_devid = 0;
786
787 root = root->fs_info->chunk_root;
788
789 path = btrfs_alloc_path();
790 if (!path)
791 return -ENOMEM;
792
793 ret = find_next_devid(root, path, &free_devid);
794 if (ret)
795 goto out;
796
797 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
798 key.type = BTRFS_DEV_ITEM_KEY;
799 key.offset = free_devid;
800
801 ret = btrfs_insert_empty_item(trans, root, path, &key,
802 sizeof(*dev_item));
803 if (ret)
804 goto out;
805
806 leaf = path->nodes[0];
807 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
808
809 device->devid = free_devid;
810 btrfs_set_device_id(leaf, dev_item, device->devid);
811 btrfs_set_device_type(leaf, dev_item, device->type);
812 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
813 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
814 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
815 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
816 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
817 btrfs_set_device_group(leaf, dev_item, 0);
818 btrfs_set_device_seek_speed(leaf, dev_item, 0);
819 btrfs_set_device_bandwidth(leaf, dev_item, 0);
820
821 ptr = (unsigned long)btrfs_device_uuid(dev_item);
822 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
823 btrfs_mark_buffer_dirty(leaf);
824 ret = 0;
825
826out:
827 btrfs_free_path(path);
828 return ret;
829}
830
831static int btrfs_rm_dev_item(struct btrfs_root *root,
832 struct btrfs_device *device)
833{
834 int ret;
835 struct btrfs_path *path;
836 struct block_device *bdev = device->bdev;
837 struct btrfs_device *next_dev;
838 struct btrfs_key key;
839 u64 total_bytes;
840 struct btrfs_fs_devices *fs_devices;
841 struct btrfs_trans_handle *trans;
842
843 root = root->fs_info->chunk_root;
844
845 path = btrfs_alloc_path();
846 if (!path)
847 return -ENOMEM;
848
849 trans = btrfs_start_transaction(root, 1);
850 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
851 key.type = BTRFS_DEV_ITEM_KEY;
852 key.offset = device->devid;
853 lock_chunks(root);
854
855 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
856 if (ret < 0)
857 goto out;
858
859 if (ret > 0) {
860 ret = -ENOENT;
861 goto out;
862 }
863
864 ret = btrfs_del_item(trans, root, path);
865 if (ret)
866 goto out;
867
868 /*
869 * at this point, the device is zero sized. We want to
870 * remove it from the devices list and zero out the old super
871 */
872 list_del_init(&device->dev_list);
873 list_del_init(&device->dev_alloc_list);
874 fs_devices = root->fs_info->fs_devices;
875
876 next_dev = list_entry(fs_devices->devices.next, struct btrfs_device,
877 dev_list);
878 if (bdev == root->fs_info->sb->s_bdev)
879 root->fs_info->sb->s_bdev = next_dev->bdev;
880 if (bdev == fs_devices->latest_bdev)
881 fs_devices->latest_bdev = next_dev->bdev;
882
883 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
884 btrfs_set_super_num_devices(&root->fs_info->super_copy,
885 total_bytes - 1);
886out:
887 btrfs_free_path(path);
888 unlock_chunks(root);
889 btrfs_commit_transaction(trans, root);
890 return ret;
891}
892
893int btrfs_rm_device(struct btrfs_root *root, char *device_path)
894{
895 struct btrfs_device *device;
896 struct block_device *bdev;
897 struct buffer_head *bh = NULL;
898 struct btrfs_super_block *disk_super;
899 u64 all_avail;
900 u64 devid;
901 int ret = 0;
902
903 mutex_lock(&uuid_mutex);
904 mutex_lock(&root->fs_info->volume_mutex);
905
906 all_avail = root->fs_info->avail_data_alloc_bits |
907 root->fs_info->avail_system_alloc_bits |
908 root->fs_info->avail_metadata_alloc_bits;
909
910 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
911 btrfs_super_num_devices(&root->fs_info->super_copy) <= 4) {
912 printk("btrfs: unable to go below four devices on raid10\n");
913 ret = -EINVAL;
914 goto out;
915 }
916
917 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
918 btrfs_super_num_devices(&root->fs_info->super_copy) <= 2) {
919 printk("btrfs: unable to go below two devices on raid1\n");
920 ret = -EINVAL;
921 goto out;
922 }
923
924 if (strcmp(device_path, "missing") == 0) {
925 struct list_head *cur;
926 struct list_head *devices;
927 struct btrfs_device *tmp;
928
929 device = NULL;
930 devices = &root->fs_info->fs_devices->devices;
931 list_for_each(cur, devices) {
932 tmp = list_entry(cur, struct btrfs_device, dev_list);
933 if (tmp->in_fs_metadata && !tmp->bdev) {
934 device = tmp;
935 break;
936 }
937 }
938 bdev = NULL;
939 bh = NULL;
940 disk_super = NULL;
941 if (!device) {
942 printk("btrfs: no missing devices found to remove\n");
943 goto out;
944 }
945
946 } else {
947 bdev = open_bdev_excl(device_path, 0,
948 root->fs_info->bdev_holder);
949 if (IS_ERR(bdev)) {
950 ret = PTR_ERR(bdev);
951 goto out;
952 }
953
954 bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
955 if (!bh) {
956 ret = -EIO;
957 goto error_close;
958 }
959 disk_super = (struct btrfs_super_block *)bh->b_data;
960 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
961 sizeof(disk_super->magic))) {
962 ret = -ENOENT;
963 goto error_brelse;
964 }
965 if (memcmp(disk_super->fsid, root->fs_info->fsid,
966 BTRFS_FSID_SIZE)) {
967 ret = -ENOENT;
968 goto error_brelse;
969 }
970 devid = le64_to_cpu(disk_super->dev_item.devid);
971 device = btrfs_find_device(root, devid, NULL);
972 if (!device) {
973 ret = -ENOENT;
974 goto error_brelse;
975 }
976
977 }
978 root->fs_info->fs_devices->num_devices--;
979 root->fs_info->fs_devices->open_devices--;
980
981 ret = btrfs_shrink_device(device, 0);
982 if (ret)
983 goto error_brelse;
984
986 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
987 if (ret)
988 goto error_brelse;
989
990 if (bh) {
991 /* make sure this device isn't detected as part of
992 * the FS anymore
993 */
994 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
995 set_buffer_dirty(bh);
996 sync_dirty_buffer(bh);
997
998 brelse(bh);
999 }
1000
1001 if (device->bdev) {
1002 /* one close for the device struct or super_block */
1003 close_bdev_excl(device->bdev);
1004 }
1005 if (bdev) {
1006 /* one close for us */
1007 close_bdev_excl(bdev);
1008 }
1009 kfree(device->name);
1010 kfree(device);
1011 ret = 0;
1012 goto out;
1013
1014error_brelse:
1015 brelse(bh);
1016error_close:
1017 if (bdev)
1018 close_bdev_excl(bdev);
1019out:
1020 mutex_unlock(&root->fs_info->volume_mutex);
1021 mutex_unlock(&uuid_mutex);
1022 return ret;
1023}
1024
1025int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1026{
1027 struct btrfs_trans_handle *trans;
1028 struct btrfs_device *device;
1029 struct block_device *bdev;
1030 struct list_head *cur;
1031 struct list_head *devices;
1032 u64 total_bytes;
1033 int ret = 0;
1034
1036 bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
1037 if (!bdev) {
1038 return -EIO;
1039 }
1040
1041 mutex_lock(&root->fs_info->volume_mutex);
1042
1043 trans = btrfs_start_transaction(root, 1);
1044 lock_chunks(root);
1045 devices = &root->fs_info->fs_devices->devices;
1046 list_for_each(cur, devices) {
1047 device = list_entry(cur, struct btrfs_device, dev_list);
1048 if (device->bdev == bdev) {
1049 ret = -EEXIST;
1050 goto out;
1051 }
1052 }
1053
1054 device = kzalloc(sizeof(*device), GFP_NOFS);
1055 if (!device) {
1056 /* we can safely leave the fs_devices entry around */
1057 ret = -ENOMEM;
1058 goto out_close_bdev;
1059 }
1060
1061 device->barriers = 1;
1062 device->work.func = pending_bios_fn;
1063 generate_random_uuid(device->uuid);
1064 spin_lock_init(&device->io_lock);
1065 device->name = kstrdup(device_path, GFP_NOFS);
1066	if (!device->name) {
1067		kfree(device);
1068		ret = -ENOMEM;
1069		goto out_close_bdev;
1070	}
1070 device->io_width = root->sectorsize;
1071 device->io_align = root->sectorsize;
1072 device->sector_size = root->sectorsize;
1073 device->total_bytes = i_size_read(bdev->bd_inode);
1074 device->dev_root = root->fs_info->dev_root;
1075 device->bdev = bdev;
1076 device->in_fs_metadata = 1;
1077
1078 ret = btrfs_add_device(trans, root, device);
1079 if (ret)
1080 goto out_close_bdev;
1081
1082 set_blocksize(device->bdev, 4096);
1083
1084 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1085 btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1086 total_bytes + device->total_bytes);
1087
1088 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1089 btrfs_set_super_num_devices(&root->fs_info->super_copy,
1090 total_bytes + 1);
1091
1092 list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1093 list_add(&device->dev_alloc_list,
1094 &root->fs_info->fs_devices->alloc_list);
1095 root->fs_info->fs_devices->num_devices++;
1096 root->fs_info->fs_devices->open_devices++;
1097out:
1098 unlock_chunks(root);
1099 btrfs_end_transaction(trans, root);
1100 mutex_unlock(&root->fs_info->volume_mutex);
1101
1102 return ret;
1103
1104out_close_bdev:
1105 close_bdev_excl(bdev);
1106 goto out;
1107}
1108
1109noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1110 struct btrfs_device *device)
1111{
1112 int ret;
1113 struct btrfs_path *path;
1114 struct btrfs_root *root;
1115 struct btrfs_dev_item *dev_item;
1116 struct extent_buffer *leaf;
1117 struct btrfs_key key;
1118
1119 root = device->dev_root->fs_info->chunk_root;
1120
1121 path = btrfs_alloc_path();
1122 if (!path)
1123 return -ENOMEM;
1124
1125 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1126 key.type = BTRFS_DEV_ITEM_KEY;
1127 key.offset = device->devid;
1128
1129 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1130 if (ret < 0)
1131 goto out;
1132
1133 if (ret > 0) {
1134 ret = -ENOENT;
1135 goto out;
1136 }
1137
1138 leaf = path->nodes[0];
1139 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1140
1141 btrfs_set_device_id(leaf, dev_item, device->devid);
1142 btrfs_set_device_type(leaf, dev_item, device->type);
1143 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1144 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1145 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1146 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1147 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1148 btrfs_mark_buffer_dirty(leaf);
1149
1150out:
1151 btrfs_free_path(path);
1152 return ret;
1153}
1154
1155static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1156 struct btrfs_device *device, u64 new_size)
1157{
1158 struct btrfs_super_block *super_copy =
1159 &device->dev_root->fs_info->super_copy;
1160 u64 old_total = btrfs_super_total_bytes(super_copy);
1161 u64 diff = new_size - device->total_bytes;
1162
1163 btrfs_set_super_total_bytes(super_copy, old_total + diff);
1164 return btrfs_update_device(trans, device);
1165}
1166
1167int btrfs_grow_device(struct btrfs_trans_handle *trans,
1168 struct btrfs_device *device, u64 new_size)
1169{
1170 int ret;
1171 lock_chunks(device->dev_root);
1172 ret = __btrfs_grow_device(trans, device, new_size);
1173 unlock_chunks(device->dev_root);
1174 return ret;
1175}
1176
1177static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1178 struct btrfs_root *root,
1179 u64 chunk_tree, u64 chunk_objectid,
1180 u64 chunk_offset)
1181{
1182 int ret;
1183 struct btrfs_path *path;
1184 struct btrfs_key key;
1185
1186 root = root->fs_info->chunk_root;
1187 path = btrfs_alloc_path();
1188 if (!path)
1189 return -ENOMEM;
1190
1191 key.objectid = chunk_objectid;
1192 key.offset = chunk_offset;
1193 key.type = BTRFS_CHUNK_ITEM_KEY;
1194
1195 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1196 BUG_ON(ret);
1197
1198 ret = btrfs_del_item(trans, root, path);
1199 BUG_ON(ret);
1200
1201 btrfs_free_path(path);
1202 return 0;
1203}
1204
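/*
 * A sketch of the sys_chunk_array layout this walk relies on:
 * back-to-back (disk key, chunk item) pairs with no padding,
 *
 *   [btrfs_disk_key][btrfs_chunk + stripes][btrfs_disk_key][chunk]...
 *
 * so each entry spans sizeof(*disk_key) plus
 * btrfs_chunk_item_size(num_stripes) bytes, and deleting one is a
 * memmove over its len followed by shrinking sys_array_size.
 */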
1205int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
1206			u64 chunk_offset)
1207{
1208 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1209 struct btrfs_disk_key *disk_key;
1210 struct btrfs_chunk *chunk;
1211 u8 *ptr;
1212 int ret = 0;
1213 u32 num_stripes;
1214 u32 array_size;
1215 u32 len = 0;
1216 u32 cur;
1217 struct btrfs_key key;
1218
1219 array_size = btrfs_super_sys_array_size(super_copy);
1220
1221 ptr = super_copy->sys_chunk_array;
1222 cur = 0;
1223
1224 while (cur < array_size) {
1225 disk_key = (struct btrfs_disk_key *)ptr;
1226 btrfs_disk_key_to_cpu(&key, disk_key);
1227
1228 len = sizeof(*disk_key);
1229
1230 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1231 chunk = (struct btrfs_chunk *)(ptr + len);
1232 num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1233 len += btrfs_chunk_item_size(num_stripes);
1234 } else {
1235 ret = -EIO;
1236 break;
1237 }
1238 if (key.objectid == chunk_objectid &&
1239 key.offset == chunk_offset) {
1240 memmove(ptr, ptr + len, array_size - (cur + len));
1241 array_size -= len;
1242 btrfs_set_super_sys_array_size(super_copy, array_size);
1243 } else {
1244 ptr += len;
1245 cur += len;
1246 }
1247 }
1248 return ret;
1249}
1250
1252int btrfs_relocate_chunk(struct btrfs_root *root,
1253 u64 chunk_tree, u64 chunk_objectid,
1254 u64 chunk_offset)
1255{
1256 struct extent_map_tree *em_tree;
1257 struct btrfs_root *extent_root;
1258 struct btrfs_trans_handle *trans;
1259 struct extent_map *em;
1260 struct map_lookup *map;
1261 int ret;
1262 int i;
1263
1264 printk("btrfs relocating chunk %llu\n",
1265 (unsigned long long)chunk_offset);
1266 root = root->fs_info->chunk_root;
1267 extent_root = root->fs_info->extent_root;
1268 em_tree = &root->fs_info->mapping_tree.map_tree;
1269
1270 /* step one, relocate all the extents inside this chunk */
1271 ret = btrfs_shrink_extent_tree(extent_root, chunk_offset);
1272 BUG_ON(ret);
1273
1274 trans = btrfs_start_transaction(root, 1);
1275 BUG_ON(!trans);
1276
1277 lock_chunks(root);
1278
1279 /*
1280 * step two, delete the device extents and the
1281 * chunk tree entries
1282 */
1283 spin_lock(&em_tree->lock);
1284 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1285 spin_unlock(&em_tree->lock);
1286
1287 BUG_ON(em->start > chunk_offset ||
1288 em->start + em->len < chunk_offset);
1289 map = (struct map_lookup *)em->bdev;
1290
1291 for (i = 0; i < map->num_stripes; i++) {
1292 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1293 map->stripes[i].physical);
1294 BUG_ON(ret);
1295
1296 if (map->stripes[i].dev) {
1297 ret = btrfs_update_device(trans, map->stripes[i].dev);
1298 BUG_ON(ret);
1299 }
1300 }
1301 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1302 chunk_offset);
1303
1304 BUG_ON(ret);
1305
1306 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1307 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1308 BUG_ON(ret);
1309 }
1310
1311 spin_lock(&em_tree->lock);
1312 remove_extent_mapping(em_tree, em);
1313 kfree(map);
1314 em->bdev = NULL;
1315
1316 /* once for the tree */
1317 free_extent_map(em);
1318 spin_unlock(&em_tree->lock);
1319
1320 /* once for us */
1321 free_extent_map(em);
1322
1323 unlock_chunks(root);
1324 btrfs_end_transaction(trans, root);
1325 return 0;
1326}
1327
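/*
 * div_factor(num, factor) is integer percent-style math: it returns
 * factor tenths of num, so div_factor(x, 1) is 10% of x and
 * div_factor(x, 10) is x unchanged. do_div() is used because plain
 * 64-bit division is not available on 32-bit hosts.
 */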
1328static u64 div_factor(u64 num, int factor)
1329{
1330 if (factor == 10)
1331 return num;
1332 num *= factor;
1333 do_div(num, 10);
1334 return num;
1335}
1336
1338int btrfs_balance(struct btrfs_root *dev_root)
1339{
1340 int ret;
1341 struct list_head *cur;
1342 struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1343 struct btrfs_device *device;
1344 u64 old_size;
1345 u64 size_to_free;
1346 struct btrfs_path *path;
1347 struct btrfs_key key;
1348 struct btrfs_chunk *chunk;
1349 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1350 struct btrfs_trans_handle *trans;
1351 struct btrfs_key found_key;
1352
1354 mutex_lock(&dev_root->fs_info->volume_mutex);
1355 dev_root = dev_root->fs_info->dev_root;
1356
1357 /* step one make some room on all the devices */
1358 list_for_each(cur, devices) {
1359 device = list_entry(cur, struct btrfs_device, dev_list);
1360 old_size = device->total_bytes;
1361 size_to_free = div_factor(old_size, 1);
1362 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
1363 if (device->total_bytes - device->bytes_used > size_to_free)
1364 continue;
1365
1366 ret = btrfs_shrink_device(device, old_size - size_to_free);
1367 BUG_ON(ret);
1368
1369 trans = btrfs_start_transaction(dev_root, 1);
1370 BUG_ON(!trans);
1371
1372 ret = btrfs_grow_device(trans, device, old_size);
1373 BUG_ON(ret);
1374
1375 btrfs_end_transaction(trans, dev_root);
1376 }
1377
1378 /* step two, relocate all the chunks */
1379 path = btrfs_alloc_path();
1380 BUG_ON(!path);
1381
1382 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1383 key.offset = (u64)-1;
1384 key.type = BTRFS_CHUNK_ITEM_KEY;
1385
1386	while (1) {
1387 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1388 if (ret < 0)
1389 goto error;
1390
1391 /*
1392 * this shouldn't happen, it means the last relocate
1393 * failed
1394 */
1395 if (ret == 0)
1396 break;
1397
1398 ret = btrfs_previous_item(chunk_root, path, 0,
1399 BTRFS_CHUNK_ITEM_KEY);
1400 if (ret)
1401 break;
1402
1403 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1404 path->slots[0]);
1405 if (found_key.objectid != key.objectid)
1406 break;
1407
1408 chunk = btrfs_item_ptr(path->nodes[0],
1409 path->slots[0],
1410 struct btrfs_chunk);
1411 key.offset = found_key.offset;
1412 /* chunk zero is special */
1413 if (key.offset == 0)
1414 break;
1415
1416 btrfs_release_path(chunk_root, path);
1417 ret = btrfs_relocate_chunk(chunk_root,
1418 chunk_root->root_key.objectid,
1419 found_key.objectid,
1420 found_key.offset);
1421 BUG_ON(ret);
1422 }
1423 ret = 0;
1424error:
1425 btrfs_free_path(path);
1426 mutex_unlock(&dev_root->fs_info->volume_mutex);
1427 return ret;
1428}
1429
1430/*
1431 * shrinking a device means finding all of the device extents past
1432 * the new size, and then following the back refs to the chunks.
1433 * The chunk relocation code actually frees the device extent
1434 */
1435int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1436{
1437 struct btrfs_trans_handle *trans;
1438 struct btrfs_root *root = device->dev_root;
1439 struct btrfs_dev_extent *dev_extent = NULL;
1440 struct btrfs_path *path;
1441 u64 length;
1442 u64 chunk_tree;
1443 u64 chunk_objectid;
1444 u64 chunk_offset;
1445 int ret;
1446 int slot;
1447 struct extent_buffer *l;
1448 struct btrfs_key key;
1449 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1450 u64 old_total = btrfs_super_total_bytes(super_copy);
1451 u64 diff = device->total_bytes - new_size;
1452
1454 path = btrfs_alloc_path();
1455 if (!path)
1456 return -ENOMEM;
1457
1458 trans = btrfs_start_transaction(root, 1);
1459 if (!trans) {
1460 ret = -ENOMEM;
1461 goto done;
1462 }
1463
1464 path->reada = 2;
1465
1466 lock_chunks(root);
1467
1468 device->total_bytes = new_size;
1469 ret = btrfs_update_device(trans, device);
1470 if (ret) {
1471 unlock_chunks(root);
1472 btrfs_end_transaction(trans, root);
1473 goto done;
1474 }
1475 WARN_ON(diff > old_total);
1476 btrfs_set_super_total_bytes(super_copy, old_total - diff);
1477 unlock_chunks(root);
1478 btrfs_end_transaction(trans, root);
1479
1480 key.objectid = device->devid;
1481 key.offset = (u64)-1;
1482 key.type = BTRFS_DEV_EXTENT_KEY;
1483
1484 while (1) {
1485 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1486 if (ret < 0)
1487 goto done;
1488
1489 ret = btrfs_previous_item(root, path, 0, key.type);
1490 if (ret < 0)
1491 goto done;
1492 if (ret) {
1493 ret = 0;
1494 goto done;
1495 }
1496
1497 l = path->nodes[0];
1498 slot = path->slots[0];
1499 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1500
1501 if (key.objectid != device->devid)
1502 goto done;
1503
1504 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1505 length = btrfs_dev_extent_length(l, dev_extent);
1506
1507 if (key.offset + length <= new_size)
1508 goto done;
1509
1510 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1511 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
1512 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
1513 btrfs_release_path(root, path);
1514
1515 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
1516 chunk_offset);
1517 if (ret)
1518 goto done;
1519 }
1520
1521done:
1522 btrfs_free_path(path);
1523 return ret;
1524}
1525
1526int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
1527 struct btrfs_root *root,
1528 struct btrfs_key *key,
1529 struct btrfs_chunk *chunk, int item_size)
1530{
1531 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1532 struct btrfs_disk_key disk_key;
1533 u32 array_size;
1534 u8 *ptr;
1535
1536 array_size = btrfs_super_sys_array_size(super_copy);
1537 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
1538 return -EFBIG;
1539
1540 ptr = super_copy->sys_chunk_array + array_size;
1541 btrfs_cpu_key_to_disk(&disk_key, key);
1542 memcpy(ptr, &disk_key, sizeof(disk_key));
1543 ptr += sizeof(disk_key);
1544 memcpy(ptr, chunk, item_size);
1545 item_size += sizeof(disk_key);
1546 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
1547 return 0;
1548}
1549
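/*
 * usable bytes in a chunk given calc_size bytes per stripe: mirrored
 * profiles (RAID1/DUP) write every byte twice, so the chunk only holds
 * calc_size; RAID10 holds calc_size * (num_stripes / sub_stripes),
 * e.g. 2 * calc_size for four stripes mirrored in pairs; RAID0 and
 * single hold calc_size per stripe.
 */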
1550static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
1551 int num_stripes, int sub_stripes)
1552{
1553 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
1554 return calc_size;
1555 else if (type & BTRFS_BLOCK_GROUP_RAID10)
1556 return calc_size * (num_stripes / sub_stripes);
1557 else
1558 return calc_size * num_stripes;
1559}
1560
1562int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
1563 struct btrfs_root *extent_root, u64 *start,
1564 u64 *num_bytes, u64 type)
1565{
1566 u64 dev_offset;
1567 struct btrfs_fs_info *info = extent_root->fs_info;
1568 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
1569 struct btrfs_path *path;
1570 struct btrfs_stripe *stripes;
1571 struct btrfs_device *device = NULL;
1572 struct btrfs_chunk *chunk;
1573 struct list_head private_devs;
1574 struct list_head *dev_list;
1575 struct list_head *cur;
1576 struct extent_map_tree *em_tree;
1577 struct map_lookup *map;
1578 struct extent_map *em;
1579 int min_stripe_size = 1 * 1024 * 1024;
1580 u64 physical;
1581 u64 calc_size = 1024 * 1024 * 1024;
1582 u64 max_chunk_size = calc_size;
1583 u64 min_free;
1584 u64 avail;
1585 u64 max_avail = 0;
1586 u64 percent_max;
1587 int num_stripes = 1;
1588 int min_stripes = 1;
1589 int sub_stripes = 0;
1590 int looped = 0;
1591 int ret;
1592 int index;
1593 int stripe_len = 64 * 1024;
1594 struct btrfs_key key;
1595
1596 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
1597 (type & BTRFS_BLOCK_GROUP_DUP)) {
1598 WARN_ON(1);
1599 type &= ~BTRFS_BLOCK_GROUP_DUP;
1600 }
1601 dev_list = &extent_root->fs_info->fs_devices->alloc_list;
1602 if (list_empty(dev_list))
1603 return -ENOSPC;
1604
1605 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
1606 num_stripes = extent_root->fs_info->fs_devices->open_devices;
1607 min_stripes = 2;
1608 }
1609 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
1610 num_stripes = 2;
1611 min_stripes = 2;
1612 }
1613 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
1614 num_stripes = min_t(u64, 2,
1615 extent_root->fs_info->fs_devices->open_devices);
1616 if (num_stripes < 2)
1617 return -ENOSPC;
1618 min_stripes = 2;
1619 }
1620 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
1621 num_stripes = extent_root->fs_info->fs_devices->open_devices;
1622 if (num_stripes < 4)
1623 return -ENOSPC;
1624 num_stripes &= ~(u32)1;
1625 sub_stripes = 2;
1626 min_stripes = 4;
1627 }
1628
1629 if (type & BTRFS_BLOCK_GROUP_DATA) {
1630 max_chunk_size = 10 * calc_size;
1631 min_stripe_size = 64 * 1024 * 1024;
1632 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
1633 max_chunk_size = 4 * calc_size;
1634 min_stripe_size = 32 * 1024 * 1024;
1635 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
1636 calc_size = 8 * 1024 * 1024;
1637 max_chunk_size = calc_size * 2;
1638 min_stripe_size = 1 * 1024 * 1024;
1639 }
1640
1641 path = btrfs_alloc_path();
1642 if (!path)
1643 return -ENOMEM;
1644
1645 /* we don't want a chunk larger than 10% of the FS */
1646 percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
1647 max_chunk_size = min(percent_max, max_chunk_size);
1648
1649again:
1650 if (calc_size * num_stripes > max_chunk_size) {
1651 calc_size = max_chunk_size;
1652 do_div(calc_size, num_stripes);
1653 do_div(calc_size, stripe_len);
1654 calc_size *= stripe_len;
1655 }
1656 /* we don't want tiny stripes */
1657 calc_size = max_t(u64, min_stripe_size, calc_size);
1658
1659 do_div(calc_size, stripe_len);
1660 calc_size *= stripe_len;
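	/*
	 * the do_div/multiply pair above rounds calc_size down to a
	 * stripe_len multiple; do_div() stores the quotient back in
	 * place, so e.g. calc_size = 1000000 with a 64K stripe_len
	 * comes out as 15 * 65536 = 983040
	 */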
1661
1662 INIT_LIST_HEAD(&private_devs);
1663 cur = dev_list->next;
1664 index = 0;
1665
1666 if (type & BTRFS_BLOCK_GROUP_DUP)
1667 min_free = calc_size * 2;
1668 else
1669 min_free = calc_size;
1670
1671 /*
1672 * we add 1MB because we never use the first 1MB of the device, unless
1673 * we've looped, then we are likely allocating the maximum amount of
1674 * space left already
1675 */
1676 if (!looped)
1677 min_free += 1024 * 1024;
1678
1679 /* build a private list of devices we will allocate from */
1680	while (index < num_stripes) {
1681 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
1682
1683 if (device->total_bytes > device->bytes_used)
1684 avail = device->total_bytes - device->bytes_used;
1685 else
1686 avail = 0;
1687 cur = cur->next;
1688
1689 if (device->in_fs_metadata && avail >= min_free) {
1690 u64 ignored_start = 0;
1691 ret = find_free_dev_extent(trans, device, path,
1692 min_free,
1693 &ignored_start);
1694 if (ret == 0) {
1695 list_move_tail(&device->dev_alloc_list,
1696 &private_devs);
1697 index++;
1698 if (type & BTRFS_BLOCK_GROUP_DUP)
1699 index++;
1700 }
1701 } else if (device->in_fs_metadata && avail > max_avail)
1702 max_avail = avail;
1703 if (cur == dev_list)
1704 break;
1705 }
1706 if (index < num_stripes) {
1707 list_splice(&private_devs, dev_list);
1708 if (index >= min_stripes) {
1709 num_stripes = index;
1710 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
1711 num_stripes /= sub_stripes;
1712 num_stripes *= sub_stripes;
1713 }
1714 looped = 1;
1715 goto again;
1716 }
1717 if (!looped && max_avail > 0) {
1718 looped = 1;
1719 calc_size = max_avail;
1720 goto again;
1721 }
1722 btrfs_free_path(path);
1723 return -ENOSPC;
1724 }
1725 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1726 key.type = BTRFS_CHUNK_ITEM_KEY;
1727 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
1728 &key.offset);
1729 if (ret) {
1730 btrfs_free_path(path);
1731 return ret;
1732 }
1733
1734 chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
1735 if (!chunk) {
1736 btrfs_free_path(path);
1737 return -ENOMEM;
1738 }
1739
1740 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
1741 if (!map) {
1742 kfree(chunk);
1743 btrfs_free_path(path);
1744 return -ENOMEM;
1745 }
1746 btrfs_free_path(path);
1747 path = NULL;
1748
1749 stripes = &chunk->stripe;
1750 *num_bytes = chunk_bytes_by_type(type, calc_size,
1751 num_stripes, sub_stripes);
1752
1753 index = 0;
1754	while (index < num_stripes) {
1755 struct btrfs_stripe *stripe;
1756 BUG_ON(list_empty(&private_devs));
1757 cur = private_devs.next;
1758 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
1759
1760 /* loop over this device again if we're doing a dup group */
1761 if (!(type & BTRFS_BLOCK_GROUP_DUP) ||
1762 (index == num_stripes - 1))
1763 list_move_tail(&device->dev_alloc_list, dev_list);
1764
1765 ret = btrfs_alloc_dev_extent(trans, device,
1766 info->chunk_root->root_key.objectid,
1767 BTRFS_FIRST_CHUNK_TREE_OBJECTID, key.offset,
1768 calc_size, &dev_offset);
1769 BUG_ON(ret);
1770 device->bytes_used += calc_size;
1771 ret = btrfs_update_device(trans, device);
1772 BUG_ON(ret);
1773
1774 map->stripes[index].dev = device;
1775 map->stripes[index].physical = dev_offset;
1776 stripe = stripes + index;
1777 btrfs_set_stack_stripe_devid(stripe, device->devid);
1778 btrfs_set_stack_stripe_offset(stripe, dev_offset);
1779 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
1780 physical = dev_offset;
1781 index++;
1782 }
1783 BUG_ON(!list_empty(&private_devs));
1784
1785 /* key was set above */
1786 btrfs_set_stack_chunk_length(chunk, *num_bytes);
1787 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
1788 btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
1789 btrfs_set_stack_chunk_type(chunk, type);
1790 btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
1791 btrfs_set_stack_chunk_io_align(chunk, stripe_len);
1792 btrfs_set_stack_chunk_io_width(chunk, stripe_len);
1793 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
1794 btrfs_set_stack_chunk_sub_stripes(chunk, sub_stripes);
1795 map->sector_size = extent_root->sectorsize;
1796 map->stripe_len = stripe_len;
1797 map->io_align = stripe_len;
1798 map->io_width = stripe_len;
1799 map->type = type;
1800 map->num_stripes = num_stripes;
1801 map->sub_stripes = sub_stripes;
1802
1803 ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
1804 btrfs_chunk_item_size(num_stripes));
1805 BUG_ON(ret);
1806	*start = key.offset;
1807
1808 em = alloc_extent_map(GFP_NOFS);
1809 if (!em)
1810 return -ENOMEM;
1811 em->bdev = (struct block_device *)map;
1812 em->start = key.offset;
1813 em->len = *num_bytes;
1814 em->block_start = 0;
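	/*
	 * em->bdev is overloaded for chunk mappings: it carries the
	 * map_lookup pointer rather than a real block_device, and
	 * readers such as __btrfs_map_block cast it back
	 */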
1815
1816 if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
1817 ret = btrfs_add_system_chunk(trans, chunk_root, &key,
1818 chunk, btrfs_chunk_item_size(num_stripes));
1819 BUG_ON(ret);
1820 }
1821 kfree(chunk);
1822
1823 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
1824 spin_lock(&em_tree->lock);
1825 ret = add_extent_mapping(em_tree, em);
1826 spin_unlock(&em_tree->lock);
1827 BUG_ON(ret);
1828 free_extent_map(em);
1829 return ret;
1830}
1831
1832void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
1833{
1834 extent_map_tree_init(&tree->map_tree, GFP_NOFS);
1835}
1836
1837void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
1838{
1839 struct extent_map *em;
1840
1841	while (1) {
1842 spin_lock(&tree->map_tree.lock);
1843 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
1844 if (em)
1845 remove_extent_mapping(&tree->map_tree, em);
1846 spin_unlock(&tree->map_tree.lock);
1847 if (!em)
1848 break;
1849 kfree(em->bdev);
1850 /* once for us */
1851 free_extent_map(em);
1852 /* once for the tree */
1853 free_extent_map(em);
1854 }
1855}
1856
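/*
 * how many places a logical range can be read from: RAID1 and DUP keep
 * num_stripes full copies, RAID10 keeps sub_stripes (normally 2), and
 * RAID0/single keep only one. Callers use the result as the upper
 * bound when retrying reads with increasing mirror_num.
 */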
1857int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
1858{
1859 struct extent_map *em;
1860 struct map_lookup *map;
1861 struct extent_map_tree *em_tree = &map_tree->map_tree;
1862 int ret;
1863
1864 spin_lock(&em_tree->lock);
1865 em = lookup_extent_mapping(em_tree, logical, len);
1866 spin_unlock(&em_tree->lock);
1867 BUG_ON(!em);
1868
1869 BUG_ON(em->start > logical || em->start + em->len < logical);
1870 map = (struct map_lookup *)em->bdev;
1871 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
1872 ret = map->num_stripes;
1873 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
1874 ret = map->sub_stripes;
1875 else
1876 ret = 1;
1877 free_extent_map(em);
1878 return ret;
1879}
1880
1881static int find_live_mirror(struct map_lookup *map, int first, int num,
1882 int optimal)
1883{
1884 int i;
1885 if (map->stripes[optimal].dev->bdev)
1886 return optimal;
1887 for (i = first; i < first + num; i++) {
1888 if (map->stripes[i].dev->bdev)
1889 return i;
1890 }
1891 /* we couldn't find one that doesn't fail. Just return something
1892 * and the io error handling code will clean up eventually
1893 */
1894 return optimal;
1895}
1896
1897static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
1898 u64 logical, u64 *length,
1899 struct btrfs_multi_bio **multi_ret,
1900 int mirror_num, struct page *unplug_page)
1901{
1902 struct extent_map *em;
1903 struct map_lookup *map;
1904 struct extent_map_tree *em_tree = &map_tree->map_tree;
1905 u64 offset;
1906 u64 stripe_offset;
1907 u64 stripe_nr;
1908 int stripes_allocated = 8;
1909 int stripes_required = 1;
1910 int stripe_index;
1911 int i;
1912 int num_stripes;
1913 int max_errors = 0;
1914 struct btrfs_multi_bio *multi = NULL;
1915
1916	if (multi_ret && !(rw & (1 << BIO_RW)))
1917		stripes_allocated = 1;
1919again:
1920 if (multi_ret) {
1921 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
1922 GFP_NOFS);
1923 if (!multi)
1924 return -ENOMEM;
1925
1926 atomic_set(&multi->error, 0);
1927 }
1928
1929 spin_lock(&em_tree->lock);
1930 em = lookup_extent_mapping(em_tree, logical, *length);
1931 spin_unlock(&em_tree->lock);
1932
1933	if (!em && unplug_page) {
1934		kfree(multi);
1935		return 0;
1936	}
1935
1936 if (!em) {
1937 printk("unable to find logical %Lu len %Lu\n", logical, *length);
1938 BUG();
1939 }
1940
1941 BUG_ON(em->start > logical || em->start + em->len < logical);
1942 map = (struct map_lookup *)em->bdev;
1943 offset = logical - em->start;
1944
1945 if (mirror_num > map->num_stripes)
1946 mirror_num = 0;
1947
1948 /* if our multi bio struct is too small, back off and try again */
1949 if (rw & (1 << BIO_RW)) {
1950 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
1951 BTRFS_BLOCK_GROUP_DUP)) {
1952 stripes_required = map->num_stripes;
1953 max_errors = 1;
1954 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
1955 stripes_required = map->sub_stripes;
1956 max_errors = 1;
1957 }
1958 }
1959 if (multi_ret && rw == WRITE &&
1960 stripes_allocated < stripes_required) {
1961 stripes_allocated = map->num_stripes;
1962 free_extent_map(em);
1963 kfree(multi);
1964 goto again;
1965 }
1966 stripe_nr = offset;
1967 /*
1968 * stripe_nr counts the total number of stripes we have to stride
1969 * to get to this block
1970 */
1971 do_div(stripe_nr, map->stripe_len);
1972
1973 stripe_offset = stripe_nr * map->stripe_len;
1974 BUG_ON(offset < stripe_offset);
1975
1976	/* stripe_offset is the offset of this block in its stripe */
1977 stripe_offset = offset - stripe_offset;
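	/*
	 * worked example: with a 64K stripe_len and offset 200K into
	 * the chunk, do_div leaves stripe_nr == 3 (the quotient, stored
	 * in place) and stripe_offset comes out to 8K into that fourth
	 * stripe
	 */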
1978
1979 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
1980 BTRFS_BLOCK_GROUP_RAID10 |
1981 BTRFS_BLOCK_GROUP_DUP)) {
1982 /* we limit the length of each bio to what fits in a stripe */
1983 *length = min_t(u64, em->len - offset,
1984 map->stripe_len - stripe_offset);
1985 } else {
1986 *length = em->len - offset;
1987 }
1988
1989 if (!multi_ret && !unplug_page)
1990 goto out;
1991
1992 num_stripes = 1;
1993 stripe_index = 0;
1994 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
1995 if (unplug_page || (rw & (1 << BIO_RW)))
1996 num_stripes = map->num_stripes;
1997 else if (mirror_num)
1998 stripe_index = mirror_num - 1;
1999 else {
2000 stripe_index = find_live_mirror(map, 0,
2001 map->num_stripes,
2002 current->pid % map->num_stripes);
2003 }
2004
2005 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2006 if (rw & (1 << BIO_RW))
2007 num_stripes = map->num_stripes;
2008 else if (mirror_num)
2009 stripe_index = mirror_num - 1;
2010
2011 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2012 int factor = map->num_stripes / map->sub_stripes;
2013
2014 stripe_index = do_div(stripe_nr, factor);
2015 stripe_index *= map->sub_stripes;
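		/*
		 * e.g. four stripes with sub_stripes == 2 gives
		 * factor == 2: do_div returns the remainder as
		 * stripe_index and leaves the quotient in stripe_nr, so
		 * even stripe numbers land on the (0,1) mirror pair and
		 * odd ones on (2,3)
		 */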
2016
2017 if (unplug_page || (rw & (1 << BIO_RW)))
2018 num_stripes = map->sub_stripes;
2019 else if (mirror_num)
2020 stripe_index += mirror_num - 1;
2021 else {
2022 stripe_index = find_live_mirror(map, stripe_index,
2023 map->sub_stripes, stripe_index +
2024 current->pid % map->sub_stripes);
2025 }
2026 } else {
2027 /*
2028 * after this do_div call, stripe_nr is the number of stripes
2029 * on this device we have to walk to find the data, and
2030 * stripe_index is the number of our device in the stripe array
2031 */
2032 stripe_index = do_div(stripe_nr, map->num_stripes);
2033 }
2034 BUG_ON(stripe_index >= map->num_stripes);
2035
2036 for (i = 0; i < num_stripes; i++) {
2037 if (unplug_page) {
2038 struct btrfs_device *device;
2039 struct backing_dev_info *bdi;
2040
2041 device = map->stripes[stripe_index].dev;
2042 if (device->bdev) {
2043 bdi = blk_get_backing_dev_info(device->bdev);
2044 if (bdi->unplug_io_fn) {
2045 bdi->unplug_io_fn(bdi, unplug_page);
2046 }
2047 }
2048 } else {
2049 multi->stripes[i].physical =
2050 map->stripes[stripe_index].physical +
2051 stripe_offset + stripe_nr * map->stripe_len;
2052 multi->stripes[i].dev = map->stripes[stripe_index].dev;
2053 }
2054 stripe_index++;
2055 }
2056 if (multi_ret) {
2057 *multi_ret = multi;
2058 multi->num_stripes = num_stripes;
2059 multi->max_errors = max_errors;
2060 }
2061out:
2062 free_extent_map(em);
2063 return 0;
2064}
2065
2066int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2067 u64 logical, u64 *length,
2068 struct btrfs_multi_bio **multi_ret, int mirror_num)
2069{
2070 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
2071 mirror_num, NULL);
2072}
2073
2074int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
2075 u64 logical, struct page *page)
2076{
2077 u64 length = PAGE_CACHE_SIZE;
2078 return __btrfs_map_block(map_tree, READ, logical, &length,
2079 NULL, 0, page);
2080}
2081
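/*
 * completion handler shared by every stripe of a multi-stripe bio: the
 * last stripe to finish reports the combined result to the original
 * bio. An error is only passed up once more stripes failed than
 * max_errors tolerates, so e.g. a two-device RAID1 write with
 * max_errors == 1 still succeeds when exactly one copy fails.
 */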
2083#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
2084static void end_bio_multi_stripe(struct bio *bio, int err)
2085#else
2086static int end_bio_multi_stripe(struct bio *bio,
2087 unsigned int bytes_done, int err)
2088#endif
2089{
2090 struct btrfs_multi_bio *multi = bio->bi_private;
2091 int is_orig_bio = 0;
2092
2093#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
2094 if (bio->bi_size)
2095 return 1;
2096#endif
2097 if (err)
2098 atomic_inc(&multi->error);
2099
2100 if (bio == multi->orig_bio)
2101 is_orig_bio = 1;
2102
2103 if (atomic_dec_and_test(&multi->stripes_pending)) {
2104 if (!is_orig_bio) {
2105 bio_put(bio);
2106 bio = multi->orig_bio;
2107 }
2108 bio->bi_private = multi->private;
2109 bio->bi_end_io = multi->end_io;
2110 /* only send an error to the higher layers if it is
2111 * beyond the tolerance of the multi-bio
2112 */
2113 if (atomic_read(&multi->error) > multi->max_errors) {
2114 err = -EIO;
2115 } else if (err) {
2116 /*
2117 * this bio is actually up to date, we didn't
2118 * go over the max number of errors
2119 */
2120 set_bit(BIO_UPTODATE, &bio->bi_flags);
2121 err = 0;
2122 }
2123 kfree(multi);
2124
2125#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
2126 bio_endio(bio, bio->bi_size, err);
2127#else
2128 bio_endio(bio, err);
2129#endif
2130 } else if (!is_orig_bio) {
2131 bio_put(bio);
2132 }
2133#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
2134 return 0;
2135#endif
2136}
2137
2138struct async_sched {
2139 struct bio *bio;
2140 int rw;
2141 struct btrfs_fs_info *info;
2142 struct btrfs_work work;
2143};
2144
2145/*
2146 * see run_scheduled_bios for a description of why bios are collected for
2147 * async submit.
2148 *
2149 * This will add one bio to the pending list for a device and make sure
2150 * the work struct is scheduled.
2151 */
2152static int noinline schedule_bio(struct btrfs_root *root,
2153 struct btrfs_device *device,
2154 int rw, struct bio *bio)
2155{
2156 int should_queue = 1;
2157
2158 /* don't bother with additional async steps for reads, right now */
2159 if (!(rw & (1 << BIO_RW))) {
2160 bio_get(bio);
2161 submit_bio(rw, bio);
2162 bio_put(bio);
2163 return 0;
2164 }
2165
2166 /*
2167 * nr_async_bios allows us to reliably return congestion to the
2168 * higher layers. Otherwise, the async bio makes it appear we have
2169 * made progress against dirty pages when we've really just put it
2170 * on a queue for later
2171 */
2172 atomic_inc(&root->fs_info->nr_async_bios);
2173 WARN_ON(bio->bi_next);
2174 bio->bi_next = NULL;
2175 bio->bi_rw |= rw;
2176
2177 spin_lock(&device->io_lock);
2178
2179 if (device->pending_bio_tail)
2180 device->pending_bio_tail->bi_next = bio;
2181
2182 device->pending_bio_tail = bio;
2183 if (!device->pending_bios)
2184 device->pending_bios = bio;
2185 if (device->running_pending)
2186 should_queue = 0;
2187
2188 spin_unlock(&device->io_lock);
2189
2190 if (should_queue)
2191 btrfs_queue_worker(&root->fs_info->submit_workers,
2192 &device->work);
2193 return 0;
2194}
2195
2196int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
2197 int mirror_num, int async_submit)
2198{
2199 struct btrfs_mapping_tree *map_tree;
2200 struct btrfs_device *dev;
2201 struct bio *first_bio = bio;
2202 u64 logical = bio->bi_sector << 9;
2203 u64 length = 0;
2204 u64 map_length;
2205 struct btrfs_multi_bio *multi = NULL;
2206 int ret;
2207 int dev_nr = 0;
2208 int total_devs = 1;
2209
2210 length = bio->bi_size;
2211 map_tree = &root->fs_info->mapping_tree;
2212 map_length = length;
2213
2214 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
2215 mirror_num);
2216 BUG_ON(ret);
2217
2218 total_devs = multi->num_stripes;
2219 if (map_length < length) {
2220 printk("mapping failed logical %Lu bio len %Lu "
2221 "len %Lu\n", logical, length, map_length);
2222 BUG();
2223 }
2224 multi->end_io = first_bio->bi_end_io;
2225 multi->private = first_bio->bi_private;
2226 multi->orig_bio = first_bio;
2227 atomic_set(&multi->stripes_pending, multi->num_stripes);
2228
2229	while (dev_nr < total_devs) {
2230 if (total_devs > 1) {
2231 if (dev_nr < total_devs - 1) {
2232 bio = bio_clone(first_bio, GFP_NOFS);
2233 BUG_ON(!bio);
2234 } else {
2235 bio = first_bio;
2236 }
2237 bio->bi_private = multi;
2238 bio->bi_end_io = end_bio_multi_stripe;
2239 }
2240 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
2241 dev = multi->stripes[dev_nr].dev;
2242 if (dev && dev->bdev) {
2243 bio->bi_bdev = dev->bdev;
2244 if (async_submit)
2245 schedule_bio(root, dev, rw, bio);
2246 else
2247 submit_bio(rw, bio);
2248 } else {
2249 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
2250 bio->bi_sector = logical >> 9;
2251#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
2252 bio_endio(bio, bio->bi_size, -EIO);
2253#else
2254 bio_endio(bio, -EIO);
2255#endif
2256 }
2257 dev_nr++;
2258 }
2259 if (total_devs == 1)
2260 kfree(multi);
2261 return 0;
2262}
2263
2264struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
2265 u8 *uuid)
2266{
2267 struct list_head *head = &root->fs_info->fs_devices->devices;
2268
2269 return __find_device(head, devid, uuid);
2270}
2271
2272static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
2273 u64 devid, u8 *dev_uuid)
2274{
2275 struct btrfs_device *device;
2276 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2277
2278	device = kzalloc(sizeof(*device), GFP_NOFS);
2279	if (!device)
2280		return NULL;
2281	list_add(&device->dev_list,
2282		 &fs_devices->devices);
2281 list_add(&device->dev_alloc_list,
2282 &fs_devices->alloc_list);
2283 device->barriers = 1;
2284 device->dev_root = root->fs_info->dev_root;
2285 device->devid = devid;
2286 device->work.func = pending_bios_fn;
2287 fs_devices->num_devices++;
2288 spin_lock_init(&device->io_lock);
2289 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
2290 return device;
2291}
2292
2294static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2295 struct extent_buffer *leaf,
2296 struct btrfs_chunk *chunk)
2297{
2298 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2299 struct map_lookup *map;
2300 struct extent_map *em;
2301 u64 logical;
2302 u64 length;
2303 u64 devid;
2304 u8 uuid[BTRFS_UUID_SIZE];
2305 int num_stripes;
2306 int ret;
2307 int i;
2308
2309 logical = key->offset;
2310 length = btrfs_chunk_length(leaf, chunk);
2311
2312 spin_lock(&map_tree->map_tree.lock);
2313 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
2314 spin_unlock(&map_tree->map_tree.lock);
2315
2316 /* already mapped? */
2317 if (em && em->start <= logical && em->start + em->len > logical) {
2318 free_extent_map(em);
2319 return 0;
2320 } else if (em) {
2321 free_extent_map(em);
2322 }
2323
2328 em = alloc_extent_map(GFP_NOFS);
2329 if (!em)
2330 return -ENOMEM;
2331 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2332 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2333 if (!map) {
2334 free_extent_map(em);
2335 return -ENOMEM;
2336 }
2337
2338 em->bdev = (struct block_device *)map;
2339 em->start = logical;
2340 em->len = length;
2341 em->block_start = 0;
2342
2343 map->num_stripes = num_stripes;
2344 map->io_width = btrfs_chunk_io_width(leaf, chunk);
2345 map->io_align = btrfs_chunk_io_align(leaf, chunk);
2346 map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
2347 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
2348 map->type = btrfs_chunk_type(leaf, chunk);
2349 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
2350 for (i = 0; i < num_stripes; i++) {
2351 map->stripes[i].physical =
2352 btrfs_stripe_offset_nr(leaf, chunk, i);
2353 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
2354 read_extent_buffer(leaf, uuid, (unsigned long)
2355 btrfs_stripe_dev_uuid_nr(chunk, i),
2356 BTRFS_UUID_SIZE);
2357 map->stripes[i].dev = btrfs_find_device(root, devid, uuid);
2358
2359 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
2360 kfree(map);
2361 free_extent_map(em);
2362 return -EIO;
2363 }
2364 if (!map->stripes[i].dev) {
2365 map->stripes[i].dev =
2366 add_missing_dev(root, devid, uuid);
2367 if (!map->stripes[i].dev) {
2368 kfree(map);
2369 free_extent_map(em);
2370 return -EIO;
2371 }
2372 }
2373 map->stripes[i].dev->in_fs_metadata = 1;
2374 }
2375
2376 spin_lock(&map_tree->map_tree.lock);
2377 ret = add_extent_mapping(&map_tree->map_tree, em);
2378 spin_unlock(&map_tree->map_tree.lock);
2379 BUG_ON(ret);
2380 free_extent_map(em);
2381
2382 return 0;
2383}
2384
2385static int fill_device_from_item(struct extent_buffer *leaf,
2386 struct btrfs_dev_item *dev_item,
2387 struct btrfs_device *device)
2388{
2389 unsigned long ptr;
2390
2391 device->devid = btrfs_device_id(leaf, dev_item);
2392 device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
2393 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2394 device->type = btrfs_device_type(leaf, dev_item);
2395 device->io_align = btrfs_device_io_align(leaf, dev_item);
2396 device->io_width = btrfs_device_io_width(leaf, dev_item);
2397 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
2398
2399 ptr = (unsigned long)btrfs_device_uuid(dev_item);
2400 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
2401
2402 return 0;
2403}
2404
2405static int read_one_dev(struct btrfs_root *root,
2406 struct extent_buffer *leaf,
2407 struct btrfs_dev_item *dev_item)
2408{
2409 struct btrfs_device *device;
2410 u64 devid;
2411 int ret;
2412 u8 dev_uuid[BTRFS_UUID_SIZE];
2413
2414 devid = btrfs_device_id(leaf, dev_item);
2415 read_extent_buffer(leaf, dev_uuid,
2416 (unsigned long)btrfs_device_uuid(dev_item),
2417 BTRFS_UUID_SIZE);
2418 device = btrfs_find_device(root, devid, dev_uuid);
2419 if (!device) {
2420		printk("warning: devid %Lu missing\n", devid);
2421 device = add_missing_dev(root, devid, dev_uuid);
2422 if (!device)
2423 return -ENOMEM;
2424 }
2425
2426 fill_device_from_item(leaf, dev_item, device);
2427 device->dev_root = root->fs_info->dev_root;
2428 device->in_fs_metadata = 1;
2429 ret = 0;
2430#if 0
2431 ret = btrfs_open_device(device);
2432 if (ret) {
2433 kfree(device);
2434 }
2435#endif
2436 return ret;
2437}
2438
2439int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
2440{
2441 struct btrfs_dev_item *dev_item;
2442
2443 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
2444 dev_item);
2445 return read_one_dev(root, buf, dev_item);
2446}
2447
2448int btrfs_read_sys_array(struct btrfs_root *root)
2449{
2450 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
2451 struct extent_buffer *sb;
2452 struct btrfs_disk_key *disk_key;
2453 struct btrfs_chunk *chunk;
2454 u8 *ptr;
2455 unsigned long sb_ptr;
2456 int ret = 0;
2457 u32 num_stripes;
2458 u32 array_size;
2459 u32 len = 0;
2460 u32 cur;
2461 struct btrfs_key key;
2462
2463 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
2464 BTRFS_SUPER_INFO_SIZE);
2465 if (!sb)
2466 return -ENOMEM;
2467 btrfs_set_buffer_uptodate(sb);
2468 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
2469 array_size = btrfs_super_sys_array_size(super_copy);
2470
2471 ptr = super_copy->sys_chunk_array;
2472 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
2473 cur = 0;
2474
2475 while (cur < array_size) {
2476 disk_key = (struct btrfs_disk_key *)ptr;
2477 btrfs_disk_key_to_cpu(&key, disk_key);
2478
2479		len = sizeof(*disk_key);
2480		ptr += len;
2481		sb_ptr += len;
2482		cur += len;
2482
2483 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2484 chunk = (struct btrfs_chunk *)sb_ptr;
2485 ret = read_one_chunk(root, &key, sb, chunk);
2486 if (ret)
2487 break;
2488 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
2489 len = btrfs_chunk_item_size(num_stripes);
2490 } else {
2491 ret = -EIO;
2492 break;
2493 }
2494 ptr += len;
2495 sb_ptr += len;
2496 cur += len;
2497 }
2498 free_extent_buffer(sb);
2499 return ret;
2500}
2501
2502int btrfs_read_chunk_tree(struct btrfs_root *root)
2503{
2504 struct btrfs_path *path;
2505 struct extent_buffer *leaf;
2506 struct btrfs_key key;
2507 struct btrfs_key found_key;
2508 int ret;
2509 int slot;
2510
2511 root = root->fs_info->chunk_root;
2512
2513 path = btrfs_alloc_path();
2514 if (!path)
2515 return -ENOMEM;
2516
2517 /* first we search for all of the device items, and then we
2518 * read in all of the chunk items. This way we can create chunk
2519	 * mappings that reference all of the devices that are found
2520 */
2521 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2522 key.offset = 0;
2523 key.type = 0;
2524again:
2525 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2526	while (1) {
2527 leaf = path->nodes[0];
2528 slot = path->slots[0];
2529 if (slot >= btrfs_header_nritems(leaf)) {
2530 ret = btrfs_next_leaf(root, path);
2531 if (ret == 0)
2532 continue;
2533 if (ret < 0)
2534 goto error;
2535 break;
2536 }
2537 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2538 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
2539 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
2540 break;
2541 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
2542 struct btrfs_dev_item *dev_item;
2543 dev_item = btrfs_item_ptr(leaf, slot,
2544 struct btrfs_dev_item);
2545 ret = read_one_dev(root, leaf, dev_item);
2546 BUG_ON(ret);
2547 }
2548 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
2549 struct btrfs_chunk *chunk;
2550 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2551 ret = read_one_chunk(root, &found_key, leaf, chunk);
2552 }
2553 path->slots[0]++;
2554 }
2555 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
2556 key.objectid = 0;
2557 btrfs_release_path(root, path);
2558 goto again;
2559 }
2560
2561	ret = 0;
2562error:
2563	btrfs_free_path(path);
2564	return ret;
2565}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 000000000000..c50e50580b51
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,150 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_VOLUMES_
20#define __BTRFS_VOLUMES_
21
22#include <linux/bio.h>
23#include "async-thread.h"
24
25struct buffer_head;
26struct btrfs_device {
27 struct list_head dev_list;
28 struct list_head dev_alloc_list;
29 struct btrfs_root *dev_root;
30 struct buffer_head *pending_io;
31 struct bio *pending_bios;
32 struct bio *pending_bio_tail;
33 int running_pending;
34 u64 generation;
35
36 int barriers;
37 int in_fs_metadata;
38
39 spinlock_t io_lock;
40
41 struct block_device *bdev;
42
43 char *name;
44
45 /* the internal btrfs device id */
46 u64 devid;
47
48 /* size of the device */
49 u64 total_bytes;
50
51 /* bytes used */
52 u64 bytes_used;
53
54 /* optimal io alignment for this device */
55 u32 io_align;
56
57 /* optimal io width for this device */
58 u32 io_width;
59
60 /* minimal io size for this device */
61 u32 sector_size;
62
63 /* type and info about this device */
64 u64 type;
65
66 /* physical drive uuid (or lvm uuid) */
67 u8 uuid[BTRFS_UUID_SIZE];
68
69 struct btrfs_work work;
70};
71
72struct btrfs_fs_devices {
73 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
74
75	/* the device with this id has the most recent copy of the super */
76 u64 latest_devid;
77 u64 latest_trans;
78 u64 num_devices;
79 u64 open_devices;
80 struct block_device *latest_bdev;
81 /* all of the devices in the FS */
82 struct list_head devices;
83
84 /* devices not currently being allocated */
85 struct list_head alloc_list;
86 struct list_head list;
87 int mounted;
88};
89
90struct btrfs_bio_stripe {
91 struct btrfs_device *dev;
92 u64 physical;
93};
94
95struct btrfs_multi_bio {
96 atomic_t stripes_pending;
97 bio_end_io_t *end_io;
98 struct bio *orig_bio;
99 void *private;
100 atomic_t error;
101 int max_errors;
102 int num_stripes;
103 struct btrfs_bio_stripe stripes[];
104};
105
106#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
107 (sizeof(struct btrfs_bio_stripe) * (n)))
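/*
 * stripes[] above is a flexible array member, so a multi bio for n
 * stripes is a single allocation of btrfs_multi_bio_size(n) bytes;
 * __btrfs_map_block sizes it this way and retries with a bigger
 * stripes_allocated when a write needs more stripes than first guessed.
 */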
108
109int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
110 struct btrfs_device *device,
111 u64 chunk_tree, u64 chunk_objectid,
112 u64 chunk_offset,
113 u64 num_bytes, u64 *start);
114int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
115 u64 logical, u64 *length,
116 struct btrfs_multi_bio **multi_ret, int mirror_num);
117int btrfs_read_sys_array(struct btrfs_root *root);
118int btrfs_read_chunk_tree(struct btrfs_root *root);
119int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
120 struct btrfs_root *extent_root, u64 *start,
121 u64 *num_bytes, u64 type);
122void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
123void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
124int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
125 int mirror_num, int async_submit);
126int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
127int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
128 int flags, void *holder);
129int btrfs_scan_one_device(const char *path, int flags, void *holder,
130 struct btrfs_fs_devices **fs_devices_ret);
131int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
132int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
133int btrfs_add_device(struct btrfs_trans_handle *trans,
134 struct btrfs_root *root,
135 struct btrfs_device *device);
136int btrfs_rm_device(struct btrfs_root *root, char *device_path);
137int btrfs_cleanup_fs_uuids(void);
138int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
139int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
140 u64 logical, struct page *page);
141int btrfs_grow_device(struct btrfs_trans_handle *trans,
142 struct btrfs_device *device, u64 new_size);
143struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
144 u8 *uuid);
145int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
146int btrfs_init_new_device(struct btrfs_root *root, char *path);
147int btrfs_balance(struct btrfs_root *dev_root);
148void btrfs_unlock_volumes(void);
149void btrfs_lock_volumes(void);
150#endif
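
The prototypes above form the volume-mapping API: btrfs_map_block() translates a logical address into the set of physical stripes that back it, returned in an allocated btrfs_multi_bio that the caller must free. A hedged caller sketch (map_tree, logical, and len are assumptions of the example, not from the patch):

	struct btrfs_multi_bio *multi = NULL;
	u64 length = len;
	int i, ret;

	ret = btrfs_map_block(map_tree, READ, logical, &length, &multi, 0);
	if (ret)
		return ret;
	for (i = 0; i < multi->num_stripes; i++)
		printk(KERN_INFO "copy %d: devid %llu physical %llu\n", i,
		       (unsigned long long)multi->stripes[i].dev->devid,
		       (unsigned long long)multi->stripes[i].physical);
	kfree(multi);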
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
new file mode 100644
index 000000000000..adb4b32a9d51
--- /dev/null
+++ b/fs/btrfs/xattr.c
@@ -0,0 +1,321 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/init.h>
20#include <linux/fs.h>
21#include <linux/slab.h>
22#include <linux/rwsem.h>
23#include <linux/xattr.h>
24#include "ctree.h"
25#include "btrfs_inode.h"
26#include "transaction.h"
27#include "xattr.h"
28#include "disk-io.h"
29
30
31ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
32 void *buffer, size_t size)
33{
34 struct btrfs_dir_item *di;
35 struct btrfs_root *root = BTRFS_I(inode)->root;
36 struct btrfs_path *path;
37 struct extent_buffer *leaf;
38 int ret = 0;
39 unsigned long data_ptr;
40
41 path = btrfs_alloc_path();
42 if (!path)
43 return -ENOMEM;
44
45 /* lookup the xattr by name */
46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
47 strlen(name), 0);
48 if (!di || IS_ERR(di)) {
49 ret = -ENODATA;
50 goto out;
51 }
52
53 leaf = path->nodes[0];
54 /* if size is 0, that means we want the size of the attr */
55 if (!size) {
56 ret = btrfs_dir_data_len(leaf, di);
57 goto out;
58 }
59
60 /* now get the data out of our dir_item */
61 if (btrfs_dir_data_len(leaf, di) > size) {
62 ret = -ERANGE;
63 goto out;
64 }
65 data_ptr = (unsigned long)((char *)(di + 1) +
66 btrfs_dir_name_len(leaf, di));
67 read_extent_buffer(leaf, buffer, data_ptr,
68 btrfs_dir_data_len(leaf, di));
69 ret = btrfs_dir_data_len(leaf, di);
70
71out:
72 btrfs_free_path(path);
73 return ret;
74}
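
Because a zero-size request returns the attribute's length, callers can size a buffer with two calls. A sketch built on that behavior (the kzalloc framing is illustrative, not from the patch):

	ssize_t len;
	void *buf;

	len = __btrfs_getxattr(inode, name, NULL, 0);	/* probe the length */
	if (len < 0)
		return len;
	buf = kzalloc(len, GFP_NOFS);
	if (!buf)
		return -ENOMEM;
	len = __btrfs_getxattr(inode, name, buf, len);	/* fetch the data */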
75
76int __btrfs_setxattr(struct inode *inode, const char *name,
77 const void *value, size_t size, int flags)
78{
79 struct btrfs_dir_item *di;
80 struct btrfs_root *root = BTRFS_I(inode)->root;
81 struct btrfs_trans_handle *trans;
82 struct btrfs_path *path;
83 int ret = 0, mod = 0;
84
85 path = btrfs_alloc_path();
86 if (!path)
87 return -ENOMEM;
88
89 trans = btrfs_start_transaction(root, 1);
90 btrfs_set_trans_block_group(trans, inode);
91
 92	/* first, let's see if we already have this xattr */
93 di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
94 strlen(name), -1);
95 if (IS_ERR(di)) {
96 ret = PTR_ERR(di);
97 goto out;
98 }
99
 100	/* ok, we already have this xattr; let's remove it */
101 if (di) {
 102		/* if the caller asked for create-only, exit */
103 if (flags & XATTR_CREATE) {
104 ret = -EEXIST;
105 goto out;
106 }
107
108 ret = btrfs_delete_one_dir_name(trans, root, path, di);
109 if (ret)
110 goto out;
111 btrfs_release_path(root, path);
112
113 /* if we don't have a value then we are removing the xattr */
114 if (!value) {
115 mod = 1;
116 goto out;
117 }
118 } else {
119 btrfs_release_path(root, path);
120
121 if (flags & XATTR_REPLACE) {
122 /* we couldn't find the attr to replace */
123 ret = -ENODATA;
124 goto out;
125 }
126 }
127
128 /* ok we have to create a completely new xattr */
129 ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
130 value, size, inode->i_ino);
131 if (ret)
132 goto out;
133 mod = 1;
134
135out:
136 if (mod) {
137 inode->i_ctime = CURRENT_TIME;
138 ret = btrfs_update_inode(trans, root, inode);
139 }
140
141 btrfs_end_transaction(trans, root);
142 btrfs_free_path(path);
143 return ret;
144}
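
The flag handling above gives __btrfs_setxattr() three behaviors: XATTR_CREATE fails with -EEXIST if the name already exists, XATTR_REPLACE fails with -ENODATA if it does not, and a NULL value removes an existing attribute. An illustrative caller (the name and value are hypothetical):

	/* create only; a second identical call would return -EEXIST */
	ret = __btrfs_setxattr(inode, "user.demo", "hi", 2, XATTR_CREATE);

	/* remove it again: a NULL value deletes the existing entry */
	if (!ret)
		ret = __btrfs_setxattr(inode, "user.demo", NULL, 0, 0);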
145
146ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
147{
148 struct btrfs_key key, found_key;
149 struct inode *inode = dentry->d_inode;
150 struct btrfs_root *root = BTRFS_I(inode)->root;
151 struct btrfs_path *path;
152 struct btrfs_item *item;
153 struct extent_buffer *leaf;
154 struct btrfs_dir_item *di;
155 int ret = 0, slot, advance;
156 size_t total_size = 0, size_left = size;
157 unsigned long name_ptr;
158 size_t name_len;
159 u32 nritems;
160
161 /*
162 * ok we want all objects associated with this id.
 163	 * NOTE: we set key.offset = 0 because we want to start with the
164 * first xattr that we find and walk forward
165 */
166 key.objectid = inode->i_ino;
167 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
168 key.offset = 0;
169
170 path = btrfs_alloc_path();
171 if (!path)
172 return -ENOMEM;
173 path->reada = 2;
174
175 /* search for our xattrs */
176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
177 if (ret < 0)
178 goto err;
179 ret = 0;
180 advance = 0;
181 while (1) {
182 leaf = path->nodes[0];
183 nritems = btrfs_header_nritems(leaf);
184 slot = path->slots[0];
185
186 /* this is where we start walking through the path */
187 if (advance || slot >= nritems) {
188 /*
189 * if we've reached the last slot in this leaf we need
190 * to go to the next leaf and reset everything
191 */
192 if (slot >= nritems-1) {
193 ret = btrfs_next_leaf(root, path);
194 if (ret)
195 break;
196 leaf = path->nodes[0];
197 nritems = btrfs_header_nritems(leaf);
198 slot = path->slots[0];
199 } else {
200 /*
201 * just walking through the slots on this leaf
202 */
203 slot++;
204 path->slots[0]++;
205 }
206 }
207 advance = 1;
208
209 item = btrfs_item_nr(leaf, slot);
210 btrfs_item_key_to_cpu(leaf, &found_key, slot);
211
212 /* check to make sure this item is what we want */
213 if (found_key.objectid != key.objectid)
214 break;
215 if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
216 break;
217
218 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
219
220 name_len = btrfs_dir_name_len(leaf, di);
221 total_size += name_len + 1;
222
223 /* we are just looking for how big our buffer needs to be */
224 if (!size)
225 continue;
226
227 if (!buffer || (name_len + 1) > size_left) {
228 ret = -ERANGE;
229 break;
230 }
231
232 name_ptr = (unsigned long)(di + 1);
233 read_extent_buffer(leaf, buffer, name_ptr, name_len);
234 buffer[name_len] = '\0';
235
236 size_left -= name_len + 1;
237 buffer += name_len + 1;
238 }
239 ret = total_size;
240
241err:
242 btrfs_free_path(path);
243
244 return ret;
245}
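
btrfs_listxattr() packs its result as consecutive NUL-terminated names and returns the total byte count (or, when called with size == 0, the buffer size required). A sketch of consuming that buffer (buf and buf_size are assumptions of the example):

	char *p = buf;
	ssize_t used;

	used = btrfs_listxattr(dentry, buf, buf_size);
	if (used < 0)
		return used;
	while (p < buf + used) {
		printk(KERN_INFO "xattr name: %s\n", p);
		p += strlen(p) + 1;	/* step over the trailing NUL */
	}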
246
247/*
248 * List of handlers for synthetic system.* attributes. All real ondisk
249 * attributes are handled directly.
250 */
251struct xattr_handler *btrfs_xattr_handlers[] = {
252#ifdef CONFIG_FS_POSIX_ACL
253 &btrfs_xattr_acl_access_handler,
254 &btrfs_xattr_acl_default_handler,
255#endif
256 NULL,
257};
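
The generic VFS code only consults this table once it is installed on the superblock, which btrfs does at mount time in super.c. Sketched here for orientation:

	/* in the superblock setup path (see super.c) */
	sb->s_xattr = btrfs_xattr_handlers;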
258
259/*
260 * Check if the attribute is in a supported namespace.
261 *
 262 * This is applied after the check for the synthetic attributes in the system
263 * namespace.
264 */
265static bool btrfs_is_valid_xattr(const char *name)
266{
267 return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
268 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
269 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
270 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
271}
272
273ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
274 void *buffer, size_t size)
275{
276 /*
277 * If this is a request for a synthetic attribute in the system.*
278 * namespace use the generic infrastructure to resolve a handler
279 * for it via sb->s_xattr.
280 */
281 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
282 return generic_getxattr(dentry, name, buffer, size);
283
284 if (!btrfs_is_valid_xattr(name))
285 return -EOPNOTSUPP;
286 return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
287}
288
289int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
290 size_t size, int flags)
291{
292 /*
293 * If this is a request for a synthetic attribute in the system.*
294 * namespace use the generic infrastructure to resolve a handler
295 * for it via sb->s_xattr.
296 */
297 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
298 return generic_setxattr(dentry, name, value, size, flags);
299
300 if (!btrfs_is_valid_xattr(name))
301 return -EOPNOTSUPP;
302
303 if (size == 0)
304 value = ""; /* empty EA, do not remove */
305 return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
306}
307
308int btrfs_removexattr(struct dentry *dentry, const char *name)
309{
310 /*
311 * If this is a request for a synthetic attribute in the system.*
312 * namespace use the generic infrastructure to resolve a handler
313 * for it via sb->s_xattr.
314 */
315 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
316 return generic_removexattr(dentry, name);
317
318 if (!btrfs_is_valid_xattr(name))
319 return -EOPNOTSUPP;
320 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
321}
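
From userspace, these entry points back the xattr system calls. A self-contained illustration (the mount point and file name are hypothetical):

	#include <sys/xattr.h>
	#include <stdio.h>

	int main(void)
	{
		char val[64];
		ssize_t n;

		/* set, read back, then remove a user.* attribute */
		if (setxattr("/mnt/f", "user.demo", "hello", 5, XATTR_CREATE))
			perror("setxattr");
		n = getxattr("/mnt/f", "user.demo", val, sizeof(val));
		if (n >= 0)
			printf("user.demo = %.*s\n", (int)n, val);
		if (removexattr("/mnt/f", "user.demo"))
			perror("removexattr");
		return 0;
	}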
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 000000000000..5b1d08f8e68d
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __XATTR__
20#define __XATTR__
21
22#include <linux/xattr.h>
23
24extern struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[];
27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size);
30extern int __btrfs_setxattr(struct inode *inode, const char *name,
31 const void *value, size_t size, int flags);
32
33extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
34 void *buffer, size_t size);
35extern int btrfs_setxattr(struct dentry *dentry, const char *name,
36 const void *value, size_t size, int flags);
37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38
39#endif /* __XATTR__ */