Diffstat (limited to 'fs/reiserfs')
-rw-r--r--  fs/reiserfs/Makefile           |    36
-rw-r--r--  fs/reiserfs/README             |   161
-rw-r--r--  fs/reiserfs/bitmap.c           |  1169
-rw-r--r--  fs/reiserfs/dir.c              |   275
-rw-r--r--  fs/reiserfs/do_balan.c         |  1597
-rw-r--r--  fs/reiserfs/file.c             |  1408
-rw-r--r--  fs/reiserfs/fix_node.c         |  2518
-rw-r--r--  fs/reiserfs/hashes.c           |   209
-rw-r--r--  fs/reiserfs/ibalance.c         |  1058
-rw-r--r--  fs/reiserfs/inode.c            |  2846
-rw-r--r--  fs/reiserfs/ioctl.c            |   151
-rw-r--r--  fs/reiserfs/item_ops.c         |   788
-rw-r--r--  fs/reiserfs/journal.c          |  3876
-rw-r--r--  fs/reiserfs/lbalance.c         |  1222
-rw-r--r--  fs/reiserfs/namei.c            |  1491
-rw-r--r--  fs/reiserfs/objectid.c         |   206
-rw-r--r--  fs/reiserfs/prints.c           |   727
-rw-r--r--  fs/reiserfs/procfs.c           |   664
-rw-r--r--  fs/reiserfs/resize.c           |   182
-rw-r--r--  fs/reiserfs/stree.c            |  2073
-rw-r--r--  fs/reiserfs/super.c            |  2148
-rw-r--r--  fs/reiserfs/tail_conversion.c  |   276
-rw-r--r--  fs/reiserfs/xattr.c            |  1450
-rw-r--r--  fs/reiserfs/xattr_acl.c        |   571
-rw-r--r--  fs/reiserfs/xattr_security.c   |    69
-rw-r--r--  fs/reiserfs/xattr_trusted.c    |    81
-rw-r--r--  fs/reiserfs/xattr_user.c       |    99
27 files changed, 27351 insertions, 0 deletions
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
new file mode 100644
index 000000000000..3a59309f3ca9
--- /dev/null
+++ b/fs/reiserfs/Makefile
@@ -0,0 +1,36 @@
#
# Makefile for the linux reiser-filesystem routines.
#

obj-$(CONFIG_REISERFS_FS) += reiserfs.o

reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
		 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
		 hashes.o tail_conversion.o journal.o resize.o \
		 item_ops.o ioctl.o procfs.o

ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
reiserfs-objs += xattr.o xattr_user.o xattr_trusted.o
endif

ifeq ($(CONFIG_REISERFS_FS_SECURITY),y)
reiserfs-objs += xattr_security.o
endif

ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y)
reiserfs-objs += xattr_acl.o
endif

# gcc -O2 (the kernel default) is overaggressive on ppc32 when many inline
# functions are used.  This causes the compiler to advance the stack
# pointer out of the available stack space, corrupting kernel space,
# and causing a panic.  Since this behavior only affects ppc32, this ifeq
# will work around it.  If any other architecture displays this behavior,
# add it here.
ifeq ($(CONFIG_PPC32),y)
EXTRA_CFLAGS := -O1
endif

TAGS:
	etags *.c

diff --git a/fs/reiserfs/README b/fs/reiserfs/README
new file mode 100644
index 000000000000..90e1670e4e6f
--- /dev/null
+++ b/fs/reiserfs/README
@@ -0,0 +1,161 @@
[LICENSING]

ReiserFS is hereby licensed under the GNU General
Public License version 2.

Source code files that contain the phrase "licensing governed by
reiserfs/README" are "governed files" throughout this file.  Governed
files are licensed under the GPL.  The portions of them owned by Hans
Reiser, or authorized to be licensed by him, have been in the past,
and likely will be in the future, licensed to other parties under
other licenses.  If you add your code to governed files, and don't
want it to be owned by Hans Reiser, put your copyright label on that
code so the poor blight and his customers can keep things straight.
All portions of governed files not labeled otherwise are owned by Hans
Reiser, and by adding your code to it, widely distributing it to
others or sending us a patch, and leaving the sentence in stating that
licensing is governed by the statement in this file, you accept this.
It will be a kindness if you identify whether Hans Reiser is allowed
to license code labeled as owned by you on your behalf other than
under the GPL, because he wants to know if it is okay to do so and put
a check in the mail to you (for non-trivial improvements) when he
makes his next sale.  He makes no guarantees as to the amount if any,
though he feels motivated to motivate contributors, and you can surely
discuss this with him before or after contributing.  You have the
right to decline to allow him to license your code contribution other
than under the GPL.

Further licensing options are available for commercial and/or other
interests directly from Hans Reiser: hans@reiser.to.  If you interpret
the GPL as not allowing those additional licensing options, you read
it wrongly, and Richard Stallman agrees with me, when carefully read
you can see that those restrictions on additional terms do not apply
to the owner of the copyright, and my interpretation of this shall
govern for this license.

Finally, nothing in this license shall be interpreted to allow you to
fail to fairly credit me, or to remove my credits, without my
permission, unless you are an end user not redistributing to others.
If you have doubts about how to properly do that, or about what is
fair, ask.  (Last I spoke with him Richard was contemplating how best
to address the fair crediting issue in the next GPL version.)

[END LICENSING]

Reiserfs is a file system based on balanced tree algorithms, which is
described at http://devlinux.com/namesys.

Stop reading here.  Go there, then return.

Send bug reports to yura@namesys.botik.ru.

mkreiserfs and other utilities are in reiserfs/utils, or wherever your
Linux provider put them.  There is some disagreement about how useful
it is for users to get their fsck and mkreiserfs out of sync with the
version of reiserfs that is in their kernel, with many important
distributors wanting them out of sync. :-)  Please try to remember to
recompile and reinstall fsck and mkreiserfs with every update of
reiserfs; this is a common source of confusion.  Note that some of the
utilities cannot be compiled without accessing the balancing code
which is in the kernel code, and relocating the utilities may require
you to specify where that code can be found.

Yes, if you update your reiserfs kernel module you do have to
recompile your kernel, most of the time.  The errors you get will be
quite cryptic if you forget to do so.

Real users, as opposed to folks who want to hack and then understand
what went wrong, will want REISERFS_CHECK off.

Hideous Commercial Pitch: Spread your development costs across other OS
vendors.  Select from the best in the world, not the best in your
building, by buying from third party OS component suppliers.  Leverage
the software component development power of the internet.  Be the most
aggressive in taking advantage of the commercial possibilities of
decentralized internet development, and add value through your branded
integration that you sell as an operating system.  Let your competitors
be the ones to compete against the entire internet by themselves.  Be
hip, get with the new economic trend, before your competitors do.  Send
email to hans@reiser.to.

To understand the code, after reading the website, start reading the
code by reading reiserfs_fs.h first.

Hans Reiser was the project initiator, primary architect, source of all
funding for the first 5.5 years, and one of the programmers.  He owns
the copyright.

Vladimir Saveljev was one of the programmers, and he worked long hours
writing the cleanest code.  He always made the effort to be the best he
could be, and to make his code the best that it could be.  What resulted
was quite remarkable.  I don't think that money can ever motivate someone
to work the way he did, he is one of the most selfless men I know.

Yura helps with benchmarking, coding hashes, and block pre-allocation
code.

Anatoly Pinchuk is a former member of our team who worked closely with
Vladimir throughout the project's development.  He wrote a quite
substantial portion of the total code.  He realized that there was a
space problem with packing tails of files for files larger than a node
that start on a node aligned boundary (there are reasons to want to node
align files), and he invented and implemented indirect items and
unformatted nodes as the solution.

Konstantin Shvachko, with the help of the Russian version of a VC,
tried to put me in a position where I was forced into giving control
of the project to him.  (Fortunately, as the person paying the money
for all salaries from my dayjob I owned all copyrights, and you can't
really force takeovers of sole proprietorships.)  This was something
curious, because he never really understood the value of our project,
why we should do what we do, or why innovation was possible in
general, but he was sure that he ought to be controlling it.  Every
innovation had to be forced past him while he was with us.  He added
two years to the time required to complete reiserfs, and was a net
loss for me.  Mikhail Gilula was a brilliant innovator who also left
in a destructive way that erased the value of his contributions, and
that he was shown much generosity just makes it more painful.

Grigory Zaigralin was an extremely effective system administrator for
our group.

Igor Krasheninnikov was wonderful at hardware procurement, repair, and
network installation.

Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a
textbook he got the algorithm from in the code.  Note that his analysis
of how we could use the hashing code in making 32 bit NFS cookies work
was probably more important than the actual algorithm.  Colin Plumb also
contributed to it.

Chris Mason dived right into our code, and in just a few months produced
the journaling code that dramatically increased the value of ReiserFS.
He is just an amazing programmer.

Igor Zagorovsky is writing much of the new item handler and extent code
for our next major release.

Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
resizer, and is hard at work on implementing allocate on flush.  SGI
implemented allocate on flush before us for XFS, and generously took
the time to convince me we should do it also.  They are great people,
and a great company.

Yuri Shevchuk and Nikita Danilov are doing squid cache optimization.

Vitaly Fertman is doing fsck.

Jeff Mahoney, of SuSE, contributed a few cleanup fixes, most notably
the endian safe patches which allow ReiserFS to run on any platform
supported by the Linux kernel.

SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the
Alpha PC Company made it possible for me to not have a day job
anymore, and to dramatically increase our staffing.  Ecila funded
hypertext feature development, MP3.com funded journaling, SuSE funded
core development, IntegratedLinux.com funded squid web cache
appliances, bigstorage.com funded HSM, and the Alpha PC Company funded
the Alpha port.  Many of these tasks were helped by sponsors other
than the ones just named.  SuSE has helped in much more than just
funding....

diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
new file mode 100644
index 000000000000..a4e2ed544bbe
--- /dev/null
+++ b/fs/reiserfs/bitmap.c
@@ -0,0 +1,1169 @@
/*
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */
/* Reiserfs block (de)allocator, bitmap-based. */

#include <linux/config.h>
#include <linux/time.h>
#include <linux/reiserfs_fs.h>
#include <linux/errno.h>
#include <linux/buffer_head.h>
#include <linux/kernel.h>
#include <linux/pagemap.h>
#include <linux/reiserfs_fs_sb.h>
#include <linux/reiserfs_fs_i.h>
#include <linux/quotaops.h>

#define PREALLOCATION_SIZE 9

/* different reiserfs block allocator options */

#define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits)

#define _ALLOC_concentrating_formatted_nodes 0
#define _ALLOC_displacing_large_files 1
#define _ALLOC_displacing_new_packing_localities 2
#define _ALLOC_old_hashed_relocation 3
#define _ALLOC_new_hashed_relocation 4
#define _ALLOC_skip_busy 5
#define _ALLOC_displace_based_on_dirid 6
#define _ALLOC_hashed_formatted_nodes 7
#define _ALLOC_old_way 8
#define _ALLOC_hundredth_slices 9
#define _ALLOC_dirid_groups 10
#define _ALLOC_oid_groups 11
#define _ALLOC_packing_groups 12

#define concentrating_formatted_nodes(s) test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s))
#define displacing_large_files(s) test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s))
#define displacing_new_packing_localities(s) test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s))

#define SET_OPTION(optname) \
    do { \
        reiserfs_warning(s, "reiserfs: option \"%s\" is set", #optname); \
        set_bit(_ALLOC_ ## optname, &SB_ALLOC_OPTS(s)); \
    } while (0)
#define TEST_OPTION(optname, s) \
    test_bit(_ALLOC_ ## optname, &SB_ALLOC_OPTS(s))

static inline void get_bit_address (struct super_block * s,
                                    b_blocknr_t block, int * bmap_nr, int * offset)
{
    /* It is in the bitmap block number equal to the block
     * number divided by the number of bits in a block. */
    *bmap_nr = block / (s->s_blocksize << 3);
    /* Within that bitmap block it is located at bit offset *offset. */
    *offset = block & ((s->s_blocksize << 3) - 1);
    return;
}
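
/*
 * Worked example (editor's illustration, not part of the original source):
 * with a 4 KiB block size a bitmap block covers 4096 << 3 == 32768 blocks,
 * so block 100000 lands in bitmap block 100000 / 32768 == 3 at bit offset
 * 100000 & 32767 == 1696.
 */
#if 0	/* never built; kept purely as documentation */
static void get_bit_address_example (struct super_block * s)
{
    int bmap_nr, offset;

    /* assumes s->s_blocksize == 4096 */
    get_bit_address (s, 100000, &bmap_nr, &offset);
    /* here bmap_nr == 3 and offset == 1696 */
}
#endif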

#ifdef CONFIG_REISERFS_CHECK
int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value)
{
    int i, j;

    if (block == 0 || block >= SB_BLOCK_COUNT (s)) {
        reiserfs_warning (s, "vs-4010: is_reusable: block number is out of range %lu (%u)",
                          block, SB_BLOCK_COUNT (s));
        return 0;
    }

    /* it can't be one of the bitmap blocks */
    for (i = 0; i < SB_BMAP_NR (s); i ++)
        if (block == SB_AP_BITMAP (s)[i].bh->b_blocknr) {
            reiserfs_warning (s, "vs-4020: is_reusable: "
                              "bitmap block %lu(%u) can't be freed or reused",
                              block, SB_BMAP_NR (s));
            return 0;
        }

    get_bit_address (s, block, &i, &j);

    if (i >= SB_BMAP_NR (s)) {
        reiserfs_warning (s, "vs-4030: is_reusable: there are not that many bitmap blocks: "
                          "block=%lu, bitmap_nr=%d", block, i);
        return 0;
    }

    if ((bit_value == 0 &&
         reiserfs_test_le_bit(j, SB_AP_BITMAP(s)[i].bh->b_data)) ||
        (bit_value == 1 &&
         reiserfs_test_le_bit(j, SB_AP_BITMAP (s)[i].bh->b_data) == 0)) {
        reiserfs_warning (s, "vs-4040: is_reusable: corresponding bit of block %lu does not "
                          "match required value (i==%d, j==%d) test_bit==%d",
                          block, i, j, reiserfs_test_le_bit (j, SB_AP_BITMAP (s)[i].bh->b_data));

        return 0;
    }

    if (bit_value == 0 && block == SB_ROOT_BLOCK (s)) {
        reiserfs_warning (s, "vs-4050: is_reusable: this is root block (%u), "
                          "it must be busy", SB_ROOT_BLOCK (s));
        return 0;
    }

    return 1;
}
#endif /* CONFIG_REISERFS_CHECK */

/* searches in journal structures for a given block number (bmap, off). If the block
   is found in the reiserfs journal, it suggests the next free block candidate to test. */
static inline int is_block_in_journal (struct super_block * s, int bmap, int off, int *next)
{
    b_blocknr_t tmp;

    if (reiserfs_in_journal (s, bmap, off, 1, &tmp)) {
        if (tmp) { /* hint supplied */
            *next = tmp;
            PROC_INFO_INC( s, scan_bitmap.in_journal_hint );
        } else {
            (*next) = off + 1; /* inc offset to avoid looping. */
            PROC_INFO_INC( s, scan_bitmap.in_journal_nohint );
        }
        PROC_INFO_INC( s, scan_bitmap.retry );
        return 1;
    }
    return 0;
}

/* it searches for a window of zero bits with given minimum and maximum lengths in one bitmap
 * block; */
static int scan_bitmap_block (struct reiserfs_transaction_handle *th,
                              int bmap_n, int *beg, int boundary, int min, int max, int unfm)
{
    struct super_block *s = th->t_super;
    struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n];
    int end, next;
    int org = *beg;

    BUG_ON (!th->t_trans_id);

    RFALSE(bmap_n >= SB_BMAP_NR (s), "Bitmap %d is out of range (0..%d)", bmap_n, SB_BMAP_NR (s) - 1);
    PROC_INFO_INC( s, scan_bitmap.bmap );
/* this is unclear and lacks comments, explain how journal bitmaps
   work here for the reader.  Convey a sense of the design here.  What
   is a window? */
/* - I mean `a window of zero bits' as in the description of this function - Zam. */

    if ( !bi ) {
        reiserfs_warning (s, "NULL bitmap info pointer for bitmap %d", bmap_n);
        return 0;
    }
    if (buffer_locked (bi->bh)) {
        PROC_INFO_INC( s, scan_bitmap.wait );
        __wait_on_buffer (bi->bh);
    }

    while (1) {
    cont:
        if (bi->free_count < min)
            return 0; // No free blocks in this bitmap

        /* search for the first zero bit -- the beginning of a window */
        *beg = reiserfs_find_next_zero_le_bit
            ((unsigned long*)(bi->bh->b_data), boundary, *beg);

        if (*beg + min > boundary) { /* search for a zero bit failed, or the rest of the bitmap block
                                      * cannot contain a zero window of minimum size */
            return 0;
        }

        if (unfm && is_block_in_journal(s, bmap_n, *beg, beg))
            continue;
        /* first zero bit found; we check the next bits */
        for (end = *beg + 1;; end ++) {
            if (end >= *beg + max || end >= boundary || reiserfs_test_le_bit (end, bi->bh->b_data)) {
                next = end;
                break;
            }
            /* finding the other end of the zero bit window requires looking into journal structures (in
             * case of searching for free blocks for unformatted nodes) */
            if (unfm && is_block_in_journal(s, bmap_n, end, &next))
                break;
        }

        /* now (*beg) points to the beginning of the zero bits window,
         * (end) points to one bit after the window end */
        if (end - *beg >= min) { /* it seems we have found a window of proper size */
            int i;
            reiserfs_prepare_for_journal (s, bi->bh, 1);
            /* try to set all blocks used, checking whether they are still free */
            for (i = *beg; i < end; i++) {
                /* It seems that we should not check in journal again. */
                if (reiserfs_test_and_set_le_bit (i, bi->bh->b_data)) {
                    /* bit was set by another process
                     * while we slept in prepare_for_journal() */
                    PROC_INFO_INC( s, scan_bitmap.stolen );
                    if (i >= *beg + min) { /* we can continue with a smaller set of allocated blocks,
                                            * if the length of this set is at least `min' */
                        end = i;
                        break;
                    }
                    /* otherwise we clear all bits that were set ... */
                    while (--i >= *beg)
                        reiserfs_test_and_clear_le_bit (i, bi->bh->b_data);
                    reiserfs_restore_prepared_buffer (s, bi->bh);
                    *beg = org;
                    /* ... and search again in the current block from the beginning */
                    goto cont;
                }
            }
            bi->free_count -= (end - *beg);
            journal_mark_dirty (th, s, bi->bh);

            /* free block count calculation */
            reiserfs_prepare_for_journal (s, SB_BUFFER_WITH_SB(s), 1);
            PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg));
            journal_mark_dirty (th, s, SB_BUFFER_WITH_SB(s));

            return end - (*beg);
        } else {
            *beg = next;
        }
    }
}
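
/*
 * Minimal userspace sketch (editor's illustration, not the kernel code
 * path) of the zero-window search above, ignoring journalling, locking
 * and the retry-on-stolen-bit handling: look for a run of at least `min'
 * and at most `max' clear bits in a byte-array bitmap of `bits' bits.
 */
#if 0	/* never built; kept purely as documentation */
static int toy_find_zero_window (const unsigned char *map, int bits,
                                 int min, int max, int *beg)
{
    int start = *beg, end;

    while (start + min <= bits) {
        /* skip set bits to find the start of a candidate window */
        while (start < bits && (map[start >> 3] & (1 << (start & 7))))
            start++;
        /* extend the window while the bits stay clear */
        for (end = start;
             end < bits && end < start + max &&
             !(map[end >> 3] & (1 << (end & 7)));
             end++)
            ;
        if (end - start >= min) {
            *beg = start;
            return end - start;	/* length of the window found */
        }
        start = end + 1;	/* window too small, restart past it */
    }
    return 0;			/* no window of the minimum size */
}
#endif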

static int bmap_hash_id(struct super_block *s, u32 id) {
    char * hash_in = NULL;
    unsigned long hash;
    unsigned bm;

    if (id <= 2) {
        bm = 1;
    } else {
        hash_in = (char *)(&id);
        hash = keyed_hash(hash_in, 4);
        bm = hash % SB_BMAP_NR(s);
        if (!bm)
            bm = 1;
    }
    /* this can only be true when SB_BMAP_NR = 1 */
    if (bm >= SB_BMAP_NR(s))
        bm = 0;
    return bm;
}

/*
 * hashes the id and then returns 1 if the block group for the
 * corresponding hash is busy, i.e. 60% or less of it is still free
 */
static inline int block_group_used(struct super_block *s, u32 id) {
    int bm;
    bm = bmap_hash_id(s, id);
    if (SB_AP_BITMAP(s)[bm].free_count > ((s->s_blocksize << 3) * 60 / 100)) {
        return 0;
    }
    return 1;
}
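
/*
 * Worked example (editor's illustration only): with 4 KiB blocks a bitmap
 * group spans 4096 << 3 == 32768 blocks, so the group counts as used once
 * at most 32768 * 60 / 100 == 19660 of its blocks remain free.
 */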

/*
 * the packing is returned in disk byte order
 */
u32 reiserfs_choose_packing(struct inode *dir) {
    u32 packing;
    if (TEST_OPTION(packing_groups, dir->i_sb)) {
        u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id);
        /*
         * some versions of reiserfsck expect packing locality 1 to be
         * special
         */
        if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir))
            packing = INODE_PKEY(dir)->k_objectid;
        else
            packing = INODE_PKEY(dir)->k_dir_id;
    } else
        packing = INODE_PKEY(dir)->k_objectid;
    return packing;
}

/* Tries to find a contiguous zero bit window (of the given size) in the given
 * region of the bitmap and place new blocks there. Returns the number of
 * allocated blocks. */
static int scan_bitmap (struct reiserfs_transaction_handle *th,
                        b_blocknr_t *start, b_blocknr_t finish,
                        int min, int max, int unfm, unsigned long file_block)
{
    int nr_allocated = 0;
    struct super_block * s = th->t_super;
    /* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr
     * - Hans, it is not a block number - Zam. */

    int bm, off;
    int end_bm, end_off;
    int off_max = s->s_blocksize << 3;

    BUG_ON (!th->t_trans_id);

    PROC_INFO_INC( s, scan_bitmap.call );
    if ( SB_FREE_BLOCKS(s) <= 0)
        return 0; // No point in looking for more free blocks

    get_bit_address (s, *start, &bm, &off);
    get_bit_address (s, finish, &end_bm, &end_off);
    if (bm > SB_BMAP_NR(s))
        return 0;
    if (end_bm > SB_BMAP_NR(s))
        end_bm = SB_BMAP_NR(s);

    /* When a bitmap block is more than 10% free, anyone can allocate from it.
     * Below that, only files that already use that bitmap block are allowed.
     * Once the filesystem as a whole drops below 5% free (SB_BLOCK_COUNT/20),
     * this restriction is lifted.
     *
     * We do this so that files that grow later still have space close to
     * their original allocation.  This improves locality, and presumably
     * performance as a result.
     *
     * This is only an allocation policy and does not make up for getting a
     * bad hint.  Decent hinting must be implemented for this to work well.
     */
    if ( TEST_OPTION(skip_busy, s) && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s)/20 ) {
        for (; bm < end_bm; bm++, off = 0) {
            if ( (off && (!unfm || (file_block != 0))) || SB_AP_BITMAP(s)[bm].free_count > (s->s_blocksize << 3) / 10 )
                nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
            if (nr_allocated)
                goto ret;
        }
        /* we know from above that start is a reasonable number */
        get_bit_address (s, *start, &bm, &off);
    }

    for (; bm < end_bm; bm++, off = 0) {
        nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
        if (nr_allocated)
            goto ret;
    }

    nr_allocated = scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm);

ret:
    *start = bm * off_max + off;
    return nr_allocated;

}

static void _reiserfs_free_block (struct reiserfs_transaction_handle *th,
                                  struct inode *inode, b_blocknr_t block,
                                  int for_unformatted)
{
    struct super_block * s = th->t_super;
    struct reiserfs_super_block * rs;
    struct buffer_head * sbh;
    struct reiserfs_bitmap_info *apbi;
    int nr, offset;

    BUG_ON (!th->t_trans_id);

    PROC_INFO_INC( s, free_block );

    rs = SB_DISK_SUPER_BLOCK (s);
    sbh = SB_BUFFER_WITH_SB (s);
    apbi = SB_AP_BITMAP(s);

    get_bit_address (s, block, &nr, &offset);

    if (nr >= sb_bmap_nr (rs)) {
        reiserfs_warning (s, "vs-4075: reiserfs_free_block: "
                          "block %lu is out of range on %s",
                          block, reiserfs_bdevname (s));
        return;
    }

    reiserfs_prepare_for_journal(s, apbi[nr].bh, 1);

    /* clear the bit for the given block in the bitmap */
    if (!reiserfs_test_and_clear_le_bit (offset, apbi[nr].bh->b_data)) {
        reiserfs_warning (s, "vs-4080: reiserfs_free_block: "
                          "free_block (%s:%lu)[dev:blocknr]: bit already cleared",
                          reiserfs_bdevname (s), block);
    }
    apbi[nr].free_count ++;
    journal_mark_dirty (th, s, apbi[nr].bh);

    reiserfs_prepare_for_journal(s, sbh, 1);
    /* update super block */
    set_sb_free_blocks( rs, sb_free_blocks(rs) + 1 );

    journal_mark_dirty (th, s, sbh);
    if (for_unformatted)
        DQUOT_FREE_BLOCK_NODIRTY(inode, 1);
}

void reiserfs_free_block (struct reiserfs_transaction_handle *th,
                          struct inode *inode, b_blocknr_t block,
                          int for_unformatted)
{
    struct super_block * s = th->t_super;

    BUG_ON (!th->t_trans_id);

    RFALSE(!s, "vs-4061: trying to free block on nonexistent device");
    RFALSE(is_reusable (s, block, 1) == 0, "vs-4071: can not free such block");
    /* mark it before we clear it, just in case */
    journal_mark_freed(th, s, block);
    _reiserfs_free_block(th, inode, block, for_unformatted);
}

/* preallocated blocks don't need to be run through journal_mark_freed */
static void reiserfs_free_prealloc_block (struct reiserfs_transaction_handle *th,
                                          struct inode *inode, b_blocknr_t block) {
    RFALSE(!th->t_super, "vs-4060: trying to free block on nonexistent device");
    RFALSE(is_reusable (th->t_super, block, 1) == 0, "vs-4070: can not free such block");
    BUG_ON (!th->t_trans_id);
    _reiserfs_free_block(th, inode, block, 1);
}

static void __discard_prealloc (struct reiserfs_transaction_handle * th,
                                struct reiserfs_inode_info *ei)
{
    unsigned long save = ei->i_prealloc_block;
    int dirty = 0;
    struct inode *inode = &ei->vfs_inode;
    BUG_ON (!th->t_trans_id);
#ifdef CONFIG_REISERFS_CHECK
    if (ei->i_prealloc_count < 0)
        reiserfs_warning (th->t_super, "zam-4001:%s: inode has a negative prealloc block count.", __FUNCTION__ );
#endif
    while (ei->i_prealloc_count > 0) {
        reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block);
        ei->i_prealloc_block++;
        ei->i_prealloc_count --;
        dirty = 1;
    }
    if (dirty)
        reiserfs_update_sd(th, inode);
    ei->i_prealloc_block = save;
    list_del_init(&(ei->i_prealloc_list));
}

/* FIXME: It should be an inline function */
void reiserfs_discard_prealloc (struct reiserfs_transaction_handle *th,
                                struct inode *inode)
{
    struct reiserfs_inode_info *ei = REISERFS_I(inode);
    BUG_ON (!th->t_trans_id);
    if (ei->i_prealloc_count)
        __discard_prealloc(th, ei);
}

void reiserfs_discard_all_prealloc (struct reiserfs_transaction_handle *th)
{
    struct list_head * plist = &SB_JOURNAL(th->t_super)->j_prealloc_list;

    BUG_ON (!th->t_trans_id);

    while (!list_empty(plist)) {
        struct reiserfs_inode_info *ei;
        ei = list_entry(plist->next, struct reiserfs_inode_info, i_prealloc_list);
#ifdef CONFIG_REISERFS_CHECK
        if (!ei->i_prealloc_count) {
            reiserfs_warning (th->t_super, "zam-4001:%s: inode is in the prealloc list but has no preallocated blocks.", __FUNCTION__);
        }
#endif
        __discard_prealloc(th, ei);
    }
}

void reiserfs_init_alloc_options (struct super_block *s)
{
    set_bit (_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
    set_bit (_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
    set_bit (_ALLOC_packing_groups, &SB_ALLOC_OPTS(s));
}

/* block allocator related options are parsed here */
int reiserfs_parse_alloc_options(struct super_block * s, char * options)
{
    char * this_char, * value;

    REISERFS_SB(s)->s_alloc_options.bits = 0; /* clear default settings */

    while ( (this_char = strsep (&options, ":")) != NULL ) {
        if ((value = strchr (this_char, '=')) != NULL)
            *value++ = 0;

        if (!strcmp(this_char, "concentrating_formatted_nodes")) {
            int temp;
            SET_OPTION(concentrating_formatted_nodes);
            temp = (value && *value) ? simple_strtoul (value, &value, 0) : 10;
            if (temp <= 0 || temp > 100) {
                REISERFS_SB(s)->s_alloc_options.border = 10;
            } else {
                REISERFS_SB(s)->s_alloc_options.border = 100 / temp;
            }
            continue;
        }
        if (!strcmp(this_char, "displacing_large_files")) {
            SET_OPTION(displacing_large_files);
            REISERFS_SB(s)->s_alloc_options.large_file_size =
                (value && *value) ? simple_strtoul (value, &value, 0) : 16;
            continue;
        }
        if (!strcmp(this_char, "displacing_new_packing_localities")) {
            SET_OPTION(displacing_new_packing_localities);
            continue;
        }

        if (!strcmp(this_char, "old_hashed_relocation")) {
            SET_OPTION(old_hashed_relocation);
            continue;
        }

        if (!strcmp(this_char, "new_hashed_relocation")) {
            SET_OPTION(new_hashed_relocation);
            continue;
        }

        if (!strcmp(this_char, "dirid_groups")) {
            SET_OPTION(dirid_groups);
            continue;
        }
        if (!strcmp(this_char, "oid_groups")) {
            SET_OPTION(oid_groups);
            continue;
        }
        if (!strcmp(this_char, "packing_groups")) {
            SET_OPTION(packing_groups);
            continue;
        }
        if (!strcmp(this_char, "hashed_formatted_nodes")) {
            SET_OPTION(hashed_formatted_nodes);
            continue;
        }

        if (!strcmp(this_char, "skip_busy")) {
            SET_OPTION(skip_busy);
            continue;
        }

        if (!strcmp(this_char, "hundredth_slices")) {
            SET_OPTION(hundredth_slices);
            continue;
        }

        if (!strcmp(this_char, "old_way")) {
            SET_OPTION(old_way);
            continue;
        }

        if (!strcmp(this_char, "displace_based_on_dirid")) {
            SET_OPTION(displace_based_on_dirid);
            continue;
        }

        if (!strcmp(this_char, "preallocmin")) {
            REISERFS_SB(s)->s_alloc_options.preallocmin =
                (value && *value) ? simple_strtoul (value, &value, 0) : 4;
            continue;
        }

        if (!strcmp(this_char, "preallocsize")) {
            REISERFS_SB(s)->s_alloc_options.preallocsize =
                (value && *value) ? simple_strtoul (value, &value, 0) : PREALLOCATION_SIZE;
            continue;
        }

        reiserfs_warning (s, "zam-4001: %s : unknown option - %s",
                          __FUNCTION__, this_char);
        return 1;
    }

    reiserfs_warning (s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
    return 0;
}
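
/*
 * Usage sketch (editor's note, not in the original source): assuming the
 * "alloc=" mount option in super.c hands its value to this parser, the
 * options are colon separated and take values after '=', e.g.
 *
 *	mount -t reiserfs -o alloc=skip_busy:preallocsize=17 /dev/sda2 /mnt
 *
 * (device and mount point are illustrative) would set the skip_busy
 * policy bit and a preallocation window of 17 blocks.
 */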

static inline void new_hashed_relocation (reiserfs_blocknr_hint_t * hint)
{
    char * hash_in;
    if (hint->formatted_node) {
        hash_in = (char*)&hint->key.k_dir_id;
    } else {
        if (!hint->inode) {
            //hint->search_start = hint->beg;
            hash_in = (char*)&hint->key.k_dir_id;
        } else
            if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
                hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
            else
                hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid);
    }

    hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
}
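
/*
 * Worked example (editor's illustration only): the 4-byte id is hashed and
 * reduced modulo the search window, so with hint->beg == 0, hint->end ==
 * 100000 and keyed_hash() returning 123456789, the allocator would start
 * searching at 123456789 % 100000 == 56789.
 */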

/*
 * Relocation based on dirid: hash the directory id into a bitmap block
 * group and place the file's blocks there.  Formatted nodes are unaffected;
 * a separate policy covers them.
 */
static void
dirid_groups (reiserfs_blocknr_hint_t *hint)
{
    unsigned long hash;
    __u32 dirid = 0;
    int bm = 0;
    struct super_block *sb = hint->th->t_super;
    if (hint->inode)
        dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
    else if (hint->formatted_node)
        dirid = hint->key.k_dir_id;

    if (dirid) {
        bm = bmap_hash_id(sb, dirid);
        hash = bm * (sb->s_blocksize << 3);
        /* give a portion of the block group to metadata */
        if (hint->inode)
            hash += sb->s_blocksize/2;
        hint->search_start = hash;
    }
}

/*
 * Relocation based on oid: hash the object id into a bitmap block
 * group.  Formatted nodes are unaffected; a separate policy covers them.
 */
static void
oid_groups (reiserfs_blocknr_hint_t *hint)
{
    if (hint->inode) {
        unsigned long hash;
        __u32 oid;
        __u32 dirid;
        int bm;

        dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);

        /* keep the root dir and its first set of subdirs close to
         * the start of the disk
         */
        if (dirid <= 2)
            hash = (hint->inode->i_sb->s_blocksize << 3);
        else {
            oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
            bm = bmap_hash_id(hint->inode->i_sb, oid);
            hash = bm * (hint->inode->i_sb->s_blocksize << 3);
        }
        hint->search_start = hash;
    }
}

/* returns 1 if it finds an indirect item and gets valid hint info
 * from it, otherwise 0
 */
static int get_left_neighbor(reiserfs_blocknr_hint_t *hint)
{
    struct path * path;
    struct buffer_head * bh;
    struct item_head * ih;
    int pos_in_item;
    __u32 * item;
    int ret = 0;

    if (!hint->path) /* reiserfs code can call this function without a pointer to the path
                      * structure supplied; then we rely on the supplied search_start */
        return 0;

    path = hint->path;
    bh = get_last_bh(path);
    RFALSE( !bh, "green-4002: Illegal path specified to get_left_neighbor");
    ih = get_ih(path);
    pos_in_item = path->pos_in_item;
    item = get_item (path);

    hint->search_start = bh->b_blocknr;

    if (!hint->formatted_node && is_indirect_le_ih (ih)) {
        /* for an indirect item: go left and look for the first non-hole entry
           in the indirect item */
        if (pos_in_item == I_UNFM_NUM (ih))
            pos_in_item--;
//	pos_in_item = I_UNFM_NUM (ih) - 1;
        while (pos_in_item >= 0) {
            int t = get_block_num(item, pos_in_item);
            if (t) {
                hint->search_start = t;
                ret = 1;
                break;
            }
            pos_in_item --;
        }
    }

    /* does the result value fit into the specified region? */
    return ret;
}

/* should be: if a formatted node, then try to put it on the first part of the
   device, specified as a percentage via mount option, else try to put it on the
   last part of the device.  This is not to say it is good code to do so,
   but the effect should be measured. */
static inline void set_border_in_hint(struct super_block *s, reiserfs_blocknr_hint_t *hint)
{
    b_blocknr_t border = SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border;

    if (hint->formatted_node)
        hint->end = border - 1;
    else
        hint->beg = border;
}

static inline void displace_large_file(reiserfs_blocknr_hint_t *hint)
{
    if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
        hint->search_start = hint->beg + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id), 4) % (hint->end - hint->beg);
    else
        hint->search_start = hint->beg + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid), 4) % (hint->end - hint->beg);
}

static inline void hash_formatted_node(reiserfs_blocknr_hint_t *hint)
{
    char * hash_in;

    if (!hint->inode)
        hash_in = (char*)&hint->key.k_dir_id;
    else if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
        hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
    else
        hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid);

    hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
}

static inline int this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t *hint)
{
    return hint->block == REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size;
}

#ifdef DISPLACE_NEW_PACKING_LOCALITIES
static inline void displace_new_packing_locality (reiserfs_blocknr_hint_t *hint)
{
    struct reiserfs_key * key = &hint->key;

    hint->th->displace_new_blocks = 0;
    hint->search_start = hint->beg + keyed_hash((char*)(&key->k_objectid), 4) % (hint->end - hint->beg);
}
#endif

static inline int old_hashed_relocation (reiserfs_blocknr_hint_t * hint)
{
    b_blocknr_t border;
    u32 hash_in;

    if (hint->formatted_node || hint->inode == NULL) {
        return 0;
    }

    hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id);
    border = hint->beg + (u32) keyed_hash(((char *) (&hash_in)), 4) % (hint->end - hint->beg - 1);
    if (border > hint->search_start)
        hint->search_start = border;

    return 1;
}

static inline int old_way (reiserfs_blocknr_hint_t * hint)
{
    b_blocknr_t border;

    if (hint->formatted_node || hint->inode == NULL) {
        return 0;
    }

    border = hint->beg + le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end - hint->beg);
    if (border > hint->search_start)
        hint->search_start = border;

    return 1;
}

static inline void hundredth_slices (reiserfs_blocknr_hint_t * hint)
{
    struct reiserfs_key * key = &hint->key;
    b_blocknr_t slice_start;

    slice_start = (keyed_hash((char*)(&key->k_dir_id), 4) % 100) * (hint->end / 100);
    if ( slice_start > hint->search_start || slice_start + (hint->end / 100) <= hint->search_start) {
        hint->search_start = slice_start;
    }
}
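
/*
 * Worked example (editor's illustration only): with hint->end == 200000
 * each slice covers 200000 / 100 == 2000 blocks; a key whose hash falls
 * in slice 42 moves the search start to 42 * 2000 == 84000 unless it
 * already lies inside [84000, 86000).
 */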

static void determine_search_start(reiserfs_blocknr_hint_t *hint,
                                   int amount_needed)
{
    struct super_block *s = hint->th->t_super;
    int unfm_hint;

    hint->beg = 0;
    hint->end = SB_BLOCK_COUNT(s) - 1;

    /* This is the former border algorithm.  Now with a tunable border offset */
    if (concentrating_formatted_nodes(s))
        set_border_in_hint(s, hint);

#ifdef DISPLACE_NEW_PACKING_LOCALITIES
    /* whenever we create a new directory, we displace it.  At first we will
       hash for location, later we might look for a moderately empty place for
       it */
    if (displacing_new_packing_localities(s)
        && hint->th->displace_new_blocks) {
        displace_new_packing_locality(hint);

        /* we do not continue determine_search_start,
         * if a new packing locality is being displaced */
        return;
    }
#endif

    /* all persons should feel encouraged to add more special cases here and
     * test them */

    if (displacing_large_files(s) && !hint->formatted_node
        && this_blocknr_allocation_would_make_it_a_large_file(hint)) {
        displace_large_file(hint);
        return;
    }

    /* if none of our special cases is relevant, use the left neighbor in the
       tree order of the new node we are allocating for */
    if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) {
        hash_formatted_node(hint);
        return;
    }

    unfm_hint = get_left_neighbor(hint);

    /* Mimic the old block allocator behaviour, that is: if the VFS allowed for
       preallocation, new blocks are displaced based on the directory ID.  Also,
       if the suggested search_start is less than the last preallocated block,
       we start searching from it, assuming that HDD dataflow is faster in the
       forward direction */
    if ( TEST_OPTION(old_way, s)) {
        if (!hint->formatted_node) {
            if ( !reiserfs_hashed_relocation(s))
                old_way(hint);
            else if (!reiserfs_no_unhashed_relocation(s))
                old_hashed_relocation(hint);

            if ( hint->inode && hint->search_start < REISERFS_I(hint->inode)->i_prealloc_block)
                hint->search_start = REISERFS_I(hint->inode)->i_prealloc_block;
        }
        return;
    }

    /* This is an approach proposed by Hans */
    if ( TEST_OPTION(hundredth_slices, s) && !(displacing_large_files(s) && !hint->formatted_node)) {
        hundredth_slices(hint);
        return;
    }

    /* old_hashed_relocation only works on unformatted nodes */
    if (!unfm_hint && !hint->formatted_node &&
        TEST_OPTION(old_hashed_relocation, s))
    {
        old_hashed_relocation(hint);
    }
    /* new_hashed_relocation works with both formatted and unformatted nodes */
    if ((!unfm_hint || hint->formatted_node) &&
        TEST_OPTION(new_hashed_relocation, s))
    {
        new_hashed_relocation(hint);
    }
    /* dirid grouping works only on unformatted nodes */
    if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s))
    {
        dirid_groups(hint);
    }

#ifdef DISPLACE_NEW_PACKING_LOCALITIES
    if (hint->formatted_node && TEST_OPTION(dirid_groups, s))
    {
        dirid_groups(hint);
    }
#endif

    /* oid grouping works only on unformatted nodes */
    if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups, s))
    {
        oid_groups(hint);
    }
    return;
}

static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint)
{
    /* make the minimum size a mount option and benchmark both ways */
    /* we preallocate blocks only for regular files, of a specific size */
    /* benchmark preallocating always and see what happens */

    hint->prealloc_size = 0;

    if (!hint->formatted_node && hint->preallocate) {
        if (S_ISREG(hint->inode->i_mode)
            && hint->inode->i_size >= REISERFS_SB(hint->th->t_super)->s_alloc_options.preallocmin * hint->inode->i_sb->s_blocksize)
            hint->prealloc_size = REISERFS_SB(hint->th->t_super)->s_alloc_options.preallocsize - 1;
    }
    return CARRY_ON;
}

/* XXX I know it could be merged with the upper-level function;
   but maybe the resulting function would be too complex. */
static inline int allocate_without_wrapping_disk (reiserfs_blocknr_hint_t * hint,
                                                  b_blocknr_t * new_blocknrs,
                                                  b_blocknr_t start, b_blocknr_t finish,
                                                  int min,
                                                  int amount_needed, int prealloc_size)
{
    int rest = amount_needed;
    int nr_allocated;

    while (rest > 0 && start <= finish) {
        nr_allocated = scan_bitmap (hint->th, &start, finish, min,
                                    rest + prealloc_size, !hint->formatted_node,
                                    hint->block);

        if (nr_allocated == 0) /* no new blocks allocated, return */
            break;

        /* fill the free_blocknrs array first */
        while (rest > 0 && nr_allocated > 0) {
            *new_blocknrs ++ = start ++;
            rest --; nr_allocated --;
        }

        /* do we have something to fill the prealloc. array with as well? */
        if (nr_allocated > 0) {
            /* it means prealloc_size was greater than 0 and we are doing preallocation */
            list_add(&REISERFS_I(hint->inode)->i_prealloc_list,
                     &SB_JOURNAL(hint->th->t_super)->j_prealloc_list);
            REISERFS_I(hint->inode)->i_prealloc_block = start;
            REISERFS_I(hint->inode)->i_prealloc_count = nr_allocated;
            break;
        }
    }

    return (amount_needed - rest);
}

static inline int blocknrs_and_prealloc_arrays_from_search_start
    (reiserfs_blocknr_hint_t *hint, b_blocknr_t *new_blocknrs, int amount_needed)
{
    struct super_block *s = hint->th->t_super;
    b_blocknr_t start = hint->search_start;
    b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
    int passno = 0;
    int nr_allocated = 0;
    int bigalloc = 0;

    determine_prealloc_size(hint);
    if (!hint->formatted_node) {
        int quota_ret;
#ifdef REISERQUOTA_DEBUG
        reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: allocating %d blocks id=%u", amount_needed, hint->inode->i_uid);
#endif
        quota_ret = DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed);
        if (quota_ret) /* Quota exceeded? */
            return QUOTA_EXCEEDED;
        if (hint->preallocate && hint->prealloc_size ) {
#ifdef REISERQUOTA_DEBUG
            reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: allocating (prealloc) %d blocks id=%u", hint->prealloc_size, hint->inode->i_uid);
#endif
            quota_ret = DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode, hint->prealloc_size);
            if (quota_ret)
                hint->preallocate = hint->prealloc_size = 0;
        }
        /* for unformatted nodes, force large allocations */
        bigalloc = amount_needed;
    }

    do {
        /* in bigalloc mode, nr_allocated should stay zero until
         * the entire allocation is filled
         */
        if (unlikely(bigalloc && nr_allocated)) {
            reiserfs_warning(s, "bigalloc is %d, nr_allocated %d\n",
                             bigalloc, nr_allocated);
            /* reset things to a sane value */
            bigalloc = amount_needed - nr_allocated;
        }
        /*
         * try pass 0 and pass 1 looking for a nice big
         * contiguous allocation.  Then reset and look
         * for anything you can find.
         */
        if (passno == 2 && bigalloc) {
            passno = 0;
            bigalloc = 0;
        }
        switch (passno++) {
        case 0: /* Search from hint->search_start to the end of the disk */
            start = hint->search_start;
            finish = SB_BLOCK_COUNT(s) - 1;
            break;
        case 1: /* Search from hint->beg to hint->search_start */
            start = hint->beg;
            finish = hint->search_start;
            break;
        case 2: /* Last chance: Search from 0 to hint->beg */
            start = 0;
            finish = hint->beg;
            break;
        default: /* We've tried searching everywhere, not enough space */
            /* Free the blocks */
            if (!hint->formatted_node) {
#ifdef REISERQUOTA_DEBUG
                reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: freeing (nospace) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid);
#endif
                DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); /* Free not allocated blocks */
            }
            while (nr_allocated --)
                reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node);

            return NO_DISK_SPACE;
        }
    } while ((nr_allocated += allocate_without_wrapping_disk (hint,
                    new_blocknrs + nr_allocated, start, finish,
                    bigalloc ? bigalloc : 1,
                    amount_needed - nr_allocated,
                    hint->prealloc_size))
             < amount_needed);
    if ( !hint->formatted_node &&
         amount_needed + hint->prealloc_size >
         nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) {
        /* Some of the preallocation blocks were not allocated */
#ifdef REISERQUOTA_DEBUG
        reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: freeing (failed prealloc) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)->i_prealloc_count, hint->inode->i_uid);
#endif
        DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed +
                                 hint->prealloc_size - nr_allocated -
                                 REISERFS_I(hint->inode)->i_prealloc_count);
    }

    return CARRY_ON;
}
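
/*
 * Pass layout, worked through (editor's illustration only): on a
 * 1000-block device with search_start == 600 and beg == 0, pass 0 scans
 * [600, 999], pass 1 scans [0, 600] and pass 2 scans [0, 0].  With
 * bigalloc set, reaching pass 2 without one contiguous run of the full
 * size resets passno to 0 with bigalloc == 0, and the passes are
 * repeated accepting fragmented allocations.
 */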

/* grab new blocknrs from the preallocated list */
/* return the amount still needed after using them */
static int use_preallocated_list_if_available (reiserfs_blocknr_hint_t *hint,
                                               b_blocknr_t *new_blocknrs, int amount_needed)
{
    struct inode * inode = hint->inode;

    if (REISERFS_I(inode)->i_prealloc_count > 0) {
        while (amount_needed) {

            *new_blocknrs ++ = REISERFS_I(inode)->i_prealloc_block ++;
            REISERFS_I(inode)->i_prealloc_count --;

            amount_needed --;

            if (REISERFS_I(inode)->i_prealloc_count <= 0) {
                list_del(&REISERFS_I(inode)->i_prealloc_list);
                break;
            }
        }
    }
    /* return the amount still needed after using the preallocated blocks */
    return amount_needed;
}

int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint,
                               b_blocknr_t * new_blocknrs, int amount_needed,
                               int reserved_by_us /* Amount of blocks we have
                                                     already reserved */)
{
    int initial_amount_needed = amount_needed;
    int ret;
    struct super_block *s = hint->th->t_super;

    /* Check if there is enough space, taking into account reserved space */
    if ( SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks <
         amount_needed - reserved_by_us)
        return NO_DISK_SPACE;
    /* should this be if !hint->inode && hint->preallocate? */
    /* do you mean hint->formatted_node can be removed ? - Zam */
    /* hint->formatted_node cannot be removed because we try to access
       inode information here, and there is often no inode associated with
       metadata allocations - green */

    if (!hint->formatted_node && hint->preallocate) {
        amount_needed = use_preallocated_list_if_available
            (hint, new_blocknrs, amount_needed);
        if (amount_needed == 0) /* we got all the blocknrs we need from the
                                   prealloc. list */
            return CARRY_ON;
        new_blocknrs += (initial_amount_needed - amount_needed);
    }

    /* find the search start and save it in the hint structure */
    determine_search_start(hint, amount_needed);
    if (hint->search_start >= SB_BLOCK_COUNT(s))
        hint->search_start = SB_BLOCK_COUNT(s) - 1;

    /* the allocation itself; fill the new_blocknrs and preallocation arrays */
    ret = blocknrs_and_prealloc_arrays_from_search_start
        (hint, new_blocknrs, amount_needed);

    /* we used the prealloc. list to fill (partially) the new_blocknrs array.  If the
     * final allocation fails we need to return blocks back to the prealloc. list or
     * just free them. -- Zam (I chose the second variant) */

    if (ret != CARRY_ON) {
        while (amount_needed ++ < initial_amount_needed) {
            reiserfs_free_block(hint->th, hint->inode, *(--new_blocknrs), 1);
        }
    }
    return ret;
}

/* These 2 functions are here to provide block reservation to the rest of the kernel */
/* Reserve @blocks amount of blocks in the fs pointed to by @sb.  The caller must
   make sure there are actually this many blocks available on the FS */
void reiserfs_claim_blocks_to_be_allocated(
            struct super_block *sb, /* super block of
                                       filesystem where
                                       blocks should be
                                       reserved */
            int blocks /* How many to reserve */
            )
{

    /* Fast case, if reservation is zero - exit immediately. */
    if ( !blocks )
        return;

    spin_lock(&REISERFS_SB(sb)->bitmap_lock);
    REISERFS_SB(sb)->reserved_blocks += blocks;
    spin_unlock(&REISERFS_SB(sb)->bitmap_lock);
}

/* Unreserve @blocks amount of blocks in the fs pointed to by @sb */
void reiserfs_release_claimed_blocks(
            struct super_block *sb, /* super block of
                                       filesystem where
                                       blocks should be
                                       reserved */
            int blocks /* How many to unreserve */
            )
{

    /* Fast case, if unreservation is zero - exit immediately. */
    if ( !blocks )
        return;

    spin_lock(&REISERFS_SB(sb)->bitmap_lock);
    REISERFS_SB(sb)->reserved_blocks -= blocks;
    spin_unlock(&REISERFS_SB(sb)->bitmap_lock);
    RFALSE( REISERFS_SB(sb)->reserved_blocks < 0, "amount of blocks reserved became negative?");
}

/* This function estimates how many pages we will be able to write to the FS;
   used for reiserfs_file_write() purposes for now. */
int reiserfs_can_fit_pages ( struct super_block *sb /* superblock of filesystem
                                                       to estimate space on */ )
{
    int space;

    spin_lock(&REISERFS_SB(sb)->bitmap_lock);
    space = (SB_FREE_BLOCKS(sb) - REISERFS_SB(sb)->reserved_blocks) >> ( PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
    spin_unlock(&REISERFS_SB(sb)->bitmap_lock);

    return space > 0 ? space : 0;
}
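
/*
 * Worked example (editor's illustration only): with 1 KiB blocks and
 * 4 KiB pages the shift is PAGE_CACHE_SHIFT - 10 == 2, so 1000 spare
 * blocks fit about 1000 >> 2 == 250 pages; with 4 KiB blocks the shift
 * is 0 and blocks map to pages one-to-one.
 */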
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
new file mode 100644
index 000000000000..d1514a9b0514
--- /dev/null
+++ b/fs/reiserfs/dir.c
@@ -0,0 +1,275 @@
/*
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */

#include <linux/config.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/reiserfs_fs.h>
#include <linux/stat.h>
#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <asm/uaccess.h>

extern struct reiserfs_key MIN_KEY;

static int reiserfs_readdir (struct file *, void *, filldir_t);
static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync);

struct file_operations reiserfs_dir_operations = {
    .read = generic_read_dir,
    .readdir = reiserfs_readdir,
    .fsync = reiserfs_dir_fsync,
    .ioctl = reiserfs_ioctl,
};

static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) {
    struct inode *inode = dentry->d_inode;
    int err;
    reiserfs_write_lock(inode->i_sb);
    err = reiserfs_commit_for_inode(inode);
    reiserfs_write_unlock(inode->i_sb);
    if (err < 0)
        return err;
    return 0;
}


#define store_ih(where,what) copy_item_head (where, what)

static int reiserfs_readdir (struct file * filp, void * dirent, filldir_t filldir)
{
    struct inode *inode = filp->f_dentry->d_inode;
    struct cpu_key pos_key; /* key of the current position in the directory (key of a directory entry) */
    INITIALIZE_PATH (path_to_entry);
    struct buffer_head * bh;
    int item_num, entry_num;
    const struct reiserfs_key * rkey;
    struct item_head * ih, tmp_ih;
    int search_res;
    char * local_buf;
    loff_t next_pos;
    char small_buf[32]; /* avoid kmalloc if we can */
    struct reiserfs_dir_entry de;
    int ret = 0;

    reiserfs_write_lock(inode->i_sb);

    reiserfs_check_lock_depth(inode->i_sb, "readdir");

    /* form the key for the search of the next directory entry using the
       f_pos field of the file structure */
    make_cpu_key (&pos_key, inode, (filp->f_pos) ? (filp->f_pos) : DOT_OFFSET,
                  TYPE_DIRENTRY, 3);
    next_pos = cpu_key_k_offset (&pos_key);

    /* reiserfs_warning (inode->i_sb, "reiserfs_readdir 1: f_pos = %Ld", filp->f_pos); */

    path_to_entry.reada = PATH_READA;
    while (1) {
    research:
        /* search for the directory item containing the entry with the specified key */
        search_res = search_by_entry_key (inode->i_sb, &pos_key, &path_to_entry, &de);
        if (search_res == IO_ERROR) {
            // FIXME: we could just skip the part of the directory which could
            // not be read
            ret = -EIO;
            goto out;
        }
        entry_num = de.de_entry_num;
        bh = de.de_bh;
        item_num = de.de_item_num;
        ih = de.de_ih;
        store_ih (&tmp_ih, ih);

        /* we must have found an item, and it must be an item of this directory */
        RFALSE( COMP_SHORT_KEYS (&(ih->ih_key), &pos_key),
                "vs-9000: found item %h does not match the dir we are reading %K",
                ih, &pos_key);
        RFALSE( item_num > B_NR_ITEMS (bh) - 1,
                "vs-9005 item_num == %d, item amount == %d",
                item_num, B_NR_ITEMS (bh));

        /* and the entry number must be no greater than the number of entries in the item */
        RFALSE( I_ENTRY_COUNT (ih) < entry_num,
                "vs-9010: entry number is too big %d (%d)",
                entry_num, I_ENTRY_COUNT (ih));

        if (search_res == POSITION_FOUND || entry_num < I_ENTRY_COUNT (ih)) {
            /* go through all entries in the directory item, beginning from the entry that has been found */
            struct reiserfs_de_head * deh = B_I_DEH (bh, ih) + entry_num;

            for (; entry_num < I_ENTRY_COUNT (ih); entry_num ++, deh ++) {
                int d_reclen;
                char * d_name;
                off_t d_off;
                ino_t d_ino;

                if (!de_visible (deh))
                    /* it is a hidden entry */
                    continue;
                d_reclen = entry_length (bh, ih, entry_num);
                d_name = B_I_DEH_ENTRY_FILE_NAME (bh, ih, deh);
                if (!d_name[d_reclen - 1])
                    d_reclen = strlen (d_name);

                if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)) {
                    /* too big to send back to VFS */
                    continue;
                }

                /* Ignore the .reiserfs_priv entry */
                if (reiserfs_xattrs (inode->i_sb) &&
                    !old_format_only(inode->i_sb) &&
                    filp->f_dentry == inode->i_sb->s_root &&
                    REISERFS_SB(inode->i_sb)->priv_root &&
                    REISERFS_SB(inode->i_sb)->priv_root->d_inode &&
                    deh_objectid(deh) == le32_to_cpu (INODE_PKEY(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->k_objectid)) {
                    continue;
                }

                d_off = deh_offset (deh);
                filp->f_pos = d_off;
                d_ino = deh_objectid (deh);
                if (d_reclen <= 32) {
                    local_buf = small_buf;
                } else {
                    local_buf = reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb);
                    if (!local_buf) {
                        pathrelse (&path_to_entry);
                        ret = -ENOMEM;
                        goto out;
                    }
                    if (item_moved (&tmp_ih, &path_to_entry)) {
                        reiserfs_kfree(local_buf, d_reclen, inode->i_sb);
                        goto research;
                    }
                }
                // Note that we copy the name to user space via a temporary
                // buffer (local_buf) because filldir will block if the
                // user space buffer is swapped out.  During that time the
                // entry can move somewhere else
                memcpy (local_buf, d_name, d_reclen);
                if (filldir (dirent, local_buf, d_reclen, d_off, d_ino,
                             DT_UNKNOWN) < 0) {
                    if (local_buf != small_buf) {
                        reiserfs_kfree(local_buf, d_reclen, inode->i_sb);
                    }
                    goto end;
                }
                if (local_buf != small_buf) {
                    reiserfs_kfree(local_buf, d_reclen, inode->i_sb);
                }

                // the next entry should be looked for with this offset
                next_pos = deh_offset (deh) + 1;

                if (item_moved (&tmp_ih, &path_to_entry)) {
                    goto research;
                }
            } /* for */
        }

        if (item_num != B_NR_ITEMS (bh) - 1)
            // the end of the directory has been reached
            goto end;

        /* the item we went through is the last item of the node.  Using the right
           delimiting key, check whether this is the end of the directory */
        rkey = get_rkey (&path_to_entry, inode->i_sb);
        if (! comp_le_keys (rkey, &MIN_KEY)) {
            /* set pos_key to the smallest key greater than
               the key of the last entry in the item */
            set_cpu_key_k_offset (&pos_key, next_pos);
            continue;
        }

        if ( COMP_SHORT_KEYS (rkey, &pos_key)) {
            // the end of the directory has been reached
            goto end;
        }

        /* the directory continues in the right neighboring block */
        set_cpu_key_k_offset (&pos_key, le_key_k_offset (KEY_FORMAT_3_5, rkey));

    } /* while */


end:
    filp->f_pos = next_pos;
    pathrelse (&path_to_entry);
    reiserfs_check_path(&path_to_entry);
out:
    reiserfs_write_unlock(inode->i_sb);
    return ret;
}
208
209/* compose directory item containing "." and ".." entries (entries are
210 not aligned to 4 byte boundary) */
211/* the last four params are LE */
212void make_empty_dir_item_v1 (char * body, __u32 dirid, __u32 objid,
213 __u32 par_dirid, __u32 par_objid)
214{
215 struct reiserfs_de_head * deh;
216
217 memset (body, 0, EMPTY_DIR_SIZE_V1);
218 deh = (struct reiserfs_de_head *)body;
219
220 /* direntry header of "." */
221 put_deh_offset( &(deh[0]), DOT_OFFSET );
222 /* these two are from make_le_item_head, and are are LE */
223 deh[0].deh_dir_id = dirid;
224 deh[0].deh_objectid = objid;
225 deh[0].deh_state = 0; /* Endian safe if 0 */
226 put_deh_location( &(deh[0]), EMPTY_DIR_SIZE_V1 - strlen( "." ));
227 mark_de_visible(&(deh[0]));
228
229 /* direntry header of ".." */
230 put_deh_offset( &(deh[1]), DOT_DOT_OFFSET);
231 /* key of ".." for the root directory */
232    /* these two are from the inode, and are LE */
233 deh[1].deh_dir_id = par_dirid;
234 deh[1].deh_objectid = par_objid;
235 deh[1].deh_state = 0; /* Endian safe if 0 */
236 put_deh_location( &(deh[1]), deh_location( &(deh[0]) ) - strlen( ".." ) );
237 mark_de_visible(&(deh[1]));
238
239 /* copy ".." and "." */
240 memcpy (body + deh_location( &(deh[0]) ), ".", 1);
241 memcpy (body + deh_location( &(deh[1]) ), "..", 2);
242}
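/* A sketch of the old-format body built above, assuming
   EMPTY_DIR_SIZE_V1 == 2 * DEH_SIZE + 3 (names unaligned):

	+--------+--------+------+-----+
	| deh[0] | deh[1] | ".." | "." |
	+--------+--------+------+-----+

   Names are packed from the end of the body backwards, which is why the
   location of "." is computed first and ".." lands just before it.  The
   new-format variant below differs only in rounding each name length up
   to a 4 byte boundary with ROUND_UP. */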
243
244/* compose directory item containing "." and ".." entries */
245void make_empty_dir_item (char * body, __u32 dirid, __u32 objid,
246 __u32 par_dirid, __u32 par_objid)
247{
248 struct reiserfs_de_head * deh;
249
250 memset (body, 0, EMPTY_DIR_SIZE);
251 deh = (struct reiserfs_de_head *)body;
252
253 /* direntry header of "." */
254 put_deh_offset( &(deh[0]), DOT_OFFSET );
255    /* these two are from make_le_item_head, and are LE */
256 deh[0].deh_dir_id = dirid;
257 deh[0].deh_objectid = objid;
258 deh[0].deh_state = 0; /* Endian safe if 0 */
259 put_deh_location( &(deh[0]), EMPTY_DIR_SIZE - ROUND_UP( strlen( "." ) ) );
260 mark_de_visible(&(deh[0]));
261
262 /* direntry header of ".." */
263 put_deh_offset( &(deh[1]), DOT_DOT_OFFSET );
264 /* key of ".." for the root directory */
265    /* these two are from the inode, and are LE */
266 deh[1].deh_dir_id = par_dirid;
267 deh[1].deh_objectid = par_objid;
268 deh[1].deh_state = 0; /* Endian safe if 0 */
269 put_deh_location( &(deh[1]), deh_location( &(deh[0])) - ROUND_UP( strlen( ".." ) ) );
270 mark_de_visible(&(deh[1]));
271
272 /* copy ".." and "." */
273 memcpy (body + deh_location( &(deh[0]) ), ".", 1);
274 memcpy (body + deh_location( &(deh[1]) ), "..", 2);
275}
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
new file mode 100644
index 000000000000..2118db2896c7
--- /dev/null
+++ b/fs/reiserfs/do_balan.c
@@ -0,0 +1,1597 @@
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5/* Now we have all the buffers that must be used in balancing of the tree. */
6/* Further calculations cannot cause schedule(), and thus the buffer */
7/* tree will be stable until the balancing is finished. */
8/* We balance the tree according to the analysis made before, */
9/* using the buffers obtained above. */
10
11
12/**
13 ** balance_leaf_when_delete
14 ** balance_leaf
15 ** do_balance
16 **
17 **/
18
19#include <linux/config.h>
20#include <asm/uaccess.h>
21#include <linux/time.h>
22#include <linux/reiserfs_fs.h>
23#include <linux/buffer_head.h>
24
25#ifdef CONFIG_REISERFS_CHECK
26
27struct tree_balance * cur_tb = NULL; /* detects whether more than one
28 copy of tb exists as a means
29 of checking whether schedule
30 is interrupting do_balance */
31#endif
32
33inline void do_balance_mark_leaf_dirty (struct tree_balance * tb,
34 struct buffer_head * bh, int flag)
35{
36 journal_mark_dirty(tb->transaction_handle,
37 tb->transaction_handle->t_super, bh) ;
38}
39
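/* With journalling, marking a buffer dirty is the same operation for
   leaf, internal and super blocks: the buffer is simply logged in the
   running transaction, hence the aliases below. */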
40#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
41#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
42
43
44/* summary:
45 if deleting something ( tb->insert_size[0] < 0 )
46 return(balance_leaf_when_delete()); (flag d handled here)
47 else
48 if lnum is larger than 0 we put items into the left node
49 if rnum is larger than 0 we put items into the right node
50 if snum1 is larger than 0 we put items into the new node s1
51 if snum2 is larger than 0 we put items into the new node s2
52Note that all *num* count new items being created.
53
54It would be easier to read balance_leaf() if each of these summary
55lines was a separate procedure rather than being inlined. I think
56that there are many passages here and in balance_leaf_when_delete() in
57which two calls to one procedure can replace two passages, and it
58might save cache space and reduce software maintenance costs to do so.
59
60Vladimir made the perceptive comment that we should offload most of
61the decision making in this function into fix_nodes/check_balance, and
62then create some sort of structure in tb that says what actions should
63be performed by do_balance.
64
65-Hans */
66
67
68
69/* Balance leaf node in case of delete or cut: insert_size[0] < 0
70 *
71 * lnum, rnum can have values >= -1
72 * -1 means that the neighbor must be joined with S
73 * 0 means that nothing should be done with the neighbor
74 * >0 means to shift entirely or partly the specified number of items to the neighbor
75 */
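/* A purely illustrative example (hypothetical numbers): with n = 10
   items in S[0], lnum[0] = 4, rnum[0] = 7, lbytes = 1 and rbytes = -1,
   the leftmost items are shifted to L[0] (the boundary item is split,
   one byte of it going left) and the remaining items to R[0]; S[0] is
   then empty and gets invalidated.  The RFALSE checks PAP-12050..12060
   below encode exactly these consistency rules. */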
76static int balance_leaf_when_delete (struct tree_balance * tb, int flag)
77{
78 struct buffer_head * tbS0 = PATH_PLAST_BUFFER (tb->tb_path);
79 int item_pos = PATH_LAST_POSITION (tb->tb_path);
80 int pos_in_item = tb->tb_path->pos_in_item;
81 struct buffer_info bi;
82 int n;
83 struct item_head * ih;
84
85 RFALSE( tb->FR[0] && B_LEVEL (tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1,
86 "vs- 12000: level: wrong FR %z", tb->FR[0]);
87 RFALSE( tb->blknum[0] > 1,
88 "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]);
89 RFALSE( ! tb->blknum[0] && ! PATH_H_PPARENT(tb->tb_path, 0),
90 "PAP-12010: tree can not be empty");
91
92 ih = B_N_PITEM_HEAD (tbS0, item_pos);
93
94 /* Delete or truncate the item */
95
96 switch (flag) {
97 case M_DELETE: /* delete item in S[0] */
98
99 RFALSE( ih_item_len(ih) + IH_SIZE != -tb->insert_size[0],
100 "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
101 -tb->insert_size [0], ih);
102
103 bi.tb = tb;
104 bi.bi_bh = tbS0;
105 bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
106 bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
107 leaf_delete_items (&bi, 0, item_pos, 1, -1);
108
109 if ( ! item_pos && tb->CFL[0] ) {
110 if ( B_NR_ITEMS(tbS0) ) {
111 replace_key(tb, tb->CFL[0],tb->lkey[0],tbS0,0);
112 }
113 else {
114 if ( ! PATH_H_POSITION (tb->tb_path, 1) )
115 replace_key(tb, tb->CFL[0],tb->lkey[0],PATH_H_PPARENT(tb->tb_path, 0),0);
116 }
117 }
118
119 RFALSE( ! item_pos && !tb->CFL[0],
120 "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], tb->L[0]);
121
122 break;
123
124 case M_CUT: { /* cut item in S[0] */
125 bi.tb = tb;
126 bi.bi_bh = tbS0;
127 bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
128 bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
129 if (is_direntry_le_ih (ih)) {
130
131 /* UFS unlink semantics are such that you can only delete one directory entry at a time. */
132 /* when we cut a directory tb->insert_size[0] means number of entries to be cut (always 1) */
133 tb->insert_size[0] = -1;
134 leaf_cut_from_buffer (&bi, item_pos, pos_in_item, -tb->insert_size[0]);
135
136 RFALSE( ! item_pos && ! pos_in_item && ! tb->CFL[0],
137 "PAP-12030: can not change delimiting key. CFL[0]=%p",
138 tb->CFL[0]);
139
140 if ( ! item_pos && ! pos_in_item && tb->CFL[0] ) {
141 replace_key(tb, tb->CFL[0],tb->lkey[0],tbS0,0);
142 }
143 } else {
144 leaf_cut_from_buffer (&bi, item_pos, pos_in_item, -tb->insert_size[0]);
145
146 RFALSE( ! ih_item_len(ih),
147 "PAP-12035: cut must leave non-zero dynamic length of item");
148 }
149 break;
150 }
151
152 default:
153 print_cur_tb ("12040");
154	reiserfs_panic (tb->tb_sb, "PAP-12040: balance_leaf_when_delete: unexpected mode: %s(%d)",
155 (flag == M_PASTE) ? "PASTE" : ((flag == M_INSERT) ? "INSERT" : "UNKNOWN"), flag);
156 }
157
158    /* the rule is that no shifting occurs unless a node can be freed by it */
159 n = B_NR_ITEMS(tbS0);
160 if ( tb->lnum[0] ) /* L[0] takes part in balancing */
161 {
162 if ( tb->lnum[0] == -1 ) /* L[0] must be joined with S[0] */
163 {
164 if ( tb->rnum[0] == -1 ) /* R[0] must be also joined with S[0] */
165 {
166 if ( tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0) )
167 {
168 /* all contents of all the 3 buffers will be in L[0] */
169 if ( PATH_H_POSITION (tb->tb_path, 1) == 0 && 1 < B_NR_ITEMS(tb->FR[0]) )
170 replace_key(tb, tb->CFL[0],tb->lkey[0],tb->FR[0],1);
171
172 leaf_move_items (LEAF_FROM_S_TO_L, tb, n, -1, NULL);
173 leaf_move_items (LEAF_FROM_R_TO_L, tb, B_NR_ITEMS(tb->R[0]), -1, NULL);
174
175 reiserfs_invalidate_buffer (tb, tbS0);
176 reiserfs_invalidate_buffer (tb, tb->R[0]);
177
178 return 0;
179 }
180 /* all contents of all the 3 buffers will be in R[0] */
181 leaf_move_items (LEAF_FROM_S_TO_R, tb, n, -1, NULL);
182 leaf_move_items (LEAF_FROM_L_TO_R, tb, B_NR_ITEMS(tb->L[0]), -1, NULL);
183
184 /* right_delimiting_key is correct in R[0] */
185 replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0);
186
187 reiserfs_invalidate_buffer (tb, tbS0);
188 reiserfs_invalidate_buffer (tb, tb->L[0]);
189
190 return -1;
191 }
192
193 RFALSE( tb->rnum[0] != 0,
194 "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]);
195 /* all contents of L[0] and S[0] will be in L[0] */
196 leaf_shift_left(tb, n, -1);
197
198 reiserfs_invalidate_buffer (tb, tbS0);
199
200 return 0;
201 }
202	/* part of the contents of S[0] will go to L[0] and the rest to R[0] */
203
204 RFALSE( ( tb->lnum[0] + tb->rnum[0] < n ) ||
205 ( tb->lnum[0] + tb->rnum[0] > n+1 ),
206 "PAP-12050: rnum(%d) and lnum(%d) and item number(%d) in S[0] are not consistent",
207 tb->rnum[0], tb->lnum[0], n);
208 RFALSE( ( tb->lnum[0] + tb->rnum[0] == n ) &&
209 (tb->lbytes != -1 || tb->rbytes != -1),
210 "PAP-12055: bad rbytes (%d)/lbytes (%d) parameters when items are not split",
211 tb->rbytes, tb->lbytes);
212 RFALSE( ( tb->lnum[0] + tb->rnum[0] == n + 1 ) &&
213 (tb->lbytes < 1 || tb->rbytes != -1),
214 "PAP-12060: bad rbytes (%d)/lbytes (%d) parameters when items are split",
215 tb->rbytes, tb->lbytes);
216
217 leaf_shift_left (tb, tb->lnum[0], tb->lbytes);
218 leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
219
220 reiserfs_invalidate_buffer (tb, tbS0);
221
222 return 0;
223 }
224
225 if ( tb->rnum[0] == -1 ) {
226 /* all contents of R[0] and S[0] will be in R[0] */
227 leaf_shift_right(tb, n, -1);
228 reiserfs_invalidate_buffer (tb, tbS0);
229 return 0;
230 }
231
232 RFALSE( tb->rnum[0],
233 "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]);
234 return 0;
235}
236
237
238static int balance_leaf (struct tree_balance * tb,
239			struct item_head * ih,		/* item header of inserted item (stored in little-endian form) */
240 const char * body, /* body of inserted item or bytes to paste */
241 int flag, /* i - insert, d - delete, c - cut, p - paste
242 (see comment to do_balance) */
243 struct item_head * insert_key, /* in our processing of one level we sometimes determine what
244 must be inserted into the next higher level. This insertion
245 consists of a key or two keys and their corresponding
246 pointers */
247 struct buffer_head ** insert_ptr /* inserted node-ptrs for the next level */
248 )
249{
250 struct buffer_head * tbS0 = PATH_PLAST_BUFFER (tb->tb_path);
251 int item_pos = PATH_LAST_POSITION (tb->tb_path); /* index into the array of item headers in S[0]
252 of the affected item */
253 struct buffer_info bi;
254 struct buffer_head *S_new[2]; /* new nodes allocated to hold what could not fit into S */
255 int snum[2]; /* number of items that will be placed
256 into S_new (includes partially shifted
257 items) */
258 int sbytes[2]; /* if an item is partially shifted into S_new then
259 if it is a directory item
260 it is the number of entries from the item that are shifted into S_new
261 else
262 it is the number of bytes from the item that are shifted into S_new
263 */
264 int n, i;
265 int ret_val;
266 int pos_in_item;
267 int zeros_num;
268
269 PROC_INFO_INC( tb -> tb_sb, balance_at[ 0 ] );
270
271 /* Make balance in case insert_size[0] < 0 */
272 if ( tb->insert_size[0] < 0 )
273 return balance_leaf_when_delete (tb, flag);
274
275 zeros_num = 0;
276 if (flag == M_INSERT && body == 0)
277 zeros_num = ih_item_len( ih );
278
279 pos_in_item = tb->tb_path->pos_in_item;
280 /* for indirect item pos_in_item is measured in unformatted node
281 pointers. Recalculate to bytes */
282 if (flag != M_INSERT && is_indirect_le_ih (B_N_PITEM_HEAD (tbS0, item_pos)))
283 pos_in_item *= UNFM_P_SIZE;
284
285 if ( tb->lnum[0] > 0 ) {
286 /* Shift lnum[0] items from S[0] to the left neighbor L[0] */
287 if ( item_pos < tb->lnum[0] ) {
288	   /* the new item or part of it falls into L[0], shift it too */
289 n = B_NR_ITEMS(tb->L[0]);
290
291 switch (flag) {
292 case M_INSERT: /* insert item into L[0] */
293
294 if ( item_pos == tb->lnum[0] - 1 && tb->lbytes != -1 ) {
295 /* part of new item falls into L[0] */
296 int new_item_len;
297 int version;
298
299 ret_val = leaf_shift_left (tb, tb->lnum[0]-1, -1);
300
301 /* Calculate item length to insert to S[0] */
302 new_item_len = ih_item_len(ih) - tb->lbytes;
303 /* Calculate and check item length to insert to L[0] */
304 put_ih_item_len(ih, ih_item_len(ih) - new_item_len );
305
306 RFALSE( ih_item_len(ih) <= 0,
307 "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d",
308 ih_item_len(ih));
309
310 /* Insert new item into L[0] */
311 bi.tb = tb;
312 bi.bi_bh = tb->L[0];
313 bi.bi_parent = tb->FL[0];
314 bi.bi_position = get_left_neighbor_position (tb, 0);
315 leaf_insert_into_buf (&bi, n + item_pos - ret_val, ih, body,
316 zeros_num > ih_item_len(ih) ? ih_item_len(ih) : zeros_num);
317
318 version = ih_version (ih);
319
320 /* Calculate key component, item length and body to insert into S[0] */
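		/* for an indirect item lbytes counts bytes of unformatted
		   node pointers; each UNFM_P_SIZE pointer maps one block of
		   file data, so the shift converts pointer bytes into bytes
		   of file offset */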
321 set_le_ih_k_offset( ih, le_ih_k_offset( ih ) + (tb->lbytes << (is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0)) );
322
323 put_ih_item_len( ih, new_item_len );
324 if ( tb->lbytes > zeros_num ) {
325 body += (tb->lbytes - zeros_num);
326 zeros_num = 0;
327 }
328 else
329 zeros_num -= tb->lbytes;
330
331 RFALSE( ih_item_len(ih) <= 0,
332 "PAP-12085: there is nothing to insert into S[0]: ih_item_len=%d",
333 ih_item_len(ih));
334 } else {
335 /* new item in whole falls into L[0] */
336 /* Shift lnum[0]-1 items to L[0] */
337 ret_val = leaf_shift_left(tb, tb->lnum[0]-1, tb->lbytes);
338 /* Insert new item into L[0] */
339 bi.tb = tb;
340 bi.bi_bh = tb->L[0];
341 bi.bi_parent = tb->FL[0];
342 bi.bi_position = get_left_neighbor_position (tb, 0);
343 leaf_insert_into_buf (&bi, n + item_pos - ret_val, ih, body, zeros_num);
344 tb->insert_size[0] = 0;
345 zeros_num = 0;
346 }
347 break;
348
349 case M_PASTE: /* append item in L[0] */
350
351 if ( item_pos == tb->lnum[0] - 1 && tb->lbytes != -1 ) {
352 /* we must shift the part of the appended item */
353 if ( is_direntry_le_ih (B_N_PITEM_HEAD (tbS0, item_pos))) {
354
355 RFALSE( zeros_num,
356 "PAP-12090: invalid parameter in case of a directory");
357 /* directory item */
358 if ( tb->lbytes > pos_in_item ) {
359 /* new directory entry falls into L[0] */
360 struct item_head * pasted;
361 int l_pos_in_item = pos_in_item;
362
363 /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */
364 ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1);
365 if ( ret_val && ! item_pos ) {
366 pasted = B_N_PITEM_HEAD(tb->L[0],B_NR_ITEMS(tb->L[0])-1);
367 l_pos_in_item += I_ENTRY_COUNT(pasted) - (tb->lbytes-1);
368 }
369
370 /* Append given directory entry to directory item */
371 bi.tb = tb;
372 bi.bi_bh = tb->L[0];
373 bi.bi_parent = tb->FL[0];
374 bi.bi_position = get_left_neighbor_position (tb, 0);
375 leaf_paste_in_buffer (&bi, n + item_pos - ret_val, l_pos_in_item,
376 tb->insert_size[0], body, zeros_num);
377
378			/* the previous call prepared space for pasting the new entry, the following call pastes it */
379
380			/* when directory items have been merged, pos_in_item has changed too */
381
382			/* paste the new directory entry. 1 is the number of entries */
383 leaf_paste_entries (bi.bi_bh, n + item_pos - ret_val, l_pos_in_item, 1,
384 (struct reiserfs_de_head *)body,
385 body + DEH_SIZE, tb->insert_size[0]
386 );
387 tb->insert_size[0] = 0;
388 } else {
389			/* new directory entry doesn't fall into L[0] */
390 /* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */
391 leaf_shift_left (tb, tb->lnum[0], tb->lbytes);
392 }
393 /* Calculate new position to append in item body */
394 pos_in_item -= tb->lbytes;
395 }
396 else {
397 /* regular object */
398 RFALSE( tb->lbytes <= 0,
399 "PAP-12095: there is nothing to shift to L[0]. lbytes=%d",
400 tb->lbytes);
401 RFALSE( pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)),
402 "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d",
403 ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos)), pos_in_item);
404
405 if ( tb->lbytes >= pos_in_item ) {
406 /* appended item will be in L[0] in whole */
407 int l_n;
408
409			/* this many bytes must be appended to the last item of L[0] */
410 l_n = tb->lbytes - pos_in_item;
411
412 /* Calculate new insert_size[0] */
413 tb->insert_size[0] -= l_n;
414
415 RFALSE( tb->insert_size[0] <= 0,
416 "PAP-12105: there is nothing to paste into L[0]. insert_size=%d",
417 tb->insert_size[0]);
418 ret_val = leaf_shift_left(tb,tb->lnum[0],
419 ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos)));
420 /* Append to body of item in L[0] */
421 bi.tb = tb;
422 bi.bi_bh = tb->L[0];
423 bi.bi_parent = tb->FL[0];
424 bi.bi_position = get_left_neighbor_position (tb, 0);
425 leaf_paste_in_buffer(
426 &bi,n + item_pos - ret_val,
427 ih_item_len( B_N_PITEM_HEAD(tb->L[0],n+item_pos-ret_val)),
428 l_n,body, zeros_num > l_n ? l_n : zeros_num
429 );
430		    /* the 0-th item in S0 can only be of DIRECT type when l_n != 0 */
431 {
432 int version;
433 int temp_l = l_n;
434
435 RFALSE (ih_item_len (B_N_PITEM_HEAD (tbS0, 0)),
436 "PAP-12106: item length must be 0");
437 RFALSE (comp_short_le_keys (B_N_PKEY (tbS0, 0),
438 B_N_PKEY (tb->L[0],
439 n + item_pos - ret_val)),
440 "PAP-12107: items must be of the same file");
441 if (is_indirect_le_ih(B_N_PITEM_HEAD (tb->L[0],
442 n + item_pos - ret_val))) {
443 temp_l = l_n << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT);
444 }
445 /* update key of first item in S0 */
446 version = ih_version (B_N_PITEM_HEAD (tbS0, 0));
447 set_le_key_k_offset (version, B_N_PKEY (tbS0, 0),
448 le_key_k_offset (version, B_N_PKEY (tbS0, 0)) + temp_l);
449 /* update left delimiting key */
450 set_le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0]),
451 le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0])) + temp_l);
452 }
453
454 /* Calculate new body, position in item and insert_size[0] */
455 if ( l_n > zeros_num ) {
456 body += (l_n - zeros_num);
457 zeros_num = 0;
458 }
459 else
460 zeros_num -= l_n;
461 pos_in_item = 0;
462
463 RFALSE( comp_short_le_keys
464 (B_N_PKEY(tbS0,0),
465 B_N_PKEY(tb->L[0],B_NR_ITEMS(tb->L[0])-1)) ||
466
467 !op_is_left_mergeable
468 (B_N_PKEY (tbS0, 0), tbS0->b_size) ||
469 !op_is_left_mergeable
470 (B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0]),
471 tbS0->b_size),
472 "PAP-12120: item must be merge-able with left neighboring item");
473 }
474 else /* only part of the appended item will be in L[0] */
475 {
476 /* Calculate position in item for append in S[0] */
477 pos_in_item -= tb->lbytes;
478
479 RFALSE( pos_in_item <= 0,
480 "PAP-12125: no place for paste. pos_in_item=%d", pos_in_item);
481
482 /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
483 leaf_shift_left(tb,tb->lnum[0],tb->lbytes);
484 }
485 }
486 }
487 else /* appended item will be in L[0] in whole */
488 {
489 struct item_head * pasted;
490
491 if ( ! item_pos && op_is_left_mergeable (B_N_PKEY (tbS0, 0), tbS0->b_size) )
492	    {   /* if we paste into first item of S[0] and it is left mergeable */
493 /* then increment pos_in_item by the size of the last item in L[0] */
494 pasted = B_N_PITEM_HEAD(tb->L[0],n-1);
495 if ( is_direntry_le_ih (pasted) )
496 pos_in_item += ih_entry_count(pasted);
497 else
498 pos_in_item += ih_item_len(pasted);
499 }
500
501 /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
502 ret_val = leaf_shift_left(tb,tb->lnum[0],tb->lbytes);
503 /* Append to body of item in L[0] */
504 bi.tb = tb;
505 bi.bi_bh = tb->L[0];
506 bi.bi_parent = tb->FL[0];
507 bi.bi_position = get_left_neighbor_position (tb, 0);
508 leaf_paste_in_buffer (&bi, n + item_pos - ret_val, pos_in_item, tb->insert_size[0],
509 body, zeros_num);
510
511 /* if appended item is directory, paste entry */
512 pasted = B_N_PITEM_HEAD (tb->L[0], n + item_pos - ret_val);
513 if (is_direntry_le_ih (pasted))
514 leaf_paste_entries (
515 bi.bi_bh, n + item_pos - ret_val, pos_in_item, 1,
516 (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0]
517 );
518 /* if appended item is indirect item, put unformatted node into un list */
519 if (is_indirect_le_ih (pasted))
520 set_ih_free_space (pasted, 0);
521 tb->insert_size[0] = 0;
522 zeros_num = 0;
523 }
524 break;
525 default: /* cases d and t */
526	    reiserfs_panic (tb->tb_sb, "PAP-12130: balance_leaf: lnum > 0: unexpected mode: %s(%d)",
527 (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
528 }
529 } else {
530 /* new item doesn't fall into L[0] */
531 leaf_shift_left(tb,tb->lnum[0],tb->lbytes);
532 }
533 } /* tb->lnum[0] > 0 */
534
535 /* Calculate new item position */
536 item_pos -= ( tb->lnum[0] - (( tb->lbytes != -1 ) ? 1 : 0));
537
538 if ( tb->rnum[0] > 0 ) {
539 /* shift rnum[0] items from S[0] to the right neighbor R[0] */
540 n = B_NR_ITEMS(tbS0);
541 switch ( flag ) {
542
543 case M_INSERT: /* insert item */
544 if ( n - tb->rnum[0] < item_pos )
545 { /* new item or its part falls to R[0] */
546 if ( item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1 )
547 { /* part of new item falls into R[0] */
548 loff_t old_key_comp, old_len, r_zeros_number;
549 const char * r_body;
550 int version;
551 loff_t offset;
552
553 leaf_shift_right(tb,tb->rnum[0]-1,-1);
554
555 version = ih_version(ih);
556 /* Remember key component and item length */
557 old_key_comp = le_ih_k_offset( ih );
558 old_len = ih_item_len(ih);
559
560 /* Calculate key component and item length to insert into R[0] */
561 offset = le_ih_k_offset( ih ) + ((old_len - tb->rbytes )<<(is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0));
562 set_le_ih_k_offset( ih, offset );
563 put_ih_item_len( ih, tb->rbytes);
564 /* Insert part of the item into R[0] */
565 bi.tb = tb;
566 bi.bi_bh = tb->R[0];
567 bi.bi_parent = tb->FR[0];
568 bi.bi_position = get_right_neighbor_position (tb, 0);
569 if ( (old_len - tb->rbytes) > zeros_num ) {
570 r_zeros_number = 0;
571 r_body = body + (old_len - tb->rbytes) - zeros_num;
572 }
573 else {
574 r_body = body;
575 r_zeros_number = zeros_num - (old_len - tb->rbytes);
576 zeros_num -= r_zeros_number;
577 }
578
579 leaf_insert_into_buf (&bi, 0, ih, r_body, r_zeros_number);
580
581 /* Replace right delimiting key by first key in R[0] */
582 replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0);
583
584 /* Calculate key component and item length to insert into S[0] */
585 set_le_ih_k_offset( ih, old_key_comp );
586 put_ih_item_len( ih, old_len - tb->rbytes );
587
588 tb->insert_size[0] -= tb->rbytes;
589
590 }
591 else /* whole new item falls into R[0] */
592 {
593 /* Shift rnum[0]-1 items to R[0] */
594 ret_val = leaf_shift_right(tb,tb->rnum[0]-1,tb->rbytes);
595 /* Insert new item into R[0] */
596 bi.tb = tb;
597 bi.bi_bh = tb->R[0];
598 bi.bi_parent = tb->FR[0];
599 bi.bi_position = get_right_neighbor_position (tb, 0);
600 leaf_insert_into_buf (&bi, item_pos - n + tb->rnum[0] - 1, ih, body, zeros_num);
601
602 if ( item_pos - n + tb->rnum[0] - 1 == 0 ) {
603 replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0);
604
605 }
606 zeros_num = tb->insert_size[0] = 0;
607 }
608 }
609 else /* new item or part of it doesn't fall into R[0] */
610 {
611 leaf_shift_right(tb,tb->rnum[0],tb->rbytes);
612 }
613 break;
614
615 case M_PASTE: /* append item */
616
617 if ( n - tb->rnum[0] <= item_pos ) /* pasted item or part of it falls to R[0] */
618 {
619 if ( item_pos == n - tb->rnum[0] && tb->rbytes != -1 )
620 { /* we must shift the part of the appended item */
621 if ( is_direntry_le_ih (B_N_PITEM_HEAD(tbS0, item_pos)))
622 { /* we append to directory item */
623 int entry_count;
624
625 RFALSE( zeros_num,
626 "PAP-12145: invalid parameter in case of a directory");
627 entry_count = I_ENTRY_COUNT(B_N_PITEM_HEAD(tbS0, item_pos));
628 if ( entry_count - tb->rbytes < pos_in_item )
629 /* new directory entry falls into R[0] */
630 {
631 int paste_entry_position;
632
633 RFALSE( tb->rbytes - 1 >= entry_count ||
634 ! tb->insert_size[0],
635			    "PAP-12150: not enough entries to shift to R[0]: rbytes=%d, entry_count=%d",
636 tb->rbytes, entry_count);
637 /* Shift rnum[0]-1 items in whole. Shift rbytes-1 directory entries from directory item number rnum[0] */
638 leaf_shift_right(tb,tb->rnum[0],tb->rbytes - 1);
639 /* Paste given directory entry to directory item */
640 paste_entry_position = pos_in_item - entry_count + tb->rbytes - 1;
641 bi.tb = tb;
642 bi.bi_bh = tb->R[0];
643 bi.bi_parent = tb->FR[0];
644 bi.bi_position = get_right_neighbor_position (tb, 0);
645 leaf_paste_in_buffer (&bi, 0, paste_entry_position,
646 tb->insert_size[0],body,zeros_num);
647 /* paste entry */
648 leaf_paste_entries (
649 bi.bi_bh, 0, paste_entry_position, 1, (struct reiserfs_de_head *)body,
650 body + DEH_SIZE, tb->insert_size[0]
651 );
652
653 if ( paste_entry_position == 0 ) {
654 /* change delimiting keys */
655 replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0);
656 }
657
658 tb->insert_size[0] = 0;
659 pos_in_item++;
660 }
661 else /* new directory entry doesn't fall into R[0] */
662 {
663 leaf_shift_right(tb,tb->rnum[0],tb->rbytes);
664 }
665 }
666 else /* regular object */
667 {
668 int n_shift, n_rem, r_zeros_number;
669 const char * r_body;
670
671 /* Calculate number of bytes which must be shifted from appended item */
672 if ( (n_shift = tb->rbytes - tb->insert_size[0]) < 0 )
673 n_shift = 0;
674
675 RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD (tbS0, item_pos)),
676 "PAP-12155: invalid position to paste. ih_item_len=%d, pos_in_item=%d",
677 pos_in_item, ih_item_len( B_N_PITEM_HEAD(tbS0,item_pos)));
678
679 leaf_shift_right(tb,tb->rnum[0],n_shift);
680 /* Calculate number of bytes which must remain in body after appending to R[0] */
681 if ( (n_rem = tb->insert_size[0] - tb->rbytes) < 0 )
682 n_rem = 0;
683
684 {
685 int version;
686 unsigned long temp_rem = n_rem;
687
688 version = ih_version (B_N_PITEM_HEAD (tb->R[0],0));
689 if (is_indirect_le_key(version,B_N_PKEY(tb->R[0],0))){
690 temp_rem = n_rem << (tb->tb_sb->s_blocksize_bits -
691 UNFM_P_SHIFT);
692 }
693 set_le_key_k_offset (version, B_N_PKEY(tb->R[0],0),
694 le_key_k_offset (version, B_N_PKEY(tb->R[0],0)) + temp_rem);
695 set_le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0]),
696 le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) + temp_rem);
697 }
698/* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem;
699 k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/
700 do_balance_mark_internal_dirty (tb, tb->CFR[0], 0);
701
702 /* Append part of body into R[0] */
703 bi.tb = tb;
704 bi.bi_bh = tb->R[0];
705 bi.bi_parent = tb->FR[0];
706 bi.bi_position = get_right_neighbor_position (tb, 0);
707 if ( n_rem > zeros_num ) {
708 r_zeros_number = 0;
709 r_body = body + n_rem - zeros_num;
710 }
711 else {
712 r_body = body;
713 r_zeros_number = zeros_num - n_rem;
714 zeros_num -= r_zeros_number;
715 }
716
717 leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, r_body, r_zeros_number);
718
719 if (is_indirect_le_ih (B_N_PITEM_HEAD(tb->R[0],0))) {
720#if 0
721 RFALSE( n_rem,
722 "PAP-12160: paste more than one unformatted node pointer");
723#endif
724 set_ih_free_space (B_N_PITEM_HEAD(tb->R[0],0), 0);
725 }
726 tb->insert_size[0] = n_rem;
727 if ( ! n_rem )
728 pos_in_item ++;
729 }
730 }
731 else /* pasted item in whole falls into R[0] */
732 {
733 struct item_head * pasted;
734
735 ret_val = leaf_shift_right(tb,tb->rnum[0],tb->rbytes);
736 /* append item in R[0] */
737 if ( pos_in_item >= 0 ) {
738 bi.tb = tb;
739 bi.bi_bh = tb->R[0];
740 bi.bi_parent = tb->FR[0];
741 bi.bi_position = get_right_neighbor_position (tb, 0);
742 leaf_paste_in_buffer(&bi,item_pos - n + tb->rnum[0], pos_in_item,
743 tb->insert_size[0],body, zeros_num);
744 }
745
746 /* paste new entry, if item is directory item */
747 pasted = B_N_PITEM_HEAD(tb->R[0], item_pos - n + tb->rnum[0]);
748 if (is_direntry_le_ih (pasted) && pos_in_item >= 0 ) {
749 leaf_paste_entries (
750 bi.bi_bh, item_pos - n + tb->rnum[0], pos_in_item, 1,
751 (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0]
752 );
753 if ( ! pos_in_item ) {
754
755 RFALSE( item_pos - n + tb->rnum[0],
756 "PAP-12165: directory item must be first item of node when pasting is in 0th position");
757
758 /* update delimiting keys */
759 replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0);
760 }
761 }
762
763 if (is_indirect_le_ih (pasted))
764 set_ih_free_space (pasted, 0);
765 zeros_num = tb->insert_size[0] = 0;
766 }
767 }
768 else /* new item doesn't fall into R[0] */
769 {
770 leaf_shift_right(tb,tb->rnum[0],tb->rbytes);
771 }
772 break;
773 default: /* cases d and t */
774	    reiserfs_panic (tb->tb_sb, "PAP-12175: balance_leaf: rnum > 0: unexpected mode: %s(%d)",
775 (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
776 }
777
778 } /* tb->rnum[0] > 0 */
779
780
781 RFALSE( tb->blknum[0] > 3,
782 "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]);
783 RFALSE( tb->blknum[0] < 0,
784 "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]);
785
786 /* if while adding to a node we discover that it is possible to split
787 it in two, and merge the left part into the left neighbor and the
788 right part into the right neighbor, eliminating the node */
789 if ( tb->blknum[0] == 0 ) { /* node S[0] is empty now */
790
791 RFALSE( ! tb->lnum[0] || ! tb->rnum[0],
792 "PAP-12190: lnum and rnum must not be zero");
793 /* if insertion was done before 0-th position in R[0], right
794 delimiting key of the tb->L[0]'s and left delimiting key are
795 not set correctly */
796 if (tb->CFL[0]) {
797 if (!tb->CFR[0])
798 reiserfs_panic (tb->tb_sb, "vs-12195: balance_leaf: CFR not initialized");
799 copy_key (B_N_PDELIM_KEY (tb->CFL[0], tb->lkey[0]), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]));
800 do_balance_mark_internal_dirty (tb, tb->CFL[0], 0);
801 }
802
803 reiserfs_invalidate_buffer(tb,tbS0);
804 return 0;
805 }
806
807
808 /* Fill new nodes that appear in place of S[0] */
809
810 /* I am told that this copying is because we need an array to enable
811 the looping code. -Hans */
812 snum[0] = tb->s1num,
813 snum[1] = tb->s2num;
814 sbytes[0] = tb->s1bytes;
815 sbytes[1] = tb->s2bytes;
816 for( i = tb->blknum[0] - 2; i >= 0; i-- ) {
817
818 RFALSE( !snum[i], "PAP-12200: snum[%d] == %d. Must be > 0", i, snum[i]);
819
820 /* here we shift from S to S_new nodes */
821
822 S_new[i] = get_FEB(tb);
823
824	/* initialize block type and tree level */
825 set_blkh_level( B_BLK_HEAD(S_new[i]), DISK_LEAF_NODE_LEVEL );
826
827
828 n = B_NR_ITEMS(tbS0);
829
830 switch (flag) {
831 case M_INSERT: /* insert item */
832
833 if ( n - snum[i] < item_pos )
834	    {	/* the new item or part of it falls into the first new node S_new[i] */
835 if ( item_pos == n - snum[i] + 1 && sbytes[i] != -1 )
836 { /* part of new item falls into S_new[i] */
837 int old_key_comp, old_len, r_zeros_number;
838 const char * r_body;
839 int version;
840
841 /* Move snum[i]-1 items from S[0] to S_new[i] */
842 leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i] - 1, -1, S_new[i]);
843 /* Remember key component and item length */
844 version = ih_version (ih);
845 old_key_comp = le_ih_k_offset( ih );
846 old_len = ih_item_len(ih);
847
848 /* Calculate key component and item length to insert into S_new[i] */
849 set_le_ih_k_offset( ih,
850 le_ih_k_offset(ih) + ((old_len - sbytes[i] )<<(is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0)) );
851
852 put_ih_item_len( ih, sbytes[i] );
853
854 /* Insert part of the item into S_new[i] before 0-th item */
855 bi.tb = tb;
856 bi.bi_bh = S_new[i];
857 bi.bi_parent = NULL;
858 bi.bi_position = 0;
859
860 if ( (old_len - sbytes[i]) > zeros_num ) {
861 r_zeros_number = 0;
862 r_body = body + (old_len - sbytes[i]) - zeros_num;
863 }
864 else {
865 r_body = body;
866 r_zeros_number = zeros_num - (old_len - sbytes[i]);
867 zeros_num -= r_zeros_number;
868 }
869
870 leaf_insert_into_buf (&bi, 0, ih, r_body, r_zeros_number);
871
872 /* Calculate key component and item length to insert into S[i] */
873 set_le_ih_k_offset( ih, old_key_comp );
874 put_ih_item_len( ih, old_len - sbytes[i] );
875 tb->insert_size[0] -= sbytes[i];
876 }
877 else /* whole new item falls into S_new[i] */
878 {
879 /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */
880 leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i] - 1, sbytes[i], S_new[i]);
881
882 /* Insert new item into S_new[i] */
883 bi.tb = tb;
884 bi.bi_bh = S_new[i];
885 bi.bi_parent = NULL;
886 bi.bi_position = 0;
887 leaf_insert_into_buf (&bi, item_pos - n + snum[i] - 1, ih, body, zeros_num);
888
889 zeros_num = tb->insert_size[0] = 0;
890 }
891 }
892
893	    else /* the new item or its part does not fall into S_new[i] */
894 {
895 leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]);
896 }
897 break;
898
899 case M_PASTE: /* append item */
900
901	    if ( n - snum[i] <= item_pos )  /* pasted item or part of it falls into S_new[i] */
902 {
903 if ( item_pos == n - snum[i] && sbytes[i] != -1 )
904 { /* we must shift part of the appended item */
905 struct item_head * aux_ih;
906
907 RFALSE( ih, "PAP-12210: ih must be 0");
908
909 if ( is_direntry_le_ih (aux_ih = B_N_PITEM_HEAD(tbS0,item_pos))) {
910 /* we append to directory item */
911
912 int entry_count;
913
914 entry_count = ih_entry_count(aux_ih);
915
916 if ( entry_count - sbytes[i] < pos_in_item && pos_in_item <= entry_count ) {
917 /* new directory entry falls into S_new[i] */
918
919 RFALSE( ! tb->insert_size[0],
920 "PAP-12215: insert_size is already 0");
921 RFALSE( sbytes[i] - 1 >= entry_count,
922			    "PAP-12220: there are not that many entries (%d), only %d",
923 sbytes[i] - 1, entry_count);
924
925 /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */
926 leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i]-1, S_new[i]);
927 /* Paste given directory entry to directory item */
928 bi.tb = tb;
929 bi.bi_bh = S_new[i];
930 bi.bi_parent = NULL;
931 bi.bi_position = 0;
932 leaf_paste_in_buffer (&bi, 0, pos_in_item - entry_count + sbytes[i] - 1,
933 tb->insert_size[0], body,zeros_num);
934 /* paste new directory entry */
935 leaf_paste_entries (
936 bi.bi_bh, 0, pos_in_item - entry_count + sbytes[i] - 1,
937 1, (struct reiserfs_de_head *)body, body + DEH_SIZE,
938 tb->insert_size[0]
939 );
940 tb->insert_size[0] = 0;
941 pos_in_item++;
942 } else { /* new directory entry doesn't fall into S_new[i] */
943 leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]);
944 }
945 }
946 else /* regular object */
947 {
948 int n_shift, n_rem, r_zeros_number;
949 const char * r_body;
950
951 RFALSE( pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos)) ||
952 tb->insert_size[0] <= 0,
953 "PAP-12225: item too short or insert_size <= 0");
954
955 /* Calculate number of bytes which must be shifted from appended item */
956 n_shift = sbytes[i] - tb->insert_size[0];
957 if ( n_shift < 0 )
958 n_shift = 0;
959 leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], n_shift, S_new[i]);
960
961 /* Calculate number of bytes which must remain in body after append to S_new[i] */
962 n_rem = tb->insert_size[0] - sbytes[i];
963 if ( n_rem < 0 )
964 n_rem = 0;
965 /* Append part of body into S_new[0] */
966 bi.tb = tb;
967 bi.bi_bh = S_new[i];
968 bi.bi_parent = NULL;
969 bi.bi_position = 0;
970
971 if ( n_rem > zeros_num ) {
972 r_zeros_number = 0;
973 r_body = body + n_rem - zeros_num;
974 }
975 else {
976 r_body = body;
977 r_zeros_number = zeros_num - n_rem;
978 zeros_num -= r_zeros_number;
979 }
980
981 leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0]-n_rem, r_body,r_zeros_number);
982 {
983 struct item_head * tmp;
984
985 tmp = B_N_PITEM_HEAD(S_new[i],0);
986 if (is_indirect_le_ih (tmp)) {
987 set_ih_free_space (tmp, 0);
988 set_le_ih_k_offset( tmp, le_ih_k_offset(tmp) +
989 (n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT)));
990 } else {
991 set_le_ih_k_offset( tmp, le_ih_k_offset(tmp) +
992 n_rem );
993 }
994 }
995
996 tb->insert_size[0] = n_rem;
997 if ( ! n_rem )
998 pos_in_item++;
999 }
1000 }
1001 else
1002 /* item falls wholly into S_new[i] */
1003 {
1004 int ret_val;
1005 struct item_head * pasted;
1006
1007#ifdef CONFIG_REISERFS_CHECK
1008 struct item_head * ih = B_N_PITEM_HEAD(tbS0,item_pos);
1009
1010 if ( ! is_direntry_le_ih(ih) && (pos_in_item != ih_item_len(ih) ||
1011 tb->insert_size[0] <= 0) )
1012 reiserfs_panic (tb->tb_sb, "PAP-12235: balance_leaf: pos_in_item must be equal to ih_item_len");
1013#endif /* CONFIG_REISERFS_CHECK */
1014
1015 ret_val = leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]);
1016
1017 RFALSE( ret_val,
1018 "PAP-12240: unexpected value returned by leaf_move_items (%d)",
1019 ret_val);
1020
1021 /* paste into item */
1022 bi.tb = tb;
1023 bi.bi_bh = S_new[i];
1024 bi.bi_parent = NULL;
1025 bi.bi_position = 0;
1026 leaf_paste_in_buffer(&bi, item_pos - n + snum[i], pos_in_item, tb->insert_size[0], body, zeros_num);
1027
1028 pasted = B_N_PITEM_HEAD(S_new[i], item_pos - n + snum[i]);
1029 if (is_direntry_le_ih (pasted))
1030 {
1031 leaf_paste_entries (
1032 bi.bi_bh, item_pos - n + snum[i], pos_in_item, 1,
1033 (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0]
1034 );
1035 }
1036
1037 /* if we paste to indirect item update ih_free_space */
1038 if (is_indirect_le_ih (pasted))
1039 set_ih_free_space (pasted, 0);
1040 zeros_num = tb->insert_size[0] = 0;
1041 }
1042 }
1043
1044 else /* pasted item doesn't fall into S_new[i] */
1045 {
1046 leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]);
1047 }
1048 break;
1049 default: /* cases d and t */
1050	    reiserfs_panic (tb->tb_sb, "PAP-12245: balance_leaf: blknum > 2: unexpected mode: %s(%d)",
1051 (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
1052 }
1053
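	/* the first key of each new node, together with a pointer to the
	   node itself, must be inserted into the parent level; record
	   both for balance_internal() */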
1054 memcpy (insert_key + i,B_N_PKEY(S_new[i],0),KEY_SIZE);
1055 insert_ptr[i] = S_new[i];
1056
1057 RFALSE (!buffer_journaled (S_new [i]) || buffer_journal_dirty (S_new [i]) ||
1058 buffer_dirty (S_new [i]),
1059 "PAP-12247: S_new[%d] : (%b)", i, S_new[i]);
1060 }
1061
1062    /* if the affected item was not wholly shifted then we perform all necessary operations on the part (or whole) of the
1063       affected item that remains in S */
1064 if ( 0 <= item_pos && item_pos < tb->s0num )
1065 { /* if we must insert or append into buffer S[0] */
1066
1067 switch (flag)
1068 {
1069 case M_INSERT: /* insert item into S[0] */
1070 bi.tb = tb;
1071 bi.bi_bh = tbS0;
1072 bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
1073 bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
1074 leaf_insert_into_buf (&bi, item_pos, ih, body, zeros_num);
1075
1076 /* If we insert the first key change the delimiting key */
1077 if( item_pos == 0 ) {
1078 if (tb->CFL[0]) /* can be 0 in reiserfsck */
1079 replace_key(tb, tb->CFL[0], tb->lkey[0],tbS0,0);
1080
1081 }
1082 break;
1083
1084 case M_PASTE: { /* append item in S[0] */
1085 struct item_head * pasted;
1086
1087 pasted = B_N_PITEM_HEAD (tbS0, item_pos);
1088	    /* when a directory, the new entry may already have been pasted */
1089 if (is_direntry_le_ih (pasted)) {
1090 if ( pos_in_item >= 0 &&
1091 pos_in_item <= ih_entry_count(pasted) ) {
1092
1093 RFALSE( ! tb->insert_size[0],
1094 "PAP-12260: insert_size is 0 already");
1095
1096 /* prepare space */
1097 bi.tb = tb;
1098 bi.bi_bh = tbS0;
1099 bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
1100 bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
1101 leaf_paste_in_buffer(&bi, item_pos, pos_in_item, tb->insert_size[0], body, zeros_num);
1102
1103 /* paste entry */
1104 leaf_paste_entries (
1105 bi.bi_bh, item_pos, pos_in_item, 1, (struct reiserfs_de_head *)body,
1106 body + DEH_SIZE, tb->insert_size[0]
1107 );
1108 if ( ! item_pos && ! pos_in_item ) {
1109 RFALSE( !tb->CFL[0] || !tb->L[0],
1110 "PAP-12270: CFL[0]/L[0] must be specified");
1111 if (tb->CFL[0]) {
1112 replace_key(tb, tb->CFL[0], tb->lkey[0],tbS0,0);
1113
1114 }
1115 }
1116 tb->insert_size[0] = 0;
1117 }
1118 } else { /* regular object */
1119 if ( pos_in_item == ih_item_len(pasted) ) {
1120
1121 RFALSE( tb->insert_size[0] <= 0,
1122 "PAP-12275: insert size must not be %d",
1123 tb->insert_size[0]);
1124 bi.tb = tb;
1125 bi.bi_bh = tbS0;
1126 bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
1127 bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
1128 leaf_paste_in_buffer (&bi, item_pos, pos_in_item, tb->insert_size[0], body, zeros_num);
1129
1130 if (is_indirect_le_ih (pasted)) {
1131#if 0
1132 RFALSE( tb->insert_size[0] != UNFM_P_SIZE,
1133 "PAP-12280: insert_size for indirect item must be %d, not %d",
1134 UNFM_P_SIZE, tb->insert_size[0]);
1135#endif
1136 set_ih_free_space (pasted, 0);
1137 }
1138 tb->insert_size[0] = 0;
1139 }
1140
1141#ifdef CONFIG_REISERFS_CHECK
1142 else {
1143 if ( tb->insert_size[0] ) {
1144 print_cur_tb ("12285");
1145 reiserfs_panic (tb->tb_sb, "PAP-12285: balance_leaf: insert_size must be 0 (%d)", tb->insert_size[0]);
1146 }
1147 }
1148#endif /* CONFIG_REISERFS_CHECK */
1149
1150 }
1151 } /* case M_PASTE: */
1152 }
1153 }
1154
1155#ifdef CONFIG_REISERFS_CHECK
1156 if ( flag == M_PASTE && tb->insert_size[0] ) {
1157 print_cur_tb ("12290");
1158 reiserfs_panic (tb->tb_sb, "PAP-12290: balance_leaf: insert_size is still not 0 (%d)", tb->insert_size[0]);
1159 }
1160#endif /* CONFIG_REISERFS_CHECK */
1161
1162 return 0;
1163} /* Leaf level of the tree is balanced (end of balance_leaf) */
1164
1165
1166
1167/* Make empty node */
1168void make_empty_node (struct buffer_info * bi)
1169{
1170 struct block_head * blkh;
1171
1172 RFALSE( bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL");
1173
1174 blkh = B_BLK_HEAD(bi->bi_bh);
1175 set_blkh_nr_item( blkh, 0 );
1176 set_blkh_free_space( blkh, MAX_CHILD_SIZE(bi->bi_bh) );
1177
1178 if (bi->bi_parent)
1179 B_N_CHILD (bi->bi_parent, bi->bi_position)->dc_size = 0; /* Endian safe if 0 */
1180}
1181
1182
1183/* Get first empty buffer */
1184struct buffer_head * get_FEB (struct tree_balance * tb)
1185{
1186 int i;
1187 struct buffer_head * first_b;
1188 struct buffer_info bi;
1189
1190 for (i = 0; i < MAX_FEB_SIZE; i ++)
1191 if (tb->FEB[i] != 0)
1192 break;
1193
1194 if (i == MAX_FEB_SIZE)
1195 reiserfs_panic(tb->tb_sb, "vs-12300: get_FEB: FEB list is empty");
1196
1197 bi.tb = tb;
1198 bi.bi_bh = first_b = tb->FEB[i];
1199 bi.bi_parent = NULL;
1200 bi.bi_position = 0;
1201 make_empty_node (&bi);
1202 set_buffer_uptodate(first_b);
1203 tb->FEB[i] = NULL;
1204 tb->used[i] = first_b;
1205
1206 return(first_b);
1207}
1208
1209
1210/* This is now used because reiserfs_free_block has to be able to
1211** schedule.
1212*/
1213static void store_thrown (struct tree_balance * tb, struct buffer_head * bh)
1214{
1215 int i;
1216
1217 if (buffer_dirty (bh))
1218 reiserfs_warning (tb->tb_sb, "store_thrown deals with dirty buffer");
1219 for (i = 0; i < sizeof (tb->thrown)/sizeof (tb->thrown[0]); i ++)
1220 if (!tb->thrown[i]) {
1221 tb->thrown[i] = bh;
1222 get_bh(bh) ; /* free_thrown puts this */
1223 return;
1224 }
1225 reiserfs_warning (tb->tb_sb, "store_thrown: too many thrown buffers");
1226}
1227
1228static void free_thrown(struct tree_balance *tb) {
1229 int i ;
1230 b_blocknr_t blocknr ;
1231 for (i = 0; i < sizeof (tb->thrown)/sizeof (tb->thrown[0]); i++) {
1232 if (tb->thrown[i]) {
1233 blocknr = tb->thrown[i]->b_blocknr ;
1234 if (buffer_dirty (tb->thrown[i]))
1235 reiserfs_warning (tb->tb_sb,
1236 "free_thrown deals with dirty buffer %d",
1237 blocknr);
1238 brelse(tb->thrown[i]) ; /* incremented in store_thrown */
1239 reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0);
1240 }
1241 }
1242}
1243
1244void reiserfs_invalidate_buffer (struct tree_balance * tb, struct buffer_head * bh)
1245{
1246 struct block_head *blkh;
1247 blkh = B_BLK_HEAD(bh);
1248 set_blkh_level( blkh, FREE_LEVEL );
1249 set_blkh_nr_item( blkh, 0 );
1250
1251 clear_buffer_dirty(bh);
1252 store_thrown (tb, bh);
1253}
1254
1255/* Replace n_dest'th key in buffer dest by n_src'th key of buffer src.*/
1256void replace_key (struct tree_balance * tb, struct buffer_head * dest, int n_dest,
1257 struct buffer_head * src, int n_src)
1258{
1259
1260 RFALSE( dest == NULL || src == NULL,
1261 "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)",
1262 src, dest);
1263 RFALSE( ! B_IS_KEYS_LEVEL (dest),
1264	  "vs-12310: invalid level (%z) for destination buffer. dest must be internal",
1265 dest);
1266 RFALSE( n_dest < 0 || n_src < 0,
1267 "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest);
1268 RFALSE( n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src),
1269 "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big",
1270 n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest));
1271
1272 if (B_IS_ITEMS_LEVEL (src))
1273 /* source buffer contains leaf node */
1274 memcpy (B_N_PDELIM_KEY(dest,n_dest), B_N_PITEM_HEAD(src,n_src), KEY_SIZE);
1275 else
1276 memcpy (B_N_PDELIM_KEY(dest,n_dest), B_N_PDELIM_KEY(src,n_src), KEY_SIZE);
1277
1278 do_balance_mark_internal_dirty (tb, dest, 0);
1279}
1280
1281
1282int get_left_neighbor_position (
1283 struct tree_balance * tb,
1284 int h
1285 )
1286{
1287 int Sh_position = PATH_H_POSITION (tb->tb_path, h + 1);
1288
1289 RFALSE( PATH_H_PPARENT (tb->tb_path, h) == 0 || tb->FL[h] == 0,
1290 "vs-12325: FL[%d](%p) or F[%d](%p) does not exist",
1291 h, tb->FL[h], h, PATH_H_PPARENT (tb->tb_path, h));
1292
1293 if (Sh_position == 0)
1294 return B_NR_ITEMS (tb->FL[h]);
1295 else
1296 return Sh_position - 1;
1297}
1298
1299
1300int get_right_neighbor_position (struct tree_balance * tb, int h)
1301{
1302 int Sh_position = PATH_H_POSITION (tb->tb_path, h + 1);
1303
1304 RFALSE( PATH_H_PPARENT (tb->tb_path, h) == 0 || tb->FR[h] == 0,
1305 "vs-12330: F[%d](%p) or FR[%d](%p) does not exist",
1306 h, PATH_H_PPARENT (tb->tb_path, h), h, tb->FR[h]);
1307
1308 if (Sh_position == B_NR_ITEMS (PATH_H_PPARENT (tb->tb_path, h)))
1309 return 0;
1310 else
1311 return Sh_position + 1;
1312}
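/* An illustrative example (hypothetical position): if S[h] is child 3
   of its parent, PATH_H_POSITION(tb->tb_path, h + 1) == 3, so the left
   neighbor is child 2 and the right neighbor child 4 of that parent;
   the boundary positions 0 and B_NR_ITEMS() fall through to the common
   parents FL[h] and FR[h] handled above. */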
1313
1314
1315#ifdef CONFIG_REISERFS_CHECK
1316
1317int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value);
1318static void check_internal_node (struct super_block * s, struct buffer_head * bh, char * mes)
1319{
1320 struct disk_child * dc;
1321 int i;
1322
1323 RFALSE( !bh, "PAP-12336: bh == 0");
1324
1325 if (!bh || !B_IS_IN_TREE (bh))
1326 return;
1327
1328 RFALSE( !buffer_dirty (bh) &&
1329 !(buffer_journaled(bh) || buffer_journal_dirty(bh)),
1330 "PAP-12337: buffer (%b) must be dirty", bh);
1331 dc = B_N_CHILD (bh, 0);
1332
1333 for (i = 0; i <= B_NR_ITEMS (bh); i ++, dc ++) {
1334 if (!is_reusable (s, dc_block_number(dc), 1) ) {
1335 print_cur_tb (mes);
1336 reiserfs_panic (s, "PAP-12338: check_internal_node: invalid child pointer %y in %b", dc, bh);
1337 }
1338 }
1339}
1340
1341
1342static int locked_or_not_in_tree (struct buffer_head * bh, char * which)
1343{
1344 if ( (!buffer_journal_prepared (bh) && buffer_locked (bh)) ||
1345 !B_IS_IN_TREE (bh) ) {
1346 reiserfs_warning (NULL, "vs-12339: locked_or_not_in_tree: %s (%b)",
1347 which, bh);
1348 return 1;
1349 }
1350 return 0;
1351}
1352
1353
1354static int check_before_balancing (struct tree_balance * tb)
1355{
1356 int retval = 0;
1357
1358 if ( cur_tb ) {
1359 reiserfs_panic (tb->tb_sb, "vs-12335: check_before_balancing: "
1360 "suspect that schedule occurred based on cur_tb not being null at this point in code. "
1361 "do_balance cannot properly handle schedule occurring while it runs.");
1362 }
1363
1364 /* double check that buffers that we will modify are unlocked. (fix_nodes should already have
1365 prepped all of these for us). */
1366 if ( tb->lnum[0] ) {
1367 retval |= locked_or_not_in_tree (tb->L[0], "L[0]");
1368 retval |= locked_or_not_in_tree (tb->FL[0], "FL[0]");
1369 retval |= locked_or_not_in_tree (tb->CFL[0], "CFL[0]");
1370 check_leaf (tb->L[0]);
1371 }
1372 if ( tb->rnum[0] ) {
1373 retval |= locked_or_not_in_tree (tb->R[0], "R[0]");
1374 retval |= locked_or_not_in_tree (tb->FR[0], "FR[0]");
1375 retval |= locked_or_not_in_tree (tb->CFR[0], "CFR[0]");
1376 check_leaf (tb->R[0]);
1377 }
1378 retval |= locked_or_not_in_tree (PATH_PLAST_BUFFER (tb->tb_path), "S[0]");
1379 check_leaf (PATH_PLAST_BUFFER (tb->tb_path));
1380
1381 return retval;
1382}
1383
1384
1385static void check_after_balance_leaf (struct tree_balance * tb)
1386{
1387 if (tb->lnum[0]) {
1388 if (B_FREE_SPACE (tb->L[0]) !=
1389 MAX_CHILD_SIZE (tb->L[0]) - dc_size(B_N_CHILD (tb->FL[0], get_left_neighbor_position (tb, 0)))) {
1390 print_cur_tb ("12221");
1391 reiserfs_panic (tb->tb_sb, "PAP-12355: check_after_balance_leaf: shift to left was incorrect");
1392 }
1393 }
1394 if (tb->rnum[0]) {
1395 if (B_FREE_SPACE (tb->R[0]) !=
1396 MAX_CHILD_SIZE (tb->R[0]) - dc_size(B_N_CHILD (tb->FR[0], get_right_neighbor_position (tb, 0)))) {
1397 print_cur_tb ("12222");
1398 reiserfs_panic (tb->tb_sb, "PAP-12360: check_after_balance_leaf: shift to right was incorrect");
1399 }
1400 }
1401 if (PATH_H_PBUFFER(tb->tb_path,1) &&
1402 (B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) !=
1403 (MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)) -
1404 dc_size(B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1),
1405 PATH_H_POSITION (tb->tb_path, 1)))) )) {
1406 int left = B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0));
1407 int right = (MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)) -
1408 dc_size(B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1),
1409 PATH_H_POSITION (tb->tb_path, 1))));
1410 print_cur_tb ("12223");
1411 reiserfs_warning (tb->tb_sb,
1412 "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; "
1413 "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d",
1414 left,
1415 MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)),
1416 PATH_H_PBUFFER(tb->tb_path,1),
1417 PATH_H_POSITION (tb->tb_path, 1),
1418 dc_size(B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1), PATH_H_POSITION (tb->tb_path, 1 )) ),
1419 right );
1420 reiserfs_panic (tb->tb_sb, "PAP-12365: check_after_balance_leaf: S is incorrect");
1421 }
1422}
1423
1424
1425static void check_leaf_level (struct tree_balance * tb)
1426{
1427 check_leaf (tb->L[0]);
1428 check_leaf (tb->R[0]);
1429 check_leaf (PATH_PLAST_BUFFER (tb->tb_path));
1430}
1431
1432static void check_internal_levels (struct tree_balance * tb)
1433{
1434 int h;
1435
1436 /* check all internal nodes */
1437 for (h = 1; tb->insert_size[h]; h ++) {
1438 check_internal_node (tb->tb_sb, PATH_H_PBUFFER (tb->tb_path, h), "BAD BUFFER ON PATH");
1439 if (tb->lnum[h])
1440 check_internal_node (tb->tb_sb, tb->L[h], "BAD L");
1441 if (tb->rnum[h])
1442 check_internal_node (tb->tb_sb, tb->R[h], "BAD R");
1443 }
1444
1445}
1446
1447#endif
1448
1449
1450
1451
1452
1453
1454/* Now we have all of the buffers that must be used in balancing of
1455 the tree. We rely on the assumption that schedule() will not occur
1456   while do_balance works. (Only interrupt handlers are acceptable.)
1457 We balance the tree according to the analysis made before this,
1458 using buffers already obtained. For SMP support it will someday be
1459 necessary to add ordered locking of tb. */
1460
1461/* Some interesting rules of balancing:
1462
1463 we delete a maximum of two nodes per level per balancing: we never
1464 delete R, when we delete two of three nodes L, S, R then we move
1465 them into R.
1466
1467 we only delete L if we are deleting two nodes, if we delete only
1468 one node we delete S
1469
1470 if we shift leaves then we shift as much as we can: this is a
1471 deliberate policy of extremism in node packing which results in
1472 higher average utilization after repeated random balance operations
1473 at the cost of more memory copies and more balancing as a result of
1474 small insertions to full nodes.
1475
1476 if we shift internal nodes we try to evenly balance the node
1477   utilization, with consequently less balancing at the cost of lower
1478 utilization.
1479
1480 one could argue that the policy for directories in leaves should be
1481 that of internal nodes, but we will wait until another day to
1482 evaluate this.... It would be nice to someday measure and prove
1483 these assumptions as to what is optimal....
1484
1485*/
1486
1487static inline void do_balance_starts (struct tree_balance *tb)
1488{
1489 /* use print_cur_tb() to see initial state of struct
1490 tree_balance */
1491
1492 /* store_print_tb (tb); */
1493
1494 /* do not delete, just comment it out */
1495/* print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb,
1496 "check");*/
1497 RFALSE( check_before_balancing (tb), "PAP-12340: locked buffers in TB");
1498#ifdef CONFIG_REISERFS_CHECK
1499 cur_tb = tb;
1500#endif
1501}
1502
1503
1504static inline void do_balance_completed (struct tree_balance * tb)
1505{
1506
1507#ifdef CONFIG_REISERFS_CHECK
1508 check_leaf_level (tb);
1509 check_internal_levels (tb);
1510 cur_tb = NULL;
1511#endif
1512
1513 /* reiserfs_free_block is no longer schedule safe. So, we need to
1514 ** put the buffers we want freed on the thrown list during do_balance,
1515 ** and then free them now
1516 */
1517
1518 REISERFS_SB(tb->tb_sb)->s_do_balance ++;
1519
1520
1521    /* release all nodes held to perform the balancing */
1522 unfix_nodes(tb);
1523
1524 free_thrown(tb) ;
1525}
1526
1527
1528
1529
1530
1531void do_balance (struct tree_balance * tb, /* tree_balance structure */
1532 struct item_head * ih, /* item header of inserted item */
1533 const char * body, /* body of inserted item or bytes to paste */
1534 int flag) /* i - insert, d - delete
1535 c - cut, p - paste
1536
1537 Cut means delete part of an item
1538 (includes removing an entry from a
1539 directory).
1540
1541 Delete means delete whole item.
1542
1543 Insert means add a new item into the
1544 tree.
1545
1546 Paste means to append to the end of an
1547 existing file or to insert a directory
1548 entry. */
1549{
1550 int child_pos, /* position of a child node in its parent */
1551 h; /* level of the tree being processed */
1552 struct item_head insert_key[2]; /* in our processing of one level
1553 we sometimes determine what
1554 must be inserted into the next
1555 higher level. This insertion
1556 consists of a key or two keys
1557 and their corresponding
1558 pointers */
1559 struct buffer_head *insert_ptr[2]; /* inserted node-ptrs for the next
1560 level */
1561
1562 tb->tb_mode = flag;
1563 tb->need_balance_dirty = 0;
1564
1565 if (FILESYSTEM_CHANGED_TB(tb)) {
1566 reiserfs_panic(tb->tb_sb, "clm-6000: do_balance, fs generation has changed\n") ;
1567 }
1568 /* if we have no real work to do */
1569 if ( ! tb->insert_size[0] ) {
1570 reiserfs_warning (tb->tb_sb,
1571 "PAP-12350: do_balance: insert_size == 0, mode == %c",
1572 flag);
1573 unfix_nodes(tb);
1574 return;
1575 }
1576
1577 atomic_inc (&(fs_generation (tb->tb_sb)));
1578 do_balance_starts (tb);
1579
1580    /* balance_leaf returns 0 except when combining L, R and S into
1581       one node.  See balance_internal() for an explanation of this
1582       line of code. */
1583 child_pos = PATH_H_B_ITEM_ORDER (tb->tb_path, 0) +
1584 balance_leaf (tb, ih, body, flag, insert_key, insert_ptr);
1585
1586#ifdef CONFIG_REISERFS_CHECK
1587 check_after_balance_leaf (tb);
1588#endif
1589
1590 /* Balance internal level of the tree. */
1591 for ( h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++ )
1592 child_pos = balance_internal (tb, h, child_pos, insert_key, insert_ptr);
1593
1594
1595 do_balance_completed (tb);
1596
1597}
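
/* A condensed usage sketch (hypothetical, modelled on callers such as
   reiserfs_insert_item() in stree.c).  do_balance() is always the last
   step, entered only after fix_nodes() has collected every buffer the
   operation may touch:

	struct tree_balance tb;
	// ... fill in tb.tb_sb, tb.tb_path, tb.insert_size[0], ...
	if (fix_nodes (M_INSERT, &tb, ih, body) == CARRY_ON)
		do_balance (&tb, ih, body, M_INSERT);
	else
		unfix_nodes (&tb);
*/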
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
new file mode 100644
index 000000000000..26950113af8c
--- /dev/null
+++ b/fs/reiserfs/file.c
@@ -0,0 +1,1408 @@
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5
6#include <linux/time.h>
7#include <linux/reiserfs_fs.h>
8#include <linux/reiserfs_acl.h>
9#include <linux/reiserfs_xattr.h>
10#include <linux/smp_lock.h>
11#include <asm/uaccess.h>
12#include <linux/pagemap.h>
13#include <linux/swap.h>
14#include <linux/writeback.h>
15#include <linux/blkdev.h>
16#include <linux/buffer_head.h>
17#include <linux/quotaops.h>
18
19/*
20** We pack the tails of files on file close, not at the time they are written.
21** This implies an unnecessary copy of the tail and an unnecessary indirect item
22** insertion/balancing, for files that are written in one write.
23** It avoids unnecessary tail packings (balances) for files that are written in
24** multiple writes and are small enough to have tails.
25**
26** file_release is called by the VFS layer when the file is closed. If
27** this is the last open file descriptor, and the file is
28** small enough to have a tail, and the tail is currently in an
29** unformatted node, the tail is converted back into a direct item.
30**
31** We use reiserfs_truncate_file to pack the tail, since it already has
32** all the conditions coded.
33*/
34static int reiserfs_file_release (struct inode * inode, struct file * filp)
35{
36
37 struct reiserfs_transaction_handle th ;
38 int err;
39 int jbegin_failure = 0;
40
41 if (!S_ISREG (inode->i_mode))
42 BUG ();
43
44 /* fast out for when nothing needs to be done */
45 if ((atomic_read(&inode->i_count) > 1 ||
46 !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
47 !tail_has_to_be_packed(inode)) &&
48 REISERFS_I(inode)->i_prealloc_count <= 0) {
49 return 0;
50 }
51
52 reiserfs_write_lock(inode->i_sb);
53 down (&inode->i_sem);
54 /* freeing preallocation only involves relogging blocks that
55 * are already in the current transaction. preallocation gets
56 * freed at the end of each transaction, so it is impossible for
57 * us to log any additional blocks (including quota blocks)
58 */
59 err = journal_begin(&th, inode->i_sb, 1);
60 if (err) {
61 /* uh oh, we can't allow the inode to go away while there
62 * are still preallocated blocks pending. Try to join the
63 * aborted transaction
64 */
65 jbegin_failure = err;
66 err = journal_join_abort(&th, inode->i_sb, 1);
67
68 if (err) {
69 /* hmpf, our choices here aren't good. We can pin the inode,
70 * which will disallow unmount from ever happening; we can
71 * do nothing, which will corrupt random memory on unmount;
72 * or we can forcibly remove the file from the preallocation
73 * list, which will leak blocks on disk. Let's pin the inode
74 * and let the admin know what is going on.
75 */
76 igrab(inode);
77 reiserfs_warning(inode->i_sb, "pinning inode %lu because the "
78 "preallocation can't be freed", inode->i_ino);
79 goto out;
80 }
81 }
82 reiserfs_update_inode_transaction(inode) ;
83
84#ifdef REISERFS_PREALLOCATE
85 reiserfs_discard_prealloc (&th, inode);
86#endif
87 err = journal_end(&th, inode->i_sb, 1);
88
89 /* copy back the error code from journal_begin */
90 if (!err)
91 err = jbegin_failure;
92
93 if (!err && atomic_read(&inode->i_count) <= 1 &&
94 (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
95 tail_has_to_be_packed (inode)) {
96 /* if a regular file is released by its last holder and it has been
97 appended (we append by unformatted node only) or its direct
98 item(s) had to be converted, then it may have to be
99 indirect2direct converted */
100 err = reiserfs_truncate_file(inode, 0) ;
101 }
102out:
103 up (&inode->i_sem);
104 reiserfs_write_unlock(inode->i_sb);
105 return err;
106}
107
108static void reiserfs_vfs_truncate_file(struct inode *inode) {
109 reiserfs_truncate_file(inode, 1) ;
110}
111
112/* Sync a reiserfs file. */
113
114/*
115 * FIXME: sync_mapping_buffers() never has anything to sync. Can
116 * be removed...
117 */
118
119static int reiserfs_sync_file(
120 struct file * p_s_filp,
121 struct dentry * p_s_dentry,
122 int datasync
123 ) {
124 struct inode * p_s_inode = p_s_dentry->d_inode;
125 int n_err;
126 int barrier_done;
127
128 if (!S_ISREG(p_s_inode->i_mode))
129 BUG ();
130 n_err = sync_mapping_buffers(p_s_inode->i_mapping) ;
131 reiserfs_write_lock(p_s_inode->i_sb);
132 barrier_done = reiserfs_commit_for_inode(p_s_inode);
133 reiserfs_write_unlock(p_s_inode->i_sb);
134 if (barrier_done != 1)
135 blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
136 if (barrier_done < 0)
137 return barrier_done;
138 return ( n_err < 0 ) ? -EIO : 0;
139}
140
141/* I really do not want to play with memory shortage right now, so
142 to simplify the code, we are not going to write more than this many pages at
143 a time. This still should considerably improve performance compared to the
144 4k-at-a-time case. This is 32 pages of 4k size. */
145#define REISERFS_WRITE_PAGES_AT_A_TIME ((128 * 1024) / PAGE_CACHE_SIZE)
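/* Illustrative arithmetic: with the common 4k PAGE_CACHE_SIZE this evaluates
 * to (128 * 1024) / 4096 = 32 pages per pass (16 with 8k pages). The
 * prepared_pages array in reiserfs_file_write() below is sized by this
 * constant. */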
146
147/* Allocates blocks for a file to fulfil a write request.
148 Maps all unmapped but prepared pages from the list.
149 Updates metadata with newly allocated block numbers as needed */
150static int reiserfs_allocate_blocks_for_region(
151 struct reiserfs_transaction_handle *th,
152 struct inode *inode, /* Inode we work with */
153 loff_t pos, /* Writing position */
154 int num_pages, /* number of pages the write is
155 going to touch */
156 int write_bytes, /* number of bytes to write */
157 struct page **prepared_pages, /* array of
158 prepared pages
159 */
160 int blocks_to_allocate /* Amount of blocks we
161 need to allocate to
162 fit the data into file
163 */
164 )
165{
166 struct cpu_key key; // cpu key of item that we are going to deal with
167 struct item_head *ih; // pointer to item head that we are going to deal with
168 struct buffer_head *bh; // Buffer head that contains items that we are going to deal with
169 __u32 * item; // pointer to item we are going to deal with
170 INITIALIZE_PATH(path); // path to item, that we are going to deal with.
171 b_blocknr_t *allocated_blocks; // Pointer to a place where allocated block numbers will be stored.
172 reiserfs_blocknr_hint_t hint; // hint structure for block allocator.
173 int res; // return value of various functions that we call; may hold a negative errno.
174 int curr_block; // current block used to keep track of unmapped blocks.
175 int i; // loop counter
176 int itempos; // position in item
177 unsigned int from = (pos & (PAGE_CACHE_SIZE - 1)); // writing position in
178 // first page
179 unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1; /* last modified byte offset in last page */
180 __u64 hole_size ; // number of blocks for a file hole, if one needs to be created.
181 int modifying_this_item = 0; // Flag for items traversal code to keep track
182 // of the fact that we already prepared
183 // current block for journal
184 int will_prealloc = 0;
185 RFALSE(!blocks_to_allocate, "green-9004: tried to allocate zero blocks?");
186
187 /* only preallocate if this is a small write */
188 if (REISERFS_I(inode)->i_prealloc_count ||
189 (!(write_bytes & (inode->i_sb->s_blocksize -1)) &&
190 blocks_to_allocate <
191 REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
192 will_prealloc = REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;
193
194 allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
195 sizeof(b_blocknr_t), GFP_NOFS);
    if (!allocated_blocks) // bail out early; this allocation was previously unchecked
        return -ENOMEM;
196
197 /* First we compose a key to point at the writing position, we want to do
198 that outside of any locking region. */
199 make_cpu_key (&key, inode, pos+1, TYPE_ANY, 3/*key length*/);
200
201 /* If we came here, it means we absolutely need to open a transaction,
202 since we need to allocate some blocks */
203 reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that.
204 res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS); // Wish I knew if this number is enough
205 if (res)
206 goto error_exit;
207 reiserfs_update_inode_transaction(inode) ;
208
209 /* Look for the in-tree position of our write, need path for block allocator */
210 res = search_for_position_by_key(inode->i_sb, &key, &path);
211 if ( res == IO_ERROR ) {
212 res = -EIO;
213 goto error_exit;
214 }
215
216 /* Allocate blocks */
217 /* First fill in "hint" structure for block allocator */
218 hint.th = th; // transaction handle.
219 hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine.
220 hint.inode = inode; // Inode is needed by block allocator too.
221 hint.search_start = 0; // We have no hint on where to search free blocks for block allocator.
222 hint.key = key.on_disk_key; // on disk key of file.
223 hint.block = inode->i_blocks>>(inode->i_sb->s_blocksize_bits-9); // Number of disk blocks this file occupies already.
224 hint.formatted_node = 0; // We are allocating blocks for unformatted node.
225 hint.preallocate = will_prealloc;
226
227 /* Call block allocator to allocate blocks */
228 res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
229 if ( res != CARRY_ON ) {
230 if ( res == NO_DISK_SPACE ) {
231 /* We flush the transaction in case of no space. This way some
232 blocks might become free */
233 SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
234 res = restart_transaction(th, inode, &path);
235 if (res)
236 goto error_exit;
237
238 /* We might have scheduled, so search again */
239 res = search_for_position_by_key(inode->i_sb, &key, &path);
240 if ( res == IO_ERROR ) {
241 res = -EIO;
242 goto error_exit;
243 }
244
245 /* update changed info for hint structure. */
246 res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
247 if ( res != CARRY_ON ) {
248 res = -ENOSPC;
249 pathrelse(&path);
250 goto error_exit;
251 }
252 } else {
253 res = -ENOSPC;
254 pathrelse(&path);
255 goto error_exit;
256 }
257 }
258
259#ifdef __BIG_ENDIAN
260 // Too bad, I have not found any way to convert a given region from
261 // cpu format to little endian format
262 {
263 int i;
264 for ( i = 0; i < blocks_to_allocate ; i++)
265 allocated_blocks[i]=cpu_to_le32(allocated_blocks[i]);
266 }
267#endif
268
269 /* Block allocation might well have scheduled and the tree might have
270 changed, let's search the tree again */
271 /* find where in the tree our write should go */
272 res = search_for_position_by_key(inode->i_sb, &key, &path);
273 if ( res == IO_ERROR ) {
274 res = -EIO;
275 goto error_exit_free_blocks;
276 }
277
278 bh = get_last_bh( &path ); // Get a bufferhead for last element in path.
279 ih = get_ih( &path ); // Get a pointer to last item head in path.
280 item = get_item( &path ); // Get a pointer to last item in path
281
282 /* Let's see what we have found */
283 if ( res != POSITION_FOUND ) { /* position not found, this means that we
284 might need to append the file with
285 holes first */
286 // Since we are writing past the file's end, we need to find out if
287 // there is a hole that needs to be inserted before our writing
288 // position, and how many blocks it is going to cover (we need to
289 // populate pointers to file blocks representing the hole with zeros)
290
291 {
292 int item_offset = 1;
293 /*
294 * if ih is stat data, its offset is 0 and we don't want to
295 * add 1 to pos in the hole_size calculation
296 */
297 if (is_statdata_le_ih(ih))
298 item_offset = 0;
299 hole_size = (pos + item_offset -
300 (le_key_k_offset( get_inode_item_key_version(inode),
301 &(ih->ih_key)) +
302 op_bytes_number(ih, inode->i_sb->s_blocksize))) >>
303 inode->i_sb->s_blocksize_bits;
304 }
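    /* Illustrative example, assuming a 4k blocksize: if the file's last
       indirect item starts at key offset 1 and covers 8192 bytes
       (op_bytes_number() == 8192), then a write at pos == 20480 gives
       hole_size = (20480 + 1 - (1 + 8192)) >> 12 = 3 blocks of zero
       pointers to paste in before the write can proceed. */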
305
306 if ( hole_size > 0 ) {
307 int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE ); // How much data to insert first time.
308 /* area filled with zeroes, to supply as a list of zero block numbers.
309 We allocate it outside of the loop just in case the loop would spin
310 for several iterations. */
311 char *zeros = kmalloc(to_paste*UNFM_P_SIZE, GFP_ATOMIC); // We cannot insert more than MAX_ITEM_LEN bytes anyway.
312 if ( !zeros ) {
313 res = -ENOMEM;
314 goto error_exit_free_blocks;
315 }
316 memset ( zeros, 0, to_paste*UNFM_P_SIZE);
317 do {
318 to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE );
319 if ( is_indirect_le_ih(ih) ) {
320 /* Ok, there is an existing indirect item already. We need to append to it */
321 /* Calculate position past inserted item */
322 make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
323 res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)zeros, UNFM_P_SIZE*to_paste);
324 if ( res ) {
325 kfree(zeros);
326 goto error_exit_free_blocks;
327 }
328 } else if ( is_statdata_le_ih(ih) ) {
329 /* No existing item, create it */
330 /* item head for new item */
331 struct item_head ins_ih;
332
333 /* create a key for our new item */
334 make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3);
335
336 /* Create new item head for our new item */
337 make_le_item_head (&ins_ih, &key, key.version, 1,
338 TYPE_INDIRECT, to_paste*UNFM_P_SIZE,
339 0 /* free space */);
340
341 /* Find where such item should live in the tree */
342 res = search_item (inode->i_sb, &key, &path);
343 if ( res != ITEM_NOT_FOUND ) {
344 /* item should not exist, otherwise we have error */
345 if ( res != -ENOSPC ) {
346 reiserfs_warning (inode->i_sb,
347 "green-9008: search_by_key (%K) returned %d",
348 &key, res);
349 }
350 res = -EIO;
351 kfree(zeros);
352 goto error_exit_free_blocks;
353 }
354 res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)zeros);
355 } else {
356 reiserfs_panic(inode->i_sb, "green-9011: Unexpected key type %K\n", &key);
357 }
358 if ( res ) {
359 kfree(zeros);
360 goto error_exit_free_blocks;
361 }
362 /* Now we want to check if transaction is too full, and if it is
363 we restart it. This will also free the path. */
364 if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
365 res = restart_transaction(th, inode, &path);
366 if (res) {
367 pathrelse (&path);
368 kfree(zeros);
369 goto error_exit;
370 }
371 }
372
373 /* Well, need to recalculate path and stuff */
374 set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + (to_paste << inode->i_blkbits));
375 res = search_for_position_by_key(inode->i_sb, &key, &path);
376 if ( res == IO_ERROR ) {
377 res = -EIO;
378 kfree(zeros);
379 goto error_exit_free_blocks;
380 }
381 bh=get_last_bh(&path);
382 ih=get_ih(&path);
383 item = get_item(&path);
384 hole_size -= to_paste;
385 } while ( hole_size );
386 kfree(zeros);
387 }
388 }
389
390 // Go through existing indirect items first and
391 // replace all zeroes with block numbers from the list.
392 // Note that if no corresponding item was found by the previous search,
393 // it means there is no existing in-tree representation for the file area
394 // we are going to overwrite, so there is nothing to scan through for holes.
395 for ( curr_block = 0, itempos = path.pos_in_item ; curr_block < blocks_to_allocate && res == POSITION_FOUND ; ) {
396retry:
397
398 if ( itempos >= ih_item_len(ih)/UNFM_P_SIZE ) {
399 /* We ran out of data in this indirect item, let's look for another
400 one. */
401 /* First if we are already modifying current item, log it */
402 if ( modifying_this_item ) {
403 journal_mark_dirty (th, inode->i_sb, bh);
404 modifying_this_item = 0;
405 }
406 /* Then set the key to look for a new indirect item (offset of the old
407 item is added to the old item length) */
408 set_cpu_key_k_offset( &key, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize));
409 /* Search for the position of the new key in the tree. */
410 res = search_for_position_by_key(inode->i_sb, &key, &path);
411 if ( res == IO_ERROR) {
412 res = -EIO;
413 goto error_exit_free_blocks;
414 }
415 bh=get_last_bh(&path);
416 ih=get_ih(&path);
417 item = get_item(&path);
418 itempos = path.pos_in_item;
419 continue; // loop to check all kinds of conditions and so on.
420 }
421 /* Ok, we have the correct position in the item now, so let's see if it
422 represents a file hole (the block number is zero) and fill it if needed */
423 if ( !item[itempos] ) {
424 /* Ok, a hole. Now we need to check if we already prepared this
425 block to be journaled */
426 while ( !modifying_this_item ) { // loop until we succeed
427 /* Well, this item is not journaled yet, so we must prepare
428 it for journal first, before we can change it */
429 struct item_head tmp_ih; // We copy the item head of the found
430 // item here to detect if the fs
431 // changed under us while we were
432 // preparing for the journal.
433 int fs_gen; // We store the fs generation here to find if someone
434 // changed the fs under our feet
435
436 copy_item_head (&tmp_ih, ih); // Remember itemhead
437 fs_gen = get_generation (inode->i_sb); // remember fs generation
438 reiserfs_prepare_for_journal(inode->i_sb, bh, 1); // Prepare a buffer within which indirect item is stored for changing.
439 if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
440 // Sigh, fs was changed under us, we need to look for new
441 // location of item we are working with
442
443 /* unmark the prepared area as journaled and search for its
444 new position */
445 reiserfs_restore_prepared_buffer(inode->i_sb, bh);
446 res = search_for_position_by_key(inode->i_sb, &key, &path);
447 if ( res == IO_ERROR) {
448 res = -EIO;
449 goto error_exit_free_blocks;
450 }
451 bh=get_last_bh(&path);
452 ih=get_ih(&path);
453 item = get_item(&path);
454 itempos = path.pos_in_item;
455 goto retry;
456 }
457 modifying_this_item = 1;
458 }
459 item[itempos] = allocated_blocks[curr_block]; // Assign new block
460 curr_block++;
461 }
462 itempos++;
463 }
464
465 if ( modifying_this_item ) { // We need to log the last-accessed block
466 // if it was modified but not logged yet.
467 journal_mark_dirty (th, inode->i_sb, bh);
468 }
469
470 if ( curr_block < blocks_to_allocate ) {
471 // Oh well, we need to append to the indirect item, or to create an
472 // indirect item if there wasn't any
473 if ( is_indirect_le_ih(ih) ) {
474 // Existing indirect item - append. First calculate key for append
475 // position. We do not need to recalculate path as it should
476 // already point to correct place.
477 make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
478 res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block));
479 if ( res ) {
480 goto error_exit_free_blocks;
481 }
482 } else if (is_statdata_le_ih(ih) ) {
483 // Last found item was statdata. That means we need to create indirect item.
484 struct item_head ins_ih; /* itemhead for new item */
485
486 /* create a key for our new item */
487 make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3); // Position one,
488 // because that's
489 // where first
490 // indirect item
491 // begins
492 /* Create new item head for our new item */
493 make_le_item_head (&ins_ih, &key, key.version, 1, TYPE_INDIRECT,
494 (blocks_to_allocate-curr_block)*UNFM_P_SIZE,
495 0 /* free space */);
496 /* Find where such item should live in the tree */
497 res = search_item (inode->i_sb, &key, &path);
498 if ( res != ITEM_NOT_FOUND ) {
499 /* Well, if we have found such an item already, or some error
500 occurred, we need to warn the user and return an error */
501 if ( res != -ENOSPC ) {
502 reiserfs_warning (inode->i_sb,
503 "green-9009: search_by_key (%K) "
504 "returned %d", &key, res);
505 }
506 res = -EIO;
507 goto error_exit_free_blocks;
508 }
509 /* Insert item into the tree with the data as its body */
510 res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)(allocated_blocks+curr_block));
511 } else {
512 reiserfs_panic(inode->i_sb, "green-9010: unexpected item type for key %K\n",&key);
513 }
514 }
515
516 // Unless we return an error, the caller is responsible
517 // for closing the transaction and for logging
518 // the inode.
519 //
520 pathrelse(&path);
521 /*
522 * clean up preallocation from previous writes
523 * if this is a partial block write
524 */
525 if (write_bytes & (inode->i_sb->s_blocksize -1))
526 reiserfs_discard_prealloc(th, inode);
527 reiserfs_write_unlock(inode->i_sb);
528
529 // go through all the pages/buffers and map the buffers to newly allocated
530 // blocks (so that the system knows where to write these pages later).
531 curr_block = 0;
532 for ( i = 0; i < num_pages ; i++ ) {
533 struct page *page=prepared_pages[i]; //current page
534 struct buffer_head *head = page_buffers(page);// first buffer for a page
535 int block_start, block_end; // in-page offsets for buffers.
536
537 if (!page_buffers(page))
538 reiserfs_panic(inode->i_sb, "green-9005: No buffers for prepared page???");
539
540 /* For each buffer in page */
541 for(bh = head, block_start = 0; bh != head || !block_start;
542 block_start=block_end, bh = bh->b_this_page) {
543 if (!bh)
544 reiserfs_panic(inode->i_sb, "green-9006: Allocated but absent buffer for a page?");
545 block_end = block_start+inode->i_sb->s_blocksize;
546 if (i == 0 && block_end <= from )
547 /* if this buffer is before requested data to map, skip it */
548 continue;
549 if (i == num_pages - 1 && block_start >= to)
550 /* If this buffer is after requested data to map, abort
551 processing of current page */
552 break;
553
554 if ( !buffer_mapped(bh) ) { // Ok, unmapped buffer, need to map it
555 map_bh( bh, inode->i_sb, le32_to_cpu(allocated_blocks[curr_block]));
556 curr_block++;
557 set_buffer_new(bh);
558 }
559 }
560 }
561
562 RFALSE( curr_block > blocks_to_allocate, "green-9007: Used too many blocks? weird");
563
564 kfree(allocated_blocks);
565 return 0;
566
567// Need to deal with transaction here.
568error_exit_free_blocks:
569 pathrelse(&path);
570 // free blocks
571 for( i = 0; i < blocks_to_allocate; i++ )
572 reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]), 1);
573
574error_exit:
575 if (th->t_trans_id) {
576 int err;
577 // update any changes we made to blk count
578 reiserfs_update_sd(th, inode);
579 err = journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS);
580 if (err)
581 res = err;
582 }
583 reiserfs_write_unlock(inode->i_sb);
584 kfree(allocated_blocks);
585
586 return res;
587}
588
589/* Unlock pages prepared by reiserfs_prepare_file_region_for_write */
590static void reiserfs_unprepare_pages(struct page **prepared_pages, /* list of locked pages */
591 size_t num_pages /* number of pages */) {
592 int i; // loop counter
593
594 for (i=0; i < num_pages ; i++) {
595 struct page *page = prepared_pages[i];
596
597 try_to_free_buffers(page);
598 unlock_page(page);
599 page_cache_release(page);
600 }
601}
602
603/* This function will copy data from userspace to the specified pages within
604 the supplied byte range */
605static int reiserfs_copy_from_user_to_file_region(
606 loff_t pos, /* In-file position */
607 int num_pages, /* Number of pages affected */
608 int write_bytes, /* Number of bytes to write */
609 struct page **prepared_pages, /* pointer to
610 array to
611 prepared pages
612 */
613 const char __user *buf /* Pointer to user-supplied
614 data*/
615 )
616{
617 long page_fault=0; // status of copy_from_user.
618 int i; // loop counter.
619 int offset; // offset in page
620
621 for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) {
622 size_t count = min_t(size_t,PAGE_CACHE_SIZE-offset,write_bytes); // How many bytes to write to this page
623 struct page *page=prepared_pages[i]; // Current page we process.
624
625 fault_in_pages_readable( buf, count);
626
627 /* Copy data from userspace to the current page */
628 kmap(page);
629 page_fault = __copy_from_user(page_address(page)+offset, buf, count); // Copy the data.
630 /* Flush processor's dcache for this page */
631 flush_dcache_page(page);
632 kunmap(page);
633 buf+=count;
634 write_bytes-=count;
635
636 if (page_fault)
637 break; // Was there a fault? abort.
638 }
639
640 return page_fault?-EFAULT:0;
641}
642
643/* taken from fs/buffer.c:__block_commit_write */
644int reiserfs_commit_page(struct inode *inode, struct page *page,
645 unsigned from, unsigned to)
646{
647 unsigned block_start, block_end;
648 int partial = 0;
649 unsigned blocksize;
650 struct buffer_head *bh, *head;
651 unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
652 int new;
653 int logit = reiserfs_file_data_log(inode);
654 struct super_block *s = inode->i_sb;
655 int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
656 struct reiserfs_transaction_handle th;
657 int ret = 0;
658
659 th.t_trans_id = 0;
660 blocksize = 1 << inode->i_blkbits;
661
662 if (logit) {
663 reiserfs_write_lock(s);
664 ret = journal_begin(&th, s, bh_per_page + 1);
665 if (ret)
666 goto drop_write_lock;
667 reiserfs_update_inode_transaction(inode);
668 }
669 for(bh = head = page_buffers(page), block_start = 0;
670 bh != head || !block_start;
671 block_start=block_end, bh = bh->b_this_page)
672 {
673
674 new = buffer_new(bh);
675 clear_buffer_new(bh);
676 block_end = block_start + blocksize;
677 if (block_end <= from || block_start >= to) {
678 if (!buffer_uptodate(bh))
679 partial = 1;
680 } else {
681 set_buffer_uptodate(bh);
682 if (logit) {
683 reiserfs_prepare_for_journal(s, bh, 1);
684 journal_mark_dirty(&th, s, bh);
685 } else if (!buffer_dirty(bh)) {
686 mark_buffer_dirty(bh);
687 /* do data=ordered on any page past the end
688 * of file and any buffer marked BH_New.
689 */
690 if (reiserfs_data_ordered(inode->i_sb) &&
691 (new || page->index >= i_size_index)) {
692 reiserfs_add_ordered_list(inode, bh);
693 }
694 }
695 }
696 }
697 if (logit) {
698 ret = journal_end(&th, s, bh_per_page + 1);
699drop_write_lock:
700 reiserfs_write_unlock(s);
701 }
702 /*
703 * If this is a partial write which happened to make all buffers
704 * uptodate then we can optimize away a bogus readpage() for
705 * the next read(). Here we 'discover' whether the page went
706 * uptodate as a result of this (potentially partial) write.
707 */
708 if (!partial)
709 SetPageUptodate(page);
710 return ret;
711}
712
713
714/* Submit pages for write. This was separated from actual file copying
715 because we might want to allocate block numbers in-between.
716 This function assumes that the caller will adjust the file size to the correct value. */
717static int reiserfs_submit_file_region_for_write(
718 struct reiserfs_transaction_handle *th,
719 struct inode *inode,
720 loff_t pos, /* Writing position offset */
721 size_t num_pages, /* Number of pages to write */
722 size_t write_bytes, /* number of bytes to write */
723 struct page **prepared_pages /* list of pages */
724 )
725{
726 int status; // return status of block_commit_write.
727 int retval = 0; // Return value we are going to return.
728 int i; // loop counter
729 int offset; // Writing offset in page.
730 int orig_write_bytes = write_bytes;
731 int sd_update = 0;
732
733 for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) {
734 int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How many bytes to write to this page
735 struct page *page=prepared_pages[i]; // Current page we process.
736
737 status = reiserfs_commit_page(inode, page, offset, offset+count);
738 if ( status )
739 retval = status; // To not overcomplicate matters we are going to
740 // submit all the pages even if there was an
741 // error; we only remember the error status to
742 // report it on exit.
743 write_bytes-=count;
744 }
745 /* now that we've gotten all the ordered buffers marked dirty,
746 * we can safely update i_size and close any running transaction
747 */
748 if ( pos + orig_write_bytes > inode->i_size) {
749 inode->i_size = pos + orig_write_bytes; // Set new size
750 /* If the file has grown so much that tail packing is no
751 * longer possible, reset the "need to pack" flag */
752 if ( (have_large_tails (inode->i_sb) &&
753 inode->i_size > i_block_size (inode)*4) ||
754 (have_small_tails (inode->i_sb) &&
755 inode->i_size > i_block_size(inode)) )
756 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
757 else if ( (have_large_tails (inode->i_sb) &&
758 inode->i_size < i_block_size (inode)*4) ||
759 (have_small_tails (inode->i_sb) &&
760 inode->i_size < i_block_size(inode)) )
761 REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
762
763 if (th->t_trans_id) {
764 reiserfs_write_lock(inode->i_sb);
765 reiserfs_update_sd(th, inode); // And update on-disk metadata
766 reiserfs_write_unlock(inode->i_sb);
767 } else
768 inode->i_sb->s_op->dirty_inode(inode);
769
770 sd_update = 1;
771 }
772 if (th->t_trans_id) {
773 reiserfs_write_lock(inode->i_sb);
774 if (!sd_update)
775 reiserfs_update_sd(th, inode);
776 status = journal_end(th, th->t_super, th->t_blocks_allocated);
777 if (status)
778 retval = status;
779 reiserfs_write_unlock(inode->i_sb);
780 }
781 th->t_trans_id = 0;
782
783 /*
784 * we have to unlock the pages after updating i_size, otherwise
785 * we race with writepage
786 */
787 for ( i = 0; i < num_pages ; i++) {
788 struct page *page=prepared_pages[i];
789 unlock_page(page);
790 mark_page_accessed(page);
791 page_cache_release(page);
792 }
793 return retval;
794}
795
796/* Check if the passed writing region is going to touch the file's tail
797 (if it is present). And if it is, convert the tail to an unformatted node */
798static int reiserfs_check_for_tail_and_convert( struct inode *inode, /* inode to deal with */
799 loff_t pos, /* Writing position */
800 int write_bytes /* number of bytes to write */
801 )
802{
803 INITIALIZE_PATH(path); // needed for search_for_position
804 struct cpu_key key; // Key that would represent last touched writing byte.
805 struct item_head *ih; // item header of found block;
806 int res; // Return value of various functions we call.
807 int cont_expand_offset; // We will put offset for generic_cont_expand here
808 // This can be int just because tails are created
809 // only for small files.
810
811/* this embodies a dependency on a particular tail policy */
812 if ( inode->i_size >= inode->i_sb->s_blocksize*4 ) {
813 /* such big files do not have tails, so we won't bother ourselves
814 looking for a tail, simply return */
815 return 0;
816 }
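    /* Illustrative numbers: with a 4k blocksize the cutoff above is
       16384 bytes, so only files shorter than 16k can possibly end in a
       direct (tail) item and need the search below. */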
817
818 reiserfs_write_lock(inode->i_sb);
819 /* find the item containing the last byte to be written, or if
820 * writing past the end of the file then the last item of the
821 * file (and then we check its type). */
822 make_cpu_key (&key, inode, pos+write_bytes+1, TYPE_ANY, 3/*key length*/);
823 res = search_for_position_by_key(inode->i_sb, &key, &path);
824 if ( res == IO_ERROR ) {
825 reiserfs_write_unlock(inode->i_sb);
826 return -EIO;
827 }
828 ih = get_ih(&path);
829 res = 0;
830 if ( is_direct_le_ih(ih) ) {
831 /* Ok, closest item is file tail (tails are stored in "direct"
832 * items), so we need to unpack it. */
833 /* To not overcomplicate matters, we just call generic_cont_expand,
834 which will in turn call other stuff and finally boil down to
835 reiserfs_get_block(), which will do the necessary conversion. */
836 cont_expand_offset = le_key_k_offset(get_inode_item_key_version(inode), &(ih->ih_key));
837 pathrelse(&path);
838 res = generic_cont_expand( inode, cont_expand_offset);
839 } else
840 pathrelse(&path);
841
842 reiserfs_write_unlock(inode->i_sb);
843 return res;
844}
845
846/* This function locks pages starting from @pos for @inode.
847 @num_pages pages are locked and stored in
848 @prepared_pages array. Also buffers are allocated for these pages.
849 The first and last pages of the region are read if they are overwritten
850 only partially. If the last page did not exist before the write (file
851 hole or file append), it is then zeroed.
852 Returns the number of unallocated blocks that should be allocated to
853 cover the new file data.*/
854static int reiserfs_prepare_file_region_for_write(
855 struct inode *inode /* Inode of the file */,
856 loff_t pos, /* position in the file */
857 size_t num_pages, /* number of pages to
858 prepare */
859 size_t write_bytes, /* Number of bytes to be
860 overwritten from
861 @pos */
862 struct page **prepared_pages /* pointer to array
863 where to store
864 prepared pages */
865 )
866{
867 int res=0; // Return values of different functions we call.
868 unsigned long index = pos >> PAGE_CACHE_SHIFT; // Offset in file in pages.
869 int from = (pos & (PAGE_CACHE_SIZE - 1)); // Writing offset in first page
870 int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
871 /* offset of last modified byte in last
872 page */
873 struct address_space *mapping = inode->i_mapping; // Pages are mapped here.
874 int i; // Simple counter
875 int blocks = 0; /* Return value (blocks that should be allocated) */
876 struct buffer_head *bh, *head; // Current bufferhead and first bufferhead
877 // of a page.
878 unsigned block_start, block_end; // Starting and ending offsets of current
879 // buffer in the page.
880 struct buffer_head *wait[2], **wait_bh=wait; // Buffers for the page, if
881 // the page appeared to be not
882 // up to date. Note how we have
883 // at most 2 buffers: this is
884 // because we may at most
885 // partially overwrite two
886 // buffers for one page, one at
887 // the beginning of the write area and one at the end.
888 // Everything in the middle gets overwritten totally.
889
890 struct cpu_key key; // cpu key of item that we are going to deal with
891 struct item_head *ih = NULL; // pointer to item head that we are going to deal with
892 struct buffer_head *itembuf=NULL; // Buffer head that contains items that we are going to deal with
893 INITIALIZE_PATH(path); // path to item, that we are going to deal with.
894 __u32 * item=NULL; // pointer to item we are going to deal with
895 int item_pos=-1; /* Position in indirect item */
896
897
898 if ( num_pages < 1 ) {
899 reiserfs_warning (inode->i_sb,
900 "green-9001: reiserfs_prepare_file_region_for_write "
901 "called with zero number of pages to process");
902 return -EFAULT;
903 }
904
905 /* We have 2 loops for pages. In the first loop we grab and lock the pages,
906 so that nobody can touch them until we release them. Then we
907 start to deal with mapping buffers to blocks. */
908 for ( i = 0; i < num_pages; i++) {
909 prepared_pages[i] = grab_cache_page(mapping, index + i); // locks the page
910 if ( !prepared_pages[i]) {
911 res = -ENOMEM;
912 goto failed_page_grabbing;
913 }
914 if (!page_has_buffers(prepared_pages[i]))
915 create_empty_buffers(prepared_pages[i], inode->i_sb->s_blocksize, 0);
916 }
917
918 /* Let's count the number of blocks for the case where all the blocks
919 overwritten are new (we will subtract already-allocated blocks later)*/
920 if ( num_pages > 2 )
921 /* These are fully overwritten pages, so all the blocks in
922 these pages are counted as needing to be allocated */
923 blocks = (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);
924
925 /* count blocks needed for first page (possibly partially written) */
926 blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) +
927 !!(from & (inode->i_sb->s_blocksize-1)); /* roundup */
928
929 /* Now we account for the last page. If the last page == the first page
930 (we overwrite only one page), we subtract all the blocks past the
931 last writing position in a page from the already calculated number
932 of blocks */
933 blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT-inode->i_blkbits)) -
934 ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
935 /* Note how we do not round up here, since partial blocks still
936 should be allocated */
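    /* Worked example, assuming 4k pages and 1k blocks, with from == 512,
       to == 2048 and num_pages == 3:
         middle pages: (3 - 2) << 2 = 4 blocks
         first page: ((4096 - 512) >> 10) + 1 = 4 blocks (partial first block)
         last page: (1 << 2) - ((4096 - 2048) >> 10) = 2 more blocks,
       10 blocks in total backing bytes 512..10239 of the region. */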
937
938 /* Now if all the write area lies past the file end, there is no point
939 in mapping blocks, since there are none, so we just zero out the
940 remaining parts of the first and last pages in the write area (if needed) */
941 if ( (pos & ~((loff_t)PAGE_CACHE_SIZE - 1)) > inode->i_size ) {
942 if ( from != 0 ) {/* First page needs to be partially zeroed */
943 char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
944 memset(kaddr, 0, from);
945 kunmap_atomic( kaddr, KM_USER0);
946 }
947 if ( to != PAGE_CACHE_SIZE ) { /* Last page needs to be partially zeroed */
948 char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0);
949 memset(kaddr+to, 0, PAGE_CACHE_SIZE - to);
950 kunmap_atomic( kaddr, KM_USER0);
951 }
952
953 /* Since all blocks are new - use already calculated value */
954 return blocks;
955 }
956
957 /* Well, since we write somewhere into the middle of a file, there is a
958 possibility we are writing over some already allocated blocks, so
959 let's map these blocks and subtract the number of such blocks from the
960 blocks we need to allocate (calculated above) */
961 /* Mask the write position to start on a block boundary; we do it
962 outside of the loop for performance reasons */
963 pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
964 /* Set cpu key to the starting position in a file (on left block boundary)*/
965 make_cpu_key (&key, inode, 1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)), TYPE_ANY, 3/*key length*/);
966
967 reiserfs_write_lock(inode->i_sb); // We need that for at least search_by_key()
968 for ( i = 0; i < num_pages ; i++ ) {
969
970 head = page_buffers(prepared_pages[i]);
971 /* For each buffer in the page */
972 for(bh = head, block_start = 0; bh != head || !block_start;
973 block_start=block_end, bh = bh->b_this_page) {
974 if (!bh)
975 reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
976 /* Find where this buffer ends */
977 block_end = block_start+inode->i_sb->s_blocksize;
978 if (i == 0 && block_end <= from )
979 /* if this buffer is before requested data to map, skip it*/
980 continue;
981
982 if (i == num_pages - 1 && block_start >= to) {
983 /* If this buffer is after requested data to map, abort
984 processing of current page */
985 break;
986 }
987
988 if ( buffer_mapped(bh) && bh->b_blocknr !=0 ) {
989 /* This is an optimisation for the case where the buffer is mapped
990 and has a block number assigned. In case a significant number
991 of such buffers are present, we may avoid some
992 search_by_key calls.
993 Probably it would be possible to move parts of this code
994 out of the BKL, but I am afraid that would overcomplicate the
995 code without any noticeable benefit.
996 */
997 item_pos++;
998 /* Update the key */
999 set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize);
1000 blocks--; // Decrease the number of blocks that need to be
1001 // allocated
1002 continue; // Go to the next buffer
1003 }
1004
1005 if ( !itembuf || /* if first iteration */
1006 item_pos >= ih_item_len(ih)/UNFM_P_SIZE)
1007 { /* or if we progressed past the
1008 current unformatted_item */
1009 /* Try to find next item */
1010 res = search_for_position_by_key(inode->i_sb, &key, &path);
1011 /* Abort if no more items */
1012 if ( res != POSITION_FOUND ) {
1013 /* make sure later loops don't use this item */
1014 itembuf = NULL;
1015 item = NULL;
1016 break;
1017 }
1018
1019 /* Update information about current indirect item */
1020 itembuf = get_last_bh( &path );
1021 ih = get_ih( &path );
1022 item = get_item( &path );
1023 item_pos = path.pos_in_item;
1024
1025 RFALSE( !is_indirect_le_ih (ih), "green-9003: indirect item expected");
1026 }
1027
1028 /* See if there is some block associated with the file
1029 at that position, map the buffer to this block */
1030 if ( get_block_num(item,item_pos) ) {
1031 map_bh(bh, inode->i_sb, get_block_num(item,item_pos));
1032 blocks--; // Decrease the number of blocks that need to be
1033 // allocated
1034 }
1035 item_pos++;
1036 /* Update the key */
1037 set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize);
1038 }
1039 }
1040 pathrelse(&path); // Free the path
1041 reiserfs_write_unlock(inode->i_sb);
1042
1043 /* Now zero out unmapped buffers for the first and last pages of the
1044 write area, or issue read requests if the buffer is mapped. */
1045 /* First page, see if it is not uptodate */
1046 if ( !PageUptodate(prepared_pages[0]) ) {
1047 head = page_buffers(prepared_pages[0]);
1048
1049 /* For each buffer in page */
1050 for(bh = head, block_start = 0; bh != head || !block_start;
1051 block_start=block_end, bh = bh->b_this_page) {
1052
1053 if (!bh)
1054 reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
1055 /* Find where this buffer ends */
1056 block_end = block_start+inode->i_sb->s_blocksize;
1057 if ( block_end <= from )
1058 /* if this buffer is before requested data to map, skip it*/
1059 continue;
1060 if ( block_start < from ) { /* Aha, our partial buffer */
1061 if ( buffer_mapped(bh) ) { /* If it is mapped, we need to
1062 issue a READ request for it
1063 so as not to lose data */
1064 ll_rw_block(READ, 1, &bh);
1065 *wait_bh++=bh;
1066 } else { /* Not mapped, zero it */
1067 char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
1068 memset(kaddr+block_start, 0, from-block_start);
1069 kunmap_atomic( kaddr, KM_USER0);
1070 set_buffer_uptodate(bh);
1071 }
1072 }
1073 }
1074 }
1075
1076 /* Last page, see if it is not uptodate, or if the last page is past the end of the file. */
1077 if ( !PageUptodate(prepared_pages[num_pages-1]) ||
1078 ((pos+write_bytes)>>PAGE_CACHE_SHIFT) > (inode->i_size>>PAGE_CACHE_SHIFT) ) {
1079 head = page_buffers(prepared_pages[num_pages-1]);
1080
1081 /* for each buffer in page */
1082 for(bh = head, block_start = 0; bh != head || !block_start;
1083 block_start=block_end, bh = bh->b_this_page) {
1084
1085 if (!bh)
1086 reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
1087 /* Find where this buffer ends */
1088 block_end = block_start+inode->i_sb->s_blocksize;
1089 if ( block_start >= to )
1090 /* if this buffer is after requested data to map, skip it*/
1091 break;
1092 if ( block_end > to ) { /* Aha, our partial buffer */
1093 if ( buffer_mapped(bh) ) { /* If it is mapped, we need to
1094 issue a READ request for it
1095 so as not to lose data */
1096 ll_rw_block(READ, 1, &bh);
1097 *wait_bh++=bh;
1098 } else { /* Not mapped, zero it */
1099 char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0);
1100 memset(kaddr+to, 0, block_end-to);
1101 kunmap_atomic( kaddr, KM_USER0);
1102 set_buffer_uptodate(bh);
1103 }
1104 }
1105 }
1106 }
1107
1108 /* Wait for read requests we made to happen, if necessary */
1109 while(wait_bh > wait) {
1110 wait_on_buffer(*--wait_bh);
1111 if (!buffer_uptodate(*wait_bh)) {
1112 res = -EIO;
1113 goto failed_read;
1114 }
1115 }
1116
1117 return blocks;
1118failed_page_grabbing:
1119 num_pages = i;
1120failed_read:
1121 reiserfs_unprepare_pages(prepared_pages, num_pages);
1122 return res;
1123}
1124
1125/* Write @count bytes at position @ppos in a file indicated by @file
1126 from the buffer @buf.
1127
1128 generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
1129 something simple that works. It is not for serious use by general purpose filesystems, excepting the one that it was
1130 written for (ext2/3). This is for several reasons:
1131
1132 * It has no understanding of any filesystem specific optimizations.
1133
1134 * It enters the filesystem repeatedly for each page that is written.
1135
1136 * It depends on the reiserfs_get_block() function, which as implemented by reiserfs performs a costly search_by_key
1137 * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time
1138 * to reiserfs, which allows for fewer tree traversals.
1139
1140 * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.
1141
1142 * Asking the block allocation code for blocks one at a time is slightly less efficient.
1143
1144 All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
1145 use it, but we were in a hurry to make the code freeze, and so it couldn't be revised then. This new code should
1146 finally make things right.
1147
1148 Future Features: providing search_by_key with hints.
1149
1150*/
1151static ssize_t reiserfs_file_write( struct file *file, /* the file we are going to write into */
1152 const char __user *buf, /* pointer to user supplied data
1153(in userspace) */
1154 size_t count, /* number of bytes to write */
1155 loff_t *ppos /* pointer to position in file that we start writing at. Should be updated to
1156 * new current position before returning. */ )
1157{
1158 size_t already_written = 0; // Number of bytes already written to the file.
1159 loff_t pos; // Current position in the file.
1160 ssize_t res; // return value of various functions that we call.
1161 int err = 0;
1162 struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to.
1163 /* To simplify coding at this time, we store
1164 locked pages in array for now */
1165 struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
1166 struct reiserfs_transaction_handle th;
1167 th.t_trans_id = 0;
1168
1169 if ( file->f_flags & O_DIRECT) { // Direct IO needs treatment
1170 ssize_t result, after_file_end = 0;
1171 if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) {
1172 /* If we are appending a file, we need to put this savelink in here.
1173 If we crash while doing direct io, finish_unfinished will
1174 cut the garbage from the file end. */
1175 reiserfs_write_lock(inode->i_sb);
1176 err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT );
1177 if (err) {
1178 reiserfs_write_unlock (inode->i_sb);
1179 return err;
1180 }
1181 reiserfs_update_inode_transaction(inode);
1182 add_save_link (&th, inode, 1 /* Truncate */);
1183 after_file_end = 1;
1184 err = journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT );
1185 reiserfs_write_unlock(inode->i_sb);
1186 if (err)
1187 return err;
1188 }
1189 result = generic_file_write(file, buf, count, ppos);
1190
1191 if ( after_file_end ) { /* Now update i_size and remove the savelink */
1192 struct reiserfs_transaction_handle th;
1193 reiserfs_write_lock(inode->i_sb);
1194 err = journal_begin(&th, inode->i_sb, 1);
1195 if (err) {
1196 reiserfs_write_unlock (inode->i_sb);
1197 return err;
1198 }
1199 reiserfs_update_inode_transaction(inode);
1200 reiserfs_update_sd(&th, inode);
1201 err = journal_end(&th, inode->i_sb, 1);
1202 if (err) {
1203 reiserfs_write_unlock (inode->i_sb);
1204 return err;
1205 }
1206 err = remove_save_link (inode, 1/* truncate */);
1207 reiserfs_write_unlock(inode->i_sb);
1208 if (err)
1209 return err;
1210 }
1211
1212 return result;
1213 }
1214
1215 if ( unlikely((ssize_t) count < 0 ))
1216 return -EINVAL;
1217
1218 if (unlikely(!access_ok(VERIFY_READ, buf, count)))
1219 return -EFAULT;
1220
1221 down(&inode->i_sem); // locks the entire file for just us
1222
1223 pos = *ppos;
1224
1225 /* Check that we can write to the specified region of the file, that
1226 the file is not overly big, and this kind of stuff. Adjust pos and
1227 count if needed */
1228 res = generic_write_checks(file, &pos, &count, 0);
1229 if (res)
1230 goto out;
1231
1232 if ( count == 0 )
1233 goto out;
1234
1235 res = remove_suid(file->f_dentry);
1236 if (res)
1237 goto out;
1238
1239 inode_update_time(inode, 1); /* Both mtime and ctime */
1240
1241 // Ok, we are done with all the checks.
1242
1243 // Now we should start real work
1244
1245 /* If we are going to write past the file's packed tail or if we are going
1246 to overwrite part of the tail, we need that tail to be converted into an
1247 unformatted node */
1248 res = reiserfs_check_for_tail_and_convert( inode, pos, count);
1249 if (res)
1250 goto out;
1251
1252 while ( count > 0) {
1253 /* This is the main loop, in which we run until some error occurs
1254 or until we write all of the data. */
1255 size_t num_pages;/* number of pages we are going to write this iteration */
1256 size_t write_bytes; /* number of bytes to write during this iteration */
1257 ssize_t blocks_to_allocate; /* how many blocks we need to allocate for this iteration (signed, so the negative error return of the prepare step below survives the "< 0" check) */
1258
1259 /* (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos*/
1260 num_pages = !!((pos+count) & (PAGE_CACHE_SIZE - 1)) + /* round up partial
1261 pages */
1262 ((count + (pos & (PAGE_CACHE_SIZE-1))) >> PAGE_CACHE_SHIFT);
1263 /* convert size to number of
1264 pages */
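        /* Worked example, assuming 4k pages: for pos == 1000 and count == 5000
           the write covers bytes 1000..5999, so (5000 + 1000) >> 12 = 1 full
           page plus one more because (1000 + 5000) & 4095 != 0, giving
           num_pages == 2. */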
1265 reiserfs_write_lock(inode->i_sb);
1266 if ( num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
1267 || num_pages > reiserfs_can_fit_pages(inode->i_sb) ) {
1268 /* If we were asked to write more data than we want to or if there
1269 is not that much space, then we shorten the amount of data to write
1270 for this iteration. */
1271 num_pages = min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME, reiserfs_can_fit_pages(inode->i_sb));
1272 /* Also we should not forget to set size in bytes accordingly */
1273 write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
1274 (pos & (PAGE_CACHE_SIZE-1));
1275 /* If the position is not at the
1276 start of the page, we need
1277 to subtract the offset
1278 within the page */
1279 } else
1280 write_bytes = count;
1281
1282 /* reserve the blocks to be allocated later, so that later on
1283 we still have the space to write the blocks to */
1284 reiserfs_claim_blocks_to_be_allocated(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits));
1285 reiserfs_write_unlock(inode->i_sb);
1286
1287 if ( !num_pages ) { /* If we do not have enough space even for */
1288 res = -ENOSPC; /* single page, return -ENOSPC */
1289 if ( pos > (inode->i_size & (inode->i_sb->s_blocksize-1)))
1290 break; // In case we are writing past the file end, break.
1291 // Otherwise we are possibly overwriting the file, so
1292 // let's set the write size to be equal to or less than the blocksize.
1293 // This way we get it correct for file holes.
1294 // But overwriting files on absolutely full volumes would not
1295 // be very efficient. Well, people are not supposed to fill
1296 // 100% of disk space anyway.
1297 write_bytes = min_t(size_t, count, inode->i_sb->s_blocksize - (pos & (inode->i_sb->s_blocksize - 1)));
1298 num_pages = 1;
1299 // No blocks were claimed before, so do it now.
1300 reiserfs_claim_blocks_to_be_allocated(inode->i_sb, 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits));
1301 }
1302
1303 /* Prepare for writing into the region, read in all the
1304 partially overwritten pages, if needed, and lock the pages
1305 so that nobody else can access them until we are done.
1306 We get the number of actual blocks needed as a result.*/
1307 blocks_to_allocate = reiserfs_prepare_file_region_for_write(inode, pos, num_pages, write_bytes, prepared_pages);
1308 if ( blocks_to_allocate < 0 ) {
1309 res = blocks_to_allocate;
1310 reiserfs_release_claimed_blocks(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits));
1311 break;
1312 }
1313
1314 /* First we correct our estimate of how many blocks we need */
1315 reiserfs_release_claimed_blocks(inode->i_sb, (num_pages << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)) - blocks_to_allocate );
1316
1317 if ( blocks_to_allocate > 0) {/*We only allocate blocks if we need to*/
1318 /* Fill in all the possible holes and append the file if needed */
1319 res = reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate);
1320 }
1321
1322 /* well, we have allocated the blocks, so it is time to free
1323 the reservation we made earlier. */
1324 reiserfs_release_claimed_blocks(inode->i_sb, blocks_to_allocate);
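        /* Reservation accounting sketch, assuming 4k pages and 1k blocks:
           claiming 2 pages reserves 2 << 2 = 8 blocks; if 3 of them turn
           out to be mapped already, the correction above releases
           8 - 5 = 3 and this call releases the remaining 5, so every
           claimed block is released exactly once per iteration. */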
1325 if ( res ) {
1326 reiserfs_unprepare_pages(prepared_pages, num_pages);
1327 break;
1328 }
1329
1330/* NOTE that allocating blocks and filling blocks can be done in reverse order
1331 and probably we would do that just to get rid of garbage in files after a
1332 crash */
1333
1334 /* Copy data from user-supplied buffer to file's pages */
1335 res = reiserfs_copy_from_user_to_file_region(pos, num_pages, write_bytes, prepared_pages, buf);
1336 if ( res ) {
1337 reiserfs_unprepare_pages(prepared_pages, num_pages);
1338 break;
1339 }
1340
1341 /* Send the pages to disk and unlock them. */
1342 res = reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages,
1343 write_bytes,prepared_pages);
1344 if ( res )
1345 break;
1346
1347 already_written += write_bytes;
1348 buf += write_bytes;
1349 *ppos = pos += write_bytes;
1350 count -= write_bytes;
1351 balance_dirty_pages_ratelimited(inode->i_mapping);
1352 }
1353
1354 /* this is only true on error */
1355 if (th.t_trans_id) {
1356 reiserfs_write_lock(inode->i_sb);
1357 err = journal_end(&th, th.t_super, th.t_blocks_allocated);
1358 reiserfs_write_unlock(inode->i_sb);
1359 if (err) {
1360 res = err;
1361 goto out;
1362 }
1363 }
1364
1365 if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
1366 res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA);
1367
1368 up(&inode->i_sem);
1369 reiserfs_async_progress_wait(inode->i_sb);
1370 return (already_written != 0)?already_written:res;
1371
1372out:
1373 up(&inode->i_sem); // unlock the file on exit.
1374 return res;
1375}
1376
1377static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user *buf,
1378 size_t count, loff_t pos)
1379{
1380 return generic_file_aio_write(iocb, buf, count, pos);
1381}
1382
1383
1384
1385struct file_operations reiserfs_file_operations = {
1386 .read = generic_file_read,
1387 .write = reiserfs_file_write,
1388 .ioctl = reiserfs_ioctl,
1389 .mmap = generic_file_mmap,
1390 .release = reiserfs_file_release,
1391 .fsync = reiserfs_sync_file,
1392 .sendfile = generic_file_sendfile,
1393 .aio_read = generic_file_aio_read,
1394 .aio_write = reiserfs_aio_write,
1395};
1396
1397
1398struct inode_operations reiserfs_file_inode_operations = {
1399 .truncate = reiserfs_vfs_truncate_file,
1400 .setattr = reiserfs_setattr,
1401 .setxattr = reiserfs_setxattr,
1402 .getxattr = reiserfs_getxattr,
1403 .listxattr = reiserfs_listxattr,
1404 .removexattr = reiserfs_removexattr,
1405 .permission = reiserfs_permission,
1406};
1407
1408
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
new file mode 100644
index 000000000000..e4f64be9e15b
--- /dev/null
+++ b/fs/reiserfs/fix_node.c
@@ -0,0 +1,2518 @@
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5/**
6 ** old_item_num
7 ** old_entry_num
8 ** set_entry_sizes
9 ** create_virtual_node
10 ** check_left
11 ** check_right
12 ** directory_part_size
13 ** get_num_ver
14 ** set_parameters
15 ** is_leaf_removable
16 ** are_leaves_removable
17 ** get_empty_nodes
18 ** get_lfree
19 ** get_rfree
20 ** is_left_neighbor_in_cache
21 ** decrement_key
22 ** get_far_parent
23 ** get_parents
24 ** can_node_be_removed
25 ** ip_check_balance
26 ** dc_check_balance_internal
27 ** dc_check_balance_leaf
28 ** dc_check_balance
29 ** check_balance
30 ** get_direct_parent
31 ** get_neighbors
32 ** fix_nodes
33 **
34 **
35 **/
36
37
38#include <linux/config.h>
39#include <linux/time.h>
40#include <linux/string.h>
41#include <linux/reiserfs_fs.h>
42#include <linux/buffer_head.h>
43
44
45/* To make any changes in the tree we find a node that contains the item
46 to be changed/deleted, or the position in the node we insert a new item
47 into. We call this node S. To do balancing we need to decide what we
48 will shift to the left/right neighbor, or to a new node where the new
49 item will go, etc. To make this analysis simpler we build a virtual
50 node. The virtual node is an array of items that will replace the items
51 of node S. (For instance, if we are going to delete an item, the virtual
52 node does not contain it.) The virtual node keeps information about
53 item sizes and types, mergeability of the first and last items, and the
54 sizes of all entries in a directory item. We use this array of items
55 when calculating what we can shift to neighbors and how many nodes we
56 have to have if we do not do any shifting, if we shift to the left/right
57 neighbor, or to both. */
58
59
60/* taking the item number in the virtual node, returns the number that the item has in the source buffer */
61static inline int old_item_num (int new_num, int affected_item_num, int mode)
62{
63 if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
64 return new_num;
65
66 if (mode == M_INSERT) {
67
68 RFALSE( new_num == 0,
69 "vs-8005: for INSERT mode and item number of inserted item");
70
71 return new_num - 1;
72 }
73
74 RFALSE( mode != M_DELETE,
75 "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", mode);
76 /* delete mode */
77 return new_num + 1;
78}
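
/* Example mapping (illustrative): when deleting item 2 of a 4-item node,
 * virtual items 0 and 1 map to source items 0 and 1, while virtual item 2
 * maps to source item 3, i.e. old_item_num(2, 2, M_DELETE) == 3.
 * Conversely, for an insert at position 2, old_item_num(3, 2, M_INSERT) == 2. */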
79
80static void create_virtual_node (struct tree_balance * tb, int h)
81{
82 struct item_head * ih;
83 struct virtual_node * vn = tb->tb_vn;
84 int new_num;
85 struct buffer_head * Sh; /* this comes from tb->S[h] */
86
87 Sh = PATH_H_PBUFFER (tb->tb_path, h);
88
89 /* size of changed node */
90 vn->vn_size = MAX_CHILD_SIZE (Sh) - B_FREE_SPACE (Sh) + tb->insert_size[h];
91
92 /* for internal nodes the array of virtual items is not created */
93 if (h) {
94 vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
95 return;
96 }
97
98 /* number of items in virtual node */
99 vn->vn_nr_item = B_NR_ITEMS (Sh) + ((vn->vn_mode == M_INSERT)? 1 : 0) - ((vn->vn_mode == M_DELETE)? 1 : 0);
100
101 /* first virtual item */
102 vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
103 memset (vn->vn_vi, 0, vn->vn_nr_item * sizeof (struct virtual_item));
104 vn->vn_free_ptr += vn->vn_nr_item * sizeof (struct virtual_item);
105
106
107 /* first item in the node */
108 ih = B_N_PITEM_HEAD (Sh, 0);
109
110 /* define the mergeability for 0-th item (if it is not being deleted) */
111 if (op_is_left_mergeable (&(ih->ih_key), Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
112 vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
113
114    /* go through all items that remain in the virtual node (except for the new (inserted) one) */
115 for (new_num = 0; new_num < vn->vn_nr_item; new_num ++) {
116 int j;
117 struct virtual_item * vi = vn->vn_vi + new_num;
118 int is_affected = ((new_num != vn->vn_affected_item_num) ? 0 : 1);
119
120
121 if (is_affected && vn->vn_mode == M_INSERT)
122 continue;
123
124 /* get item number in source node */
125 j = old_item_num (new_num, vn->vn_affected_item_num, vn->vn_mode);
126
127 vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
128 vi->vi_ih = ih + j;
129 vi->vi_item = B_I_PITEM (Sh, ih + j);
130 vi->vi_uarea = vn->vn_free_ptr;
131
132	// FIXME: there is no check that the item operation did not
133	// consume too much memory
134 vn->vn_free_ptr += op_create_vi (vn, vi, is_affected, tb->insert_size [0]);
135 if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
136 reiserfs_panic (tb->tb_sb, "vs-8030: create_virtual_node: "
137 "virtual node space consumed");
138
139 if (!is_affected)
140 /* this is not being changed */
141 continue;
142
143 if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
144 vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
145 vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted
146 }
147 }
148
149
150 /* virtual inserted item is not defined yet */
151 if (vn->vn_mode == M_INSERT) {
152 struct virtual_item * vi = vn->vn_vi + vn->vn_affected_item_num;
153
154 RFALSE( vn->vn_ins_ih == 0,
155 "vs-8040: item header of inserted item is not specified");
156 vi->vi_item_len = tb->insert_size[0];
157 vi->vi_ih = vn->vn_ins_ih;
158 vi->vi_item = vn->vn_data;
159 vi->vi_uarea = vn->vn_free_ptr;
160
161 op_create_vi (vn, vi, 0/*not pasted or cut*/, tb->insert_size [0]);
162 }
163
164    /* set the right merge flag: take the right delimiting key and check whether it is a mergeable item */
165 if (tb->CFR[0]) {
166 struct reiserfs_key * key;
167
168 key = B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]);
169 if (op_is_left_mergeable (key, Sh->b_size) && (vn->vn_mode != M_DELETE ||
170 vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1))
171 vn->vn_vi[vn->vn_nr_item-1].vi_type |= VI_TYPE_RIGHT_MERGEABLE;
172
173#ifdef CONFIG_REISERFS_CHECK
174 if (op_is_left_mergeable (key, Sh->b_size) &&
175 !(vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1) ) {
176 /* we delete last item and it could be merged with right neighbor's first item */
177 if (!(B_NR_ITEMS (Sh) == 1 && is_direntry_le_ih (B_N_PITEM_HEAD (Sh, 0)) &&
178 I_ENTRY_COUNT (B_N_PITEM_HEAD (Sh, 0)) == 1)) {
179 /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
180 print_block (Sh, 0, -1, -1);
181 reiserfs_panic (tb->tb_sb, "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c",
182 key, vn->vn_affected_item_num, vn->vn_mode, M_DELETE);
183 } else
184 /* we can delete directory item, that has only one directory entry in it */
185 ;
186 }
187#endif
188
189 }
190}
191
192
193/* using the virtual node, check how many items can be shifted to the left
194   neighbor */
195static void check_left (struct tree_balance * tb, int h, int cur_free)
196{
197 int i;
198 struct virtual_node * vn = tb->tb_vn;
199 struct virtual_item * vi;
200 int d_size, ih_size;
201
202 RFALSE( cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
203
204 /* internal level */
205 if (h > 0) {
206 tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
207 return;
208 }
209
210 /* leaf level */
211
212 if (!cur_free || !vn->vn_nr_item) {
213 /* no free space or nothing to move */
214 tb->lnum[h] = 0;
215 tb->lbytes = -1;
216 return;
217 }
218
219 RFALSE( !PATH_H_PPARENT (tb->tb_path, 0),
220 "vs-8055: parent does not exist or invalid");
221
222 vi = vn->vn_vi;
223 if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
224	/* all contents of S[0] fit into L[0] */
225
226 RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
227 "vs-8055: invalid mode or balance condition failed");
228
229 tb->lnum[0] = vn->vn_nr_item;
230 tb->lbytes = -1;
231 return;
232 }
233
234
235 d_size = 0, ih_size = IH_SIZE;
236
237    /* the first item may be merged with the last item in the left neighbor */
238 if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
239 d_size = -((int)IH_SIZE), ih_size = 0;
240
241 tb->lnum[0] = 0;
242 for (i = 0; i < vn->vn_nr_item; i ++, ih_size = IH_SIZE, d_size = 0, vi ++) {
243 d_size += vi->vi_item_len;
244 if (cur_free >= d_size) {
245 /* the item can be shifted entirely */
246 cur_free -= d_size;
247 tb->lnum[0] ++;
248 continue;
249 }
250
251 /* the item cannot be shifted entirely, try to split it */
252 /* check whether L[0] can hold ih and at least one byte of the item body */
253 if (cur_free <= ih_size) {
254 /* cannot shift even a part of the current item */
255 tb->lbytes = -1;
256 return;
257 }
258 cur_free -= ih_size;
259
260 tb->lbytes = op_check_left (vi, cur_free, 0, 0);
261 if (tb->lbytes != -1)
262 /* count partially shifted item */
263 tb->lnum[0] ++;
264
265 break;
266 }
267
268 return;
269}
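/* A minimal stand-alone sketch of the greedy loop above. The sizes and
   demo_* names are made up for illustration; the real code credits a
   saved header when the first item is left-mergeable and defers the
   split size to op_check_left(), which may round down to item units.
   Whole items move while they fit; at most one boundary item is split,
   and the split item still counts in lnum. */
#if 0
#include <stdio.h>

#define DEMO_IH_SIZE 24			/* assumed item-header size */

static int demo_check_left(const int *len, int nr, int cur_free, int *lbytes)
{
	int i, lnum = 0;

	*lbytes = -1;
	for (i = 0; i < nr; i++) {
		if (cur_free >= len[i]) {	/* the whole item fits */
			cur_free -= len[i];
			lnum++;
			continue;
		}
		/* need the header plus at least one byte of body */
		if (cur_free > DEMO_IH_SIZE) {
			*lbytes = cur_free - DEMO_IH_SIZE;
			lnum++;		/* partially shifted item counts */
		}
		break;
	}
	return lnum;
}

int main(void)
{
	int len[] = { 100, 300, 400 };	/* vi_item_len, header included */
	int lbytes;
	int lnum = demo_check_left(len, 3, 350, &lbytes);

	printf("lnum=%d lbytes=%d\n", lnum, lbytes);	/* lnum=2 lbytes=226 */
	return 0;
}
#endif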
270
271
272/* using the virtual node, check how many items can be shifted to the right
273   neighbor */
274static void check_right (struct tree_balance * tb, int h, int cur_free)
275{
276 int i;
277 struct virtual_node * vn = tb->tb_vn;
278 struct virtual_item * vi;
279 int d_size, ih_size;
280
281 RFALSE( cur_free < 0, "vs-8070: cur_free < 0");
282
283 /* internal level */
284 if (h > 0) {
285 tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
286 return;
287 }
288
289 /* leaf level */
290
291 if (!cur_free || !vn->vn_nr_item) {
292	/* no free space or nothing to move */
293 tb->rnum[h] = 0;
294 tb->rbytes = -1;
295 return;
296 }
297
298 RFALSE( !PATH_H_PPARENT (tb->tb_path, 0),
299 "vs-8075: parent does not exist or invalid");
300
301 vi = vn->vn_vi + vn->vn_nr_item - 1;
302 if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
303	/* all contents of S[0] fit into R[0] */
304
305 RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
306 "vs-8080: invalid mode or balance condition failed");
307
308 tb->rnum[h] = vn->vn_nr_item;
309 tb->rbytes = -1;
310 return;
311 }
312
313 d_size = 0, ih_size = IH_SIZE;
314
315    /* the last item may be merged with the first item in the right neighbor */
316 if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
317 d_size = -(int)IH_SIZE, ih_size = 0;
318
319 tb->rnum[0] = 0;
320 for (i = vn->vn_nr_item - 1; i >= 0; i --, d_size = 0, ih_size = IH_SIZE, vi --) {
321 d_size += vi->vi_item_len;
322 if (cur_free >= d_size) {
323 /* the item can be shifted entirely */
324 cur_free -= d_size;
325 tb->rnum[0] ++;
326 continue;
327 }
328
329 /* check whether R[0] can hold ih and at least one byte of the item body */
330 if ( cur_free <= ih_size ) { /* cannot shift even a part of the current item */
331 tb->rbytes = -1;
332 return;
333 }
334
335 /* R[0] can hold the header of the item and at least one byte of its body */
336 cur_free -= ih_size; /* cur_free is still > 0 */
337
338 tb->rbytes = op_check_right (vi, cur_free);
339 if (tb->rbytes != -1)
340 /* count partially shifted item */
341 tb->rnum[0] ++;
342
343 break;
344 }
345
346 return;
347}
348
349
350/*
351 * from - number of items that are shifted to the left neighbor entirely
352 * to - number of items that are shifted to the right neighbor entirely
353 * from_bytes - number of bytes of the boundary item (or directory entries) which are shifted to the left neighbor
354 * to_bytes - number of bytes of the boundary item (or directory entries) which are shifted to the right neighbor */
355static int get_num_ver (int mode, struct tree_balance * tb, int h,
356 int from, int from_bytes,
357 int to, int to_bytes,
358 short * snum012, int flow
359 )
360{
361 int i;
362 int cur_free;
363 // int bytes;
364 int units;
365 struct virtual_node * vn = tb->tb_vn;
366 // struct virtual_item * vi;
367
368 int total_node_size, max_node_size, current_item_size;
369 int needed_nodes;
370 int start_item, /* position of item we start filling node from */
371 end_item, /* position of item we finish filling node by */
372 start_bytes,/* number of first bytes (entries for directory) of start_item-th item
373 we do not include into node that is being filled */
374 end_bytes; /* number of last bytes (entries for directory) of end_item-th item
375		   we do not include into the node that is being filled */
376 int split_item_positions[2]; /* these are positions in virtual item of
377 items, that are split between S[0] and
378 S1new and S1new and S2new */
379
380 split_item_positions[0] = -1;
381 split_item_positions[1] = -1;
382
383 /* We only create additional nodes if we are in insert or paste mode
384 or we are in replace mode at the internal level. If h is 0 and
385 the mode is M_REPLACE then in fix_nodes we change the mode to
386 paste or insert before we get here in the code. */
387 RFALSE( tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
388 "vs-8100: insert_size < 0 in overflow");
389
390 max_node_size = MAX_CHILD_SIZE (PATH_H_PBUFFER (tb->tb_path, h));
391
392    /* snum012 [0-2] - number of items that go
393       to S[0], the first new node and the second new node */
394 snum012[3] = -1; /* s1bytes */
395 snum012[4] = -1; /* s2bytes */
396
397 /* internal level */
398 if (h > 0) {
399 i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
400 if (i == max_node_size)
401 return 1;
402 return (i / max_node_size + 1);
403 }
404
405 /* leaf level */
406 needed_nodes = 1;
407 total_node_size = 0;
408 cur_free = max_node_size;
409
410 // start from 'from'-th item
411 start_item = from;
412 // skip its first 'start_bytes' units
413 start_bytes = ((from_bytes != -1) ? from_bytes : 0);
414
415 // last included item is the 'end_item'-th one
416 end_item = vn->vn_nr_item - to - 1;
417 // do not count last 'end_bytes' units of 'end_item'-th item
418 end_bytes = (to_bytes != -1) ? to_bytes : 0;
419
420    /* go through all items beginning with the start_item-th item and ending
421       with the end_item-th item. Do not count the first 'start_bytes' units of
422       the 'start_item'-th item or the last 'end_bytes' of the 'end_item'-th item */
423
424 for (i = start_item; i <= end_item; i ++) {
425 struct virtual_item * vi = vn->vn_vi + i;
426 int skip_from_end = ((i == end_item) ? end_bytes : 0);
427
428 RFALSE( needed_nodes > 3, "vs-8105: too many nodes are needed");
429
430 /* get size of current item */
431 current_item_size = vi->vi_item_len;
432
433 /* do not take in calculation head part (from_bytes) of from-th item */
434 current_item_size -= op_part_size (vi, 0/*from start*/, start_bytes);
435
436 /* do not take in calculation tail part of last item */
437 current_item_size -= op_part_size (vi, 1/*from end*/, skip_from_end);
438
439	/* if the item fits into the current node entirely */
440 if (total_node_size + current_item_size <= max_node_size) {
441 snum012[needed_nodes - 1] ++;
442 total_node_size += current_item_size;
443 start_bytes = 0;
444 continue;
445 }
446
447 if (current_item_size > max_node_size) {
448	    /* the virtual item length is longer than the max item size in
449	       a node. This is impossible for a direct item */
450 RFALSE( is_direct_le_ih (vi->vi_ih),
451 "vs-8110: "
452 "direct item length is %d. It can not be longer than %d",
453 current_item_size, max_node_size);
454 /* we will try to split it */
455 flow = 1;
456 }
457
458 if (!flow) {
459 /* as we do not split items, take new node and continue */
460 needed_nodes ++; i --; total_node_size = 0;
461 continue;
462 }
463
464 // calculate number of item units which fit into node being
465 // filled
466 {
467 int free_space;
468
469 free_space = max_node_size - total_node_size - IH_SIZE;
470 units = op_check_left (vi, free_space, start_bytes, skip_from_end);
471 if (units == -1) {
472 /* nothing fits into current node, take new node and continue */
473 needed_nodes ++, i--, total_node_size = 0;
474 continue;
475 }
476 }
477
478 /* something fits into the current node */
479 //if (snum012[3] != -1 || needed_nodes != 1)
480 // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
481 //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
482 start_bytes += units;
483 snum012[needed_nodes - 1 + 3] = units;
484
485 if (needed_nodes > 2)
486 reiserfs_warning (tb->tb_sb, "vs-8111: get_num_ver: "
487 "split_item_position is out of boundary");
488 snum012[needed_nodes - 1] ++;
489 split_item_positions[needed_nodes - 1] = i;
490 needed_nodes ++;
491 /* continue from the same item with start_bytes != -1 */
492 start_item = i;
493 i --;
494 total_node_size = 0;
495 }
496
497    // snum012[4] (if it is not -1) contains the number of units that
498    // went to S1new, snum012[3] - the number that went to S0. They are
499    // supposed to be s1bytes and s2bytes respectively, so recalculate
500 if (snum012[4] > 0) {
501 int split_item_num;
502 int bytes_to_r, bytes_to_l;
503 int bytes_to_S1new;
504
505 split_item_num = split_item_positions[1];
506 bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0);
507 bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0);
508 bytes_to_S1new = ((split_item_positions[0] == split_item_positions[1]) ? snum012[3] : 0);
509
510 // s2bytes
511 snum012[4] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[4] - bytes_to_r - bytes_to_l - bytes_to_S1new;
512
513 if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
514 vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
515 reiserfs_warning (tb->tb_sb, "vs-8115: get_num_ver: not "
516 "directory or indirect item");
517 }
518
519 /* now we know S2bytes, calculate S1bytes */
520 if (snum012[3] > 0) {
521 int split_item_num;
522 int bytes_to_r, bytes_to_l;
523 int bytes_to_S2new;
524
525 split_item_num = split_item_positions[0];
526 bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0);
527 bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0);
528 bytes_to_S2new = ((split_item_positions[0] == split_item_positions[1] && snum012[4] != -1) ? snum012[4] : 0);
529
530 // s1bytes
531 snum012[3] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[3] - bytes_to_r - bytes_to_l - bytes_to_S2new;
532 }
533
534 return needed_nodes;
535}
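/* The no-flow case above, reduced to a toy bin count (illustrative;
   assumes every item fits in a node by itself -- the real code flips
   flow on when one does not, and also fills in the snum012 split data): */
#if 0
/* precondition: len[i] <= max_node_size for every i */
static int demo_nodes_no_flow(const int *len, int nr, int max_node_size)
{
	int i, needed = 1, total = 0;

	for (i = 0; i < nr; i++) {
		if (total + len[i] <= max_node_size) {
			total += len[i];
			continue;
		}
		needed++;	/* take a new node and retry this item */
		total = 0;
		i--;
	}
	return needed;
}
/* demo_nodes_no_flow((int[]){2000, 2000, 1000}, 3, 4072) == 2:
   the first two items fill one node, the third starts a second one */
#endif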
536
537
538#ifdef CONFIG_REISERFS_CHECK
539extern struct tree_balance * cur_tb;
540#endif
541
542
543/* Set parameters for balancing.
544 * Writes the results of the balancing analysis into structure tb,
545 * where they will later be used by the functions that actually do the balancing.
546 * Parameters:
547 * tb tree_balance structure;
548 * h current level of the node;
549 * lnum number of items from S[h] that must be shifted to L[h];
550 * rnum number of items from S[h] that must be shifted to R[h];
551 *	blk_num	number of blocks that S[h] will be split into;
552 *	s012	number of items that fall into the split nodes.
553 *	lbytes	number of bytes which flow to the left neighbor from the item that is
554 *		not shifted entirely
555 *	rbytes	number of bytes which flow to the right neighbor from the item that is
556 *		not shifted entirely
557 * s1bytes number of bytes which flow to the first new node when S[0] splits (this number is contained in s012 array)
558 */
559
560static void set_parameters (struct tree_balance * tb, int h, int lnum,
561 int rnum, int blk_num, short * s012, int lb, int rb)
562{
563
564 tb->lnum[h] = lnum;
565 tb->rnum[h] = rnum;
566 tb->blknum[h] = blk_num;
567
568 if (h == 0)
569 { /* only for leaf level */
570 if (s012 != NULL)
571 {
572 tb->s0num = * s012 ++,
573 tb->s1num = * s012 ++,
574 tb->s2num = * s012 ++;
575 tb->s1bytes = * s012 ++;
576 tb->s2bytes = * s012;
577 }
578 tb->lbytes = lb;
579 tb->rbytes = rb;
580 }
581 PROC_INFO_ADD( tb -> tb_sb, lnum[ h ], lnum );
582 PROC_INFO_ADD( tb -> tb_sb, rnum[ h ], rnum );
583
584 PROC_INFO_ADD( tb -> tb_sb, lbytes[ h ], lb );
585 PROC_INFO_ADD( tb -> tb_sb, rbytes[ h ], rb );
586}
587
588
589
590/* check whether the node disappears if we shift tb->lnum[0] items to the left
591   neighbor and tb->rnum[0] to the right one. */
592static int is_leaf_removable (struct tree_balance * tb)
593{
594 struct virtual_node * vn = tb->tb_vn;
595 int to_left, to_right;
596 int size;
597 int remain_items;
598
599    /* number of items that will be shifted to the left (right) neighbor
600       entirely */
601 to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
602 to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
603 remain_items = vn->vn_nr_item;
604
605 /* how many items remain in S[0] after shiftings to neighbors */
606 remain_items -= (to_left + to_right);
607
608 if (remain_items < 1) {
609 /* all content of node can be shifted to neighbors */
610 set_parameters (tb, 0, to_left, vn->vn_nr_item - to_left, 0, NULL, -1, -1);
611 return 1;
612 }
613
614 if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
615 /* S[0] is not removable */
616 return 0;
617
618 /* check, whether we can divide 1 remaining item between neighbors */
619
620 /* get size of remaining item (in item units) */
621 size = op_unit_num (&(vn->vn_vi[to_left]));
622
623 if (tb->lbytes + tb->rbytes >= size) {
624 set_parameters (tb, 0, to_left + 1, to_right + 1, 0, NULL, tb->lbytes, -1);
625 return 1;
626 }
627
628 return 0;
629}
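/* Worked numbers for the test above, as a hedged stand-alone sketch;
   boundary_units stands in for what op_unit_num() would report for the
   one remaining item, and the demo_* names are hypothetical: */
#if 0
#include <assert.h>

static int demo_leaf_removable(int nr_item, int lnum, int lbytes,
			       int rnum, int rbytes, int boundary_units)
{
	int to_left  = lnum - (lbytes != -1 ? 1 : 0);
	int to_right = rnum - (rbytes != -1 ? 1 : 0);
	int remain = nr_item - (to_left + to_right);

	if (remain < 1)
		return 1;	/* everything moves as whole items */
	if (remain > 1 || lbytes == -1 || rbytes == -1)
		return 0;
	/* one item remains: can its units be split between L and R? */
	return lbytes + rbytes >= boundary_units;
}

int main(void)
{
	/* 5 items: 3 whole to L, 1 whole to R, and the boundary item's
	   8 units are covered by lbytes=4 plus rbytes=6 */
	assert(demo_leaf_removable(5, 4, 4, 2, 6, 8) == 1);
	return 0;
}
#endif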
630
631
632/* check whether L, S, R can be joined in one node */
633static int are_leaves_removable (struct tree_balance * tb, int lfree, int rfree)
634{
635 struct virtual_node * vn = tb->tb_vn;
636 int ih_size;
637 struct buffer_head *S0;
638
639 S0 = PATH_H_PBUFFER (tb->tb_path, 0);
640
641 ih_size = 0;
642 if (vn->vn_nr_item) {
643 if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
644 ih_size += IH_SIZE;
645
646 if (vn->vn_vi[vn->vn_nr_item-1].vi_type & VI_TYPE_RIGHT_MERGEABLE)
647 ih_size += IH_SIZE;
648 } else {
649 /* there was only one item and it will be deleted */
650 struct item_head * ih;
651
652 RFALSE( B_NR_ITEMS (S0) != 1,
653 "vs-8125: item number must be 1: it is %d", B_NR_ITEMS(S0));
654
655 ih = B_N_PITEM_HEAD (S0, 0);
656 if (tb->CFR[0] && !comp_short_le_keys (&(ih->ih_key), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0])))
657 if (is_direntry_le_ih (ih)) {
658	    /* The directory must be in a correct state here: that is,
659	       somewhere on the left there must exist a first directory
660	       item. The item being deleted cannot be that first one,
661	       because its right neighbor is an item of the same
662	       directory (and the first item is always the last to be
663	       deleted). So the neighbors of the deleted item can be
664	       merged, and we can save ih_size */
665	    ih_size = IH_SIZE;
666
667	    /* we might check that the left neighbor exists and belongs
668	       to the same directory */
669	    RFALSE(le_ih_k_offset (ih) == DOT_OFFSET,
670		   "vs-8130: the first directory item can not be removed until the directory is empty");
671	}
672
673 }
674
675 if (MAX_CHILD_SIZE (S0) + vn->vn_size <= rfree + lfree + ih_size) {
676 set_parameters (tb, 0, -1, -1, -1, NULL, -1, -1);
677 PROC_INFO_INC( tb -> tb_sb, leaves_removable );
678 return 1;
679 }
680 return 0;
681
682}
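/* The space test above with toy numbers (illustrative; ih_size is the
   header savings when the boundary items can merge with the neighbors): */
#if 0
static int demo_leaves_joinable(int max_child_size, int vn_size,
				int lfree, int rfree, int ih_size)
{
	return max_child_size + vn_size <= rfree + lfree + ih_size;
}
/* demo_leaves_joinable(4072, 1000, 3000, 2100, 48) == 1: with both
   edges mergeable, the contents of L, S[0] and R fit in one node */
#endif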
683
684
685
686/* when we do not split item, lnum and rnum are numbers of entire items */
687#define SET_PAR_SHIFT_LEFT \
688if (h)\
689{\
690 int to_l;\
691 \
692 to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
693 (MAX_NR_KEY(Sh) + 1 - lpar);\
694 \
695 set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
696}\
697else \
698{\
699 if (lset==LEFT_SHIFT_FLOW)\
700 set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
701 tb->lbytes, -1);\
702 else\
703 set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
704 -1, -1);\
705}
706
707
708#define SET_PAR_SHIFT_RIGHT \
709if (h)\
710{\
711 int to_r;\
712 \
713 to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
714 \
715 set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
716}\
717else \
718{\
719 if (rset==RIGHT_SHIFT_FLOW)\
720 set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
721 -1, tb->rbytes);\
722 else\
723 set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
724 -1, -1);\
725}
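/* The internal-level arm of both macros equalizes the child counts of
   S[h] and the neighbor. With K = MAX_NR_KEY(Sh) + 1 pointers per node
   and rpar free slots reported for R[h], R currently holds K - rpar
   children; the formula moves half of the combined total over. A toy
   rendering (names and numbers are illustrative): */
#if 0
static int demo_to_r(int max_nr_key, int rpar, int vn_nr_item)
{
	int r_now = max_nr_key + 1 - rpar;	/* children already in R */
	int total = r_now + vn_nr_item + 1;	/* S's children plus R's */

	return total / 2 - r_now;	/* shift so |S| and |R| differ by <= 1 */
}
/* demo_to_r(84, 60, 80): R holds 25, S holds 81 -> shift 28,
   leaving 53 children in each node */
#endif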
726
727
728static void free_buffers_in_tb (
729 struct tree_balance * p_s_tb
730 ) {
731 int n_counter;
732
733 decrement_counters_in_path(p_s_tb->tb_path);
734
735 for ( n_counter = 0; n_counter < MAX_HEIGHT; n_counter++ ) {
736 decrement_bcount(p_s_tb->L[n_counter]);
737 p_s_tb->L[n_counter] = NULL;
738 decrement_bcount(p_s_tb->R[n_counter]);
739 p_s_tb->R[n_counter] = NULL;
740 decrement_bcount(p_s_tb->FL[n_counter]);
741 p_s_tb->FL[n_counter] = NULL;
742 decrement_bcount(p_s_tb->FR[n_counter]);
743 p_s_tb->FR[n_counter] = NULL;
744 decrement_bcount(p_s_tb->CFL[n_counter]);
745 p_s_tb->CFL[n_counter] = NULL;
746 decrement_bcount(p_s_tb->CFR[n_counter]);
747 p_s_tb->CFR[n_counter] = NULL;
748 }
749}
750
751
752/* Get new buffers for storing new nodes that are created while balancing.
753 * Returns:	REPEAT_SEARCH - schedule occurred while the function worked;
754 * CARRY_ON - schedule didn't occur while the function worked;
755 * NO_DISK_SPACE - no disk space.
756 */
757/* The function is NOT SCHEDULE-SAFE! */
758static int get_empty_nodes(
759 struct tree_balance * p_s_tb,
760 int n_h
761 ) {
762 struct buffer_head * p_s_new_bh,
763 * p_s_Sh = PATH_H_PBUFFER (p_s_tb->tb_path, n_h);
764 b_blocknr_t * p_n_blocknr,
765 a_n_blocknrs[MAX_AMOUNT_NEEDED] = {0, };
766 int n_counter,
767 n_number_of_freeblk,
768 n_amount_needed,/* number of needed empty blocks */
769 n_retval = CARRY_ON;
770 struct super_block * p_s_sb = p_s_tb->tb_sb;
771
772
773    /* number_of_freeblk is the number of empty blocks which have been
774       acquired for use by the balancing algorithm, minus the number of
775       empty blocks used in the previous levels of the analysis.
776       number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurred
777       after empty blocks were acquired and the balancing analysis was
778       then restarted. amount_needed is the number needed by this level
779       (n_h) of the balancing analysis.
780
781       Note that for systems with many processes writing, it would be
782       more layout-optimal to calculate the total number needed by all
783       levels and then to run reiserfs_new_blocks to get all of them at once. */
784
785    /* Initialize number_of_freeblk to the amount acquired prior to the restart of
786       the analysis, or 0 if not restarted, then subtract the amount needed
787       by all of the levels of the tree below n_h. */
788 /* blknum includes S[n_h], so we subtract 1 in this calculation */
789 for ( n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum; n_counter < n_h; n_counter++ )
790 n_number_of_freeblk -= ( p_s_tb->blknum[n_counter] ) ? (p_s_tb->blknum[n_counter] - 1) : 0;
791
792 /* Allocate missing empty blocks. */
793 /* if p_s_Sh == 0 then we are getting a new root */
794 n_amount_needed = ( p_s_Sh ) ? (p_s_tb->blknum[n_h] - 1) : 1;
795 /* Amount_needed = the amount that we need more than the amount that we have. */
796 if ( n_amount_needed > n_number_of_freeblk )
797 n_amount_needed -= n_number_of_freeblk;
798 else /* If we have enough already then there is nothing to do. */
799 return CARRY_ON;
800
801    /* No need to check quota - it is not allocated for blocks used for formatted nodes */
802 if (reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs,
803 n_amount_needed) == NO_DISK_SPACE)
804 return NO_DISK_SPACE;
805
806 /* for each blocknumber we just got, get a buffer and stick it on FEB */
807 for ( p_n_blocknr = a_n_blocknrs, n_counter = 0; n_counter < n_amount_needed;
808 p_n_blocknr++, n_counter++ ) {
809
810 RFALSE( ! *p_n_blocknr,
811 "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
812
813 p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr);
814 RFALSE (buffer_dirty (p_s_new_bh) ||
815 buffer_journaled (p_s_new_bh) ||
816 buffer_journal_dirty (p_s_new_bh),
817	    "PAP-8140: journaled or dirty buffer %b for the new block",
818 p_s_new_bh);
819
820 /* Put empty buffers into the array. */
821 RFALSE (p_s_tb->FEB[p_s_tb->cur_blknum],
822 "PAP-8141: busy slot for new buffer");
823
824 set_buffer_journal_new (p_s_new_bh);
825 p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh;
826 }
827
828 if ( n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB (p_s_tb) )
829 n_retval = REPEAT_SEARCH ;
830
831 return n_retval;
832}
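/* The free-block accounting at the top of the function, as a toy
   (illustrative; blknum[] counts S[i] itself, hence the -1 per level): */
#if 0
static int demo_new_blocks_needed(const int *blknum, int h, int cur_blknum,
				  int have_Sh)
{
	int i, freeblk = cur_blknum;
	int needed = have_Sh ? blknum[h] - 1 : 1;	/* new root: 1 */

	for (i = 0; i < h; i++)			/* used by lower levels */
		freeblk -= blknum[i] ? blknum[i] - 1 : 0;

	return needed > freeblk ? needed - freeblk : 0;
}
/* demo_new_blocks_needed((int[]){3, 2}, 1, 3, 1) == 0: of the three
   blocks acquired before a restart, the leaf split used two and the
   remaining one covers this level */
#endif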
833
834
835/* Get free space of the left neighbor, which is stored in the parent
836 * node of the left neighbor. */
837static int get_lfree (struct tree_balance * tb, int h)
838{
839 struct buffer_head * l, * f;
840 int order;
841
842 if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0)
843 return 0;
844
845 if (f == l)
846 order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) - 1;
847 else {
848 order = B_NR_ITEMS (l);
849 f = l;
850 }
851
852 return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f,order)));
853}
854
855
856/* Get free space of the right neighbor,
857 * which is stored in the parent node of the right neighbor.
858 */
859static int get_rfree (struct tree_balance * tb, int h)
860{
861 struct buffer_head * r, * f;
862 int order;
863
864 if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0)
865 return 0;
866
867 if (f == r)
868 order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) + 1;
869 else {
870 order = 0;
871 f = r;
872 }
873
874 return (MAX_CHILD_SIZE(f) - dc_size( B_N_CHILD(f,order)));
875
876}
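/* Both helpers read the neighbor's free space from the parent's
   disk-child entry, so the neighbor itself need not be in memory.
   A sketch with assumed numbers: */
#if 0
static int demo_neighbor_free(int max_child_size, int dc_size_of_child)
{
	/* dc_size is the used space the parent records for that child */
	return max_child_size - dc_size_of_child;
}
/* demo_neighbor_free(4072, 3900) == 172 free bytes in the neighbor */
#endif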
877
878
879/* Check whether left neighbor is in memory. */
880static int is_left_neighbor_in_cache(
881 struct tree_balance * p_s_tb,
882 int n_h
883 ) {
884 struct buffer_head * p_s_father, * left;
885 struct super_block * p_s_sb = p_s_tb->tb_sb;
886 b_blocknr_t n_left_neighbor_blocknr;
887 int n_left_neighbor_position;
888
889 if ( ! p_s_tb->FL[n_h] ) /* Father of the left neighbor does not exist. */
890 return 0;
891
892 /* Calculate father of the node to be balanced. */
893 p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1);
894
895 RFALSE( ! p_s_father ||
896 ! B_IS_IN_TREE (p_s_father) ||
897 ! B_IS_IN_TREE (p_s_tb->FL[n_h]) ||
898 ! buffer_uptodate (p_s_father) ||
899 ! buffer_uptodate (p_s_tb->FL[n_h]),
900 "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
901 p_s_father, p_s_tb->FL[n_h]);
902
903
904    /* Get the position of the pointer to the left neighbor in the left father. */
905 n_left_neighbor_position = ( p_s_father == p_s_tb->FL[n_h] ) ?
906 p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]);
907 /* Get left neighbor block number. */
908 n_left_neighbor_blocknr = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position);
909 /* Look for the left neighbor in the cache. */
910 if ( (left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr)) ) {
911
912 RFALSE( buffer_uptodate (left) && ! B_IS_IN_TREE(left),
913 "vs-8170: left neighbor (%b %z) is not in the tree", left, left);
914 put_bh(left) ;
915 return 1;
916 }
917
918 return 0;
919}
920
921
922#define LEFT_PARENTS 'l'
923#define RIGHT_PARENTS 'r'
924
925
926static void decrement_key (struct cpu_key * p_s_key)
927{
928 // call item specific function for this key
929 item_ops[cpu_key_k_type (p_s_key)]->decrement_key (p_s_key);
930}
931
932
933
934
935/* Calculate far left/right parent of the left/right neighbor of the current node, that
936 * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h].
937 * Calculate left/right common parent of the current node and L[h]/R[h].
938 * Calculate left/right delimiting key position.
939 * Returns:	REPEAT_SEARCH - path in the tree is not correct, or schedule occurred;
940 *		CARRY_ON - schedule didn't occur while the function worked;
941 *		IO_ERROR - an I/O error occurred while searching.
942 */
943static int get_far_parent (struct tree_balance * p_s_tb,
944 int n_h,
945 struct buffer_head ** pp_s_father,
946 struct buffer_head ** pp_s_com_father,
947 char c_lr_par)
948{
949 struct buffer_head * p_s_parent;
950 INITIALIZE_PATH (s_path_to_neighbor_father);
951 struct path * p_s_path = p_s_tb->tb_path;
952 struct cpu_key s_lr_father_key;
953 int n_counter,
954 n_position = INT_MAX,
955 n_first_last_position = 0,
956 n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h);
957
958 /* Starting from F[n_h] go upwards in the tree, and look for the common
959 ancestor of F[n_h], and its neighbor l/r, that should be obtained. */
960
961 n_counter = n_path_offset;
962
963 RFALSE( n_counter < FIRST_PATH_ELEMENT_OFFSET,
964 "PAP-8180: invalid path length");
965
966
967 for ( ; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter-- ) {
968 /* Check whether parent of the current buffer in the path is really parent in the tree. */
969 if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)) )
970 return REPEAT_SEARCH;
971 /* Check whether position in the parent is correct. */
972 if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_counter - 1)) > B_NR_ITEMS(p_s_parent) )
973 return REPEAT_SEARCH;
974 /* Check whether parent at the path really points to the child. */
975 if ( B_N_CHILD_NUM(p_s_parent, n_position) !=
976 PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr )
977 return REPEAT_SEARCH;
978 /* Return delimiting key if position in the parent is not equal to first/last one. */
979 if ( c_lr_par == RIGHT_PARENTS )
980 n_first_last_position = B_NR_ITEMS (p_s_parent);
981 if ( n_position != n_first_last_position ) {
982 *pp_s_com_father = p_s_parent;
983 get_bh(*pp_s_com_father) ;
984 /*(*pp_s_com_father = p_s_parent)->b_count++;*/
985 break;
986 }
987 }
988
989 /* if we are in the root of the tree, then there is no common father */
990 if ( n_counter == FIRST_PATH_ELEMENT_OFFSET ) {
991 /* Check whether first buffer in the path is the root of the tree. */
992 if ( PATH_OFFSET_PBUFFER(p_s_tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
993 SB_ROOT_BLOCK (p_s_tb->tb_sb) ) {
994 *pp_s_father = *pp_s_com_father = NULL;
995 return CARRY_ON;
996 }
997 return REPEAT_SEARCH;
998 }
999
1000 RFALSE( B_LEVEL (*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL,
1001 "PAP-8185: (%b %z) level too small",
1002 *pp_s_com_father, *pp_s_com_father);
1003
1004 /* Check whether the common parent is locked. */
1005
1006 if ( buffer_locked (*pp_s_com_father) ) {
1007 __wait_on_buffer(*pp_s_com_father);
1008 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
1009 decrement_bcount(*pp_s_com_father);
1010 return REPEAT_SEARCH;
1011 }
1012 }
1013
1014    /* So, we got the common parent of the current node and its left/right neighbor.
1015       Now we are getting the parent of the left/right neighbor. */
1016
1017 /* Form key to get parent of the left/right neighbor. */
1018 le_key2cpu_key (&s_lr_father_key, B_N_PDELIM_KEY(*pp_s_com_father, ( c_lr_par == LEFT_PARENTS ) ?
1019 (p_s_tb->lkey[n_h - 1] = n_position - 1) : (p_s_tb->rkey[n_h - 1] = n_position)));
1020
1021
1022 if ( c_lr_par == LEFT_PARENTS )
1023 decrement_key(&s_lr_father_key);
1024
1025 if (search_by_key(p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, n_h + 1) == IO_ERROR)
1026 // path is released
1027 return IO_ERROR;
1028
1029 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
1030 decrement_counters_in_path(&s_path_to_neighbor_father);
1031 decrement_bcount(*pp_s_com_father);
1032 return REPEAT_SEARCH;
1033 }
1034
1035 *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
1036
1037 RFALSE( B_LEVEL (*pp_s_father) != n_h + 1,
1038 "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father);
1039 RFALSE( s_path_to_neighbor_father.path_length < FIRST_PATH_ELEMENT_OFFSET,
1040 "PAP-8192: path length is too small");
1041
1042 s_path_to_neighbor_father.path_length--;
1043 decrement_counters_in_path(&s_path_to_neighbor_father);
1044 return CARRY_ON;
1045}
1046
1047
1048/* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of
1049 * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset],
1050 * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset].
1051 * Calculate numbers of left and right delimiting keys position: lkey[n_path_offset], rkey[n_path_offset].
1052 * Returns:	REPEAT_SEARCH - schedule occurred while the function worked;
1053 * CARRY_ON - schedule didn't occur while the function worked;
1054 */
1055static int get_parents (struct tree_balance * p_s_tb, int n_h)
1056{
1057 struct path * p_s_path = p_s_tb->tb_path;
1058 int n_position,
1059 n_ret_value,
1060 n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
1061 struct buffer_head * p_s_curf,
1062 * p_s_curcf;
1063
1064 /* Current node is the root of the tree or will be root of the tree */
1065 if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) {
1066 /* The root can not have parents.
1067 Release nodes which previously were obtained as parents of the current node neighbors. */
1068 decrement_bcount(p_s_tb->FL[n_h]);
1069 decrement_bcount(p_s_tb->CFL[n_h]);
1070 decrement_bcount(p_s_tb->FR[n_h]);
1071 decrement_bcount(p_s_tb->CFR[n_h]);
1072 p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] = p_s_tb->CFR[n_h] = NULL;
1073 return CARRY_ON;
1074 }
1075
1076 /* Get parent FL[n_path_offset] of L[n_path_offset]. */
1077 if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) ) {
1078 /* Current node is not the first child of its parent. */
1079 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/
1080 p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
1081 get_bh(p_s_curf) ;
1082 get_bh(p_s_curf) ;
1083 p_s_tb->lkey[n_h] = n_position - 1;
1084 }
1085 else {
1086 /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node.
1087 Calculate current common parent of L[n_path_offset] and the current node. Note that
1088 CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset].
1089 Calculate lkey[n_path_offset]. */
1090 if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf,
1091 &p_s_curcf, LEFT_PARENTS)) != CARRY_ON )
1092 return n_ret_value;
1093 }
1094
1095 decrement_bcount(p_s_tb->FL[n_h]);
1096 p_s_tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */
1097 decrement_bcount(p_s_tb->CFL[n_h]);
1098 p_s_tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */
1099
1100 RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) ||
1101 (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)),
1102 "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf);
1103
1104/* Get parent FR[n_h] of R[n_h]. */
1105
1106/* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */
1107 if ( n_position == B_NR_ITEMS (PATH_H_PBUFFER(p_s_path, n_h + 1)) ) {
1108/* Calculate current parent of R[n_h], which is the right neighbor of F[n_h].
1109 Calculate current common parent of R[n_h] and current node. Note that CFR[n_h]
1110 not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */
1111 if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf, RIGHT_PARENTS)) != CARRY_ON )
1112 return n_ret_value;
1113 }
1114 else {
1115/* Current node is not the last child of its parent F[n_h]. */
1116 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/
1117 p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
1118 get_bh(p_s_curf) ;
1119 get_bh(p_s_curf) ;
1120 p_s_tb->rkey[n_h] = n_position;
1121 }
1122
1123 decrement_bcount(p_s_tb->FR[n_h]);
1124 p_s_tb->FR[n_h] = p_s_curf; /* New initialization of FR[n_path_offset]. */
1125
1126 decrement_bcount(p_s_tb->CFR[n_h]);
1127 p_s_tb->CFR[n_h] = p_s_curcf; /* New initialization of CFR[n_path_offset]. */
1128
1129 RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) ||
1130 (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)),
1131 "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf);
1132
1133 return CARRY_ON;
1134}
1135
1136
1137/* it is possible to remove the node as a result of shifting to the
1138   neighbors, even when we insert or paste an item. */
1139static inline int can_node_be_removed (int mode, int lfree, int sfree, int rfree, struct tree_balance * tb, int h)
1140{
1141 struct buffer_head * Sh = PATH_H_PBUFFER (tb->tb_path, h);
1142 int levbytes = tb->insert_size[h];
1143 struct item_head * ih;
1144 struct reiserfs_key * r_key = NULL;
1145
1146 ih = B_N_PITEM_HEAD (Sh, 0);
1147 if ( tb->CFR[h] )
1148 r_key = B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]);
1149
1150 if (
1151 lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
1152 /* shifting may merge items which might save space */
1153 - (( ! h && op_is_left_mergeable (&(ih->ih_key), Sh->b_size) ) ? IH_SIZE : 0)
1154 - (( ! h && r_key && op_is_left_mergeable (r_key, Sh->b_size) ) ? IH_SIZE : 0)
1155 + (( h ) ? KEY_SIZE : 0))
1156 {
1157 /* node can not be removed */
1158 if (sfree >= levbytes ) { /* new item fits into node S[h] without any shifting */
1159 if ( ! h )
1160 tb->s0num = B_NR_ITEMS(Sh) + ((mode == M_INSERT ) ? 1 : 0);
1161 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1162 return NO_BALANCING_NEEDED;
1163 }
1164 }
1165 PROC_INFO_INC( tb -> tb_sb, can_node_be_removed[ h ] );
1166 return !NO_BALANCING_NEEDED;
1167}
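/* Toy numbers for the space test above (illustrative, leaf case only;
   each mergeable boundary saves one item header; names are hypothetical): */
#if 0
#define DEMO_IH_SIZE 24

static int demo_can_remove(int lfree, int sfree, int rfree,
			   int max_child_size, int levbytes,
			   int first_mergeable, int last_mergeable)
{
	int need = max_child_size + levbytes
		 - (first_mergeable ? DEMO_IH_SIZE : 0)
		 - (last_mergeable ? DEMO_IH_SIZE : 0);

	return lfree + rfree + sfree >= need;
}
/* demo_can_remove(2000, 100, 2000, 4072, 60, 1, 1) == 1:
   4100 free bytes cover the 4084 needed, so shifting can empty the node */
#endif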
1168
1169
1170
1171/* Check whether current node S[h] is balanced when increasing its size by
1172 * Inserting or Pasting.
1173 * Calculate parameters for balancing for current level h.
1174 * Parameters:
1175 * tb tree_balance structure;
1176 * h current level of the node;
1177 * inum item number in S[h];
1178 * mode i - insert, p - paste;
1179 * Returns: 1 - schedule occurred;
1180 * 0 - balancing for higher levels needed;
1181 * -1 - no balancing for higher levels needed;
1182 * -2 - no disk space.
1183 */
1184/* ip means Inserting or Pasting */
1185static int ip_check_balance (struct tree_balance * tb, int h)
1186{
1187 struct virtual_node * vn = tb->tb_vn;
1188    int levbytes,	/* Number of bytes that must be inserted into (the value
1189			   is negative if bytes are deleted) the buffer which
1190			   contains the node being balanced. The mnemonic is
1191			   that the attempted change in node space used at this
1192			   level is levbytes bytes. */
1193 n_ret_value;
1194
1195 int lfree, sfree, rfree /* free space in L, S and R */;
1196
1197    /* nver is short for number of vertices, and lnver is the number if
1198       we shift to the left, rnver is the number if we shift to the
1199       right, and lrnver is the number if we shift in both directions.
1200       The goal is to minimize first the number of vertices, and second,
1201       the number of vertices whose contents are changed by shifting,
1202       and third the number of uncached vertices whose contents are
1203       changed by shifting and must be read from disk. */
1204 int nver, lnver, rnver, lrnver;
1205
1206 /* used at leaf level only, S0 = S[0] is the node being balanced,
1207 sInum [ I = 0,1,2 ] is the number of items that will
1208 remain in node SI after balancing. S1 and S2 are new
1209 nodes that might be created. */
1210
1211    /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters,
1212       where the 4th parameter is s1bytes and the 5th is s2bytes
1213    */
1214 short snum012[40] = {0,}; /* s0num, s1num, s2num for 8 cases
1215 0,1 - do not shift and do not shift but bottle
1216 2 - shift only whole item to left
1217 3 - shift to left and bottle as much as possible
1218				4,5 - shift to right (whole items and as much as possible)
1219 6,7 - shift to both directions (whole items and as much as possible)
1220 */
1221
1222 /* Sh is the node whose balance is currently being checked */
1223 struct buffer_head * Sh;
1224
1225 Sh = PATH_H_PBUFFER (tb->tb_path, h);
1226 levbytes = tb->insert_size[h];
1227
1228 /* Calculate balance parameters for creating new root. */
1229 if ( ! Sh ) {
1230 if ( ! h )
1231 reiserfs_panic (tb->tb_sb, "vs-8210: ip_check_balance: S[0] can not be 0");
1232 switch ( n_ret_value = get_empty_nodes (tb, h) ) {
1233 case CARRY_ON:
1234 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1235 return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
1236
1237 case NO_DISK_SPACE:
1238 case REPEAT_SEARCH:
1239 return n_ret_value;
1240 default:
1241 reiserfs_panic(tb->tb_sb, "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes");
1242 }
1243 }
1244
1245 if ( (n_ret_value = get_parents (tb, h)) != CARRY_ON ) /* get parents of S[h] neighbors. */
1246 return n_ret_value;
1247
1248 sfree = B_FREE_SPACE (Sh);
1249
1250 /* get free space of neighbors */
1251 rfree = get_rfree (tb, h);
1252 lfree = get_lfree (tb, h);
1253
1254 if (can_node_be_removed (vn->vn_mode, lfree, sfree, rfree, tb, h) == NO_BALANCING_NEEDED)
1255 /* and new item fits into node S[h] without any shifting */
1256 return NO_BALANCING_NEEDED;
1257
1258 create_virtual_node (tb, h);
1259
1260 /*
1261 determine maximal number of items we can shift to the left neighbor (in tb structure)
1262 and the maximal number of bytes that can flow to the left neighbor
1263 from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
1264 */
1265 check_left (tb, h, lfree);
1266
1267 /*
1268 determine maximal number of items we can shift to the right neighbor (in tb structure)
1269 and the maximal number of bytes that can flow to the right neighbor
1270 from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
1271 */
1272 check_right (tb, h, rfree);
1273
1274
1275 /* all contents of internal node S[h] can be moved into its
1276 neighbors, S[h] will be removed after balancing */
1277 if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
1278 int to_r;
1279
1280 /* Since we are working on internal nodes, and our internal
1281 nodes have fixed size entries, then we can balance by the
1282 number of items rather than the space they consume. In this
1283 routine we set the left node equal to the right node,
1284 allowing a difference of less than or equal to 1 child
1285 pointer. */
1286 to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
1287 (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1288 set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
1289 return CARRY_ON;
1290 }
1291
1292    /* this checks the balance condition: no two neighboring nodes can fit into one node */
1293 RFALSE( h &&
1294 ( tb->lnum[h] >= vn->vn_nr_item + 1 ||
1295 tb->rnum[h] >= vn->vn_nr_item + 1),
1296 "vs-8220: tree is not balanced on internal level");
1297 RFALSE( ! h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
1298 (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1)) ),
1299 "vs-8225: tree is not balanced on leaf level");
1300
1301 /* all contents of S[0] can be moved into its neighbors
1302 S[0] will be removed after balancing. */
1303 if (!h && is_leaf_removable (tb))
1304 return CARRY_ON;
1305
1306
1307    /* why do we perform this check here rather than earlier?
1308       Answer: we can win one node in some of the cases above. Moreover, we
1309       already checked it above, when verifying that S[0] is not removable
1310       in principle */
1311 if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */
1312 if ( ! h )
1313 tb->s0num = vn->vn_nr_item;
1314 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1315 return NO_BALANCING_NEEDED;
1316 }
1317
1318
1319 {
1320 int lpar, rpar, nset, lset, rset, lrset;
1321 /*
1322 * regular overflowing of the node
1323 */
1324
1325 /* get_num_ver works in 2 modes (FLOW & NO_FLOW)
1326 lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
1327       nset, lset, rset, lrset - show whether flowing items give better packing
1328 */
1329#define FLOW 1
1330#define NO_FLOW	0	/* do no splitting */
1331
1332    /* we choose one of the following */
1333#define NOTHING_SHIFT_NO_FLOW 0
1334#define NOTHING_SHIFT_FLOW 5
1335#define LEFT_SHIFT_NO_FLOW 10
1336#define LEFT_SHIFT_FLOW 15
1337#define RIGHT_SHIFT_NO_FLOW 20
1338#define RIGHT_SHIFT_FLOW 25
1339#define LR_SHIFT_NO_FLOW 30
1340#define LR_SHIFT_FLOW 35
1341
1342
1343 lpar = tb->lnum[h];
1344 rpar = tb->rnum[h];
1345
1346
1347 /* calculate number of blocks S[h] must be split into when
1348 nothing is shifted to the neighbors,
1349 as well as number of items in each part of the split node (s012 numbers),
1350 and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
1351 nset = NOTHING_SHIFT_NO_FLOW;
1352 nver = get_num_ver (vn->vn_mode, tb, h,
1353 0, -1, h?vn->vn_nr_item:0, -1,
1354 snum012, NO_FLOW);
1355
1356 if (!h)
1357 {
1358 int nver1;
1359
1360	    /* note that in this case we try to bottle between S[0] and S1 (S1 is the first new node) */
1361 nver1 = get_num_ver (vn->vn_mode, tb, h,
1362 0, -1, 0, -1,
1363 snum012 + NOTHING_SHIFT_FLOW, FLOW);
1364 if (nver > nver1)
1365 nset = NOTHING_SHIFT_FLOW, nver = nver1;
1366 }
1367
1368
1369 /* calculate number of blocks S[h] must be split into when
1370 l_shift_num first items and l_shift_bytes of the right most
1371 liquid item to be shifted are shifted to the left neighbor,
1372       as well as the number of items in each part of the split node (s012 numbers),
1373 and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1374 */
1375 lset = LEFT_SHIFT_NO_FLOW;
1376 lnver = get_num_ver (vn->vn_mode, tb, h,
1377 lpar - (( h || tb->lbytes == -1 ) ? 0 : 1), -1, h ? vn->vn_nr_item:0, -1,
1378 snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
1379 if (!h)
1380 {
1381 int lnver1;
1382
1383 lnver1 = get_num_ver (vn->vn_mode, tb, h,
1384 lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, 0, -1,
1385 snum012 + LEFT_SHIFT_FLOW, FLOW);
1386 if (lnver > lnver1)
1387 lset = LEFT_SHIFT_FLOW, lnver = lnver1;
1388 }
1389
1390
1391 /* calculate number of blocks S[h] must be split into when
1392 r_shift_num first items and r_shift_bytes of the left most
1393 liquid item to be shifted are shifted to the right neighbor,
1394       as well as the number of items in each part of the split node (s012 numbers),
1395 and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1396 */
1397 rset = RIGHT_SHIFT_NO_FLOW;
1398 rnver = get_num_ver (vn->vn_mode, tb, h,
1399 0, -1, h ? (vn->vn_nr_item-rpar) : (rpar - (( tb->rbytes != -1 ) ? 1 : 0)), -1,
1400 snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
1401 if (!h)
1402 {
1403 int rnver1;
1404
1405 rnver1 = get_num_ver (vn->vn_mode, tb, h,
1406 0, -1, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes,
1407 snum012 + RIGHT_SHIFT_FLOW, FLOW);
1408
1409 if (rnver > rnver1)
1410 rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
1411 }
1412
1413
1414 /* calculate number of blocks S[h] must be split into when
1415 items are shifted in both directions,
1416       as well as the number of items in each part of the split node (s012 numbers),
1417 and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1418 */
1419 lrset = LR_SHIFT_NO_FLOW;
1420 lrnver = get_num_ver (vn->vn_mode, tb, h,
1421 lpar - ((h || tb->lbytes == -1) ? 0 : 1), -1, h ? (vn->vn_nr_item-rpar):(rpar - ((tb->rbytes != -1) ? 1 : 0)), -1,
1422 snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
1423 if (!h)
1424 {
1425 int lrnver1;
1426
1427 lrnver1 = get_num_ver (vn->vn_mode, tb, h,
1428 lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes,
1429 snum012 + LR_SHIFT_FLOW, FLOW);
1430 if (lrnver > lrnver1)
1431 lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
1432 }
1433
1434
1435
1436    /* Our general shifting strategy is:
1437       1) to minimize the number of new nodes;
1438       2) to minimize the number of neighbors involved in shifting;
1439       3) to minimize the number of disk reads; */
1440
1441    /* we can win ONE or TWO nodes by shifting in both directions */
1442 if (lrnver < lnver && lrnver < rnver)
1443 {
1444 RFALSE( h &&
1445 (tb->lnum[h] != 1 ||
1446 tb->rnum[h] != 1 ||
1447 lrnver != 1 || rnver != 2 || lnver != 2 || h != 1),
1448 "vs-8230: bad h");
1449 if (lrset == LR_SHIFT_FLOW)
1450 set_parameters (tb, h, tb->lnum[h], tb->rnum[h], lrnver, snum012 + lrset,
1451 tb->lbytes, tb->rbytes);
1452 else
1453 set_parameters (tb, h, tb->lnum[h] - ((tb->lbytes == -1) ? 0 : 1),
1454 tb->rnum[h] - ((tb->rbytes == -1) ? 0 : 1), lrnver, snum012 + lrset, -1, -1);
1455
1456 return CARRY_ON;
1457 }
1458
1459 /* if shifting doesn't lead to better packing then don't shift */
1460 if (nver == lrnver)
1461 {
1462 set_parameters (tb, h, 0, 0, nver, snum012 + nset, -1, -1);
1463 return CARRY_ON;
1464 }
1465
1466
1467 /* now we know that for better packing shifting in only one
1468 direction either to the left or to the right is required */
1469
1470 /* if shifting to the left is better than shifting to the right */
1471 if (lnver < rnver)
1472 {
1473 SET_PAR_SHIFT_LEFT;
1474 return CARRY_ON;
1475 }
1476
1477 /* if shifting to the right is better than shifting to the left */
1478 if (lnver > rnver)
1479 {
1480 SET_PAR_SHIFT_RIGHT;
1481 return CARRY_ON;
1482 }
1483
1484
1485 /* now shifting in either direction gives the same number
1486 of nodes and we can make use of the cached neighbors */
1487 if (is_left_neighbor_in_cache (tb,h))
1488 {
1489 SET_PAR_SHIFT_LEFT;
1490 return CARRY_ON;
1491 }
1492
1493	/* shift to the right regardless of whether the right neighbor is in the cache */
1494 SET_PAR_SHIFT_RIGHT;
1495 return CARRY_ON;
1496 }
1497}
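/* The decision at the end of ip_check_balance(), reduced to its
   skeleton (illustrative): the plan with the fewest resulting nodes
   wins, ties break toward not shifting, then toward the cheaper
   one-sided shift, and finally toward the cached left neighbor. */
#if 0
enum demo_plan { DEMO_NONE, DEMO_BOTH, DEMO_LEFT, DEMO_RIGHT };

static enum demo_plan demo_choose(int nver, int lnver, int rnver,
				  int lrnver, int left_cached)
{
	if (lrnver < lnver && lrnver < rnver)
		return DEMO_BOTH;	/* two-sided shift saves a node */
	if (nver == lrnver)
		return DEMO_NONE;	/* shifting buys nothing: just split */
	if (lnver < rnver)
		return DEMO_LEFT;
	if (lnver > rnver)
		return DEMO_RIGHT;
	return left_cached ? DEMO_LEFT : DEMO_RIGHT;	/* cache tie-break */
}
#endif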
1498
1499
1500/* Check whether current node S[h] is balanced when Decreasing its size by
1501 * Deleting or Cutting for INTERNAL node of S+tree.
1502 * Calculate parameters for balancing for current level h.
1503 * Parameters:
1504 * tb tree_balance structure;
1505 * h current level of the node;
1506 * inum item number in S[h];
1507 *	mode	d - delete, c - cut;
1508 * Returns: 1 - schedule occurred;
1509 * 0 - balancing for higher levels needed;
1510 * -1 - no balancing for higher levels needed;
1511 * -2 - no disk space.
1512 *
1513 * Note: Items of internal nodes have fixed size, so the balance condition for
1514 * the internal part of S+tree is as for the B-trees.
1515 */
1516static int dc_check_balance_internal (struct tree_balance * tb, int h)
1517{
1518 struct virtual_node * vn = tb->tb_vn;
1519
1520 /* Sh is the node whose balance is currently being checked,
1521 and Fh is its father. */
1522 struct buffer_head * Sh, * Fh;
1523 int maxsize,
1524 n_ret_value;
1525 int lfree, rfree /* free space in L and R */;
1526
1527 Sh = PATH_H_PBUFFER (tb->tb_path, h);
1528 Fh = PATH_H_PPARENT (tb->tb_path, h);
1529
1530 maxsize = MAX_CHILD_SIZE(Sh);
1531
1532/* using tb->insert_size[h], which is negative in this case, create_virtual_node */
1533/* calculates new_nr_item: the number of items the node would have if the */
1534/* operation were performed without balancing */
1535 create_virtual_node (tb, h);
1536
1537 if ( ! Fh )
1538 { /* S[h] is the root. */
1539 if ( vn->vn_nr_item > 0 )
1540 {
1541 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1542 return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
1543 }
1544 /* new_nr_item == 0.
1545 * Current root will be deleted resulting in
1546 * decrementing the tree height. */
1547 set_parameters (tb, h, 0, 0, 0, NULL, -1, -1);
1548 return CARRY_ON;
1549 }
1550
1551 if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON )
1552 return n_ret_value;
1553
1554
1555 /* get free space of neighbors */
1556 rfree = get_rfree (tb, h);
1557 lfree = get_lfree (tb, h);
1558
1559 /* determine maximal number of items we can fit into neighbors */
1560 check_left (tb, h, lfree);
1561 check_right (tb, h, rfree);
1562
1563
1564 if ( vn->vn_nr_item >= MIN_NR_KEY(Sh) )
1565 { /* Balance condition for the internal node is valid.
1566 * In this case we balance only if it leads to better packing. */
1567 if ( vn->vn_nr_item == MIN_NR_KEY(Sh) )
1568 { /* Here we join S[h] with one of its neighbors,
1569 * which is impossible with greater values of new_nr_item. */
1570 if ( tb->lnum[h] >= vn->vn_nr_item + 1 )
1571 {
1572 /* All contents of S[h] can be moved to L[h]. */
1573 int n;
1574 int order_L;
1575
1576 order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1577 n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE);
1578 set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1);
1579 return CARRY_ON;
1580 }
1581
1582 if ( tb->rnum[h] >= vn->vn_nr_item + 1 )
1583 {
1584 /* All contents of S[h] can be moved to R[h]. */
1585 int n;
1586 int order_R;
1587
1588 order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : n + 1;
1589 n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE);
1590 set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1);
1591 return CARRY_ON;
1592 }
1593 }
1594
1595 if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)
1596 {
1597 /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1598 int to_r;
1599
1600 to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
1601 (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1602 set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
1603 return CARRY_ON;
1604 }
1605
1606 /* Balancing does not lead to better packing. */
1607 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1608 return NO_BALANCING_NEEDED;
1609 }
1610
1611  /* The current node contains an insufficient number of items. Balancing is required. */
1612  /* Check whether we can merge S[h] with its left neighbor. */
1613 if (tb->lnum[h] >= vn->vn_nr_item + 1)
1614 if (is_left_neighbor_in_cache (tb,h) || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h])
1615 {
1616 int n;
1617 int order_L;
1618
1619 order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1620 n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE);
1621 set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1);
1622 return CARRY_ON;
1623 }
1624
1625 /* Check whether we can merge S[h] with right neighbor. */
1626 if (tb->rnum[h] >= vn->vn_nr_item + 1)
1627 {
1628 int n;
1629 int order_R;
1630
1631 order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : (n + 1);
1632 n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE);
1633 set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1);
1634 return CARRY_ON;
1635 }
1636
1637 /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1638 if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)
1639 {
1640 int to_r;
1641
1642 to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
1643 (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1644 set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
1645 return CARRY_ON;
1646 }
1647
1648 /* For internal nodes try to borrow item from a neighbor */
1649 RFALSE( !tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
1650
1651  /* Borrow one or two items from the cached neighbor */
1652 if (is_left_neighbor_in_cache (tb,h) || !tb->FR[h])
1653 {
1654 int from_l;
1655
1656 from_l = (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + 1) / 2 - (vn->vn_nr_item + 1);
1657 set_parameters (tb, h, -from_l, 0, 1, NULL, -1, -1);
1658 return CARRY_ON;
1659 }
1660
1661 set_parameters (tb, h, 0, -((MAX_NR_KEY(Sh)+1-tb->rnum[h]+vn->vn_nr_item+1)/2-(vn->vn_nr_item+1)), 1,
1662 NULL, -1, -1);
1663 return CARRY_ON;
1664}
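/* The borrow amounts above mirror the equalizing shift used elsewhere:
   after borrowing, S[h] and the donor hold about half of their combined
   children each. A toy rendering (illustrative names and numbers): */
#if 0
static int demo_from_l(int max_nr_key, int lnum, int vn_nr_item)
{
	int l_now = max_nr_key + 1 - lnum;	/* children already in L */
	int s_now = vn_nr_item + 1;		/* children left in S[h] */

	return (l_now + s_now) / 2 - s_now;	/* borrow this many */
}
/* demo_from_l(84, 5, 20): L holds 80, S holds 21 -> borrow 29,
   ending with 51 and 50 children respectively */
#endif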
1665
1666
1667/* Check whether current node S[h] is balanced when Decreasing its size by
1668 * Deleting or Truncating for LEAF node of S+tree.
1669 * Calculate parameters for balancing for current level h.
1670 * Parameters:
1671 * tb tree_balance structure;
1672 * h current level of the node;
1673 * inum item number in S[h];
1674 *	mode	d - delete, c - cut;
1675 * Returns: 1 - schedule occurred;
1676 * 0 - balancing for higher levels needed;
1677 * -1 - no balancing for higher levels needed;
1678 * -2 - no disk space.
1679 */
1680static int dc_check_balance_leaf (struct tree_balance * tb, int h)
1681{
1682 struct virtual_node * vn = tb->tb_vn;
1683
1684    /* Number of bytes that must be deleted from the buffer which
1685       contains the node being balanced (the value is negative when
1686       bytes are deleted). The mnemonic is that the attempted change
1687       in node space used at this level is levbytes bytes. */
1688 int levbytes;
1689 /* the maximal item size */
1690 int maxsize,
1691 n_ret_value;
1692 /* S0 is the node whose balance is currently being checked,
1693 and F0 is its father. */
1694 struct buffer_head * S0, * F0;
1695 int lfree, rfree /* free space in L and R */;
1696
1697 S0 = PATH_H_PBUFFER (tb->tb_path, 0);
1698 F0 = PATH_H_PPARENT (tb->tb_path, 0);
1699
1700 levbytes = tb->insert_size[h];
1701
1702 maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */
1703
1704 if ( ! F0 )
1705 { /* S[0] is the root now. */
1706
1707 RFALSE( -levbytes >= maxsize - B_FREE_SPACE (S0),
1708 "vs-8240: attempt to create empty buffer tree");
1709
1710 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1711 return NO_BALANCING_NEEDED;
1712 }
1713
1714 if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON )
1715 return n_ret_value;
1716
1717 /* get free space of neighbors */
1718 rfree = get_rfree (tb, h);
1719 lfree = get_lfree (tb, h);
1720
1721 create_virtual_node (tb, h);
1722
1723  /* if 3 leaves can be merged into one, set parameters and return */
1724 if (are_leaves_removable (tb, lfree, rfree))
1725 return CARRY_ON;
1726
1727    /* determine the maximal number of items we can shift to the left/right
1728       neighbor, and the maximal number of bytes that can flow to the
1729       left/right neighbor from the left-/right-most liquid item that cannot
1730       be shifted from S[0] entirely */
1731 check_left (tb, h, lfree);
1732 check_right (tb, h, rfree);
1733
1734 /* check whether we can merge S with left neighbor. */
1735 if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
1736 if (is_left_neighbor_in_cache (tb,h) ||
1737 ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */
1738 !tb->FR[h]) {
1739
1740 RFALSE( !tb->FL[h], "vs-8245: dc_check_balance_leaf: FL[h] must exist");
1741
1742 /* set parameter to merge S[0] with its left neighbor */
1743 set_parameters (tb, h, -1, 0, 0, NULL, -1, -1);
1744 return CARRY_ON;
1745 }
1746
1747 /* check whether we can merge S[0] with right neighbor. */
1748 if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
1749 set_parameters (tb, h, 0, -1, 0, NULL, -1, -1);
1750 return CARRY_ON;
1751 }
1752
1753 /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
1754 if (is_leaf_removable (tb))
1755 return CARRY_ON;
1756
1757 /* Balancing is not required. */
1758 tb->s0num = vn->vn_nr_item;
1759 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1760 return NO_BALANCING_NEEDED;
1761}
1762
1763
1764
1765/* Check whether current node S[h] is balanced when Decreasing its size by
1766 * Deleting or Cutting.
1767 * Calculate parameters for balancing for current level h.
1768 * Parameters:
1769 * tb tree_balance structure;
1770 * h current level of the node;
1771 * inum item number in S[h];
1772 * mode d - delete, c - cut.
1773 * Returns: 1 - schedule occurred;
1774 * 0 - balancing for higher levels needed;
1775 * -1 - no balancing for higher levels needed;
1776 * -2 - no disk space.
1777 */
1778static int dc_check_balance (struct tree_balance * tb, int h)
1779{
1780 RFALSE( ! (PATH_H_PBUFFER (tb->tb_path, h)), "vs-8250: S is not initialized");
1781
1782 if ( h )
1783 return dc_check_balance_internal (tb, h);
1784 else
1785 return dc_check_balance_leaf (tb, h);
1786}
1787
1788
1789
1790/* Check whether current node S[h] is balanced.
1791 * Calculate parameters for balancing for current level h.
1792 * Parameters:
1793 *
1794 *	tb	tree_balance structure;
1795 *
1796 *	tb is a large structure; its definition in the header file should
1797 *	be read together with this procedure if the reader is to
1798 *	understand it
1799 *
1800 * h current level of the node;
1801 * inum item number in S[h];
1802 * mode i - insert, p - paste, d - delete, c - cut.
1803 * Returns: 1 - schedule occurred;
1804 * 0 - balancing for higher levels needed;
1805 * -1 - no balancing for higher levels needed;
1806 * -2 - no disk space.
1807 */
1808static int check_balance (int mode,
1809 struct tree_balance * tb,
1810 int h,
1811 int inum,
1812 int pos_in_item,
1813 struct item_head * ins_ih,
1814 const void * data
1815 )
1816{
1817 struct virtual_node * vn;
1818
1819 vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
1820 vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
1821 vn->vn_mode = mode;
1822 vn->vn_affected_item_num = inum;
1823 vn->vn_pos_in_item = pos_in_item;
1824 vn->vn_ins_ih = ins_ih;
1825 vn->vn_data = data;
1826
1827 RFALSE( mode == M_INSERT && !vn->vn_ins_ih,
1828 "vs-8255: ins_ih can not be 0 in insert mode");
1829
1830 if ( tb->insert_size[h] > 0 )
1831 /* Calculate balance parameters when size of node is increasing. */
1832 return ip_check_balance (tb, h);
1833
1834 /* Calculate balance parameters when size of node is decreasing. */
1835 return dc_check_balance (tb, h);
1836}
1837
1838
1839
1840/* Check whether the parent in the path is really the parent of the current node. */
1841static int get_direct_parent(
1842 struct tree_balance * p_s_tb,
1843 int n_h
1844 ) {
1845 struct buffer_head * p_s_bh;
1846 struct path * p_s_path = p_s_tb->tb_path;
1847 int n_position,
1848 n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
1849
1850 /* We are in the root or in the new root. */
1851 if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) {
1852
1853 RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
1854 "PAP-8260: invalid offset in the path");
1855
1856 if ( PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
1857 SB_ROOT_BLOCK (p_s_tb->tb_sb) ) {
1858 /* Root is not changed. */
1859 PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL;
1860 PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0;
1861 return CARRY_ON;
1862 }
1863 return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */
1864 }
1865
1866 if ( ! B_IS_IN_TREE(p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)) )
1867 return REPEAT_SEARCH; /* Parent in the path is not in the tree. */
1868
1869 if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) > B_NR_ITEMS(p_s_bh) )
1870 return REPEAT_SEARCH;
1871
1872 if ( B_N_CHILD_NUM(p_s_bh, n_position) != PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr )
1873 /* Parent in the path is not parent of the current node in the tree. */
1874 return REPEAT_SEARCH;
1875
1876 if ( buffer_locked(p_s_bh) ) {
1877 __wait_on_buffer(p_s_bh);
1878 if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
1879 return REPEAT_SEARCH;
1880 }
1881
1882 return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */
1883}
1884
1885
1886/* Using lnum[n_h] and rnum[n_h] we determine which neighbors
1887 * of S[n_h] are needed
1888 * in order to balance S[n_h], and we get them if necessary.
1889 * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
1890 *	        CARRY_ON - schedule didn't occur while the function worked;
1891 */
1892static int get_neighbors(
1893 struct tree_balance * p_s_tb,
1894 int n_h
1895 ) {
1896 int n_child_position,
1897 n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1);
1898 unsigned long n_son_number;
1899 struct super_block * p_s_sb = p_s_tb->tb_sb;
1900 struct buffer_head * p_s_bh;
1901
1902
1903 PROC_INFO_INC( p_s_sb, get_neighbors[ n_h ] );
1904
1905 if ( p_s_tb->lnum[n_h] ) {
1906 /* We need left neighbor to balance S[n_h]. */
1907 PROC_INFO_INC( p_s_sb, need_l_neighbor[ n_h ] );
1908 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
1909
1910 RFALSE( p_s_bh == p_s_tb->FL[n_h] &&
1911 ! PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset),
1912 "PAP-8270: invalid position in the parent");
1913
1914 n_child_position = ( p_s_bh == p_s_tb->FL[n_h] ) ? p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]);
1915 n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position);
1916 p_s_bh = sb_bread(p_s_sb, n_son_number);
1917 if (!p_s_bh)
1918 return IO_ERROR;
1919 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
1920 decrement_bcount(p_s_bh);
1921 PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] );
1922 return REPEAT_SEARCH;
1923 }
1924
1925 RFALSE( ! B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
1926 n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) ||
1927 B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) !=
1928 p_s_bh->b_blocknr, "PAP-8275: invalid parent");
1929 RFALSE( ! B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child");
1930 RFALSE( ! n_h &&
1931 B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FL[0],n_child_position)),
1932 "PAP-8290: invalid child size of left neighbor");
1933
1934 decrement_bcount(p_s_tb->L[n_h]);
1935 p_s_tb->L[n_h] = p_s_bh;
1936 }
1937
1938
1939    if ( p_s_tb->rnum[n_h] ) { /* We need the right neighbor to balance S[n_h]. */
1940 PROC_INFO_INC( p_s_sb, need_r_neighbor[ n_h ] );
1941 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
1942
1943 RFALSE( p_s_bh == p_s_tb->FR[n_h] &&
1944 PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset) >= B_NR_ITEMS(p_s_bh),
1945 "PAP-8295: invalid position in the parent");
1946
1947 n_child_position = ( p_s_bh == p_s_tb->FR[n_h] ) ? p_s_tb->rkey[n_h] + 1 : 0;
1948 n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position);
1949 p_s_bh = sb_bread(p_s_sb, n_son_number);
1950 if (!p_s_bh)
1951 return IO_ERROR;
1952 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
1953 decrement_bcount(p_s_bh);
1954 PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] );
1955 return REPEAT_SEARCH;
1956 }
1957 decrement_bcount(p_s_tb->R[n_h]);
1958 p_s_tb->R[n_h] = p_s_bh;
1959
1960 RFALSE( ! n_h && B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)),
1961 "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
1962 B_FREE_SPACE (p_s_bh), MAX_CHILD_SIZE (p_s_bh),
1963 dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)));
1964
1965 }
1966 return CARRY_ON;
1967}
1968
1969#ifdef CONFIG_REISERFS_CHECK
1970void * reiserfs_kmalloc (size_t size, int flags, struct super_block * s)
1971{
1972 void * vp;
1973 static size_t malloced;
1974
1975
1976 vp = kmalloc (size, flags);
1977 if (vp) {
1978 REISERFS_SB(s)->s_kmallocs += size;
1979 if (REISERFS_SB(s)->s_kmallocs > malloced + 200000) {
1980 reiserfs_warning (s,
1981 "vs-8301: reiserfs_kmalloc: allocated memory %d",
1982 REISERFS_SB(s)->s_kmallocs);
1983 malloced = REISERFS_SB(s)->s_kmallocs;
1984 }
1985 }
1986 return vp;
1987}
1988
1989void reiserfs_kfree (const void * vp, size_t size, struct super_block * s)
1990{
1991 kfree (vp);
1992
1993 REISERFS_SB(s)->s_kmallocs -= size;
1994 if (REISERFS_SB(s)->s_kmallocs < 0)
1995 reiserfs_warning (s, "vs-8302: reiserfs_kfree: allocated memory %d",
1996 REISERFS_SB(s)->s_kmallocs);
1997
1998}
1999#endif
2000
2001
2002static int get_virtual_node_size (struct super_block * sb, struct buffer_head * bh)
2003{
2004 int max_num_of_items;
2005 int max_num_of_entries;
2006 unsigned long blocksize = sb->s_blocksize;
2007
2008#define MIN_NAME_LEN 1
2009
2010 max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
2011 max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
2012 (DEH_SIZE + MIN_NAME_LEN);
2013
2014 return sizeof(struct virtual_node) +
2015 max(max_num_of_items * sizeof (struct virtual_item),
2016 sizeof (struct virtual_item) + sizeof(struct direntry_uarea) +
2017 (max_num_of_entries - 1) * sizeof (__u16));
2018}
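
   A back-of-envelope check of the sizing above, as a user-space sketch.
   The on-disk structure sizes are assumptions for illustration (values
   as commonly defined in reiserfs_fs.h); only the arithmetic is the
   point:

	#include <stdio.h>

	#define BLKH_SIZE    24  /* assumed sizeof(struct block_head) */
	#define IH_SIZE      24  /* assumed sizeof(struct item_head) */
	#define DEH_SIZE     16  /* assumed sizeof(struct reiserfs_de_head) */
	#define MIN_ITEM_LEN  1
	#define MIN_NAME_LEN  1

	int main(void)
	{
		unsigned long blocksize = 4096;
		int max_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
		int max_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
				  (DEH_SIZE + MIN_NAME_LEN);

		/* prints 162 and 238 for a 4K block: the virtual node buffer
		   is sized for the larger of the two layouts */
		printf("max items: %d, max dir entries: %d\n",
		       max_items, max_entries);
		return 0;
	}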
2019
2020
2021
2022/* Perhaps we should fail the balancing we are about to perform if
2023   kmalloc fails several times.  For now it loops until kmalloc gets
2024   the required memory. */
2025static int get_mem_for_virtual_node (struct tree_balance * tb)
2026{
2027 int check_fs = 0;
2028 int size;
2029 char * buf;
2030
2031 size = get_virtual_node_size (tb->tb_sb, PATH_PLAST_BUFFER (tb->tb_path));
2032
2033 if (size > tb->vn_buf_size) {
2034 /* we have to allocate more memory for virtual node */
2035 if (tb->vn_buf) {
2036 /* free memory allocated before */
2037 reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
2038 /* this is not needed if kfree is atomic */
2039 check_fs = 1;
2040 }
2041
2042 /* virtual node requires now more memory */
2043 tb->vn_buf_size = size;
2044
2045 /* get memory for virtual item */
2046 buf = reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN, tb->tb_sb);
2047 if ( ! buf ) {
2048	    /* getting memory with GFP_KERNEL priority may itself involve
2049	       balancing (due to indirect_to_direct conversion on dcache
2050	       shrinking), so release the path and the resources collected
2051	       so far before the blocking retry */
2052 free_buffers_in_tb (tb);
2053 buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb);
2054 if ( !buf ) {
2055#ifdef CONFIG_REISERFS_CHECK
2056 reiserfs_warning (tb->tb_sb,
2057 "vs-8345: get_mem_for_virtual_node: "
2058 "kmalloc failed. reiserfs kmalloced %d bytes",
2059 REISERFS_SB(tb->tb_sb)->s_kmallocs);
2060#endif
2061 tb->vn_buf_size = 0;
2062 }
2063 tb->vn_buf = buf;
2064 schedule() ;
2065 return REPEAT_SEARCH;
2066 }
2067
2068 tb->vn_buf = buf;
2069 }
2070
2071 if ( check_fs && FILESYSTEM_CHANGED_TB (tb) )
2072 return REPEAT_SEARCH;
2073
2074 return CARRY_ON;
2075}
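
   The comment above describes a two-phase allocation idiom: try a
   non-blocking allocation while still holding the collected resources;
   if that fails, drop the resources, allocate with a blocking call, and
   report REPEAT_SEARCH because the tree may have changed meanwhile.  A
   self-contained user-space sketch of the same idiom (all names here
   are hypothetical, with malloc standing in for the GFP_ATOMIC and
   GFP_NOFS allocations):

	#include <stdio.h>
	#include <stdlib.h>

	enum { SKETCH_CARRY_ON, SKETCH_REPEAT_SEARCH };

	struct ctx { void *buf; size_t buf_size; };

	static void *alloc_nonblocking(size_t n) { return malloc(n); }
	static void *alloc_blocking(size_t n)    { return malloc(n); }
	static void release_collected_resources(struct ctx *c) { (void)c; }

	static int grow_buffer(struct ctx *c, size_t size)
	{
		void *buf;

		if (size <= c->buf_size)
			return SKETCH_CARRY_ON;  /* current buffer suffices */

		free(c->buf);
		c->buf = NULL;
		c->buf_size = size;

		buf = alloc_nonblocking(size);   /* must not sleep here */
		if (!buf) {
			release_collected_resources(c); /* sleeping now safe */
			c->buf = alloc_blocking(size);  /* may still fail */
			return SKETCH_REPEAT_SEARCH;    /* world may have changed */
		}
		c->buf = buf;
		return SKETCH_CARRY_ON;
	}

	int main(void)
	{
		struct ctx c = { NULL, 0 };
		printf("grow_buffer: %d\n", grow_buffer(&c, 1024));
		free(c.buf);
		return 0;
	}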
2076
2077
2078#ifdef CONFIG_REISERFS_CHECK
2079static void tb_buffer_sanity_check (struct super_block * p_s_sb,
2080 struct buffer_head * p_s_bh,
2081 const char *descr, int level) {
2082 if (p_s_bh) {
2083 if (atomic_read (&(p_s_bh->b_count)) <= 0) {
2084
2085 reiserfs_panic (p_s_sb, "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n", descr, level, p_s_bh);
2086 }
2087
2088 if ( ! buffer_uptodate (p_s_bh) ) {
2089 reiserfs_panic (p_s_sb, "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n", descr, level, p_s_bh);
2090 }
2091
2092 if ( ! B_IS_IN_TREE (p_s_bh) ) {
2093 reiserfs_panic (p_s_sb, "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n", descr, level, p_s_bh);
2094 }
2095
2096 if (p_s_bh->b_bdev != p_s_sb->s_bdev) {
2097 reiserfs_panic (p_s_sb, "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n", descr, level, p_s_bh);
2098 }
2099
2100 if (p_s_bh->b_size != p_s_sb->s_blocksize) {
2101 reiserfs_panic (p_s_sb, "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n", descr, level, p_s_bh);
2102 }
2103
2104 if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
2105 reiserfs_panic (p_s_sb, "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n", descr, level, p_s_bh);
2106 }
2107 }
2108}
2109#else
2110static void tb_buffer_sanity_check (struct super_block * p_s_sb,
2111 struct buffer_head * p_s_bh,
2112 const char *descr, int level)
2113{;}
2114#endif
2115
2116static int clear_all_dirty_bits(struct super_block *s,
2117 struct buffer_head *bh) {
2118 return reiserfs_prepare_for_journal(s, bh, 0) ;
2119}
2120
2121static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
2122{
2123 struct buffer_head * locked;
2124#ifdef CONFIG_REISERFS_CHECK
2125 int repeat_counter = 0;
2126#endif
2127 int i;
2128
2129 do {
2130
2131 locked = NULL;
2132
2133 for ( i = p_s_tb->tb_path->path_length; !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i-- ) {
2134 if ( PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i) ) {
2135 /* if I understand correctly, we can only be sure the last buffer
2136 ** in the path is in the tree --clm
2137 */
2138#ifdef CONFIG_REISERFS_CHECK
2139 if (PATH_PLAST_BUFFER(p_s_tb->tb_path) ==
2140 PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
2141 tb_buffer_sanity_check (p_s_tb->tb_sb,
2142 PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i),
2143 "S",
2144 p_s_tb->tb_path->path_length - i);
2145 }
2146#endif
2147 if (!clear_all_dirty_bits(p_s_tb->tb_sb,
2148 PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)))
2149 {
2150 locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i);
2151 }
2152 }
2153 }
2154
2155 for ( i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i]; i++ ) {
2156
2157 if (p_s_tb->lnum[i] ) {
2158
2159 if ( p_s_tb->L[i] ) {
2160 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i);
2161 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]))
2162 locked = p_s_tb->L[i];
2163 }
2164
2165 if ( !locked && p_s_tb->FL[i] ) {
2166 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i);
2167 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]))
2168 locked = p_s_tb->FL[i];
2169 }
2170
2171 if ( !locked && p_s_tb->CFL[i] ) {
2172 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFL[i], "CFL", i);
2173 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]))
2174 locked = p_s_tb->CFL[i];
2175 }
2176
2177 }
2178
2179 if ( !locked && (p_s_tb->rnum[i]) ) {
2180
2181 if ( p_s_tb->R[i] ) {
2182 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i);
2183 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]))
2184 locked = p_s_tb->R[i];
2185 }
2186
2187
2188 if ( !locked && p_s_tb->FR[i] ) {
2189 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i);
2190 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]))
2191 locked = p_s_tb->FR[i];
2192 }
2193
2194 if ( !locked && p_s_tb->CFR[i] ) {
2195 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i);
2196 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]))
2197 locked = p_s_tb->CFR[i];
2198 }
2199 }
2200 }
2201 /* as far as I can tell, this is not required. The FEB list seems
2202 ** to be full of newly allocated nodes, which will never be locked,
2203 ** dirty, or anything else.
2204	** To be safe, I'm putting the checks and waits in.  For the moment,
2205 ** they are needed to keep the code in journal.c from complaining
2206 ** about the buffer. That code is inside CONFIG_REISERFS_CHECK as well.
2207 ** --clm
2208 */
2209 for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) {
2210 if ( p_s_tb->FEB[i] ) {
2211 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]))
2212 locked = p_s_tb->FEB[i] ;
2213 }
2214 }
2215
2216 if (locked) {
2217#ifdef CONFIG_REISERFS_CHECK
2218 repeat_counter++;
2219 if ( (repeat_counter % 10000) == 0) {
2220 reiserfs_warning (p_s_tb->tb_sb,
2221			      "wait_tb_buffers_until_unlocked(): too many "
2222 "iterations waiting for buffer to unlock "
2223 "(%b)", locked);
2224
2225 /* Don't loop forever. Try to recover from possible error. */
2226
2227 return ( FILESYSTEM_CHANGED_TB (p_s_tb) ) ? REPEAT_SEARCH : CARRY_ON;
2228 }
2229#endif
2230 __wait_on_buffer (locked);
2231 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
2232 return REPEAT_SEARCH;
2233 }
2234 }
2235
2236 } while (locked);
2237
2238 return CARRY_ON;
2239}
2240
2241
2242/* Prepare for balancing, that is
2243 * get all necessary parents, and neighbors;
2244 * analyze what and where should be moved;
2245 * get sufficient number of new nodes;
2246 * Balancing will start only after all resources have been collected at once.
2247 *
2248 * When ported to SMP kernels, only at the last moment after all needed nodes
2249 * are collected in cache, will the resources be locked using the usual
2250 * textbook ordered lock acquisition algorithms. Note that ensuring that
2251 * this code neither write locks what it does not need to write lock nor locks out of order
2252 * will be a pain in the butt that could have been avoided. Grumble grumble. -Hans
2253 *
2254 * fix is meant in the sense of render unchanging
2255 *
2256 * Latency might be improved by first gathering a list of what buffers are needed
2257 * and then getting as many of them in parallel as possible? -Hans
2258 *
2259 * Parameters:
2260 * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
2261 * tb tree_balance structure;
2262 * inum item number in S[h];
2263 * pos_in_item - comment this if you can
2264 * ins_ih & ins_sd are used when inserting
2265 * Returns: 1 - schedule occurred while the function worked;
2266 * 0 - schedule didn't occur while the function worked;
2267 * -1 - if no_disk_space
2268 */
2269
2270
2271int fix_nodes (int n_op_mode,
2272 struct tree_balance * p_s_tb,
2273 struct item_head * p_s_ins_ih, // item head of item being inserted
2274 const void * data // inserted item or data to be pasted
2275 ) {
2276 int n_ret_value,
2277 n_h,
2278 n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path);
2279 int n_pos_in_item;
2280
2281    /* we set wait_tb_buffers_run when we have to restore any dirty bits
2282    ** cleared during wait_tb_buffers_until_unlocked
2283    */
2284 int wait_tb_buffers_run = 0 ;
2285 struct buffer_head * p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);
2286
2287 ++ REISERFS_SB(p_s_tb -> tb_sb) -> s_fix_nodes;
2288
2289 n_pos_in_item = p_s_tb->tb_path->pos_in_item;
2290
2291
2292 p_s_tb->fs_gen = get_generation (p_s_tb->tb_sb);
2293
2294 /* we prepare and log the super here so it will already be in the
2295 ** transaction when do_balance needs to change it.
2296 ** This way do_balance won't have to schedule when trying to prepare
2297 ** the super for logging
2298 */
2299 reiserfs_prepare_for_journal(p_s_tb->tb_sb,
2300 SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1) ;
2301 journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb,
2302 SB_BUFFER_WITH_SB(p_s_tb->tb_sb)) ;
2303 if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
2304 return REPEAT_SEARCH;
2305
2306    /* this is possible during indirect_to_direct conversion */
2307 if (buffer_locked (p_s_tbS0)) {
2308 __wait_on_buffer (p_s_tbS0);
2309 if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
2310 return REPEAT_SEARCH;
2311 }
2312
2313#ifdef CONFIG_REISERFS_CHECK
2314 if ( cur_tb ) {
2315 print_cur_tb ("fix_nodes");
2316 reiserfs_panic(p_s_tb->tb_sb,"PAP-8305: fix_nodes: there is pending do_balance");
2317 }
2318
2319 if (!buffer_uptodate (p_s_tbS0) || !B_IS_IN_TREE (p_s_tbS0)) {
2320 reiserfs_panic (p_s_tb->tb_sb, "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate "
2321 "at the beginning of fix_nodes or not in tree (mode %c)", p_s_tbS0, p_s_tbS0, n_op_mode);
2322 }
2323
2324 /* Check parameters. */
2325 switch (n_op_mode) {
2326 case M_INSERT:
2327 if ( n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0) )
2328 reiserfs_panic(p_s_tb->tb_sb,"PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert",
2329 n_item_num, B_NR_ITEMS(p_s_tbS0));
2330 break;
2331 case M_PASTE:
2332 case M_DELETE:
2333 case M_CUT:
2334 if ( n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0) ) {
2335 print_block (p_s_tbS0, 0, -1, -1);
2336 reiserfs_panic(p_s_tb->tb_sb,"PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n", n_item_num, n_op_mode, p_s_tb->insert_size[0]);
2337 }
2338 break;
2339 default:
2340 reiserfs_panic(p_s_tb->tb_sb,"PAP-8340: fix_nodes: Incorrect mode of operation");
2341 }
2342#endif
2343
2344 if (get_mem_for_virtual_node (p_s_tb) == REPEAT_SEARCH)
2345 // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
2346 return REPEAT_SEARCH;
2347
2348
2349 /* Starting from the leaf level; for all levels n_h of the tree. */
2350 for ( n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++ ) {
2351 if ( (n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON ) {
2352 goto repeat;
2353 }
2354
2355 if ( (n_ret_value = check_balance (n_op_mode, p_s_tb, n_h, n_item_num,
2356 n_pos_in_item, p_s_ins_ih, data)) != CARRY_ON ) {
2357 if ( n_ret_value == NO_BALANCING_NEEDED ) {
2358 /* No balancing for higher levels needed. */
2359 if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
2360 goto repeat;
2361 }
2362 if ( n_h != MAX_HEIGHT - 1 )
2363 p_s_tb->insert_size[n_h + 1] = 0;
2364 /* ok, analysis and resource gathering are complete */
2365 break;
2366 }
2367 goto repeat;
2368 }
2369
2370 if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
2371 goto repeat;
2372 }
2373
2374 if ( (n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON ) {
2375 goto repeat; /* No disk space, or schedule occurred and
2376 analysis may be invalid and needs to be redone. */
2377 }
2378
2379 if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h) ) {
2380 /* We have a positive insert size but no nodes exist on this
2381	       level; this means that we are creating a new root. */
2382
2383 RFALSE( p_s_tb->blknum[n_h] != 1,
2384 "PAP-8350: creating new empty root");
2385
2386 if ( n_h < MAX_HEIGHT - 1 )
2387 p_s_tb->insert_size[n_h + 1] = 0;
2388 }
2389 else
2390 if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1) ) {
2391 if ( p_s_tb->blknum[n_h] > 1 ) {
2392 /* The tree needs to be grown, so this node S[n_h]
2393 which is the root node is split into two nodes,
2394 and a new node (S[n_h+1]) will be created to
2395 become the root node. */
2396
2397 RFALSE( n_h == MAX_HEIGHT - 1,
2398 "PAP-8355: attempt to create too high of a tree");
2399
2400 p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) + DC_SIZE;
2401 }
2402 else
2403 if ( n_h < MAX_HEIGHT - 1 )
2404 p_s_tb->insert_size[n_h + 1] = 0;
2405 }
2406 else
2407 p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
2408 }
2409
2410 if ((n_ret_value = wait_tb_buffers_until_unlocked (p_s_tb)) == CARRY_ON) {
2411 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
2412 wait_tb_buffers_run = 1 ;
2413 n_ret_value = REPEAT_SEARCH ;
2414 goto repeat;
2415 } else {
2416 return CARRY_ON;
2417 }
2418 } else {
2419 wait_tb_buffers_run = 1 ;
2420 goto repeat;
2421 }
2422
2423 repeat:
2424    // fix_nodes was unable to perform its calculation because the
2425    // filesystem changed under us, free disk space ran out, or an i/o
2426    // failure occurred.  In the first case the search will be
2427    // repeated.  For now, free all resources acquired so far except
2428    // for the newly allocated nodes
2429 {
2430 int i;
2431
2432 /* Release path buffers. */
2433 if (wait_tb_buffers_run) {
2434 pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path) ;
2435 } else {
2436 pathrelse (p_s_tb->tb_path);
2437 }
2438 /* brelse all resources collected for balancing */
2439 for ( i = 0; i < MAX_HEIGHT; i++ ) {
2440 if (wait_tb_buffers_run) {
2441 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->L[i]);
2442 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->R[i]);
2443 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FL[i]);
2444 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FR[i]);
2445 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFL[i]);
2446 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFR[i]);
2447 }
2448
2449 brelse (p_s_tb->L[i]);p_s_tb->L[i] = NULL;
2450 brelse (p_s_tb->R[i]);p_s_tb->R[i] = NULL;
2451 brelse (p_s_tb->FL[i]);p_s_tb->FL[i] = NULL;
2452 brelse (p_s_tb->FR[i]);p_s_tb->FR[i] = NULL;
2453 brelse (p_s_tb->CFL[i]);p_s_tb->CFL[i] = NULL;
2454 brelse (p_s_tb->CFR[i]);p_s_tb->CFR[i] = NULL;
2455 }
2456
2457 if (wait_tb_buffers_run) {
2458 for ( i = 0; i < MAX_FEB_SIZE; i++ ) {
2459 if ( p_s_tb->FEB[i] ) {
2460 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2461 p_s_tb->FEB[i]) ;
2462 }
2463 }
2464 }
2465 return n_ret_value;
2466 }
2467
2468}
2469
2470
2471/* Anatoly will probably forgive me renaming p_s_tb to tb. I just
2472 wanted to make lines shorter */
2473void unfix_nodes (struct tree_balance * tb)
2474{
2475 int i;
2476
2477 /* Release path buffers. */
2478 pathrelse_and_restore (tb->tb_sb, tb->tb_path);
2479
2480 /* brelse all resources collected for balancing */
2481 for ( i = 0; i < MAX_HEIGHT; i++ ) {
2482 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->L[i]);
2483 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->R[i]);
2484 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FL[i]);
2485 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FR[i]);
2486 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFL[i]);
2487 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFR[i]);
2488
2489 brelse (tb->L[i]);
2490 brelse (tb->R[i]);
2491 brelse (tb->FL[i]);
2492 brelse (tb->FR[i]);
2493 brelse (tb->CFL[i]);
2494 brelse (tb->CFR[i]);
2495 }
2496
2497 /* deal with list of allocated (used and unused) nodes */
2498 for ( i = 0; i < MAX_FEB_SIZE; i++ ) {
2499 if ( tb->FEB[i] ) {
2500 b_blocknr_t blocknr = tb->FEB[i]->b_blocknr ;
2501	    /* de-allocate the block, which was not used by balancing,
2502	       and forget the buffer for it */
2503 brelse (tb->FEB[i]);
2504 reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0);
2505 }
2506 if (tb->used[i]) {
2507 /* release used as new nodes including a new root */
2508 brelse (tb->used[i]);
2509 }
2510 }
2511
2512 if (tb->vn_buf)
2513 reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
2514
2515}
2516
2517
2518
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
new file mode 100644
index 000000000000..08d0508c2d39
--- /dev/null
+++ b/fs/reiserfs/hashes.c
@@ -0,0 +1,209 @@
1
2/*
 3 * Keyed 32-bit hash function using TEA in a Davies-Meyer construction
4 * H0 = Key
5 * Hi = E Mi(Hi-1) + Hi-1
6 *
7 * (see Applied Cryptography, 2nd edition, p448).
8 *
9 * Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
10 *
11 * Jeremy has agreed to the contents of reiserfs/README. -Hans
12 * Yura's function is added (04/07/2000)
13 */
14
15//
16// keyed_hash
17// yura_hash
18// r5_hash
19//
20
21#include <linux/kernel.h>
22#include <asm/types.h>
23#include <asm/bug.h>
24
25
26#define DELTA 0x9E3779B9
27#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
28#define PARTROUNDS 6 /* 6 gets complete mixing */
29
30/* a, b, c, d - data; h0, h1 - accumulated hash */
31#define TEACORE(rounds) \
32 do { \
33 u32 sum = 0; \
34 int n = rounds; \
35 u32 b0, b1; \
36 \
37 b0 = h0; \
38 b1 = h1; \
39 \
40 do \
41 { \
42 sum += DELTA; \
43 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
44 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
45 } while(--n); \
46 \
47 h0 += b0; \
48 h1 += b1; \
49 } while(0)
50
51
52u32 keyed_hash(const signed char *msg, int len)
53{
54 u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3};
55
56 u32 h0 = k[0], h1 = k[1];
57 u32 a, b, c, d;
58 u32 pad;
59 int i;
60
61 // assert(len >= 0 && len < 256);
62
63	pad = (u32)len | ((u32)len << 8);	/* replicate len into each byte of pad */
64 pad |= pad << 16;
65
66 while(len >= 16)
67 {
68 a = (u32)msg[ 0] |
69 (u32)msg[ 1] << 8 |
70 (u32)msg[ 2] << 16|
71 (u32)msg[ 3] << 24;
72 b = (u32)msg[ 4] |
73 (u32)msg[ 5] << 8 |
74 (u32)msg[ 6] << 16|
75 (u32)msg[ 7] << 24;
76 c = (u32)msg[ 8] |
77 (u32)msg[ 9] << 8 |
78 (u32)msg[10] << 16|
79 (u32)msg[11] << 24;
80 d = (u32)msg[12] |
81 (u32)msg[13] << 8 |
82 (u32)msg[14] << 16|
83 (u32)msg[15] << 24;
84
85 TEACORE(PARTROUNDS);
86
87 len -= 16;
88 msg += 16;
89 }
90
91 if (len >= 12)
92 {
93 a = (u32)msg[ 0] |
94 (u32)msg[ 1] << 8 |
95 (u32)msg[ 2] << 16|
96 (u32)msg[ 3] << 24;
97 b = (u32)msg[ 4] |
98 (u32)msg[ 5] << 8 |
99 (u32)msg[ 6] << 16|
100 (u32)msg[ 7] << 24;
101 c = (u32)msg[ 8] |
102 (u32)msg[ 9] << 8 |
103 (u32)msg[10] << 16|
104 (u32)msg[11] << 24;
105
106 d = pad;
107 for(i = 12; i < len; i++)
108 {
109 d <<= 8;
110 d |= msg[i];
111 }
112 }
113 else if (len >= 8)
114 {
115 a = (u32)msg[ 0] |
116 (u32)msg[ 1] << 8 |
117 (u32)msg[ 2] << 16|
118 (u32)msg[ 3] << 24;
119 b = (u32)msg[ 4] |
120 (u32)msg[ 5] << 8 |
121 (u32)msg[ 6] << 16|
122 (u32)msg[ 7] << 24;
123
124 c = d = pad;
125 for(i = 8; i < len; i++)
126 {
127 c <<= 8;
128 c |= msg[i];
129 }
130 }
131 else if (len >= 4)
132 {
133 a = (u32)msg[ 0] |
134 (u32)msg[ 1] << 8 |
135 (u32)msg[ 2] << 16|
136 (u32)msg[ 3] << 24;
137
138 b = c = d = pad;
139 for(i = 4; i < len; i++)
140 {
141 b <<= 8;
142 b |= msg[i];
143 }
144 }
145 else
146 {
147 a = b = c = d = pad;
148 for(i = 0; i < len; i++)
149 {
150 a <<= 8;
151 a |= msg[i];
152 }
153 }
154
155 TEACORE(FULLROUNDS);
156
157/* return 0;*/
158 return h0^h1;
159}
160
161/* What follows in this file is copyright 2000 by Hans Reiser, and the
162 * licensing of what follows is governed by reiserfs/README */
163
164u32 yura_hash (const signed char *msg, int len)
165{
166 int j, pow;
167 u32 a, c;
168 int i;
169
170 for (pow=1,i=1; i < len; i++) pow = pow * 10;
171
172 if (len == 1)
173 a = msg[0]-48;
174 else
175 a = (msg[0] - 48) * pow;
176
177 for (i=1; i < len; i++) {
178 c = msg[i] - 48;
179 for (pow=1,j=i; j < len-1; j++) pow = pow * 10;
180 a = a + c * pow;
181 }
182
183	for (; i < 40; i++) {	/* pad with '0' digits; c is always 0 here */
184 c = '0' - 48;
185 for (pow=1,j=i; j < len-1; j++) pow = pow * 10;
186 a = a + c * pow;
187 }
188
189 for (; i < 256; i++) {
190 c = i;
191 for (pow=1,j=i; j < len-1; j++) pow = pow * 10;
192 a = a + c * pow;
193 }
194
195 a = a << 7;
196 return a;
197}
198
199u32 r5_hash (const signed char *msg, int len)
200{
201	u32 a=0;	/* note: len is unused; the name is scanned up to its NUL */
202 while(*msg) {
203 a += *msg << 4;
204 a += *msg >> 4;
205 a *= 11;
206 msg++;
207 }
208 return a;
209}
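
   A hypothetical user-space harness for eyeballing the three hash
   functions (not part of the kernel build; assume this file can be
   compiled with a u32 typedef and stubbed kernel headers).  Note that
   r5_hash ignores its len argument and scans to the terminating NUL:

	#include <stdio.h>
	#include <string.h>

	typedef unsigned int u32;

	u32 keyed_hash(const signed char *msg, int len);
	u32 yura_hash(const signed char *msg, int len);
	u32 r5_hash(const signed char *msg, int len);

	int main(void)
	{
		const char *name = "lost+found";
		const signed char *msg = (const signed char *)name;
		int len = (int)strlen(name);

		printf("keyed_hash: 0x%08x\n", keyed_hash(msg, len));
		printf("yura_hash:  0x%08x\n", yura_hash(msg, len));
		printf("r5_hash:    0x%08x\n", r5_hash(msg, len));
		return 0;
	}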
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
new file mode 100644
index 000000000000..a362125da0d8
--- /dev/null
+++ b/fs/reiserfs/ibalance.c
@@ -0,0 +1,1058 @@
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5#include <linux/config.h>
6#include <asm/uaccess.h>
7#include <linux/string.h>
8#include <linux/time.h>
9#include <linux/reiserfs_fs.h>
10#include <linux/buffer_head.h>
11
12/* this is the one and only function that is used outside this file (in do_balance.c) */
13int balance_internal (
14 struct tree_balance * ,
15 int,
16 int,
17 struct item_head * ,
18 struct buffer_head **
19 );
20
21/* modes of internal_shift_left, internal_shift_right and internal_insert_childs */
22#define INTERNAL_SHIFT_FROM_S_TO_L 0
23#define INTERNAL_SHIFT_FROM_R_TO_S 1
24#define INTERNAL_SHIFT_FROM_L_TO_S 2
25#define INTERNAL_SHIFT_FROM_S_TO_R 3
26#define INTERNAL_INSERT_TO_S 4
27#define INTERNAL_INSERT_TO_L 5
28#define INTERNAL_INSERT_TO_R 6
29
30static void internal_define_dest_src_infos (
31 int shift_mode,
32 struct tree_balance * tb,
33 int h,
34 struct buffer_info * dest_bi,
35 struct buffer_info * src_bi,
36 int * d_key,
37 struct buffer_head ** cf
38 )
39{
40 memset (dest_bi, 0, sizeof (struct buffer_info));
41 memset (src_bi, 0, sizeof (struct buffer_info));
42 /* define dest, src, dest parent, dest position */
43 switch (shift_mode) {
44 case INTERNAL_SHIFT_FROM_S_TO_L: /* used in internal_shift_left */
45 src_bi->tb = tb;
46 src_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h);
47 src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h);
48 src_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
49 dest_bi->tb = tb;
50 dest_bi->bi_bh = tb->L[h];
51 dest_bi->bi_parent = tb->FL[h];
52 dest_bi->bi_position = get_left_neighbor_position (tb, h);
53 *d_key = tb->lkey[h];
54 *cf = tb->CFL[h];
55 break;
56 case INTERNAL_SHIFT_FROM_L_TO_S:
57 src_bi->tb = tb;
58 src_bi->bi_bh = tb->L[h];
59 src_bi->bi_parent = tb->FL[h];
60 src_bi->bi_position = get_left_neighbor_position (tb, h);
61 dest_bi->tb = tb;
62 dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h);
63 dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h);
64 dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); /* dest position is analog of dest->b_item_order */
65 *d_key = tb->lkey[h];
66 *cf = tb->CFL[h];
67 break;
68
69 case INTERNAL_SHIFT_FROM_R_TO_S: /* used in internal_shift_left */
70 src_bi->tb = tb;
71 src_bi->bi_bh = tb->R[h];
72 src_bi->bi_parent = tb->FR[h];
73 src_bi->bi_position = get_right_neighbor_position (tb, h);
74 dest_bi->tb = tb;
75 dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h);
76 dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h);
77 dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
78 *d_key = tb->rkey[h];
79 *cf = tb->CFR[h];
80 break;
81
82 case INTERNAL_SHIFT_FROM_S_TO_R:
83 src_bi->tb = tb;
84 src_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h);
85 src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h);
86 src_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
87 dest_bi->tb = tb;
88 dest_bi->bi_bh = tb->R[h];
89 dest_bi->bi_parent = tb->FR[h];
90 dest_bi->bi_position = get_right_neighbor_position (tb, h);
91 *d_key = tb->rkey[h];
92 *cf = tb->CFR[h];
93 break;
94
95 case INTERNAL_INSERT_TO_L:
96 dest_bi->tb = tb;
97 dest_bi->bi_bh = tb->L[h];
98 dest_bi->bi_parent = tb->FL[h];
99 dest_bi->bi_position = get_left_neighbor_position (tb, h);
100 break;
101
102 case INTERNAL_INSERT_TO_S:
103 dest_bi->tb = tb;
104 dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h);
105 dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h);
106 dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
107 break;
108
109 case INTERNAL_INSERT_TO_R:
110 dest_bi->tb = tb;
111 dest_bi->bi_bh = tb->R[h];
112 dest_bi->bi_parent = tb->FR[h];
113 dest_bi->bi_position = get_right_neighbor_position (tb, h);
114 break;
115
116 default:
117 reiserfs_panic (tb->tb_sb, "internal_define_dest_src_infos: shift type is unknown (%d)", shift_mode);
118 }
119}
120
121
122
123/* Insert count node pointers into buffer cur before position to + 1.
124 * Insert count items into buffer cur before position to.
125 * Items and node pointers are specified by inserted and bh respectively.
126 */
127static void internal_insert_childs (struct buffer_info * cur_bi,
128 int to, int count,
129 struct item_head * inserted,
130 struct buffer_head ** bh
131 )
132{
133 struct buffer_head * cur = cur_bi->bi_bh;
134 struct block_head * blkh;
135 int nr;
136 struct reiserfs_key * ih;
137 struct disk_child new_dc[2];
138 struct disk_child * dc;
139 int i;
140
141 if (count <= 0)
142 return;
143
144 blkh = B_BLK_HEAD(cur);
145 nr = blkh_nr_item(blkh);
146
147 RFALSE( count > 2,
148 "too many children (%d) are to be inserted", count);
149 RFALSE( B_FREE_SPACE (cur) < count * (KEY_SIZE + DC_SIZE),
150	    "not enough free space (%d), needed %d bytes",
151 B_FREE_SPACE (cur), count * (KEY_SIZE + DC_SIZE));
152
153 /* prepare space for count disk_child */
154 dc = B_N_CHILD(cur,to+1);
155
156 memmove (dc + count, dc, (nr+1-(to+1)) * DC_SIZE);
157
158    /* copy the to-be-inserted disk children */
159 for (i = 0; i < count; i ++) {
160 put_dc_size( &(new_dc[i]), MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i]));
161 put_dc_block_number( &(new_dc[i]), bh[i]->b_blocknr );
162 }
163 memcpy (dc, new_dc, DC_SIZE * count);
164
165
166 /* prepare space for count items */
167 ih = B_N_PDELIM_KEY (cur, ((to == -1) ? 0 : to));
168
169 memmove (ih + count, ih, (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE);
170
171 /* copy item headers (keys) */
172 memcpy (ih, inserted, KEY_SIZE);
173 if ( count > 1 )
174 memcpy (ih + 1, inserted + 1, KEY_SIZE);
175
176 /* sizes, item number */
177 set_blkh_nr_item( blkh, blkh_nr_item(blkh) + count );
178 set_blkh_free_space( blkh,
179 blkh_free_space(blkh) - count * (DC_SIZE + KEY_SIZE ) );
180
181 do_balance_mark_internal_dirty (cur_bi->tb, cur,0);
182
183 /*&&&&&&&&&&&&&&&&&&&&&&&&*/
184 check_internal (cur);
185 /*&&&&&&&&&&&&&&&&&&&&&&&&*/
186
187 if (cur_bi->bi_parent) {
188 struct disk_child *t_dc = B_N_CHILD (cur_bi->bi_parent,cur_bi->bi_position);
189 put_dc_size( t_dc, dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE)));
190 do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, 0);
191
192 /*&&&&&&&&&&&&&&&&&&&&&&&&*/
193 check_internal (cur_bi->bi_parent);
194 /*&&&&&&&&&&&&&&&&&&&&&&&&*/
195 }
196
197}
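
   The memmove arithmetic above is easier to see on a toy model of an
   internal node: n keys and n + 1 child pointers, with `count` new
   pointers going in before pointer position to + 1 and `count` new keys
   before key position to.  A self-contained sketch, with plain arrays
   standing in for the packed on-disk layout (the kernel additionally
   special-cases to == -1):

	#include <stdio.h>
	#include <string.h>

	#define CAP 16

	static void insert_childs(int *keys, int *nkeys, int *ptrs,
				  int to, int count,
				  const int *new_keys, const int *new_ptrs)
	{
		int n = *nkeys;

		/* make room for count pointers before position to + 1 */
		memmove(&ptrs[to + 1 + count], &ptrs[to + 1],
			(n + 1 - (to + 1)) * sizeof(int));
		memcpy(&ptrs[to + 1], new_ptrs, count * sizeof(int));

		/* make room for count keys before position to */
		memmove(&keys[to + count], &keys[to], (n - to) * sizeof(int));
		memcpy(&keys[to], new_keys, count * sizeof(int));

		*nkeys = n + count;
	}

	int main(void)
	{
		int keys[CAP] = { 10, 20, 30 };  /* 3 keys ...        */
		int ptrs[CAP] = { 0, 1, 2, 3 };  /* ... and 4 children */
		int nkeys = 3, i;
		int nk[1] = { 15 }, np[1] = { 9 };

		insert_childs(keys, &nkeys, ptrs, 1, 1, nk, np);

		for (i = 0; i < nkeys; i++) printf("k%d=%d ", i, keys[i]);
		printf("\n");
		for (i = 0; i <= nkeys; i++) printf("p%d=%d ", i, ptrs[i]);
		printf("\n");  /* keys: 10 15 20 30; ptrs: 0 1 9 2 3 */
		return 0;
	}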
198
199
200/* Delete del_num items and node pointers from buffer cur starting from *
201 * the first_i'th item and first_p'th pointers respectively. */
202static void internal_delete_pointers_items (
203 struct buffer_info * cur_bi,
204 int first_p,
205 int first_i,
206 int del_num
207 )
208{
209 struct buffer_head * cur = cur_bi->bi_bh;
210 int nr;
211 struct block_head * blkh;
212 struct reiserfs_key * key;
213 struct disk_child * dc;
214
215 RFALSE( cur == NULL, "buffer is 0");
216 RFALSE( del_num < 0,
217 "negative number of items (%d) can not be deleted", del_num);
218 RFALSE( first_p < 0 || first_p + del_num > B_NR_ITEMS (cur) + 1 || first_i < 0,
219 "first pointer order (%d) < 0 or "
220	  "not that many pointers (%d), only (%d), or "
221 "first key order %d < 0", first_p,
222 first_p + del_num, B_NR_ITEMS (cur) + 1, first_i);
223 if ( del_num == 0 )
224 return;
225
226 blkh = B_BLK_HEAD(cur);
227 nr = blkh_nr_item(blkh);
228
229 if ( first_p == 0 && del_num == nr + 1 ) {
230 RFALSE( first_i != 0, "1st deleted key must have order 0, not %d", first_i);
231 make_empty_node (cur_bi);
232 return;
233 }
234
235 RFALSE( first_i + del_num > B_NR_ITEMS (cur),
236 "first_i = %d del_num = %d "
237	  "not that many keys (%d) in the node (%b)(%z)",
238 first_i, del_num, first_i + del_num, cur, cur);
239
240
241 /* deleting */
242 dc = B_N_CHILD (cur, first_p);
243
244 memmove (dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE);
245 key = B_N_PDELIM_KEY (cur, first_i);
246 memmove (key, key + del_num, (nr - first_i - del_num) * KEY_SIZE + (nr + 1 - del_num) * DC_SIZE);
247
248
249 /* sizes, item number */
250 set_blkh_nr_item( blkh, blkh_nr_item(blkh) - del_num );
251 set_blkh_free_space( blkh,
252 blkh_free_space(blkh) + (del_num * (KEY_SIZE + DC_SIZE) ) );
253
254 do_balance_mark_internal_dirty (cur_bi->tb, cur, 0);
255 /*&&&&&&&&&&&&&&&&&&&&&&&*/
256 check_internal (cur);
257 /*&&&&&&&&&&&&&&&&&&&&&&&*/
258
259 if (cur_bi->bi_parent) {
260 struct disk_child *t_dc;
261 t_dc = B_N_CHILD (cur_bi->bi_parent, cur_bi->bi_position);
262 put_dc_size( t_dc, dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE) ) );
263
264 do_balance_mark_internal_dirty (cur_bi->tb, cur_bi->bi_parent,0);
265 /*&&&&&&&&&&&&&&&&&&&&&&&&*/
266 check_internal (cur_bi->bi_parent);
267 /*&&&&&&&&&&&&&&&&&&&&&&&&*/
268 }
269}
270
271
272/* delete n node pointers and items starting from given position */
273static void internal_delete_childs (struct buffer_info * cur_bi,
274 int from, int n)
275{
276 int i_from;
277
278 i_from = (from == 0) ? from : from - 1;
279
280 /* delete n pointers starting from `from' position in CUR;
281 delete n keys starting from 'i_from' position in CUR;
282 */
283 internal_delete_pointers_items (cur_bi, from, i_from, n);
284}
285
286
287/* Copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest.
288 * last_first == FIRST_TO_LAST means that we copy the first items from src to the tail of dest.
289 * last_first == LAST_TO_FIRST means that we copy the last items from src to the head of dest.
290 */
291static void internal_copy_pointers_items (
292 struct buffer_info * dest_bi,
293 struct buffer_head * src,
294 int last_first, int cpy_num
295 )
296{
297  /* ATTENTION! The number of node pointers in DEST equals the number of items in DEST *
298   * because the delimiting key has already been inserted into buffer dest. */
299 struct buffer_head * dest = dest_bi->bi_bh;
300 int nr_dest, nr_src;
301 int dest_order, src_order;
302 struct block_head * blkh;
303 struct reiserfs_key * key;
304 struct disk_child * dc;
305
306 nr_src = B_NR_ITEMS (src);
307
308 RFALSE( dest == NULL || src == NULL,
309 "src (%p) or dest (%p) buffer is 0", src, dest);
310 RFALSE( last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
311 "invalid last_first parameter (%d)", last_first);
312 RFALSE( nr_src < cpy_num - 1,
313	  "not that many items (%d) in src (%d)", cpy_num, nr_src);
314 RFALSE( cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num);
315 RFALSE( cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest),
316 "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)",
317 cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest));
318
319 if ( cpy_num == 0 )
320 return;
321
322    /* copying */
323 blkh = B_BLK_HEAD(dest);
324 nr_dest = blkh_nr_item(blkh);
325
326 /*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest;*/
327 /*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0;*/
328 (last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order = nr_src - cpy_num + 1) :
329 (dest_order = nr_dest, src_order = 0);
330
331 /* prepare space for cpy_num pointers */
332 dc = B_N_CHILD (dest, dest_order);
333
334 memmove (dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE);
335
336 /* insert pointers */
337 memcpy (dc, B_N_CHILD (src, src_order), DC_SIZE * cpy_num);
338
339
340 /* prepare space for cpy_num - 1 item headers */
341 key = B_N_PDELIM_KEY(dest, dest_order);
342 memmove (key + cpy_num - 1, key,
343 KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest + cpy_num));
344
345
346 /* insert headers */
347 memcpy (key, B_N_PDELIM_KEY (src, src_order), KEY_SIZE * (cpy_num - 1));
348
349 /* sizes, item number */
350 set_blkh_nr_item( blkh, blkh_nr_item(blkh) + (cpy_num - 1 ) );
351 set_blkh_free_space( blkh,
352 blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) + DC_SIZE * cpy_num ) );
353
354 do_balance_mark_internal_dirty (dest_bi->tb, dest, 0);
355
356 /*&&&&&&&&&&&&&&&&&&&&&&&&*/
357 check_internal (dest);
358 /*&&&&&&&&&&&&&&&&&&&&&&&&*/
359
360 if (dest_bi->bi_parent) {
361 struct disk_child *t_dc;
362 t_dc = B_N_CHILD(dest_bi->bi_parent,dest_bi->bi_position);
363 put_dc_size( t_dc, dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) + DC_SIZE * cpy_num) );
364
365 do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent,0);
366 /*&&&&&&&&&&&&&&&&&&&&&&&&*/
367 check_internal (dest_bi->bi_parent);
368 /*&&&&&&&&&&&&&&&&&&&&&&&&*/
369 }
370
371}
372
373
374/* Copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest.
375 * Delete cpy_num - del_par items and node pointers from buffer src.
376 * last_first == FIRST_TO_LAST means that we copy/delete the first items from src.
377 * last_first == LAST_TO_FIRST means that we copy/delete the last items from src.
378 */
379static void internal_move_pointers_items (struct buffer_info * dest_bi,
380 struct buffer_info * src_bi,
381 int last_first, int cpy_num, int del_par)
382{
383 int first_pointer;
384 int first_item;
385
386 internal_copy_pointers_items (dest_bi, src_bi->bi_bh, last_first, cpy_num);
387
388 if (last_first == FIRST_TO_LAST) { /* shift_left occurs */
389 first_pointer = 0;
390 first_item = 0;
391	/* delete cpy_num - del_par pointers and keys, starting at
392	   first_pointer for pointers and at first_item for keys */
393 internal_delete_pointers_items (src_bi, first_pointer, first_item, cpy_num - del_par);
394 } else { /* shift_right occurs */
395 int i, j;
396
397 i = ( cpy_num - del_par == ( j = B_NR_ITEMS(src_bi->bi_bh)) + 1 ) ? 0 : j - cpy_num + del_par;
398
399 internal_delete_pointers_items (src_bi, j + 1 - cpy_num + del_par, i, cpy_num - del_par);
400 }
401}
402
403/* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */
404static void internal_insert_key (struct buffer_info * dest_bi,
405 int dest_position_before, /* insert key before key with n_dest number */
406 struct buffer_head * src,
407 int src_position)
408{
409 struct buffer_head * dest = dest_bi->bi_bh;
410 int nr;
411 struct block_head * blkh;
412 struct reiserfs_key * key;
413
414 RFALSE( dest == NULL || src == NULL,
415 "source(%p) or dest(%p) buffer is 0", src, dest);
416 RFALSE( dest_position_before < 0 || src_position < 0,
417 "source(%d) or dest(%d) key number less than 0",
418 src_position, dest_position_before);
419 RFALSE( dest_position_before > B_NR_ITEMS (dest) ||
420 src_position >= B_NR_ITEMS(src),
421 "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))",
422 dest_position_before, B_NR_ITEMS (dest),
423 src_position, B_NR_ITEMS(src));
424 RFALSE( B_FREE_SPACE (dest) < KEY_SIZE,
425	  "not enough free space (%d) in dest buffer", B_FREE_SPACE (dest));
426
427 blkh = B_BLK_HEAD(dest);
428 nr = blkh_nr_item(blkh);
429
430 /* prepare space for inserting key */
431 key = B_N_PDELIM_KEY (dest, dest_position_before);
432 memmove (key + 1, key, (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE);
433
434 /* insert key */
435 memcpy (key, B_N_PDELIM_KEY(src, src_position), KEY_SIZE);
436
437    /* Change the dirty, free space and item number fields. */
438
439 set_blkh_nr_item( blkh, blkh_nr_item(blkh) + 1 );
440 set_blkh_free_space( blkh, blkh_free_space(blkh) - KEY_SIZE );
441
442 do_balance_mark_internal_dirty (dest_bi->tb, dest, 0);
443
444 if (dest_bi->bi_parent) {
445 struct disk_child *t_dc;
446 t_dc = B_N_CHILD(dest_bi->bi_parent,dest_bi->bi_position);
447 put_dc_size( t_dc, dc_size(t_dc) + KEY_SIZE );
448
449 do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent,0);
450 }
451}
452
453
454
455/* Insert d_key'th (delimiting) key from buffer cfl to tail of dest.
456 * Copy pointer_amount node pointers and pointer_amount - 1 items from buffer src to buffer dest.
457 * Replace d_key'th key in buffer cfl.
458 * Delete pointer_amount items and node pointers from buffer src.
459 */
460/* this can be invoked both to shift from S to L and from R to S */
461static void internal_shift_left (
462 int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S */
463 struct tree_balance * tb,
464 int h,
465 int pointer_amount
466 )
467{
468 struct buffer_info dest_bi, src_bi;
469 struct buffer_head * cf;
470 int d_key_position;
471
472 internal_define_dest_src_infos (mode, tb, h, &dest_bi, &src_bi, &d_key_position, &cf);
473
474 /*printk("pointer_amount = %d\n",pointer_amount);*/
475
476 if (pointer_amount) {
477	/* insert the delimiting key from the common father of dest and src into node dest at position B_NR_ITEMS(dest) */
478 internal_insert_key (&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position);
479
480 if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) {
481 if (src_bi.bi_position/*src->b_item_order*/ == 0)
482 replace_key (tb, cf, d_key_position, src_bi.bi_parent/*src->b_parent*/, 0);
483 } else
484 replace_key (tb, cf, d_key_position, src_bi.bi_bh, pointer_amount - 1);
485 }
486 /* last parameter is del_parameter */
487 internal_move_pointers_items (&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 0);
488
489}
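
   The delimiting-key rotation performed above can be pictured with a
   toy model: when `amount` pointers move from S to its left neighbor L,
   the key in the common parent drops down behind L's last pointer, the
   first amount - 1 keys of S follow it, and S's key at position
   amount - 1 climbs up to become the new delimiting key.  A minimal
   sketch with sorted int arrays standing in for nodes:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		int L[8] = { 10, 20 };      int nl = 2;  /* left neighbor keys */
		int S[8] = { 40, 50, 60 };  int ns = 3;  /* current node keys  */
		int dkey = 30;                           /* parent key between */
		int amount = 2, i;                       /* pointers to shift  */

		L[nl++] = dkey;                    /* parent key drops into L  */
		memcpy(&L[nl], S, (amount - 1) * sizeof(int));
		nl += amount - 1;                  /* first amount-1 keys move */
		dkey = S[amount - 1];              /* new boundary climbs up   */
		memmove(S, &S[amount], (ns - amount) * sizeof(int));
		ns -= amount;

		for (i = 0; i < nl; i++) printf("%d ", L[i]);
		printf("| %d | ", dkey);
		for (i = 0; i < ns; i++) printf("%d ", S[i]);
		printf("\n");  /* 10 20 30 40 | 50 | 60 -- order preserved */
		return 0;
	}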
490
491/* Insert delimiting key to L[h].
492 * Copy n node pointers and n - 1 items from buffer S[h] to L[h].
493 * Delete n - 1 items and node pointers from buffer S[h].
494 */
495/* it always shifts from S[h] to L[h] */
496static void internal_shift1_left (
497 struct tree_balance * tb,
498 int h,
499 int pointer_amount
500 )
501{
502 struct buffer_info dest_bi, src_bi;
503 struct buffer_head * cf;
504 int d_key_position;
505
506 internal_define_dest_src_infos (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, &dest_bi, &src_bi, &d_key_position, &cf);
507
508 if ( pointer_amount > 0 ) /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */
509 internal_insert_key (&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position);
510 /* internal_insert_key (tb->L[h], B_NR_ITEM(tb->L[h]), tb->CFL[h], tb->lkey[h]);*/
511
512 /* last parameter is del_parameter */
513 internal_move_pointers_items (&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 1);
514 /* internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1);*/
515}
516
517
518/* Insert d_key'th (delimiting) key from buffer cfr to head of dest.
519 * Copy n node pointers and n - 1 items from buffer src to buffer dest.
520 * Replace d_key'th key in buffer cfr.
521 * Delete n items and node pointers from buffer src.
522 */
523static void internal_shift_right (
524 int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S */
525 struct tree_balance * tb,
526 int h,
527 int pointer_amount
528 )
529{
530 struct buffer_info dest_bi, src_bi;
531 struct buffer_head * cf;
532 int d_key_position;
533 int nr;
534
535
536 internal_define_dest_src_infos (mode, tb, h, &dest_bi, &src_bi, &d_key_position, &cf);
537
538 nr = B_NR_ITEMS (src_bi.bi_bh);
539
540 if (pointer_amount > 0) {
541 /* insert delimiting key from common father of dest and src to dest node into position 0 */
542 internal_insert_key (&dest_bi, 0, cf, d_key_position);
543 if (nr == pointer_amount - 1) {
544 RFALSE( src_bi.bi_bh != PATH_H_PBUFFER (tb->tb_path, h)/*tb->S[h]*/ ||
545 dest_bi.bi_bh != tb->R[h],
546 "src (%p) must be == tb->S[h](%p) when it disappears",
547 src_bi.bi_bh, PATH_H_PBUFFER (tb->tb_path, h));
548	    /* when S[h] disappears replace the left delimiting key as well */
549 if (tb->CFL[h])
550 replace_key (tb, cf, d_key_position, tb->CFL[h], tb->lkey[h]);
551 } else
552 replace_key (tb, cf, d_key_position, src_bi.bi_bh, nr - pointer_amount);
553 }
554
555 /* last parameter is del_parameter */
556 internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 0);
557}
558
559/* Insert delimiting key to R[h].
560 * Copy n node pointers and n - 1 items from buffer S[h] to R[h].
561 * Delete n - 1 items and node pointers from buffer S[h].
562 */
563/* it always shifts from S[h] to R[h] */
564static void internal_shift1_right (
565 struct tree_balance * tb,
566 int h,
567 int pointer_amount
568 )
569{
570 struct buffer_info dest_bi, src_bi;
571 struct buffer_head * cf;
572 int d_key_position;
573
574 internal_define_dest_src_infos (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, &dest_bi, &src_bi, &d_key_position, &cf);
575
576 if (pointer_amount > 0) /* insert rkey from CFR[h] to right neighbor R[h] */
577 internal_insert_key (&dest_bi, 0, cf, d_key_position);
578 /* internal_insert_key (tb->R[h], 0, tb->CFR[h], tb->rkey[h]);*/
579
580 /* last parameter is del_parameter */
581 internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 1);
582 /* internal_move_pointers_items (tb->R[h], tb->S[h], LAST_TO_FIRST, pointer_amount, 1);*/
583}
584
585
586/* Delete insert_num node pointers together with their left items
587 * and balance current node.*/
588static void balance_internal_when_delete (struct tree_balance * tb,
589 int h, int child_pos)
590{
591 int insert_num;
592 int n;
593 struct buffer_head * tbSh = PATH_H_PBUFFER (tb->tb_path, h);
594 struct buffer_info bi;
595
596 insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE));
597
598 /* delete child-node-pointer(s) together with their left item(s) */
599 bi.tb = tb;
600 bi.bi_bh = tbSh;
601 bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h);
602 bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
603
604 internal_delete_childs (&bi, child_pos, -insert_num);
605
606 RFALSE( tb->blknum[h] > 1,
607 "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]);
608
609 n = B_NR_ITEMS(tbSh);
610
611 if ( tb->lnum[h] == 0 && tb->rnum[h] == 0 ) {
612 if ( tb->blknum[h] == 0 ) {
613 /* node S[h] (root of the tree) is empty now */
614 struct buffer_head *new_root;
615
616 RFALSE( n || B_FREE_SPACE (tbSh) != MAX_CHILD_SIZE(tbSh) - DC_SIZE,
617		"buffer must have no keys (%d)", n);
618 RFALSE( bi.bi_parent, "root has parent (%p)", bi.bi_parent);
619
620 /* choose a new root */
621 if ( ! tb->L[h-1] || ! B_NR_ITEMS(tb->L[h-1]) )
622 new_root = tb->R[h-1];
623 else
624 new_root = tb->L[h-1];
625 /* switch super block's tree root block number to the new value */
626 PUT_SB_ROOT_BLOCK( tb->tb_sb, new_root->b_blocknr );
627 //REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --;
628 PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) - 1 );
629
630 do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
631 /*&&&&&&&&&&&&&&&&&&&&&&*/
632 if (h > 1)
633 /* use check_internal if new root is an internal node */
634 check_internal (new_root);
635 /*&&&&&&&&&&&&&&&&&&&&&&*/
636
637	    /* do what is needed for the buffer thrown out of the tree */
638 reiserfs_invalidate_buffer(tb, tbSh);
639 return;
640 }
641 return;
642 }
643
644 if ( tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1 ) { /* join S[h] with L[h] */
645
646 RFALSE( tb->rnum[h] != 0,
647 "invalid tb->rnum[%d]==%d when joining S[h] with L[h]",
648 h, tb->rnum[h]);
649
650 internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1);
651 reiserfs_invalidate_buffer(tb, tbSh);
652
653 return;
654 }
655
656 if ( tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1 ) { /* join S[h] with R[h] */
657 RFALSE( tb->lnum[h] != 0,
658 "invalid tb->lnum[%d]==%d when joining S[h] with R[h]",
659 h, tb->lnum[h]);
660
661 internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1);
662
663 reiserfs_invalidate_buffer(tb,tbSh);
664 return;
665 }
666
667 if ( tb->lnum[h] < 0 ) { /* borrow from left neighbor L[h] */
668 RFALSE( tb->rnum[h] != 0,
669 "wrong tb->rnum[%d]==%d when borrow from L[h]", h, tb->rnum[h]);
670 /*internal_shift_right (tb, h, tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], -tb->lnum[h]);*/
671 internal_shift_right (INTERNAL_SHIFT_FROM_L_TO_S, tb, h, -tb->lnum[h]);
672 return;
673 }
674
675 if ( tb->rnum[h] < 0 ) { /* borrow from right neighbor R[h] */
676 RFALSE( tb->lnum[h] != 0,
677 "invalid tb->lnum[%d]==%d when borrow from R[h]",
678 h, tb->lnum[h]);
679 internal_shift_left (INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]);/*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]);*/
680 return;
681 }
682
683 if ( tb->lnum[h] > 0 ) { /* split S[h] into two parts and put them into neighbors */
684 RFALSE( tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1,
685 "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them",
686 h, tb->lnum[h], h, tb->rnum[h], n);
687
688 internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]);/*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]);*/
689 internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]);
690
691 reiserfs_invalidate_buffer (tb, tbSh);
692
693 return;
694 }
695 reiserfs_panic (tb->tb_sb, "balance_internal_when_delete: unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d",
696 h, tb->lnum[h], h, tb->rnum[h]);
697}
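
   The case analysis above reduces to a small decision table.  A
   self-contained sketch that mirrors the order of the checks (n is the
   item count of S[h] after the deletion; negative lnum/rnum mean
   borrowing or joining, following the tree_balance conventions):

	#include <stdio.h>

	static const char *decide(int n, int lnum, int rnum, int blknum,
				  int l_items, int r_items)
	{
		if (lnum == 0 && rnum == 0)
			return blknum == 0 ? "S[h] was the root: shrink tree"
					   : "nothing to do";
		if (lnum == -l_items - 1)  /* all of S[h] plus a key fits in L */
			return "join S[h] into L[h]";
		if (rnum == -r_items - 1)
			return "join S[h] into R[h]";
		if (lnum < 0)
			return "borrow -lnum pointers from L[h]";
		if (rnum < 0)
			return "borrow -rnum pointers from R[h]";
		if (lnum > 0 && lnum + rnum == n + 1)
			return "split S[h] between L[h] and R[h]";
		return "unexpected lnum/rnum";
	}

	int main(void)
	{
		printf("%s\n", decide(0, 0, 0, 0, 3, 3));   /* shrink tree */
		printf("%s\n", decide(2, -4, 0, 1, 3, 3));  /* join with L */
		printf("%s\n", decide(5, 2, 4, 1, 3, 3));   /* split S[h]  */
		return 0;
	}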
698
699
700/* Replace delimiting key of buffers L[h] and S[h] by the given key.*/
701static void replace_lkey (
702 struct tree_balance * tb,
703 int h,
704 struct item_head * key
705 )
706{
707 RFALSE( tb->L[h] == NULL || tb->CFL[h] == NULL,
708 "L[h](%p) and CFL[h](%p) must exist in replace_lkey",
709 tb->L[h], tb->CFL[h]);
710
711 if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0)
712 return;
713
714 memcpy (B_N_PDELIM_KEY(tb->CFL[h],tb->lkey[h]), key, KEY_SIZE);
715
716 do_balance_mark_internal_dirty (tb, tb->CFL[h],0);
717}
718
719
720/* Replace delimiting key of buffers S[h] and R[h] by the given key.*/
721static void replace_rkey (
722 struct tree_balance * tb,
723 int h,
724 struct item_head * key
725 )
726{
727 RFALSE( tb->R[h] == NULL || tb->CFR[h] == NULL,
728 "R[h](%p) and CFR[h](%p) must exist in replace_rkey",
729 tb->R[h], tb->CFR[h]);
730 RFALSE( B_NR_ITEMS(tb->R[h]) == 0,
731 "R[h] can not be empty if it exists (item number=%d)",
732 B_NR_ITEMS(tb->R[h]));
733
734 memcpy (B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]), key, KEY_SIZE);
735
736 do_balance_mark_internal_dirty (tb, tb->CFR[h], 0);
737}
738
739
740int balance_internal (struct tree_balance * tb, /* tree_balance structure */
741 int h, /* level of the tree */
742 int child_pos,
743 struct item_head * insert_key, /* key for insertion on higher level */
744 struct buffer_head ** insert_ptr /* node for insertion on higher level*/
745 )
 746 /* if inserting/pasting
 747 {
 748 child_pos is the position of the node-pointer in S[h] that
 749 pointed to S[h-1] before balancing of the h-1 level;
 750 this means that new pointers and items must be inserted AFTER
 751 child_pos
 752 }
 753 else
 754 {
 755 it is the position of the leftmost pointer that must be deleted (together with
 756 its corresponding key to the left of the pointer)
 757 as a result of the previous level's balancing.
 758 }
 759*/
760{
761 struct buffer_head * tbSh = PATH_H_PBUFFER (tb->tb_path, h);
762 struct buffer_info bi;
763 int order; /* we return this: it is 0 if there is no S[h], else it is tb->S[h]->b_item_order */
764 int insert_num, n, k;
765 struct buffer_head * S_new;
766 struct item_head new_insert_key;
767 struct buffer_head * new_insert_ptr = NULL;
768 struct item_head * new_insert_key_addr = insert_key;
769
770 RFALSE( h < 1, "h (%d) can not be < 1 on internal level", h);
771
772 PROC_INFO_INC( tb -> tb_sb, balance_at[ h ] );
773
774 order = ( tbSh ) ? PATH_H_POSITION (tb->tb_path, h + 1)/*tb->S[h]->b_item_order*/ : 0;
775
 776 /* Using insert_size[h], calculate the number insert_num of items
 777 that must be inserted into or deleted from S[h]. */
778 insert_num = tb->insert_size[h]/((int)(KEY_SIZE + DC_SIZE));
779
 780 /* Check whether insert_num is proper */
781 RFALSE( insert_num < -2 || insert_num > 2,
782 "incorrect number of items inserted to the internal node (%d)",
783 insert_num);
784 RFALSE( h > 1 && (insert_num > 1 || insert_num < -1),
785 "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level",
786 insert_num, h);
787
788 /* Make balance in case insert_num < 0 */
789 if ( insert_num < 0 ) {
790 balance_internal_when_delete (tb, h, child_pos);
791 return order;
792 }
793
794 k = 0;
795 if ( tb->lnum[h] > 0 ) {
796 /* shift lnum[h] items from S[h] to the left neighbor L[h].
 797 check how many of the new items fall into L[h] or CFL[h] after
798 shifting */
799 n = B_NR_ITEMS (tb->L[h]); /* number of items in L[h] */
800 if ( tb->lnum[h] <= child_pos ) {
801 /* new items don't fall into L[h] or CFL[h] */
802 internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]);
803 /*internal_shift_left (tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,tb->lnum[h]);*/
804 child_pos -= tb->lnum[h];
805 } else if ( tb->lnum[h] > child_pos + insert_num ) {
806 /* all new items fall into L[h] */
807 internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h] - insert_num);
808 /* internal_shift_left(tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,
809 tb->lnum[h]-insert_num);
810 */
811 /* insert insert_num keys and node-pointers into L[h] */
812 bi.tb = tb;
813 bi.bi_bh = tb->L[h];
814 bi.bi_parent = tb->FL[h];
815 bi.bi_position = get_left_neighbor_position (tb, h);
816 internal_insert_childs (&bi,/*tb->L[h], tb->S[h-1]->b_next*/ n + child_pos + 1,
817 insert_num,insert_key,insert_ptr);
818
819 insert_num = 0;
820 } else {
821 struct disk_child * dc;
822
 823 /* some items fall into L[h] or CFL[h], but some don't */
824 internal_shift1_left(tb,h,child_pos+1);
825 /* calculate number of new items that fall into L[h] */
826 k = tb->lnum[h] - child_pos - 1;
827 bi.tb = tb;
828 bi.bi_bh = tb->L[h];
829 bi.bi_parent = tb->FL[h];
830 bi.bi_position = get_left_neighbor_position (tb, h);
831 internal_insert_childs (&bi,/*tb->L[h], tb->S[h-1]->b_next,*/ n + child_pos + 1,k,
832 insert_key,insert_ptr);
833
834 replace_lkey(tb,h,insert_key + k);
835
836 /* replace the first node-ptr in S[h] by node-ptr to insert_ptr[k] */
837 dc = B_N_CHILD(tbSh, 0);
838 put_dc_size( dc, MAX_CHILD_SIZE(insert_ptr[k]) - B_FREE_SPACE (insert_ptr[k]));
839 put_dc_block_number( dc, insert_ptr[k]->b_blocknr );
840
841 do_balance_mark_internal_dirty (tb, tbSh, 0);
842
843 k++;
844 insert_key += k;
845 insert_ptr += k;
846 insert_num -= k;
847 child_pos = 0;
848 }
849 } /* tb->lnum[h] > 0 */
850
851 if ( tb->rnum[h] > 0 ) {
852 /*shift rnum[h] items from S[h] to the right neighbor R[h]*/
 853 /* check how many of the new items fall into R[h] or CFR[h] after shifting */
854 n = B_NR_ITEMS (tbSh); /* number of items in S[h] */
855 if ( n - tb->rnum[h] >= child_pos )
856 /* new items fall into S[h] */
857 /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],tb->rnum[h]);*/
858 internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]);
859 else
860 if ( n + insert_num - tb->rnum[h] < child_pos )
861 {
862 /* all new items fall into R[h] */
863 /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],
864 tb->rnum[h] - insert_num);*/
865 internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h] - insert_num);
866
867 /* insert insert_num keys and node-pointers into R[h] */
868 bi.tb = tb;
869 bi.bi_bh = tb->R[h];
870 bi.bi_parent = tb->FR[h];
871 bi.bi_position = get_right_neighbor_position (tb, h);
872 internal_insert_childs (&bi, /*tb->R[h],tb->S[h-1]->b_next*/ child_pos - n - insert_num + tb->rnum[h] - 1,
873 insert_num,insert_key,insert_ptr);
874 insert_num = 0;
875 }
876 else
877 {
878 struct disk_child * dc;
879
880 /* one of the items falls into CFR[h] */
881 internal_shift1_right(tb,h,n - child_pos + 1);
882 /* calculate number of new items that fall into R[h] */
883 k = tb->rnum[h] - n + child_pos - 1;
884 bi.tb = tb;
885 bi.bi_bh = tb->R[h];
886 bi.bi_parent = tb->FR[h];
887 bi.bi_position = get_right_neighbor_position (tb, h);
888 internal_insert_childs (&bi, /*tb->R[h], tb->R[h]->b_child,*/ 0, k, insert_key + 1, insert_ptr + 1);
889
890 replace_rkey(tb,h,insert_key + insert_num - k - 1);
891
892 /* replace the first node-ptr in R[h] by node-ptr insert_ptr[insert_num-k-1]*/
893 dc = B_N_CHILD(tb->R[h], 0);
894 put_dc_size( dc, MAX_CHILD_SIZE(insert_ptr[insert_num-k-1]) -
895 B_FREE_SPACE (insert_ptr[insert_num-k-1]));
896 put_dc_block_number( dc, insert_ptr[insert_num-k-1]->b_blocknr );
897
898 do_balance_mark_internal_dirty (tb, tb->R[h],0);
899
900 insert_num -= (k + 1);
901 }
902 }
903
904 /** Fill new node that appears instead of S[h] **/
905 RFALSE( tb->blknum[h] > 2, "blknum can not be > 2 for internal level");
906 RFALSE( tb->blknum[h] < 0, "blknum can not be < 0");
907
908 if ( ! tb->blknum[h] )
909 { /* node S[h] is empty now */
 910 RFALSE( ! tbSh, "S[h] is equal to NULL");
911
912 /* do what is needed for buffer thrown from tree */
913 reiserfs_invalidate_buffer(tb,tbSh);
914 return order;
915 }
916
917 if ( ! tbSh ) {
918 /* create new root */
919 struct disk_child * dc;
920 struct buffer_head * tbSh_1 = PATH_H_PBUFFER (tb->tb_path, h - 1);
921 struct block_head * blkh;
922
923
924 if ( tb->blknum[h] != 1 )
925 reiserfs_panic(NULL, "balance_internal: One new node required for creating the new root");
926 /* S[h] = empty buffer from the list FEB. */
927 tbSh = get_FEB (tb);
928 blkh = B_BLK_HEAD(tbSh);
929 set_blkh_level( blkh, h + 1 );
930
931 /* Put the unique node-pointer to S[h] that points to S[h-1]. */
932
933 dc = B_N_CHILD(tbSh, 0);
934 put_dc_block_number( dc, tbSh_1->b_blocknr );
935 put_dc_size( dc, (MAX_CHILD_SIZE (tbSh_1) - B_FREE_SPACE (tbSh_1)));
936
937 tb->insert_size[h] -= DC_SIZE;
938 set_blkh_free_space( blkh, blkh_free_space(blkh) - DC_SIZE );
939
940 do_balance_mark_internal_dirty (tb, tbSh, 0);
941
942 /*&&&&&&&&&&&&&&&&&&&&&&&&*/
943 check_internal (tbSh);
944 /*&&&&&&&&&&&&&&&&&&&&&&&&*/
945
946 /* put new root into path structure */
947 PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) = tbSh;
948
949 /* Change root in structure super block. */
950 PUT_SB_ROOT_BLOCK( tb->tb_sb, tbSh->b_blocknr );
951 PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1 );
952 do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
953 }
954
955 if ( tb->blknum[h] == 2 ) {
956 int snum;
957 struct buffer_info dest_bi, src_bi;
958
959
960 /* S_new = free buffer from list FEB */
961 S_new = get_FEB(tb);
962
963 set_blkh_level( B_BLK_HEAD(S_new), h + 1 );
964
965 dest_bi.tb = tb;
966 dest_bi.bi_bh = S_new;
967 dest_bi.bi_parent = NULL;
968 dest_bi.bi_position = 0;
969 src_bi.tb = tb;
970 src_bi.bi_bh = tbSh;
971 src_bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h);
972 src_bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
973
974 n = B_NR_ITEMS (tbSh); /* number of items in S[h] */
975 snum = (insert_num + n + 1)/2;
976 if ( n - snum >= child_pos ) {
977 /* new items don't fall into S_new */
978 /* store the delimiting key for the next level */
979 /* new_insert_key = (n - snum)'th key in S[h] */
980 memcpy (&new_insert_key,B_N_PDELIM_KEY(tbSh,n - snum),
981 KEY_SIZE);
982 /* last parameter is del_par */
983 internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, snum, 0);
984 /* internal_move_pointers_items(S_new, tbSh, LAST_TO_FIRST, snum, 0);*/
985 } else if ( n + insert_num - snum < child_pos ) {
986 /* all new items fall into S_new */
987 /* store the delimiting key for the next level */
 988 /* new_insert_key = (n + insert_num - snum)'th key in S[h] */
989 memcpy(&new_insert_key,B_N_PDELIM_KEY(tbSh,n + insert_num - snum),
990 KEY_SIZE);
991 /* last parameter is del_par */
992 internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, snum - insert_num, 0);
993 /* internal_move_pointers_items(S_new,tbSh,1,snum - insert_num,0);*/
994
995 /* insert insert_num keys and node-pointers into S_new */
996 internal_insert_childs (&dest_bi, /*S_new,tb->S[h-1]->b_next,*/child_pos - n - insert_num + snum - 1,
997 insert_num,insert_key,insert_ptr);
998
999 insert_num = 0;
1000 } else {
1001 struct disk_child * dc;
1002
 1003 /* some items fall into S_new, but some don't */
1004 /* last parameter is del_par */
1005 internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, n - child_pos + 1, 1);
1006 /* internal_move_pointers_items(S_new,tbSh,1,n - child_pos + 1,1);*/
1007 /* calculate number of new items that fall into S_new */
1008 k = snum - n + child_pos - 1;
1009
1010 internal_insert_childs (&dest_bi, /*S_new,*/ 0, k, insert_key + 1, insert_ptr+1);
1011
1012 /* new_insert_key = insert_key[insert_num - k - 1] */
1013 memcpy(&new_insert_key,insert_key + insert_num - k - 1,
1014 KEY_SIZE);
1015 /* replace first node-ptr in S_new by node-ptr to insert_ptr[insert_num-k-1] */
1016
1017 dc = B_N_CHILD(S_new,0);
1018 put_dc_size( dc, (MAX_CHILD_SIZE(insert_ptr[insert_num-k-1]) -
1019 B_FREE_SPACE(insert_ptr[insert_num-k-1])) );
1020 put_dc_block_number( dc, insert_ptr[insert_num-k-1]->b_blocknr );
1021
1022 do_balance_mark_internal_dirty (tb, S_new,0);
1023
1024 insert_num -= (k + 1);
1025 }
1026 /* new_insert_ptr = node_pointer to S_new */
1027 new_insert_ptr = S_new;
1028
1029 RFALSE (!buffer_journaled(S_new) || buffer_journal_dirty(S_new) ||
1030 buffer_dirty (S_new),
1031 "cm-00001: bad S_new (%b)", S_new);
1032
1033 // S_new is released in unfix_nodes
1034 }
1035
1036 n = B_NR_ITEMS (tbSh); /*number of items in S[h] */
1037
1038 if ( 0 <= child_pos && child_pos <= n && insert_num > 0 ) {
1039 bi.tb = tb;
1040 bi.bi_bh = tbSh;
1041 bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h);
1042 bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
1043 internal_insert_childs (
1044 &bi,/*tbSh,*/
1045 /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? tb->S[h-1]->b_next : tb->S[h]->b_child->b_next,*/
1046 child_pos,insert_num,insert_key,insert_ptr
1047 );
1048 }
1049
1050
1051 memcpy (new_insert_key_addr,&new_insert_key,KEY_SIZE);
1052 insert_ptr[0] = new_insert_ptr;
1053
1054 return order;
1055 }
1056
1057
1058
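The division in balance_internal above works because insert_size[h] is a byte count and every internal-node entry costs exactly one key plus one disk_child. A minimal user-space sketch of that arithmetic, assuming the usual 3.6 on-disk sizes (KEY_SIZE == 16 and DC_SIZE == 8 are assumptions here; the authoritative values are sizeof(struct reiserfs_key) and sizeof(struct disk_child) in reiserfs_fs.h):

#include <stdio.h>

#define KEY_SIZE 16	/* assumed: sizeof(struct reiserfs_key) */
#define DC_SIZE  8	/* assumed: sizeof(struct disk_child) */

int main(void)
{
	/* pasting two (key, pointer) pairs grows S[h] by two entries */
	int insert_size = 2 * (KEY_SIZE + DC_SIZE);
	/* deleting one entry shows up as a negative byte count */
	int delete_size = -(KEY_SIZE + DC_SIZE);

	printf("insert_num = %d\n", insert_size / (KEY_SIZE + DC_SIZE)); /* 2 */
	printf("insert_num = %d\n", delete_size / (KEY_SIZE + DC_SIZE)); /* -1 */
	return 0;
}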
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
new file mode 100644
index 000000000000..7543031396f4
--- /dev/null
+++ b/fs/reiserfs/inode.c
@@ -0,0 +1,2846 @@
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5#include <linux/config.h>
6#include <linux/time.h>
7#include <linux/fs.h>
8#include <linux/reiserfs_fs.h>
9#include <linux/reiserfs_acl.h>
10#include <linux/reiserfs_xattr.h>
11#include <linux/smp_lock.h>
12#include <linux/pagemap.h>
13#include <linux/highmem.h>
14#include <asm/uaccess.h>
15#include <asm/unaligned.h>
16#include <linux/buffer_head.h>
17#include <linux/mpage.h>
18#include <linux/writeback.h>
19#include <linux/quotaops.h>
20
 21 extern int reiserfs_default_io_size; /* default io size defined in super.c */
22
23static int reiserfs_commit_write(struct file *f, struct page *page,
24 unsigned from, unsigned to);
25static int reiserfs_prepare_write(struct file *f, struct page *page,
26 unsigned from, unsigned to);
27
28void reiserfs_delete_inode (struct inode * inode)
29{
30 /* We need blocks for transaction + (user+group) quota update (possibly delete) */
31 int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS;
32 struct reiserfs_transaction_handle th ;
33
34 reiserfs_write_lock(inode->i_sb);
35
 36 /* The objectid == 0 case happens when we abort creating a new inode for some reason, such as lack of space. */
37 if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
38 down (&inode->i_sem);
39
40 reiserfs_delete_xattrs (inode);
41
42 if (journal_begin(&th, inode->i_sb, jbegin_count)) {
43 up (&inode->i_sem);
44 goto out;
45 }
46 reiserfs_update_inode_transaction(inode) ;
47
48 if (reiserfs_delete_object (&th, inode)) {
49 up (&inode->i_sem);
50 goto out;
51 }
52
53 /* Do quota update inside a transaction for journaled quotas. We must do that
54 * after delete_object so that quota updates go into the same transaction as
55 * stat data deletion */
56 DQUOT_FREE_INODE(inode);
57
58 if (journal_end(&th, inode->i_sb, jbegin_count)) {
59 up (&inode->i_sem);
60 goto out;
61 }
62
63 up (&inode->i_sem);
64
65 /* all items of file are deleted, so we can remove "save" link */
66 remove_save_link (inode, 0/* not truncate */); /* we can't do anything
67 * about an error here */
68 } else {
69 /* no object items are in the tree */
70 ;
71 }
72out:
73 clear_inode (inode); /* note this must go after the journal_end to prevent deadlock */
74 inode->i_blocks = 0;
75 reiserfs_write_unlock(inode->i_sb);
76}
77
78static void _make_cpu_key (struct cpu_key * key, int version, __u32 dirid, __u32 objectid,
79 loff_t offset, int type, int length )
80{
81 key->version = version;
82
83 key->on_disk_key.k_dir_id = dirid;
84 key->on_disk_key.k_objectid = objectid;
85 set_cpu_key_k_offset (key, offset);
86 set_cpu_key_k_type (key, type);
87 key->key_length = length;
88}
89
90
 91/* take the base (dirid, objectid) and version of the key from the inode (they always
 92 come from the inode), and set the offset and type of the key */
93void make_cpu_key (struct cpu_key * key, struct inode * inode, loff_t offset,
94 int type, int length )
95{
96 _make_cpu_key (key, get_inode_item_key_version (inode), le32_to_cpu (INODE_PKEY (inode)->k_dir_id),
97 le32_to_cpu (INODE_PKEY (inode)->k_objectid),
98 offset, type, length);
99}
100
101
102//
 103// when key is NULL, do not set the short key (the version is always set)
104//
105inline void make_le_item_head (struct item_head * ih, const struct cpu_key * key,
106 int version,
107 loff_t offset, int type, int length,
108 int entry_count/*or ih_free_space*/)
109{
110 if (key) {
111 ih->ih_key.k_dir_id = cpu_to_le32 (key->on_disk_key.k_dir_id);
112 ih->ih_key.k_objectid = cpu_to_le32 (key->on_disk_key.k_objectid);
113 }
114 put_ih_version( ih, version );
115 set_le_ih_k_offset (ih, offset);
116 set_le_ih_k_type (ih, type);
117 put_ih_item_len( ih, length );
118 /* set_ih_free_space (ih, 0);*/
 119 // for directory items it is the entry count; for direct and stat
 120 // data items it is 0xffff, and for indirect items it is 0
121 put_ih_entry_count( ih, entry_count );
122}
123
124//
125// FIXME: we might cache recently accessed indirect item
126
127// Ugh. Not too eager for that....
128// I cut the code until such time as I see a convincing argument (benchmark).
129// I don't want a bloated inode struct..., and I don't like code complexity....
130
131/* cutting the code is fine, since it really isn't in use yet and is easy
132** to add back in. But, Vladimir has a really good idea here. Think
133** about what happens for reading a file. For each page,
134** The VFS layer calls reiserfs_readpage, who searches the tree to find
135** an indirect item. This indirect item has X number of pointers, where
136** X is a big number if we've done the block allocation right. But,
137** we only use one or two of these pointers during each call to readpage,
 138** needlessly re-searching the tree again later on.
139**
140** The size of the cache could be dynamic based on the size of the file.
141**
 142** I'd also like to see us cache the location of the stat data item, since
 143** we are needlessly re-searching for that frequently.
144**
145** --chris
146*/
147
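/* A purely hypothetical sketch of the cache discussed above: remember
 * the span covered by the last indirect item seen, so consecutive
 * readpage calls can skip the tree search. Every name and field below
 * is invented for illustration; nothing like this exists in this file.
 */

#include <stdio.h>

struct ind_cache {
	unsigned long first;	/* first logical block covered */
	unsigned long count;	/* pointers cached from the item */
	unsigned int blk[16];	/* cached unformatted node pointers */
};

/* returns the cached block number, or 0 on a miss */
static unsigned int ind_cache_lookup(struct ind_cache *c, unsigned long b)
{
	if (c->count && b >= c->first && b < c->first + c->count)
		return c->blk[b - c->first];	/* hit: no search_by_key */
	return 0;				/* miss: caller searches */
}

int main(void)
{
	struct ind_cache c = { .first = 8, .count = 2, .blk = { 100, 101 } };

	printf("block 9 -> %u, block 12 -> %u\n",
	       ind_cache_lookup(&c, 9), ind_cache_lookup(&c, 12));
	return 0;
}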
148/* If this page has a file tail in it, and
149** it was read in by get_block_create_0, the page data is valid,
150** but tail is still sitting in a direct item, and we can't write to
151** it. So, look through this page, and check all the mapped buffers
 152** to make sure they have valid block numbers. Any that don't must
 153** be unmapped, so that block_prepare_write will correctly call
154** reiserfs_get_block to convert the tail into an unformatted node
155*/
156static inline void fix_tail_page_for_writing(struct page *page) {
157 struct buffer_head *head, *next, *bh ;
158
159 if (page && page_has_buffers(page)) {
160 head = page_buffers(page) ;
161 bh = head ;
162 do {
163 next = bh->b_this_page ;
164 if (buffer_mapped(bh) && bh->b_blocknr == 0) {
165 reiserfs_unmap_buffer(bh) ;
166 }
167 bh = next ;
168 } while (bh != head) ;
169 }
170}
171
172/* reiserfs_get_block does not need to allocate a block only if it has been
173 done already or non-hole position has been found in the indirect item */
174static inline int allocation_needed (int retval, b_blocknr_t allocated,
175 struct item_head * ih,
176 __u32 * item, int pos_in_item)
177{
178 if (allocated)
179 return 0;
180 if (retval == POSITION_FOUND && is_indirect_le_ih (ih) &&
181 get_block_num(item, pos_in_item))
182 return 0;
183 return 1;
184}
185
186static inline int indirect_item_found (int retval, struct item_head * ih)
187{
188 return (retval == POSITION_FOUND) && is_indirect_le_ih (ih);
189}
190
191
192static inline void set_block_dev_mapped (struct buffer_head * bh,
193 b_blocknr_t block, struct inode * inode)
194{
195 map_bh(bh, inode->i_sb, block);
196}
197
198
199//
 200// files which were created by the earlier (3.5) disk format can not
 201// be larger than 2 gb
202//
203static int file_capable (struct inode * inode, long block)
204{
 205 if (get_inode_item_key_version (inode) != KEY_FORMAT_3_5 || // it is a new file.
 206 block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is within the 2 gb limit
207 return 1;
208
209 return 0;
210}
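/* A standalone user-space sketch of the limit tested by file_capable
 * above, assuming 4 KiB blocks: 1 << (31 - blocksize_bits) is the
 * first block number a 3.5-format file can not address. */

#include <stdio.h>

int main(void)
{
	unsigned blocksize_bits = 12;			/* 4096-byte blocks */
	long long limit = 1LL << (31 - blocksize_bits);	/* 524288 blocks */

	printf("first unaddressable 3.5 block: %lld\n", limit);
	printf("i.e. the old size cap is %lld bytes\n", limit << blocksize_bits);
	return 0;
}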
211
212/*static*/ int restart_transaction(struct reiserfs_transaction_handle *th,
213 struct inode *inode, struct path *path) {
214 struct super_block *s = th->t_super ;
215 int len = th->t_blocks_allocated ;
216 int err;
217
218 BUG_ON (!th->t_trans_id);
219 BUG_ON (!th->t_refcount);
220
221 /* we cannot restart while nested */
222 if (th->t_refcount > 1) {
223 return 0 ;
224 }
225 pathrelse(path) ;
226 reiserfs_update_sd(th, inode) ;
227 err = journal_end(th, s, len) ;
228 if (!err) {
229 err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6) ;
230 if (!err)
231 reiserfs_update_inode_transaction(inode) ;
232 }
233 return err;
234}
235
 236// it is called by get_block when create == 0. Returns the block
 237// number of the 'block'-th logical block of the file. When it hits
 238// a direct item, it either returns 0 (when called from bmap) or
 239// reads the direct item into a piece of the page (bh_result).
 240
 241// Note that a block stored in direct item(s) can not be mapped to a
 242// disk block number; see the bmap case below.
243
244static int _get_block_create_0 (struct inode * inode, long block,
245 struct buffer_head * bh_result,
246 int args)
247{
248 INITIALIZE_PATH (path);
249 struct cpu_key key;
250 struct buffer_head * bh;
251 struct item_head * ih, tmp_ih;
252 int fs_gen ;
253 int blocknr;
254 char * p = NULL;
255 int chars;
256 int ret ;
257 int done = 0 ;
258 unsigned long offset ;
259
260 // prepare the key to look for the 'block'-th block of file
261 make_cpu_key (&key, inode,
262 (loff_t)block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 3);
263
264research:
265 if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND) {
266 pathrelse (&path);
267 if (p)
268 kunmap(bh_result->b_page) ;
 269 // We do not return -ENOENT if there is a hole but the page is uptodate, because it means
 270 // that there is some mmapped data associated with it that is yet to be written to disk.
271 if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) {
272 return -ENOENT ;
273 }
274 return 0 ;
275 }
276
277 //
278 bh = get_last_bh (&path);
279 ih = get_ih (&path);
280 if (is_indirect_le_ih (ih)) {
281 __u32 * ind_item = (__u32 *)B_I_PITEM (bh, ih);
282
283 /* FIXME: here we could cache indirect item or part of it in
284 the inode to avoid search_by_key in case of subsequent
285 access to file */
286 blocknr = get_block_num(ind_item, path.pos_in_item) ;
287 ret = 0 ;
288 if (blocknr) {
289 map_bh(bh_result, inode->i_sb, blocknr);
290 if (path.pos_in_item == ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
291 set_buffer_boundary(bh_result);
292 }
293 } else
 294 // We do not return -ENOENT if there is a hole but the page is uptodate, because it means
 295 // that there is some mmapped data associated with it that is yet to be written to disk.
296 if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) {
297 ret = -ENOENT ;
298 }
299
300 pathrelse (&path);
301 if (p)
302 kunmap(bh_result->b_page) ;
303 return ret ;
304 }
305
306 // requested data are in direct item(s)
307 if (!(args & GET_BLOCK_READ_DIRECT)) {
 308 // we are called by bmap. FIXME: we can not map a block of the file
 309 // when it is stored in direct item(s)
310 pathrelse (&path);
311 if (p)
312 kunmap(bh_result->b_page) ;
313 return -ENOENT;
314 }
315
316 /* if we've got a direct item, and the buffer or page was uptodate,
317 ** we don't want to pull data off disk again. skip to the
318 ** end, where we map the buffer and return
319 */
320 if (buffer_uptodate(bh_result)) {
321 goto finished ;
322 } else
323 /*
324 ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
325 ** pages without any buffers. If the page is up to date, we don't want
326 ** read old data off disk. Set the up to date bit on the buffer instead
327 ** and jump to the end
328 */
329 if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
330 set_buffer_uptodate(bh_result);
331 goto finished ;
332 }
333
334 // read file tail into part of page
335 offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1) ;
336 fs_gen = get_generation(inode->i_sb) ;
337 copy_item_head (&tmp_ih, ih);
338
339 /* we only want to kmap if we are reading the tail into the page.
340 ** this is not the common case, so we don't kmap until we are
341 ** sure we need to. But, this means the item might move if
342 ** kmap schedules
343 */
344 if (!p) {
345 p = (char *)kmap(bh_result->b_page) ;
346 if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
347 goto research;
348 }
349 }
350 p += offset ;
351 memset (p, 0, inode->i_sb->s_blocksize);
352 do {
353 if (!is_direct_le_ih (ih)) {
354 BUG ();
355 }
356 /* make sure we don't read more bytes than actually exist in
357 ** the file. This can happen in odd cases where i_size isn't
358 ** correct, and when direct item padding results in a few
359 ** extra bytes at the end of the direct item
360 */
361 if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
362 break ;
363 if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
364 chars = inode->i_size - (le_ih_k_offset(ih) - 1) - path.pos_in_item;
365 done = 1 ;
366 } else {
367 chars = ih_item_len(ih) - path.pos_in_item;
368 }
369 memcpy (p, B_I_PITEM (bh, ih) + path.pos_in_item, chars);
370
371 if (done)
372 break ;
373
374 p += chars;
375
376 if (PATH_LAST_POSITION (&path) != (B_NR_ITEMS (bh) - 1))
 377 // we are done if the direct item we read is not the last
 378 // item of the node. FIXME: we could try to check the right
 379 // delimiting key to see whether the direct item continues
 380 // in the right neighbor, or rely on i_size
381 break;
382
383 // update key to look for the next piece
384 set_cpu_key_k_offset (&key, cpu_key_k_offset (&key) + chars);
385 if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND)
 386 // we have already read something from the tail, so return it even if we got IO_ERROR now
387 break;
388 bh = get_last_bh (&path);
389 ih = get_ih (&path);
390 } while (1);
391
392 flush_dcache_page(bh_result->b_page) ;
393 kunmap(bh_result->b_page) ;
394
395finished:
396 pathrelse (&path);
397 /* this buffer has valid data, but isn't valid for io. mapping it to
398 * block #0 tells the rest of reiserfs it just has a tail in it
399 */
400 map_bh(bh_result, inode->i_sb, 0);
401 set_buffer_uptodate (bh_result);
402 return 0;
403}
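/* A standalone sketch of the key arithmetic _get_block_create_0 uses,
 * assuming 4 KiB blocks and pages: byte offsets in reiserfs keys are
 * 1-based, so the 'block'-th block starts at block * blocksize + 1,
 * and a tail copy lands at (offset - 1) & (PAGE_CACHE_SIZE - 1)
 * within the page. */

#include <stdio.h>

int main(void)
{
	long long blocksize = 4096, page_size = 4096, block = 3;
	long long k_offset = block * blocksize + 1;	/* key of first byte */
	long long in_page = (k_offset - 1) & (page_size - 1);

	printf("block %lld: key offset %lld, offset in page %lld\n",
	       block, k_offset, in_page);
	return 0;
}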
404
405
 406// this is called to create the file map (bmap). So, _get_block_create_0 will not
 407// read the direct item
408static int reiserfs_bmap (struct inode * inode, sector_t block,
409 struct buffer_head * bh_result, int create)
410{
411 if (!file_capable (inode, block))
412 return -EFBIG;
413
414 reiserfs_write_lock(inode->i_sb);
415 /* do not read the direct item */
416 _get_block_create_0 (inode, block, bh_result, 0) ;
417 reiserfs_write_unlock(inode->i_sb);
418 return 0;
419}
420
421/* special version of get_block that is only used by grab_tail_page right
422** now. It is sent to block_prepare_write, and when you try to get a
423** block past the end of the file (or a block from a hole) it returns
424** -ENOENT instead of a valid buffer. block_prepare_write expects to
425** be able to do i/o on the buffers returned, unless an error value
426** is also returned.
427**
 428** So, this allows block_prepare_write to be used for reading a single block
 429** in a page, while not producing a valid page for holes or for blocks past the
 430** end of the file. This turns out to be exactly what we need for reading
431** tails for conversion.
432**
433** The point of the wrapper is forcing a certain value for create, even
434** though the VFS layer is calling this function with create==1. If you
435** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
436** don't use this function.
437*/
438static int reiserfs_get_block_create_0 (struct inode * inode, sector_t block,
439 struct buffer_head * bh_result, int create) {
440 return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE) ;
441}
442
443/* This is special helper for reiserfs_get_block in case we are executing
444 direct_IO request. */
445static int reiserfs_get_blocks_direct_io(struct inode *inode,
446 sector_t iblock,
447 unsigned long max_blocks,
448 struct buffer_head *bh_result,
449 int create)
450{
451 int ret ;
452
453 bh_result->b_page = NULL;
454
455 /* We set the b_size before reiserfs_get_block call since it is
456 referenced in convert_tail_for_hole() that may be called from
457 reiserfs_get_block() */
458 bh_result->b_size = (1 << inode->i_blkbits);
459
460 ret = reiserfs_get_block(inode, iblock, bh_result,
461 create | GET_BLOCK_NO_DANGLE) ;
462 if (ret)
463 goto out;
464
465 /* don't allow direct io onto tail pages */
466 if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
467 /* make sure future calls to the direct io funcs for this offset
468 ** in the file fail by unmapping the buffer
469 */
470 clear_buffer_mapped(bh_result);
471 ret = -EINVAL ;
472 }
473 /* Possible unpacked tail. Flush the data before pages have
474 disappeared */
475 if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
476 int err;
477 lock_kernel();
478 err = reiserfs_commit_for_inode(inode);
479 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
480 unlock_kernel();
481 if (err < 0)
482 ret = err;
483 }
484out:
485 return ret ;
486}
487
488
489/*
490** helper function for when reiserfs_get_block is called for a hole
491** but the file tail is still in a direct item
492** bh_result is the buffer head for the hole
493** tail_offset is the offset of the start of the tail in the file
494**
 495** This calls prepare_write, which will start a new transaction;
 496** you should not be in a transaction or have any paths held when you
 497** call this.
498*/
499static int convert_tail_for_hole(struct inode *inode,
500 struct buffer_head *bh_result,
501 loff_t tail_offset) {
502 unsigned long index ;
503 unsigned long tail_end ;
504 unsigned long tail_start ;
505 struct page * tail_page ;
506 struct page * hole_page = bh_result->b_page ;
507 int retval = 0 ;
508
509 if ((tail_offset & (bh_result->b_size - 1)) != 1)
510 return -EIO ;
511
512 /* always try to read until the end of the block */
513 tail_start = tail_offset & (PAGE_CACHE_SIZE - 1) ;
514 tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ;
515
516 index = tail_offset >> PAGE_CACHE_SHIFT ;
 517 /* hole_page can be NULL in case of direct_io; we are sure
 518 that we cannot get here if we write with O_DIRECT into a
 519 tail page */
520 if (!hole_page || index != hole_page->index) {
521 tail_page = grab_cache_page(inode->i_mapping, index) ;
522 retval = -ENOMEM;
523 if (!tail_page) {
524 goto out ;
525 }
526 } else {
527 tail_page = hole_page ;
528 }
529
530 /* we don't have to make sure the conversion did not happen while
531 ** we were locking the page because anyone that could convert
532 ** must first take i_sem.
533 **
534 ** We must fix the tail page for writing because it might have buffers
535 ** that are mapped, but have a block number of 0. This indicates tail
536 ** data that has been read directly into the page, and block_prepare_write
537 ** won't trigger a get_block in this case.
538 */
539 fix_tail_page_for_writing(tail_page) ;
540 retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
541 if (retval)
542 goto unlock ;
543
544 /* tail conversion might change the data in the page */
545 flush_dcache_page(tail_page) ;
546
547 retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end) ;
548
549unlock:
550 if (tail_page != hole_page) {
551 unlock_page(tail_page) ;
552 page_cache_release(tail_page) ;
553 }
554out:
555 return retval ;
556}
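/* A standalone sketch of the tail_start/tail_end arithmetic in
 * convert_tail_for_hole above, assuming a 4 KiB page and a 1 KiB
 * b_size. tail_offset is a 1-based key offset (hence congruent to 1
 * modulo b_size), and the range is widened to the end of the block
 * that holds the tail. */

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096, b_size = 1024, page_shift = 12;
	unsigned long long tail_offset = 5121;	/* second page, second block */
	unsigned long tail_start = tail_offset & (page_size - 1);	/* 1025 */
	unsigned long tail_end = (tail_start | (b_size - 1)) + 1;	/* 2048 */

	printf("index=%llu start=%lu end=%lu\n",
	       tail_offset >> page_shift, tail_start, tail_end);
	return 0;
}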
557
558static inline int _allocate_block(struct reiserfs_transaction_handle *th,
559 long block,
560 struct inode *inode,
561 b_blocknr_t *allocated_block_nr,
562 struct path * path,
563 int flags) {
564 BUG_ON (!th->t_trans_id);
565
566#ifdef REISERFS_PREALLOCATE
567 if (!(flags & GET_BLOCK_NO_ISEM)) {
568 return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, path, block);
569 }
570#endif
571 return reiserfs_new_unf_blocknrs (th, inode, allocated_block_nr, path, block);
572}
573
574int reiserfs_get_block (struct inode * inode, sector_t block,
575 struct buffer_head * bh_result, int create)
576{
577 int repeat, retval = 0;
578 b_blocknr_t allocated_block_nr = 0;// b_blocknr_t is (unsigned) 32 bit int
579 INITIALIZE_PATH(path);
580 int pos_in_item;
581 struct cpu_key key;
582 struct buffer_head * bh, * unbh = NULL;
583 struct item_head * ih, tmp_ih;
584 __u32 * item;
585 int done;
586 int fs_gen;
587 struct reiserfs_transaction_handle *th = NULL;
588 /* space reserved in transaction batch:
589 . 3 balancings in direct->indirect conversion
590 . 1 block involved into reiserfs_update_sd()
591 XXX in practically impossible worst case direct2indirect()
592 can incur (much) more than 3 balancings.
593 quota update for user, group */
594 int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS;
595 int version;
596 int dangle = 1;
597 loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ;
598
599 /* bad.... */
600 reiserfs_write_lock(inode->i_sb);
601 version = get_inode_item_key_version (inode);
602
603 if (block < 0) {
604 reiserfs_write_unlock(inode->i_sb);
605 return -EIO;
606 }
607
608 if (!file_capable (inode, block)) {
609 reiserfs_write_unlock(inode->i_sb);
610 return -EFBIG;
611 }
612
613 /* if !create, we aren't changing the FS, so we don't need to
 614 ** log anything and don't need to start a transaction
615 */
616 if (!(create & GET_BLOCK_CREATE)) {
617 int ret ;
618 /* find number of block-th logical block of the file */
619 ret = _get_block_create_0 (inode, block, bh_result,
620 create | GET_BLOCK_READ_DIRECT) ;
621 reiserfs_write_unlock(inode->i_sb);
622 return ret;
623 }
624 /*
625 * if we're already in a transaction, make sure to close
626 * any new transactions we start in this func
627 */
628 if ((create & GET_BLOCK_NO_DANGLE) ||
629 reiserfs_transaction_running(inode->i_sb))
630 dangle = 0;
631
632 /* If file is of such a size, that it might have a tail and tails are enabled
633 ** we should mark it as possibly needing tail packing on close
634 */
635 if ( (have_large_tails (inode->i_sb) && inode->i_size < i_block_size (inode)*4) ||
636 (have_small_tails (inode->i_sb) && inode->i_size < i_block_size(inode)) )
637 REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
638
639 /* set the key of the first byte in the 'block'-th block of file */
640 make_cpu_key (&key, inode, new_offset,
641 TYPE_ANY, 3/*key length*/);
642 if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
643start_trans:
644 th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
645 if (!th) {
646 retval = -ENOMEM;
647 goto failure;
648 }
649 reiserfs_update_inode_transaction(inode) ;
650 }
651 research:
652
653 retval = search_for_position_by_key (inode->i_sb, &key, &path);
654 if (retval == IO_ERROR) {
655 retval = -EIO;
656 goto failure;
657 }
658
659 bh = get_last_bh (&path);
660 ih = get_ih (&path);
661 item = get_item (&path);
662 pos_in_item = path.pos_in_item;
663
664 fs_gen = get_generation (inode->i_sb);
665 copy_item_head (&tmp_ih, ih);
666
667 if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) {
668 /* we have to allocate block for the unformatted node */
669 if (!th) {
670 pathrelse(&path) ;
671 goto start_trans;
672 }
673
674 repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create);
675
676 if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
677 /* restart the transaction to give the journal a chance to free
 678 ** some blocks. This releases the path, so we have to go back to
679 ** research if we succeed on the second try
680 */
681 SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
682 retval = restart_transaction(th, inode, &path) ;
683 if (retval)
684 goto failure;
685 repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create);
686
687 if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
688 goto research ;
689 }
690 if (repeat == QUOTA_EXCEEDED)
691 retval = -EDQUOT;
692 else
693 retval = -ENOSPC;
694 goto failure;
695 }
696
697 if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
698 goto research;
699 }
700 }
701
702 if (indirect_item_found (retval, ih)) {
703 b_blocknr_t unfm_ptr;
 704 /* the 'block'-th block is in the file already (there is a
 705 corresponding cell in some indirect item). But it may be a
 706 zero unformatted node pointer (a hole) */
707 unfm_ptr = get_block_num (item, pos_in_item);
708 if (unfm_ptr == 0) {
709 /* use allocated block to plug the hole */
710 reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
711 if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
712 reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
713 goto research;
714 }
715 set_buffer_new(bh_result);
716 if (buffer_dirty(bh_result) && reiserfs_data_ordered(inode->i_sb))
717 reiserfs_add_ordered_list(inode, bh_result);
718 put_block_num(item, pos_in_item, allocated_block_nr) ;
719 unfm_ptr = allocated_block_nr;
720 journal_mark_dirty (th, inode->i_sb, bh);
721 reiserfs_update_sd(th, inode) ;
722 }
723 set_block_dev_mapped(bh_result, unfm_ptr, inode);
724 pathrelse (&path);
725 retval = 0;
726 if (!dangle && th)
727 retval = reiserfs_end_persistent_transaction(th);
728
729 reiserfs_write_unlock(inode->i_sb);
730
731 /* the item was found, so new blocks were not added to the file
732 ** there is no need to make sure the inode is updated with this
733 ** transaction
734 */
735 return retval;
736 }
737
738 if (!th) {
739 pathrelse(&path) ;
740 goto start_trans;
741 }
742
 743 /* the desired position is not found or is in a direct item. We have
 744 to append the file with holes up to the 'block'-th block,
 745 converting direct items to an indirect one if necessary */
746 done = 0;
747 do {
748 if (is_statdata_le_ih (ih)) {
749 __u32 unp = 0;
750 struct cpu_key tmp_key;
751
752 /* indirect item has to be inserted */
753 make_le_item_head (&tmp_ih, &key, version, 1, TYPE_INDIRECT,
754 UNFM_P_SIZE, 0/* free_space */);
755
756 if (cpu_key_k_offset (&key) == 1) {
757 /* we are going to add 'block'-th block to the file. Use
758 allocated block for that */
759 unp = cpu_to_le32 (allocated_block_nr);
760 set_block_dev_mapped (bh_result, allocated_block_nr, inode);
761 set_buffer_new(bh_result);
762 done = 1;
763 }
764 tmp_key = key; // ;)
765 set_cpu_key_k_offset (&tmp_key, 1);
766 PATH_LAST_POSITION(&path) ++;
767
768 retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, inode, (char *)&unp);
769 if (retval) {
770 reiserfs_free_block (th, inode, allocated_block_nr, 1);
771 goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
772 }
773 //mark_tail_converted (inode);
774 } else if (is_direct_le_ih (ih)) {
775 /* direct item has to be converted */
776 loff_t tail_offset;
777
778 tail_offset = ((le_ih_k_offset (ih) - 1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
779 if (tail_offset == cpu_key_k_offset (&key)) {
780 /* direct item we just found fits into block we have
781 to map. Convert it into unformatted node: use
782 bh_result for the conversion */
783 set_block_dev_mapped (bh_result, allocated_block_nr, inode);
784 unbh = bh_result;
785 done = 1;
786 } else {
 787 /* we have to pad the file tail stored in direct item(s)
 788 up to the block size and convert it to an unformatted
 789 node. FIXME: this should also get into the page cache */
790
791 pathrelse(&path) ;
792 /*
793 * ugly, but we can only end the transaction if
794 * we aren't nested
795 */
796 BUG_ON (!th->t_refcount);
797 if (th->t_refcount == 1) {
798 retval = reiserfs_end_persistent_transaction(th);
799 th = NULL;
800 if (retval)
801 goto failure;
802 }
803
804 retval = convert_tail_for_hole(inode, bh_result, tail_offset) ;
805 if (retval) {
806 if ( retval != -ENOSPC )
807 reiserfs_warning (inode->i_sb, "clm-6004: convert tail failed inode %lu, error %d", inode->i_ino, retval) ;
808 if (allocated_block_nr) {
809 /* the bitmap, the super, and the stat data == 3 */
810 if (!th)
811 th = reiserfs_persistent_transaction(inode->i_sb,3);
812 if (th)
813 reiserfs_free_block (th,inode,allocated_block_nr,1);
814 }
815 goto failure ;
816 }
817 goto research ;
818 }
819 retval = direct2indirect (th, inode, &path, unbh, tail_offset);
820 if (retval) {
821 reiserfs_unmap_buffer(unbh);
822 reiserfs_free_block (th, inode, allocated_block_nr, 1);
823 goto failure;
824 }
825 /* it is important the set_buffer_uptodate is done after
826 ** the direct2indirect. The buffer might contain valid
827 ** data newer than the data on disk (read by readpage, changed,
828 ** and then sent here by writepage). direct2indirect needs
829 ** to know if unbh was already up to date, so it can decide
830 ** if the data in unbh needs to be replaced with data from
831 ** the disk
832 */
833 set_buffer_uptodate (unbh);
834
 835 /* unbh->b_page == NULL in case of a DIRECT_IO request; this means
 836 the buffer will disappear shortly, so it should not be added to
 837 the tail list */
838 if ( unbh->b_page ) {
839 /* we've converted the tail, so we must
840 ** flush unbh before the transaction commits
841 */
842 reiserfs_add_tail_list(inode, unbh) ;
843
844 /* mark it dirty now to prevent commit_write from adding
845 ** this buffer to the inode's dirty buffer list
846 */
847 /*
848 * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
849 * It's still atomic, but it sets the page dirty too,
850 * which makes it eligible for writeback at any time by the
851 * VM (which was also the case with __mark_buffer_dirty())
852 */
853 mark_buffer_dirty(unbh) ;
854 }
855 } else {
856 /* append indirect item with holes if needed, when appending
857 pointer to 'block'-th block use block, which is already
858 allocated */
859 struct cpu_key tmp_key;
 860 unp_t unf_single=0; // We use this in case we need to allocate
 861 // only one block, which is the fast path
862 unp_t *un;
863 __u64 max_to_insert=MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE;
864 __u64 blocks_needed;
865
866 RFALSE( pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
867 "vs-804: invalid position for append");
868 /* indirect item has to be appended, set up key of that position */
869 make_cpu_key (&tmp_key, inode,
870 le_key_k_offset (version, &(ih->ih_key)) + op_bytes_number (ih, inode->i_sb->s_blocksize),
871 //pos_in_item * inode->i_sb->s_blocksize,
872 TYPE_INDIRECT, 3);// key type is unimportant
873
874 blocks_needed = 1 + ((cpu_key_k_offset (&key) - cpu_key_k_offset (&tmp_key)) >> inode->i_sb->s_blocksize_bits);
875 RFALSE( blocks_needed < 0, "green-805: invalid offset");
876
877 if ( blocks_needed == 1 ) {
878 un = &unf_single;
879 } else {
880 un=kmalloc( min(blocks_needed,max_to_insert)*UNFM_P_SIZE,
881 GFP_ATOMIC); // We need to avoid scheduling.
882 if ( !un) {
883 un = &unf_single;
884 blocks_needed = 1;
885 max_to_insert = 0;
886 } else
887 memset(un, 0, UNFM_P_SIZE * min(blocks_needed,max_to_insert));
888 }
889 if ( blocks_needed <= max_to_insert) {
890 /* we are going to add target block to the file. Use allocated
891 block for that */
892 un[blocks_needed-1] = cpu_to_le32 (allocated_block_nr);
893 set_block_dev_mapped (bh_result, allocated_block_nr, inode);
894 set_buffer_new(bh_result);
895 done = 1;
896 } else {
897 /* paste hole to the indirect item */
898 /* If kmalloc failed, max_to_insert becomes zero and it means we
899 only have space for one block */
900 blocks_needed=max_to_insert?max_to_insert:1;
901 }
902 retval = reiserfs_paste_into_item (th, &path, &tmp_key, inode, (char *)un, UNFM_P_SIZE * blocks_needed);
903
904 if (blocks_needed != 1)
905 kfree(un);
906
907 if (retval) {
908 reiserfs_free_block (th, inode, allocated_block_nr, 1);
909 goto failure;
910 }
911 if (!done) {
 912 /* We need to mark the new file size in case this function is
 913 interrupted/aborted later on. And we may do this only for
 914 holes. */
915 inode->i_size += inode->i_sb->s_blocksize * blocks_needed;
916 }
917 }
918
919 if (done == 1)
920 break;
921
922 /* this loop could log more blocks than we had originally asked
923 ** for. So, we have to allow the transaction to end if it is
924 ** too big or too full. Update the inode so things are
925 ** consistent if we crash before the function returns
926 **
927 ** release the path so that anybody waiting on the path before
928 ** ending their transaction will be able to continue.
929 */
930 if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
931 retval = restart_transaction(th, inode, &path) ;
932 if (retval)
933 goto failure;
934 }
935 /* inserting indirect pointers for a hole can take a
936 ** long time. reschedule if needed
937 */
938 cond_resched();
939
940 retval = search_for_position_by_key (inode->i_sb, &key, &path);
941 if (retval == IO_ERROR) {
942 retval = -EIO;
943 goto failure;
944 }
945 if (retval == POSITION_FOUND) {
946 reiserfs_warning (inode->i_sb, "vs-825: reiserfs_get_block: "
947 "%K should not be found", &key);
948 retval = -EEXIST;
949 if (allocated_block_nr)
950 reiserfs_free_block (th, inode, allocated_block_nr, 1);
951 pathrelse(&path) ;
952 goto failure;
953 }
954 bh = get_last_bh (&path);
955 ih = get_ih (&path);
956 item = get_item (&path);
957 pos_in_item = path.pos_in_item;
958 } while (1);
959
960
961 retval = 0;
962
963 failure:
964 if (th && (!dangle || (retval && !th->t_trans_id))) {
965 int err;
966 if (th->t_trans_id)
967 reiserfs_update_sd(th, inode);
968 err = reiserfs_end_persistent_transaction(th);
969 if (err)
970 retval = err;
971 }
972
973 reiserfs_write_unlock(inode->i_sb);
974 reiserfs_check_path(&path) ;
975 return retval;
976}
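/* A sketch of how the 'create' argument to reiserfs_get_block doubles
 * as a flag word. The numeric values below are illustrative
 * assumptions; the real GET_BLOCK_* definitions live in
 * reiserfs_fs.h, and only the dispatch pattern is the point. */

#include <stdio.h>

#define GET_BLOCK_CREATE	1	/* assumed values */
#define GET_BLOCK_NO_HOLE	2
#define GET_BLOCK_READ_DIRECT	4
#define GET_BLOCK_NO_DANGLE	8

int main(void)
{
	int create = GET_BLOCK_CREATE | GET_BLOCK_NO_DANGLE;

	if (!(create & GET_BLOCK_CREATE))
		printf("read path: no transaction is started\n");
	else if (create & GET_BLOCK_NO_DANGLE)
		printf("write path: close any transaction we start here\n");
	return 0;
}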
977
978static int
979reiserfs_readpages(struct file *file, struct address_space *mapping,
980 struct list_head *pages, unsigned nr_pages)
981{
982 return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
983}
984
 985/* Compute the real number of bytes used by a file.
 986 * The following three functions can go away when we have enough space in the stat item
 987 */
988static int real_space_diff(struct inode *inode, int sd_size)
989{
990 int bytes;
991 loff_t blocksize = inode->i_sb->s_blocksize ;
992
993 if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
994 return sd_size ;
995
996 /* End of file is also in full block with indirect reference, so round
997 ** up to the next block.
998 **
 999 ** there is just no way to know if the tail is actually packed
 1000 ** in the file, so we have to assume it isn't. When we pack the
1001 ** tail, we add 4 bytes to pretend there really is an unformatted
1002 ** node pointer
1003 */
1004 bytes = ((inode->i_size + (blocksize-1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + sd_size;
1005 return bytes ;
1006}
1007
1008static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
1009 int sd_size)
1010{
1011 if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1012 return inode->i_size + (loff_t)(real_space_diff(inode, sd_size)) ;
1013 }
1014 return ((loff_t)real_space_diff(inode, sd_size)) + (((loff_t)blocks) << 9);
1015}
1016
1017/* Compute number of blocks used by file in ReiserFS counting */
1018static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
1019{
1020 loff_t bytes = inode_get_bytes(inode) ;
1021 loff_t real_space = real_space_diff(inode, sd_size) ;
1022
1023 /* keeps fsck and non-quota versions of reiserfs happy */
1024 if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1025 bytes += (loff_t)511 ;
1026 }
1027
 1028 /* files from before the quota patch might have i_blocks such that
1029 ** bytes < real_space. Deal with that here to prevent it from
1030 ** going negative.
1031 */
1032 if (bytes < real_space)
1033 return 0 ;
1034 return (bytes - real_space) >> 9;
1035}
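/* A standalone sketch of the accounting done by real_space_diff and
 * to_real_used_space above, assuming 4 KiB blocks, 4-byte unformatted
 * node pointers and a 44-byte v2 stat data; all three constants are
 * assumptions standing in for UNFM_P_SIZE and SD_V2_SIZE. */

#include <stdio.h>

int main(void)
{
	long long i_size = 10000, blocksize = 4096;
	int unfm_p_size = 4, sd_size = 44;
	long long blks = (i_size + blocksize - 1) / blocksize;	/* 3 */
	long long diff = blks * unfm_p_size + sd_size;		/* 56 */
	long long i_blocks = blks * (blocksize >> 9);		/* 24 sectors */

	printf("real_space_diff = %lld bytes\n", diff);
	printf("real used space = %lld bytes\n", diff + (i_blocks << 9));
	return 0;
}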
1036
1037//
 1038// BAD: new directories have stat data of the new type and all other items
 1039// of the old type. The version stored in the inode describes the body items, so
 1040// in update_stat_data we can not rely on the inode, but have to check the
 1041// item version directly
1042//
1043
1044// called by read_locked_inode
1045static void init_inode (struct inode * inode, struct path * path)
1046{
1047 struct buffer_head * bh;
1048 struct item_head * ih;
1049 __u32 rdev;
1050 //int version = ITEM_VERSION_1;
1051
1052 bh = PATH_PLAST_BUFFER (path);
1053 ih = PATH_PITEM_HEAD (path);
1054
1055
1056 copy_key (INODE_PKEY (inode), &(ih->ih_key));
1057 inode->i_blksize = reiserfs_default_io_size;
1058
1059 INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list ));
1060 REISERFS_I(inode)->i_flags = 0;
1061 REISERFS_I(inode)->i_prealloc_block = 0;
1062 REISERFS_I(inode)->i_prealloc_count = 0;
1063 REISERFS_I(inode)->i_trans_id = 0;
1064 REISERFS_I(inode)->i_jl = NULL;
1065 REISERFS_I(inode)->i_acl_access = NULL;
1066 REISERFS_I(inode)->i_acl_default = NULL;
1067 init_rwsem (&REISERFS_I(inode)->xattr_sem);
1068
1069 if (stat_data_v1 (ih)) {
1070 struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih);
1071 unsigned long blocks;
1072
1073 set_inode_item_key_version (inode, KEY_FORMAT_3_5);
1074 set_inode_sd_version (inode, STAT_DATA_V1);
1075 inode->i_mode = sd_v1_mode(sd);
1076 inode->i_nlink = sd_v1_nlink(sd);
1077 inode->i_uid = sd_v1_uid(sd);
1078 inode->i_gid = sd_v1_gid(sd);
1079 inode->i_size = sd_v1_size(sd);
1080 inode->i_atime.tv_sec = sd_v1_atime(sd);
1081 inode->i_mtime.tv_sec = sd_v1_mtime(sd);
1082 inode->i_ctime.tv_sec = sd_v1_ctime(sd);
1083 inode->i_atime.tv_nsec = 0;
1084 inode->i_ctime.tv_nsec = 0;
1085 inode->i_mtime.tv_nsec = 0;
1086
1087 inode->i_blocks = sd_v1_blocks(sd);
1088 inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id);
1089 blocks = (inode->i_size + 511) >> 9;
1090 blocks = _ROUND_UP (blocks, inode->i_sb->s_blocksize >> 9);
1091 if (inode->i_blocks > blocks) {
 1092 // there was a bug in <=3.5.23 when i_blocks could take negative
 1093 // values. Starting from 3.5.17 this value could even be stored in
 1094 // stat data. For such files we set i_blocks based on the file
 1095 // size. Two notes: this can be wrong for sparse files, and the on-disk
 1096 // value will only be updated if the file's inode ever changes
1097 inode->i_blocks = blocks;
1098 }
1099
1100 rdev = sd_v1_rdev(sd);
1101 REISERFS_I(inode)->i_first_direct_byte = sd_v1_first_direct_byte(sd);
1102 /* an early bug in the quota code can give us an odd number for the
1103 ** block count. This is incorrect, fix it here.
1104 */
1105 if (inode->i_blocks & 1) {
1106 inode->i_blocks++ ;
1107 }
1108 inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1109 SD_V1_SIZE));
1110 /* nopack is initially zero for v1 objects. For v2 objects,
1111 nopack is initialised from sd_attrs */
1112 REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
1113 } else {
1114 // new stat data found, but object may have old items
1115 // (directories and symlinks)
1116 struct stat_data * sd = (struct stat_data *)B_I_PITEM (bh, ih);
1117
1118 inode->i_mode = sd_v2_mode(sd);
1119 inode->i_nlink = sd_v2_nlink(sd);
1120 inode->i_uid = sd_v2_uid(sd);
1121 inode->i_size = sd_v2_size(sd);
1122 inode->i_gid = sd_v2_gid(sd);
1123 inode->i_mtime.tv_sec = sd_v2_mtime(sd);
1124 inode->i_atime.tv_sec = sd_v2_atime(sd);
1125 inode->i_ctime.tv_sec = sd_v2_ctime(sd);
1126 inode->i_ctime.tv_nsec = 0;
1127 inode->i_mtime.tv_nsec = 0;
1128 inode->i_atime.tv_nsec = 0;
1129 inode->i_blocks = sd_v2_blocks(sd);
1130 rdev = sd_v2_rdev(sd);
1131 if( S_ISCHR( inode -> i_mode ) || S_ISBLK( inode -> i_mode ) )
1132 inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id);
1133 else
1134 inode->i_generation = sd_v2_generation(sd);
1135
1136 if (S_ISDIR (inode->i_mode) || S_ISLNK (inode->i_mode))
1137 set_inode_item_key_version (inode, KEY_FORMAT_3_5);
1138 else
1139 set_inode_item_key_version (inode, KEY_FORMAT_3_6);
1140 REISERFS_I(inode)->i_first_direct_byte = 0;
1141 set_inode_sd_version (inode, STAT_DATA_V2);
1142 inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
1143 SD_V2_SIZE));
 1144 /* read persistent inode attributes from sd and initialise
1145 generic inode flags from them */
1146 REISERFS_I(inode)->i_attrs = sd_v2_attrs( sd );
1147 sd_attrs_to_i_attrs( sd_v2_attrs( sd ), inode );
1148 }
1149
1150 pathrelse (path);
1151 if (S_ISREG (inode->i_mode)) {
1152 inode->i_op = &reiserfs_file_inode_operations;
1153 inode->i_fop = &reiserfs_file_operations;
1154 inode->i_mapping->a_ops = &reiserfs_address_space_operations ;
1155 } else if (S_ISDIR (inode->i_mode)) {
1156 inode->i_op = &reiserfs_dir_inode_operations;
1157 inode->i_fop = &reiserfs_dir_operations;
1158 } else if (S_ISLNK (inode->i_mode)) {
1159 inode->i_op = &reiserfs_symlink_inode_operations;
1160 inode->i_mapping->a_ops = &reiserfs_address_space_operations;
1161 } else {
1162 inode->i_blocks = 0;
1163 inode->i_op = &reiserfs_special_inode_operations;
1164 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
1165 }
1166}
1167
1168
1169// update new stat data with inode fields
1170static void inode2sd (void * sd, struct inode * inode, loff_t size)
1171{
1172 struct stat_data * sd_v2 = (struct stat_data *)sd;
1173 __u16 flags;
1174
1175 set_sd_v2_mode(sd_v2, inode->i_mode );
1176 set_sd_v2_nlink(sd_v2, inode->i_nlink );
1177 set_sd_v2_uid(sd_v2, inode->i_uid );
1178 set_sd_v2_size(sd_v2, size );
1179 set_sd_v2_gid(sd_v2, inode->i_gid );
1180 set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec );
1181 set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec );
1182 set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec );
1183 set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
1184 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1185 set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
1186 else
1187 set_sd_v2_generation(sd_v2, inode->i_generation);
1188 flags = REISERFS_I(inode)->i_attrs;
1189 i_attrs_to_sd_attrs( inode, &flags );
1190 set_sd_v2_attrs( sd_v2, flags );
1191}
1192
1193
1194// used to copy inode's fields to old stat data
1195static void inode2sd_v1 (void * sd, struct inode * inode, loff_t size)
1196{
1197 struct stat_data_v1 * sd_v1 = (struct stat_data_v1 *)sd;
1198
1199 set_sd_v1_mode(sd_v1, inode->i_mode );
1200 set_sd_v1_uid(sd_v1, inode->i_uid );
1201 set_sd_v1_gid(sd_v1, inode->i_gid );
1202 set_sd_v1_nlink(sd_v1, inode->i_nlink );
1203 set_sd_v1_size(sd_v1, size );
1204 set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec );
1205 set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec );
1206 set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec );
1207
1208 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1209 set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
1210 else
1211 set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
1212
1213 // Sigh. i_first_direct_byte is back
1214 set_sd_v1_first_direct_byte(sd_v1, REISERFS_I(inode)->i_first_direct_byte);
1215}
1216
1217
1218/* NOTE, you must prepare the buffer head before sending it here,
1219** and then log it after the call
1220*/
1221static void update_stat_data (struct path * path, struct inode * inode,
1222 loff_t size)
1223{
1224 struct buffer_head * bh;
1225 struct item_head * ih;
1226
1227 bh = PATH_PLAST_BUFFER (path);
1228 ih = PATH_PITEM_HEAD (path);
1229
1230 if (!is_statdata_le_ih (ih))
1231 reiserfs_panic (inode->i_sb, "vs-13065: update_stat_data: key %k, found item %h",
1232 INODE_PKEY (inode), ih);
1233
1234 if (stat_data_v1 (ih)) {
1235 // path points to old stat data
1236 inode2sd_v1 (B_I_PITEM (bh, ih), inode, size);
1237 } else {
1238 inode2sd (B_I_PITEM (bh, ih), inode, size);
1239 }
1240
1241 return;
1242}
1243
1244
1245void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th,
1246 struct inode * inode, loff_t size)
1247{
1248 struct cpu_key key;
1249 INITIALIZE_PATH(path);
1250 struct buffer_head *bh ;
1251 int fs_gen ;
1252 struct item_head *ih, tmp_ih ;
1253 int retval;
1254
1255 BUG_ON (!th->t_trans_id);
1256
1257 make_cpu_key (&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);//key type is unimportant
1258
1259 for(;;) {
1260 int pos;
1261 /* look for the object's stat data */
1262 retval = search_item (inode->i_sb, &key, &path);
1263 if (retval == IO_ERROR) {
1264 reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: "
1265 "i/o failure occurred trying to update %K stat data",
1266 &key);
1267 return;
1268 }
1269 if (retval == ITEM_NOT_FOUND) {
1270 pos = PATH_LAST_POSITION (&path);
1271 pathrelse(&path) ;
1272 if (inode->i_nlink == 0) {
1273 /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found");*/
1274 return;
1275 }
1276 reiserfs_warning (inode->i_sb, "vs-13060: reiserfs_update_sd: "
1277 "stat data of object %k (nlink == %d) not found (pos %d)",
1278 INODE_PKEY (inode), inode->i_nlink, pos);
1279 reiserfs_check_path(&path) ;
1280 return;
1281 }
1282
1283 /* sigh, prepare_for_journal might schedule. When it schedules the
1284 ** FS might change. We have to detect that, and loop back to the
1285 ** search if the stat data item has moved
1286 */
1287 bh = get_last_bh(&path) ;
1288 ih = get_ih(&path) ;
1289 copy_item_head (&tmp_ih, ih);
1290 fs_gen = get_generation (inode->i_sb);
1291 reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
1292 if (fs_changed (fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) {
1293 reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
1294 continue ; /* Stat_data item has been moved after scheduling. */
1295 }
1296 break;
1297 }
1298 update_stat_data (&path, inode, size);
1299 journal_mark_dirty(th, th->t_super, bh) ;
1300 pathrelse (&path);
1301 return;
1302}
1303
1304/* reiserfs_read_locked_inode is called to read the inode off disk, and it
 1305** does a make_bad_inode when things go wrong. But, we need to make sure
 1306** to clear the key in the private portion of the inode, otherwise a
1307** corresponding iput might try to delete whatever object the inode last
1308** represented.
1309*/
1310static void reiserfs_make_bad_inode(struct inode *inode) {
1311 memset(INODE_PKEY(inode), 0, KEY_SIZE);
1312 make_bad_inode(inode);
1313}
1314
1315//
1316// initially this function was derived from minix or ext2's analog and
1317// evolved as the prototype did
1318//
1319
1320int reiserfs_init_locked_inode (struct inode * inode, void *p)
1321{
1322 struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p ;
1323 inode->i_ino = args->objectid;
1324 INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
1325 return 0;
1326}
1327
1328/* looks for stat data in the tree, and fills up the fields of in-core
1329 inode stat data fields */
1330void reiserfs_read_locked_inode (struct inode * inode, struct reiserfs_iget_args *args)
1331{
1332 INITIALIZE_PATH (path_to_sd);
1333 struct cpu_key key;
1334 unsigned long dirino;
1335 int retval;
1336
1337 dirino = args->dirid ;
1338
1339 /* set version 1, version 2 could be used too, because stat data
1340 key is the same in both versions */
1341 key.version = KEY_FORMAT_3_5;
1342 key.on_disk_key.k_dir_id = dirino;
1343 key.on_disk_key.k_objectid = inode->i_ino;
1344 key.on_disk_key.u.k_offset_v1.k_offset = SD_OFFSET;
1345 key.on_disk_key.u.k_offset_v1.k_uniqueness = SD_UNIQUENESS;
1346
1347 /* look for the object's stat data */
1348 retval = search_item (inode->i_sb, &key, &path_to_sd);
1349 if (retval == IO_ERROR) {
1350 reiserfs_warning (inode->i_sb, "vs-13070: reiserfs_read_locked_inode: "
1351 "i/o failure occurred trying to find stat data of %K",
1352 &key);
1353 reiserfs_make_bad_inode(inode) ;
1354 return;
1355 }
1356 if (retval != ITEM_FOUND) {
1357 /* a stale NFS handle can trigger this without it being an error */
1358 pathrelse (&path_to_sd);
1359 reiserfs_make_bad_inode(inode) ;
1360 inode->i_nlink = 0;
1361 return;
1362 }
1363
1364 init_inode (inode, &path_to_sd);
1365
1366 /* It is possible that knfsd is trying to access inode of a file
1367 that is being removed from the disk by some other thread. As we
1368 update sd on unlink all that is required is to check for nlink
1369 here. This bug was first found by Sizif when debugging
1370 SquidNG/Butterfly, forgotten, and found again after Philippe
1371 Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
1372
1373   A more logical fix would require changes in fs/inode.c:iput() to
1374 remove inode from hash-table _after_ fs cleaned disk stuff up and
1375 in iget() to return NULL if I_FREEING inode is found in
1376 hash-table. */
1377 /* Currently there is one place where it's ok to meet inode with
1378 nlink==0: processing of open-unlinked and half-truncated files
1379 during mount (fs/reiserfs/super.c:finish_unfinished()). */
1380 if( ( inode -> i_nlink == 0 ) &&
1381 ! REISERFS_SB(inode -> i_sb) -> s_is_unlinked_ok ) {
1382 reiserfs_warning (inode->i_sb,
1383 "vs-13075: reiserfs_read_locked_inode: "
1384 "dead inode read from disk %K. "
1385		       "This is likely to be a race with knfsd. Ignore",
1386 &key );
1387 reiserfs_make_bad_inode( inode );
1388 }
1389
1390    reiserfs_check_path(&path_to_sd) ; /* init inode should be releasing */
1391
1392}
1393
1394/**
1395 * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
1396 *
1397 * @inode: inode from hash table to check
1398 * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
1399 *
1400 * This function is called by iget5_locked() to distinguish reiserfs inodes
1401 * having the same inode numbers. Such inodes can only exist due to some
1402 * error condition. One of them should be bad. Inodes with identical
1403 * inode numbers (objectids) are distinguished by parent directory ids.
1404 *
1405 */
1406int reiserfs_find_actor( struct inode *inode, void *opaque )
1407{
1408 struct reiserfs_iget_args *args;
1409
1410 args = opaque;
1411 /* args is already in CPU order */
1412 return (inode->i_ino == args->objectid) &&
1413 (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
1414}
1415
1416struct inode * reiserfs_iget (struct super_block * s, const struct cpu_key * key)
1417{
1418 struct inode * inode;
1419 struct reiserfs_iget_args args ;
1420
1421 args.objectid = key->on_disk_key.k_objectid ;
1422 args.dirid = key->on_disk_key.k_dir_id ;
1423 inode = iget5_locked (s, key->on_disk_key.k_objectid,
1424 reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args));
1425 if (!inode)
1426 return ERR_PTR(-ENOMEM) ;
1427
1428 if (inode->i_state & I_NEW) {
1429 reiserfs_read_locked_inode(inode, &args);
1430 unlock_new_inode(inode);
1431 }
1432
1433 if (comp_short_keys (INODE_PKEY (inode), key) || is_bad_inode (inode)) {
1434 /* either due to i/o error or a stale NFS handle */
1435 iput (inode);
1436 inode = NULL;
1437 }
1438 return inode;
1439}
1440
1441struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp)
1442{
1443 __u32 *data = vobjp;
1444 struct cpu_key key ;
1445 struct dentry *result;
1446 struct inode *inode;
1447
1448 key.on_disk_key.k_objectid = data[0] ;
1449 key.on_disk_key.k_dir_id = data[1] ;
1450 reiserfs_write_lock(sb);
1451 inode = reiserfs_iget(sb, &key) ;
1452 if (inode && !IS_ERR(inode) && data[2] != 0 &&
1453 data[2] != inode->i_generation) {
1454 iput(inode) ;
1455 inode = NULL ;
1456 }
1457 reiserfs_write_unlock(sb);
1458 if (!inode)
1459 inode = ERR_PTR(-ESTALE);
1460 if (IS_ERR(inode))
1461 return ERR_PTR(PTR_ERR(inode));
1462 result = d_alloc_anon(inode);
1463 if (!result) {
1464 iput(inode);
1465 return ERR_PTR(-ENOMEM);
1466 }
1467 return result;
1468}
1469
1470struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 *data,
1471 int len, int fhtype,
1472				  int (*acceptable)(void *context, struct dentry *de),
1473 void *context) {
1474 __u32 obj[3], parent[3];
1475
1476 /* fhtype happens to reflect the number of u32s encoded.
1477 * due to a bug in earlier code, fhtype might indicate there
1478     * are more u32s than actually fit.
1479 * so if fhtype seems to be more than len, reduce fhtype.
1480 * Valid types are:
1481 * 2 - objectid + dir_id - legacy support
1482 * 3 - objectid + dir_id + generation
1483 * 4 - objectid + dir_id + objectid and dirid of parent - legacy
1484 * 5 - objectid + dir_id + generation + objectid and dirid of parent
1485 * 6 - as above plus generation of directory
1486 * 6 does not fit in NFSv2 handles
1487 */
1488 if (fhtype > len) {
1489 if (fhtype != 6 || len != 5)
1490 reiserfs_warning (sb, "nfsd/reiserfs, fhtype=%d, len=%d - odd",
1491 fhtype, len);
1492 fhtype = 5;
1493 }
1494
1495 obj[0] = data[0];
1496 obj[1] = data[1];
1497 if (fhtype == 3 || fhtype >= 5)
1498 obj[2] = data[2];
1499 else obj[2] = 0; /* generation number */
1500
1501 if (fhtype >= 4) {
1502 parent[0] = data[fhtype>=5?3:2] ;
1503 parent[1] = data[fhtype>=5?4:3] ;
1504 if (fhtype == 6)
1505 parent[2] = data[5];
1506 else parent[2] = 0;
1507 }
1508 return sb->s_export_op->find_exported_dentry(sb, obj, fhtype < 4 ? NULL : parent,
1509 acceptable, context);
1510}
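/* Worked example (added commentary): a type-5 handle decodes as
 *
 *	data[0] = objectid            -> obj[0]
 *	data[1] = dir_id              -> obj[1]
 *	data[2] = generation          -> obj[2]
 *	data[3] = parent objectid     -> parent[0]
 *	data[4] = parent dir_id       -> parent[1]
 *
 * parent[2] (the parent directory's generation) is only present for
 * type 6; for type 4 the parent words sit at data[2] and data[3] and no
 * generations are encoded at all.
 */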
1511
1512int reiserfs_encode_fh(struct dentry *dentry, __u32 *data, int *lenp, int need_parent) {
1513 struct inode *inode = dentry->d_inode ;
1514 int maxlen = *lenp;
1515
1516 if (maxlen < 3)
1517 return 255 ;
1518
1519 data[0] = inode->i_ino ;
1520 data[1] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ;
1521 data[2] = inode->i_generation ;
1522 *lenp = 3 ;
1523 /* no room for directory info? return what we've stored so far */
1524 if (maxlen < 5 || ! need_parent)
1525 return 3 ;
1526
1527 spin_lock(&dentry->d_lock);
1528 inode = dentry->d_parent->d_inode ;
1529 data[3] = inode->i_ino ;
1530 data[4] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ;
1531 *lenp = 5 ;
1532 if (maxlen >= 6) {
1533 data[5] = inode->i_generation ;
1534 *lenp = 6 ;
1535 }
1536 spin_unlock(&dentry->d_lock);
1537 return *lenp ;
1538}
1539
1540
1541/* looks for stat data, then copies fields to it, marks the buffer
1542 containing stat data as dirty */
1543/* reiserfs inodes are never really dirty, since the dirty inode call
1544** always logs them. This call allows the VFS inode marking routines
1545** to properly mark inodes for datasync and such, but only actually
1546** does something when called for a synchronous update.
1547*/
1548int reiserfs_write_inode (struct inode * inode, int do_sync) {
1549 struct reiserfs_transaction_handle th ;
1550 int jbegin_count = 1 ;
1551
1552 if (inode->i_sb->s_flags & MS_RDONLY)
1553 return -EROFS;
1554 /* memory pressure can sometimes initiate write_inode calls with sync == 1,
1555 ** these cases are just when the system needs ram, not when the
1556 ** inode needs to reach disk for safety, and they can safely be
1557 ** ignored because the altered inode has already been logged.
1558 */
1559 if (do_sync && !(current->flags & PF_MEMALLOC)) {
1560 reiserfs_write_lock(inode->i_sb);
1561 if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
1562 reiserfs_update_sd (&th, inode);
1563 journal_end_sync(&th, inode->i_sb, jbegin_count) ;
1564 }
1565 reiserfs_write_unlock(inode->i_sb);
1566 }
1567 return 0;
1568}
1569
1570/* stat data of new object is inserted already, this inserts the item
1571 containing "." and ".." entries */
1572static int reiserfs_new_directory (struct reiserfs_transaction_handle *th,
1573 struct inode *inode,
1574 struct item_head * ih, struct path * path,
1575 struct inode * dir)
1576{
1577 struct super_block * sb = th->t_super;
1578 char empty_dir [EMPTY_DIR_SIZE];
1579 char * body = empty_dir;
1580 struct cpu_key key;
1581 int retval;
1582
1583 BUG_ON (!th->t_trans_id);
1584
1585 _make_cpu_key (&key, KEY_FORMAT_3_5, le32_to_cpu (ih->ih_key.k_dir_id),
1586 le32_to_cpu (ih->ih_key.k_objectid), DOT_OFFSET, TYPE_DIRENTRY, 3/*key length*/);
1587
1588 /* compose item head for new item. Directories consist of items of
1589 old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
1590 is done by reiserfs_new_inode */
1591 if (old_format_only (sb)) {
1592 make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
1593
1594 make_empty_dir_item_v1 (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid,
1595 INODE_PKEY (dir)->k_dir_id,
1596 INODE_PKEY (dir)->k_objectid );
1597 } else {
1598 make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
1599
1600 make_empty_dir_item (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid,
1601 INODE_PKEY (dir)->k_dir_id,
1602 INODE_PKEY (dir)->k_objectid );
1603 }
1604
1605 /* look for place in the tree for new item */
1606 retval = search_item (sb, &key, path);
1607 if (retval == IO_ERROR) {
1608 reiserfs_warning (sb, "vs-13080: reiserfs_new_directory: "
1609 "i/o failure occurred creating new directory");
1610 return -EIO;
1611 }
1612 if (retval == ITEM_FOUND) {
1613 pathrelse (path);
1614 reiserfs_warning (sb, "vs-13070: reiserfs_new_directory: "
1615 "object with this key exists (%k)", &(ih->ih_key));
1616 return -EEXIST;
1617 }
1618
1619 /* insert item, that is empty directory item */
1620 return reiserfs_insert_item (th, path, &key, ih, inode, body);
1621}
1622
1623
1624/* stat data of object has been inserted, this inserts the item
1625 containing the body of symlink */
1626static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th,
1627 struct inode *inode, /* Inode of symlink */
1628 struct item_head * ih,
1629 struct path * path, const char * symname, int item_len)
1630{
1631 struct super_block * sb = th->t_super;
1632 struct cpu_key key;
1633 int retval;
1634
1635 BUG_ON (!th->t_trans_id);
1636
1637 _make_cpu_key (&key, KEY_FORMAT_3_5,
1638 le32_to_cpu (ih->ih_key.k_dir_id),
1639 le32_to_cpu (ih->ih_key.k_objectid),
1640 1, TYPE_DIRECT, 3/*key length*/);
1641
1642 make_le_item_head (ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, 0/*free_space*/);
1643
1644 /* look for place in the tree for new item */
1645 retval = search_item (sb, &key, path);
1646 if (retval == IO_ERROR) {
1647	reiserfs_warning (sb, "vs-13080: reiserfs_new_symlink: "
1648 "i/o failure occurred creating new symlink");
1649 return -EIO;
1650 }
1651 if (retval == ITEM_FOUND) {
1652 pathrelse (path);
1653 reiserfs_warning (sb, "vs-13080: reiserfs_new_symlink: "
1654 "object with this key exists (%k)", &(ih->ih_key));
1655 return -EEXIST;
1656 }
1657
1658 /* insert item, that is body of symlink */
1659 return reiserfs_insert_item (th, path, &key, ih, inode, symname);
1660}
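/* Added note: a symlink body always lives in a single direct item whose
 * key is (dir_id, objectid, offset 1, TYPE_DIRECT) in 3.5 key format,
 * regardless of the on-disk format. For new-format filesystems,
 * reiserfs_new_inode below rounds the body length up (ROUND_UP) before
 * passing it here as item_len.
 */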
1661
1662
1663/* inserts the stat data into the tree, and then calls
1664 reiserfs_new_directory (to insert ".", ".." item if new object is
1665 directory) or reiserfs_new_symlink (to insert symlink body if new
1666 object is symlink) or nothing (if new object is regular file)
1667
1668 NOTE! uid and gid must already be set in the inode. If we return
1669 non-zero due to an error, we have to drop the quota previously allocated
1670 for the fresh inode. This can only be done outside a transaction, so
1671 if we return non-zero, we also end the transaction. */
1672int reiserfs_new_inode (struct reiserfs_transaction_handle *th,
1673 struct inode * dir, int mode,
1674 const char * symname,
1675			   /* 0 for regular, EMPTY_DIR_SIZE for dirs,
1676			      strlen (symname) for symlinks */
1677 loff_t i_size, struct dentry *dentry,
1678 struct inode *inode)
1679{
1680 struct super_block * sb;
1681 INITIALIZE_PATH (path_to_key);
1682 struct cpu_key key;
1683 struct item_head ih;
1684 struct stat_data sd;
1685 int retval;
1686 int err;
1687
1688 BUG_ON (!th->t_trans_id);
1689
1690 if (DQUOT_ALLOC_INODE(inode)) {
1691 err = -EDQUOT;
1692 goto out_end_trans;
1693 }
1694 if (!dir || !dir->i_nlink) {
1695 err = -EPERM;
1696 goto out_bad_inode;
1697 }
1698
1699 sb = dir->i_sb;
1700
1701 /* item head of new item */
1702 ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
1703 ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th));
1704 if (!ih.ih_key.k_objectid) {
1705 err = -ENOMEM;
1706 goto out_bad_inode ;
1707 }
1708 if (old_format_only (sb))
1709 /* not a perfect generation count, as object ids can be reused, but
1710 ** this is as good as reiserfs can do right now.
1711 ** note that the private part of inode isn't filled in yet, we have
1712 ** to use the directory.
1713 */
1714 inode->i_generation = le32_to_cpu (INODE_PKEY (dir)->k_objectid);
1715 else
1716#if defined( USE_INODE_GENERATION_COUNTER )
1717 inode->i_generation = le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
1718#else
1719 inode->i_generation = ++event;
1720#endif
1721
1722 /* fill stat data */
1723 inode->i_nlink = (S_ISDIR (mode) ? 2 : 1);
1724
1725 /* uid and gid must already be set by the caller for quota init */
1726
1727 /* symlink cannot be immutable or append only, right? */
1728 if( S_ISLNK( inode -> i_mode ) )
1729 inode -> i_flags &= ~ ( S_IMMUTABLE | S_APPEND );
1730
1731 inode->i_mtime = inode->i_atime = inode->i_ctime =
1732 CURRENT_TIME_SEC;
1733 inode->i_size = i_size;
1734 inode->i_blocks = 0;
1735 inode->i_bytes = 0;
1736 REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
1737 U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/;
1738
1739 INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list ));
1740 REISERFS_I(inode)->i_flags = 0;
1741 REISERFS_I(inode)->i_prealloc_block = 0;
1742 REISERFS_I(inode)->i_prealloc_count = 0;
1743 REISERFS_I(inode)->i_trans_id = 0;
1744 REISERFS_I(inode)->i_jl = NULL;
1745 REISERFS_I(inode)->i_attrs =
1746 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1747 sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode );
1748 REISERFS_I(inode)->i_acl_access = NULL;
1749 REISERFS_I(inode)->i_acl_default = NULL;
1750 init_rwsem (&REISERFS_I(inode)->xattr_sem);
1751
1752 if (old_format_only (sb))
1753 make_le_item_head (&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1754 else
1755 make_le_item_head (&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1756
1757 /* key to search for correct place for new stat data */
1758 _make_cpu_key (&key, KEY_FORMAT_3_6, le32_to_cpu (ih.ih_key.k_dir_id),
1759 le32_to_cpu (ih.ih_key.k_objectid), SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/);
1760
1761 /* find proper place for inserting of stat data */
1762 retval = search_item (sb, &key, &path_to_key);
1763 if (retval == IO_ERROR) {
1764 err = -EIO;
1765 goto out_bad_inode;
1766 }
1767 if (retval == ITEM_FOUND) {
1768 pathrelse (&path_to_key);
1769 err = -EEXIST;
1770 goto out_bad_inode;
1771 }
1772 if (old_format_only (sb)) {
1773 if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
1774 pathrelse (&path_to_key);
1775 /* i_uid or i_gid is too big to be stored in stat data v3.5 */
1776 err = -EINVAL;
1777 goto out_bad_inode;
1778 }
1779 inode2sd_v1 (&sd, inode, inode->i_size);
1780 } else {
1781 inode2sd (&sd, inode, inode->i_size);
1782 }
1783 // these do not go to on-disk stat data
1784 inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid);
1785 inode->i_blksize = reiserfs_default_io_size;
1786
1787 // store in in-core inode the key of stat data and version all
1788 // object items will have (directory items will have old offset
1789 // format, other new objects will consist of new items)
1790 memcpy (INODE_PKEY (inode), &(ih.ih_key), KEY_SIZE);
1791 if (old_format_only (sb) || S_ISDIR(mode) || S_ISLNK(mode))
1792 set_inode_item_key_version (inode, KEY_FORMAT_3_5);
1793 else
1794 set_inode_item_key_version (inode, KEY_FORMAT_3_6);
1795 if (old_format_only (sb))
1796 set_inode_sd_version (inode, STAT_DATA_V1);
1797 else
1798 set_inode_sd_version (inode, STAT_DATA_V2);
1799
1800 /* insert the stat data into the tree */
1801#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1802 if (REISERFS_I(dir)->new_packing_locality)
1803 th->displace_new_blocks = 1;
1804#endif
1805 retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, inode, (char *)(&sd));
1806 if (retval) {
1807 err = retval;
1808 reiserfs_check_path(&path_to_key) ;
1809 goto out_bad_inode;
1810 }
1811
1812#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1813 if (!th->displace_new_blocks)
1814 REISERFS_I(dir)->new_packing_locality = 0;
1815#endif
1816 if (S_ISDIR(mode)) {
1817 /* insert item with "." and ".." */
1818 retval = reiserfs_new_directory (th, inode, &ih, &path_to_key, dir);
1819 }
1820
1821 if (S_ISLNK(mode)) {
1822 /* insert body of symlink */
1823 if (!old_format_only (sb))
1824 i_size = ROUND_UP(i_size);
1825 retval = reiserfs_new_symlink (th, inode, &ih, &path_to_key, symname, i_size);
1826 }
1827 if (retval) {
1828 err = retval;
1829 reiserfs_check_path(&path_to_key) ;
1830 journal_end(th, th->t_super, th->t_blocks_allocated);
1831 goto out_inserted_sd;
1832 }
1833
1834 /* XXX CHECK THIS */
1835 if (reiserfs_posixacl (inode->i_sb)) {
1836 retval = reiserfs_inherit_default_acl (dir, dentry, inode);
1837 if (retval) {
1838 err = retval;
1839 reiserfs_check_path(&path_to_key) ;
1840 journal_end(th, th->t_super, th->t_blocks_allocated);
1841 goto out_inserted_sd;
1842 }
1843 } else if (inode->i_sb->s_flags & MS_POSIXACL) {
1844 reiserfs_warning (inode->i_sb, "ACLs aren't enabled in the fs, "
1845 "but vfs thinks they are!");
1846 } else if (is_reiserfs_priv_object (dir)) {
1847 reiserfs_mark_inode_private (inode);
1848 }
1849
1850 insert_inode_hash (inode);
1851 reiserfs_update_sd(th, inode);
1852 reiserfs_check_path(&path_to_key) ;
1853
1854 return 0;
1855
1856/* it looks like you can easily compress these two goto targets into
1857 * one. Keeping it like this doesn't actually hurt anything, and they
1858 * are place holders for what the quota code actually needs.
1859 */
1860out_bad_inode:
1861 /* Invalidate the object, nothing was inserted yet */
1862 INODE_PKEY(inode)->k_objectid = 0;
1863
1864 /* Quota change must be inside a transaction for journaling */
1865 DQUOT_FREE_INODE(inode);
1866
1867out_end_trans:
1868 journal_end(th, th->t_super, th->t_blocks_allocated) ;
1869 /* Drop can be outside and it needs more credits so it's better to have it outside */
1870 DQUOT_DROP(inode);
1871 inode->i_flags |= S_NOQUOTA;
1872 make_bad_inode(inode);
1873
1874out_inserted_sd:
1875 inode->i_nlink = 0;
1876 th->t_trans_id = 0; /* so the caller can't use this handle later */
1877 iput(inode);
1878 return err;
1879}
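/* Added note on the error labels above: they deliberately cascade.
 * out_bad_inode releases the quota reservation and falls through to
 * out_end_trans (end the transaction, drop quota, mark the inode bad),
 * which in turn falls through to out_inserted_sd (clear nlink, poison
 * t_trans_id so the caller cannot reuse the handle, iput). Callers that
 * jump straight to out_inserted_sd have already ended the transaction
 * themselves.
 */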
1880
1881/*
1882** finds the tail page in the page cache,
1883** reads the last block in.
1884**
1885** On success, page_result is set to a locked, pinned page, and bh_result
1886** is set to an up to date buffer for the last block in the file. returns 0.
1887**
1888** tail conversion is not done, so bh_result might not be valid for writing;
1889** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
1890** trying to write the block.
1891**
1892** on failure, nonzero is returned, page_result and bh_result are untouched.
1893*/
1894static int grab_tail_page(struct inode *p_s_inode,
1895 struct page **page_result,
1896 struct buffer_head **bh_result) {
1897
1898 /* we want the page with the last byte in the file,
1899 ** not the page that will hold the next byte for appending
1900 */
1901 unsigned long index = (p_s_inode->i_size-1) >> PAGE_CACHE_SHIFT ;
1902 unsigned long pos = 0 ;
1903 unsigned long start = 0 ;
1904 unsigned long blocksize = p_s_inode->i_sb->s_blocksize ;
1905 unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1) ;
1906 struct buffer_head *bh ;
1907 struct buffer_head *head ;
1908 struct page * page ;
1909 int error ;
1910
1911 /* we know that we are only called with inode->i_size > 0.
1912    ** we also know that a file tail can never be as big as a block.
1913 ** If i_size % blocksize == 0, our file is currently block aligned
1914 ** and it won't need converting or zeroing after a truncate.
1915 */
1916 if ((offset & (blocksize - 1)) == 0) {
1917 return -ENOENT ;
1918 }
1919 page = grab_cache_page(p_s_inode->i_mapping, index) ;
1920 error = -ENOMEM ;
1921 if (!page) {
1922 goto out ;
1923 }
1924 /* start within the page of the last block in the file */
1925 start = (offset / blocksize) * blocksize ;
1926
1927 error = block_prepare_write(page, start, offset,
1928 reiserfs_get_block_create_0) ;
1929 if (error)
1930 goto unlock ;
1931
1932 head = page_buffers(page) ;
1933 bh = head;
1934 do {
1935 if (pos >= start) {
1936 break ;
1937 }
1938 bh = bh->b_this_page ;
1939 pos += blocksize ;
1940 } while(bh != head) ;
1941
1942 if (!buffer_uptodate(bh)) {
1943 /* note, this should never happen, prepare_write should
1944 ** be taking care of this for us. If the buffer isn't up to date,
1945 ** I've screwed up the code to find the buffer, or the code to
1946 ** call prepare_write
1947 */
1948 reiserfs_warning (p_s_inode->i_sb,
1949 "clm-6000: error reading block %lu on dev %s",
1950 bh->b_blocknr,
1951 reiserfs_bdevname (p_s_inode->i_sb)) ;
1952 error = -EIO ;
1953 goto unlock ;
1954 }
1955 *bh_result = bh ;
1956 *page_result = page ;
1957
1958out:
1959 return error ;
1960
1961unlock:
1962 unlock_page(page) ;
1963 page_cache_release(page) ;
1964 return error ;
1965}
1966
1967/*
1968** vfs version of truncate file. Must NOT be called with
1969** a transaction already started.
1970**
1971** some code taken from block_truncate_page
1972*/
1973int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) {
1974 struct reiserfs_transaction_handle th ;
1975 /* we want the offset for the first byte after the end of the file */
1976 unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1) ;
1977 unsigned blocksize = p_s_inode->i_sb->s_blocksize ;
1978 unsigned length ;
1979 struct page *page = NULL ;
1980 int error ;
1981 struct buffer_head *bh = NULL ;
1982
1983 reiserfs_write_lock(p_s_inode->i_sb);
1984
1985 if (p_s_inode->i_size > 0) {
1986 if ((error = grab_tail_page(p_s_inode, &page, &bh))) {
1987 // -ENOENT means we truncated past the end of the file,
1988 // and get_block_create_0 could not find a block to read in,
1989 // which is ok.
1990 if (error != -ENOENT)
1991 reiserfs_warning (p_s_inode->i_sb,
1992 "clm-6001: grab_tail_page failed %d",
1993 error);
1994 page = NULL ;
1995 bh = NULL ;
1996 }
1997 }
1998
1999 /* so, if page != NULL, we have a buffer head for the offset at
2000 ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
2001 ** then we have an unformatted node. Otherwise, we have a direct item,
2002 ** and no zeroing is required on disk. We zero after the truncate,
2003 ** because the truncate might pack the item anyway
2004 ** (it will unmap bh if it packs).
2005 */
2006 /* it is enough to reserve space in transaction for 2 balancings:
2007 one for "save" link adding and another for the first
2008 cut_from_item. 1 is for update_sd */
2009 error = journal_begin (&th, p_s_inode->i_sb,
2010 JOURNAL_PER_BALANCE_CNT * 2 + 1);
2011 if (error)
2012 goto out;
2013 reiserfs_update_inode_transaction(p_s_inode) ;
2014 if (update_timestamps)
2015 /* we are doing real truncate: if the system crashes before the last
2016 transaction of truncating gets committed - on reboot the file
2017 either appears truncated properly or not truncated at all */
2018 add_save_link (&th, p_s_inode, 1);
2019 error = reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ;
2020 if (error)
2021 goto out;
2022 error = journal_end (&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
2023 if (error)
2024 goto out;
2025
2026 if (update_timestamps) {
2027 error = remove_save_link (p_s_inode, 1/* truncate */);
2028 if (error)
2029 goto out;
2030 }
2031
2032 if (page) {
2033 length = offset & (blocksize - 1) ;
2034 /* if we are not on a block boundary */
2035 if (length) {
2036 char *kaddr;
2037
2038 length = blocksize - length ;
2039 kaddr = kmap_atomic(page, KM_USER0) ;
2040 memset(kaddr + offset, 0, length) ;
2041 flush_dcache_page(page) ;
2042 kunmap_atomic(kaddr, KM_USER0) ;
2043 if (buffer_mapped(bh) && bh->b_blocknr != 0) {
2044 mark_buffer_dirty(bh) ;
2045 }
2046 }
2047 unlock_page(page) ;
2048 page_cache_release(page) ;
2049 }
2050
2051 reiserfs_write_unlock(p_s_inode->i_sb);
2052 return 0;
2053out:
2054 if (page) {
2055 unlock_page (page);
2056 page_cache_release (page);
2057 }
2058 reiserfs_write_unlock(p_s_inode->i_sb);
2059 return error;
2060}
2061
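/* Added summary comment: map one block of a page for writepage. A block
 * backed by an indirect item is simply mapped to its unformatted node; a
 * block that lands in a direct item (a tail) has its data copied from the
 * page straight into the item and logged, and the buffer is left mapped
 * to block 0 so the writepage path skips it. Anything unmapped (a hole)
 * falls back to reiserfs_get_block at the bottom.
 */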
2062static int map_block_for_writepage(struct inode *inode,
2063 struct buffer_head *bh_result,
2064 unsigned long block) {
2065 struct reiserfs_transaction_handle th ;
2066 int fs_gen ;
2067 struct item_head tmp_ih ;
2068 struct item_head *ih ;
2069 struct buffer_head *bh ;
2070 __u32 *item ;
2071 struct cpu_key key ;
2072 INITIALIZE_PATH(path) ;
2073 int pos_in_item ;
2074 int jbegin_count = JOURNAL_PER_BALANCE_CNT ;
2075    loff_t byte_offset = (block << inode->i_sb->s_blocksize_bits) + 1 ; /* item offsets are 1-based in reiserfs keys */
2076 int retval ;
2077 int use_get_block = 0 ;
2078 int bytes_copied = 0 ;
2079 int copy_size ;
2080 int trans_running = 0;
2081
2082 /* catch places below that try to log something without starting a trans */
2083 th.t_trans_id = 0;
2084
2085 if (!buffer_uptodate(bh_result)) {
2086 return -EIO;
2087 }
2088
2089 kmap(bh_result->b_page) ;
2090start_over:
2091 reiserfs_write_lock(inode->i_sb);
2092 make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3) ;
2093
2094research:
2095 retval = search_for_position_by_key(inode->i_sb, &key, &path) ;
2096 if (retval != POSITION_FOUND) {
2097 use_get_block = 1;
2098 goto out ;
2099 }
2100
2101 bh = get_last_bh(&path) ;
2102 ih = get_ih(&path) ;
2103 item = get_item(&path) ;
2104 pos_in_item = path.pos_in_item ;
2105
2106 /* we've found an unformatted node */
2107 if (indirect_item_found(retval, ih)) {
2108 if (bytes_copied > 0) {
2109 reiserfs_warning (inode->i_sb, "clm-6002: bytes_copied %d",
2110 bytes_copied) ;
2111 }
2112 if (!get_block_num(item, pos_in_item)) {
2113 /* crap, we are writing to a hole */
2114 use_get_block = 1;
2115 goto out ;
2116 }
2117 set_block_dev_mapped(bh_result, get_block_num(item,pos_in_item),inode);
2118 } else if (is_direct_le_ih(ih)) {
2119 char *p ;
2120 p = page_address(bh_result->b_page) ;
2121 p += (byte_offset -1) & (PAGE_CACHE_SIZE - 1) ;
2122 copy_size = ih_item_len(ih) - pos_in_item;
2123
2124 fs_gen = get_generation(inode->i_sb) ;
2125 copy_item_head(&tmp_ih, ih) ;
2126
2127 if (!trans_running) {
2128 /* vs-3050 is gone, no need to drop the path */
2129 retval = journal_begin(&th, inode->i_sb, jbegin_count) ;
2130 if (retval)
2131 goto out;
2132 reiserfs_update_inode_transaction(inode) ;
2133 trans_running = 1;
2134 if (fs_changed(fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) {
2135 reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
2136 goto research;
2137 }
2138 }
2139
2140 reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
2141
2142 if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
2143 reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
2144 goto research;
2145 }
2146
2147 memcpy( B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, copy_size) ;
2148
2149 journal_mark_dirty(&th, inode->i_sb, bh) ;
2150 bytes_copied += copy_size ;
2151 set_block_dev_mapped(bh_result, 0, inode);
2152
2153 /* are there still bytes left? */
2154 if (bytes_copied < bh_result->b_size &&
2155 (byte_offset + bytes_copied) < inode->i_size) {
2156 set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + copy_size) ;
2157 goto research ;
2158 }
2159 } else {
2160 reiserfs_warning (inode->i_sb,
2161 "clm-6003: bad item inode %lu, device %s",
2162 inode->i_ino, reiserfs_bdevname (inode->i_sb)) ;
2163 retval = -EIO ;
2164 goto out ;
2165 }
2166 retval = 0 ;
2167
2168out:
2169 pathrelse(&path) ;
2170 if (trans_running) {
2171 int err = journal_end(&th, inode->i_sb, jbegin_count) ;
2172 if (err)
2173 retval = err;
2174 trans_running = 0;
2175 }
2176 reiserfs_write_unlock(inode->i_sb);
2177
2178 /* this is where we fill in holes in the file. */
2179 if (use_get_block) {
2180 retval = reiserfs_get_block(inode, block, bh_result,
2181 GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM |
2182 GET_BLOCK_NO_DANGLE);
2183 if (!retval) {
2184 if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) {
2185 /* get_block failed to find a mapped unformatted node. */
2186 use_get_block = 0 ;
2187 goto start_over ;
2188 }
2189 }
2190 }
2191 kunmap(bh_result->b_page) ;
2192
2193 if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
2194 /* we've copied data from the page into the direct item, so the
2195 * buffer in the page is now clean, mark it to reflect that.
2196 */
2197 lock_buffer(bh_result);
2198 clear_buffer_dirty(bh_result);
2199 unlock_buffer(bh_result);
2200 }
2201 return retval ;
2202}
2203
2204/*
2205 * mason@suse.com: updated in 2.5.54 to follow the same general io
2206 * start/recovery path as __block_write_full_page, along with special
2207 * code to handle reiserfs tails.
2208 */
2209static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) {
2210 struct inode *inode = page->mapping->host ;
2211 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ;
2212 int error = 0;
2213 unsigned long block ;
2214 struct buffer_head *head, *bh;
2215 int partial = 0 ;
2216 int nr = 0;
2217 int checked = PageChecked(page);
2218 struct reiserfs_transaction_handle th;
2219 struct super_block *s = inode->i_sb;
2220 int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
2221 th.t_trans_id = 0;
2222
2223 /* The page dirty bit is cleared before writepage is called, which
2224     * means we have to tell create_empty_buffers to make dirty buffers.
2225 * The page really should be up to date at this point, so tossing
2226 * in the BH_Uptodate is just a sanity check.
2227 */
2228 if (!page_has_buffers(page)) {
2229 create_empty_buffers(page, s->s_blocksize,
2230 (1 << BH_Dirty) | (1 << BH_Uptodate));
2231 }
2232 head = page_buffers(page) ;
2233
2234 /* last page in the file, zero out any contents past the
2235 ** last byte in the file
2236 */
2237 if (page->index >= end_index) {
2238 char *kaddr;
2239 unsigned last_offset;
2240
2241 last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ;
2242 /* no file contents in this page */
2243 if (page->index >= end_index + 1 || !last_offset) {
2244 unlock_page(page);
2245 return 0;
2246 }
2247 kaddr = kmap_atomic(page, KM_USER0);
2248 memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE-last_offset) ;
2249 flush_dcache_page(page) ;
2250 kunmap_atomic(kaddr, KM_USER0) ;
2251 }
2252 bh = head ;
2253 block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits) ;
2254 /* first map all the buffers, logging any direct items we find */
2255 do {
2256 if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) ||
2257 (buffer_mapped(bh) && bh->b_blocknr == 0))) {
2258 /* not mapped yet, or it points to a direct item, search
2259 * the btree for the mapping info, and log any direct
2260 * items found
2261 */
2262 if ((error = map_block_for_writepage(inode, bh, block))) {
2263 goto fail ;
2264 }
2265 }
2266 bh = bh->b_this_page;
2267 block++;
2268 } while(bh != head) ;
2269
2270 /*
2271 * we start the transaction after map_block_for_writepage,
2272 * because it can create holes in the file (an unbounded operation).
2273 * starting it here, we can make a reliable estimate for how many
2274 * blocks we're going to log
2275 */
2276 if (checked) {
2277 ClearPageChecked(page);
2278 reiserfs_write_lock(s);
2279 error = journal_begin(&th, s, bh_per_page + 1);
2280 if (error) {
2281 reiserfs_write_unlock(s);
2282 goto fail;
2283 }
2284 reiserfs_update_inode_transaction(inode);
2285 }
2286 /* now go through and lock any dirty buffers on the page */
2287 do {
2288 get_bh(bh);
2289 if (!buffer_mapped(bh))
2290 continue;
2291 if (buffer_mapped(bh) && bh->b_blocknr == 0)
2292 continue;
2293
2294 if (checked) {
2295 reiserfs_prepare_for_journal(s, bh, 1);
2296 journal_mark_dirty(&th, s, bh);
2297 continue;
2298 }
2299 /* from this point on, we know the buffer is mapped to a
2300 * real block and not a direct item
2301 */
2302 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
2303 lock_buffer(bh);
2304 } else {
2305 if (test_set_buffer_locked(bh)) {
2306 redirty_page_for_writepage(wbc, page);
2307 continue;
2308 }
2309 }
2310 if (test_clear_buffer_dirty(bh)) {
2311 mark_buffer_async_write(bh);
2312 } else {
2313 unlock_buffer(bh);
2314 }
2315 } while((bh = bh->b_this_page) != head);
2316
2317 if (checked) {
2318 error = journal_end(&th, s, bh_per_page + 1);
2319 reiserfs_write_unlock(s);
2320 if (error)
2321 goto fail;
2322 }
2323 BUG_ON(PageWriteback(page));
2324 set_page_writeback(page);
2325 unlock_page(page);
2326
2327 /*
2328 * since any buffer might be the only dirty buffer on the page,
2329 * the first submit_bh can bring the page out of writeback.
2330 * be careful with the buffers.
2331 */
2332 do {
2333 struct buffer_head *next = bh->b_this_page;
2334 if (buffer_async_write(bh)) {
2335 submit_bh(WRITE, bh);
2336 nr++;
2337 }
2338 put_bh(bh);
2339 bh = next;
2340 } while(bh != head);
2341
2342 error = 0;
2343done:
2344 if (nr == 0) {
2345 /*
2346 * if this page only had a direct item, it is very possible for
2347 * no io to be required without there being an error. Or,
2348 * someone else could have locked them and sent them down the
2349 * pipe without locking the page
2350 */
2351 bh = head ;
2352 do {
2353 if (!buffer_uptodate(bh)) {
2354 partial = 1;
2355 break;
2356 }
2357 bh = bh->b_this_page;
2358 } while(bh != head);
2359 if (!partial)
2360 SetPageUptodate(page);
2361 end_page_writeback(page);
2362 }
2363 return error;
2364
2365fail:
2366 /* catches various errors, we need to make sure any valid dirty blocks
2367 * get to the media. The page is currently locked and not marked for
2368 * writeback
2369 */
2370 ClearPageUptodate(page);
2371 bh = head;
2372 do {
2373 get_bh(bh);
2374 if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
2375 lock_buffer(bh);
2376 mark_buffer_async_write(bh);
2377 } else {
2378 /*
2379 * clear any dirty bits that might have come from getting
2380 * attached to a dirty page
2381 */
2382 clear_buffer_dirty(bh);
2383 }
2384 bh = bh->b_this_page;
2385 } while(bh != head);
2386 SetPageError(page);
2387 BUG_ON(PageWriteback(page));
2388 set_page_writeback(page);
2389 unlock_page(page);
2390 do {
2391 struct buffer_head *next = bh->b_this_page;
2392 if (buffer_async_write(bh)) {
2393 clear_buffer_dirty(bh);
2394 submit_bh(WRITE, bh);
2395 nr++;
2396 }
2397 put_bh(bh);
2398 bh = next;
2399 } while(bh != head);
2400 goto done;
2401}
2402
2403
2404static int reiserfs_readpage (struct file *f, struct page * page)
2405{
2406 return block_read_full_page (page, reiserfs_get_block);
2407}
2408
2409
2410static int reiserfs_writepage (struct page * page, struct writeback_control *wbc)
2411{
2412 struct inode *inode = page->mapping->host ;
2413 reiserfs_wait_on_write_block(inode->i_sb) ;
2414 return reiserfs_write_full_page(page, wbc) ;
2415}
2416
2417static int reiserfs_prepare_write(struct file *f, struct page *page,
2418 unsigned from, unsigned to) {
2419 struct inode *inode = page->mapping->host ;
2420 int ret;
2421 int old_ref = 0;
2422
2423 reiserfs_wait_on_write_block(inode->i_sb) ;
2424 fix_tail_page_for_writing(page) ;
2425 if (reiserfs_transaction_running(inode->i_sb)) {
2426 struct reiserfs_transaction_handle *th;
2427 th = (struct reiserfs_transaction_handle *)current->journal_info;
2428 BUG_ON (!th->t_refcount);
2429 BUG_ON (!th->t_trans_id);
2430 old_ref = th->t_refcount;
2431 th->t_refcount++;
2432 }
2433
2434 ret = block_prepare_write(page, from, to, reiserfs_get_block) ;
2435 if (ret && reiserfs_transaction_running(inode->i_sb)) {
2436 struct reiserfs_transaction_handle *th = current->journal_info;
2437 /* this gets a little ugly. If reiserfs_get_block returned an
2438	 * error and left a transaction running, we've got to close it,
2439	 * and we've got to free the handle if it was a persistent transaction.
2440 *
2441 * But, if we had nested into an existing transaction, we need
2442 * to just drop the ref count on the handle.
2443 *
2444 * If old_ref == 0, the transaction is from reiserfs_get_block,
2445 * and it was a persistent trans. Otherwise, it was nested above.
2446 */
2447 if (th->t_refcount > old_ref) {
2448 if (old_ref)
2449 th->t_refcount--;
2450 else {
2451 int err;
2452 reiserfs_write_lock(inode->i_sb);
2453 err = reiserfs_end_persistent_transaction(th);
2454 reiserfs_write_unlock(inode->i_sb);
2455 if (err)
2456 ret = err;
2457 }
2458 }
2459 }
2460 return ret;
2461
2462}
2463
2464
2465static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) {
2466 return generic_block_bmap(as, block, reiserfs_bmap) ;
2467}
2468
2469static int reiserfs_commit_write(struct file *f, struct page *page,
2470 unsigned from, unsigned to) {
2471 struct inode *inode = page->mapping->host ;
2472 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2473 int ret = 0;
2474 int update_sd = 0;
2475 struct reiserfs_transaction_handle *th = NULL;
2476
2477 reiserfs_wait_on_write_block(inode->i_sb) ;
2478 if (reiserfs_transaction_running(inode->i_sb)) {
2479 th = current->journal_info;
2480 }
2481 reiserfs_commit_page(inode, page, from, to);
2482
2483 /* generic_commit_write does this for us, but does not update the
2484 ** transaction tracking stuff when the size changes. So, we have
2485 ** to do the i_size updates here.
2486 */
2487 if (pos > inode->i_size) {
2488 struct reiserfs_transaction_handle myth ;
2489 reiserfs_write_lock(inode->i_sb);
2490	/* If the file has grown beyond the boundary where it
2491	   can have a tail, unmark it as needing tail
2492	   packing */
2493 if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) ||
2494 (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
2495 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
2496
2497 ret = journal_begin(&myth, inode->i_sb, 1) ;
2498 if (ret) {
2499 reiserfs_write_unlock(inode->i_sb);
2500 goto journal_error;
2501 }
2502 reiserfs_update_inode_transaction(inode) ;
2503 inode->i_size = pos ;
2504 reiserfs_update_sd(&myth, inode) ;
2505 update_sd = 1;
2506 ret = journal_end(&myth, inode->i_sb, 1) ;
2507 reiserfs_write_unlock(inode->i_sb);
2508 if (ret)
2509 goto journal_error;
2510 }
2511 if (th) {
2512 reiserfs_write_lock(inode->i_sb);
2513 if (!update_sd)
2514 reiserfs_update_sd(th, inode) ;
2515 ret = reiserfs_end_persistent_transaction(th);
2516 reiserfs_write_unlock(inode->i_sb);
2517 if (ret)
2518 goto out;
2519 }
2520
2521 /* we test for O_SYNC here so we can commit the transaction
2522 ** for any packed tails the file might have had
2523 */
2524 if (f && (f->f_flags & O_SYNC)) {
2525 reiserfs_write_lock(inode->i_sb);
2526 ret = reiserfs_commit_for_inode(inode) ;
2527 reiserfs_write_unlock(inode->i_sb);
2528 }
2529out:
2530 return ret ;
2531
2532journal_error:
2533 if (th) {
2534 reiserfs_write_lock(inode->i_sb);
2535 if (!update_sd)
2536 reiserfs_update_sd(th, inode) ;
2537 ret = reiserfs_end_persistent_transaction(th);
2538 reiserfs_write_unlock(inode->i_sb);
2539 }
2540
2541 return ret;
2542}
2543
2544void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode )
2545{
2546 if( reiserfs_attrs( inode -> i_sb ) ) {
2547 if( sd_attrs & REISERFS_SYNC_FL )
2548 inode -> i_flags |= S_SYNC;
2549 else
2550 inode -> i_flags &= ~S_SYNC;
2551 if( sd_attrs & REISERFS_IMMUTABLE_FL )
2552 inode -> i_flags |= S_IMMUTABLE;
2553 else
2554 inode -> i_flags &= ~S_IMMUTABLE;
2555 if( sd_attrs & REISERFS_APPEND_FL )
2556 inode -> i_flags |= S_APPEND;
2557 else
2558 inode -> i_flags &= ~S_APPEND;
2559 if( sd_attrs & REISERFS_NOATIME_FL )
2560 inode -> i_flags |= S_NOATIME;
2561 else
2562 inode -> i_flags &= ~S_NOATIME;
2563 if( sd_attrs & REISERFS_NOTAIL_FL )
2564 REISERFS_I(inode)->i_flags |= i_nopack_mask;
2565 else
2566 REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
2567 }
2568}
2569
2570void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs )
2571{
2572 if( reiserfs_attrs( inode -> i_sb ) ) {
2573 if( inode -> i_flags & S_IMMUTABLE )
2574 *sd_attrs |= REISERFS_IMMUTABLE_FL;
2575 else
2576 *sd_attrs &= ~REISERFS_IMMUTABLE_FL;
2577 if( inode -> i_flags & S_SYNC )
2578 *sd_attrs |= REISERFS_SYNC_FL;
2579 else
2580 *sd_attrs &= ~REISERFS_SYNC_FL;
2581 if( inode -> i_flags & S_NOATIME )
2582 *sd_attrs |= REISERFS_NOATIME_FL;
2583 else
2584 *sd_attrs &= ~REISERFS_NOATIME_FL;
2585 if( REISERFS_I(inode)->i_flags & i_nopack_mask )
2586 *sd_attrs |= REISERFS_NOTAIL_FL;
2587 else
2588 *sd_attrs &= ~REISERFS_NOTAIL_FL;
2589 }
2590}
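/* Added summary: the two helpers above translate between on-disk sd_attrs
 * bits and in-core inode flags:
 *
 *	REISERFS_SYNC_FL      <-> S_SYNC
 *	REISERFS_IMMUTABLE_FL <-> S_IMMUTABLE
 *	REISERFS_APPEND_FL     -> S_APPEND (sd-to-inode direction only here)
 *	REISERFS_NOATIME_FL   <-> S_NOATIME
 *	REISERFS_NOTAIL_FL    <-> i_nopack_mask in the reiserfs-private flags
 */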
2591
2592/* decide if this buffer needs to stay around for data logging or ordered
2593** write purposes
2594*/
2595static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
2596{
2597 int ret = 1 ;
2598 struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
2599
2600 spin_lock(&j->j_dirty_buffers_lock) ;
2601 if (!buffer_mapped(bh)) {
2602 goto free_jh;
2603 }
2604 /* the page is locked, and the only places that log a data buffer
2605 * also lock the page.
2606 */
2607 if (reiserfs_file_data_log(inode)) {
2608 /*
2609 * very conservative, leave the buffer pinned if
2610 * anyone might need it.
2611 */
2612 if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
2613 ret = 0 ;
2614 }
2615 } else
2616 if (buffer_dirty(bh) || buffer_locked(bh)) {
2617 struct reiserfs_journal_list *jl;
2618 struct reiserfs_jh *jh = bh->b_private;
2619
2620 /* why is this safe?
2621 * reiserfs_setattr updates i_size in the on disk
2622 * stat data before allowing vmtruncate to be called.
2623 *
2624 * If buffer was put onto the ordered list for this
2625 * transaction, we know for sure either this transaction
2626 * or an older one already has updated i_size on disk,
2627 * and this ordered data won't be referenced in the file
2628 * if we crash.
2629 *
2630 * if the buffer was put onto the ordered list for an older
2631 * transaction, we need to leave it around
2632 */
2633 if (jh && (jl = jh->jl) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
2634 ret = 0;
2635 }
2636free_jh:
2637 if (ret && bh->b_private) {
2638 reiserfs_free_jh(bh);
2639 }
2640 spin_unlock(&j->j_dirty_buffers_lock) ;
2641 return ret ;
2642}
2643
2644/* clm -- taken from fs/buffer.c:block_invalidate_page */
2645static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
2646{
2647 struct buffer_head *head, *bh, *next;
2648 struct inode *inode = page->mapping->host;
2649 unsigned int curr_off = 0;
2650 int ret = 1;
2651
2652 BUG_ON(!PageLocked(page));
2653
2654 if (offset == 0)
2655 ClearPageChecked(page);
2656
2657 if (!page_has_buffers(page))
2658 goto out;
2659
2660 head = page_buffers(page);
2661 bh = head;
2662 do {
2663 unsigned int next_off = curr_off + bh->b_size;
2664 next = bh->b_this_page;
2665
2666 /*
2667 * is this block fully invalidated?
2668 */
2669 if (offset <= curr_off) {
2670 if (invalidatepage_can_drop(inode, bh))
2671 reiserfs_unmap_buffer(bh);
2672 else
2673 ret = 0;
2674 }
2675 curr_off = next_off;
2676 bh = next;
2677 } while (bh != head);
2678
2679 /*
2680 * We release buffers only if the entire page is being invalidated.
2681 * The get_block cached value has been unconditionally invalidated,
2682 * so real IO is not possible anymore.
2683 */
2684 if (!offset && ret)
2685 ret = try_to_release_page(page, 0);
2686out:
2687 return ret;
2688}
2689
2690static int reiserfs_set_page_dirty(struct page *page) {
2691 struct inode *inode = page->mapping->host;
2692 if (reiserfs_file_data_log(inode)) {
2693 SetPageChecked(page);
2694 return __set_page_dirty_nobuffers(page);
2695 }
2696 return __set_page_dirty_buffers(page);
2697}
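/* Added note: SetPageChecked above is what later steers
 * reiserfs_write_full_page into its "checked" branch, where every mapped
 * buffer on the page is journaled via journal_mark_dirty instead of being
 * submitted for ordinary writeback.
 */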
2698
2699/*
2700 * Returns 1 if the page's buffers were dropped. The page is locked.
2701 *
2702 * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
2703 * in the buffers at page_buffers(page).
2704 *
2705 * even in -o notail mode, we can't be sure an old mount without -o notail
2706 * didn't create files with tails.
2707 */
2708static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
2709{
2710 struct inode *inode = page->mapping->host ;
2711 struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
2712 struct buffer_head *head ;
2713 struct buffer_head *bh ;
2714 int ret = 1 ;
2715
2716 WARN_ON(PageChecked(page));
2717 spin_lock(&j->j_dirty_buffers_lock) ;
2718 head = page_buffers(page) ;
2719 bh = head ;
2720 do {
2721 if (bh->b_private) {
2722 if (!buffer_dirty(bh) && !buffer_locked(bh)) {
2723 reiserfs_free_jh(bh);
2724 } else {
2725 ret = 0 ;
2726 break ;
2727 }
2728 }
2729 bh = bh->b_this_page ;
2730 } while (bh != head) ;
2731 if (ret)
2732 ret = try_to_free_buffers(page) ;
2733 spin_unlock(&j->j_dirty_buffers_lock) ;
2734 return ret ;
2735}
2736
2737/* We thank Mingming Cao for helping us understand in great detail what
2738 to do in this section of the code. */
2739static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
2740 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
2741{
2742 struct file *file = iocb->ki_filp;
2743 struct inode *inode = file->f_mapping->host;
2744
2745 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
2746 offset, nr_segs, reiserfs_get_blocks_direct_io, NULL);
2747}
2748
2749int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) {
2750 struct inode *inode = dentry->d_inode ;
2751 int error ;
2752 unsigned int ia_valid = attr->ia_valid;
2753 reiserfs_write_lock(inode->i_sb);
2754 if (attr->ia_valid & ATTR_SIZE) {
2755 /* version 2 items will be caught by the s_maxbytes check
2756 ** done for us in vmtruncate
2757 */
2758 if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
2759 attr->ia_size > MAX_NON_LFS) {
2760 error = -EFBIG ;
2761 goto out;
2762 }
2763 /* fill in hole pointers in the expanding truncate case. */
2764 if (attr->ia_size > inode->i_size) {
2765 error = generic_cont_expand(inode, attr->ia_size) ;
2766 if (REISERFS_I(inode)->i_prealloc_count > 0) {
2767 int err;
2768 struct reiserfs_transaction_handle th ;
2769 /* we're changing at most 2 bitmaps, inode + super */
2770 err = journal_begin(&th, inode->i_sb, 4) ;
2771 if (!err) {
2772 reiserfs_discard_prealloc (&th, inode);
2773 err = journal_end(&th, inode->i_sb, 4) ;
2774 }
2775 if (err)
2776 error = err;
2777 }
2778 if (error)
2779 goto out;
2780 }
2781 }
2782
2783 if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) ||
2784 ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) &&
2785 (get_inode_sd_version (inode) == STAT_DATA_V1)) {
2786 /* stat data of format v3.5 has 16 bit uid and gid */
2787 error = -EINVAL;
2788 goto out;
2789 }
2790
2791 error = inode_change_ok(inode, attr) ;
2792 if (!error) {
2793 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2794 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2795 error = reiserfs_chown_xattrs (inode, attr);
2796
2797 if (!error) {
2798 struct reiserfs_transaction_handle th;
2799
2800                /* (user+group)*(old+new) quota structures, plus the inode write (sb, inode) */
2801 journal_begin(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2);
2802 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2803 if (error) {
2804 journal_end(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2);
2805 goto out;
2806 }
2807 /* Update corresponding info in inode so that everything is in
2808 * one transaction */
2809 if (attr->ia_valid & ATTR_UID)
2810 inode->i_uid = attr->ia_uid;
2811 if (attr->ia_valid & ATTR_GID)
2812 inode->i_gid = attr->ia_gid;
2813 mark_inode_dirty(inode);
2814 journal_end(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2);
2815 }
2816 }
2817 if (!error)
2818 error = inode_setattr(inode, attr) ;
2819 }
2820
2821
2822 if (!error && reiserfs_posixacl (inode->i_sb)) {
2823 if (attr->ia_valid & ATTR_MODE)
2824 error = reiserfs_acl_chmod (inode);
2825 }
2826
2827out:
2828 reiserfs_write_unlock(inode->i_sb);
2829 return error ;
2830}
2831
2832
2833
2834struct address_space_operations reiserfs_address_space_operations = {
2835 .writepage = reiserfs_writepage,
2836 .readpage = reiserfs_readpage,
2837 .readpages = reiserfs_readpages,
2838 .releasepage = reiserfs_releasepage,
2839 .invalidatepage = reiserfs_invalidatepage,
2840 .sync_page = block_sync_page,
2841 .prepare_write = reiserfs_prepare_write,
2842 .commit_write = reiserfs_commit_write,
2843 .bmap = reiserfs_aop_bmap,
2844 .direct_IO = reiserfs_direct_IO,
2845 .set_page_dirty = reiserfs_set_page_dirty,
2846} ;
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
new file mode 100644
index 000000000000..94dc42475a04
--- /dev/null
+++ b/fs/reiserfs/ioctl.c
@@ -0,0 +1,151 @@
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5#include <linux/fs.h>
6#include <linux/reiserfs_fs.h>
7#include <linux/time.h>
8#include <asm/uaccess.h>
9#include <linux/pagemap.h>
10#include <linux/smp_lock.h>
11
12static int reiserfs_unpack (struct inode * inode, struct file * filp);
13
14/*
15** reiserfs_ioctl - handler for ioctl for inode
16** supported commands:
17** 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
18** and prevent packing file (argument arg has to be non-zero)
19** 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
20** 3) That's all for a while ...
21*/
22int reiserfs_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
23 unsigned long arg)
24{
25 unsigned int flags;
26
27 switch (cmd) {
28 case REISERFS_IOC_UNPACK:
29 if( S_ISREG( inode -> i_mode ) ) {
30 if (arg)
31 return reiserfs_unpack (inode, filp);
32 else
33 return 0;
34 } else
35 return -ENOTTY;
36 /* following two cases are taken from fs/ext2/ioctl.c by Remy
37 Card (card@masi.ibp.fr) */
38 case REISERFS_IOC_GETFLAGS:
39 flags = REISERFS_I(inode) -> i_attrs;
40 i_attrs_to_sd_attrs( inode, ( __u16 * ) &flags );
41 return put_user(flags, (int __user *) arg);
42 case REISERFS_IOC_SETFLAGS: {
43 if (IS_RDONLY(inode))
44 return -EROFS;
45
46 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
47 return -EPERM;
48
49 if (get_user(flags, (int __user *) arg))
50 return -EFAULT;
51
52 if ( ( ( flags ^ REISERFS_I(inode) -> i_attrs) & ( REISERFS_IMMUTABLE_FL | REISERFS_APPEND_FL)) &&
53 !capable( CAP_LINUX_IMMUTABLE ) )
54 return -EPERM;
55
56 if( ( flags & REISERFS_NOTAIL_FL ) &&
57 S_ISREG( inode -> i_mode ) ) {
58 int result;
59
60 result = reiserfs_unpack( inode, filp );
61 if( result )
62 return result;
63 }
64 sd_attrs_to_i_attrs( flags, inode );
65 REISERFS_I(inode) -> i_attrs = flags;
66 inode->i_ctime = CURRENT_TIME_SEC;
67 mark_inode_dirty(inode);
68 return 0;
69 }
70 case REISERFS_IOC_GETVERSION:
71 return put_user(inode->i_generation, (int __user *) arg);
72 case REISERFS_IOC_SETVERSION:
73 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
74 return -EPERM;
75 if (IS_RDONLY(inode))
76 return -EROFS;
77 if (get_user(inode->i_generation, (int __user *) arg))
78 return -EFAULT;
79 inode->i_ctime = CURRENT_TIME_SEC;
80 mark_inode_dirty(inode);
81 return 0;
82 default:
83 return -ENOTTY;
84 }
85}
86
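/* Illustrative userspace sketch (not part of the kernel source): a minimal
 * caller of REISERFS_IOC_UNPACK, assuming the ioctl number is available
 * from the reiserfs headers:
 *
 *	#include <sys/ioctl.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int unpack_tail(const char *path)
 *	{
 *		int fd = open(path, O_RDWR);
 *		int err;
 *
 *		if (fd < 0)
 *			return -1;
 *		err = ioctl(fd, REISERFS_IOC_UNPACK, 1); // arg must be non-zero
 *		close(fd);
 *		return err;
 *	}
 */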
87/*
88** reiserfs_unpack
89** Tries to convert the file tail from a direct item into an indirect one.
90** It also sets the nopack attribute (i_nopack_mask) in REISERFS_I(inode)->i_flags.
91*/
92static int reiserfs_unpack (struct inode * inode, struct file * filp)
93{
94 int retval = 0;
95 int index ;
96 struct page *page ;
97 struct address_space *mapping ;
98 unsigned long write_from ;
99 unsigned long blocksize = inode->i_sb->s_blocksize ;
100
101 if (inode->i_size == 0) {
102 REISERFS_I(inode)->i_flags |= i_nopack_mask;
103 return 0 ;
104 }
105 /* ioctl already done */
106 if (REISERFS_I(inode)->i_flags & i_nopack_mask) {
107 return 0 ;
108 }
109 reiserfs_write_lock(inode->i_sb);
110
111 /* we need to make sure nobody is changing the file size beneath
112 ** us
113 */
114 down(&inode->i_sem) ;
115
116 write_from = inode->i_size & (blocksize - 1) ;
117 /* if we are on a block boundary, we are already unpacked. */
118 if ( write_from == 0) {
119 REISERFS_I(inode)->i_flags |= i_nopack_mask;
120 goto out ;
121 }
122
123 /* we unpack by finding the page with the tail, and calling
124 ** reiserfs_prepare_write on that page. This will force a
125 ** reiserfs_get_block to unpack the tail for us.
126 */
127 index = inode->i_size >> PAGE_CACHE_SHIFT ;
128 mapping = inode->i_mapping ;
129 page = grab_cache_page(mapping, index) ;
130 retval = -ENOMEM;
131 if (!page) {
132 goto out ;
133 }
134 retval = mapping->a_ops->prepare_write(NULL, page, write_from, write_from) ;
135 if (retval)
136 goto out_unlock ;
137
138 /* conversion can change page contents, must flush */
139 flush_dcache_page(page) ;
140 retval = mapping->a_ops->commit_write(NULL, page, write_from, write_from) ;
141 REISERFS_I(inode)->i_flags |= i_nopack_mask;
142
143out_unlock:
144 unlock_page(page) ;
145 page_cache_release(page) ;
146
147out:
148 up(&inode->i_sem) ;
149 reiserfs_write_unlock(inode->i_sb);
150 return retval;
151}
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
new file mode 100644
index 000000000000..9cf7c13b120d
--- /dev/null
+++ b/fs/reiserfs/item_ops.c
@@ -0,0 +1,788 @@
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5#include <linux/time.h>
6#include <linux/reiserfs_fs.h>
7
8// this contains item handlers for old item types: sd, direct,
9// indirect, directory
10
11/* and where are the comments? how about saying where we can find an
12 explanation of each item handler method? -Hans */
13
14//////////////////////////////////////////////////////////////////////////////
15// stat data functions
16//
17static int sd_bytes_number (struct item_head * ih, int block_size)
18{
19 return 0;
20}
21
22static void sd_decrement_key (struct cpu_key * key)
23{
24 key->on_disk_key.k_objectid --;
25 set_cpu_key_k_type (key, TYPE_ANY);
26 set_cpu_key_k_offset(key, (loff_t)(-1));
27}
28
29static int sd_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
30{
31 return 0;
32}
33
34
35
36static char * print_time (time_t t)
37{
38 static char timebuf[256];
39
40 sprintf (timebuf, "%ld", t);
41 return timebuf;
42}
43
44
45static void sd_print_item (struct item_head * ih, char * item)
46{
47 printk ("\tmode | size | nlinks | first direct | mtime\n");
48 if (stat_data_v1 (ih)) {
49 struct stat_data_v1 * sd = (struct stat_data_v1 *)item;
50
51 printk ("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd),
52 sd_v1_size(sd), sd_v1_nlink(sd), sd_v1_first_direct_byte(sd),
53 print_time( sd_v1_mtime(sd) ) );
54 } else {
55 struct stat_data * sd = (struct stat_data *)item;
56
57 printk ("\t0%-6o | %6Lu | %2u | %d | %s\n", sd_v2_mode(sd),
58 (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd),
59 sd_v2_rdev(sd), print_time(sd_v2_mtime(sd)));
60 }
61}
62
63static void sd_check_item (struct item_head * ih, char * item)
64{
65 // FIXME: type something here!
66}
67
68
69static int sd_create_vi (struct virtual_node * vn,
70 struct virtual_item * vi,
71 int is_affected,
72 int insert_size)
73{
74 vi->vi_index = TYPE_STAT_DATA;
75 //vi->vi_type |= VI_TYPE_STAT_DATA;// not needed?
76 return 0;
77}
78
79
80static int sd_check_left (struct virtual_item * vi, int free,
81 int start_skip, int end_skip)
82{
83 if (start_skip || end_skip)
84 BUG ();
85 return -1;
86}
87
88
89static int sd_check_right (struct virtual_item * vi, int free)
90{
91 return -1;
92}
93
94static int sd_part_size (struct virtual_item * vi, int first, int count)
95{
96 if (count)
97 BUG ();
98 return 0;
99}
100
101static int sd_unit_num (struct virtual_item * vi)
102{
103 return vi->vi_item_len - IH_SIZE;
104}
105
106
107static void sd_print_vi (struct virtual_item * vi)
108{
109 reiserfs_warning (NULL, "STATDATA, index %d, type 0x%x, %h",
110 vi->vi_index, vi->vi_type, vi->vi_ih);
111}
112
113static struct item_operations stat_data_ops = {
114 .bytes_number = sd_bytes_number,
115 .decrement_key = sd_decrement_key,
116 .is_left_mergeable = sd_is_left_mergeable,
117 .print_item = sd_print_item,
118 .check_item = sd_check_item,
119
120 .create_vi = sd_create_vi,
121 .check_left = sd_check_left,
122 .check_right = sd_check_right,
123 .part_size = sd_part_size,
124 .unit_num = sd_unit_num,
125 .print_vi = sd_print_vi
126};
127
128
129
130//////////////////////////////////////////////////////////////////////////////
131// direct item functions
132//
133static int direct_bytes_number (struct item_head * ih, int block_size)
134{
135 return ih_item_len(ih);
136}
137
138
139// FIXME: this should probably switch to indirect as well
140static void direct_decrement_key (struct cpu_key * key)
141{
142 cpu_key_k_offset_dec (key);
143 if (cpu_key_k_offset (key) == 0)
144 set_cpu_key_k_type (key, TYPE_STAT_DATA);
145}
146
147
148static int direct_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
149{
150 int version = le_key_version (key);
151 return ((le_key_k_offset (version, key) & (bsize - 1)) != 1);
152}
153
154
155static void direct_print_item (struct item_head * ih, char * item)
156{
157 int j = 0;
158
159// return;
160 printk ("\"");
161 while (j < ih_item_len(ih))
162 printk ("%c", item[j++]);
163 printk ("\"\n");
164}
165
166
167static void direct_check_item (struct item_head * ih, char * item)
168{
169 // FIXME: type something here!
170}
171
172
173static int direct_create_vi (struct virtual_node * vn,
174 struct virtual_item * vi,
175 int is_affected,
176 int insert_size)
177{
178 vi->vi_index = TYPE_DIRECT;
179 //vi->vi_type |= VI_TYPE_DIRECT;
180 return 0;
181}
182
183static int direct_check_left (struct virtual_item * vi, int free,
184 int start_skip, int end_skip)
185{
186 int bytes;
187
188 bytes = free - free % 8;
189 return bytes ?: -1;
190}
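/*
 * Editorial sketch, not part of the original source: direct_check_left()
 * rounds the free space down to an 8-byte granularity, and the GNU "?:"
 * extension maps a result of 0 to -1 ("nothing fits").  The standalone
 * helper below is hypothetical and only illustrates the arithmetic.
 */
#if 0
static int direct_check_left_example(int free)
{
	int bytes = free - free % 8;	/* free = 20 -> bytes = 16 */
	return bytes ? bytes : -1;	/* free = 7  -> 0  -> -1   */
}
#endif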
191
192
193static int direct_check_right (struct virtual_item * vi, int free)
194{
195 return direct_check_left (vi, free, 0, 0);
196}
197
198static int direct_part_size (struct virtual_item * vi, int first, int count)
199{
200 return count;
201}
202
203
204static int direct_unit_num (struct virtual_item * vi)
205{
206 return vi->vi_item_len - IH_SIZE;
207}
208
209
210static void direct_print_vi (struct virtual_item * vi)
211{
212 reiserfs_warning (NULL, "DIRECT, index %d, type 0x%x, %h",
213 vi->vi_index, vi->vi_type, vi->vi_ih);
214}
215
216static struct item_operations direct_ops = {
217 .bytes_number = direct_bytes_number,
218 .decrement_key = direct_decrement_key,
219 .is_left_mergeable = direct_is_left_mergeable,
220 .print_item = direct_print_item,
221 .check_item = direct_check_item,
222
223 .create_vi = direct_create_vi,
224 .check_left = direct_check_left,
225 .check_right = direct_check_right,
226 .part_size = direct_part_size,
227 .unit_num = direct_unit_num,
228 .print_vi = direct_print_vi
229};
230
231
232
233//////////////////////////////////////////////////////////////////////////////
234// indirect item functions
235//
236
237static int indirect_bytes_number (struct item_head * ih, int block_size)
238{
239 return ih_item_len(ih) / UNFM_P_SIZE * block_size; //- get_ih_free_space (ih);
240}
241
242
243// decrease offset, if it becomes 0, change type to stat data
244static void indirect_decrement_key (struct cpu_key * key)
245{
246 cpu_key_k_offset_dec (key);
247 if (cpu_key_k_offset (key) == 0)
248 set_cpu_key_k_type (key, TYPE_STAT_DATA);
249}
250
251
252// if it is not first item of the body, then it is mergeable
253static int indirect_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
254{
255 int version = le_key_version (key);
256 return (le_key_k_offset (version, key) != 1);
257}
258
259
260// printing of indirect item
261static void start_new_sequence (__u32 * start, int * len, __u32 new)
262{
263 *start = new;
264 *len = 1;
265}
266
267
268static int sequence_finished (__u32 start, int * len, __u32 new)
269{
270 if (start == INT_MAX)
271 return 1;
272
273 if (start == 0 && new == 0) {
274 (*len) ++;
275 return 0;
276 }
277 if (start != 0 && (start + *len) == new) {
278 (*len) ++;
279 return 0;
280 }
281 return 1;
282}
283
284static void print_sequence (__u32 start, int len)
285{
286 if (start == INT_MAX)
287 return;
288
289 if (len == 1)
290 printk (" %d", start);
291 else
292 printk (" %d(%d)", start, len);
293}
294
295
296static void indirect_print_item (struct item_head * ih, char * item)
297{
298 int j;
299 __u32 * unp, prev = INT_MAX;
300    int num = 0;	/* initialized: print_sequence() is handed num before the first sequence starts */
301
302 unp = (__u32 *)item;
303
304 if (ih_item_len(ih) % UNFM_P_SIZE)
305 reiserfs_warning (NULL, "indirect_print_item: invalid item len");
306
307 printk ("%d pointers\n[ ", (int)I_UNFM_NUM (ih));
308 for (j = 0; j < I_UNFM_NUM (ih); j ++) {
309 if (sequence_finished (prev, &num, get_block_num(unp, j))) {
310 print_sequence (prev, num);
311 start_new_sequence (&prev, &num, get_block_num(unp, j));
312 }
313 }
314 print_sequence (prev, num);
315 printk ("]\n");
316}
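/*
 * Editorial note, not part of the original source: the three helpers
 * above run-length encode the unformatted pointer list.  For an indirect
 * item holding the pointers {100, 101, 102, 0, 0, 200},
 * indirect_print_item() emits:
 *
 *	6 pointers
 *	[  100(3) 0(2) 200]
 */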
317
318static void indirect_check_item (struct item_head * ih, char * item)
319{
320 // FIXME: type something here!
321}
322
323
324static int indirect_create_vi (struct virtual_node * vn,
325 struct virtual_item * vi,
326 int is_affected,
327 int insert_size)
328{
329 vi->vi_index = TYPE_INDIRECT;
330 //vi->vi_type |= VI_TYPE_INDIRECT;
331 return 0;
332}
333
334static int indirect_check_left (struct virtual_item * vi, int free,
335 int start_skip, int end_skip)
336{
337 int bytes;
338
339 bytes = free - free % UNFM_P_SIZE;
340 return bytes ?: -1;
341}
342
343
344static int indirect_check_right (struct virtual_item * vi, int free)
345{
346 return indirect_check_left (vi, free, 0, 0);
347}
348
349
350
351// return the size in bytes of 'units' units. If first == 0, calculate from the head (left); otherwise, from the tail (right)
352static int indirect_part_size (struct virtual_item * vi, int first, int units)
353{
354 // unit of indirect item is byte (yet)
355 return units;
356}
357
358static int indirect_unit_num (struct virtual_item * vi)
359{
360 // unit of indirect item is byte (yet)
361 return vi->vi_item_len - IH_SIZE;
362}
363
364static void indirect_print_vi (struct virtual_item * vi)
365{
366 reiserfs_warning (NULL, "INDIRECT, index %d, type 0x%x, %h",
367 vi->vi_index, vi->vi_type, vi->vi_ih);
368}
369
370static struct item_operations indirect_ops = {
371 .bytes_number = indirect_bytes_number,
372 .decrement_key = indirect_decrement_key,
373 .is_left_mergeable = indirect_is_left_mergeable,
374 .print_item = indirect_print_item,
375 .check_item = indirect_check_item,
376
377 .create_vi = indirect_create_vi,
378 .check_left = indirect_check_left,
379 .check_right = indirect_check_right,
380 .part_size = indirect_part_size,
381 .unit_num = indirect_unit_num,
382 .print_vi = indirect_print_vi
383};
384
385
386//////////////////////////////////////////////////////////////////////////////
387// direntry functions
388//
389
390
391static int direntry_bytes_number (struct item_head * ih, int block_size)
392{
393 reiserfs_warning (NULL, "vs-16090: direntry_bytes_number: "
394 "bytes number is asked for direntry");
395 return 0;
396}
397
398static void direntry_decrement_key (struct cpu_key * key)
399{
400 cpu_key_k_offset_dec (key);
401 if (cpu_key_k_offset (key) == 0)
402 set_cpu_key_k_type (key, TYPE_STAT_DATA);
403}
404
405
406static int direntry_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
407{
408 if (le32_to_cpu (key->u.k_offset_v1.k_offset) == DOT_OFFSET)
409 return 0;
410 return 1;
411
412}
413
414
415static void direntry_print_item (struct item_head * ih, char * item)
416{
417 int i;
418 int namelen;
419 struct reiserfs_de_head * deh;
420 char * name;
421 static char namebuf [80];
422
423
424 printk ("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name", "Key of pointed object", "Hash", "Gen number", "Status");
425
426 deh = (struct reiserfs_de_head *)item;
427
428 for (i = 0; i < I_ENTRY_COUNT (ih); i ++, deh ++) {
429 namelen = (i ? (deh_location(deh - 1)) : ih_item_len(ih)) - deh_location(deh);
430 name = item + deh_location(deh);
431 if (name[namelen-1] == 0)
432 namelen = strlen (name);
433 namebuf[0] = '"';
434 if (namelen > sizeof (namebuf) - 3) {
435 strncpy (namebuf + 1, name, sizeof (namebuf) - 3);
436 namebuf[sizeof (namebuf) - 2] = '"';
437 namebuf[sizeof (namebuf) - 1] = 0;
438 } else {
439 memcpy (namebuf + 1, name, namelen);
440 namebuf[namelen + 1] = '"';
441 namebuf[namelen + 2] = 0;
442 }
443
444 printk ("%d: %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n",
445 i, namebuf,
446 deh_dir_id(deh), deh_objectid(deh),
447 GET_HASH_VALUE (deh_offset (deh)), GET_GENERATION_NUMBER ((deh_offset (deh))),
448 (de_hidden (deh)) ? "HIDDEN" : "VISIBLE");
449 }
450}
451
452
453static void direntry_check_item (struct item_head * ih, char * item)
454{
455 int i;
456 struct reiserfs_de_head * deh;
457
458 // FIXME: type something here!
459 deh = (struct reiserfs_de_head *)item;
460 for (i = 0; i < I_ENTRY_COUNT (ih); i ++, deh ++) {
461 ;
462 }
463}
464
465
466
467#define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1
468
469/*
470 * returns the old entry number in the directory item in the real node,
471 * given the new entry number in the virtual item in the virtual node */
472static inline int old_entry_num (int is_affected, int virtual_entry_num, int pos_in_item, int mode)
473{
474 if ( mode == M_INSERT || mode == M_DELETE)
475 return virtual_entry_num;
476
477 if (!is_affected)
478 /* cut or paste is applied to another item */
479 return virtual_entry_num;
480
481 if (virtual_entry_num < pos_in_item)
482 return virtual_entry_num;
483
484 if (mode == M_CUT)
485 return virtual_entry_num + 1;
486
487 RFALSE( mode != M_PASTE || virtual_entry_num == 0,
488	    "vs-8015: old_entry_num: mode must be M_PASTE (mode = '%c')", mode);
489
490 return virtual_entry_num - 1;
491}
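/*
 * Editorial note, not part of the original source: a worked example of
 * the mapping above.  For an M_PASTE of one new entry at pos_in_item = 2
 * in the affected item:
 *
 *	old_entry_num(1, 0, 2, M_PASTE) == 0   (entries before the paste map 1:1)
 *	old_entry_num(1, 1, 2, M_PASTE) == 1
 *	old_entry_num(1, 2, 2, M_PASTE) == 1   (the pasted slot; its size is
 *	                                        overwritten with insert_size by
 *	                                        direntry_create_vi() below)
 *	old_entry_num(1, 3, 2, M_PASTE) == 2   (entries after the paste shift
 *	                                        back by one)
 */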
492
493
494
495
496/* Create an array of sizes of directory entries for the virtual
497   item. Return the space used by the item. FIXME: no control over
498   how much space this item handler consumes */
499static int direntry_create_vi (struct virtual_node * vn,
500 struct virtual_item * vi,
501 int is_affected,
502 int insert_size)
503{
504 struct direntry_uarea * dir_u = vi->vi_uarea;
505 int i, j;
506 int size = sizeof (struct direntry_uarea);
507 struct reiserfs_de_head * deh;
508
509 vi->vi_index = TYPE_DIRENTRY;
510
511 if (!(vi->vi_ih) || !vi->vi_item)
512 BUG ();
513
514
515 dir_u->flags = 0;
516 if (le_ih_k_offset (vi->vi_ih) == DOT_OFFSET)
517 dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM;
518
519 deh = (struct reiserfs_de_head *)(vi->vi_item);
520
521
522    /* the virtual directory item will have this many entries after the operation */
523 dir_u->entry_count = ih_entry_count (vi->vi_ih) +
524 ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 :
525 (vn->vn_mode == M_PASTE ? 1 : 0)) : 0);
526
527 for (i = 0; i < dir_u->entry_count; i ++) {
528 j = old_entry_num (is_affected, i, vn->vn_pos_in_item, vn->vn_mode);
529 dir_u->entry_sizes[i] = (j ? deh_location( &(deh[j - 1]) ) :
530 ih_item_len (vi->vi_ih)) -
531 deh_location( &(deh[j])) + DEH_SIZE;
532 }
533
534 size += (dir_u->entry_count * sizeof (short));
535
536 /* set size of pasted entry */
537 if (is_affected && vn->vn_mode == M_PASTE)
538 dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size;
539
540
541#ifdef CONFIG_REISERFS_CHECK
542 /* compare total size of entries with item length */
543 {
544 int k, l;
545
546 l = 0;
547 for (k = 0; k < dir_u->entry_count; k ++)
548 l += dir_u->entry_sizes[k];
549
550 if (l + IH_SIZE != vi->vi_item_len +
551 ((is_affected && (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT)) ? insert_size : 0) ) {
552 reiserfs_panic (NULL, "vs-8025: set_entry_sizes: (mode==%c, insert_size==%d), invalid length of directory item",
553 vn->vn_mode, insert_size);
554 }
555 }
556#endif
557
558 return size;
559
560
561}
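/*
 * Editorial note, not part of the original source: entry bodies are
 * packed at the tail of a directory item at decreasing deh_location
 * offsets, so the size charged to entry i is the gap between its
 * location and the previous entry's location (the item end for i == 0),
 * plus DEH_SIZE for its header.  Taking DEH_SIZE = 16, ih_item_len = 100
 * and deh_location = {90, 80}:
 *
 *	entry 0: (100 - 90) + 16 = 26 bytes
 *	entry 1: ( 90 - 80) + 16 = 26 bytes
 */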
562
563
564//
565// return number of entries which may fit into specified amount of
566// free space, or -1 if free space is not enough even for 1 entry
567//
568static int direntry_check_left (struct virtual_item * vi, int free,
569 int start_skip, int end_skip)
570{
571 int i;
572 int entries = 0;
573 struct direntry_uarea * dir_u = vi->vi_uarea;
574
575 for (i = start_skip; i < dir_u->entry_count - end_skip; i ++) {
576 if (dir_u->entry_sizes[i] > free)
577 /* i-th entry doesn't fit into the remaining free space */
578 break;
579
580 free -= dir_u->entry_sizes[i];
581 entries ++;
582 }
583
584 if (entries == dir_u->entry_count) {
585 reiserfs_panic (NULL, "free space %d, entry_count %d\n", free, dir_u->entry_count);
586 }
587
588 /* "." and ".." can not be separated from each other */
589 if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) && entries < 2)
590 entries = 0;
591
592 return entries ?: -1;
593}
594
595
596static int direntry_check_right (struct virtual_item * vi, int free)
597{
598 int i;
599 int entries = 0;
600 struct direntry_uarea * dir_u = vi->vi_uarea;
601
602 for (i = dir_u->entry_count - 1; i >= 0; i --) {
603 if (dir_u->entry_sizes[i] > free)
604 /* i-th entry doesn't fit into the remaining free space */
605 break;
606
607 free -= dir_u->entry_sizes[i];
608 entries ++;
609 }
610 if (entries == dir_u->entry_count)
611 BUG ();
612
613 /* "." and ".." can not be separated from each other */
614 if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) && entries > dir_u->entry_count - 2)
615 entries = dir_u->entry_count - 2;
616
617 return entries ?: -1;
618}
619
620
621/* sum of entry sizes between from-th and to-th entries including both edges */
622static int direntry_part_size (struct virtual_item * vi, int first, int count)
623{
624 int i, retval;
625 int from, to;
626 struct direntry_uarea * dir_u = vi->vi_uarea;
627
628 retval = 0;
629 if (first == 0)
630 from = 0;
631 else
632 from = dir_u->entry_count - count;
633 to = from + count - 1;
634
635 for (i = from; i <= to; i ++)
636 retval += dir_u->entry_sizes[i];
637
638 return retval;
639}
640
641static int direntry_unit_num (struct virtual_item * vi)
642{
643 struct direntry_uarea * dir_u = vi->vi_uarea;
644
645 return dir_u->entry_count;
646}
647
648
649
650static void direntry_print_vi (struct virtual_item * vi)
651{
652 int i;
653 struct direntry_uarea * dir_u = vi->vi_uarea;
654
655 reiserfs_warning (NULL, "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x",
656 vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags);
657 printk ("%d entries: ", dir_u->entry_count);
658 for (i = 0; i < dir_u->entry_count; i ++)
659 printk ("%d ", dir_u->entry_sizes[i]);
660 printk ("\n");
661}
662
663static struct item_operations direntry_ops = {
664 .bytes_number = direntry_bytes_number,
665 .decrement_key = direntry_decrement_key,
666 .is_left_mergeable = direntry_is_left_mergeable,
667 .print_item = direntry_print_item,
668 .check_item = direntry_check_item,
669
670 .create_vi = direntry_create_vi,
671 .check_left = direntry_check_left,
672 .check_right = direntry_check_right,
673 .part_size = direntry_part_size,
674 .unit_num = direntry_unit_num,
675 .print_vi = direntry_print_vi
676};
677
678
679//////////////////////////////////////////////////////////////////////////////
680// Error catching functions to catch errors caused by incorrect item types.
681//
682static int errcatch_bytes_number (struct item_head * ih, int block_size)
683{
684 reiserfs_warning (NULL, "green-16001: Invalid item type observed, run fsck ASAP");
685 return 0;
686}
687
688static void errcatch_decrement_key (struct cpu_key * key)
689{
690 reiserfs_warning (NULL, "green-16002: Invalid item type observed, run fsck ASAP");
691}
692
693
694static int errcatch_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
695{
696 reiserfs_warning (NULL, "green-16003: Invalid item type observed, run fsck ASAP");
697 return 0;
698}
699
700
701static void errcatch_print_item (struct item_head * ih, char * item)
702{
703 reiserfs_warning (NULL, "green-16004: Invalid item type observed, run fsck ASAP");
704}
705
706
707static void errcatch_check_item (struct item_head * ih, char * item)
708{
709 reiserfs_warning (NULL, "green-16005: Invalid item type observed, run fsck ASAP");
710}
711
712static int errcatch_create_vi (struct virtual_node * vn,
713 struct virtual_item * vi,
714 int is_affected,
715 int insert_size)
716{
717 reiserfs_warning (NULL, "green-16006: Invalid item type observed, run fsck ASAP");
718    return 0; // We might return -1 here as well, but it wouldn't help:
719              // create_virtual_node(), which calls this operation, returns void.
720}
721
722static int errcatch_check_left (struct virtual_item * vi, int free,
723 int start_skip, int end_skip)
724{
725 reiserfs_warning (NULL, "green-16007: Invalid item type observed, run fsck ASAP");
726 return -1;
727}
728
729
730static int errcatch_check_right (struct virtual_item * vi, int free)
731{
732 reiserfs_warning (NULL, "green-16008: Invalid item type observed, run fsck ASAP");
733 return -1;
734}
735
736static int errcatch_part_size (struct virtual_item * vi, int first, int count)
737{
738 reiserfs_warning (NULL, "green-16009: Invalid item type observed, run fsck ASAP");
739 return 0;
740}
741
742static int errcatch_unit_num (struct virtual_item * vi)
743{
744 reiserfs_warning (NULL, "green-16010: Invalid item type observed, run fsck ASAP");
745 return 0;
746}
747
748static void errcatch_print_vi (struct virtual_item * vi)
749{
750 reiserfs_warning (NULL, "green-16011: Invalid item type observed, run fsck ASAP");
751}
752
753static struct item_operations errcatch_ops = {
754    .bytes_number = errcatch_bytes_number,
755    .decrement_key = errcatch_decrement_key,
756    .is_left_mergeable = errcatch_is_left_mergeable,
757    .print_item = errcatch_print_item,
758    .check_item = errcatch_check_item,
759
760    .create_vi = errcatch_create_vi,
761    .check_left = errcatch_check_left,
762    .check_right = errcatch_check_right,
763    .part_size = errcatch_part_size,
764    .unit_num = errcatch_unit_num,
765    .print_vi = errcatch_print_vi
766};
767
768
769
770//////////////////////////////////////////////////////////////////////////////
771//
772//
773#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3)
774 do not compile
775#endif
776
777struct item_operations * item_ops [TYPE_ANY + 1] = {
778 &stat_data_ops,
779 &indirect_ops,
780 &direct_ops,
781 &direntry_ops,
782 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
783    &errcatch_ops /* catches errors with invalid item type (slot 15, i.e. TYPE_ANY) */
784};
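/*
 * Editorial sketch, not part of the original source: callers dispatch
 * through this table by item type (reiserfs_fs.h wraps the lookup in
 * op_* macros).  The helper below is hypothetical and only illustrates
 * the dispatch.
 */
#if 0
static int example_bytes_number(struct item_head *ih, int blocksize)
{
	/* le_ih_k_type() yields TYPE_STAT_DATA .. TYPE_DIRENTRY for valid
	 * keys; anything corrupted decodes to TYPE_ANY and lands on
	 * errcatch_ops. */
	return item_ops[le_ih_k_type(ih)]->bytes_number(ih, blocksize);
}
#endif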
785
786
787
788
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
new file mode 100644
index 000000000000..c9ad3a7849f4
--- /dev/null
+++ b/fs/reiserfs/journal.c
@@ -0,0 +1,3876 @@
1/*
2** Write ahead logging implementation copyright Chris Mason 2000
3**
4** The background commits make this code very interrelated, and
5** overly complex. I need to rethink things a bit.... The major players:
6**
7** journal_begin -- call with the number of blocks you expect to log.
8** If the current transaction is too
9** old, it will block until the current transaction is
10** finished, and then start a new one.
11** Usually, your transaction will get joined in with
12** previous ones for speed.
13**
14** journal_join -- same as journal_begin, but won't block on the current
15** transaction regardless of age. Don't ever call
16** this. Ever. There are only two places it should be
17** called from, and they are both inside this file.
18**
19** journal_mark_dirty -- adds blocks into this transaction. clears any flags
20** that might make them get sent to disk
21** and then marks them BH_JDirty. Puts the buffer head
22** into the current transaction hash.
23**
24** journal_end -- if the current transaction is batchable, it does nothing
25** otherwise, it could do an async/synchronous commit, or
26** a full flush of all log and real blocks in the
27** transaction.
28**
29** flush_old_commits -- if the current transaction is too old, it is ended and
30** commit blocks are sent to disk. Forces commit blocks
31** to disk for all backgrounded commits that have been
32** around too long.
33** -- Note, if you call this as an immediate flush from
34** within kupdate, it will ignore the immediate flag
35*/
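/*
 * Editorial sketch, not part of the original source: the typical calling
 * sequence for the API described above.  Error handling is trimmed and
 * the block count (JOURNAL_PER_BALANCE_CNT) is only an example.
 */
#if 0
static int example_transaction(struct super_block *sb, struct buffer_head *bh)
{
	struct reiserfs_transaction_handle th;
	int err;

	err = journal_begin(&th, sb, JOURNAL_PER_BALANCE_CNT);
	if (err)
		return err;
	/* ... modify the metadata buffer under the transaction ... */
	journal_mark_dirty(&th, sb, bh);
	return journal_end(&th, sb, JOURNAL_PER_BALANCE_CNT);
}
#endif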
36
37#include <linux/config.h>
38#include <asm/uaccess.h>
39#include <asm/system.h>
40
41#include <linux/time.h>
42#include <asm/semaphore.h>
43
44#include <linux/vmalloc.h>
45#include <linux/reiserfs_fs.h>
46
47#include <linux/kernel.h>
48#include <linux/errno.h>
49#include <linux/fcntl.h>
50#include <linux/stat.h>
51#include <linux/string.h>
52#include <linux/smp_lock.h>
53#include <linux/buffer_head.h>
54#include <linux/workqueue.h>
55#include <linux/writeback.h>
56#include <linux/blkdev.h>
57
58
59/* gets a struct reiserfs_journal_list * from a list head */
60#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
61 j_list))
62#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
63 j_working_list))
64
65/* the number of mounted filesystems. This is used to decide when to
66** start and kill the commit workqueue
67*/
68static int reiserfs_mounted_fs_count;
69
70static struct workqueue_struct *commit_wq;
71
72#define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit
73 structs at 4k */
74#define BUFNR 64 /*read ahead */
75
76/* cnode stat bits. Move these into reiserfs_fs.h */
77
78#define BLOCK_FREED 2 /* this block was freed, and can't be written. */
79#define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */
80
81#define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */
82#define BLOCK_DIRTIED 5
83
84
85/* journal list state bits */
86#define LIST_TOUCHED 1
87#define LIST_DIRTY 2
88#define LIST_COMMIT_PENDING 4 /* someone will commit this list */
89
90/* flags for do_journal_end */
91#define FLUSH_ALL 1 /* flush commit and real blocks */
92#define COMMIT_NOW 2 /* end and commit this transaction */
93#define WAIT 4 /* wait for the log blocks to hit the disk*/
94
95static int do_journal_end(struct reiserfs_transaction_handle *,struct super_block *,unsigned long nblocks,int flags) ;
96static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ;
97static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ;
98static int can_dirty(struct reiserfs_journal_cnode *cn) ;
99static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks);
100static int release_journal_dev( struct super_block *super,
101 struct reiserfs_journal *journal );
102static int dirty_one_transaction(struct super_block *s,
103 struct reiserfs_journal_list *jl);
104static void flush_async_commits(void *p);
105static void queue_log_writer(struct super_block *s);
106
107/* values for join in do_journal_begin_r */
108enum {
109 JBEGIN_REG = 0, /* regular journal begin */
110 JBEGIN_JOIN = 1, /* join the running transaction if at all possible */
111 JBEGIN_ABORT = 2, /* called from cleanup code, ignores aborted flag */
112};
113
114static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
115 struct super_block * p_s_sb,
116 unsigned long nblocks,int join);
117
118static void init_journal_hash(struct super_block *p_s_sb) {
119 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
120 memset(journal->j_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
121}
122
123/*
124** clears BH_Dirty and sticks the buffer on the clean list. Called because I can't allow refile_buffer to
125** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for
126** more details.
127*/
128static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) {
129 if (bh) {
130 clear_buffer_dirty(bh);
131 clear_buffer_journal_test(bh);
132 }
133 return 0 ;
134}
135
136static void disable_barrier(struct super_block *s)
137{
138 REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH);
139 printk("reiserfs: disabling flush barriers on %s\n", reiserfs_bdevname(s));
140}
141
142static struct reiserfs_bitmap_node *
143allocate_bitmap_node(struct super_block *p_s_sb) {
144 struct reiserfs_bitmap_node *bn ;
145 static int id;
146
147 bn = reiserfs_kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS, p_s_sb) ;
148 if (!bn) {
149 return NULL ;
150 }
151 bn->data = reiserfs_kmalloc(p_s_sb->s_blocksize, GFP_NOFS, p_s_sb) ;
152 if (!bn->data) {
153 reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ;
154 return NULL ;
155 }
156 bn->id = id++ ;
157 memset(bn->data, 0, p_s_sb->s_blocksize) ;
158 INIT_LIST_HEAD(&bn->list) ;
159 return bn ;
160}
161
162static struct reiserfs_bitmap_node *
163get_bitmap_node(struct super_block *p_s_sb) {
164 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
165 struct reiserfs_bitmap_node *bn = NULL;
166 struct list_head *entry = journal->j_bitmap_nodes.next ;
167
168 journal->j_used_bitmap_nodes++ ;
169repeat:
170
171 if(entry != &journal->j_bitmap_nodes) {
172 bn = list_entry(entry, struct reiserfs_bitmap_node, list) ;
173 list_del(entry) ;
174 memset(bn->data, 0, p_s_sb->s_blocksize) ;
175 journal->j_free_bitmap_nodes-- ;
176 return bn ;
177 }
178 bn = allocate_bitmap_node(p_s_sb) ;
179 if (!bn) {
180 yield();
181 goto repeat ;
182 }
183 return bn ;
184}
185static inline void free_bitmap_node(struct super_block *p_s_sb,
186 struct reiserfs_bitmap_node *bn) {
187 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
188 journal->j_used_bitmap_nodes-- ;
189 if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
190 reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb) ;
191 reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ;
192 } else {
193 list_add(&bn->list, &journal->j_bitmap_nodes) ;
194 journal->j_free_bitmap_nodes++ ;
195 }
196}
197
198static void allocate_bitmap_nodes(struct super_block *p_s_sb) {
199 int i ;
200 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
201 struct reiserfs_bitmap_node *bn = NULL ;
202 for (i = 0 ; i < REISERFS_MIN_BITMAP_NODES ; i++) {
203 bn = allocate_bitmap_node(p_s_sb) ;
204 if (bn) {
205 list_add(&bn->list, &journal->j_bitmap_nodes) ;
206 journal->j_free_bitmap_nodes++ ;
207 } else {
208 break ; // this is ok, we'll try again when more are needed
209 }
210 }
211}
212
213static int set_bit_in_list_bitmap(struct super_block *p_s_sb, int block,
214 struct reiserfs_list_bitmap *jb) {
215 int bmap_nr = block / (p_s_sb->s_blocksize << 3) ;
216 int bit_nr = block % (p_s_sb->s_blocksize << 3) ;
217
218 if (!jb->bitmaps[bmap_nr]) {
219 jb->bitmaps[bmap_nr] = get_bitmap_node(p_s_sb) ;
220 }
221 set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data) ;
222 return 0 ;
223}
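/*
 * Editorial note, not part of the original source: each bitmap node
 * covers one block's worth of bits.  With a 4096-byte blocksize that is
 * 4096 << 3 == 32768 bits per node, so block 100000 lands in bitmap
 * node 100000 / 32768 == 3, at bit 100000 % 32768 == 1696.
 */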
224
225static void cleanup_bitmap_list(struct super_block *p_s_sb,
226 struct reiserfs_list_bitmap *jb) {
227 int i;
228 if (jb->bitmaps == NULL)
229 return;
230
231 for (i = 0 ; i < SB_BMAP_NR(p_s_sb) ; i++) {
232 if (jb->bitmaps[i]) {
233 free_bitmap_node(p_s_sb, jb->bitmaps[i]) ;
234 jb->bitmaps[i] = NULL ;
235 }
236 }
237}
238
239/*
240** only call this on FS unmount.
241*/
242static int free_list_bitmaps(struct super_block *p_s_sb,
243 struct reiserfs_list_bitmap *jb_array) {
244 int i ;
245 struct reiserfs_list_bitmap *jb ;
246 for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) {
247 jb = jb_array + i ;
248 jb->journal_list = NULL ;
249 cleanup_bitmap_list(p_s_sb, jb) ;
250 vfree(jb->bitmaps) ;
251 jb->bitmaps = NULL ;
252 }
253 return 0;
254}
255
256static int free_bitmap_nodes(struct super_block *p_s_sb) {
257 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
258 struct list_head *next = journal->j_bitmap_nodes.next ;
259 struct reiserfs_bitmap_node *bn ;
260
261 while(next != &journal->j_bitmap_nodes) {
262 bn = list_entry(next, struct reiserfs_bitmap_node, list) ;
263 list_del(next) ;
264 reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb) ;
265 reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ;
266 next = journal->j_bitmap_nodes.next ;
267 journal->j_free_bitmap_nodes-- ;
268 }
269
270 return 0 ;
271}
272
273/*
274** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
275** jb_array is the array to be filled in.
276*/
277int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb,
278 struct reiserfs_list_bitmap *jb_array,
279 int bmap_nr) {
280 int i ;
281 int failed = 0 ;
282 struct reiserfs_list_bitmap *jb ;
283 int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *) ;
284
285 for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) {
286 jb = jb_array + i ;
287 jb->journal_list = NULL ;
288 jb->bitmaps = vmalloc( mem ) ;
289 if (!jb->bitmaps) {
290 reiserfs_warning(p_s_sb, "clm-2000, unable to allocate bitmaps for journal lists") ;
291 failed = 1;
292 break ;
293 }
294 memset(jb->bitmaps, 0, mem) ;
295 }
296 if (failed) {
297 free_list_bitmaps(p_s_sb, jb_array) ;
298 return -1 ;
299 }
300 return 0 ;
301}
302
303/*
304** find an available list bitmap. If you can't find one, flush a commit list
305** and try again
306*/
307static struct reiserfs_list_bitmap *
308get_list_bitmap(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) {
309 int i,j ;
310 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
311 struct reiserfs_list_bitmap *jb = NULL ;
312
313 for (j = 0 ; j < (JOURNAL_NUM_BITMAPS * 3) ; j++) {
314 i = journal->j_list_bitmap_index ;
315 journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS ;
316 jb = journal->j_list_bitmap + i ;
317 if (journal->j_list_bitmap[i].journal_list) {
318 flush_commit_list(p_s_sb, journal->j_list_bitmap[i].journal_list, 1) ;
319 if (!journal->j_list_bitmap[i].journal_list) {
320 break ;
321 }
322 } else {
323 break ;
324 }
325 }
326  if (jb->journal_list) { /* double check to make sure it flushed correctly */
327 return NULL ;
328 }
329 jb->journal_list = jl ;
330 return jb ;
331}
332
333/*
334** allocates a new chunk of X nodes, and links them all together as a list.
335** Uses the cnode->next and cnode->prev pointers
336** returns NULL on failure
337*/
338static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) {
339 struct reiserfs_journal_cnode *head ;
340 int i ;
341 if (num_cnodes <= 0) {
342 return NULL ;
343 }
344 head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)) ;
345 if (!head) {
346 return NULL ;
347 }
348 memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)) ;
349 head[0].prev = NULL ;
350 head[0].next = head + 1 ;
351 for (i = 1 ; i < num_cnodes; i++) {
352 head[i].prev = head + (i - 1) ;
353    head[i].next = head + (i + 1) ; /* if last one, overwrite it after the loop */
354 }
355 head[num_cnodes -1].next = NULL ;
356 return head ;
357}
358
359/*
360** pulls a cnode off the free list, or returns NULL on failure
361*/
362static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb) {
363 struct reiserfs_journal_cnode *cn ;
364 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
365
366 reiserfs_check_lock_depth(p_s_sb, "get_cnode") ;
367
368 if (journal->j_cnode_free <= 0) {
369 return NULL ;
370 }
371 journal->j_cnode_used++ ;
372 journal->j_cnode_free-- ;
373 cn = journal->j_cnode_free_list ;
374 if (!cn) {
375 return cn ;
376 }
377 if (cn->next) {
378 cn->next->prev = NULL ;
379 }
380 journal->j_cnode_free_list = cn->next ;
381 memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ;
382 return cn ;
383}
384
385/*
386** returns a cnode to the free list
387*/
388static void free_cnode(struct super_block *p_s_sb, struct reiserfs_journal_cnode *cn) {
389 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
390
391 reiserfs_check_lock_depth(p_s_sb, "free_cnode") ;
392
393 journal->j_cnode_used-- ;
394 journal->j_cnode_free++ ;
395 /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */
396 cn->next = journal->j_cnode_free_list ;
397 if (journal->j_cnode_free_list) {
398 journal->j_cnode_free_list->prev = cn ;
399 }
400 cn->prev = NULL ; /* not needed with the memset, but I might kill the memset, and forget to do this */
401 journal->j_cnode_free_list = cn ;
402}
403
404static void clear_prepared_bits(struct buffer_head *bh) {
405 clear_buffer_journal_prepared (bh);
406 clear_buffer_journal_restore_dirty (bh);
407}
408
409/* utility function to force a panic if it is called without the big
410** kernel lock held. caller is the string printed just before the panic message
411*/
412void reiserfs_check_lock_depth(struct super_block *sb, char *caller) {
413#ifdef CONFIG_SMP
414 if (current->lock_depth < 0) {
415 reiserfs_panic (sb, "%s called without kernel lock held", caller) ;
416 }
417#else
418 ;
419#endif
420}
421
422/* return a cnode with the same superblock and block number from the table, or NULL if not found */
423static inline struct reiserfs_journal_cnode *
424get_journal_hash_dev(struct super_block *sb,
425 struct reiserfs_journal_cnode **table,
426 long bl)
427{
428 struct reiserfs_journal_cnode *cn ;
429 cn = journal_hash(table, sb, bl) ;
430 while(cn) {
431 if (cn->blocknr == bl && cn->sb == sb)
432 return cn ;
433 cn = cn->hnext ;
434 }
435 return (struct reiserfs_journal_cnode *)0 ;
436}
437
438/*
439** this actually means 'can this block be reallocated yet?'. If you set search_all, a block can only be allocated
440** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever
441** being overwritten by a replay after crashing.
442**
443** If you don't set search_all, a block can only be allocated if it is not in the current transaction. Since deleting
444** a block removes it from the current transaction, this case should never happen. If you don't set search_all, make
445** sure you never write the block without logging it.
446**
447** next_zero_bit is a suggestion about the next block to try for find_forward.
448** when bl is rejected because it is set in a journal list bitmap, we search
449** for the next zero bit in the bitmap that rejected bl. Then, we return that
450** through next_zero_bit for find_forward to try.
451**
452** Just because we return something in next_zero_bit does not mean we won't
453** reject it on the next call to reiserfs_in_journal
454**
455*/
456int reiserfs_in_journal(struct super_block *p_s_sb,
457 int bmap_nr, int bit_nr, int search_all,
458 b_blocknr_t *next_zero_bit) {
459 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
460 struct reiserfs_journal_cnode *cn ;
461 struct reiserfs_list_bitmap *jb ;
462 int i ;
463 unsigned long bl;
464
465 *next_zero_bit = 0 ; /* always start this at zero. */
466
467 PROC_INFO_INC( p_s_sb, journal.in_journal );
468 /* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
469 ** if we crash before the transaction that freed it commits, this transaction won't
470 ** have committed either, and the block will never be written
471 */
472 if (search_all) {
473 for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) {
474 PROC_INFO_INC( p_s_sb, journal.in_journal_bitmap );
475 jb = journal->j_list_bitmap + i ;
476 if (jb->journal_list && jb->bitmaps[bmap_nr] &&
477 test_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data)) {
478 *next_zero_bit = find_next_zero_bit((unsigned long *)
479 (jb->bitmaps[bmap_nr]->data),
480 p_s_sb->s_blocksize << 3, bit_nr+1) ;
481 return 1 ;
482 }
483 }
484 }
485
486 bl = bmap_nr * (p_s_sb->s_blocksize << 3) + bit_nr;
487 /* is it in any old transactions? */
488 if (search_all && (cn = get_journal_hash_dev(p_s_sb, journal->j_list_hash_table, bl))) {
489 return 1;
490 }
491
492  /* is it in the current transaction? This should never happen */
493 if ((cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, bl))) {
494 BUG();
495 return 1;
496 }
497
498 PROC_INFO_INC( p_s_sb, journal.in_journal_reusable );
499 /* safe for reuse */
500 return 0 ;
501}
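/*
 * Editorial sketch, not part of the original source: how a block
 * allocator is expected to consult reiserfs_in_journal() before reusing
 * a block (compare the real caller in bitmap.c).  The helper name and
 * structure are hypothetical.
 */
#if 0
static int example_block_reusable(struct super_block *sb, int bmap_nr, int bit_nr)
{
	b_blocknr_t next;

	/* search_all == 1: also reject blocks freed by committed but not
	 * yet flushed transactions, which a replay could overwrite. */
	if (reiserfs_in_journal(sb, bmap_nr, bit_nr, 1, &next))
		return 0;	/* still pinned by the journal */
	return 1;
}
#endif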
502
503/* insert cn into table
504*/
505static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, struct reiserfs_journal_cnode *cn) {
506 struct reiserfs_journal_cnode *cn_orig ;
507
508 cn_orig = journal_hash(table, cn->sb, cn->blocknr) ;
509 cn->hnext = cn_orig ;
510 cn->hprev = NULL ;
511 if (cn_orig) {
512 cn_orig->hprev = cn ;
513 }
514 journal_hash(table, cn->sb, cn->blocknr) = cn ;
515}
516
517/* lock the current transaction */
518static inline void lock_journal(struct super_block *p_s_sb) {
519 PROC_INFO_INC( p_s_sb, journal.lock_journal );
520 down(&SB_JOURNAL(p_s_sb)->j_lock);
521}
522
523/* unlock the current transaction */
524static inline void unlock_journal(struct super_block *p_s_sb) {
525 up(&SB_JOURNAL(p_s_sb)->j_lock);
526}
527
528static inline void get_journal_list(struct reiserfs_journal_list *jl)
529{
530 jl->j_refcount++;
531}
532
533static inline void put_journal_list(struct super_block *s,
534 struct reiserfs_journal_list *jl)
535{
536 if (jl->j_refcount < 1) {
537 reiserfs_panic (s, "trans id %lu, refcount at %d", jl->j_trans_id,
538 jl->j_refcount);
539 }
540 if (--jl->j_refcount == 0)
541 reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s);
542}
543
544/*
545** this used to be much more involved, and I'm keeping it just in case things get ugly again.
546** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a
547** transaction.
548*/
549static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) {
550
551 struct reiserfs_list_bitmap *jb = jl->j_list_bitmap ;
552 if (jb) {
553 cleanup_bitmap_list(p_s_sb, jb) ;
554 }
555 jl->j_list_bitmap->journal_list = NULL ;
556 jl->j_list_bitmap = NULL ;
557}
558
559static int journal_list_still_alive(struct super_block *s,
560 unsigned long trans_id)
561{
562 struct reiserfs_journal *journal = SB_JOURNAL (s);
563 struct list_head *entry = &journal->j_journal_list;
564 struct reiserfs_journal_list *jl;
565
566 if (!list_empty(entry)) {
567 jl = JOURNAL_LIST_ENTRY(entry->next);
568 if (jl->j_trans_id <= trans_id) {
569 return 1;
570 }
571 }
572 return 0;
573}
574
575static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
576 char b[BDEVNAME_SIZE];
577
578 if (buffer_journaled(bh)) {
579 reiserfs_warning(NULL, "clm-2084: pinned buffer %lu:%s sent to disk",
580 bh->b_blocknr, bdevname(bh->b_bdev, b)) ;
581 }
582 if (uptodate)
583 set_buffer_uptodate(bh) ;
584 else
585 clear_buffer_uptodate(bh) ;
586 unlock_buffer(bh) ;
587 put_bh(bh) ;
588}
589
590static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) {
591 if (uptodate)
592 set_buffer_uptodate(bh) ;
593 else
594 clear_buffer_uptodate(bh) ;
595 unlock_buffer(bh) ;
596 put_bh(bh) ;
597}
598
599static void submit_logged_buffer(struct buffer_head *bh) {
600 get_bh(bh) ;
601 bh->b_end_io = reiserfs_end_buffer_io_sync ;
602 clear_buffer_journal_new (bh);
603 clear_buffer_dirty(bh) ;
604 if (!test_clear_buffer_journal_test (bh))
605 BUG();
606 if (!buffer_uptodate(bh))
607 BUG();
608 submit_bh(WRITE, bh) ;
609}
610
611static void submit_ordered_buffer(struct buffer_head *bh) {
612 get_bh(bh) ;
613 bh->b_end_io = reiserfs_end_ordered_io;
614 clear_buffer_dirty(bh) ;
615 if (!buffer_uptodate(bh))
616 BUG();
617 submit_bh(WRITE, bh) ;
618}
619
620static int submit_barrier_buffer(struct buffer_head *bh) {
621 get_bh(bh) ;
622 bh->b_end_io = reiserfs_end_ordered_io;
623 clear_buffer_dirty(bh) ;
624 if (!buffer_uptodate(bh))
625 BUG();
626 return submit_bh(WRITE_BARRIER, bh) ;
627}
628
629static void check_barrier_completion(struct super_block *s,
630 struct buffer_head *bh) {
631 if (buffer_eopnotsupp(bh)) {
632 clear_buffer_eopnotsupp(bh);
633 disable_barrier(s);
634 set_buffer_uptodate(bh);
635 set_buffer_dirty(bh);
636 sync_dirty_buffer(bh);
637 }
638}
639
640#define CHUNK_SIZE 32
641struct buffer_chunk {
642 struct buffer_head *bh[CHUNK_SIZE];
643 int nr;
644};
645
646static void write_chunk(struct buffer_chunk *chunk) {
647 int i;
648 for (i = 0; i < chunk->nr ; i++) {
649 submit_logged_buffer(chunk->bh[i]) ;
650 }
651 chunk->nr = 0;
652}
653
654static void write_ordered_chunk(struct buffer_chunk *chunk) {
655 int i;
656 for (i = 0; i < chunk->nr ; i++) {
657 submit_ordered_buffer(chunk->bh[i]) ;
658 }
659 chunk->nr = 0;
660}
661
662static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
663 spinlock_t *lock,
664 void (fn)(struct buffer_chunk *))
665{
666 int ret = 0;
667 if (chunk->nr >= CHUNK_SIZE)
668 BUG();
669 chunk->bh[chunk->nr++] = bh;
670 if (chunk->nr >= CHUNK_SIZE) {
671 ret = 1;
672 if (lock)
673 spin_unlock(lock);
674 fn(chunk);
675 if (lock)
676 spin_lock(lock);
677 }
678 return ret;
679}
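/*
 * Editorial note, not part of the original source: add_to_chunk() drops
 * the caller's spinlock around the actual submission because submit_bh()
 * can sleep.  Whenever it returns 1 the caller must assume the list it
 * is walking may have changed; write_ordered_buffers() below re-reads
 * the list head on every iteration for exactly that reason.
 */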
680
681
682static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
683static struct reiserfs_jh *alloc_jh(void) {
684 struct reiserfs_jh *jh;
685 while(1) {
686 jh = kmalloc(sizeof(*jh), GFP_NOFS);
687 if (jh) {
688 atomic_inc(&nr_reiserfs_jh);
689 return jh;
690 }
691 yield();
692 }
693}
694
695/*
696 * we want to free the jh when the buffer has been written
697 * and waited on
698 */
699void reiserfs_free_jh(struct buffer_head *bh) {
700 struct reiserfs_jh *jh;
701
702 jh = bh->b_private;
703 if (jh) {
704 bh->b_private = NULL;
705 jh->bh = NULL;
706 list_del_init(&jh->list);
707 kfree(jh);
708 if (atomic_read(&nr_reiserfs_jh) <= 0)
709 BUG();
710 atomic_dec(&nr_reiserfs_jh);
711 put_bh(bh);
712 }
713}
714
715static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
716 int tail)
717{
718 struct reiserfs_jh *jh;
719
720 if (bh->b_private) {
721 spin_lock(&j->j_dirty_buffers_lock);
722 if (!bh->b_private) {
723 spin_unlock(&j->j_dirty_buffers_lock);
724 goto no_jh;
725 }
726 jh = bh->b_private;
727 list_del_init(&jh->list);
728 } else {
729no_jh:
730 get_bh(bh);
731 jh = alloc_jh();
732 spin_lock(&j->j_dirty_buffers_lock);
733 /* buffer must be locked for __add_jh, should be able to have
734 * two adds at the same time
735 */
736 if (bh->b_private)
737 BUG();
738 jh->bh = bh;
739 bh->b_private = jh;
740 }
741 jh->jl = j->j_current_jl;
742 if (tail)
743 list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
744 else {
745 list_add_tail(&jh->list, &jh->jl->j_bh_list);
746 }
747 spin_unlock(&j->j_dirty_buffers_lock);
748 return 0;
749}
750
751int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) {
752 return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
753}
754int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) {
755 return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
756}
757
758#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list)
759static int write_ordered_buffers(spinlock_t *lock,
760 struct reiserfs_journal *j,
761 struct reiserfs_journal_list *jl,
762 struct list_head *list)
763{
764 struct buffer_head *bh;
765 struct reiserfs_jh *jh;
766 int ret = j->j_errno;
767 struct buffer_chunk chunk;
768 struct list_head tmp;
769 INIT_LIST_HEAD(&tmp);
770
771 chunk.nr = 0;
772 spin_lock(lock);
773 while(!list_empty(list)) {
774 jh = JH_ENTRY(list->next);
775 bh = jh->bh;
776 get_bh(bh);
777 if (test_set_buffer_locked(bh)) {
778 if (!buffer_dirty(bh)) {
779 list_del_init(&jh->list);
780 list_add(&jh->list, &tmp);
781 goto loop_next;
782 }
783 spin_unlock(lock);
784 if (chunk.nr)
785 write_ordered_chunk(&chunk);
786 wait_on_buffer(bh);
787 cond_resched();
788 spin_lock(lock);
789 goto loop_next;
790 }
791 if (buffer_dirty(bh)) {
792 list_del_init(&jh->list);
793 list_add(&jh->list, &tmp);
794 add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
795 } else {
796 reiserfs_free_jh(bh);
797 unlock_buffer(bh);
798 }
799loop_next:
800 put_bh(bh);
801 cond_resched_lock(lock);
802 }
803 if (chunk.nr) {
804 spin_unlock(lock);
805 write_ordered_chunk(&chunk);
806 spin_lock(lock);
807 }
808 while(!list_empty(&tmp)) {
809 jh = JH_ENTRY(tmp.prev);
810 bh = jh->bh;
811 get_bh(bh);
812 reiserfs_free_jh(bh);
813
814 if (buffer_locked(bh)) {
815 spin_unlock(lock);
816 wait_on_buffer(bh);
817 spin_lock(lock);
818 }
819 if (!buffer_uptodate(bh)) {
820 ret = -EIO;
821 }
822 put_bh(bh);
823 cond_resched_lock(lock);
824 }
825 spin_unlock(lock);
826 return ret;
827}
828
829static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) {
830 struct reiserfs_journal *journal = SB_JOURNAL (s);
831 struct reiserfs_journal_list *other_jl;
832 struct reiserfs_journal_list *first_jl;
833 struct list_head *entry;
834 unsigned long trans_id = jl->j_trans_id;
835 unsigned long other_trans_id;
836 unsigned long first_trans_id;
837
838find_first:
839 /*
840	 * first we walk backwards to find the oldest uncommitted transaction
841 */
842 first_jl = jl;
843 entry = jl->j_list.prev;
844 while(1) {
845 other_jl = JOURNAL_LIST_ENTRY(entry);
846 if (entry == &journal->j_journal_list ||
847 atomic_read(&other_jl->j_older_commits_done))
848 break;
849
850 first_jl = other_jl;
851 entry = other_jl->j_list.prev;
852 }
853
854 /* if we didn't find any older uncommitted transactions, return now */
855 if (first_jl == jl) {
856 return 0;
857 }
858
859 first_trans_id = first_jl->j_trans_id;
860
861 entry = &first_jl->j_list;
862 while(1) {
863 other_jl = JOURNAL_LIST_ENTRY(entry);
864 other_trans_id = other_jl->j_trans_id;
865
866 if (other_trans_id < trans_id) {
867 if (atomic_read(&other_jl->j_commit_left) != 0) {
868 flush_commit_list(s, other_jl, 0);
869
870 /* list we were called with is gone, return */
871 if (!journal_list_still_alive(s, trans_id))
872 return 1;
873
874 /* the one we just flushed is gone, this means all
875 * older lists are also gone, so first_jl is no longer
876 * valid either. Go back to the beginning.
877 */
878 if (!journal_list_still_alive(s, other_trans_id)) {
879 goto find_first;
880 }
881 }
882 entry = entry->next;
883 if (entry == &journal->j_journal_list)
884 return 0;
885 } else {
886 return 0;
887 }
888 }
889 return 0;
890}
891int reiserfs_async_progress_wait(struct super_block *s) {
892 DEFINE_WAIT(wait);
893 struct reiserfs_journal *j = SB_JOURNAL(s);
894 if (atomic_read(&j->j_async_throttle))
895 blk_congestion_wait(WRITE, HZ/10);
896 return 0;
897}
898
899/*
900** if this journal list still has commit blocks unflushed, send them to disk.
901**
902** log areas must be flushed in order (transaction 2 can't commit before transaction 1)
903** Before the commit block can be written, every other log block must be safely on disk
904**
905*/
906static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) {
907 int i;
908 int bn ;
909 struct buffer_head *tbh = NULL ;
910 unsigned long trans_id = jl->j_trans_id;
911 struct reiserfs_journal *journal = SB_JOURNAL (s);
912 int barrier = 0;
913 int retval = 0;
914
915 reiserfs_check_lock_depth(s, "flush_commit_list") ;
916
917 if (atomic_read(&jl->j_older_commits_done)) {
918 return 0 ;
919 }
920
921 /* before we can put our commit blocks on disk, we have to make sure everyone older than
922 ** us is on disk too
923 */
924 BUG_ON (jl->j_len <= 0);
925 BUG_ON (trans_id == journal->j_trans_id);
926
927 get_journal_list(jl);
928 if (flushall) {
929 if (flush_older_commits(s, jl) == 1) {
930 /* list disappeared during flush_older_commits. return */
931 goto put_jl;
932 }
933 }
934
935 /* make sure nobody is trying to flush this one at the same time */
936 down(&jl->j_commit_lock);
937 if (!journal_list_still_alive(s, trans_id)) {
938 up(&jl->j_commit_lock);
939 goto put_jl;
940 }
941 BUG_ON (jl->j_trans_id == 0);
942
943 /* this commit is done, exit */
944 if (atomic_read(&(jl->j_commit_left)) <= 0) {
945 if (flushall) {
946 atomic_set(&(jl->j_older_commits_done), 1) ;
947 }
948 up(&jl->j_commit_lock);
949 goto put_jl;
950 }
951
952 if (!list_empty(&jl->j_bh_list)) {
953 unlock_kernel();
954 write_ordered_buffers(&journal->j_dirty_buffers_lock,
955 journal, jl, &jl->j_bh_list);
956 lock_kernel();
957 }
958 BUG_ON (!list_empty(&jl->j_bh_list));
959 /*
960 * for the description block and all the log blocks, submit any buffers
961 * that haven't already reached the disk
962 */
963 atomic_inc(&journal->j_async_throttle);
964 for (i = 0 ; i < (jl->j_len + 1) ; i++) {
965 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) %
966 SB_ONDISK_JOURNAL_SIZE(s);
967 tbh = journal_find_get_block(s, bn) ;
968 if (buffer_dirty(tbh)) /* redundant, ll_rw_block() checks */
969 ll_rw_block(WRITE, 1, &tbh) ;
970 put_bh(tbh) ;
971 }
972 atomic_dec(&journal->j_async_throttle);
973
974 /* wait on everything written so far before writing the commit
975 * if we are in barrier mode, send the commit down now
976 */
977 barrier = reiserfs_barrier_flush(s);
978 if (barrier) {
979 int ret;
980 lock_buffer(jl->j_commit_bh);
981 ret = submit_barrier_buffer(jl->j_commit_bh);
982 if (ret == -EOPNOTSUPP) {
983 set_buffer_uptodate(jl->j_commit_bh);
984 disable_barrier(s);
985 barrier = 0;
986 }
987 }
988 for (i = 0 ; i < (jl->j_len + 1) ; i++) {
989 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
990 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ;
991 tbh = journal_find_get_block(s, bn) ;
992 wait_on_buffer(tbh) ;
993    // since we're using ll_rw_block above, it might have skipped over
994 // a locked buffer. Double check here
995 //
996 if (buffer_dirty(tbh)) /* redundant, sync_dirty_buffer() checks */
997 sync_dirty_buffer(tbh);
998 if (unlikely (!buffer_uptodate(tbh))) {
999#ifdef CONFIG_REISERFS_CHECK
1000 reiserfs_warning(s, "journal-601, buffer write failed") ;
1001#endif
1002 retval = -EIO;
1003 }
1004 put_bh(tbh) ; /* once for journal_find_get_block */
1005 put_bh(tbh) ; /* once due to original getblk in do_journal_end */
1006 atomic_dec(&(jl->j_commit_left)) ;
1007 }
1008
1009 BUG_ON (atomic_read(&(jl->j_commit_left)) != 1);
1010
1011 if (!barrier) {
1012 if (buffer_dirty(jl->j_commit_bh))
1013 BUG();
1014 mark_buffer_dirty(jl->j_commit_bh) ;
1015 sync_dirty_buffer(jl->j_commit_bh) ;
1016 } else
1017 wait_on_buffer(jl->j_commit_bh);
1018
1019 check_barrier_completion(s, jl->j_commit_bh);
1020
1021 /* If there was a write error in the journal - we can't commit this
1022 * transaction - it will be invalid and, if successful, will just end
1023	 * up propagating the write error out to the filesystem. */
1024 if (unlikely (!buffer_uptodate(jl->j_commit_bh))) {
1025#ifdef CONFIG_REISERFS_CHECK
1026 reiserfs_warning(s, "journal-615: buffer write failed") ;
1027#endif
1028 retval = -EIO;
1029 }
1030 bforget(jl->j_commit_bh) ;
1031 if (journal->j_last_commit_id != 0 &&
1032 (jl->j_trans_id - journal->j_last_commit_id) != 1) {
1033 reiserfs_warning(s, "clm-2200: last commit %lu, current %lu",
1034 journal->j_last_commit_id,
1035 jl->j_trans_id);
1036 }
1037 journal->j_last_commit_id = jl->j_trans_id;
1038
1039 /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */
1040 cleanup_freed_for_journal_list(s, jl) ;
1041
1042 retval = retval ? retval : journal->j_errno;
1043
1044 /* mark the metadata dirty */
1045 if (!retval)
1046 dirty_one_transaction(s, jl);
1047 atomic_dec(&(jl->j_commit_left)) ;
1048
1049 if (flushall) {
1050 atomic_set(&(jl->j_older_commits_done), 1) ;
1051 }
1052 up(&jl->j_commit_lock);
1053put_jl:
1054 put_journal_list(s, jl);
1055
1056 if (retval)
1057 reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__);
1058 return retval;
1059}
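/*
 * Editorial note, not part of the original source: the ordering
 * guarantee above is "log blocks first, commit block last".  In barrier
 * mode the commit block is submitted early with WRITE_BARRIER and only
 * waited on afterwards; if the device rejects barriers (synchronously
 * with -EOPNOTSUPP, or asynchronously via buffer_eopnotsupp() handled in
 * check_barrier_completion()), the code falls back to a plain
 * synchronous write of the commit block.
 */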
1060
1061/*
1062** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or
1063** returns NULL if it can't find anything
1064*/
1065static struct reiserfs_journal_list *find_newer_jl_for_cn(struct reiserfs_journal_cnode *cn) {
1066 struct super_block *sb = cn->sb;
1067 b_blocknr_t blocknr = cn->blocknr ;
1068
1069 cn = cn->hprev ;
1070 while(cn) {
1071 if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) {
1072 return cn->jlist ;
1073 }
1074 cn = cn->hprev ;
1075 }
1076 return NULL ;
1077}
1078
1079static void remove_journal_hash(struct super_block *, struct reiserfs_journal_cnode **,
1080struct reiserfs_journal_list *, unsigned long, int);
1081
1082/*
1083** once all the real blocks have been flushed, it is safe to remove them from the
1084** journal list for this transaction. Aside from freeing the cnode, this also allows the
1085** block to be reallocated for data blocks if it had been deleted.
1086*/
1087static void remove_all_from_journal_list(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, int debug) {
1088 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
1089 struct reiserfs_journal_cnode *cn, *last ;
1090 cn = jl->j_realblock ;
1091
1092 /* which is better, to lock once around the whole loop, or
1093 ** to lock for each call to remove_journal_hash?
1094 */
1095 while(cn) {
1096 if (cn->blocknr != 0) {
1097 if (debug) {
1098 reiserfs_warning (p_s_sb, "block %u, bh is %d, state %ld", cn->blocknr,
1099 cn->bh ? 1: 0, cn->state) ;
1100 }
1101 cn->state = 0 ;
1102 remove_journal_hash(p_s_sb, journal->j_list_hash_table, jl, cn->blocknr, 1) ;
1103 }
1104 last = cn ;
1105 cn = cn->next ;
1106 free_cnode(p_s_sb, last) ;
1107 }
1108 jl->j_realblock = NULL ;
1109}
1110
1111/*
1112** if this timestamp is greater than the timestamp we wrote last to the header block, write it to the header block.
1113** once this is done, I can safely say the log area for this transaction won't ever be replayed, and I can start
1114** releasing blocks in this transaction for reuse as data blocks.
1115** called by flush_journal_list, before it calls remove_all_from_journal_list
1116**
1117*/
1118static int _update_journal_header_block(struct super_block *p_s_sb, unsigned long offset, unsigned long trans_id) {
1119 struct reiserfs_journal_header *jh ;
1120 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
1121
1122 if (reiserfs_is_journal_aborted (journal))
1123 return -EIO;
1124
1125 if (trans_id >= journal->j_last_flush_trans_id) {
1126 if (buffer_locked((journal->j_header_bh))) {
1127 wait_on_buffer((journal->j_header_bh)) ;
1128 if (unlikely (!buffer_uptodate(journal->j_header_bh))) {
1129#ifdef CONFIG_REISERFS_CHECK
1130 reiserfs_warning (p_s_sb, "journal-699: buffer write failed") ;
1131#endif
1132 return -EIO;
1133 }
1134 }
1135 journal->j_last_flush_trans_id = trans_id ;
1136 journal->j_first_unflushed_offset = offset ;
1137 jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data) ;
1138 jh->j_last_flush_trans_id = cpu_to_le32(trans_id) ;
1139 jh->j_first_unflushed_offset = cpu_to_le32(offset) ;
1140 jh->j_mount_id = cpu_to_le32(journal->j_mount_id) ;
1141
1142 if (reiserfs_barrier_flush(p_s_sb)) {
1143 int ret;
1144 lock_buffer(journal->j_header_bh);
1145 ret = submit_barrier_buffer(journal->j_header_bh);
1146 if (ret == -EOPNOTSUPP) {
1147 set_buffer_uptodate(journal->j_header_bh);
1148 disable_barrier(p_s_sb);
1149 goto sync;
1150 }
1151 wait_on_buffer(journal->j_header_bh);
1152 check_barrier_completion(p_s_sb, journal->j_header_bh);
1153 } else {
1154sync:
1155 set_buffer_dirty(journal->j_header_bh) ;
1156 sync_dirty_buffer(journal->j_header_bh) ;
1157 }
1158 if (!buffer_uptodate(journal->j_header_bh)) {
1159 reiserfs_warning (p_s_sb, "journal-837: IO error during journal replay");
1160 return -EIO ;
1161 }
1162 }
1163 return 0 ;
1164}
1165
1166static int update_journal_header_block(struct super_block *p_s_sb,
1167 unsigned long offset,
1168 unsigned long trans_id) {
1169 return _update_journal_header_block(p_s_sb, offset, trans_id);
1170}
1171/*
1172** flush any and all journal lists older than you are
1173** can only be called from flush_journal_list
1174*/
1175static int flush_older_journal_lists(struct super_block *p_s_sb,
1176 struct reiserfs_journal_list *jl)
1177{
1178 struct list_head *entry;
1179 struct reiserfs_journal_list *other_jl ;
1180 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
1181 unsigned long trans_id = jl->j_trans_id;
1182
1183 /* we know we are the only ones flushing things, no extra race
1184 * protection is required.
1185 */
1186restart:
1187 entry = journal->j_journal_list.next;
1188 /* Did we wrap? */
1189 if (entry == &journal->j_journal_list)
1190 return 0;
1191 other_jl = JOURNAL_LIST_ENTRY(entry);
1192 if (other_jl->j_trans_id < trans_id) {
1193 BUG_ON (other_jl->j_refcount <= 0);
1194 /* do not flush all */
1195 flush_journal_list(p_s_sb, other_jl, 0) ;
1196
1197 /* other_jl is now deleted from the list */
1198 goto restart;
1199 }
1200 return 0 ;
1201}
1202
1203static void del_from_work_list(struct super_block *s,
1204 struct reiserfs_journal_list *jl) {
1205 struct reiserfs_journal *journal = SB_JOURNAL (s);
1206 if (!list_empty(&jl->j_working_list)) {
1207 list_del_init(&jl->j_working_list);
1208 journal->j_num_work_lists--;
1209 }
1210}
1211
1212/* flush a journal list, both commit and real blocks
1213**
1214** always set flushall to 1, unless you are calling from inside
1215** flush_journal_list
1216**
1217** IMPORTANT. This can only be called while there are no journal writers,
1218** and the journal is locked. That means it can only be called from
1219** do_journal_end, or by journal_release
1220*/
1221static int flush_journal_list(struct super_block *s,
1222 struct reiserfs_journal_list *jl, int flushall) {
1223 struct reiserfs_journal_list *pjl ;
1224 struct reiserfs_journal_cnode *cn, *last ;
1225 int count ;
1226 int was_jwait = 0 ;
1227 int was_dirty = 0 ;
1228 struct buffer_head *saved_bh ;
1229 unsigned long j_len_saved = jl->j_len ;
1230 struct reiserfs_journal *journal = SB_JOURNAL (s);
1231 int err = 0;
1232
1233 BUG_ON (j_len_saved <= 0);
1234
1235 if (atomic_read(&journal->j_wcount) != 0) {
1236 reiserfs_warning(s, "clm-2048: flush_journal_list called with wcount %d",
1237 atomic_read(&journal->j_wcount)) ;
1238 }
1239 BUG_ON (jl->j_trans_id == 0);
1240
1241 /* if flushall == 0, the lock is already held */
1242 if (flushall) {
1243 down(&journal->j_flush_sem);
1244 } else if (!down_trylock(&journal->j_flush_sem)) {
1245 BUG();
1246 }
1247
1248 count = 0 ;
1249 if (j_len_saved > journal->j_trans_max) {
1250 reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, trans id %lu\n", j_len_saved, jl->j_trans_id);
1251 return 0 ;
1252 }
1253
1254 /* if all the work is already done, get out of here */
1255 if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
1256 atomic_read(&(jl->j_commit_left)) <= 0) {
1257 goto flush_older_and_return ;
1258 }
1259
1260 /* start by putting the commit list on disk. This will also flush
1261  ** the commit lists of any older transactions
1262 */
1263 flush_commit_list(s, jl, 1) ;
1264
1265 if (!(jl->j_state & LIST_DIRTY) && !reiserfs_is_journal_aborted (journal))
1266 BUG();
1267
1268 /* are we done now? */
1269 if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
1270 atomic_read(&(jl->j_commit_left)) <= 0) {
1271 goto flush_older_and_return ;
1272 }
1273
1274 /* loop through each cnode, see if we need to write it,
1275 ** or wait on a more recent transaction, or just ignore it
1276 */
1277 if (atomic_read(&(journal->j_wcount)) != 0) {
1278 reiserfs_panic(s, "journal-844: panic journal list is flushing, wcount is not 0\n") ;
1279 }
1280 cn = jl->j_realblock ;
1281 while(cn) {
1282 was_jwait = 0 ;
1283 was_dirty = 0 ;
1284 saved_bh = NULL ;
1285 /* blocknr of 0 is no longer in the hash, ignore it */
1286 if (cn->blocknr == 0) {
1287 goto free_cnode ;
1288 }
1289
1290 /* This transaction failed commit. Don't write out to the disk */
1291 if (!(jl->j_state & LIST_DIRTY))
1292 goto free_cnode;
1293
1294 pjl = find_newer_jl_for_cn(cn) ;
1295 /* the order is important here. We check pjl to make sure we
1296 ** don't clear BH_JDirty_wait if we aren't the one writing this
1297 ** block to disk
1298 */
1299 if (!pjl && cn->bh) {
1300 saved_bh = cn->bh ;
1301
1302 /* we do this to make sure nobody releases the buffer while
1303 ** we are working with it
1304 */
1305 get_bh(saved_bh) ;
1306
1307 if (buffer_journal_dirty(saved_bh)) {
1308 BUG_ON (!can_dirty (cn));
1309 was_jwait = 1 ;
1310 was_dirty = 1 ;
1311 } else if (can_dirty(cn)) {
1312 /* everything with !pjl && jwait should be writable */
1313 BUG();
1314 }
1315 }
1316
1317 /* if someone has this block in a newer transaction, just make
1318    ** sure they are committed, and don't try writing it to disk
1319 */
1320 if (pjl) {
1321 if (atomic_read(&pjl->j_commit_left))
1322 flush_commit_list(s, pjl, 1) ;
1323 goto free_cnode ;
1324 }
1325
1326 /* bh == NULL when the block got to disk on its own, OR,
1327 ** the block got freed in a future transaction
1328 */
1329 if (saved_bh == NULL) {
1330 goto free_cnode ;
1331 }
1332
1333 /* this should never happen. kupdate_one_transaction has this list
1334 ** locked while it works, so we should never see a buffer here that
1335 ** is not marked JDirty_wait
1336 */
1337 if ((!was_jwait) && !buffer_locked(saved_bh)) {
1338 reiserfs_warning (s, "journal-813: BAD! buffer %llu %cdirty %cjwait, "
1339                       "not in a newer transaction",
1340 (unsigned long long)saved_bh->b_blocknr,
1341 was_dirty ? ' ' : '!', was_jwait ? ' ' : '!') ;
1342 }
1343 if (was_dirty) {
1344 /* we inc again because saved_bh gets decremented at free_cnode */
1345 get_bh(saved_bh) ;
1346 set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
1347 lock_buffer(saved_bh);
1348 BUG_ON (cn->blocknr != saved_bh->b_blocknr);
1349 if (buffer_dirty(saved_bh))
1350 submit_logged_buffer(saved_bh) ;
1351 else
1352 unlock_buffer(saved_bh);
1353 count++ ;
1354 } else {
1355 reiserfs_warning (s, "clm-2082: Unable to flush buffer %llu in %s",
1356 (unsigned long long)saved_bh->b_blocknr, __FUNCTION__);
1357 }
1358free_cnode:
1359 last = cn ;
1360 cn = cn->next ;
1361 if (saved_bh) {
1362 /* we incremented this to keep others from taking the buffer head away */
1363 put_bh(saved_bh) ;
1364 if (atomic_read(&(saved_bh->b_count)) < 0) {
1365 reiserfs_warning (s, "journal-945: saved_bh->b_count < 0");
1366 }
1367 }
1368 }
1369 if (count > 0) {
1370 cn = jl->j_realblock ;
1371 while(cn) {
1372 if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
1373 if (!cn->bh) {
1374 reiserfs_panic(s, "journal-1011: cn->bh is NULL\n") ;
1375 }
1376 wait_on_buffer(cn->bh) ;
1377 if (!cn->bh) {
1378 reiserfs_panic(s, "journal-1012: cn->bh is NULL\n") ;
1379 }
1380 if (unlikely (!buffer_uptodate(cn->bh))) {
1381#ifdef CONFIG_REISERFS_CHECK
1382        reiserfs_warning(s, "journal-949: buffer write failed") ;
1383#endif
1384 err = -EIO;
1385 }
1386 /* note, we must clear the JDirty_wait bit after the up to date
1387 ** check, otherwise we race against our flushpage routine
1388 */
1389 BUG_ON (!test_clear_buffer_journal_dirty (cn->bh));
1390
1391 /* undo the inc from journal_mark_dirty */
1392 put_bh(cn->bh) ;
1393 brelse(cn->bh) ;
1394 }
1395 cn = cn->next ;
1396 }
1397 }
1398
1399 if (err)
1400 reiserfs_abort (s, -EIO, "Write error while pushing transaction to disk in %s", __FUNCTION__);
1401flush_older_and_return:
1402
1403
1404 /* before we can update the journal header block, we _must_ flush all
1405 ** real blocks from all older transactions to disk. This is because
1406 ** once the header block is updated, this transaction will not be
1407 ** replayed after a crash
1408 */
1409 if (flushall) {
1410 flush_older_journal_lists(s, jl);
1411 }
1412
1413 err = journal->j_errno;
1414 /* before we can remove everything from the hash tables for this
1415 ** transaction, we must make sure it can never be replayed
1416 **
1417 ** since we are only called from do_journal_end, we know for sure there
1418 ** are no allocations going on while we are flushing journal lists. So,
1419 ** we only need to update the journal header block for the last list
1420 ** being flushed
1421 */
1422 if (!err && flushall) {
1423 err = update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id) ;
1424 if (err)
1425 reiserfs_abort (s, -EIO, "Write error while updating journal header in %s", __FUNCTION__);
1426 }
1427 remove_all_from_journal_list(s, jl, 0) ;
1428 list_del_init(&jl->j_list);
1429 journal->j_num_lists--;
1430 del_from_work_list(s, jl);
1431
1432 if (journal->j_last_flush_id != 0 &&
1433 (jl->j_trans_id - journal->j_last_flush_id) != 1) {
1434 reiserfs_warning(s, "clm-2201: last flush %lu, current %lu",
1435 journal->j_last_flush_id,
1436 jl->j_trans_id);
1437 }
1438 journal->j_last_flush_id = jl->j_trans_id;
1439
1440 /* not strictly required since we are freeing the list, but it should
1441 * help find code using dead lists later on
1442 */
1443 jl->j_len = 0 ;
1444 atomic_set(&(jl->j_nonzerolen), 0) ;
1445 jl->j_start = 0 ;
1446 jl->j_realblock = NULL ;
1447 jl->j_commit_bh = NULL ;
1448 jl->j_trans_id = 0 ;
1449 jl->j_state = 0;
1450 put_journal_list(s, jl);
1451 if (flushall)
1452 up(&journal->j_flush_sem);
1453 return err ;
1454}
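/* Editor's note: the down()/down_trylock() pair at the top of
 * flush_journal_list() encodes its calling convention: flushall
 * callers take j_flush_sem themselves, while flushall == 0 callers
 * must already hold it, which is asserted by expecting down_trylock()
 * to fail. A userspace sketch of that assertion, using a default
 * (non-recursive) pthread mutex as a stand-in for the semaphore:
 */
#if 0 /* illustrative only */
#include <pthread.h>
#include <assert.h>

static void assert_flush_lock_held(pthread_mutex_t *flush_lock)
{
        /* trylock returns 0 only if the lock was free, i.e. the caller
         * violated the convention; EBUSY is the good case.  (Like the
         * kernel check, this cannot tell *who* holds the lock.) */
        assert(pthread_mutex_trylock(flush_lock) != 0);
}
#endif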
1455
1456static int write_one_transaction(struct super_block *s,
1457 struct reiserfs_journal_list *jl,
1458 struct buffer_chunk *chunk)
1459{
1460 struct reiserfs_journal_cnode *cn;
1461 int ret = 0 ;
1462
1463 jl->j_state |= LIST_TOUCHED;
1464 del_from_work_list(s, jl);
1465 if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
1466 return 0;
1467 }
1468
1469 cn = jl->j_realblock ;
1470 while(cn) {
1471 /* if the blocknr == 0, this has been cleared from the hash,
1472 ** skip it
1473 */
1474 if (cn->blocknr == 0) {
1475 goto next ;
1476 }
1477 if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
1478 struct buffer_head *tmp_bh;
1479 /* we can race against journal_mark_freed when we try
1480 * to lock_buffer(cn->bh), so we have to inc the buffer
1481 * count, and recheck things after locking
1482 */
1483 tmp_bh = cn->bh;
1484 get_bh(tmp_bh);
1485 lock_buffer(tmp_bh);
1486 if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
1487 if (!buffer_journal_dirty(tmp_bh) ||
1488 buffer_journal_prepared(tmp_bh))
1489 BUG();
1490 add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
1491 ret++;
1492 } else {
1493 /* note, cn->bh might be null now */
1494 unlock_buffer(tmp_bh);
1495 }
1496 put_bh(tmp_bh);
1497 }
1498next:
1499 cn = cn->next ;
1500 cond_resched();
1501 }
1502 return ret ;
1503}
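/* Editor's note: the get_bh()/lock_buffer()/recheck dance above is a
 * generic "pin, lock, re-validate" pattern: cn->bh can be torn down by
 * journal_mark_freed() right up until the buffer is locked, so nothing
 * read before lock_buffer() may be trusted afterwards. A compressed,
 * self-contained sketch of the pattern (struct object and its fields
 * are stand-ins, not reiserfs types):
 */
#if 0 /* illustrative only */
#include <pthread.h>
#include <stdatomic.h>

struct object {
        atomic_int refcount;
        pthread_mutex_t lock;
        int valid, dirty;
};

/* returns 1 if the object was still dirty once pinned and locked */
static int check_under_lock(struct object *obj)
{
        int doit;

        atomic_fetch_add(&obj->refcount, 1);  /* pin: get_bh()      */
        pthread_mutex_lock(&obj->lock);       /* lock_buffer()      */
        doit = obj->valid && obj->dirty;      /* recheck under lock */
        pthread_mutex_unlock(&obj->lock);
        atomic_fetch_sub(&obj->refcount, 1);  /* unpin: put_bh()    */
        return doit;
}
#endif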
1504
1505/* used by flush_commit_list */
1506static int dirty_one_transaction(struct super_block *s,
1507 struct reiserfs_journal_list *jl)
1508{
1509 struct reiserfs_journal_cnode *cn;
1510 struct reiserfs_journal_list *pjl;
1511 int ret = 0 ;
1512
1513 jl->j_state |= LIST_DIRTY;
1514 cn = jl->j_realblock ;
1515 while(cn) {
1516 /* look for a more recent transaction that logged this
1517 ** buffer. Only the most recent transaction with a buffer in
1518 ** it is allowed to send that buffer to disk
1519 */
1520 pjl = find_newer_jl_for_cn(cn) ;
1521 if (!pjl && cn->blocknr && cn->bh && buffer_journal_dirty(cn->bh))
1522 {
1523 BUG_ON (!can_dirty(cn));
1524 /* if the buffer is prepared, it will either be logged
1525 * or restored. If restored, we need to make sure
1526 * it actually gets marked dirty
1527 */
1528 clear_buffer_journal_new (cn->bh);
1529 if (buffer_journal_prepared (cn->bh)) {
1530 set_buffer_journal_restore_dirty (cn->bh);
1531 } else {
1532 set_buffer_journal_test (cn->bh);
1533 mark_buffer_dirty(cn->bh);
1534 }
1535 }
1536 cn = cn->next ;
1537 }
1538 return ret ;
1539}
1540
1541static int kupdate_transactions(struct super_block *s,
1542 struct reiserfs_journal_list *jl,
1543 struct reiserfs_journal_list **next_jl,
1544 unsigned long *next_trans_id,
1545 int num_blocks,
1546 int num_trans) {
1547 int ret = 0;
1548 int written = 0 ;
1549 int transactions_flushed = 0;
1550 unsigned long orig_trans_id = jl->j_trans_id;
1551 struct buffer_chunk chunk;
1552 struct list_head *entry;
1553 struct reiserfs_journal *journal = SB_JOURNAL (s);
1554 chunk.nr = 0;
1555
1556 down(&journal->j_flush_sem);
1557 if (!journal_list_still_alive(s, orig_trans_id)) {
1558 goto done;
1559 }
1560
1561 /* we've got j_flush_sem held, nobody is going to delete any
1562 * of these lists out from underneath us
1563 */
1564 while((num_trans && transactions_flushed < num_trans) ||
1565 (!num_trans && written < num_blocks)) {
1566
1567 if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
1568 atomic_read(&jl->j_commit_left) || !(jl->j_state & LIST_DIRTY))
1569 {
1570 del_from_work_list(s, jl);
1571 break;
1572 }
1573 ret = write_one_transaction(s, jl, &chunk);
1574
1575 if (ret < 0)
1576 goto done;
1577 transactions_flushed++;
1578 written += ret;
1579 entry = jl->j_list.next;
1580
1581 /* did we wrap? */
1582 if (entry == &journal->j_journal_list) {
1583 break;
1584 }
1585 jl = JOURNAL_LIST_ENTRY(entry);
1586
1587 /* don't bother with older transactions */
1588 if (jl->j_trans_id <= orig_trans_id)
1589 break;
1590 }
1591 if (chunk.nr) {
1592 write_chunk(&chunk);
1593 }
1594
1595done:
1596 up(&journal->j_flush_sem);
1597 return ret;
1598}
1599
1600/* O_SYNC and fsync heavy applications tend to use up
1601** all the journal list slots with tiny transactions. These
1602** trigger lots and lots of calls to update the header block, which
1603** adds seeks and slows things down.
1604**
1605** This function tries to clear out a large chunk of the journal lists
1606** at once, which makes everything faster since only the newest journal
1607** list updates the header block
1608*/
1609static int flush_used_journal_lists(struct super_block *s,
1610 struct reiserfs_journal_list *jl) {
1611 unsigned long len = 0;
1612 unsigned long cur_len;
1613 int ret;
1614 int i;
1615 int limit = 256;
1616 struct reiserfs_journal_list *tjl;
1617 struct reiserfs_journal_list *flush_jl;
1618 unsigned long trans_id;
1619 struct reiserfs_journal *journal = SB_JOURNAL (s);
1620
1621 flush_jl = tjl = jl;
1622
1623 /* in data logging mode, try harder to flush a lot of blocks */
1624 if (reiserfs_data_log(s))
1625 limit = 1024;
1626 /* flush for 256 transactions or limit blocks, whichever comes first */
1627 for(i = 0 ; i < 256 && len < limit ; i++) {
1628 if (atomic_read(&tjl->j_commit_left) ||
1629 tjl->j_trans_id < jl->j_trans_id) {
1630 break;
1631 }
1632 cur_len = atomic_read(&tjl->j_nonzerolen);
1633 if (cur_len > 0) {
1634 tjl->j_state &= ~LIST_TOUCHED;
1635 }
1636 len += cur_len;
1637 flush_jl = tjl;
1638 if (tjl->j_list.next == &journal->j_journal_list)
1639 break;
1640 tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
1641 }
1642 /* try to find a group of blocks we can flush across all the
1643 ** transactions, but only bother if we've actually spanned
1644 ** across multiple lists
1645 */
1646 if (flush_jl != jl) {
1647 ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
1648 }
1649 flush_journal_list(s, flush_jl, 1);
1650 return 0;
1651}
1652
1653/*
1654** removes any nodes in the table that match the given block and super.
1655** only touches the hnext and hprev pointers.
1656*/
1657void remove_journal_hash(struct super_block *sb,
1658 struct reiserfs_journal_cnode **table,
1659 struct reiserfs_journal_list *jl,
1660 unsigned long block, int remove_freed)
1661{
1662 struct reiserfs_journal_cnode *cur ;
1663 struct reiserfs_journal_cnode **head ;
1664
1665 head= &(journal_hash(table, sb, block)) ;
1666 if (!head) {
1667 return ;
1668 }
1669 cur = *head ;
1670 while(cur) {
1671 if (cur->blocknr == block && cur->sb == sb && (jl == NULL || jl == cur->jlist) &&
1672 (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) {
1673 if (cur->hnext) {
1674 cur->hnext->hprev = cur->hprev ;
1675 }
1676 if (cur->hprev) {
1677 cur->hprev->hnext = cur->hnext ;
1678 } else {
1679 *head = cur->hnext ;
1680 }
1681 cur->blocknr = 0 ;
1682 cur->sb = NULL ;
1683 cur->state = 0 ;
1684 if (cur->bh && cur->jlist) /* anybody who clears the cur->bh will also dec the nonzerolen */
1685 atomic_dec(&(cur->jlist->j_nonzerolen)) ;
1686 cur->bh = NULL ;
1687 cur->jlist = NULL ;
1688 }
1689 cur = cur->hnext ;
1690 }
1691}
1692
1693static void free_journal_ram(struct super_block *p_s_sb) {
1694 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
1695 reiserfs_kfree(journal->j_current_jl,
1696 sizeof(struct reiserfs_journal_list), p_s_sb);
1697 journal->j_num_lists--;
1698
1699 vfree(journal->j_cnode_free_orig) ;
1700 free_list_bitmaps(p_s_sb, journal->j_list_bitmap) ;
1701 free_bitmap_nodes(p_s_sb) ; /* must be after free_list_bitmaps */
1702 if (journal->j_header_bh) {
1703 brelse(journal->j_header_bh) ;
1704 }
1705 /* j_header_bh is on the journal dev, make sure not to release the journal
1706 * dev until we brelse j_header_bh
1707 */
1708 release_journal_dev(p_s_sb, journal);
1709 vfree(journal) ;
1710}
1711
1712/*
1713** call on unmount. Only set error to 1 if you haven't made your way out
1714** of read_super() yet. Any other caller must keep error at 0.
1715*/
1716static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, int error) {
1717 struct reiserfs_transaction_handle myth ;
1718 int flushed = 0;
1719 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
1720
1721 /* we only want to flush out transactions if we were called with error == 0
1722 */
1723 if (!error && !(p_s_sb->s_flags & MS_RDONLY)) {
1724 /* end the current trans */
1725 BUG_ON (!th->t_trans_id);
1726 do_journal_end(th, p_s_sb,10, FLUSH_ALL) ;
1727
1728 /* make sure something gets logged to force our way into the flush code */
1729 if (!journal_join(&myth, p_s_sb, 1)) {
1730 reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
1731 journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
1732 do_journal_end(&myth, p_s_sb,1, FLUSH_ALL) ;
1733 flushed = 1;
1734 }
1735 }
1736
1737 /* this also catches errors during the do_journal_end above */
1738 if (!error && reiserfs_is_journal_aborted(journal)) {
1739 memset(&myth, 0, sizeof(myth));
1740 if (!journal_join_abort(&myth, p_s_sb, 1)) {
1741 reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
1742 journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
1743 do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL) ;
1744 }
1745 }
1746
1747 reiserfs_mounted_fs_count-- ;
1748 /* wait for all commits to finish */
1749 cancel_delayed_work(&SB_JOURNAL(p_s_sb)->j_work);
1750 flush_workqueue(commit_wq);
1751 if (!reiserfs_mounted_fs_count) {
1752 destroy_workqueue(commit_wq);
1753 commit_wq = NULL;
1754 }
1755
1756 free_journal_ram(p_s_sb) ;
1757
1758 return 0 ;
1759}
1760
1761/*
1762** call on unmount. flush all journal trans, release all alloc'd ram
1763*/
1764int journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb) {
1765 return do_journal_release(th, p_s_sb, 0) ;
1766}
1767/*
1768** only call from an error condition inside reiserfs_read_super!
1769*/
1770int journal_release_error(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb) {
1771 return do_journal_release(th, p_s_sb, 1) ;
1772}
1773
1774/* compares description block with commit block. returns 1 if they differ, 0 if they are the same */
1775static int journal_compare_desc_commit(struct super_block *p_s_sb, struct reiserfs_journal_desc *desc,
1776 struct reiserfs_journal_commit *commit) {
1777 if (get_commit_trans_id (commit) != get_desc_trans_id (desc) ||
1778 get_commit_trans_len (commit) != get_desc_trans_len (desc) ||
1779 get_commit_trans_len (commit) > SB_JOURNAL(p_s_sb)->j_trans_max ||
1780 get_commit_trans_len (commit) <= 0
1781 ) {
1782 return 1 ;
1783 }
1784 return 0 ;
1785}
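/* Editor's note: on disk a transaction is laid out as
 *
 *     [ desc | data block 1 .. data block N | commit ]
 *
 * in the circular journal area, so the commit block for a desc found
 * at `offset` lives at (offset + N + 1) % journal_size -- exactly the
 * journal_bread() arithmetic used by the callers below. A standalone
 * restatement of the cross-check above (field and parameter names are
 * simplified stand-ins for the on-disk structures):
 */
#if 0 /* illustrative only */
struct xact_ends { unsigned trans_id, trans_len; };

/* nonzero means "do not trust this transaction" */
static int desc_commit_mismatch(const struct xact_ends *desc,
                                const struct xact_ends *commit,
                                unsigned trans_max)
{
        return commit->trans_id != desc->trans_id ||
               commit->trans_len != desc->trans_len ||
               commit->trans_len > trans_max ||
               commit->trans_len == 0;
}
#endif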
1786/* returns 0 if it did not find a description block
1787** returns -1 if it found a corrupt commit block, or a transaction too old or too long to use
1788** returns 1 if both desc and commit were valid
1789*/
1790static int journal_transaction_is_valid(struct super_block *p_s_sb, struct buffer_head *d_bh, unsigned long *oldest_invalid_trans_id, unsigned long *newest_mount_id) {
1791 struct reiserfs_journal_desc *desc ;
1792 struct reiserfs_journal_commit *commit ;
1793 struct buffer_head *c_bh ;
1794 unsigned long offset ;
1795
1796 if (!d_bh)
1797 return 0 ;
1798
1799 desc = (struct reiserfs_journal_desc *)d_bh->b_data ;
1800 if (get_desc_trans_len(desc) > 0 && !memcmp(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8)) {
1801 if (oldest_invalid_trans_id && *oldest_invalid_trans_id && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
1802 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-986: transaction "
1803 "is valid returning because trans_id %d is greater than "
1804 "oldest_invalid %lu", get_desc_trans_id(desc),
1805 *oldest_invalid_trans_id);
1806 return 0 ;
1807 }
1808 if (newest_mount_id && *newest_mount_id > get_desc_mount_id (desc)) {
1809 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1087: transaction "
1810 "is valid returning because mount_id %d is less than "
1811 "newest_mount_id %lu", get_desc_mount_id (desc),
1812 *newest_mount_id) ;
1813 return -1 ;
1814 }
1815 if ( get_desc_trans_len(desc) > SB_JOURNAL(p_s_sb)->j_trans_max ) {
1816 reiserfs_warning(p_s_sb, "journal-2018: Bad transaction length %d encountered, ignoring transaction", get_desc_trans_len(desc));
1817 return -1 ;
1818 }
1819 offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
1820
1821    /* ok, we have a journal description block, let's see if the transaction was valid */
1822 c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
1823 ((offset + get_desc_trans_len(desc) + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
1824 if (!c_bh)
1825 return 0 ;
1826 commit = (struct reiserfs_journal_commit *)c_bh->b_data ;
1827 if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
1828 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
1829 "journal_transaction_is_valid, commit offset %ld had bad "
1830 "time %d or length %d",
1831 c_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
1832 get_commit_trans_id (commit),
1833 get_commit_trans_len(commit));
1834 brelse(c_bh) ;
1835 if (oldest_invalid_trans_id) {
1836 *oldest_invalid_trans_id = get_desc_trans_id(desc) ;
1837 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1004: "
1838 "transaction_is_valid setting oldest invalid trans_id "
1839 "to %d", get_desc_trans_id(desc)) ;
1840 }
1841 return -1;
1842 }
1843 brelse(c_bh) ;
1844 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1006: found valid "
1845 "transaction start offset %llu, len %d id %d",
1846 d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
1847 get_desc_trans_len(desc), get_desc_trans_id(desc)) ;
1848 return 1 ;
1849 } else {
1850 return 0 ;
1851 }
1852}
1853
1854static void brelse_array(struct buffer_head **heads, int num) {
1855 int i ;
1856 for (i = 0 ; i < num ; i++) {
1857 brelse(heads[i]) ;
1858 }
1859}
1860
1861/*
1862** given the start, and values for the oldest acceptable transactions,
1863** this either reads in and replays a transaction, or returns because the transaction
1864** is invalid, or too old.
1865*/
1866static int journal_read_transaction(struct super_block *p_s_sb, unsigned long cur_dblock, unsigned long oldest_start,
1867 unsigned long oldest_trans_id, unsigned long newest_mount_id) {
1868 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
1869 struct reiserfs_journal_desc *desc ;
1870 struct reiserfs_journal_commit *commit ;
1871 unsigned long trans_id = 0 ;
1872 struct buffer_head *c_bh ;
1873 struct buffer_head *d_bh ;
1874 struct buffer_head **log_blocks = NULL ;
1875 struct buffer_head **real_blocks = NULL ;
1876 unsigned long trans_offset ;
1877 int i;
1878 int trans_half;
1879
1880 d_bh = journal_bread(p_s_sb, cur_dblock) ;
1881 if (!d_bh)
1882 return 1 ;
1883 desc = (struct reiserfs_journal_desc *)d_bh->b_data ;
1884 trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
1885 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: "
1886 "journal_read_transaction, offset %llu, len %d mount_id %d",
1887 d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
1888 get_desc_trans_len(desc), get_desc_mount_id(desc)) ;
1889 if (get_desc_trans_id(desc) < oldest_trans_id) {
1890 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1039: "
1891 "journal_read_trans skipping because %lu is too old",
1892 cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)) ;
1893 brelse(d_bh) ;
1894 return 1 ;
1895 }
1896 if (get_desc_mount_id(desc) != newest_mount_id) {
1897 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1146: "
1898 "journal_read_trans skipping because %d is != "
1899 "newest_mount_id %lu", get_desc_mount_id(desc),
1900 newest_mount_id) ;
1901 brelse(d_bh) ;
1902 return 1 ;
1903 }
1904 c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
1905 ((trans_offset + get_desc_trans_len(desc) + 1) %
1906 SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
1907 if (!c_bh) {
1908 brelse(d_bh) ;
1909 return 1 ;
1910 }
1911 commit = (struct reiserfs_journal_commit *)c_bh->b_data ;
1912 if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
1913 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal_read_transaction, "
1914 "commit offset %llu had bad time %d or length %d",
1915 c_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
1916 get_commit_trans_id(commit), get_commit_trans_len(commit));
1917 brelse(c_bh) ;
1918 brelse(d_bh) ;
1919 return 1;
1920 }
1921 trans_id = get_desc_trans_id(desc) ;
1922 /* now we know we've got a good transaction, and it was inside the valid time ranges */
1923 log_blocks = reiserfs_kmalloc(get_desc_trans_len(desc) * sizeof(struct buffer_head *), GFP_NOFS, p_s_sb) ;
1924 real_blocks = reiserfs_kmalloc(get_desc_trans_len(desc) * sizeof(struct buffer_head *), GFP_NOFS, p_s_sb) ;
1925 if (!log_blocks || !real_blocks) {
1926 brelse(c_bh) ;
1927 brelse(d_bh) ;
1928 reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
1929 reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
1930 reiserfs_warning(p_s_sb, "journal-1169: kmalloc failed, unable to mount FS") ;
1931 return -1 ;
1932 }
1933 /* get all the buffer heads */
1934 trans_half = journal_trans_half (p_s_sb->s_blocksize) ;
1935 for(i = 0 ; i < get_desc_trans_len(desc) ; i++) {
1936 log_blocks[i] = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + (trans_offset + 1 + i) % SB_ONDISK_JOURNAL_SIZE(p_s_sb));
1937 if (i < trans_half) {
1938 real_blocks[i] = sb_getblk(p_s_sb, le32_to_cpu(desc->j_realblock[i])) ;
1939 } else {
1940 real_blocks[i] = sb_getblk(p_s_sb, le32_to_cpu(commit->j_realblock[i - trans_half])) ;
1941 }
1942 if ( real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(p_s_sb) ) {
1943 reiserfs_warning(p_s_sb, "journal-1207: REPLAY FAILURE fsck required! Block to replay is outside of filesystem");
1944 goto abort_replay;
1945 }
1946 /* make sure we don't try to replay onto log or reserved area */
1947 if (is_block_in_log_or_reserved_area(p_s_sb, real_blocks[i]->b_blocknr)) {
1948 reiserfs_warning(p_s_sb, "journal-1204: REPLAY FAILURE fsck required! Trying to replay onto a log block") ;
1949abort_replay:
1950 brelse_array(log_blocks, i) ;
1951 brelse_array(real_blocks, i) ;
1952 brelse(c_bh) ;
1953 brelse(d_bh) ;
1954 reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
1955 reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
1956 return -1 ;
1957 }
1958 }
1959 /* read in the log blocks, memcpy to the corresponding real block */
1960 ll_rw_block(READ, get_desc_trans_len(desc), log_blocks) ;
1961 for (i = 0 ; i < get_desc_trans_len(desc) ; i++) {
1962 wait_on_buffer(log_blocks[i]) ;
1963 if (!buffer_uptodate(log_blocks[i])) {
1964 reiserfs_warning(p_s_sb, "journal-1212: REPLAY FAILURE fsck required! buffer write failed") ;
1965 brelse_array(log_blocks + i, get_desc_trans_len(desc) - i) ;
1966 brelse_array(real_blocks, get_desc_trans_len(desc)) ;
1967 brelse(c_bh) ;
1968 brelse(d_bh) ;
1969 reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
1970 reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
1971 return -1 ;
1972 }
1973 memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, real_blocks[i]->b_size) ;
1974 set_buffer_uptodate(real_blocks[i]) ;
1975 brelse(log_blocks[i]) ;
1976 }
1977 /* flush out the real blocks */
1978 for (i = 0 ; i < get_desc_trans_len(desc) ; i++) {
1979 set_buffer_dirty(real_blocks[i]) ;
1980 ll_rw_block(WRITE, 1, real_blocks + i) ;
1981 }
1982 for (i = 0 ; i < get_desc_trans_len(desc) ; i++) {
1983 wait_on_buffer(real_blocks[i]) ;
1984 if (!buffer_uptodate(real_blocks[i])) {
1985 reiserfs_warning(p_s_sb, "journal-1226: REPLAY FAILURE, fsck required! buffer write failed") ;
1986 brelse_array(real_blocks + i, get_desc_trans_len(desc) - i) ;
1987 brelse(c_bh) ;
1988 brelse(d_bh) ;
1989 reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
1990 reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
1991 return -1 ;
1992 }
1993 brelse(real_blocks[i]) ;
1994 }
1995 cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + ((trans_offset + get_desc_trans_len(desc) + 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)) ;
1996 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1095: setting journal "
1997 "start to offset %ld",
1998 cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)) ;
1999
2000 /* init starting values for the first transaction, in case this is the last transaction to be replayed. */
2001 journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
2002 journal->j_last_flush_trans_id = trans_id ;
2003 journal->j_trans_id = trans_id + 1;
2004 brelse(c_bh) ;
2005 brelse(d_bh) ;
2006 reiserfs_kfree(log_blocks, le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), p_s_sb) ;
2007 reiserfs_kfree(real_blocks, le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), p_s_sb) ;
2008 return 0 ;
2009}
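/* Editor's note: a transaction's real (home) block numbers are split
 * across its bounding blocks -- the first trans_half entries live in
 * the desc block's j_realblock[] and the remainder in the commit
 * block's, which is what the i < trans_half test above selects on.
 * A standalone sketch, with trans_half standing in for
 * journal_trans_half(blocksize):
 */
#if 0 /* illustrative only */
static unsigned long real_blocknr(unsigned i, unsigned trans_half,
                                  const unsigned long *desc_realblock,
                                  const unsigned long *commit_realblock)
{
        return i < trans_half ? desc_realblock[i]
                              : commit_realblock[i - trans_half];
}
#endif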
2010
2011/* This function reads blocks of bufsize size, starting at block and going up
2012   to max_block (but no more than BUFNR blocks at a time). This proved to improve
2013 mounting speed on self-rebuilding raid5 arrays at least.
2014 Right now it is only used from journal code. But later we might use it
2015 from other places.
2016 Note: Do not use journal_getblk/sb_getblk functions here! */
2017static struct buffer_head * reiserfs_breada (struct block_device *dev, int block, int bufsize,
2018 unsigned int max_block)
2019{
2020 struct buffer_head * bhlist[BUFNR];
2021 unsigned int blocks = BUFNR;
2022 struct buffer_head * bh;
2023 int i, j;
2024
2025 bh = __getblk (dev, block, bufsize );
2026 if (buffer_uptodate (bh))
2027 return (bh);
2028
2029 if (block + BUFNR > max_block) {
2030 blocks = max_block - block;
2031 }
2032 bhlist[0] = bh;
2033 j = 1;
2034 for (i = 1; i < blocks; i++) {
2035 bh = __getblk (dev, block + i, bufsize);
2036 if (buffer_uptodate (bh)) {
2037 brelse (bh);
2038 break;
2039 }
2040 else bhlist[j++] = bh;
2041 }
2042 ll_rw_block (READ, j, bhlist);
2043 for(i = 1; i < j; i++)
2044 brelse (bhlist[i]);
2045 bh = bhlist[0];
2046 wait_on_buffer (bh);
2047 if (buffer_uptodate (bh))
2048 return bh;
2049 brelse (bh);
2050 return NULL;
2051}
2052
2053/*
2054** read and replay the log
2055** on a clean unmount, the journal header's next unflushed pointer will point to an invalid
2056** transaction. Testing that first, before scanning the whole log for transactions, keeps normal mount times fast.
2057**
2058** After a crash, this starts with the next unflushed transaction, and replays until it finds one too old, or invalid.
2059**
2060** On exit, it sets things up so the first transaction will work correctly.
2061*/
2062static int journal_read(struct super_block *p_s_sb) {
2063 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
2064 struct reiserfs_journal_desc *desc ;
2065 unsigned long oldest_trans_id = 0;
2066 unsigned long oldest_invalid_trans_id = 0 ;
2067 time_t start ;
2068 unsigned long oldest_start = 0;
2069 unsigned long cur_dblock = 0 ;
2070 unsigned long newest_mount_id = 9 ;
2071 struct buffer_head *d_bh ;
2072 struct reiserfs_journal_header *jh ;
2073 int valid_journal_header = 0 ;
2074 int replay_count = 0 ;
2075 int continue_replay = 1 ;
2076 int ret ;
2077 char b[BDEVNAME_SIZE];
2078
2079 cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
2080 reiserfs_info (p_s_sb, "checking transaction log (%s)\n",
2081 bdevname(journal->j_dev_bd, b));
2082 start = get_seconds();
2083
2084 /* step 1, read in the journal header block. Check the transaction it says
2085 ** is the first unflushed, and if that transaction is not valid,
2086 ** replay is done
2087 */
2088 journal->j_header_bh = journal_bread(p_s_sb,
2089 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
2090 SB_ONDISK_JOURNAL_SIZE(p_s_sb));
2091 if (!journal->j_header_bh) {
2092 return 1 ;
2093 }
2094 jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data) ;
2095 if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 &&
2096 le32_to_cpu(jh->j_first_unflushed_offset) < SB_ONDISK_JOURNAL_SIZE(p_s_sb) &&
2097 le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
2098 oldest_start = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
2099 le32_to_cpu(jh->j_first_unflushed_offset) ;
2100 oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
2101 newest_mount_id = le32_to_cpu(jh->j_mount_id);
2102 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1153: found in "
2103 "header: first_unflushed_offset %d, last_flushed_trans_id "
2104 "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
2105 le32_to_cpu(jh->j_last_flush_trans_id)) ;
2106 valid_journal_header = 1 ;
2107
2108 /* now, we try to read the first unflushed offset. If it is not valid,
2109 ** there is nothing more we can do, and it makes no sense to read
2110 ** through the whole log.
2111 */
2112 d_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + le32_to_cpu(jh->j_first_unflushed_offset)) ;
2113 ret = journal_transaction_is_valid(p_s_sb, d_bh, NULL, NULL) ;
2114 if (!ret) {
2115 continue_replay = 0 ;
2116 }
2117 brelse(d_bh) ;
2118 goto start_log_replay;
2119 }
2120
2121 if (continue_replay && bdev_read_only(p_s_sb->s_bdev)) {
2122 reiserfs_warning (p_s_sb,
2123 "clm-2076: device is readonly, unable to replay log") ;
2124 return -1 ;
2125 }
2126
2127 /* ok, there are transactions that need to be replayed. start with the first log block, find
2128 ** all the valid transactions, and pick out the oldest.
2129 */
2130 while(continue_replay && cur_dblock < (SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb))) {
2131    /* Note that the blocksize of the primary fs device and the journal
2132       device must be the same */
2133 d_bh = reiserfs_breada(journal->j_dev_bd, cur_dblock, p_s_sb->s_blocksize,
2134 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb)) ;
2135 ret = journal_transaction_is_valid(p_s_sb, d_bh, &oldest_invalid_trans_id, &newest_mount_id) ;
2136 if (ret == 1) {
2137 desc = (struct reiserfs_journal_desc *)d_bh->b_data ;
2138 if (oldest_start == 0) { /* init all oldest_ values */
2139 oldest_trans_id = get_desc_trans_id(desc) ;
2140 oldest_start = d_bh->b_blocknr ;
2141 newest_mount_id = get_desc_mount_id(desc) ;
2142 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1179: Setting "
2143 "oldest_start to offset %llu, trans_id %lu",
2144 oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
2145 oldest_trans_id) ;
2146 } else if (oldest_trans_id > get_desc_trans_id(desc)) {
2147 /* one we just read was older */
2148 oldest_trans_id = get_desc_trans_id(desc) ;
2149 oldest_start = d_bh->b_blocknr ;
2150 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1180: Resetting "
2151 "oldest_start to offset %lu, trans_id %lu",
2152 oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
2153 oldest_trans_id) ;
2154 }
2155 if (newest_mount_id < get_desc_mount_id(desc)) {
2156 newest_mount_id = get_desc_mount_id(desc) ;
2157 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
2158 "newest_mount_id to %d", get_desc_mount_id(desc));
2159 }
2160 cur_dblock += get_desc_trans_len(desc) + 2 ;
2161 } else {
2162 cur_dblock++ ;
2163 }
2164 brelse(d_bh) ;
2165 }
2166
2167start_log_replay:
2168 cur_dblock = oldest_start ;
2169 if (oldest_trans_id) {
2170 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1206: Starting replay "
2171 "from offset %llu, trans_id %lu",
2172 cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
2173 oldest_trans_id) ;
2174
2175 }
2176 replay_count = 0 ;
2177 while(continue_replay && oldest_trans_id > 0) {
2178 ret = journal_read_transaction(p_s_sb, cur_dblock, oldest_start, oldest_trans_id, newest_mount_id) ;
2179 if (ret < 0) {
2180 return ret ;
2181 } else if (ret != 0) {
2182 break ;
2183 }
2184 cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start ;
2185 replay_count++ ;
2186 if (cur_dblock == oldest_start)
2187 break;
2188 }
2189
2190 if (oldest_trans_id == 0) {
2191 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1225: No valid "
2192 "transactions found") ;
2193 }
2194 /* j_start does not get set correctly if we don't replay any transactions.
2195 ** if we had a valid journal_header, set j_start to the first unflushed transaction value,
2196 ** copy the trans_id from the header
2197 */
2198 if (valid_journal_header && replay_count == 0) {
2199 journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset) ;
2200 journal->j_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
2201 journal->j_last_flush_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) ;
2202 journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1;
2203 } else {
2204 journal->j_mount_id = newest_mount_id + 1 ;
2205 }
2206 reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
2207 "newest_mount_id to %lu", journal->j_mount_id) ;
2208 journal->j_first_unflushed_offset = journal->j_start ;
2209 if (replay_count > 0) {
2210 reiserfs_info (p_s_sb, "replayed %d transactions in %lu seconds\n",
2211 replay_count, get_seconds() - start) ;
2212 }
2213 if (!bdev_read_only(p_s_sb->s_bdev) &&
2214 _update_journal_header_block(p_s_sb, journal->j_start,
2215 journal->j_last_flush_trans_id))
2216 {
2217 /* replay failed, caller must call free_journal_ram and abort
2218 ** the mount
2219 */
2220 return -1 ;
2221 }
2222 return 0 ;
2223}
2224
2225static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
2226{
2227 struct reiserfs_journal_list *jl;
2228retry:
2229 jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, s);
2230 if (!jl) {
2231 yield();
2232 goto retry;
2233 }
2234 memset(jl, 0, sizeof(*jl));
2235 INIT_LIST_HEAD(&jl->j_list);
2236 INIT_LIST_HEAD(&jl->j_working_list);
2237 INIT_LIST_HEAD(&jl->j_tail_bh_list);
2238 INIT_LIST_HEAD(&jl->j_bh_list);
2239 sema_init(&jl->j_commit_lock, 1);
2240 SB_JOURNAL(s)->j_num_lists++;
2241 get_journal_list(jl);
2242 return jl;
2243}
2244
2245static void journal_list_init(struct super_block *p_s_sb) {
2246 SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
2247}
2248
2249static int release_journal_dev( struct super_block *super,
2250 struct reiserfs_journal *journal )
2251{
2252 int result;
2253
2254 result = 0;
2255
2256 if( journal -> j_dev_file != NULL ) {
2257 result = filp_close( journal -> j_dev_file, NULL );
2258 journal -> j_dev_file = NULL;
2259 journal -> j_dev_bd = NULL;
2260 } else if( journal -> j_dev_bd != NULL ) {
2261 result = blkdev_put( journal -> j_dev_bd );
2262 journal -> j_dev_bd = NULL;
2263 }
2264
2265 if( result != 0 ) {
2266 reiserfs_warning(super, "sh-457: release_journal_dev: Cannot release journal device: %i", result );
2267 }
2268 return result;
2269}
2270
2271static int journal_init_dev( struct super_block *super,
2272 struct reiserfs_journal *journal,
2273 const char *jdev_name )
2274{
2275 int result;
2276 dev_t jdev;
2277 int blkdev_mode = FMODE_READ | FMODE_WRITE;
2278 char b[BDEVNAME_SIZE];
2279
2280 result = 0;
2281
2282 journal -> j_dev_bd = NULL;
2283 journal -> j_dev_file = NULL;
2284 jdev = SB_ONDISK_JOURNAL_DEVICE( super ) ?
2285 new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
2286
2287 if (bdev_read_only(super->s_bdev))
2288 blkdev_mode = FMODE_READ;
2289
2290 /* there is no "jdev" option and journal is on separate device */
2291 if( ( !jdev_name || !jdev_name[ 0 ] ) ) {
2292 journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode);
2293 if (IS_ERR(journal->j_dev_bd)) {
2294 result = PTR_ERR(journal->j_dev_bd);
2295 journal->j_dev_bd = NULL;
2296 reiserfs_warning (super, "sh-458: journal_init_dev: "
2297 "cannot init journal device '%s': %i",
2298 __bdevname(jdev, b), result );
2299 return result;
2300 } else if (jdev != super->s_dev)
2301 set_blocksize(journal->j_dev_bd, super->s_blocksize);
2302 return 0;
2303 }
2304
2305 journal -> j_dev_file = filp_open( jdev_name, 0, 0 );
2306 if( !IS_ERR( journal -> j_dev_file ) ) {
2307 struct inode *jdev_inode = journal->j_dev_file->f_mapping->host;
2308 if( !S_ISBLK( jdev_inode -> i_mode ) ) {
2309 reiserfs_warning (super, "journal_init_dev: '%s' is "
2310 "not a block device", jdev_name );
2311 result = -ENOTBLK;
2312 } else {
2313 /* ok */
2314 journal->j_dev_bd = I_BDEV(jdev_inode);
2315 set_blocksize(journal->j_dev_bd, super->s_blocksize);
2316 }
2317 } else {
2318 result = PTR_ERR( journal -> j_dev_file );
2319 journal -> j_dev_file = NULL;
2320 reiserfs_warning (super,
2321 "journal_init_dev: Cannot open '%s': %i",
2322 jdev_name, result );
2323 }
2324	if( result != 0 ) {
2325		release_journal_dev( super, journal );
2326		return result; /* j_dev_bd may be NULL here; don't pass it to bdevname() */
2327	}
2328	reiserfs_info(super, "journal_init_dev: journal device: %s\n", bdevname(journal->j_dev_bd, b));
2329	return result;
2330}
2331
2332/*
2333** must be called once on fs mount. calls journal_read for you
2334*/
2335int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_format, unsigned int commit_max_age) {
2336 int num_cnodes = SB_ONDISK_JOURNAL_SIZE(p_s_sb) * 2 ;
2337 struct buffer_head *bhjh;
2338 struct reiserfs_super_block * rs;
2339 struct reiserfs_journal_header *jh;
2340 struct reiserfs_journal *journal;
2341 struct reiserfs_journal_list *jl;
2342 char b[BDEVNAME_SIZE];
2343
2344 journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ;
2345 if (!journal) {
2346 reiserfs_warning (p_s_sb, "journal-1256: unable to get memory for journal structure") ;
2347 return 1 ;
2348 }
2349 memset(journal, 0, sizeof(struct reiserfs_journal)) ;
2350 INIT_LIST_HEAD(&journal->j_bitmap_nodes) ;
2351 INIT_LIST_HEAD (&journal->j_prealloc_list);
2352 INIT_LIST_HEAD(&journal->j_working_list);
2353 INIT_LIST_HEAD(&journal->j_journal_list);
2354 journal->j_persistent_trans = 0;
2355 if (reiserfs_allocate_list_bitmaps(p_s_sb,
2356 journal->j_list_bitmap,
2357 SB_BMAP_NR(p_s_sb)))
2358 goto free_and_return ;
2359 allocate_bitmap_nodes(p_s_sb) ;
2360
2361 /* reserved for journal area support */
2362 SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ?
2363 REISERFS_OLD_DISK_OFFSET_IN_BYTES / p_s_sb->s_blocksize +
2364 SB_BMAP_NR(p_s_sb) + 1 :
2365 REISERFS_DISK_OFFSET_IN_BYTES / p_s_sb->s_blocksize + 2);
2366
2367  /* Sanity check to see if the standard journal fits within the first bitmap
2368     block (relevant for small blocksizes) */
2369 if ( !SB_ONDISK_JOURNAL_DEVICE( p_s_sb ) &&
2370 (SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb) > p_s_sb->s_blocksize * 8) ) {
2371    reiserfs_warning (p_s_sb, "journal-1393: journal does not fit in the area "
2372		      "addressed by the first bitmap block. It starts at "
2373 "%u and its size is %u. Block size %ld",
2374 SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb),
2375 SB_ONDISK_JOURNAL_SIZE(p_s_sb), p_s_sb->s_blocksize);
2376 goto free_and_return;
2377 }
2378
2379 if( journal_init_dev( p_s_sb, journal, j_dev_name ) != 0 ) {
2380    reiserfs_warning (p_s_sb, "sh-462: unable to initialize journal device");
2381 goto free_and_return;
2382 }
2383
2384 rs = SB_DISK_SUPER_BLOCK(p_s_sb);
2385
2386 /* read journal header */
2387 bhjh = journal_bread(p_s_sb,
2388 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb));
2389 if (!bhjh) {
2390 reiserfs_warning (p_s_sb, "sh-459: unable to read journal header");
2391 goto free_and_return;
2392 }
2393 jh = (struct reiserfs_journal_header *)(bhjh->b_data);
2394
2395  /* make sure the journal matches the super block */
2396 if (is_reiserfs_jr(rs) && (jh->jh_journal.jp_journal_magic != sb_jp_journal_magic(rs))) {
2397 reiserfs_warning (p_s_sb, "sh-460: journal header magic %x "
2398		      "(device %s) does not match the magic found in the super "
2399 "block %x",
2400 jh->jh_journal.jp_journal_magic,
2401 bdevname( journal->j_dev_bd, b),
2402 sb_jp_journal_magic(rs));
2403 brelse (bhjh);
2404 goto free_and_return;
2405 }
2406
2407 journal->j_trans_max = le32_to_cpu (jh->jh_journal.jp_journal_trans_max);
2408 journal->j_max_batch = le32_to_cpu (jh->jh_journal.jp_journal_max_batch);
2409 journal->j_max_commit_age = le32_to_cpu (jh->jh_journal.jp_journal_max_commit_age);
2410 journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
2411
2412 if (journal->j_trans_max) {
2413    /* sanity check the transaction parameters read from disk; clamp them if they are out of range */
2414 __u32 initial = journal->j_trans_max;
2415 __u32 ratio = 1;
2416
2417 if (p_s_sb->s_blocksize < 4096)
2418 ratio = 4096 / p_s_sb->s_blocksize;
2419
2420 if (SB_ONDISK_JOURNAL_SIZE(p_s_sb)/journal->j_trans_max < JOURNAL_MIN_RATIO)
2421 journal->j_trans_max = SB_ONDISK_JOURNAL_SIZE(p_s_sb) / JOURNAL_MIN_RATIO;
2422 if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio)
2423 journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT / ratio;
2424 if (journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio)
2425 journal->j_trans_max = JOURNAL_TRANS_MIN_DEFAULT / ratio;
2426
2427 if (journal->j_trans_max != initial)
2428 reiserfs_warning (p_s_sb, "sh-461: journal_init: wrong transaction max size (%u). Changed to %u",
2429 initial, journal->j_trans_max);
2430
2431 journal->j_max_batch = journal->j_trans_max*
2432 JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT;
2433 }
2434
2435 if (!journal->j_trans_max) {
2436    /* the file system was created by an old version of mkreiserfs,
2437       so this field contains a zero value */
2438 journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT ;
2439 journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT ;
2440 journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE ;
2441
2442    /* for blocksize >= 4096 the max transaction size is 1024. For block size < 4096
2443       the max transaction size is decreased proportionally */
2444 if (p_s_sb->s_blocksize < 4096) {
2445 journal->j_trans_max /= (4096 / p_s_sb->s_blocksize) ;
2446 journal->j_max_batch = (journal->j_trans_max) * 9 / 10 ;
2447 }
2448 }
2449
2450 journal->j_default_max_commit_age = journal->j_max_commit_age;
2451
2452 if (commit_max_age != 0) {
2453 journal->j_max_commit_age = commit_max_age;
2454 journal->j_max_trans_age = commit_max_age;
2455 }
2456
2457 reiserfs_info (p_s_sb, "journal params: device %s, size %u, "
2458 "journal first block %u, max trans len %u, max batch %u, "
2459 "max commit age %u, max trans age %u\n",
2460 bdevname( journal->j_dev_bd, b),
2461 SB_ONDISK_JOURNAL_SIZE(p_s_sb),
2462 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
2463 journal->j_trans_max,
2464 journal->j_max_batch,
2465 journal->j_max_commit_age,
2466 journal->j_max_trans_age);
2467
2468 brelse (bhjh);
2469
2470 journal->j_list_bitmap_index = 0 ;
2471 journal_list_init(p_s_sb) ;
2472
2473 memset(journal->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
2474
2475 INIT_LIST_HEAD(&journal->j_dirty_buffers) ;
2476 spin_lock_init(&journal->j_dirty_buffers_lock) ;
2477
2478 journal->j_start = 0 ;
2479 journal->j_len = 0 ;
2480 journal->j_len_alloc = 0 ;
2481 atomic_set(&(journal->j_wcount), 0) ;
2482 atomic_set(&(journal->j_async_throttle), 0) ;
2483 journal->j_bcount = 0 ;
2484 journal->j_trans_start_time = 0 ;
2485 journal->j_last = NULL ;
2486 journal->j_first = NULL ;
2487 init_waitqueue_head(&(journal->j_join_wait)) ;
2488 sema_init(&journal->j_lock, 1);
2489 sema_init(&journal->j_flush_sem, 1);
2490
2491 journal->j_trans_id = 10 ;
2492 journal->j_mount_id = 10 ;
2493 journal->j_state = 0 ;
2494 atomic_set(&(journal->j_jlock), 0) ;
2495 journal->j_cnode_free_list = allocate_cnodes(num_cnodes) ;
2496 journal->j_cnode_free_orig = journal->j_cnode_free_list ;
2497 journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0 ;
2498 journal->j_cnode_used = 0 ;
2499 journal->j_must_wait = 0 ;
2500
2501 init_journal_hash(p_s_sb) ;
2502 jl = journal->j_current_jl;
2503 jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl);
2504 if (!jl->j_list_bitmap) {
2505     reiserfs_warning(p_s_sb, "journal-2005: get_list_bitmap failed for journal list 0") ;
2506 goto free_and_return;
2507 }
2508 if (journal_read(p_s_sb) < 0) {
2509 reiserfs_warning(p_s_sb, "Replay Failure, unable to mount") ;
2510 goto free_and_return;
2511 }
2512
2513 reiserfs_mounted_fs_count++ ;
2514 if (reiserfs_mounted_fs_count <= 1)
2515 commit_wq = create_workqueue("reiserfs");
2516
2517 INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb);
2518 return 0 ;
2519free_and_return:
2520 free_journal_ram(p_s_sb);
2521 return 1;
2522}
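/* Editor's note: the j_trans_max clamping in journal_init() keeps the
 * transaction size proportional to the block size and keeps the
 * journal at least JOURNAL_MIN_RATIO times larger than one
 * transaction. A runnable restatement; the three constants mirror
 * what the reiserfs headers of this era define, but treat the exact
 * values here as assumptions:
 */
#if 0 /* illustrative only */
#include <stdio.h>

#define TRANS_MAX_DEFAULT 1024  /* assumed JOURNAL_TRANS_MAX_DEFAULT */
#define TRANS_MIN_DEFAULT 256   /* assumed JOURNAL_TRANS_MIN_DEFAULT */
#define MIN_RATIO         2     /* assumed JOURNAL_MIN_RATIO         */

static unsigned clamp_trans_max(unsigned trans_max,
                                unsigned journal_size,
                                unsigned blocksize)
{
        unsigned ratio = blocksize < 4096 ? 4096 / blocksize : 1;

        if (journal_size / trans_max < MIN_RATIO)
                trans_max = journal_size / MIN_RATIO;
        if (trans_max > TRANS_MAX_DEFAULT / ratio)
                trans_max = TRANS_MAX_DEFAULT / ratio;
        if (trans_max < TRANS_MIN_DEFAULT / ratio)
                trans_max = TRANS_MIN_DEFAULT / ratio;
        return trans_max;
}

int main(void)
{
        /* a 1k blocksize fs asking for 2048 is clamped to 1024/4 = 256 */
        printf("%u\n", clamp_trans_max(2048, 8192, 1024));
        return 0;
}
#endif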
2523
2524/*
2525** test for a polite end of the current transaction. Used by file_write, and should
2526** be used by delete to make sure they don't write more than can fit inside a single
2527** transaction
2528*/
2529int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) {
2530 struct reiserfs_journal *journal = SB_JOURNAL (th->t_super);
2531 time_t now = get_seconds() ;
2532 /* cannot restart while nested */
2533 BUG_ON (!th->t_trans_id);
2534 if (th->t_refcount > 1)
2535 return 0 ;
2536 if ( journal->j_must_wait > 0 ||
2537 (journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
2538 atomic_read(&(journal->j_jlock)) ||
2539 (now - journal->j_trans_start_time) > journal->j_max_trans_age ||
2540 journal->j_cnode_free < (journal->j_trans_max * 3)) {
2541 return 1 ;
2542 }
2543 return 0 ;
2544}
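/* Editor's note: a sketch of how write paths are expected to use the
 * helper above -- poll it between chunks of work and restart the
 * handle when a polite end is requested. restart_transaction() is an
 * assumed caller-side helper (the real write paths carry their own
 * variants); this is usage illustration, not a copy of them:
 */
#if 0 /* illustrative only */
static int maybe_restart(struct reiserfs_transaction_handle *th,
                         struct inode *inode, int blocks_needed)
{
        int retval = 0;

        if (journal_transaction_should_end(th, blocks_needed)) {
                /* assumed helper: ends th, then begins a fresh handle */
                retval = restart_transaction(th, inode, blocks_needed);
        }
        return retval;
}
#endif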
2545
2546/* this must be called inside a transaction, and requires the
2547** kernel_lock to be held
2548*/
2549void reiserfs_block_writes(struct reiserfs_transaction_handle *th) {
2550 struct reiserfs_journal *journal = SB_JOURNAL (th->t_super);
2551 BUG_ON (!th->t_trans_id);
2552 journal->j_must_wait = 1 ;
2553 set_bit(J_WRITERS_BLOCKED, &journal->j_state) ;
2554 return ;
2555}
2556
2557/* this must be called without a transaction started, and does not
2558** require BKL
2559*/
2560void reiserfs_allow_writes(struct super_block *s) {
2561 struct reiserfs_journal *journal = SB_JOURNAL (s);
2562 clear_bit(J_WRITERS_BLOCKED, &journal->j_state) ;
2563 wake_up(&journal->j_join_wait) ;
2564}
2565
2566/* this must be called without a transaction started, and does not
2567** require BKL
2568*/
2569void reiserfs_wait_on_write_block(struct super_block *s) {
2570 struct reiserfs_journal *journal = SB_JOURNAL (s);
2571 wait_event(journal->j_join_wait,
2572 !test_bit(J_WRITERS_BLOCKED, &journal->j_state)) ;
2573}
2574
2575static void queue_log_writer(struct super_block *s) {
2576 wait_queue_t wait;
2577 struct reiserfs_journal *journal = SB_JOURNAL (s);
2578 set_bit(J_WRITERS_QUEUED, &journal->j_state);
2579
2580 /*
2581 * we don't want to use wait_event here because
2582 * we only want to wait once.
2583 */
2584 init_waitqueue_entry(&wait, current);
2585 add_wait_queue(&journal->j_join_wait, &wait);
2586 set_current_state(TASK_UNINTERRUPTIBLE);
2587 if (test_bit(J_WRITERS_QUEUED, &journal->j_state))
2588 schedule();
2589 current->state = TASK_RUNNING;
2590 remove_wait_queue(&journal->j_join_wait, &wait);
2591}
2592
2593static void wake_queued_writers(struct super_block *s) {
2594 struct reiserfs_journal *journal = SB_JOURNAL (s);
2595 if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state))
2596 wake_up(&journal->j_join_wait);
2597}
2598
2599static void let_transaction_grow(struct super_block *sb,
2600 unsigned long trans_id)
2601{
2602 struct reiserfs_journal *journal = SB_JOURNAL (sb);
2603 unsigned long bcount = journal->j_bcount;
2604 while(1) {
2605 set_current_state(TASK_UNINTERRUPTIBLE);
2606 schedule_timeout(1);
2607 journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
2608 while ((atomic_read(&journal->j_wcount) > 0 ||
2609 atomic_read(&journal->j_jlock)) &&
2610 journal->j_trans_id == trans_id) {
2611 queue_log_writer(sb);
2612 }
2613 if (journal->j_trans_id != trans_id)
2614 break;
2615 if (bcount == journal->j_bcount)
2616 break;
2617 bcount = journal->j_bcount;
2618 }
2619}
2620
2621/* join == true if you must join an existing transaction.
2622** join == false if you can deal with waiting for others to finish
2623**
2624** this will block until the transaction is joinable. send the number of blocks you
2625** expect to use in nblocks.
2626*/
2627static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) {
2628 time_t now = get_seconds() ;
2629 int old_trans_id ;
2630 struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
2631 struct reiserfs_transaction_handle myth;
2632 int sched_count = 0;
2633 int retval;
2634
2635 reiserfs_check_lock_depth(p_s_sb, "journal_begin") ;
2636
2637 PROC_INFO_INC( p_s_sb, journal.journal_being );
2638 /* set here for journal_join */
2639 th->t_refcount = 1;
2640 th->t_super = p_s_sb ;
2641
2642relock:
2643 lock_journal(p_s_sb) ;
2644 if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted (journal)) {
2645 unlock_journal (p_s_sb);
2646 retval = journal->j_errno;
2647 goto out_fail;
2648 }
2649 journal->j_bcount++;
2650
2651 if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
2652 unlock_journal(p_s_sb) ;
2653 reiserfs_wait_on_write_block(p_s_sb) ;
2654 PROC_INFO_INC( p_s_sb, journal.journal_relock_writers );
2655 goto relock ;
2656 }
2657 now = get_seconds();
2658
2659 /* if there is no room in the journal OR
2660  ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning.
2661 ** we don't sleep if there aren't other writers
2662 */
2663
2664 if ( (!join && journal->j_must_wait > 0) ||
2665 ( !join && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch) ||
2666 (!join && atomic_read(&journal->j_wcount) > 0 && journal->j_trans_start_time > 0 &&
2667 (now - journal->j_trans_start_time) > journal->j_max_trans_age) ||
2668 (!join && atomic_read(&journal->j_jlock)) ||
2669 (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
2670
2671 old_trans_id = journal->j_trans_id;
2672 unlock_journal(p_s_sb) ; /* allow others to finish this transaction */
2673
2674 if (!join && (journal->j_len_alloc + nblocks + 2) >=
2675 journal->j_max_batch &&
2676 ((journal->j_len + nblocks + 2) * 100) < (journal->j_len_alloc * 75))
2677 {
2678 if (atomic_read(&journal->j_wcount) > 10) {
2679 sched_count++;
2680 queue_log_writer(p_s_sb);
2681 goto relock;
2682 }
2683 }
2684 /* don't mess with joining the transaction if all we have to do is
2685 * wait for someone else to do a commit
2686 */
2687 if (atomic_read(&journal->j_jlock)) {
2688 while (journal->j_trans_id == old_trans_id &&
2689 atomic_read(&journal->j_jlock)) {
2690 queue_log_writer(p_s_sb);
2691 }
2692 goto relock;
2693 }
2694 retval = journal_join(&myth, p_s_sb, 1) ;
2695 if (retval)
2696 goto out_fail;
2697
2698 /* someone might have ended the transaction while we joined */
2699 if (old_trans_id != journal->j_trans_id) {
2700 retval = do_journal_end(&myth, p_s_sb, 1, 0) ;
2701 } else {
2702 retval = do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW) ;
2703 }
2704
2705 if (retval)
2706 goto out_fail;
2707
2708 PROC_INFO_INC( p_s_sb, journal.journal_relock_wcount );
2709 goto relock ;
2710 }
2711 /* we are the first writer, set trans_id */
2712 if (journal->j_trans_start_time == 0) {
2713 journal->j_trans_start_time = get_seconds();
2714 }
2715 atomic_inc(&(journal->j_wcount)) ;
2716 journal->j_len_alloc += nblocks ;
2717 th->t_blocks_logged = 0 ;
2718 th->t_blocks_allocated = nblocks ;
2719 th->t_trans_id = journal->j_trans_id ;
2720 unlock_journal(p_s_sb) ;
2721 INIT_LIST_HEAD (&th->t_list);
2722 return 0 ;
2723
2724out_fail:
2725 memset (th, 0, sizeof (*th));
2726 /* Re-set th->t_super, so we can properly keep track of how many
2727 * persistent transactions there are. We need to do this so if this
2728 * call is part of a failed restart_transaction, we can free it later */
2729 th->t_super = p_s_sb;
2730 return retval;
2731}
2732
2733struct reiserfs_transaction_handle *
2734reiserfs_persistent_transaction(struct super_block *s, int nblocks) {
2735 int ret ;
2736 struct reiserfs_transaction_handle *th ;
2737
2738  /* if we're nesting into an existing transaction, it will be
2739  ** persistent on its own
2740 */
2741 if (reiserfs_transaction_running(s)) {
2742 th = current->journal_info ;
2743 th->t_refcount++ ;
2744 if (th->t_refcount < 2) {
2745 BUG() ;
2746 }
2747 return th ;
2748 }
2749 th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS, s) ;
2750 if (!th)
2751 return NULL;
2752 ret = journal_begin(th, s, nblocks) ;
2753 if (ret) {
2754 reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
2755 return NULL;
2756 }
2757
2758 SB_JOURNAL(s)->j_persistent_trans++;
2759 return th ;
2760}
2761
2762int
2763reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) {
2764 struct super_block *s = th->t_super;
2765 int ret = 0;
2766 if (th->t_trans_id)
2767 ret = journal_end(th, th->t_super, th->t_blocks_allocated);
2768 else
2769 ret = -EIO;
2770 if (th->t_refcount == 0) {
2771 SB_JOURNAL(s)->j_persistent_trans--;
2772 reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
2773 }
2774 return ret;
2775}
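/* Editor's note: the persistent-handle pair above gives callers a
 * heap-allocated handle that survives beyond the caller's stack frame.
 * A minimal usage sketch against those APIs; sb and bh are assumed to
 * be a mounted super block and a suitable buffer, and error handling
 * is abbreviated:
 */
#if 0 /* illustrative only */
static int log_one_buffer(struct super_block *sb, struct buffer_head *bh)
{
        struct reiserfs_transaction_handle *th;

        th = reiserfs_persistent_transaction(sb, 1);
        if (!th)
                return -ENOMEM;         /* alloc or journal_begin failed */
        reiserfs_prepare_for_journal(sb, bh, 1);
        journal_mark_dirty(th, sb, bh);
        return reiserfs_end_persistent_transaction(th);
}
#endif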
2776
2777static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
2778 struct reiserfs_transaction_handle *cur_th = current->journal_info;
2779
2780 /* this keeps do_journal_end from NULLing out the current->journal_info
2781 ** pointer
2782 */
2783 th->t_handle_save = cur_th ;
2784 if (cur_th && cur_th->t_refcount > 1) {
2785 BUG() ;
2786 }
2787 return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_JOIN) ;
2788}
2789
2790int journal_join_abort(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
2791 struct reiserfs_transaction_handle *cur_th = current->journal_info;
2792
2793 /* this keeps do_journal_end from NULLing out the current->journal_info
2794 ** pointer
2795 */
2796 th->t_handle_save = cur_th ;
2797 if (cur_th && cur_th->t_refcount > 1) {
2798 BUG() ;
2799 }
2800 return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_ABORT) ;
2801}
2802
2803int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) {
2804 struct reiserfs_transaction_handle *cur_th = current->journal_info ;
2805 int ret ;
2806
2807 th->t_handle_save = NULL ;
2808 if (cur_th) {
2809 /* we are nesting into the current transaction */
2810 if (cur_th->t_super == p_s_sb) {
2811 BUG_ON (!cur_th->t_refcount);
2812 cur_th->t_refcount++ ;
2813 memcpy(th, cur_th, sizeof(*th));
2814 if (th->t_refcount <= 1)
2815 reiserfs_warning (p_s_sb, "BAD: refcount <= 1, but journal_info != 0");
2816 return 0;
2817 } else {
2818 /* we've ended up with a handle from a different filesystem.
2819 ** save it and restore on journal_end. This should never
2820 ** really happen...
2821 */
2822 reiserfs_warning(p_s_sb, "clm-2100: nesting into a different FS") ;
2823 th->t_handle_save = current->journal_info ;
2824 current->journal_info = th;
2825 }
2826 } else {
2827 current->journal_info = th;
2828 }
2829 ret = do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_REG) ;
2830 if (current->journal_info != th)
2831 BUG() ;
2832
2833 /* I guess this boils down to being the reciprocal of clm-2100 above.
2834 * If do_journal_begin_r fails, we need to put it back, since journal_end
2835 * won't be called to do it. */
2836 if (ret)
2837 current->journal_info = th->t_handle_save;
2838 else
2839 BUG_ON (!th->t_refcount);
2840
2841 return ret ;
2842}
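/* Editor's sketch (illustrative, not in the original patch): what the
** nesting above means for a caller. A second journal_begin on the same
** super_block joins the running transaction and bumps t_refcount; only the
** outermost journal_end actually ends the transaction. Error handling is
** omitted in this sketch.
*/
#if 0	/* illustrative only */
static void example_nesting(struct super_block *s)
{
	struct reiserfs_transaction_handle outer, inner;

	journal_begin(&outer, s, 10);	/* starts a transaction, refcount 1 */
	journal_begin(&inner, s, 5);	/* nests: copies outer, refcount 2 */
	journal_end(&inner, s, 5);	/* refcount back to 1, trans stays open */
	journal_end(&outer, s, 10);	/* refcount 0, do_journal_end runs */
}
#endif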
2843
2844/*
2845** puts bh into the current transaction. If it was already there, this removes the
2846** old pointers from the hash and puts new ones in (to make sure replay happens in the right order).
2847**
2848** if it was dirty, it is cleaned and filed onto the clean list. I can't let it be dirty again until the
2849** transaction is committed.
2850**
2851** if j_len is bigger than j_len_alloc, j_len_alloc is pushed to j_len + JOURNAL_PER_BALANCE_CNT.
2852*/
2853int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) {
2854 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
2855 struct reiserfs_journal_cnode *cn = NULL;
2856 int count_already_incd = 0 ;
2857 int prepared = 0 ;
2858 BUG_ON (!th->t_trans_id);
2859
2860 PROC_INFO_INC( p_s_sb, journal.mark_dirty );
2861 if (th->t_trans_id != journal->j_trans_id) {
2862 reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n",
2863 th->t_trans_id, journal->j_trans_id);
2864 }
2865
2866 p_s_sb->s_dirt = 1;
2867
2868 prepared = test_clear_buffer_journal_prepared (bh);
2869 clear_buffer_journal_restore_dirty (bh);
2870 /* already in this transaction, we are done */
2871 if (buffer_journaled(bh)) {
2872 PROC_INFO_INC( p_s_sb, journal.mark_dirty_already );
2873 return 0 ;
2874 }
2875
2876 /* this must be turned into a panic instead of a warning. We can't allow
2877 ** a dirty or journal_dirty or locked buffer to be logged, as some changes
2878 ** could get to disk too early. NOT GOOD.
2879 */
2880 if (!prepared || buffer_dirty(bh)) {
2881 reiserfs_warning (p_s_sb, "journal-1777: buffer %llu bad state "
2882 "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
2883 (unsigned long long)bh->b_blocknr, prepared ? ' ' : '!',
2884 buffer_locked(bh) ? ' ' : '!',
2885 buffer_dirty(bh) ? ' ' : '!',
2886 buffer_journal_dirty(bh) ? ' ' : '!') ;
2887 }
2888
2889 if (atomic_read(&(journal->j_wcount)) <= 0) {
2890 reiserfs_warning (p_s_sb, "journal-1409: journal_mark_dirty returning because j_wcount was %d", atomic_read(&(journal->j_wcount))) ;
2891 return 1 ;
2892 }
2893 /* this error means I've screwed up, and we've overflowed the transaction.
2894 ** Nothing can be done here, except make the FS readonly or panic.
2895 */
2896 if (journal->j_len >= journal->j_trans_max) {
2897 reiserfs_panic(th->t_super, "journal-1413: journal_mark_dirty: j_len (%lu) is too big\n", journal->j_len) ;
2898 }
2899
2900 if (buffer_journal_dirty(bh)) {
2901 count_already_incd = 1 ;
2902 PROC_INFO_INC( p_s_sb, journal.mark_dirty_notjournal );
2903 clear_buffer_journal_dirty (bh);
2904 }
2905
2906 if (journal->j_len > journal->j_len_alloc) {
2907 journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT ;
2908 }
2909
2910 set_buffer_journaled (bh);
2911
2912 /* now put this guy on the end */
2913 if (!cn) {
2914 cn = get_cnode(p_s_sb) ;
2915 if (!cn) {
2916 reiserfs_panic(p_s_sb, "get_cnode failed!\n");
2917 }
2918
2919 if (th->t_blocks_logged == th->t_blocks_allocated) {
2920 th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT ;
2921 journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT ;
2922 }
2923 th->t_blocks_logged++ ;
2924 journal->j_len++ ;
2925
2926 cn->bh = bh ;
2927 cn->blocknr = bh->b_blocknr ;
2928 cn->sb = p_s_sb;
2929 cn->jlist = NULL ;
2930 insert_journal_hash(journal->j_hash_table, cn) ;
2931 if (!count_already_incd) {
2932 get_bh(bh) ;
2933 }
2934 }
2935 cn->next = NULL ;
2936 cn->prev = journal->j_last ;
2937 cn->bh = bh ;
2938 if (journal->j_last) {
2939 journal->j_last->next = cn ;
2940 journal->j_last = cn ;
2941 } else {
2942 journal->j_first = cn ;
2943 journal->j_last = cn ;
2944 }
2945 return 0 ;
2946}
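/* Editor's note (sketch, not part of the original patch): the canonical way
** to log a metadata buffer under a transaction handle is to prepare it
** first, modify it, then mark it dirty in the journal. This is the same
** sequence used on SB_BUFFER_WITH_SB() elsewhere in this file.
*/
#if 0	/* illustrative only */
	reiserfs_prepare_for_journal(p_s_sb, bh, 1);	/* 1 == wait on the buffer lock */
	/* ... modify bh->b_data ... */
	journal_mark_dirty(th, p_s_sb, bh);
#endif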
2947
2948int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
2949 if (!current->journal_info && th->t_refcount > 1)
2950 reiserfs_warning (p_s_sb, "REISER-NESTING: th NULL, refcount %d",
2951 th->t_refcount);
2952
2953 if (!th->t_trans_id) {
2954 WARN_ON (1);
2955 return -EIO;
2956 }
2957
2958 th->t_refcount--;
2959 if (th->t_refcount > 0) {
2960 struct reiserfs_transaction_handle *cur_th = current->journal_info ;
2961
2962 /* we aren't allowed to close a nested transaction on a different
2963 ** filesystem from the one in the task struct
2964 */
2965 if (cur_th->t_super != th->t_super)
2966 BUG() ;
2967
2968 if (th != cur_th) {
2969 memcpy(current->journal_info, th, sizeof(*th));
2970 th->t_trans_id = 0;
2971 }
2972 return 0;
2973 } else {
2974 return do_journal_end(th, p_s_sb, nblocks, 0) ;
2975 }
2976}
2977
2978/* removes a buffer from the current transaction, releasing and decrementing any counters.
2979** also files the removed buffer directly onto the clean list
2980**
2981** called by journal_mark_freed when a block has been deleted
2982**
2983** returns 1 if it cleaned and released the buffer. 0 otherwise
2984*/
2985static int remove_from_transaction(struct super_block *p_s_sb, b_blocknr_t blocknr, int already_cleaned) {
2986 struct buffer_head *bh ;
2987 struct reiserfs_journal_cnode *cn ;
2988 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
2989 int ret = 0;
2990
2991 cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr) ;
2992 if (!cn || !cn->bh) {
2993 return ret ;
2994 }
2995 bh = cn->bh ;
2996 if (cn->prev) {
2997 cn->prev->next = cn->next ;
2998 }
2999 if (cn->next) {
3000 cn->next->prev = cn->prev ;
3001 }
3002 if (cn == journal->j_first) {
3003 journal->j_first = cn->next ;
3004 }
3005 if (cn == journal->j_last) {
3006 journal->j_last = cn->prev ;
3007 }
3008 if (bh)
3009 remove_journal_hash(p_s_sb, journal->j_hash_table, NULL, bh->b_blocknr, 0) ;
3010 clear_buffer_journaled (bh); /* don't log this one */
3011
3012 if (!already_cleaned) {
3013 clear_buffer_journal_dirty (bh);
3014 clear_buffer_dirty(bh);
3015 clear_buffer_journal_test (bh);
3016 put_bh(bh) ;
3017 if (atomic_read(&(bh->b_count)) < 0) {
3018 reiserfs_warning (p_s_sb, "journal-1752: remove from trans, b_count < 0");
3019 }
3020 ret = 1 ;
3021 }
3022 journal->j_len-- ;
3023 journal->j_len_alloc-- ;
3024 free_cnode(p_s_sb, cn) ;
3025 return ret ;
3026}
3027
3028/*
3029** for any cnode in a journal list, it can only be dirtied if all the
3030** transactions that include it are committed to disk.
3031** this checks through each transaction, and returns 1 if you are allowed to dirty,
3032** and 0 if you aren't
3033**
3034** it is called by dirty_journal_list, which is called after flush_commit_list has gotten all the log
3035** blocks for a given transaction on disk
3036**
3037*/
3038static int can_dirty(struct reiserfs_journal_cnode *cn) {
3039 struct super_block *sb = cn->sb;
3040 b_blocknr_t blocknr = cn->blocknr ;
3041 struct reiserfs_journal_cnode *cur = cn->hprev ;
3042 int can_dirty = 1 ;
3043
3044 /* first test hprev. These are all newer than cn, so any node here
3045 ** with the same block number and dev means this node can't be sent
3046 ** to disk right now.
3047 */
3048 while(cur && can_dirty) {
3049 if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
3050 cur->blocknr == blocknr) {
3051 can_dirty = 0 ;
3052 }
3053 cur = cur->hprev ;
3054 }
3055 /* then test hnext. These are all older than cn. As long as they
3056 ** are committed to the log, it is safe to write cn to disk
3057 */
3058 cur = cn->hnext ;
3059 while(cur && can_dirty) {
3060 if (cur->jlist && cur->jlist->j_len > 0 &&
3061 atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh &&
3062 cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
3063 can_dirty = 0 ;
3064 }
3065 cur = cur->hnext ;
3066 }
3067 return can_dirty ;
3068}
3069
3070/* syncs the commit blocks, but does not force the real buffers to disk
3071** will wait until the current transaction is done/committed before returning
3072*/
3073int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
3074 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
3075
3076 BUG_ON (!th->t_trans_id);
3077 /* you can't sync while nested; very, very bad */
3078 if (th->t_refcount > 1) {
3079 BUG() ;
3080 }
3081 if (journal->j_len == 0) {
3082 reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3083 journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3084 }
3085 return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT) ;
3086}
3087
3088/*
3089** writeback the pending async commits to disk
3090*/
3091static void flush_async_commits(void *p) {
3092 struct super_block *p_s_sb = p;
3093 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
3094 struct reiserfs_journal_list *jl;
3095 struct list_head *entry;
3096
3097 lock_kernel();
3098 if (!list_empty(&journal->j_journal_list)) {
3099 /* last entry is the youngest, commit it and you get everything */
3100 entry = journal->j_journal_list.prev;
3101 jl = JOURNAL_LIST_ENTRY(entry);
3102 flush_commit_list(p_s_sb, jl, 1);
3103 }
3104 unlock_kernel();
3105 /*
3106 * this is a little racy, but there's no harm in missing
3107 * the filemap_fdatawrite
3108 */
3109 if (!atomic_read(&journal->j_async_throttle) && !reiserfs_is_journal_aborted (journal)) {
3110 atomic_inc(&journal->j_async_throttle);
3111 filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping);
3112 atomic_dec(&journal->j_async_throttle);
3113 }
3114}
3115
3116/*
3117** flushes any old transactions to disk
3118** ends the current transaction if it is too old
3119*/
3120int reiserfs_flush_old_commits(struct super_block *p_s_sb) {
3121 time_t now ;
3122 struct reiserfs_transaction_handle th ;
3123 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
3124
3125 now = get_seconds();
3126 /* safety check so we don't flush while we are replaying the log during
3127 * mount
3128 */
3129 if (list_empty(&journal->j_journal_list)) {
3130 return 0 ;
3131 }
3132
3133 /* check the current transaction. If there are no writers, and it is
3134 * too old, finish it, and force the commit blocks to disk
3135 */
3136 if (atomic_read(&journal->j_wcount) <= 0 &&
3137 journal->j_trans_start_time > 0 &&
3138 journal->j_len > 0 &&
3139 (now - journal->j_trans_start_time) > journal->j_max_trans_age)
3140 {
3141 if (!journal_join(&th, p_s_sb, 1)) {
3142 reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3143 journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3144
3145 /* we're only being called from kreiserfsd, it makes no sense to do
3146 ** an async commit so that kreiserfsd can do it later
3147 */
3148 do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ;
3149 }
3150 }
3151 return p_s_sb->s_dirt;
3152}
3153
3154/*
3155** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit
3156**
3157** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all
3158** the writers are done. By the time it wakes up, the transaction it was called for has already ended, so it just
3159** flushes the commit list and returns 0.
3160**
3161** Won't batch when flush or commit_now is set. Also won't batch when others are waiting on j_join_wait.
3162**
3163** Note, we can't allow the journal_end to proceed while there are still writers in the log.
3164*/
3165static int check_journal_end(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,
3166 unsigned long nblocks, int flags) {
3167
3168 time_t now ;
3169 int flush = flags & FLUSH_ALL ;
3170 int commit_now = flags & COMMIT_NOW ;
3171 int wait_on_commit = flags & WAIT ;
3172 struct reiserfs_journal_list *jl;
3173 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
3174
3175 BUG_ON (!th->t_trans_id);
3176
3177 if (th->t_trans_id != journal->j_trans_id) {
3178 reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n",
3179 th->t_trans_id, journal->j_trans_id);
3180 }
3181
3182 journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged) ;
3183 if (atomic_read(&(journal->j_wcount)) > 0) { /* <= 0 is allowed. unmounting might not call begin */
3184 atomic_dec(&(journal->j_wcount)) ;
3185 }
3186
3187 /* BUG, deal with the case where j_len is 0 but blocks that were previously freed still need to be
3188 ** released. That will be dealt with by the next transaction that actually writes something, but should
3189 ** be taken care of in this trans
3190 */
3191 if (journal->j_len == 0) {
3192 BUG();
3193 }
3194 /* if wcount > 0, and we are called with flush or commit_now,
3195 ** we wait on j_join_wait. We will wake up when the last writer has
3196 ** finished the transaction, and started it on its way to the disk.
3197 ** Then, we flush the commit or journal list, and just return 0
3198 ** because the rest of journal end was already done for this transaction.
3199 */
3200 if (atomic_read(&(journal->j_wcount)) > 0) {
3201 if (flush || commit_now) {
3202 unsigned trans_id ;
3203
3204 jl = journal->j_current_jl;
3205 trans_id = jl->j_trans_id;
3206 if (wait_on_commit)
3207 jl->j_state |= LIST_COMMIT_PENDING;
3208 atomic_set(&(journal->j_jlock), 1) ;
3209 if (flush) {
3210 journal->j_next_full_flush = 1 ;
3211 }
3212 unlock_journal(p_s_sb) ;
3213
3214 /* sleep while the current transaction is still j_jlocked */
3215 while(journal->j_trans_id == trans_id) {
3216 if (atomic_read(&journal->j_jlock)) {
3217 queue_log_writer(p_s_sb);
3218 } else {
3219 lock_journal(p_s_sb);
3220 if (journal->j_trans_id == trans_id) {
3221 atomic_set(&(journal->j_jlock), 1) ;
3222 }
3223 unlock_journal(p_s_sb);
3224 }
3225 }
3226 if (journal->j_trans_id == trans_id) {
3227 BUG();
3228 }
3229 if (commit_now && journal_list_still_alive(p_s_sb, trans_id) &&
3230 wait_on_commit)
3231 {
3232 flush_commit_list(p_s_sb, jl, 1) ;
3233 }
3234 return 0 ;
3235 }
3236 unlock_journal(p_s_sb) ;
3237 return 0 ;
3238 }
3239
3240 /* deal with old transactions where we are the last writers */
3241 now = get_seconds();
3242 if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
3243 commit_now = 1 ;
3244 journal->j_next_async_flush = 1 ;
3245 }
3246 /* don't batch when someone is waiting on j_join_wait */
3247 /* don't batch when syncing the commit or flushing the whole trans */
3248 if (!(journal->j_must_wait > 0) && !(atomic_read(&(journal->j_jlock))) && !flush && !commit_now &&
3249 (journal->j_len < journal->j_max_batch) &&
3250 journal->j_len_alloc < journal->j_max_batch && journal->j_cnode_free > (journal->j_trans_max * 3)) {
3251 journal->j_bcount++ ;
3252 unlock_journal(p_s_sb) ;
3253 return 0 ;
3254 }
3255
3256 if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
3257 reiserfs_panic(p_s_sb, "journal-003: journal_end: j_start (%ld) is too high\n", journal->j_start) ;
3258 }
3259 return 1 ;
3260}
3261
3262/*
3263** Does all the work that makes deleting blocks safe.
3264** when deleting a block marked BH_JNew, just remove it from the current transaction, clean its buffer_head and move on.
3265**
3266** otherwise:
3267** set a bit for the block in the journal bitmap. That will prevent it from being allocated for unformatted nodes
3268** before this transaction has finished.
3269**
3270** mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. That will prevent any old transactions with
3271** this block from trying to flush to the real location. Since we aren't removing the cnode from the journal_list_hash,
3272** the block can't be reallocated yet.
3273**
3274** Then remove it from the current transaction, decrementing any counters and filing it on the clean list.
3275*/
3276int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, b_blocknr_t blocknr) {
3277 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
3278 struct reiserfs_journal_cnode *cn = NULL ;
3279 struct buffer_head *bh = NULL ;
3280 struct reiserfs_list_bitmap *jb = NULL ;
3281 int cleaned = 0 ;
3282 BUG_ON (!th->t_trans_id);
3283
3284 cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr);
3285 if (cn && cn->bh) {
3286 bh = cn->bh ;
3287 get_bh(bh) ;
3288 }
3289 /* if it is journal new, we just remove it from this transaction */
3290 if (bh && buffer_journal_new(bh)) {
3291 clear_buffer_journal_new (bh);
3292 clear_prepared_bits(bh) ;
3293 reiserfs_clean_and_file_buffer(bh) ;
3294 cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ;
3295 } else {
3296 /* set the bit for this block in the journal bitmap for this transaction */
3297 jb = journal->j_current_jl->j_list_bitmap;
3298 if (!jb) {
3299 reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ;
3300 }
3301 set_bit_in_list_bitmap(p_s_sb, blocknr, jb) ;
3302
3303 /* Note, the entire while loop is not allowed to schedule. */
3304
3305 if (bh) {
3306 clear_prepared_bits(bh) ;
3307 reiserfs_clean_and_file_buffer(bh) ;
3308 }
3309 cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ;
3310
3311 /* find all older transactions with this block, make sure they don't try to write it out */
3312 cn = get_journal_hash_dev(p_s_sb,journal->j_list_hash_table, blocknr) ;
3313 while (cn) {
3314 if (p_s_sb == cn->sb && blocknr == cn->blocknr) {
3315 set_bit(BLOCK_FREED, &cn->state) ;
3316 if (cn->bh) {
3317 if (!cleaned) {
3318 /* remove_from_transaction will brelse the buffer if it was
3319 ** in the current trans
3320 */
3321 clear_buffer_journal_dirty (cn->bh);
3322 clear_buffer_dirty(cn->bh);
3323 clear_buffer_journal_test(cn->bh);
3324 cleaned = 1 ;
3325 put_bh(cn->bh) ;
3326 if (atomic_read(&(cn->bh->b_count)) < 0) {
3327 reiserfs_warning (p_s_sb, "journal-2138: cn->bh->b_count < 0");
3328 }
3329 }
3330 if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */
3331 atomic_dec(&(cn->jlist->j_nonzerolen)) ;
3332 }
3333 cn->bh = NULL ;
3334 }
3335 }
3336 cn = cn->hnext ;
3337 }
3338 }
3339
3340 if (bh) {
3341 put_bh(bh) ; /* get_hash grabs the buffer */
3342 if (atomic_read(&(bh->b_count)) < 0) {
3343 reiserfs_warning (p_s_sb, "journal-2165: bh->b_count < 0");
3344 }
3345 }
3346 return 0 ;
3347}
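/* Editor's sketch (illustrative, not in the original patch): a delete path
** frees blocknr in the block bitmap under the same transaction handle and
** then calls journal_mark_freed() so the block cannot be handed out again
** before this transaction commits.
*/
#if 0	/* illustrative only */
	/* blocknr was just cleared in the on-disk bitmap under th */
	journal_mark_freed(th, p_s_sb, blocknr);
#endif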
3348
3349void reiserfs_update_inode_transaction(struct inode *inode) {
3350 struct reiserfs_journal *journal = SB_JOURNAL (inode->i_sb);
3351 REISERFS_I(inode)->i_jl = journal->j_current_jl;
3352 REISERFS_I(inode)->i_trans_id = journal->j_trans_id ;
3353}
3354
3355/*
3356 * returns -1 on error, 0 if no commits/barriers were done and 1
3357 * if a transaction was actually committed and the barrier was done
3358 */
3359static int __commit_trans_jl(struct inode *inode, unsigned long id,
3360 struct reiserfs_journal_list *jl)
3361{
3362 struct reiserfs_transaction_handle th ;
3363 struct super_block *sb = inode->i_sb ;
3364 struct reiserfs_journal *journal = SB_JOURNAL (sb);
3365 int ret = 0;
3366
3367 /* is it from the current transaction, or from an unknown transaction? */
3368 if (id == journal->j_trans_id) {
3369 jl = journal->j_current_jl;
3370 /* try to let other writers come in and grow this transaction */
3371 let_transaction_grow(sb, id);
3372 if (journal->j_trans_id != id) {
3373 goto flush_commit_only;
3374 }
3375
3376 ret = journal_begin(&th, sb, 1) ;
3377 if (ret)
3378 return ret;
3379
3380 /* someone might have ended this transaction while we joined */
3381 if (journal->j_trans_id != id) {
3382 reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1) ;
3383 journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)) ;
3384 ret = journal_end(&th, sb, 1) ;
3385 goto flush_commit_only;
3386 }
3387
3388 ret = journal_end_sync(&th, sb, 1) ;
3389 if (!ret)
3390 ret = 1;
3391
3392 } else {
3393 /* this gets tricky, we have to make sure the journal list in
3394 * the inode still exists. We know the list is still around
3395 * if we've got a larger transaction id than the oldest list
3396 */
3397flush_commit_only:
3398 if (journal_list_still_alive(inode->i_sb, id)) {
3399 /*
3400 * we only set ret to 1 when we know for sure
3401 * the barrier hasn't been started yet on the commit
3402 * block.
3403 */
3404 if (atomic_read(&jl->j_commit_left) > 1)
3405 ret = 1;
3406 flush_commit_list(sb, jl, 1) ;
3407 if (journal->j_errno)
3408 ret = journal->j_errno;
3409 }
3410 }
3411 /* otherwise the list is gone, and long since committed */
3412 return ret;
3413}
3414
3415int reiserfs_commit_for_inode(struct inode *inode) {
3416 unsigned long id = REISERFS_I(inode)->i_trans_id;
3417 struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
3418
3419 /* for the whole inode, assume an unset id means it was
3420 * changed in the current transaction. More conservative
3421 */
3422 if (!id || !jl) {
3423 reiserfs_update_inode_transaction(inode) ;
3424 id = REISERFS_I(inode)->i_trans_id;
3425 /* jl will be updated in __commit_trans_jl */
3426 }
3427
3428 return __commit_trans_jl(inode, id, jl);
3429}
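/* Editor's sketch (not part of the original patch): an fsync-style caller
** uses reiserfs_commit_for_inode() to push the transaction that last
** touched the inode; per __commit_trans_jl above, the result is negative
** on error, 0 when no commit/barrier was needed, and 1 when one was done.
*/
#if 0	/* illustrative only */
static int example_sync_inode(struct inode *inode)
{
	int ret = reiserfs_commit_for_inode(inode);
	return ret < 0 ? ret : 0;
}
#endif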
3430
3431void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb,
3432 struct buffer_head *bh) {
3433 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
3434 PROC_INFO_INC( p_s_sb, journal.restore_prepared );
3435 if (!bh) {
3436 return ;
3437 }
3438 if (test_clear_buffer_journal_restore_dirty (bh) &&
3439 buffer_journal_dirty(bh)) {
3440 struct reiserfs_journal_cnode *cn;
3441 cn = get_journal_hash_dev(p_s_sb,
3442 journal->j_list_hash_table,
3443 bh->b_blocknr);
3444 if (cn && can_dirty(cn)) {
3445 set_buffer_journal_test (bh);
3446 mark_buffer_dirty(bh);
3447 }
3448 }
3449 clear_buffer_journal_prepared (bh);
3450}
3451
3452extern struct tree_balance *cur_tb ;
3453/*
3454** before we can change a metadata block, we have to make sure it won't
3455** be written to disk while we are altering it. So, we must:
3456** clean it
3457** wait on it.
3458**
3459*/
3460int reiserfs_prepare_for_journal(struct super_block *p_s_sb,
3461 struct buffer_head *bh, int wait) {
3462 PROC_INFO_INC( p_s_sb, journal.prepare );
3463
3464 if (test_set_buffer_locked(bh)) {
3465 if (!wait)
3466 return 0;
3467 lock_buffer(bh);
3468 }
3469 set_buffer_journal_prepared (bh);
3470 if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) {
3471 clear_buffer_journal_test (bh);
3472 set_buffer_journal_restore_dirty (bh);
3473 }
3474 unlock_buffer(bh);
3475 return 1;
3476}
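/* Editor's sketch (hypothetical condition name, for illustration): the
** bail-out counterpart of the prepare/mark_dirty pattern. A caller that
** prepares a buffer and then decides not to log it must restore it instead
** of calling journal_mark_dirty.
*/
#if 0	/* illustrative only */
	if (reiserfs_prepare_for_journal(p_s_sb, bh, 0)) {	/* 0 == don't wait */
		if (decided_not_to_log)		/* hypothetical condition */
			reiserfs_restore_prepared_buffer(p_s_sb, bh);
		else
			journal_mark_dirty(th, p_s_sb, bh);
	}
#endif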
3477
3478static void flush_old_journal_lists(struct super_block *s) {
3479 struct reiserfs_journal *journal = SB_JOURNAL (s);
3480 struct reiserfs_journal_list *jl;
3481 struct list_head *entry;
3482 time_t now = get_seconds();
3483
3484 while(!list_empty(&journal->j_journal_list)) {
3485 entry = journal->j_journal_list.next;
3486 jl = JOURNAL_LIST_ENTRY(entry);
3487 /* this check should always be run, to send old lists to disk */
3488 if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) {
3489 flush_used_journal_lists(s, jl);
3490 } else {
3491 break;
3492 }
3493 }
3494}
3495
3496/*
3497** long and ugly. If flush, will not return until all commit
3498** blocks and all real buffers in the trans are on disk.
3499** If no_async, won't return until all commit blocks are on disk.
3500**
3501** keep reading, there are comments as you go along
3502**
3503** If the journal is aborted, we just clean up. Things like flushing
3504** journal lists, etc just won't happen.
3505*/
3506static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks,
3507 int flags) {
3508 struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
3509 struct reiserfs_journal_cnode *cn, *next, *jl_cn;
3510 struct reiserfs_journal_cnode *last_cn = NULL;
3511 struct reiserfs_journal_desc *desc ;
3512 struct reiserfs_journal_commit *commit ;
3513 struct buffer_head *c_bh ; /* commit bh */
3514 struct buffer_head *d_bh ; /* desc bh */
3515 int cur_write_start = 0 ; /* start index of current log write */
3516 int old_start ;
3517 int i ;
3518 int flush = flags & FLUSH_ALL ;
3519 int wait_on_commit = flags & WAIT ;
3520 struct reiserfs_journal_list *jl, *temp_jl;
3521 struct list_head *entry, *safe;
3522 unsigned long jindex;
3523 unsigned long commit_trans_id;
3524 int trans_half;
3525
3526 BUG_ON (th->t_refcount > 1);
3527 BUG_ON (!th->t_trans_id);
3528
3529 current->journal_info = th->t_handle_save;
3530 reiserfs_check_lock_depth(p_s_sb, "journal end");
3531 if (journal->j_len == 0) {
3532 reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
3533 journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
3534 }
3535
3536 lock_journal(p_s_sb) ;
3537 if (journal->j_next_full_flush) {
3538 flags |= FLUSH_ALL ;
3539 flush = 1 ;
3540 }
3541 if (journal->j_next_async_flush) {
3542 flags |= COMMIT_NOW | WAIT;
3543 wait_on_commit = 1;
3544 }
3545
3546 /* check_journal_end locks the journal, and unlocks if it does not return 1
3547 ** it tells us if we should continue with the journal_end, or just return
3548 */
3549 if (!check_journal_end(th, p_s_sb, nblocks, flags)) {
3550 p_s_sb->s_dirt = 1;
3551 wake_queued_writers(p_s_sb);
3552 reiserfs_async_progress_wait(p_s_sb);
3553 goto out ;
3554 }
3555
3556 /* check_journal_end might set these, check again */
3557 if (journal->j_next_full_flush) {
3558 flush = 1 ;
3559 }
3560
3561 /*
3562 ** j_must_wait means we have to flush the log blocks, and the real blocks for
3563 ** this transaction
3564 */
3565 if (journal->j_must_wait > 0) {
3566 flush = 1 ;
3567 }
3568
3569#ifdef REISERFS_PREALLOCATE
3570 /* quota ops might need to nest, setup the journal_info pointer for them */
3571 current->journal_info = th ;
3572 reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into
3573 * the transaction */
3574 current->journal_info = th->t_handle_save ;
3575#endif
3576
3577 /* setup description block */
3578 d_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start) ;
3579 set_buffer_uptodate(d_bh);
3580 desc = (struct reiserfs_journal_desc *)(d_bh)->b_data ;
3581 memset(d_bh->b_data, 0, d_bh->b_size) ;
3582 memcpy(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8) ;
3583 set_desc_trans_id(desc, journal->j_trans_id) ;
3584
3585 /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */
3586 c_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
3587 ((journal->j_start + journal->j_len + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
3588 commit = (struct reiserfs_journal_commit *)c_bh->b_data ;
3589 memset(c_bh->b_data, 0, c_bh->b_size) ;
3590 set_commit_trans_id(commit, journal->j_trans_id) ;
3591 set_buffer_uptodate(c_bh) ;
3592
3593 /* init this journal list */
3594 jl = journal->j_current_jl;
3595
3596 /* we lock the commit before doing anything because
3597 * we want to make sure nobody tries to run flush_commit_list until
3598 * the new transaction is fully setup, and we've already flushed the
3599 * ordered bh list
3600 */
3601 down(&jl->j_commit_lock);
3602
3603 /* save the transaction id in case we need to commit it later */
3604 commit_trans_id = jl->j_trans_id;
3605
3606 atomic_set(&jl->j_older_commits_done, 0) ;
3607 jl->j_trans_id = journal->j_trans_id ;
3608 jl->j_timestamp = journal->j_trans_start_time ;
3609 jl->j_commit_bh = c_bh ;
3610 jl->j_start = journal->j_start ;
3611 jl->j_len = journal->j_len ;
3612 atomic_set(&jl->j_nonzerolen, journal->j_len) ;
3613 atomic_set(&jl->j_commit_left, journal->j_len + 2);
3614 jl->j_realblock = NULL ;
3615
3616 /* The ENTIRE FOR LOOP MUST not cause schedule to occur.
3617 ** for each real block, add it to the journal list hash,
3618 ** copy into real block index array in the commit or desc block
3619 */
3620 trans_half = journal_trans_half(p_s_sb->s_blocksize);
3621 for (i = 0, cn = journal->j_first ; cn ; cn = cn->next, i++) {
3622 if (buffer_journaled (cn->bh)) {
3623 jl_cn = get_cnode(p_s_sb) ;
3624 if (!jl_cn) {
3625 reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ;
3626 }
3627 if (i == 0) {
3628 jl->j_realblock = jl_cn ;
3629 }
3630 jl_cn->prev = last_cn ;
3631 jl_cn->next = NULL ;
3632 if (last_cn) {
3633 last_cn->next = jl_cn ;
3634 }
3635 last_cn = jl_cn ;
3636 /* make sure the block we are trying to log is not a block
3637 of journal or reserved area */
3638
3639 if (is_block_in_log_or_reserved_area(p_s_sb, cn->bh->b_blocknr)) {
3640 reiserfs_panic(p_s_sb, "journal-2332: Trying to log block %lu, which is a log block\n", cn->bh->b_blocknr) ;
3641 }
3642 jl_cn->blocknr = cn->bh->b_blocknr ;
3643 jl_cn->state = 0 ;
3644 jl_cn->sb = p_s_sb;
3645 jl_cn->bh = cn->bh ;
3646 jl_cn->jlist = jl;
3647 insert_journal_hash(journal->j_list_hash_table, jl_cn) ;
3648 if (i < trans_half) {
3649 desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ;
3650 } else {
3651 commit->j_realblock[i - trans_half] = cpu_to_le32(cn->bh->b_blocknr) ;
3652 }
3653 } else {
3654 i-- ;
3655 }
3656 }
3657 set_desc_trans_len(desc, journal->j_len) ;
3658 set_desc_mount_id(desc, journal->j_mount_id) ;
3659 set_desc_trans_id(desc, journal->j_trans_id) ;
3660 set_commit_trans_len(commit, journal->j_len);
3661
3662 /* special check in case all buffers in the journal were marked for not logging */
3663 if (journal->j_len == 0) {
3664 BUG();
3665 }
3666
3667 /* we're about to dirty all the log blocks, mark the description block
3668 * dirty now too. Don't mark the commit block dirty until all the
3669 * others are on disk
3670 */
3671 mark_buffer_dirty(d_bh);
3672
3673 /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */
3674 cur_write_start = journal->j_start ;
3675 cn = journal->j_first ;
3676 jindex = 1 ; /* start at one so we don't get the desc again */
3677 while(cn) {
3678 clear_buffer_journal_new (cn->bh);
3679 /* copy all the real blocks into log area. dirty log blocks */
3680 if (buffer_journaled (cn->bh)) {
3681 struct buffer_head *tmp_bh ;
3682 char *addr;
3683 struct page *page;
3684 tmp_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
3685 ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
3686 set_buffer_uptodate(tmp_bh);
3687 page = cn->bh->b_page;
3688 addr = kmap(page);
3689 memcpy(tmp_bh->b_data, addr + offset_in_page(cn->bh->b_data),
3690 cn->bh->b_size);
3691 kunmap(page);
3692 mark_buffer_dirty(tmp_bh);
3693 jindex++ ;
3694 set_buffer_journal_dirty (cn->bh);
3695 clear_buffer_journaled (cn->bh);
3696 } else {
3697 /* JDirty cleared sometime during transaction. don't log this one */
3698 reiserfs_warning(p_s_sb, "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!") ;
3699 brelse(cn->bh) ;
3700 }
3701 next = cn->next ;
3702 free_cnode(p_s_sb, cn) ;
3703 cn = next ;
3704 cond_resched();
3705 }
3706
3707 /* we are done with both the c_bh and d_bh, but
3708 ** c_bh must be written after all other commit blocks,
3709 ** so we dirty and release c_bh in flush_commit_list, with commit_left <= 1.
3710 */
3711
3712 journal->j_current_jl = alloc_journal_list(p_s_sb);
3713
3714 /* now it is safe to insert this transaction on the main list */
3715 list_add_tail(&jl->j_list, &journal->j_journal_list);
3716 list_add_tail(&jl->j_working_list, &journal->j_working_list);
3717 journal->j_num_work_lists++;
3718
3719 /* reset journal values for the next transaction */
3720 old_start = journal->j_start ;
3721 journal->j_start = (journal->j_start + journal->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb);
3722 atomic_set(&(journal->j_wcount), 0) ;
3723 journal->j_bcount = 0 ;
3724 journal->j_last = NULL ;
3725 journal->j_first = NULL ;
3726 journal->j_len = 0 ;
3727 journal->j_trans_start_time = 0 ;
3728 journal->j_trans_id++ ;
3729 journal->j_current_jl->j_trans_id = journal->j_trans_id;
3730 journal->j_must_wait = 0 ;
3731 journal->j_len_alloc = 0 ;
3732 journal->j_next_full_flush = 0 ;
3733 journal->j_next_async_flush = 0 ;
3734 init_journal_hash(p_s_sb) ;
3735
3736 // make sure reiserfs_add_jh sees the new current_jl before we
3737 // write out the tails
3738 smp_mb();
3739
3740 /* tail conversion targets have to hit the disk before we end the
3741 * transaction. Otherwise a later transaction might repack the tail
3742 * before this transaction commits, leaving the data block unflushed and
3743 * clean, if we crash before the later transaction commits, the data block
3744 * is lost.
3745 */
3746 if (!list_empty(&jl->j_tail_bh_list)) {
3747 unlock_kernel();
3748 write_ordered_buffers(&journal->j_dirty_buffers_lock,
3749 journal, jl, &jl->j_tail_bh_list);
3750 lock_kernel();
3751 }
3752 if (!list_empty(&jl->j_tail_bh_list))
3753 BUG();
3754 up(&jl->j_commit_lock);
3755
3756 /* honor the flush wishes from the caller, simple commits can
3757 ** be done outside the journal lock, they are done below
3758 **
3759 ** if we don't flush the commit list right now, we put it into
3760 ** the work queue so the people waiting on the async progress work
3761 ** queue don't wait for this proc to flush journal lists and such.
3762 */
3763 if (flush) {
3764 flush_commit_list(p_s_sb, jl, 1) ;
3765 flush_journal_list(p_s_sb, jl, 1) ;
3766 } else if (!(jl->j_state & LIST_COMMIT_PENDING))
3767 queue_delayed_work(commit_wq, &journal->j_work, HZ/10);
3768
3769
3770 /* if the next transaction has any chance of wrapping, flush
3771 ** transactions that might get overwritten. If any journal lists are very
3772 ** old flush them as well.
3773 */
3774first_jl:
3775 list_for_each_safe(entry, safe, &journal->j_journal_list) {
3776 temp_jl = JOURNAL_LIST_ENTRY(entry);
3777 if (journal->j_start <= temp_jl->j_start) {
3778 if ((journal->j_start + journal->j_trans_max + 1) >=
3779 temp_jl->j_start)
3780 {
3781 flush_used_journal_lists(p_s_sb, temp_jl);
3782 goto first_jl;
3783 } else if ((journal->j_start +
3784 journal->j_trans_max + 1) <
3785 SB_ONDISK_JOURNAL_SIZE(p_s_sb))
3786 {
3787 /* if we don't cross into the next transaction and we don't
3788 * wrap, there is no way we can overlap any later transactions;
3789 * break now
3790 */
3791 break;
3792 }
3793 } else if ((journal->j_start +
3794 journal->j_trans_max + 1) >
3795 SB_ONDISK_JOURNAL_SIZE(p_s_sb))
3796 {
3797 if (((journal->j_start + journal->j_trans_max + 1) %
3798 SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= temp_jl->j_start)
3799 {
3800 flush_used_journal_lists(p_s_sb, temp_jl);
3801 goto first_jl;
3802 } else {
3803 /* we don't overlap anything from our start to the end of the
3804 * log, and our wrapped portion doesn't overlap anything at
3805 * the start of the log. We can break
3806 */
3807 break;
3808 }
3809 }
3810 }
3811 flush_old_journal_lists(p_s_sb);
3812
3813 journal->j_current_jl->j_list_bitmap = get_list_bitmap(p_s_sb, journal->j_current_jl) ;
3814
3815 if (!(journal->j_current_jl->j_list_bitmap)) {
3816 reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ;
3817 }
3818
3819 atomic_set(&(journal->j_jlock), 0) ;
3820 unlock_journal(p_s_sb) ;
3821 /* wake up anybody waiting to join. */
3822 clear_bit(J_WRITERS_QUEUED, &journal->j_state);
3823 wake_up(&(journal->j_join_wait)) ;
3824
3825 if (!flush && wait_on_commit &&
3826 journal_list_still_alive(p_s_sb, commit_trans_id)) {
3827 flush_commit_list(p_s_sb, jl, 1) ;
3828 }
3829out:
3830 reiserfs_check_lock_depth(p_s_sb, "journal end2");
3831
3832 memset (th, 0, sizeof (*th));
3833 /* Re-set th->t_super, so we can properly keep track of how many
3834 * persistent transactions there are. We need to do this so if this
3835 * call is part of a failed restart_transaction, we can free it later */
3836 th->t_super = p_s_sb;
3837
3838 return journal->j_errno;
3839}
3840
3841static void
3842__reiserfs_journal_abort_hard (struct super_block *sb)
3843{
3844 struct reiserfs_journal *journal = SB_JOURNAL (sb);
3845 if (test_bit (J_ABORTED, &journal->j_state))
3846 return;
3847
3848 printk (KERN_CRIT "REISERFS: Aborting journal for filesystem on %s\n",
3849 reiserfs_bdevname (sb));
3850
3851 sb->s_flags |= MS_RDONLY;
3852 set_bit (J_ABORTED, &journal->j_state);
3853
3854#ifdef CONFIG_REISERFS_CHECK
3855 dump_stack();
3856#endif
3857}
3858
3859static void
3860__reiserfs_journal_abort_soft (struct super_block *sb, int errno)
3861{
3862 struct reiserfs_journal *journal = SB_JOURNAL (sb);
3863 if (test_bit (J_ABORTED, &journal->j_state))
3864 return;
3865
3866 if (!journal->j_errno)
3867 journal->j_errno = errno;
3868
3869 __reiserfs_journal_abort_hard (sb);
3870}
3871
3872void
3873reiserfs_journal_abort (struct super_block *sb, int errno)
3874{
3875 return __reiserfs_journal_abort_soft (sb, errno);
3876}
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
new file mode 100644
index 000000000000..2406608fc5cd
--- /dev/null
+++ b/fs/reiserfs/lbalance.c
@@ -0,0 +1,1222 @@
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5#include <linux/config.h>
6#include <asm/uaccess.h>
7#include <linux/string.h>
8#include <linux/time.h>
9#include <linux/reiserfs_fs.h>
10#include <linux/buffer_head.h>
11
12/* these are used in do_balance.c */
13
14/* leaf_move_items
15 leaf_shift_left
16 leaf_shift_right
17 leaf_delete_items
18 leaf_insert_into_buf
19 leaf_paste_in_buffer
20 leaf_cut_from_buffer
21 leaf_paste_entries
22 */
23
24
25/* copy copy_count entries from source directory item to dest buffer (creating new item if needed) */
26static void leaf_copy_dir_entries (struct buffer_info * dest_bi, struct buffer_head * source,
27 int last_first, int item_num, int from, int copy_count)
28{
29 struct buffer_head * dest = dest_bi->bi_bh;
30 int item_num_in_dest; /* either the number of target item,
31 or if we must create a new item,
32 the number of the item we will
33 create it next to */
34 struct item_head * ih;
35 struct reiserfs_de_head * deh;
36 int copy_records_len; /* length of all records in item to be copied */
37 char * records;
38
39 ih = B_N_PITEM_HEAD (source, item_num);
40
41 RFALSE( !is_direntry_le_ih (ih), "vs-10000: item must be directory item");
42
43 /* length of all records to be copied and first byte of the last of them */
44 deh = B_I_DEH (source, ih);
45 if (copy_count) {
46 copy_records_len = (from ? deh_location( &(deh[from - 1]) ) :
47 ih_item_len(ih)) - deh_location( &(deh[from + copy_count - 1]));
48 records = source->b_data + ih_location(ih) +
49 deh_location( &(deh[from + copy_count - 1]));
50 } else {
51 copy_records_len = 0;
52 records = NULL;
53 }
54
55 /* when copying last to first, dest buffer can contain 0 items */
56 item_num_in_dest = (last_first == LAST_TO_FIRST) ? (( B_NR_ITEMS(dest) ) ? 0 : -1) : (B_NR_ITEMS(dest) - 1);
57
58 /* if there are no items in dest or the first/last item in dest is not an item of the same directory */
59 if ( (item_num_in_dest == - 1) ||
60 (last_first == FIRST_TO_LAST && le_ih_k_offset (ih) == DOT_OFFSET) ||
61 (last_first == LAST_TO_FIRST && comp_short_le_keys/*COMP_SHORT_KEYS*/ (&ih->ih_key, B_N_PKEY (dest, item_num_in_dest)))) {
62 /* create new item in dest */
63 struct item_head new_ih;
64
65 /* form item header */
66 memcpy (&new_ih.ih_key, &ih->ih_key, KEY_SIZE);
67 put_ih_version( &new_ih, KEY_FORMAT_3_5 );
68 /* calculate item len */
69 put_ih_item_len( &new_ih, DEH_SIZE * copy_count + copy_records_len );
70 put_ih_entry_count( &new_ih, 0 );
71
72 if (last_first == LAST_TO_FIRST) {
73 /* form the key in the following way */
74 if (from < I_ENTRY_COUNT(ih)) {
75 set_le_ih_k_offset( &new_ih, deh_offset( &(deh[from]) ) );
76 /*memcpy (&new_ih.ih_key.k_offset, &deh[from].deh_offset, SHORT_KEY_SIZE);*/
77 } else {
78 /* no entries will be copied to this item in this function */
79 set_le_ih_k_offset (&new_ih, U32_MAX);
80 /* this item is not yet valid, but we want I_IS_DIRECTORY_ITEM to return 1 for it, so the offset is set to U32_MAX, i.e. (u32)-1 */
81 }
82 set_le_key_k_type (KEY_FORMAT_3_5, &(new_ih.ih_key), TYPE_DIRENTRY);
83 }
84
85 /* insert item into dest buffer */
86 leaf_insert_into_buf (dest_bi, (last_first == LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest), &new_ih, NULL, 0);
87 } else {
88 /* prepare space for entries */
89 leaf_paste_in_buffer (dest_bi, (last_first==FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0, MAX_US_INT,
90 DEH_SIZE * copy_count + copy_records_len, records, 0
91 );
92 }
93
94 item_num_in_dest = (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest)-1) : 0;
95
96 leaf_paste_entries (dest_bi->bi_bh, item_num_in_dest,
97 (last_first == FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD (dest, item_num_in_dest)) : 0,
98 copy_count, deh + from, records,
99 DEH_SIZE * copy_count + copy_records_len
100 );
101}
102
103
104/* Copy the first (if last_first == FIRST_TO_LAST) or last (if last_first == LAST_TO_FIRST) item, or
105 part of it, or nothing (see the return 0 below) from SOURCE to the end
106 (FIRST_TO_LAST) or beginning (LAST_TO_FIRST) of the DEST */
107/* returns 1 if anything was copied, else 0 */
108static int leaf_copy_boundary_item (struct buffer_info * dest_bi, struct buffer_head * src, int last_first,
109 int bytes_or_entries)
110{
111 struct buffer_head * dest = dest_bi->bi_bh;
112 int dest_nr_item, src_nr_item; /* number of items in the source and destination buffers */
113 struct item_head * ih;
114 struct item_head * dih;
115
116 dest_nr_item = B_NR_ITEMS(dest);
117
118 if ( last_first == FIRST_TO_LAST ) {
119 /* if ( DEST is empty or first item of SOURCE and last item of DEST are the items of different objects
120 or of different types ) then there is no need to treat this item differently from the other items
121 that we copy, so we return */
122 ih = B_N_PITEM_HEAD (src, 0);
123 dih = B_N_PITEM_HEAD (dest, dest_nr_item - 1);
124 if (!dest_nr_item || (!op_is_left_mergeable (&(ih->ih_key), src->b_size)))
125 /* there is nothing to merge */
126 return 0;
127
128 RFALSE( ! ih_item_len(ih), "vs-10010: item can not have empty length");
129
130 if ( is_direntry_le_ih (ih) ) {
131 if ( bytes_or_entries == -1 )
132 /* copy all entries to dest */
133 bytes_or_entries = ih_entry_count(ih);
134 leaf_copy_dir_entries (dest_bi, src, FIRST_TO_LAST, 0, 0, bytes_or_entries);
135 return 1;
136 }
137
138 /* copy part of the body of the first item of SOURCE to the end of the body of the last item of the DEST,
139 the part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; don't create new item header
140 */
141 if ( bytes_or_entries == -1 )
142 bytes_or_entries = ih_item_len(ih);
143
144#ifdef CONFIG_REISERFS_CHECK
145 else {
146 if (bytes_or_entries == ih_item_len(ih) && is_indirect_le_ih(ih))
147 if (get_ih_free_space (ih))
148 reiserfs_panic (NULL, "vs-10020: leaf_copy_boundary_item: "
149 "last unformatted node must be filled entirely (%h)",
150 ih);
151 }
152#endif
153
154 /* merge first item (or its part) of src buffer with the last
155 item of dest buffer. Both are of the same file */
156 leaf_paste_in_buffer (dest_bi,
157 dest_nr_item - 1, ih_item_len(dih), bytes_or_entries, B_I_PITEM(src,ih), 0
158 );
159
160 if (is_indirect_le_ih (dih)) {
161 RFALSE( get_ih_free_space (dih),
162 "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zero free space",
163 ih);
164 if (bytes_or_entries == ih_item_len(ih))
165 set_ih_free_space (dih, get_ih_free_space (ih));
166 }
167
168 return 1;
169 }
170
171
172 /* copy boundary item to right (last_first == LAST_TO_FIRST) */
173
174 /* ( DEST is empty or last item of SOURCE and first item of DEST
175 are the items of different objects or of different types )
176 */
177 src_nr_item = B_NR_ITEMS (src);
178 ih = B_N_PITEM_HEAD (src, src_nr_item - 1);
179 dih = B_N_PITEM_HEAD (dest, 0);
180
181 if (!dest_nr_item || !op_is_left_mergeable (&(dih->ih_key), src->b_size))
182 return 0;
183
184 if ( is_direntry_le_ih (ih)) {
185 if ( bytes_or_entries == -1 )
186 /* bytes_or_entries = number of entries in the last item body of SOURCE */
187 bytes_or_entries = ih_entry_count(ih);
188
189 leaf_copy_dir_entries (dest_bi, src, LAST_TO_FIRST, src_nr_item - 1, ih_entry_count(ih) - bytes_or_entries, bytes_or_entries);
190 return 1;
191 }
192
193 /* copy part of the body of the last item of SOURCE to the beginning of the body of the first item of the DEST;
194 part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; change first item key of the DEST;
195 don't create new item header
196 */
197
198 RFALSE( is_indirect_le_ih(ih) && get_ih_free_space (ih),
199 "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)",
200 ih);
201
202 if ( bytes_or_entries == -1 ) {
203 /* bytes_or_entries = length of last item body of SOURCE */
204 bytes_or_entries = ih_item_len(ih);
205
206 RFALSE( le_ih_k_offset (dih) !=
207 le_ih_k_offset (ih) + op_bytes_number (ih, src->b_size),
208 "vs-10050: items %h and %h do not match", ih, dih);
209
210 /* change first item key of the DEST */
211 set_le_ih_k_offset (dih, le_ih_k_offset (ih));
212
213 /* item becomes non-mergeable */
214 /* or mergeable if left item was */
215 set_le_ih_k_type (dih, le_ih_k_type (ih));
216 } else {
217 /* merge to right only part of item */
218 RFALSE( ih_item_len(ih) <= bytes_or_entries,
219 "vs-10060: not enough bytes %lu (needed %lu)",
220 ( unsigned long )ih_item_len(ih), ( unsigned long )bytes_or_entries);
221
222 /* change first item key of the DEST */
223 if ( is_direct_le_ih (dih) ) {
224 RFALSE( le_ih_k_offset (dih) <= (unsigned long)bytes_or_entries,
225 "vs-10070: dih %h, bytes_or_entries(%d)", dih, bytes_or_entries);
226 set_le_ih_k_offset (dih, le_ih_k_offset (dih) - bytes_or_entries);
227 } else {
228 RFALSE( le_ih_k_offset (dih) <=
229 (bytes_or_entries / UNFM_P_SIZE) * dest->b_size,
230 "vs-10080: dih %h, bytes_or_entries(%d)",
231 dih, (bytes_or_entries/UNFM_P_SIZE)*dest->b_size);
232 set_le_ih_k_offset (dih, le_ih_k_offset (dih) - ((bytes_or_entries / UNFM_P_SIZE) * dest->b_size));
233 }
234 }
235
236 leaf_paste_in_buffer (dest_bi, 0, 0, bytes_or_entries, B_I_PITEM(src,ih) + ih_item_len(ih) - bytes_or_entries, 0);
237 return 1;
238}
239
240
241/* copy cpy_num items from buffer src to buffer dest
242 * last_first == FIRST_TO_LAST means that we copy cpy_num items beginning with the first-th item in src to the tail of dest
243 * last_first == LAST_TO_FIRST means that we copy cpy_num items beginning with the first-th item in src to the head of dest
244 */
245static void leaf_copy_items_entirely (struct buffer_info * dest_bi, struct buffer_head * src, int last_first,
246 int first, int cpy_num)
247{
248 struct buffer_head * dest;
249 int nr, free_space;
250 int dest_before;
251 int last_loc, last_inserted_loc, location;
252 int i, j;
253 struct block_head * blkh;
254 struct item_head * ih;
255
256 RFALSE( last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST,
257 "vs-10090: bad last_first parameter %d", last_first);
258 RFALSE( B_NR_ITEMS (src) - first < cpy_num,
259 "vs-10100: too few items in source %d, required %d from %d",
260 B_NR_ITEMS(src), cpy_num, first);
261 RFALSE( cpy_num < 0, "vs-10110: can not copy negative amount of items");
262 RFALSE( ! dest_bi, "vs-10120: destination buffer_info is NULL");
263
264 dest = dest_bi->bi_bh;
265
266 RFALSE( ! dest, "vs-10130: destination buffer is NULL");
267
268 if (cpy_num == 0)
269 return;
270
271 blkh = B_BLK_HEAD(dest);
272 nr = blkh_nr_item( blkh );
273 free_space = blkh_free_space(blkh);
274
275 /* we will insert items before the 0-th or nr-th item in the dest buffer. It depends on the last_first parameter */
276 dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr;
277
278 /* location of head of first new item */
279 ih = B_N_PITEM_HEAD (dest, dest_before);
280
281 RFALSE( blkh_free_space(blkh) < cpy_num * IH_SIZE,
282 "vs-10140: not enough free space for headers %d (needed %d)",
283 B_FREE_SPACE (dest), cpy_num * IH_SIZE);
284
285 /* prepare space for headers */
286 memmove (ih + cpy_num, ih, (nr-dest_before) * IH_SIZE);
287
288 /* copy item headers */
289 memcpy (ih, B_N_PITEM_HEAD (src, first), cpy_num * IH_SIZE);
290
291 free_space -= (IH_SIZE * cpy_num);
292 set_blkh_free_space( blkh, free_space );
293
294 /* location of unmovable item */
295 j = location = (dest_before == 0) ? dest->b_size : ih_location(ih-1);
296 for (i = dest_before; i < nr + cpy_num; i ++) {
297 location -= ih_item_len( ih + i - dest_before );
298 put_ih_location( ih + i - dest_before, location );
299 }
300
301 /* prepare space for items */
302 last_loc = ih_location( &(ih[nr+cpy_num-1-dest_before]) );
303 last_inserted_loc = ih_location( &(ih[cpy_num-1]) );
304
305 /* check free space */
306 RFALSE( free_space < j - last_inserted_loc,
307 "vs-10150: not enough free space for items %d (needed %d)",
308 free_space, j - last_inserted_loc);
309
310 memmove (dest->b_data + last_loc,
311 dest->b_data + last_loc + j - last_inserted_loc,
312 last_inserted_loc - last_loc);
313
314 /* copy items */
315 memcpy (dest->b_data + last_inserted_loc, B_N_PITEM(src,(first + cpy_num - 1)),
316 j - last_inserted_loc);
317
318 /* sizes, item number */
319 set_blkh_nr_item( blkh, nr + cpy_num );
320 set_blkh_free_space( blkh, free_space - (j - last_inserted_loc) );
321
322 do_balance_mark_leaf_dirty (dest_bi->tb, dest, 0);
323
324 if (dest_bi->bi_parent) {
325 struct disk_child *t_dc;
326 t_dc = B_N_CHILD (dest_bi->bi_parent, dest_bi->bi_position);
327 RFALSE( dc_block_number(t_dc) != dest->b_blocknr,
328 "vs-10160: block number in bh does not match the field in the disk_child structure %lu and %lu",
329 ( long unsigned ) dest->b_blocknr,
330 ( long unsigned ) dc_block_number(t_dc));
331 put_dc_size( t_dc, dc_size(t_dc) + (j - last_inserted_loc + IH_SIZE * cpy_num ) );
332
333 do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent, 0);
334 }
335}
336
337
338/* This function splits the (liquid) item into two items (useful when
339 shifting part of an item into another node.) */
340static void leaf_item_bottle (struct buffer_info * dest_bi, struct buffer_head * src, int last_first,
341 int item_num, int cpy_bytes)
342{
343 struct buffer_head * dest = dest_bi->bi_bh;
344 struct item_head * ih;
345
346 RFALSE( cpy_bytes == -1, "vs-10170: bytes == - 1 means: do not split item");
347
348 if ( last_first == FIRST_TO_LAST ) {
349 /* if ( if item in position item_num in buffer SOURCE is directory item ) */
350 if (is_direntry_le_ih (ih = B_N_PITEM_HEAD(src,item_num)))
351 leaf_copy_dir_entries (dest_bi, src, FIRST_TO_LAST, item_num, 0, cpy_bytes);
352 else {
353 struct item_head n_ih;
354
355 /* copy part of the body of item number 'item_num' of SOURCE to the end of the DEST,
356 the part defined by 'cpy_bytes'; create a new item header; change old item_header (????);
357 n_ih = new item_header;
358 */
359 memcpy (&n_ih, ih, IH_SIZE);
360 put_ih_item_len( &n_ih, cpy_bytes );
361 if (is_indirect_le_ih (ih)) {
362 RFALSE( cpy_bytes == ih_item_len(ih) && get_ih_free_space(ih),
363 "vs-10180: when a whole indirect item is bottled to the left neighbor, it must have free_space==0 (not %lu)",
364 ( long unsigned ) get_ih_free_space (ih));
365 set_ih_free_space (&n_ih, 0);
366 }
367
368 RFALSE( op_is_left_mergeable (&(ih->ih_key), src->b_size),
369 "vs-10190: bad mergeability of item %h", ih);
370 n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
371 leaf_insert_into_buf (dest_bi, B_NR_ITEMS(dest), &n_ih, B_N_PITEM (src, item_num), 0);
372 }
373 } else {
374 /* if ( if item in position item_num in buffer SOURCE is directory item ) */
375 if (is_direntry_le_ih(ih = B_N_PITEM_HEAD (src, item_num)))
376 leaf_copy_dir_entries (dest_bi, src, LAST_TO_FIRST, item_num, I_ENTRY_COUNT(ih) - cpy_bytes, cpy_bytes);
377 else {
378 struct item_head n_ih;
379
380 /* copy part of the body of item number 'item_num' of SOURCE to the beginning of the DEST,
381 the part defined by 'cpy_bytes'; create a new item header;
382 n_ih = new item_header;
383 */
384 memcpy (&n_ih, ih, SHORT_KEY_SIZE);
385
386 n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
387
388 if (is_direct_le_ih (ih)) {
389 set_le_ih_k_offset (&n_ih, le_ih_k_offset (ih) + ih_item_len(ih) - cpy_bytes);
390 set_le_ih_k_type (&n_ih, TYPE_DIRECT);
391 set_ih_free_space (&n_ih, MAX_US_INT);
392 } else {
393 /* indirect item */
394 RFALSE( !cpy_bytes && get_ih_free_space (ih),
395 "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended");
396 set_le_ih_k_offset (&n_ih, le_ih_k_offset (ih) + (ih_item_len(ih) - cpy_bytes) / UNFM_P_SIZE * dest->b_size);
397 set_le_ih_k_type (&n_ih, TYPE_INDIRECT);
398 set_ih_free_space (&n_ih, get_ih_free_space (ih));
399 }
400
401 /* set item length */
402 put_ih_item_len( &n_ih, cpy_bytes );
403
404 n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
405
406 leaf_insert_into_buf (dest_bi, 0, &n_ih, B_N_PITEM(src,item_num) + ih_item_len(ih) - cpy_bytes, 0);
407 }
408 }
409}
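/* Editor's worked example (not in the original): bottling the last 40 bytes
   of a 120-byte direct item to the right neighbor (LAST_TO_FIRST) builds a
   new header with ih_item_len == 40 and key offset
   le_ih_k_offset(ih) + 120 - 40; this function only copies, the source item
   is cut afterwards by leaf_delete_items() (see leaf_move_items below). */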
410
411
412/* If cpy_bytes equals minus one then copy cpy_num whole items from SOURCE to DEST.
413 If cpy_bytes is not equal to minus one then copy cpy_num-1 whole items from SOURCE to DEST.
414 From the last item copy cpy_bytes bytes for a regular item and cpy_bytes directory entries for a
415 directory item. */
416static int leaf_copy_items (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, int cpy_num,
417 int cpy_bytes)
418{
419 struct buffer_head * dest;
420 int pos, i, src_nr_item, bytes;
421
422 dest = dest_bi->bi_bh;
423 RFALSE( !dest || !src, "vs-10210: !dest || !src");
424 RFALSE( last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
425 "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST");
426 RFALSE( B_NR_ITEMS(src) < cpy_num,
427 "vs-10230: Not enough items: %d, req. %d", B_NR_ITEMS(src), cpy_num);
428 RFALSE( cpy_num < 0,"vs-10240: cpy_num < 0 (%d)", cpy_num);
429
430 if ( cpy_num == 0 )
431 return 0;
432
433 if ( last_first == FIRST_TO_LAST ) {
434 /* copy items to left */
435 pos = 0;
436 if ( cpy_num == 1 )
437 bytes = cpy_bytes;
438 else
439 bytes = -1;
440
441 /* copy the first item or part of it or nothing to the end of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) */
442 i = leaf_copy_boundary_item (dest_bi, src, FIRST_TO_LAST, bytes);
443 cpy_num -= i;
444 if ( cpy_num == 0 )
445 return i;
446 pos += i;
447 if ( cpy_bytes == -1 )
448 /* copy first cpy_num items starting from position 'pos' of SOURCE to end of DEST */
449 leaf_copy_items_entirely (dest_bi, src, FIRST_TO_LAST, pos, cpy_num);
450 else {
451 /* copy first cpy_num-1 items starting from position 'pos' of the SOURCE to the end of the DEST */
452 leaf_copy_items_entirely (dest_bi, src, FIRST_TO_LAST, pos, cpy_num-1);
453
454 /* copy part of the item which number is cpy_num+pos-1 to the end of the DEST */
455 leaf_item_bottle (dest_bi, src, FIRST_TO_LAST, cpy_num+pos-1, cpy_bytes);
456 }
457 } else {
458 /* copy items to right */
459 src_nr_item = B_NR_ITEMS (src);
460 if ( cpy_num == 1 )
461 bytes = cpy_bytes;
462 else
463 bytes = -1;
464
465 /* copy the last item or part of it or nothing to the beginning of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); */
466 i = leaf_copy_boundary_item (dest_bi, src, LAST_TO_FIRST, bytes);
467
468 cpy_num -= i;
469 if ( cpy_num == 0 )
470 return i;
471
472 pos = src_nr_item - cpy_num - i;
473 if ( cpy_bytes == -1 ) {
474 /* starting from position 'pos' copy last cpy_num items of SOURCE to the beginning of DEST */
475 leaf_copy_items_entirely (dest_bi, src, LAST_TO_FIRST, pos, cpy_num);
476 } else {
477 /* copy last cpy_num-1 items starting from position 'pos+1' of the SOURCE to the beginning of the DEST; */
478 leaf_copy_items_entirely (dest_bi, src, LAST_TO_FIRST, pos+1, cpy_num-1);
479
480 /* copy part of the item number pos to the beginning of the DEST */
481 leaf_item_bottle (dest_bi, src, LAST_TO_FIRST, pos, cpy_bytes);
482 }
483 }
484 return i;
485}
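/* Editor's worked example (illustrative): with last_first == FIRST_TO_LAST,
   cpy_num == 3 and cpy_bytes == 100, the code above first tries to merge the
   boundary item via leaf_copy_boundary_item(), then moves the whole middle
   items with leaf_copy_items_entirely(), and finally copies 100 bytes (or
   100 directory entries) of the last affected item with leaf_item_bottle();
   cpy_bytes == -1 would have copied all three items whole. */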
486
487
488/* there are several types of copying: from S[0] to L[0], from S[0] to R[0],
489 from R[0] to L[0]. For each of these we have to define parent and
490 positions of destination and source buffers */
491static void leaf_define_dest_src_infos (int shift_mode, struct tree_balance * tb, struct buffer_info * dest_bi,
492 struct buffer_info * src_bi, int * first_last,
493 struct buffer_head * Snew)
494{
495 memset (dest_bi, 0, sizeof (struct buffer_info));
496 memset (src_bi, 0, sizeof (struct buffer_info));
497
498 /* define dest, src, dest parent, dest position */
499 switch (shift_mode) {
500 case LEAF_FROM_S_TO_L: /* it is used in leaf_shift_left */
501 src_bi->tb = tb;
502 src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path);
503 src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
504 src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0); /* src->b_item_order */
505 dest_bi->tb = tb;
506 dest_bi->bi_bh = tb->L[0];
507 dest_bi->bi_parent = tb->FL[0];
508 dest_bi->bi_position = get_left_neighbor_position (tb, 0);
509 *first_last = FIRST_TO_LAST;
510 break;
511
512 case LEAF_FROM_S_TO_R: /* it is used in leaf_shift_right */
513 src_bi->tb = tb;
514 src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path);
515 src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
516 src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0);
517 dest_bi->tb = tb;
518 dest_bi->bi_bh = tb->R[0];
519 dest_bi->bi_parent = tb->FR[0];
520 dest_bi->bi_position = get_right_neighbor_position (tb, 0);
521 *first_last = LAST_TO_FIRST;
522 break;
523
524 case LEAF_FROM_R_TO_L: /* it is used in balance_leaf_when_delete */
525 src_bi->tb = tb;
526 src_bi->bi_bh = tb->R[0];
527 src_bi->bi_parent = tb->FR[0];
528 src_bi->bi_position = get_right_neighbor_position (tb, 0);
529 dest_bi->tb = tb;
530 dest_bi->bi_bh = tb->L[0];
531 dest_bi->bi_parent = tb->FL[0];
532 dest_bi->bi_position = get_left_neighbor_position (tb, 0);
533 *first_last = FIRST_TO_LAST;
534 break;
535
536 case LEAF_FROM_L_TO_R: /* it is used in balance_leaf_when_delete */
537 src_bi->tb = tb;
538 src_bi->bi_bh = tb->L[0];
539 src_bi->bi_parent = tb->FL[0];
540 src_bi->bi_position = get_left_neighbor_position (tb, 0);
541 dest_bi->tb = tb;
542 dest_bi->bi_bh = tb->R[0];
543 dest_bi->bi_parent = tb->FR[0];
544 dest_bi->bi_position = get_right_neighbor_position (tb, 0);
545 *first_last = LAST_TO_FIRST;
546 break;
547
548 case LEAF_FROM_S_TO_SNEW:
549 src_bi->tb = tb;
550 src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path);
551 src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
552 src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0);
553 dest_bi->tb = tb;
554 dest_bi->bi_bh = Snew;
555 dest_bi->bi_parent = NULL;
556 dest_bi->bi_position = 0;
557 *first_last = LAST_TO_FIRST;
558 break;
559
560 default:
561 reiserfs_panic (NULL, "vs-10250: leaf_define_dest_src_infos: shift type is unknown (%d)", shift_mode);
562 }
563 RFALSE( src_bi->bi_bh == 0 || dest_bi->bi_bh == 0,
564 "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly",
565 shift_mode, src_bi->bi_bh, dest_bi->bi_bh);
566}
567
568
569
570
571/* copy mov_num items and mov_bytes of the (mov_num-1)th item to
572 neighbor. Delete them from source */
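/* Indexing note (added clarification): items are counted from the boundary,
   so with mov_num == 2 and mov_bytes == 50 one item moves whole and 50 bytes
   (50 entries for a directory) of the second affected item -- the
   (mov_num-1)th one -- move with it; mov_bytes == -1 moves both items whole. */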
573int leaf_move_items (int shift_mode, struct tree_balance * tb, int mov_num, int mov_bytes, struct buffer_head * Snew)
574{
575 int ret_value;
576 struct buffer_info dest_bi, src_bi;
577 int first_last;
578
579 leaf_define_dest_src_infos (shift_mode, tb, &dest_bi, &src_bi, &first_last, Snew);
580
581 ret_value = leaf_copy_items (&dest_bi, src_bi.bi_bh, first_last, mov_num, mov_bytes);
582
583 leaf_delete_items (&src_bi, first_last, (first_last == FIRST_TO_LAST) ? 0 : (B_NR_ITEMS(src_bi.bi_bh) - mov_num), mov_num, mov_bytes);
584
585
586 return ret_value;
587}
588
589
590/* Shift shift_num items (and shift_bytes of last shifted item if shift_bytes != -1)
591 from S[0] to L[0] and replace the delimiting key */
592int leaf_shift_left (struct tree_balance * tb, int shift_num, int shift_bytes)
593{
594 struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path);
595 int i;
596
597 /* move shift_num (and shift_bytes bytes) items from S[0] to left neighbor L[0] */
598 i = leaf_move_items (LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL);
599
600 if ( shift_num ) {
601 if (B_NR_ITEMS (S0) == 0) { /* number of items in S[0] == 0 */
602
603 RFALSE( shift_bytes != -1,
604 "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)",
605 shift_bytes);
606#ifdef CONFIG_REISERFS_CHECK
607 if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) {
608 print_cur_tb ("vs-10275");
609 reiserfs_panic (tb->tb_sb, "vs-10275: leaf_shift_left: balance condition corrupted (%c)", tb->tb_mode);
610 }
611#endif
612
613 if (PATH_H_POSITION (tb->tb_path, 1) == 0)
614 replace_key (tb, tb->CFL[0], tb->lkey[0], PATH_H_PPARENT (tb->tb_path, 0), 0);
615
616 } else {
617 /* replace lkey in CFL[0] by 0-th key from S[0]; */
618 replace_key (tb, tb->CFL[0], tb->lkey[0], S0, 0);
619
620 RFALSE( (shift_bytes != -1 &&
621 !(is_direntry_le_ih (B_N_PITEM_HEAD (S0, 0))
622 && !I_ENTRY_COUNT (B_N_PITEM_HEAD (S0, 0)))) &&
623 (!op_is_left_mergeable (B_N_PKEY (S0, 0), S0->b_size)),
624 "vs-10280: item must be mergeable");
625 }
626 }
627
628 return i;
629}
630
631
632
633
634
635/* CLEANING STOPPED HERE */
636
637
638
639
640/* Shift shift_num (shift_bytes) items from S[0] to the right neighbor, and replace the delimiting key */
641int leaf_shift_right(
642 struct tree_balance * tb,
643 int shift_num,
644 int shift_bytes
645 )
646{
647 // struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path);
648 int ret_value;
649
650 /* move shift_num (and shift_bytes) items from S[0] to right neighbor R[0] */
651 ret_value = leaf_move_items (LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL);
652
653 /* replace rkey in CFR[0] by the 0-th key from R[0] */
654 if (shift_num) {
655 replace_key (tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
656
657 }
658
659 return ret_value;
660}
661
662
663
664static void leaf_delete_items_entirely (struct buffer_info * bi,
665 int first, int del_num);
666/* If del_bytes == -1, delete del_num whole items from buffer CUR, starting at position 'first'.
667   Otherwise:
668   If last_first == 0: starting from position 'first', delete del_num-1 whole items and then part of
669   the body of the (now) first item, the part being defined by del_bytes. Don't delete the first item header.
670   If last_first == 1: starting from position 'first+1', delete del_num-1 whole items and then part of
671   the body of the last item, the part being defined by del_bytes. Don't delete the last item header.
672*/
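/* Example (a sketch of the semantics above): with last_first == FIRST_TO_LAST,
   first == 0, del_num == 3 and del_bytes == 200, items 0 and 1 are removed
   entirely and the first 200 bytes of the item that then becomes item 0 are
   cut away, its item header staying in place. */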
673void leaf_delete_items (struct buffer_info * cur_bi, int last_first,
674 int first, int del_num, int del_bytes)
675{
676 struct buffer_head * bh;
677 int item_amount = B_NR_ITEMS (bh = cur_bi->bi_bh);
678
679 RFALSE( !bh, "10155: bh is not defined");
680 RFALSE( del_num < 0, "10160: del_num can not be < 0. del_num==%d", del_num);
681 RFALSE( first < 0 || first + del_num > item_amount,
682 "10165: invalid number of first item to be deleted (%d) or "
683	    "not that many items (%d) to delete (only %d)",
684 first, first + del_num, item_amount);
685
686 if ( del_num == 0 )
687 return;
688
689 if ( first == 0 && del_num == item_amount && del_bytes == -1 ) {
690 make_empty_node (cur_bi);
691 do_balance_mark_leaf_dirty (cur_bi->tb, bh, 0);
692 return;
693 }
694
695 if ( del_bytes == -1 )
696 /* delete del_num items beginning from item in position first */
697 leaf_delete_items_entirely (cur_bi, first, del_num);
698 else {
699 if ( last_first == FIRST_TO_LAST ) {
700 /* delete del_num-1 items beginning from item in position first */
701 leaf_delete_items_entirely (cur_bi, first, del_num-1);
702
703 /* delete the part of the first item of the bh
704 do not delete item header
705 */
706 leaf_cut_from_buffer (cur_bi, 0, 0, del_bytes);
707 } else {
708 struct item_head * ih;
709 int len;
710
711 /* delete del_num-1 items beginning from item in position first+1 */
712 leaf_delete_items_entirely (cur_bi, first+1, del_num-1);
713
714 if (is_direntry_le_ih (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh)-1))) /* the last item is directory */
715	    /* len = number of directory entries in this item */
716 len = ih_entry_count(ih);
717 else
718 /* len = body len of item */
719 len = ih_item_len(ih);
720
721 /* delete the part of the last item of the bh
722 do not delete item header
723 */
724 leaf_cut_from_buffer (cur_bi, B_NR_ITEMS(bh)-1, len - del_bytes, del_bytes);
725 }
726 }
727}
728
729
730/* insert item into the leaf node at position 'before' */
731void leaf_insert_into_buf (struct buffer_info * bi, int before,
732 struct item_head * inserted_item_ih,
733 const char * inserted_item_body,
734 int zeros_number)
735{
736 struct buffer_head * bh = bi->bi_bh;
737 int nr, free_space;
738 struct block_head * blkh;
739 struct item_head * ih;
740 int i;
741 int last_loc, unmoved_loc;
742 char * to;
743
744
745 blkh = B_BLK_HEAD(bh);
746 nr = blkh_nr_item(blkh);
747 free_space = blkh_free_space( blkh );
748
749 /* check free space */
750 RFALSE( free_space < ih_item_len(inserted_item_ih) + IH_SIZE,
751 "vs-10170: not enough free space in block %z, new item %h",
752 bh, inserted_item_ih);
753 RFALSE( zeros_number > ih_item_len(inserted_item_ih),
754 "vs-10172: zero number == %d, item length == %d",
755 zeros_number, ih_item_len(inserted_item_ih));
756
757
758    /* get the item before which the new item must be inserted */
759 ih = B_N_PITEM_HEAD (bh, before);
760
761 /* prepare space for the body of new item */
762 last_loc = nr ? ih_location( &(ih[nr - before - 1]) ) : bh->b_size;
763 unmoved_loc = before ? ih_location( ih-1 ) : bh->b_size;
764
765
766 memmove (bh->b_data + last_loc - ih_item_len(inserted_item_ih),
767 bh->b_data + last_loc, unmoved_loc - last_loc);
768
769 to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih);
770 memset (to, 0, zeros_number);
771 to += zeros_number;
772
773 /* copy body to prepared space */
774 if (inserted_item_body)
775 memmove (to, inserted_item_body, ih_item_len(inserted_item_ih) - zeros_number);
776 else
777 memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number);
778
779 /* insert item header */
780 memmove (ih + 1, ih, IH_SIZE * (nr - before));
781 memmove (ih, inserted_item_ih, IH_SIZE);
782
783 /* change locations */
784 for (i = before; i < nr + 1; i ++)
785 {
786 unmoved_loc -= ih_item_len( &(ih[i-before]));
787 put_ih_location( &(ih[i-before]), unmoved_loc );
788 }
789
790 /* sizes, free space, item number */
791 set_blkh_nr_item( blkh, blkh_nr_item(blkh) + 1 );
792 set_blkh_free_space( blkh,
793 free_space - (IH_SIZE + ih_item_len(inserted_item_ih ) ) );
794 do_balance_mark_leaf_dirty (bi->tb, bh, 1);
795
796 if (bi->bi_parent) {
797 struct disk_child *t_dc;
798 t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position);
799 put_dc_size( t_dc, dc_size(t_dc) + (IH_SIZE + ih_item_len(inserted_item_ih)));
800 do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0);
801 }
802}
803
804
805/* paste paste_size bytes to the affected_item_num-th item.
806   When the item is a directory, this only prepares space for new entries */
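/* Usage sketch (the real call sites are in do_balan.c): appending len bytes
   to the tail of a direct item looks roughly like

       leaf_paste_in_buffer (bi, item_num, ih_item_len(ih), len, data, 0);

   For directory items the caller only reserves space here and then fills in
   the entry heads and records with leaf_paste_entries(). */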
807void leaf_paste_in_buffer (struct buffer_info * bi, int affected_item_num,
808 int pos_in_item, int paste_size,
809 const char * body,
810 int zeros_number)
811{
812 struct buffer_head * bh = bi->bi_bh;
813 int nr, free_space;
814 struct block_head * blkh;
815 struct item_head * ih;
816 int i;
817 int last_loc, unmoved_loc;
818
819 blkh = B_BLK_HEAD(bh);
820 nr = blkh_nr_item(blkh);
821 free_space = blkh_free_space(blkh);
822
823
824 /* check free space */
825 RFALSE( free_space < paste_size,
826 "vs-10175: not enough free space: needed %d, available %d",
827 paste_size, free_space);
828
829#ifdef CONFIG_REISERFS_CHECK
830 if (zeros_number > paste_size) {
831 print_cur_tb ("10177");
832	reiserfs_panic ( NULL, "vs-10177: leaf_paste_in_buffer: zeros number == %d, paste_size == %d",
833 zeros_number, paste_size);
834 }
835#endif /* CONFIG_REISERFS_CHECK */
836
837
838 /* item to be appended */
839 ih = B_N_PITEM_HEAD(bh, affected_item_num);
840
841 last_loc = ih_location( &(ih[nr - affected_item_num - 1]) );
842 unmoved_loc = affected_item_num ? ih_location( ih-1 ) : bh->b_size;
843
844 /* prepare space */
845 memmove (bh->b_data + last_loc - paste_size, bh->b_data + last_loc,
846 unmoved_loc - last_loc);
847
848
849 /* change locations */
850 for (i = affected_item_num; i < nr; i ++)
851 put_ih_location( &(ih[i-affected_item_num]),
852 ih_location( &(ih[i-affected_item_num])) - paste_size );
853
854 if ( body ) {
855 if (!is_direntry_le_ih (ih)) {
856 if (!pos_in_item) {
857 /* shift data to right */
858 memmove (bh->b_data + ih_location(ih) + paste_size,
859 bh->b_data + ih_location(ih), ih_item_len(ih));
860 /* paste data in the head of item */
861 memset (bh->b_data + ih_location(ih), 0, zeros_number);
862 memcpy (bh->b_data + ih_location(ih) + zeros_number, body, paste_size - zeros_number);
863 } else {
864 memset (bh->b_data + unmoved_loc - paste_size, 0, zeros_number);
865 memcpy (bh->b_data + unmoved_loc - paste_size + zeros_number, body, paste_size - zeros_number);
866 }
867 }
868 }
869 else
870 memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size);
871
872 put_ih_item_len( ih, ih_item_len(ih) + paste_size );
873
874 /* change free space */
875 set_blkh_free_space( blkh, free_space - paste_size );
876
877 do_balance_mark_leaf_dirty (bi->tb, bh, 0);
878
879 if (bi->bi_parent) {
880 struct disk_child *t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position);
881 put_dc_size( t_dc, dc_size(t_dc) + paste_size );
882 do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0);
883 }
884}
885
886
887/* cuts DEL_COUNT entries beginning from FROM-th entry. Directory item
888 does not have free space, so it moves DEHs and remaining records as
889 necessary. Return value is size of removed part of directory item
890 in bytes. */
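/* Layout reminder (added clarification): the body of a directory item is an
   array of struct reiserfs_de_head followed by the name records, stored in
   reverse order so that higher-numbered entries have lower offsets:

       [deh 0][deh 1]...[deh n-1][record n-1]...[record 1][record 0]

   Cutting entries therefore removes both deh slots and their records, then
   compacts the two regions towards each other. */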
891static int leaf_cut_entries (
892 struct buffer_head * bh,
893 struct item_head * ih,
894 int from,
895 int del_count
896 )
897{
898 char * item;
899 struct reiserfs_de_head * deh;
900    int prev_record_offset;	/* offset of the record that is (from-1)th */
901    char * prev_record;		/* pointer to that record */
902 int cut_records_len; /* length of all removed records */
903 int i;
904
905
906    /* make sure that the item is a directory item and there are enough
907       entries to remove */
908 RFALSE( !is_direntry_le_ih (ih), "10180: item is not directory item");
909 RFALSE( I_ENTRY_COUNT(ih) < from + del_count,
910	    "10185: item does not contain enough entries: entry_count = %d, from = %d, to delete = %d",
911 I_ENTRY_COUNT(ih), from, del_count);
912
913 if (del_count == 0)
914 return 0;
915
916 /* first byte of item */
917 item = bh->b_data + ih_location(ih);
918
919 /* entry head array */
920 deh = B_I_DEH (bh, ih);
921
922    /* first byte of the remaining entries that are BEFORE the cut entries
923       (prev_record) and length of all removed records (cut_records_len) */
924 prev_record_offset = (from ? deh_location( &(deh[from - 1])) : ih_item_len(ih));
925 cut_records_len = prev_record_offset/*from_record*/ -
926 deh_location( &(deh[from + del_count - 1]));
927 prev_record = item + prev_record_offset;
928
929
930 /* adjust locations of remaining entries */
931 for (i = I_ENTRY_COUNT(ih) - 1; i > from + del_count - 1; i --)
932 put_deh_location( &(deh[i]),
933 deh_location( &deh[i] ) - (DEH_SIZE * del_count ) );
934
935 for (i = 0; i < from; i ++)
936 put_deh_location( &(deh[i]),
937 deh_location( &deh[i] ) - (DEH_SIZE * del_count + cut_records_len) );
938
939 put_ih_entry_count( ih, ih_entry_count(ih) - del_count );
940
941    /* shift the entry head array and the entries that are AFTER the removed entries */
942 memmove ((char *)(deh + from),
943 deh + from + del_count,
944 prev_record - cut_records_len - (char *)(deh + from + del_count));
945
946    /* shift the records that are BEFORE the removed entries */
947 memmove (prev_record - cut_records_len - DEH_SIZE * del_count,
948 prev_record, item + ih_item_len(ih) - prev_record);
949
950 return DEH_SIZE * del_count + cut_records_len;
951}
952
953
954/* when cut item is part of regular file
955 pos_in_item - first byte that must be cut
956 cut_size - number of bytes to be cut beginning from pos_in_item
957
958 when cut item is part of directory
959 pos_in_item - number of first deleted entry
960 cut_size - count of deleted entries
961 */
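/* Example (a sketch of the parameter semantics): for a direct item,

       leaf_cut_from_buffer (bi, 2, 0, 100);

   drops the first 100 bytes of item 2 and advances its key offset by 100.
   For a directory item cut_size counts entries rather than bytes, and a cut
   at pos_in_item == 0 also re-keys the item by its new first entry (legal
   only when the item is the first one in the node). */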
962void leaf_cut_from_buffer (struct buffer_info * bi, int cut_item_num,
963 int pos_in_item, int cut_size)
964{
965 int nr;
966 struct buffer_head * bh = bi->bi_bh;
967 struct block_head * blkh;
968 struct item_head * ih;
969 int last_loc, unmoved_loc;
970 int i;
971
972 blkh = B_BLK_HEAD(bh);
973 nr = blkh_nr_item(blkh);
974
975 /* item head of truncated item */
976 ih = B_N_PITEM_HEAD (bh, cut_item_num);
977
978 if (is_direntry_le_ih (ih)) {
979	/* first, cut the entries */
980 cut_size = leaf_cut_entries (bh, ih, pos_in_item, cut_size);
981 if (pos_in_item == 0) {
982 /* change key */
983 RFALSE( cut_item_num,
984		"when the 0-th entry of an item is cut, that item must be first in the node, not %d-th", cut_item_num);
985 /* change item key by key of first entry in the item */
986 set_le_ih_k_offset (ih, deh_offset(B_I_DEH (bh, ih)));
987 /*memcpy (&ih->ih_key.k_offset, &(B_I_DEH (bh, ih)->deh_offset), SHORT_KEY_SIZE);*/
988 }
989 } else {
990 /* item is direct or indirect */
991 RFALSE( is_statdata_le_ih (ih), "10195: item is stat data");
992 RFALSE( pos_in_item && pos_in_item + cut_size != ih_item_len(ih),
993 "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)",
994 ( long unsigned ) pos_in_item, ( long unsigned ) cut_size,
995 ( long unsigned ) ih_item_len (ih));
996
997 /* shift item body to left if cut is from the head of item */
998 if (pos_in_item == 0) {
999 memmove( bh->b_data + ih_location(ih),
1000 bh->b_data + ih_location(ih) + cut_size,
1001 ih_item_len(ih) - cut_size);
1002
1003 /* change key of item */
1004 if (is_direct_le_ih (ih))
1005 set_le_ih_k_offset (ih, le_ih_k_offset (ih) + cut_size);
1006 else {
1007 set_le_ih_k_offset (ih, le_ih_k_offset (ih) + (cut_size / UNFM_P_SIZE) * bh->b_size);
1008 RFALSE( ih_item_len(ih) == cut_size && get_ih_free_space (ih),
1009 "10205: invalid ih_free_space (%h)", ih);
1010 }
1011 }
1012 }
1013
1014
1015 /* location of the last item */
1016 last_loc = ih_location( &(ih[nr - cut_item_num - 1]) );
1017
1018 /* location of the item, which is remaining at the same place */
1019 unmoved_loc = cut_item_num ? ih_location(ih-1) : bh->b_size;
1020
1021
1022 /* shift */
1023 memmove (bh->b_data + last_loc + cut_size, bh->b_data + last_loc,
1024 unmoved_loc - last_loc - cut_size);
1025
1026 /* change item length */
1027 put_ih_item_len( ih, ih_item_len(ih) - cut_size );
1028
1029 if (is_indirect_le_ih (ih)) {
1030 if (pos_in_item)
1031 set_ih_free_space (ih, 0);
1032 }
1033
1034 /* change locations */
1035 for (i = cut_item_num; i < nr; i ++)
1036 put_ih_location( &(ih[i-cut_item_num]), ih_location( &ih[i-cut_item_num]) + cut_size );
1037
1038 /* size, free space */
1039 set_blkh_free_space( blkh, blkh_free_space(blkh) + cut_size );
1040
1041 do_balance_mark_leaf_dirty (bi->tb, bh, 0);
1042
1043 if (bi->bi_parent) {
1044 struct disk_child *t_dc;
1045 t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position);
1046 put_dc_size( t_dc, dc_size(t_dc) - cut_size );
1047 do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0);
1048 }
1049}
1050
1051
1052/* delete del_num items from the buffer, starting from item number 'first' */
1053static void leaf_delete_items_entirely (struct buffer_info * bi,
1054 int first, int del_num)
1055{
1056 struct buffer_head * bh = bi->bi_bh;
1057 int nr;
1058 int i, j;
1059 int last_loc, last_removed_loc;
1060 struct block_head * blkh;
1061 struct item_head * ih;
1062
1063 RFALSE( bh == NULL, "10210: buffer is 0");
1064 RFALSE( del_num < 0, "10215: del_num less than 0 (%d)", del_num);
1065
1066 if (del_num == 0)
1067 return;
1068
1069 blkh = B_BLK_HEAD(bh);
1070 nr = blkh_nr_item(blkh);
1071
1072 RFALSE( first < 0 || first + del_num > nr,
1073	    "10220: first=%d, number=%d, there are only %d items", first, del_num, nr);
1074
1075 if (first == 0 && del_num == nr) {
1076 /* this does not work */
1077 make_empty_node (bi);
1078
1079 do_balance_mark_leaf_dirty (bi->tb, bh, 0);
1080 return;
1081 }
1082
1083 ih = B_N_PITEM_HEAD (bh, first);
1084
1085 /* location of unmovable item */
1086 j = (first == 0) ? bh->b_size : ih_location(ih-1);
1087
1088 /* delete items */
1089 last_loc = ih_location( &(ih[nr-1-first]) );
1090 last_removed_loc = ih_location( &(ih[del_num-1]) );
1091
1092 memmove (bh->b_data + last_loc + j - last_removed_loc,
1093 bh->b_data + last_loc, last_removed_loc - last_loc);
1094
1095 /* delete item headers */
1096 memmove (ih, ih + del_num, (nr - first - del_num) * IH_SIZE);
1097
1098 /* change item location */
1099 for (i = first; i < nr - del_num; i ++)
1100 put_ih_location( &(ih[i-first]), ih_location( &(ih[i-first]) ) + (j - last_removed_loc) );
1101
1102 /* sizes, item number */
1103 set_blkh_nr_item( blkh, blkh_nr_item(blkh) - del_num );
1104 set_blkh_free_space( blkh, blkh_free_space(blkh) + (j - last_removed_loc + IH_SIZE * del_num) );
1105
1106 do_balance_mark_leaf_dirty (bi->tb, bh, 0);
1107
1108 if (bi->bi_parent) {
1109 struct disk_child *t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position);
1110 put_dc_size( t_dc, dc_size(t_dc) -
1111 (j - last_removed_loc + IH_SIZE * del_num));
1112 do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0);
1113 }
1114}
1115
1116
1117
1118
1119
1120/* paste new_entry_count entries (new_dehs, records) at position 'before' within the item_num-th item */
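/* Call-site sketch (cf. the pasting code in do_balan.c): space is reserved
   first, then the entry heads and records are filled in, roughly

       leaf_paste_in_buffer (bi, item_num, pos, DEH_SIZE * n + records_len, NULL, 0);
       leaf_paste_entries (bi->bi_bh, item_num, before, n, new_dehs, records,
                           DEH_SIZE * n + records_len);
*/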
1121void leaf_paste_entries (
1122 struct buffer_head * bh,
1123 int item_num,
1124 int before,
1125 int new_entry_count,
1126 struct reiserfs_de_head * new_dehs,
1127 const char * records,
1128 int paste_size
1129 )
1130{
1131 struct item_head * ih;
1132 char * item;
1133 struct reiserfs_de_head * deh;
1134 char * insert_point;
1135 int i, old_entry_num;
1136
1137 if (new_entry_count == 0)
1138 return;
1139
1140 ih = B_N_PITEM_HEAD(bh, item_num);
1141
1142    /* make sure that the item is a directory item and there are enough records in it */
1143 RFALSE( !is_direntry_le_ih (ih), "10225: item is not directory item");
1144 RFALSE( I_ENTRY_COUNT (ih) < before,
1145	    "10230: there is no entry to paste before: entry_count = %d, before = %d",
1146 I_ENTRY_COUNT (ih), before);
1147
1148
1149 /* first byte of dest item */
1150 item = bh->b_data + ih_location(ih);
1151
1152 /* entry head array */
1153 deh = B_I_DEH (bh, ih);
1154
1155 /* new records will be pasted at this point */
1156 insert_point = item + (before ? deh_location( &(deh[before - 1])) : (ih_item_len(ih) - paste_size));
1157
1158 /* adjust locations of records that will be AFTER new records */
1159 for (i = I_ENTRY_COUNT(ih) - 1; i >= before; i --)
1160 put_deh_location( &(deh[i]),
1161 deh_location(&(deh[i])) + (DEH_SIZE * new_entry_count ));
1162
1163 /* adjust locations of records that will be BEFORE new records */
1164 for (i = 0; i < before; i ++)
1165 put_deh_location( &(deh[i]), deh_location(&(deh[i])) + paste_size );
1166
1167 old_entry_num = I_ENTRY_COUNT(ih);
1168 put_ih_entry_count( ih, ih_entry_count(ih) + new_entry_count );
1169
1170 /* prepare space for pasted records */
1171 memmove (insert_point + paste_size, insert_point, item + (ih_item_len(ih) - paste_size) - insert_point);
1172
1173 /* copy new records */
1174 memcpy (insert_point + DEH_SIZE * new_entry_count, records,
1175 paste_size - DEH_SIZE * new_entry_count);
1176
1177 /* prepare space for new entry heads */
1178 deh += before;
1179 memmove ((char *)(deh + new_entry_count), deh, insert_point - (char *)deh);
1180
1181 /* copy new entry heads */
1182 deh = (struct reiserfs_de_head *)((char *)deh);
1183 memcpy (deh, new_dehs, DEH_SIZE * new_entry_count);
1184
1185 /* set locations of new records */
1186 for (i = 0; i < new_entry_count; i ++)
1187 {
1188 put_deh_location( &(deh[i]),
1189 deh_location( &(deh[i] )) +
1190 (- deh_location( &(new_dehs[new_entry_count - 1])) +
1191 insert_point + DEH_SIZE * new_entry_count - item));
1192 }
1193
1194
1195    /* change the item key if necessary (when we paste before the 0-th entry) */
1196 if (!before)
1197 {
1198 set_le_ih_k_offset (ih, deh_offset(new_dehs));
1199/* memcpy (&ih->ih_key.k_offset,
1200 &new_dehs->deh_offset, SHORT_KEY_SIZE);*/
1201 }
1202
1203#ifdef CONFIG_REISERFS_CHECK
1204 {
1205 int prev, next;
1206 /* check record locations */
1207 deh = B_I_DEH (bh, ih);
1208 for (i = 0; i < I_ENTRY_COUNT(ih); i ++) {
1209 next = (i < I_ENTRY_COUNT(ih) - 1) ? deh_location( &(deh[i + 1])) : 0;
1210 prev = (i != 0) ? deh_location( &(deh[i - 1]) ) : 0;
1211
1212 if (prev && prev <= deh_location( &(deh[i])))
1213 reiserfs_warning (NULL, "vs-10240: leaf_paste_entries: directory item (%h) corrupted (prev %a, cur(%d) %a)",
1214 ih, deh + i - 1, i, deh + i);
1215 if (next && next >= deh_location( &(deh[i])))
1216 reiserfs_warning (NULL, "vs-10250: leaf_paste_entries: directory item (%h) corrupted (cur(%d) %a, next %a)",
1217 ih, i, deh + i, deh + i + 1);
1218 }
1219 }
1220#endif
1221
1222}
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
new file mode 100644
index 000000000000..80e92d9b81cb
--- /dev/null
+++ b/fs/reiserfs/namei.c
@@ -0,0 +1,1491 @@
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 *
4 * Trivial changes by Alan Cox to remove EHASHCOLLISION for compatibility
5 *
6 * Trivial Changes:
7 * Rights granted to Hans Reiser to redistribute under other terms providing
8 * he accepts all liability including but not limited to patent, fitness
9 * for purpose, and direct or indirect claims arising from failure to perform.
10 *
11 * NO WARRANTY
12 */
13
14#include <linux/config.h>
15#include <linux/time.h>
16#include <linux/bitops.h>
17#include <linux/reiserfs_fs.h>
18#include <linux/reiserfs_acl.h>
19#include <linux/reiserfs_xattr.h>
20#include <linux/smp_lock.h>
21#include <linux/quotaops.h>
22
23#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { i->i_nlink++; if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; }
24#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) i->i_nlink--;
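/* Note (added clarification): for directories i_nlink == 1 acts as a "link
   count saturated, no longer tracked" marker, which is why both macros above
   leave an inode with i_nlink == 1 untouched. */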
25
26// directory item contains array of entry headers. This performs
27// binary search through that array
28static int bin_search_in_dir_item (struct reiserfs_dir_entry * de, loff_t off)
29{
30 struct item_head * ih = de->de_ih;
31 struct reiserfs_de_head * deh = de->de_deh;
32 int rbound, lbound, j;
33
34 lbound = 0;
35 rbound = I_ENTRY_COUNT (ih) - 1;
36
37 for (j = (rbound + lbound) / 2; lbound <= rbound; j = (rbound + lbound) / 2) {
38 if (off < deh_offset (deh + j)) {
39 rbound = j - 1;
40 continue;
41 }
42 if (off > deh_offset (deh + j)) {
43 lbound = j + 1;
44 continue;
45 }
46	// not the name we are looking for, but the third key component matched
47 de->de_entry_num = j;
48 return NAME_FOUND;
49 }
50
51 de->de_entry_num = lbound;
52 return NAME_NOT_FOUND;
53}
54
55
56// set de to point to the directory entry that the path points to
57static inline void set_de_item_location (struct reiserfs_dir_entry * de, struct path * path)
58{
59 de->de_bh = get_last_bh (path);
60 de->de_ih = get_ih (path);
61 de->de_deh = B_I_DEH (de->de_bh, de->de_ih);
62 de->de_item_num = PATH_LAST_POSITION (path);
63}
64
65
66// de_bh, de_ih, de_deh (points to first element of array), de_item_num is set
67inline void set_de_name_and_namelen (struct reiserfs_dir_entry * de)
68{
69 struct reiserfs_de_head * deh = de->de_deh + de->de_entry_num;
70
71 if (de->de_entry_num >= ih_entry_count (de->de_ih))
72 BUG ();
73
74 de->de_entrylen = entry_length (de->de_bh, de->de_ih, de->de_entry_num);
75 de->de_namelen = de->de_entrylen - (de_with_sd (deh) ? SD_SIZE : 0);
76 de->de_name = B_I_PITEM (de->de_bh, de->de_ih) + deh_location(deh);
77 if (de->de_name[de->de_namelen - 1] == 0)
78 de->de_namelen = strlen (de->de_name);
79}
80
81
82// what entry points to
83static inline void set_de_object_key (struct reiserfs_dir_entry * de)
84{
85 if (de->de_entry_num >= ih_entry_count (de->de_ih))
86 BUG ();
87 de->de_dir_id = deh_dir_id( &(de->de_deh[de->de_entry_num]));
88 de->de_objectid = deh_objectid( &(de->de_deh[de->de_entry_num]));
89}
90
91
92static inline void store_de_entry_key (struct reiserfs_dir_entry * de)
93{
94 struct reiserfs_de_head * deh = de->de_deh + de->de_entry_num;
95
96 if (de->de_entry_num >= ih_entry_count (de->de_ih))
97 BUG ();
98
99 /* store key of the found entry */
100 de->de_entry_key.version = KEY_FORMAT_3_5;
101 de->de_entry_key.on_disk_key.k_dir_id = le32_to_cpu (de->de_ih->ih_key.k_dir_id);
102 de->de_entry_key.on_disk_key.k_objectid = le32_to_cpu (de->de_ih->ih_key.k_objectid);
103 set_cpu_key_k_offset (&(de->de_entry_key), deh_offset (deh));
104 set_cpu_key_k_type (&(de->de_entry_key), TYPE_DIRENTRY);
105}
106
107
108/* We assign a key to each directory item, and place multiple entries
109in a single directory item. A directory item has a key equal to the
110key of the first directory entry in it.
111
112This function first calls search_by_key; then, if an item whose first
113entry matches the key is not found, it looks for the entry inside the
114directory item found by search_by_key. It fills in the path to the entry
115and the entry position within the item.
116
117*/
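/* Key layout reminder (added clarification): a directory entry key is
   (dir_id, objectid, offset, TYPE_DIRENTRY), where the offset packs the
   name hash together with a per-collision generation number -- see
   get_third_component() below. */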
118
119/* The function is NOT SCHEDULE-SAFE! */
120int search_by_entry_key (struct super_block * sb, const struct cpu_key * key,
121 struct path * path, struct reiserfs_dir_entry * de)
122{
123 int retval;
124
125 retval = search_item (sb, key, path);
126 switch (retval) {
127 case ITEM_NOT_FOUND:
128 if (!PATH_LAST_POSITION (path)) {
129 reiserfs_warning (sb, "vs-7000: search_by_entry_key: search_by_key returned item position == 0");
130 pathrelse(path) ;
131 return IO_ERROR ;
132 }
133 PATH_LAST_POSITION (path) --;
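	/* fall through: the entry, if it exists, is inside the preceding item */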
134
135 case ITEM_FOUND:
136 break;
137
138 case IO_ERROR:
139 return retval;
140
141 default:
142 pathrelse (path);
143 reiserfs_warning (sb, "vs-7002: search_by_entry_key: no path to here");
144 return IO_ERROR;
145 }
146
147 set_de_item_location (de, path);
148
149#ifdef CONFIG_REISERFS_CHECK
150 if (!is_direntry_le_ih (de->de_ih) ||
151 COMP_SHORT_KEYS (&(de->de_ih->ih_key), key)) {
152 print_block (de->de_bh, 0, -1, -1);
153 reiserfs_panic (sb, "vs-7005: search_by_entry_key: found item %h is not directory item or "
154 "does not belong to the same directory as key %K", de->de_ih, key);
155 }
156#endif /* CONFIG_REISERFS_CHECK */
157
158    /* binary search in the directory item by the third component of the
159       key. Sets de->de_entry_num of de */
160 retval = bin_search_in_dir_item (de, cpu_key_k_offset (key));
161 path->pos_in_item = de->de_entry_num;
162 if (retval != NAME_NOT_FOUND) {
163 // ugly, but rename needs de_bh, de_deh, de_name, de_namelen, de_objectid set
164 set_de_name_and_namelen (de);
165 set_de_object_key (de);
166 }
167 return retval;
168}
169
170
171
172/* Keyed 32-bit hash function using TEA in a Davis-Meyer function */
173
174/* The third component is hashed, and you can choose from more than
175 one hash function. Per directory hashes are not yet implemented
176 but are thought about. This function should be moved to hashes.c
177 Jedi, please do so. -Hans */
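/* Offset layout sketch (following the GET_HASH_VALUE/GET_GENERATION_NUMBER
   split used below):

       bit 31     bits 30..7           bits 6..0
       [unused] [24-bit hash value] [generation number]

   get_third_component() returns the hash part with the generation field
   preset to MAX_GENERATION_NUMBER; reiserfs_add_entry() later substitutes
   the first free generation number for that hash. */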
178
179static __u32 get_third_component (struct super_block * s,
180 const char * name, int len)
181{
182 __u32 res;
183
184 if (!len || (len == 1 && name[0] == '.'))
185 return DOT_OFFSET;
186 if (len == 2 && name[0] == '.' && name[1] == '.')
187 return DOT_DOT_OFFSET;
188
189 res = REISERFS_SB(s)->s_hash_function (name, len);
190
191 // take bits from 7-th to 30-th including both bounds
192 res = GET_HASH_VALUE(res);
193 if (res == 0)
194	// needed to have no names before "." and "..", which have hash
195	// value == 0 and generation counters 1 and 2 respectively
196 res = 128;
197 return res + MAX_GENERATION_NUMBER;
198}
199
200
201static int reiserfs_match (struct reiserfs_dir_entry * de,
202 const char * name, int namelen)
203{
204 int retval = NAME_NOT_FOUND;
205
206 if ((namelen == de->de_namelen) &&
207 !memcmp(de->de_name, name, de->de_namelen))
208 retval = (de_visible (de->de_deh + de->de_entry_num) ? NAME_FOUND : NAME_FOUND_INVISIBLE);
209
210 return retval;
211}
212
213
214/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */
215
216 /* used when hash collisions exist */
217
218
219static int linear_search_in_dir_item (struct cpu_key * key, struct reiserfs_dir_entry * de,
220 const char * name, int namelen)
221{
222 struct reiserfs_de_head * deh = de->de_deh;
223 int retval;
224 int i;
225
226 i = de->de_entry_num;
227
228 if (i == I_ENTRY_COUNT (de->de_ih) ||
229 GET_HASH_VALUE (deh_offset (deh + i)) != GET_HASH_VALUE (cpu_key_k_offset (key))) {
230 i --;
231 }
232
233 RFALSE( de->de_deh != B_I_DEH (de->de_bh, de->de_ih),
234 "vs-7010: array of entry headers not found");
235
236 deh += i;
237
238 for (; i >= 0; i --, deh --) {
239 if (GET_HASH_VALUE (deh_offset (deh)) !=
240 GET_HASH_VALUE (cpu_key_k_offset (key))) {
241 // hash value does not match, no need to check whole name
242 return NAME_NOT_FOUND;
243 }
244
245 /* mark, that this generation number is used */
246 if (de->de_gen_number_bit_string)
247 set_bit (GET_GENERATION_NUMBER (deh_offset (deh)), (unsigned long *)de->de_gen_number_bit_string);
248
249 // calculate pointer to name and namelen
250 de->de_entry_num = i;
251 set_de_name_and_namelen (de);
252
253 if ((retval = reiserfs_match (de, name, namelen)) != NAME_NOT_FOUND) {
254 // de's de_name, de_namelen, de_recordlen are set. Fill the rest:
255
256 // key of pointed object
257 set_de_object_key (de);
258
259 store_de_entry_key (de);
260
261 // retval can be NAME_FOUND or NAME_FOUND_INVISIBLE
262 return retval;
263 }
264 }
265
266 if (GET_GENERATION_NUMBER (le_ih_k_offset (de->de_ih)) == 0)
267	/* we have reached the leftmost entry in the node. In general we
268	   would have to go to the left neighbor, but if the generation
269	   counter is already 0, we know for sure that there is no name
270	   with the same hash value */
271	// FIXME: this works correctly only because the hash value cannot
272	// be 0. Btw, in case of Yura's hash it is probably possible,
273	// so, this is a bug
274 return NAME_NOT_FOUND;
275
276 RFALSE( de->de_item_num,
277 "vs-7015: two diritems of the same directory in one node?");
278
279 return GOTO_PREVIOUS_ITEM;
280}
281
282
283// may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND
284// FIXME: should add something like IOERROR
285static int reiserfs_find_entry (struct inode * dir, const char * name, int namelen,
286 struct path * path_to_entry, struct reiserfs_dir_entry * de)
287{
288 struct cpu_key key_to_search;
289 int retval;
290
291
292 if (namelen > REISERFS_MAX_NAME (dir->i_sb->s_blocksize))
293 return NAME_NOT_FOUND;
294
295 /* we will search for this key in the tree */
296 make_cpu_key (&key_to_search, dir,
297 get_third_component (dir->i_sb, name, namelen), TYPE_DIRENTRY, 3);
298
299 while (1) {
300 retval = search_by_entry_key (dir->i_sb, &key_to_search, path_to_entry, de);
301 if (retval == IO_ERROR) {
302 reiserfs_warning (dir->i_sb, "zam-7001: io error in %s",
303 __FUNCTION__);
304 return IO_ERROR;
305 }
306
307 /* compare names for all entries having given hash value */
308 retval = linear_search_in_dir_item (&key_to_search, de, name, namelen);
309 if (retval != GOTO_PREVIOUS_ITEM) {
310 /* there is no need to scan directory anymore. Given entry found or does not exist */
311 path_to_entry->pos_in_item = de->de_entry_num;
312 return retval;
313 }
314
315 /* there is left neighboring item of this directory and given entry can be there */
316 set_cpu_key_k_offset (&key_to_search, le_ih_k_offset (de->de_ih) - 1);
317 pathrelse (path_to_entry);
318
319 } /* while (1) */
320}
321
322
323static struct dentry * reiserfs_lookup (struct inode * dir, struct dentry * dentry, struct nameidata *nd)
324{
325 int retval;
326 struct inode * inode = NULL;
327 struct reiserfs_dir_entry de;
328 INITIALIZE_PATH (path_to_entry);
329
330 if (REISERFS_MAX_NAME (dir->i_sb->s_blocksize) < dentry->d_name.len)
331 return ERR_PTR(-ENAMETOOLONG);
332
333 reiserfs_write_lock(dir->i_sb);
334 de.de_gen_number_bit_string = NULL;
335 retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path_to_entry, &de);
336 pathrelse (&path_to_entry);
337 if (retval == NAME_FOUND) {
338 /* Hide the .reiserfs_priv directory */
339 if (reiserfs_xattrs (dir->i_sb) &&
340 !old_format_only(dir->i_sb) &&
341 REISERFS_SB(dir->i_sb)->priv_root &&
342 REISERFS_SB(dir->i_sb)->priv_root->d_inode &&
343 de.de_objectid == le32_to_cpu (INODE_PKEY(REISERFS_SB(dir->i_sb)->priv_root->d_inode)->k_objectid)) {
344 reiserfs_write_unlock (dir->i_sb);
345 return ERR_PTR (-EACCES);
346 }
347
348 inode = reiserfs_iget (dir->i_sb, (struct cpu_key *)&(de.de_dir_id));
349 if (!inode || IS_ERR(inode)) {
350 reiserfs_write_unlock(dir->i_sb);
351 return ERR_PTR(-EACCES);
352 }
353
354	    /* Propagate the priv_object flag so we know we're in the priv tree */
355 if (is_reiserfs_priv_object (dir))
356 reiserfs_mark_inode_private (inode);
357 }
358 reiserfs_write_unlock(dir->i_sb);
359 if ( retval == IO_ERROR ) {
360 return ERR_PTR(-EIO);
361 }
362
363 if (inode)
364 return d_splice_alias(inode, dentry);
365
366 d_add(dentry, inode);
367 return NULL;
368}
369
370
371/*
372** looks up the dentry of the parent directory for child.
373** taken from ext2_get_parent
374*/
375struct dentry *reiserfs_get_parent(struct dentry *child)
376{
377 int retval;
378 struct inode * inode = NULL;
379 struct reiserfs_dir_entry de;
380 INITIALIZE_PATH (path_to_entry);
381 struct dentry *parent;
382 struct inode *dir = child->d_inode ;
383
384
385 if (dir->i_nlink == 0) {
386 return ERR_PTR(-ENOENT);
387 }
388 de.de_gen_number_bit_string = NULL;
389
390 reiserfs_write_lock(dir->i_sb);
391 retval = reiserfs_find_entry (dir, "..", 2, &path_to_entry, &de);
392 pathrelse (&path_to_entry);
393 if (retval != NAME_FOUND) {
394 reiserfs_write_unlock(dir->i_sb);
395 return ERR_PTR(-ENOENT);
396 }
397 inode = reiserfs_iget (dir->i_sb, (struct cpu_key *)&(de.de_dir_id));
398 reiserfs_write_unlock(dir->i_sb);
399
400 if (!inode || IS_ERR(inode)) {
401 return ERR_PTR(-EACCES);
402 }
403 parent = d_alloc_anon(inode);
404 if (!parent) {
405 iput(inode);
406 parent = ERR_PTR(-ENOMEM);
407 }
408 return parent;
409}
410
411
412/* add entry to the directory (entry can be hidden).
413
414insert definition of when hidden directories are used here -Hans
415
416   Does not mark the dir inode dirty; do that after a successful call to it */
417
418static int reiserfs_add_entry (struct reiserfs_transaction_handle *th, struct inode * dir,
419 const char * name, int namelen, struct inode * inode,
420 int visible)
421{
422 struct cpu_key entry_key;
423 struct reiserfs_de_head * deh;
424 INITIALIZE_PATH (path);
425 struct reiserfs_dir_entry de;
426 int bit_string [MAX_GENERATION_NUMBER / (sizeof(int) * 8) + 1];
427 int gen_number;
428 char small_buf[32+DEH_SIZE] ; /* 48 bytes now and we avoid kmalloc
429 if we create file with short name */
430 char * buffer;
431 int buflen, paste_size;
432 int retval;
433
434 BUG_ON (!th->t_trans_id);
435
436 /* cannot allow items to be added into a busy deleted directory */
437 if (!namelen)
438 return -EINVAL;
439
440 if (namelen > REISERFS_MAX_NAME (dir->i_sb->s_blocksize))
441 return -ENAMETOOLONG;
442
443 /* each entry has unique key. compose it */
444 make_cpu_key (&entry_key, dir,
445 get_third_component (dir->i_sb, name, namelen), TYPE_DIRENTRY, 3);
446
447 /* get memory for composing the entry */
448 buflen = DEH_SIZE + ROUND_UP (namelen);
449 if (buflen > sizeof (small_buf)) {
450 buffer = reiserfs_kmalloc (buflen, GFP_NOFS, dir->i_sb);
451 if (buffer == 0)
452 return -ENOMEM;
453 } else
454 buffer = small_buf;
455
456 paste_size = (get_inode_sd_version (dir) == STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen;
457
458 /* fill buffer : directory entry head, name[, dir objectid | , stat data | ,stat data, dir objectid ] */
459 deh = (struct reiserfs_de_head *)buffer;
460 deh->deh_location = 0; /* JDM Endian safe if 0 */
461 put_deh_offset( deh, cpu_key_k_offset( &entry_key ) );
462 deh->deh_state = 0; /* JDM Endian safe if 0 */
463 /* put key (ino analog) to de */
464 deh->deh_dir_id = INODE_PKEY (inode)->k_dir_id; /* safe: k_dir_id is le */
465 deh->deh_objectid = INODE_PKEY (inode)->k_objectid; /* safe: k_objectid is le */
466
467 /* copy name */
468 memcpy ((char *)(deh + 1), name, namelen);
469    /* pad with 0s to the 4 byte boundary */
470 padd_item ((char *)(deh + 1), ROUND_UP (namelen), namelen);
471
472 /* entry is ready to be pasted into tree, set 'visibility' and 'stat data in entry' attributes */
473 mark_de_without_sd (deh);
474 visible ? mark_de_visible (deh) : mark_de_hidden (deh);
475
476 /* find the proper place for the new entry */
477 memset (bit_string, 0, sizeof (bit_string));
478 de.de_gen_number_bit_string = (char *)bit_string;
479 retval = reiserfs_find_entry (dir, name, namelen, &path, &de);
480 if( retval != NAME_NOT_FOUND ) {
481 if (buffer != small_buf)
482 reiserfs_kfree (buffer, buflen, dir->i_sb);
483 pathrelse (&path);
484
485 if ( retval == IO_ERROR ) {
486 return -EIO;
487 }
488
489 if (retval != NAME_FOUND) {
490 reiserfs_warning (dir->i_sb, "zam-7002:%s: \"reiserfs_find_entry\" "
491 "has returned unexpected value (%d)",
492 __FUNCTION__, retval);
493 }
494
495 return -EEXIST;
496 }
497
498 gen_number = find_first_zero_bit ((unsigned long *)bit_string, MAX_GENERATION_NUMBER + 1);
499 if (gen_number > MAX_GENERATION_NUMBER) {
500 /* there is no free generation number */
501 reiserfs_warning (dir->i_sb, "reiserfs_add_entry: Congratulations! we have got hash function screwed up");
502 if (buffer != small_buf)
503 reiserfs_kfree (buffer, buflen, dir->i_sb);
504 pathrelse (&path);
505 return -EBUSY;
506 }
507    /* adjust the offset of the directory entry */
508 put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number));
509 set_cpu_key_k_offset (&entry_key, deh_offset(deh));
510
511 /* update max-hash-collisions counter in reiserfs_sb_info */
512 PROC_INFO_MAX( th -> t_super, max_hash_collisions, gen_number );
513
514 if (gen_number != 0) { /* we need to re-search for the insertion point */
515 if (search_by_entry_key (dir->i_sb, &entry_key, &path, &de) != NAME_NOT_FOUND) {
516 reiserfs_warning (dir->i_sb, "vs-7032: reiserfs_add_entry: "
517 "entry with this key (%K) already exists",
518 &entry_key);
519
520 if (buffer != small_buf)
521 reiserfs_kfree (buffer, buflen, dir->i_sb);
522 pathrelse (&path);
523 return -EBUSY;
524 }
525 }
526
527 /* perform the insertion of the entry that we have prepared */
528 retval = reiserfs_paste_into_item (th, &path, &entry_key, dir, buffer, paste_size);
529 if (buffer != small_buf)
530 reiserfs_kfree (buffer, buflen, dir->i_sb);
531 if (retval) {
532 reiserfs_check_path(&path) ;
533 return retval;
534 }
535
536 dir->i_size += paste_size;
537 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
538 if (!S_ISDIR (inode->i_mode) && visible)
539 // reiserfs_mkdir or reiserfs_rename will do that by itself
540 reiserfs_update_sd (th, dir);
541
542 reiserfs_check_path(&path) ;
543 return 0;
544}
545
546/* quota utility function, call if you've had to abort after calling
547** new_inode_init, and have not called reiserfs_new_inode yet.
548** This should only be called on inodes that do not have stat data
549** inserted into the tree yet.
550*/
551static int drop_new_inode(struct inode *inode) {
552 DQUOT_DROP(inode);
553 make_bad_inode(inode) ;
554 inode->i_flags |= S_NOQUOTA;
555 iput(inode) ;
556 return 0 ;
557}
558
559/* utility function that does setup for reiserfs_new_inode.
560** DQUOT_INIT needs lots of credits so it's better to have it
561** outside of a transaction, so we had to pull some bits of
562** reiserfs_new_inode out into this func.
563*/
564static int new_inode_init(struct inode *inode, struct inode *dir, int mode) {
565
566 /* the quota init calls have to know who to charge the quota to, so
567 ** we have to set uid and gid here
568 */
569 inode->i_uid = current->fsuid;
570 inode->i_mode = mode;
571
572 if (dir->i_mode & S_ISGID) {
573 inode->i_gid = dir->i_gid;
574 if (S_ISDIR(mode))
575 inode->i_mode |= S_ISGID;
576 } else {
577 inode->i_gid = current->fsgid;
578 }
579 DQUOT_INIT(inode);
580 return 0 ;
581}
582
583static int reiserfs_create (struct inode * dir, struct dentry *dentry, int mode,
584 struct nameidata *nd)
585{
586 int retval;
587 struct inode * inode;
588 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
589 int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS);
590 struct reiserfs_transaction_handle th ;
591 int locked;
592
593 if (!(inode = new_inode(dir->i_sb))) {
594 return -ENOMEM ;
595 }
596 new_inode_init(inode, dir, mode);
597
598 locked = reiserfs_cache_default_acl (dir);
599
600 reiserfs_write_lock(dir->i_sb);
601
602 if (locked)
603 reiserfs_write_lock_xattrs (dir->i_sb);
604
605 retval = journal_begin(&th, dir->i_sb, jbegin_count);
606 if (retval) {
607 drop_new_inode (inode);
608 goto out_failed;
609 }
610
611 retval = reiserfs_new_inode (&th, dir, mode, 0, 0/*i_size*/, dentry, inode);
612 if (retval)
613 goto out_failed;
614
615 if (locked) {
616 reiserfs_write_unlock_xattrs (dir->i_sb);
617 locked = 0;
618 }
619
620 inode->i_op = &reiserfs_file_inode_operations;
621 inode->i_fop = &reiserfs_file_operations;
622 inode->i_mapping->a_ops = &reiserfs_address_space_operations ;
623
624 retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len,
625 inode, 1/*visible*/);
626 if (retval) {
627 int err;
628 inode->i_nlink--;
629 reiserfs_update_sd (&th, inode);
630 err = journal_end(&th, dir->i_sb, jbegin_count) ;
631 if (err)
632 retval = err;
633 iput (inode);
634 goto out_failed;
635 }
636 reiserfs_update_inode_transaction(inode) ;
637 reiserfs_update_inode_transaction(dir) ;
638
639 d_instantiate(dentry, inode);
640 retval = journal_end(&th, dir->i_sb, jbegin_count) ;
641
642out_failed:
643 if (locked)
644 reiserfs_write_unlock_xattrs (dir->i_sb);
645 reiserfs_write_unlock(dir->i_sb);
646 return retval;
647}
648
649
650static int reiserfs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev)
651{
652 int retval;
653 struct inode * inode;
654 struct reiserfs_transaction_handle th ;
655 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
656 int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS);
657 int locked;
658
659 if (!new_valid_dev(rdev))
660 return -EINVAL;
661
662 if (!(inode = new_inode(dir->i_sb))) {
663 return -ENOMEM ;
664 }
665 new_inode_init(inode, dir, mode);
666
667 locked = reiserfs_cache_default_acl (dir);
668
669 reiserfs_write_lock(dir->i_sb);
670
671 if (locked)
672 reiserfs_write_lock_xattrs (dir->i_sb);
673
674 retval = journal_begin(&th, dir->i_sb, jbegin_count) ;
675 if (retval) {
676 drop_new_inode (inode);
677 goto out_failed;
678 }
679
680 retval = reiserfs_new_inode (&th, dir, mode, NULL, 0/*i_size*/, dentry, inode);
681 if (retval) {
682 goto out_failed;
683 }
684
685 if (locked) {
686 reiserfs_write_unlock_xattrs (dir->i_sb);
687 locked = 0;
688 }
689
690
691 inode->i_op = &reiserfs_special_inode_operations;
692 init_special_inode(inode, inode->i_mode, rdev) ;
693
694 //FIXME: needed for block and char devices only
695 reiserfs_update_sd (&th, inode);
696
697 reiserfs_update_inode_transaction(inode) ;
698 reiserfs_update_inode_transaction(dir) ;
699
700 retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len,
701 inode, 1/*visible*/);
702 if (retval) {
703 int err;
704 inode->i_nlink--;
705 reiserfs_update_sd (&th, inode);
706 err = journal_end(&th, dir->i_sb, jbegin_count) ;
707 if (err)
708 retval = err;
709 iput (inode);
710 goto out_failed;
711 }
712
713 d_instantiate(dentry, inode);
714 retval = journal_end(&th, dir->i_sb, jbegin_count) ;
715
716out_failed:
717 if (locked)
718 reiserfs_write_unlock_xattrs (dir->i_sb);
719 reiserfs_write_unlock(dir->i_sb);
720 return retval;
721}
722
723
724static int reiserfs_mkdir (struct inode * dir, struct dentry *dentry, int mode)
725{
726 int retval;
727 struct inode * inode;
728 struct reiserfs_transaction_handle th ;
729 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
730 int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS);
731 int locked;
732
733#ifdef DISPLACE_NEW_PACKING_LOCALITIES
734    /* set flag that a new packing locality was created and new blocks for the content of that directory are not displaced yet */
735 REISERFS_I(dir)->new_packing_locality = 1;
736#endif
737 mode = S_IFDIR | mode;
738 if (!(inode = new_inode(dir->i_sb))) {
739 return -ENOMEM ;
740 }
741 new_inode_init(inode, dir, mode);
742
743 locked = reiserfs_cache_default_acl (dir);
744
745 reiserfs_write_lock(dir->i_sb);
746 if (locked)
747 reiserfs_write_lock_xattrs (dir->i_sb);
748
749 retval = journal_begin(&th, dir->i_sb, jbegin_count) ;
750 if (retval) {
751 drop_new_inode (inode);
752 goto out_failed;
753 }
754
755
756 /* inc the link count now, so another writer doesn't overflow it while
757 ** we sleep later on.
758 */
759 INC_DIR_INODE_NLINK(dir)
760
761 retval = reiserfs_new_inode (&th, dir, mode, NULL/*symlink*/,
762 old_format_only (dir->i_sb) ?
763 EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
764 dentry, inode);
765 if (retval) {
766 dir->i_nlink-- ;
767 goto out_failed;
768 }
769
770 if (locked) {
771 reiserfs_write_unlock_xattrs (dir->i_sb);
772 locked = 0;
773 }
774
775 reiserfs_update_inode_transaction(inode) ;
776 reiserfs_update_inode_transaction(dir) ;
777
778 inode->i_op = &reiserfs_dir_inode_operations;
779 inode->i_fop = &reiserfs_dir_operations;
780
781 // note, _this_ add_entry will not update dir's stat data
782 retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len,
783 inode, 1/*visible*/);
784 if (retval) {
785 int err;
786 inode->i_nlink = 0;
787 DEC_DIR_INODE_NLINK(dir);
788 reiserfs_update_sd (&th, inode);
789 err = journal_end(&th, dir->i_sb, jbegin_count) ;
790 if (err)
791 retval = err;
792 iput (inode);
793 goto out_failed;
794 }
795
796 // the above add_entry did not update dir's stat data
797 reiserfs_update_sd (&th, dir);
798
799 d_instantiate(dentry, inode);
800 retval = journal_end(&th, dir->i_sb, jbegin_count) ;
801out_failed:
802 if (locked)
803 reiserfs_write_unlock_xattrs (dir->i_sb);
804 reiserfs_write_unlock(dir->i_sb);
805 return retval;
806}
807
808static inline int reiserfs_empty_dir(struct inode *inode) {
809 /* we can cheat because an old format dir cannot have
810 ** EMPTY_DIR_SIZE, and a new format dir cannot have
811 ** EMPTY_DIR_SIZE_V1. So, if the inode is either size,
812 ** regardless of disk format version, the directory is empty.
813 */
814 if (inode->i_size != EMPTY_DIR_SIZE &&
815 inode->i_size != EMPTY_DIR_SIZE_V1) {
816 return 0 ;
817 }
818 return 1 ;
819}
820
821static int reiserfs_rmdir (struct inode * dir, struct dentry *dentry)
822{
823 int retval, err;
824 struct inode * inode;
825 struct reiserfs_transaction_handle th ;
826 int jbegin_count;
827 INITIALIZE_PATH (path);
828 struct reiserfs_dir_entry de;
829
830
831    /* we will be doing 2 balancings and updating 2 stat data items; we change the quotas
832     * of the owner of the directory and of the owner of the parent directory */
833 jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS);
834
835 reiserfs_write_lock(dir->i_sb);
836 retval = journal_begin(&th, dir->i_sb, jbegin_count) ;
837 if (retval)
838 goto out_rmdir;
839
840 de.de_gen_number_bit_string = NULL;
841 if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) {
842 retval = -ENOENT;
843 goto end_rmdir;
844 } else if ( retval == IO_ERROR) {
845 retval = -EIO;
846 goto end_rmdir;
847 }
848
849 inode = dentry->d_inode;
850
851 reiserfs_update_inode_transaction(inode) ;
852 reiserfs_update_inode_transaction(dir) ;
853
854 if (de.de_objectid != inode->i_ino) {
855 // FIXME: compare key of an object and a key found in the
856 // entry
857 retval = -EIO;
858 goto end_rmdir;
859 }
860 if (!reiserfs_empty_dir(inode)) {
861 retval = -ENOTEMPTY;
862 goto end_rmdir;
863 }
864
865 /* cut entry from dir directory */
866 retval = reiserfs_cut_from_item (&th, &path, &(de.de_entry_key), dir,
867 NULL, /* page */
868 0/*new file size - not used here*/);
869 if (retval < 0)
870 goto end_rmdir;
871
872 if ( inode->i_nlink != 2 && inode->i_nlink != 1 )
873 reiserfs_warning (inode->i_sb, "%s: empty directory has nlink "
874 "!= 2 (%d)", __FUNCTION__, inode->i_nlink);
875
876 inode->i_nlink = 0;
877 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
878 reiserfs_update_sd (&th, inode);
879
880 DEC_DIR_INODE_NLINK(dir)
881 dir->i_size -= (DEH_SIZE + de.de_entrylen);
882 reiserfs_update_sd (&th, dir);
883
884 /* prevent empty directory from getting lost */
885 add_save_link (&th, inode, 0/* not truncate */);
886
887 retval = journal_end(&th, dir->i_sb, jbegin_count) ;
888 reiserfs_check_path(&path) ;
889out_rmdir:
890 reiserfs_write_unlock(dir->i_sb);
891 return retval;
892
893 end_rmdir:
894    /* we must release the path, because either we did not call
895       reiserfs_cut_from_item, or reiserfs_cut_from_item did not
896       release the path because the operation was not complete */
897 pathrelse (&path);
898 err = journal_end(&th, dir->i_sb, jbegin_count) ;
899 reiserfs_write_unlock(dir->i_sb);
900 return err ? err : retval;
901}
902
903static int reiserfs_unlink (struct inode * dir, struct dentry *dentry)
904{
905 int retval, err;
906 struct inode * inode;
907 struct reiserfs_dir_entry de;
908 INITIALIZE_PATH (path);
909 struct reiserfs_transaction_handle th ;
910 int jbegin_count;
911 unsigned long savelink;
912
913 inode = dentry->d_inode;
914
915    /* in this transaction we can be doing at most two balancings and updating
916       two stat data items; we change the quotas of the owner of the directory
917       and of the owner of the parent directory */
918 jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS);
919
920 reiserfs_write_lock(dir->i_sb);
921 retval = journal_begin(&th, dir->i_sb, jbegin_count) ;
922 if (retval)
923 goto out_unlink;
924
925 de.de_gen_number_bit_string = NULL;
926 if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) {
927 retval = -ENOENT;
928 goto end_unlink;
929 } else if (retval == IO_ERROR) {
930 retval = -EIO;
931 goto end_unlink;
932 }
933
934 reiserfs_update_inode_transaction(inode) ;
935 reiserfs_update_inode_transaction(dir) ;
936
937 if (de.de_objectid != inode->i_ino) {
938 // FIXME: compare key of an object and a key found in the
939 // entry
940 retval = -EIO;
941 goto end_unlink;
942 }
943
944 if (!inode->i_nlink) {
945 reiserfs_warning (inode->i_sb, "%s: deleting nonexistent file "
946 "(%s:%lu), %d", __FUNCTION__,
947 reiserfs_bdevname (inode->i_sb), inode->i_ino,
948 inode->i_nlink);
949 inode->i_nlink = 1;
950 }
951
952 inode->i_nlink--;
953
954    /*
955     * we may schedule before doing the add_save_link call, so save the
956     * link count to avoid racing
957     */
958 savelink = inode->i_nlink;
959
960
961 retval = reiserfs_cut_from_item (&th, &path, &(de.de_entry_key), dir, NULL, 0);
962 if (retval < 0) {
963 inode->i_nlink++;
964 goto end_unlink;
965 }
966 inode->i_ctime = CURRENT_TIME_SEC;
967 reiserfs_update_sd (&th, inode);
968
969 dir->i_size -= (de.de_entrylen + DEH_SIZE);
970 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
971 reiserfs_update_sd (&th, dir);
972
973 if (!savelink)
974 /* prevent file from getting lost */
975 add_save_link (&th, inode, 0/* not truncate */);
976
977 retval = journal_end(&th, dir->i_sb, jbegin_count) ;
978 reiserfs_check_path(&path) ;
979 reiserfs_write_unlock(dir->i_sb);
980 return retval;
981
982 end_unlink:
983 pathrelse (&path);
984 err = journal_end(&th, dir->i_sb, jbegin_count) ;
985 reiserfs_check_path(&path) ;
986 if (err)
987 retval = err;
988out_unlink:
989 reiserfs_write_unlock(dir->i_sb);
990 return retval;
991}
992
993static int reiserfs_symlink (struct inode * parent_dir,
994 struct dentry * dentry, const char * symname)
995{
996 int retval;
997 struct inode * inode;
998 char * name;
999 int item_len;
1000 struct reiserfs_transaction_handle th ;
1001 int mode = S_IFLNK | S_IRWXUGO;
1002 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
1003 int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS);
1004
1005 if (!(inode = new_inode(parent_dir->i_sb))) {
1006 return -ENOMEM ;
1007 }
1008 new_inode_init(inode, parent_dir, mode);
1009
1010 reiserfs_write_lock(parent_dir->i_sb);
1011 item_len = ROUND_UP (strlen (symname));
1012 if (item_len > MAX_DIRECT_ITEM_LEN (parent_dir->i_sb->s_blocksize)) {
1013 retval = -ENAMETOOLONG;
1014 drop_new_inode(inode);
1015 goto out_failed;
1016 }
1017
1018 name = reiserfs_kmalloc (item_len, GFP_NOFS, parent_dir->i_sb);
1019 if (!name) {
1020 drop_new_inode(inode);
1021 retval = -ENOMEM;
1022 goto out_failed;
1023 }
1024 memcpy (name, symname, strlen (symname));
1025 padd_item (name, item_len, strlen (symname));
1026
1027 /* We would inherit the default ACL here, but symlinks don't get ACLs */
1028
1029 retval = journal_begin(&th, parent_dir->i_sb, jbegin_count) ;
1030 if (retval) {
1031 drop_new_inode (inode);
1032 reiserfs_kfree (name, item_len, parent_dir->i_sb);
1033 goto out_failed;
1034 }
1035
1036 retval = reiserfs_new_inode (&th, parent_dir, mode, name, strlen (symname),
1037 dentry, inode);
1038 reiserfs_kfree (name, item_len, parent_dir->i_sb);
1039 if (retval) { /* reiserfs_new_inode iputs for us */
1040 goto out_failed;
1041 }
1042
1043 reiserfs_update_inode_transaction(inode) ;
1044 reiserfs_update_inode_transaction(parent_dir) ;
1045
1046 inode->i_op = &reiserfs_symlink_inode_operations;
1047 inode->i_mapping->a_ops = &reiserfs_address_space_operations;
1048
1049 // must be sure this inode is written with this transaction
1050 //
1051 //reiserfs_update_sd (&th, inode, READ_BLOCKS);
1052
1053 retval = reiserfs_add_entry (&th, parent_dir, dentry->d_name.name,
1054 dentry->d_name.len, inode, 1/*visible*/);
1055 if (retval) {
1056 int err;
1057 inode->i_nlink--;
1058 reiserfs_update_sd (&th, inode);
1059 err = journal_end(&th, parent_dir->i_sb, jbegin_count) ;
1060 if (err)
1061 retval = err;
1062 iput (inode);
1063 goto out_failed;
1064 }
1065
1066 d_instantiate(dentry, inode);
1067 retval = journal_end(&th, parent_dir->i_sb, jbegin_count) ;
1068out_failed:
1069 reiserfs_write_unlock(parent_dir->i_sb);
1070 return retval;
1071}
1072
1073static int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct dentry * dentry)
1074{
1075 int retval;
1076 struct inode *inode = old_dentry->d_inode;
1077 struct reiserfs_transaction_handle th ;
1078 /* We need blocks for transaction + update of quotas for the owners of the directory */
1079 int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS;
1080
1081 reiserfs_write_lock(dir->i_sb);
1082 if (inode->i_nlink >= REISERFS_LINK_MAX) {
1083 //FIXME: sd_nlink is 32 bit for new files
1084 reiserfs_write_unlock(dir->i_sb);
1085 return -EMLINK;
1086 }
1087 if (inode->i_nlink == 0) {
1088 reiserfs_write_unlock(dir->i_sb);
1089 return -ENOENT;
1090 }
1091
1092 /* inc before scheduling so reiserfs_unlink knows we are here */
1093 inode->i_nlink++;
1094
1095 retval = journal_begin(&th, dir->i_sb, jbegin_count) ;
1096 if (retval) {
1097 inode->i_nlink--;
1098 reiserfs_write_unlock (dir->i_sb);
1099 return retval;
1100 }
1101
1102 /* create new entry */
1103 retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len,
1104 inode, 1/*visible*/);
1105
1106 reiserfs_update_inode_transaction(inode) ;
1107 reiserfs_update_inode_transaction(dir) ;
1108
1109 if (retval) {
1110 int err;
1111 inode->i_nlink--;
1112 err = journal_end(&th, dir->i_sb, jbegin_count) ;
1113 reiserfs_write_unlock(dir->i_sb);
1114 return err ? err : retval;
1115 }
1116
1117 inode->i_ctime = CURRENT_TIME_SEC;
1118 reiserfs_update_sd (&th, inode);
1119
1120 atomic_inc(&inode->i_count) ;
1121 d_instantiate(dentry, inode);
1122 retval = journal_end(&th, dir->i_sb, jbegin_count) ;
1123 reiserfs_write_unlock(dir->i_sb);
1124 return retval;
1125}
1126
1127
1128// de contains information pointing to an entry which
1129static int de_still_valid (const char * name, int len, struct reiserfs_dir_entry * de)
1130{
1131 struct reiserfs_dir_entry tmp = *de;
1132
1133 // recalculate pointer to name and name length
1134 set_de_name_and_namelen (&tmp);
1135 // FIXME: could check more
1136 if (tmp.de_namelen != len || memcmp (name, de->de_name, len))
1137 return 0;
1138 return 1;
1139}
1140
1141
1142static int entry_points_to_object (const char * name, int len, struct reiserfs_dir_entry * de, struct inode * inode)
1143{
1144 if (!de_still_valid (name, len, de))
1145 return 0;
1146
1147 if (inode) {
1148 if (!de_visible (de->de_deh + de->de_entry_num))
1149 reiserfs_panic (NULL, "vs-7042: entry_points_to_object: entry must be visible");
1150 return (de->de_objectid == inode->i_ino) ? 1 : 0;
1151 }
1152
1153 /* this must be a newly added hidden entry */
1154 if (de_visible (de->de_deh + de->de_entry_num))
1155 reiserfs_panic (NULL, "vs-7043: entry_points_to_object: entry must be visible");
1156
1157 return 1;
1158}
1159
1160
1161/* sets the directory entry to point at the object given by key */
1162static void set_ino_in_dir_entry (struct reiserfs_dir_entry * de, struct reiserfs_key * key)
1163{
1164 /* JDM These operations are endian safe - both are le */
1165 de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id;
1166 de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid;
1167}
1168
1169
1170/*
1171 * a process that is going to call fix_nodes/do_balance must hold only
1172 * one path. If it holds two or more, it can get into endless waiting in
1173 * get_empty_nodes or its clones
1174 */
1175static int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry,
1176 struct inode * new_dir, struct dentry *new_dentry)
1177{
1178 int retval;
1179 INITIALIZE_PATH (old_entry_path);
1180 INITIALIZE_PATH (new_entry_path);
1181 INITIALIZE_PATH (dot_dot_entry_path);
1182 struct item_head new_entry_ih, old_entry_ih, dot_dot_ih ;
1183 struct reiserfs_dir_entry old_de, new_de, dot_dot_de;
1184 struct inode * old_inode, * new_dentry_inode;
1185 struct reiserfs_transaction_handle th ;
1186 int jbegin_count ;
1187 umode_t old_inode_mode;
1188 unsigned long savelink = 1;
1189 struct timespec ctime;
1190
1191 /* three balancings: (1) old name removal, (2) new name insertion
1192 and (3) maybe "save" link insertion
1193 stat data updates: (1) old directory,
1194 (2) new directory and (3) maybe old object stat data (when it is
1195 directory) and (4) maybe stat data of object to which new entry
1196 pointed initially and (5) maybe block containing ".." of
1197 renamed directory
1198 quota updates: two parent directories */
1199 jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS;
1200
1201 old_inode = old_dentry->d_inode;
1202 new_dentry_inode = new_dentry->d_inode;
1203
1204 // make sure that oldname still exists and points to the object we
1205 // are going to rename
1206 old_de.de_gen_number_bit_string = NULL;
1207 reiserfs_write_lock(old_dir->i_sb);
1208 retval = reiserfs_find_entry (old_dir, old_dentry->d_name.name, old_dentry->d_name.len,
1209 &old_entry_path, &old_de);
1210 pathrelse (&old_entry_path);
1211 if (retval == IO_ERROR) {
1212 reiserfs_write_unlock(old_dir->i_sb);
1213 return -EIO;
1214 }
1215
1216 if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) {
1217 reiserfs_write_unlock(old_dir->i_sb);
1218 return -ENOENT;
1219 }
1220
1221 old_inode_mode = old_inode->i_mode;
1222 if (S_ISDIR(old_inode_mode)) {
1223 // make sure that the directory being renamed has a correct ".."
1224 // and that its new parent directory does not already have too
1225 // many links
1226
1227 if (new_dentry_inode) {
1228 if (!reiserfs_empty_dir(new_dentry_inode)) {
1229 reiserfs_write_unlock(old_dir->i_sb);
1230 return -ENOTEMPTY;
1231 }
1232 }
1233
1234 /* directory is renamed, its parent directory will be changed,
1235 ** so find ".." entry
1236 */
1237 dot_dot_de.de_gen_number_bit_string = NULL;
1238 retval = reiserfs_find_entry (old_inode, "..", 2, &dot_dot_entry_path, &dot_dot_de);
1239 pathrelse (&dot_dot_entry_path);
1240 if (retval != NAME_FOUND) {
1241 reiserfs_write_unlock(old_dir->i_sb);
1242 return -EIO;
1243 }
1244
1245 /* inode number of .. must equal old_dir->i_ino */
1246 if (dot_dot_de.de_objectid != old_dir->i_ino) {
1247 reiserfs_write_unlock(old_dir->i_sb);
1248 return -EIO;
1249 }
1250 }
1251
1252 retval = journal_begin(&th, old_dir->i_sb, jbegin_count) ;
1253 if (retval) {
1254 reiserfs_write_unlock (old_dir->i_sb);
1255 return retval;
1256 }
1257
1258 /* add new entry (or find the existing one) */
1259 retval = reiserfs_add_entry (&th, new_dir, new_dentry->d_name.name, new_dentry->d_name.len,
1260 old_inode, 0);
1261 if (retval == -EEXIST) {
1262 if (!new_dentry_inode) {
1263 reiserfs_panic (old_dir->i_sb,
1264 "vs-7050: new entry is found, new inode == 0\n");
1265 }
1266 } else if (retval) {
1267 int err = journal_end(&th, old_dir->i_sb, jbegin_count) ;
1268 reiserfs_write_unlock(old_dir->i_sb);
1269 return err ? err : retval;
1270 }
1271
1272 reiserfs_update_inode_transaction(old_dir) ;
1273 reiserfs_update_inode_transaction(new_dir) ;
1274
1275 /* this makes it so an fsync on an open fd for the old name will
1276 ** commit the rename operation
1277 */
1278 reiserfs_update_inode_transaction(old_inode) ;
1279
1280 if (new_dentry_inode)
1281 reiserfs_update_inode_transaction(new_dentry_inode) ;
1282
1283 while (1) {
1284 // look for old name using corresponding entry key (found by reiserfs_find_entry)
1285 if ((retval = search_by_entry_key (new_dir->i_sb, &old_de.de_entry_key,
1286 &old_entry_path, &old_de)) != NAME_FOUND) {
1287 pathrelse(&old_entry_path);
1288 journal_end(&th, old_dir->i_sb, jbegin_count);
1289 reiserfs_write_unlock(old_dir->i_sb);
1290 return -EIO;
1291 }
1292
1293 copy_item_head(&old_entry_ih, get_ih(&old_entry_path)) ;
1294
1295 reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1) ;
1296
1297 // look for new name by reiserfs_find_entry
1298 new_de.de_gen_number_bit_string = NULL;
1299 retval = reiserfs_find_entry (new_dir, new_dentry->d_name.name, new_dentry->d_name.len,
1300 &new_entry_path, &new_de);
1301 // reiserfs_find_entry should not return IO_ERROR here, because it is called with essentially the same
1302 // parameters as the reiserfs_add_entry call above, and we'll catch any i/o errors before we get here.
1303 if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) {
1304 pathrelse(&new_entry_path);
1305 pathrelse(&old_entry_path);
1306 journal_end(&th, old_dir->i_sb, jbegin_count);
1307 reiserfs_write_unlock(old_dir->i_sb);
1308 return -EIO;
1309 }
1310
1311 copy_item_head(&new_entry_ih, get_ih(&new_entry_path)) ;
1312
1313 reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1) ;
1314
1315 if (S_ISDIR(old_inode->i_mode)) {
1316 if ((retval = search_by_entry_key (new_dir->i_sb, &dot_dot_de.de_entry_key,
1317 &dot_dot_entry_path, &dot_dot_de)) != NAME_FOUND) {
1318 pathrelse(&dot_dot_entry_path);
1319 pathrelse(&new_entry_path);
1320 pathrelse(&old_entry_path);
1321 journal_end(&th, old_dir->i_sb, jbegin_count);
1322 reiserfs_write_unlock(old_dir->i_sb);
1323 return -EIO;
1324 }
1325 copy_item_head(&dot_dot_ih, get_ih(&dot_dot_entry_path)) ;
1326 // node containing ".." gets into transaction
1327 reiserfs_prepare_for_journal(old_inode->i_sb, dot_dot_de.de_bh, 1) ;
1328 }
1329 /* we should check seals here, not do
1330 this stuff, yes? Then, having
1331 gathered everything into RAM we
1332 should lock the buffers, yes? -Hans */
1333 /* probably. our rename needs to hold more
1334 ** than one path at once. The seals would
1335 ** have to be written to deal with multi-path
1336 ** issues -chris
1337 */
1338 /* sanity checking before doing the rename - avoid races; many
1339 ** of the above checks could have scheduled. We have to be
1340 ** sure our items haven't been shifted by another process.
1341 */
1342 if (item_moved(&new_entry_ih, &new_entry_path) ||
1343 !entry_points_to_object(new_dentry->d_name.name,
1344 new_dentry->d_name.len,
1345 &new_de, new_dentry_inode) ||
1346 item_moved(&old_entry_ih, &old_entry_path) ||
1347 !entry_points_to_object (old_dentry->d_name.name,
1348 old_dentry->d_name.len,
1349 &old_de, old_inode)) {
1350 reiserfs_restore_prepared_buffer (old_inode->i_sb, new_de.de_bh);
1351 reiserfs_restore_prepared_buffer (old_inode->i_sb, old_de.de_bh);
1352 if (S_ISDIR(old_inode_mode))
1353 reiserfs_restore_prepared_buffer (old_inode->i_sb, dot_dot_de.de_bh);
1354 continue;
1355 }
1356 if (S_ISDIR(old_inode_mode)) {
1357 if ( item_moved(&dot_dot_ih, &dot_dot_entry_path) ||
1358 !entry_points_to_object ( "..", 2, &dot_dot_de, old_dir) ) {
1359 reiserfs_restore_prepared_buffer (old_inode->i_sb, old_de.de_bh);
1360 reiserfs_restore_prepared_buffer (old_inode->i_sb, new_de.de_bh);
1361 reiserfs_restore_prepared_buffer (old_inode->i_sb, dot_dot_de.de_bh);
1362 continue;
1363 }
1364 }
1365
1366 RFALSE( S_ISDIR(old_inode_mode) &&
1367 !buffer_journal_prepared(dot_dot_de.de_bh), "" );
1368
1369 break;
1370 }
1371
1372 /* ok, all the changes can be done in one fell swoop when we
1373 have claimed all the buffers needed.*/
1374
1375 mark_de_visible (new_de.de_deh + new_de.de_entry_num);
1376 set_ino_in_dir_entry (&new_de, INODE_PKEY (old_inode));
1377 journal_mark_dirty (&th, old_dir->i_sb, new_de.de_bh);
1378
1379 mark_de_hidden (old_de.de_deh + old_de.de_entry_num);
1380 journal_mark_dirty (&th, old_dir->i_sb, old_de.de_bh);
1381 ctime = CURRENT_TIME_SEC;
1382 old_dir->i_ctime = old_dir->i_mtime = ctime;
1383 new_dir->i_ctime = new_dir->i_mtime = ctime;
1384 /* thanks to Alex Adriaanse <alex_a@caltech.edu> for the patch which
1385 adds ctime update of the renamed object */
1386 old_inode->i_ctime = ctime;
1387
1388 if (new_dentry_inode) {
1389 // adjust link number of the victim
1390 if (S_ISDIR(new_dentry_inode->i_mode)) {
1391 new_dentry_inode->i_nlink = 0;
1392 } else {
1393 new_dentry_inode->i_nlink--;
1394 }
1395 new_dentry_inode->i_ctime = ctime;
1396 savelink = new_dentry_inode->i_nlink;
1397 }
1398
1399 if (S_ISDIR(old_inode_mode)) {
1400 // adjust ".." of renamed directory
1401 set_ino_in_dir_entry (&dot_dot_de, INODE_PKEY (new_dir));
1402 journal_mark_dirty (&th, new_dir->i_sb, dot_dot_de.de_bh);
1403
1404 if (!new_dentry_inode)
1405 /* there (in new_dir) was no directory, so it got a new link
1406 (".." of renamed directory) */
1407 INC_DIR_INODE_NLINK(new_dir);
1408
1409 /* old directory lost one link - ".." of renamed directory */
1410 DEC_DIR_INODE_NLINK(old_dir);
1411 }
1412
1413 // looks like in 2.3.99pre3 brelse is atomic, so we can use pathrelse
1414 pathrelse (&new_entry_path);
1415 pathrelse (&dot_dot_entry_path);
1416
1417 // FIXME: ignoring reiserfs_cut_from_item's return value here may
1418 // mislead callers, but it will panic if it is unable to find the
1419 // entry. This needs one more cleanup
1420 if (reiserfs_cut_from_item (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL, 0) < 0)
1421 reiserfs_warning (old_dir->i_sb, "vs-7060: reiserfs_rename: could not cut old name. Fsck later?");
1422
1423 old_dir->i_size -= DEH_SIZE + old_de.de_entrylen;
1424
1425 reiserfs_update_sd (&th, old_dir);
1426 reiserfs_update_sd (&th, new_dir);
1427 reiserfs_update_sd (&th, old_inode);
1428
1429 if (new_dentry_inode) {
1430 if (savelink == 0)
1431 add_save_link (&th, new_dentry_inode, 0/* not truncate */);
1432 reiserfs_update_sd (&th, new_dentry_inode);
1433 }
1434
1435 retval = journal_end(&th, old_dir->i_sb, jbegin_count) ;
1436 reiserfs_write_unlock(old_dir->i_sb);
1437 return retval;
1438}
1439
1440/*
1441 * directories can handle most operations...
1442 */
1443struct inode_operations reiserfs_dir_inode_operations = {
1444 //&reiserfs_dir_operations, /* default_file_ops */
1445 .create = reiserfs_create,
1446 .lookup = reiserfs_lookup,
1447 .link = reiserfs_link,
1448 .unlink = reiserfs_unlink,
1449 .symlink = reiserfs_symlink,
1450 .mkdir = reiserfs_mkdir,
1451 .rmdir = reiserfs_rmdir,
1452 .mknod = reiserfs_mknod,
1453 .rename = reiserfs_rename,
1454 .setattr = reiserfs_setattr,
1455 .setxattr = reiserfs_setxattr,
1456 .getxattr = reiserfs_getxattr,
1457 .listxattr = reiserfs_listxattr,
1458 .removexattr = reiserfs_removexattr,
1459 .permission = reiserfs_permission,
1460};
1461
1462/*
1463 * symlink operations.. same as page_symlink_inode_operations, with xattr
1464 * stuff added
1465 */
1466struct inode_operations reiserfs_symlink_inode_operations = {
1467 .readlink = generic_readlink,
1468 .follow_link = page_follow_link_light,
1469 .put_link = page_put_link,
1470 .setattr = reiserfs_setattr,
1471 .setxattr = reiserfs_setxattr,
1472 .getxattr = reiserfs_getxattr,
1473 .listxattr = reiserfs_listxattr,
1474 .removexattr = reiserfs_removexattr,
1475 .permission = reiserfs_permission,
1476
1477};
1478
1479
1480/*
1481 * special file operations.. just xattr/acl stuff
1482 */
1483struct inode_operations reiserfs_special_inode_operations = {
1484 .setattr = reiserfs_setattr,
1485 .setxattr = reiserfs_setxattr,
1486 .getxattr = reiserfs_getxattr,
1487 .listxattr = reiserfs_listxattr,
1488 .removexattr = reiserfs_removexattr,
1489 .permission = reiserfs_permission,
1490
1491};
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
new file mode 100644
index 000000000000..0785c43a7486
--- /dev/null
+++ b/fs/reiserfs/objectid.c
@@ -0,0 +1,206 @@
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5#include <linux/config.h>
6#include <linux/string.h>
7#include <linux/random.h>
8#include <linux/time.h>
9#include <linux/reiserfs_fs.h>
10#include <linux/reiserfs_fs_sb.h>
11
12// find where objectid map starts
13#define objectid_map(s,rs) (old_format_only (s) ? \
14 (__u32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\
15 (__u32 *)((rs) + 1))
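/* i.e. the map is kept in the super block's own disk block,
   immediately following the v1 or v2 super block structure,
   whichever the on-disk format uses */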
16
17
18#ifdef CONFIG_REISERFS_CHECK
19
20static void check_objectid_map (struct super_block * s, __u32 * map)
21{
22 if (le32_to_cpu (map[0]) != 1)
23 reiserfs_panic (s, "vs-15010: check_objectid_map: map corrupted: %lx",
24 ( long unsigned int ) le32_to_cpu (map[0]));
25
26 // FIXME: add something else here
27}
28
29#else
30static void check_objectid_map (struct super_block * s, __u32 * map)
31{;}
32#endif
33
34
35/* When we allocate objectids we allocate the first unused objectid.
36 Each sequence of objectids in use (the odd sequences) is followed
37 by a sequence of objectids not in use (the even sequences). We
38 only need to record the last objectid in each of these sequences
39 (both the odd and even sequences) in order to fully define the
40 boundaries of the sequences. A consequence of allocating the first
41 objectid not in use is that under most conditions this scheme is
42 extremely compact. The exception is immediately after a sequence
43 of operations which deletes a large number of objects of
44 non-sequential objectids, and even then it will become compact
45 again as soon as more objects are created. Note that many
46 interesting optimizations of layout could result from complicating
47 objectid assignment, but we have deferred making them for now. */
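/* An illustrative reading of this encoding (a sketch, not part of the
   kernel build): even-indexed map entries begin runs of used
   objectids and odd-indexed entries begin runs of free ones, so a map
   of {1, 5, 7, 10} means 1..4 used, 5..6 free, 7..9 used, 10.. free.
   A hypothetical user-space model of the lookup, ignoring the on-disk
   little-endian representation: */
#if 0
static int oid_is_used (const __u32 * map, int cursize, __u32 oid)
{
    int i;

    /* interval [map[i], map[i + 1]) is used when i is even and free
       when i is odd; the tail starting at map[cursize - 1] is free,
       since cursize is kept even */
    for (i = 0; i < cursize - 1; i ++)
	if (oid >= map[i] && oid < map[i + 1])
	    return !(i & 1);
    return 0;
}
#endif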
48
49
50/* get unique object identifier */
51__u32 reiserfs_get_unused_objectid (struct reiserfs_transaction_handle *th)
52{
53 struct super_block * s = th->t_super;
54 struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s);
55 __u32 * map = objectid_map (s, rs);
56 __u32 unused_objectid;
57
58 BUG_ON (!th->t_trans_id);
59
60 check_objectid_map (s, map);
61
62 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
63 /* comment needed -Hans */
64 unused_objectid = le32_to_cpu (map[1]);
65 if (unused_objectid == U32_MAX) {
66 reiserfs_warning (s, "%s: no more object ids", __FUNCTION__);
67 reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s)) ;
68 return 0;
69 }
70
71 /* This incrementation allocates the first unused objectid. That
72 is to say, map[1] holds the first unused objectid, and by
73 incrementing it we use it. See below
74 where we check to see if we eliminated a sequence of unused
75 objectids.... */
76 map[1] = cpu_to_le32 (unused_objectid + 1);
77
78 /* Now we check to see if we eliminated the last remaining member of
79 the first even sequence (and can eliminate the sequence by
80 eliminating its last objectid from oids), and can collapse the
81 first two odd sequences into one sequence. If so, then the net
82 result is to eliminate a pair of objectids from oids. We do this
83 by shifting the entire map to the left. */
84 if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) {
85 memmove (map + 1, map + 3, (sb_oid_cursize(rs) - 3) * sizeof(__u32));
86 set_sb_oid_cursize( rs, sb_oid_cursize(rs) - 2 );
87 }
88
89 journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s));
90 return unused_objectid;
91}
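/* Example (illustrative): with map = {1, 5, 7, 10} the first call
   returns 5 and leaves {1, 6, 7, 10}; the next call returns 6, and
   since map[1] becomes 7 == map[2], the emptied free run collapses,
   leaving {1, 10} with cursize reduced from 4 to 2. */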
92
93
94/* makes object identifier unused */
95void reiserfs_release_objectid (struct reiserfs_transaction_handle *th,
96 __u32 objectid_to_release)
97{
98 struct super_block * s = th->t_super;
99 struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s);
100 __u32 * map = objectid_map (s, rs);
101 int i = 0;
102
103 BUG_ON (!th->t_trans_id);
104 //return;
105 check_objectid_map (s, map);
106
107 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
108 journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s));
109
110 /* start at the beginning of the objectid map (i = 0) and go to
111 the end of it (i = disk_sb->s_oid_cursize). Linear search is
112 what we use, though it is possible that binary search would be
113 more efficient after performing lots of deletions (which is
114 when oids is large.) We only check even i's. */
115 while (i < sb_oid_cursize(rs)) {
116 if (objectid_to_release == le32_to_cpu (map[i])) {
117 /* This incrementation frees the objectid. */
118 //map[i]++;
119 map[i] = cpu_to_le32 (le32_to_cpu (map[i]) + 1);
120
121 /* Did we unallocate the last member of an odd sequence, and can shrink oids? */
122 if (map[i] == map[i+1]) {
123 /* shrink objectid map */
124 memmove (map + i, map + i + 2,
125 (sb_oid_cursize(rs) - i - 2) * sizeof (__u32));
126 //disk_sb->s_oid_cursize -= 2;
127 set_sb_oid_cursize( rs, sb_oid_cursize(rs) - 2 );
128
129 RFALSE( sb_oid_cursize(rs) < 2 ||
130 sb_oid_cursize(rs) > sb_oid_maxsize(rs),
131 "vs-15005: objectid map corrupted cur_size == %d (max == %d)",
132 sb_oid_cursize(rs), sb_oid_maxsize(rs));
133 }
134 return;
135 }
136
137 if (objectid_to_release > le32_to_cpu (map[i]) &&
138 objectid_to_release < le32_to_cpu (map[i + 1])) {
139 /* size of objectid map is not changed */
140 if (objectid_to_release + 1 == le32_to_cpu (map[i + 1])) {
141 //objectid_map[i+1]--;
142 map[i + 1] = cpu_to_le32 (le32_to_cpu (map[i + 1]) - 1);
143 return;
144 }
145
146 /* JDM comparing two little-endian values for equality -- safe */
147 if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) {
148 /* objectid map must be expanded, but there is no space */
149 PROC_INFO_INC( s, leaked_oid );
150 return;
151 }
152
153 /* expand the objectid map*/
154 memmove (map + i + 3, map + i + 1,
155 (sb_oid_cursize(rs) - i - 1) * sizeof(__u32));
156 map[i + 1] = cpu_to_le32 (objectid_to_release);
157 map[i + 2] = cpu_to_le32 (objectid_to_release + 1);
158 set_sb_oid_cursize( rs, sb_oid_cursize(rs) + 2 );
159 return;
160 }
161 i += 2;
162 }
163
164 reiserfs_warning (s, "vs-15011: reiserfs_release_objectid: tried to free free object id (%lu)",
165 ( long unsigned ) objectid_to_release);
166}
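/* Example (illustrative) of the cases above, starting from
   map = {1, 5, 7, 10}, cursize 4:
     releasing 7, the first member of the used run [7,10), bumps
	map[2], giving {1, 5, 8, 10};
     releasing 9, the last member of [8,10), decrements map[3],
	giving {1, 5, 8, 9};
     releasing 8 then empties that run and shrinks the map to {1, 5},
	cursize 2;
     releasing 8 from the middle of [7,10) instead splits the run,
	expanding the map to {1, 5, 7, 8, 9, 10}, cursize 6. */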
167
168
169int reiserfs_convert_objectid_map_v1(struct super_block *s) {
170 struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK (s);
171 int cur_size = sb_oid_cursize(disk_sb);
172 int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2 ;
173 int old_max = sb_oid_maxsize(disk_sb);
174 struct reiserfs_super_block_v1 *disk_sb_v1 ;
175 __u32 *objectid_map, *new_objectid_map ;
176 int i ;
177
178 disk_sb_v1=(struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data);
179 objectid_map = (__u32 *)(disk_sb_v1 + 1) ;
180 new_objectid_map = (__u32 *)(disk_sb + 1) ;
181
182 if (cur_size > new_size) {
183 /* mark as used everything that was listed as free at the end of the
184 ** objectid map
185 */
186 objectid_map[new_size - 1] = objectid_map[cur_size - 1] ;
187 set_sb_oid_cursize(disk_sb,new_size) ;
188 }
189 /* move the smaller objectid map past the end of the new super */
190 for (i = new_size - 1 ; i >= 0 ; i--) {
191 objectid_map[i + (old_max - new_size)] = objectid_map[i] ;
192 }
193
194
195 /* set the max size so we don't overflow later */
196 set_sb_oid_maxsize(disk_sb,new_size) ;
197
198 /* Zero out label and generate random UUID */
199 memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label)) ;
200 generate_random_uuid(disk_sb->s_uuid);
201
202 /* finally, zero out the unused chunk of the new super */
203 memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused)) ;
204 return 0 ;
205}
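/* (illustrative) the v2 super block is larger than the v1 one, so the
   map is shifted up by (old_max - new_size) __u32 slots; this should
   leave its first entry just past the new super block, where
   objectid_map() expects to find it */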
206
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
new file mode 100644
index 000000000000..16fdca1d4bd7
--- /dev/null
+++ b/fs/reiserfs/prints.c
@@ -0,0 +1,727 @@
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5#include <linux/config.h>
6#include <linux/time.h>
7#include <linux/fs.h>
8#include <linux/reiserfs_fs.h>
9#include <linux/string.h>
10#include <linux/buffer_head.h>
11
12#include <stdarg.h>
13
14static char error_buf[1024];
15static char fmt_buf[1024];
16static char off_buf[80];
17
18
19static char * reiserfs_cpu_offset (struct cpu_key * key)
20{
21 if (cpu_key_k_type(key) == TYPE_DIRENTRY)
22 sprintf (off_buf, "%Lu(%Lu)",
23 (unsigned long long)GET_HASH_VALUE (cpu_key_k_offset (key)),
24 (unsigned long long)GET_GENERATION_NUMBER (cpu_key_k_offset (key)));
25 else
26 sprintf (off_buf, "0x%Lx", (unsigned long long)cpu_key_k_offset (key));
27 return off_buf;
28}
29
30
31static char * le_offset (struct reiserfs_key * key)
32{
33 int version;
34
35 version = le_key_version (key);
36 if (le_key_k_type (version, key) == TYPE_DIRENTRY)
37 sprintf (off_buf, "%Lu(%Lu)",
38 (unsigned long long)GET_HASH_VALUE (le_key_k_offset (version, key)),
39 (unsigned long long)GET_GENERATION_NUMBER (le_key_k_offset (version, key)));
40 else
41 sprintf (off_buf, "0x%Lx", (unsigned long long)le_key_k_offset (version, key));
42 return off_buf;
43}
44
45
46static char * cpu_type (struct cpu_key * key)
47{
48 if (cpu_key_k_type (key) == TYPE_STAT_DATA)
49 return "SD";
50 if (cpu_key_k_type (key) == TYPE_DIRENTRY)
51 return "DIR";
52 if (cpu_key_k_type (key) == TYPE_DIRECT)
53 return "DIRECT";
54 if (cpu_key_k_type (key) == TYPE_INDIRECT)
55 return "IND";
56 return "UNKNOWN";
57}
58
59
60static char * le_type (struct reiserfs_key * key)
61{
62 int version;
63
64 version = le_key_version (key);
65
66 if (le_key_k_type (version, key) == TYPE_STAT_DATA)
67 return "SD";
68 if (le_key_k_type (version, key) == TYPE_DIRENTRY)
69 return "DIR";
70 if (le_key_k_type (version, key) == TYPE_DIRECT)
71 return "DIRECT";
72 if (le_key_k_type (version, key) == TYPE_INDIRECT)
73 return "IND";
74 return "UNKNOWN";
75}
76
77
78/* %k */
79static void sprintf_le_key (char * buf, struct reiserfs_key * key)
80{
81 if (key)
82 sprintf (buf, "[%d %d %s %s]", le32_to_cpu (key->k_dir_id),
83 le32_to_cpu (key->k_objectid), le_offset (key), le_type (key));
84 else
85 sprintf (buf, "[NULL]");
86}
87
88
89/* %K */
90static void sprintf_cpu_key (char * buf, struct cpu_key * key)
91{
92 if (key)
93 sprintf (buf, "[%d %d %s %s]", key->on_disk_key.k_dir_id,
94 key->on_disk_key.k_objectid, reiserfs_cpu_offset (key),
95 cpu_type (key));
96 else
97 sprintf (buf, "[NULL]");
98}
99
100static void sprintf_de_head( char *buf, struct reiserfs_de_head *deh )
101{
102 if( deh )
103 sprintf( buf, "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]", deh_offset(deh), deh_dir_id(deh),
104 deh_objectid(deh), deh_location(deh), deh_state(deh) );
105 else
106 sprintf( buf, "[NULL]" );
107
108}
109
110static void sprintf_item_head (char * buf, struct item_head * ih)
111{
112 if (ih) {
113 strcpy (buf, (ih_version (ih) == KEY_FORMAT_3_6) ? "*3.6* " : "*3.5*");
114 sprintf_le_key (buf + strlen (buf), &(ih->ih_key));
115 sprintf (buf + strlen (buf), ", item_len %d, item_location %d, "
116 "free_space(entry_count) %d",
117 ih_item_len(ih), ih_location(ih), ih_free_space (ih));
118 } else
119 sprintf (buf, "[NULL]");
120}
121
122
123static void sprintf_direntry (char * buf, struct reiserfs_dir_entry * de)
124{
125 char name[20];
126
127 memcpy (name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen);
128 name [de->de_namelen > 19 ? 19 : de->de_namelen] = 0;
129 sprintf (buf, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid);
130}
131
132
133static void sprintf_block_head (char * buf, struct buffer_head * bh)
134{
135 sprintf (buf, "level=%d, nr_items=%d, free_space=%d rdkey ",
136 B_LEVEL (bh), B_NR_ITEMS (bh), B_FREE_SPACE (bh));
137}
138
139
140static void sprintf_buffer_head (char * buf, struct buffer_head * bh)
141{
142 char b[BDEVNAME_SIZE];
143
144 sprintf (buf, "dev %s, size %d, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
145 bdevname (bh->b_bdev, b), bh->b_size,
146 (unsigned long long)bh->b_blocknr,
147 atomic_read (&(bh->b_count)),
148 bh->b_state, bh->b_page,
149 buffer_uptodate (bh) ? "UPTODATE" : "!UPTODATE",
150 buffer_dirty (bh) ? "DIRTY" : "CLEAN",
151 buffer_locked (bh) ? "LOCKED" : "UNLOCKED");
152}
153
154
155static void sprintf_disk_child (char * buf, struct disk_child * dc)
156{
157 sprintf (buf, "[dc_number=%d, dc_size=%u]", dc_block_number(dc), dc_size(dc));
158}
159
160
161static char * is_there_reiserfs_struct (char * fmt, int * what, int * skip)
162{
163 char * k = fmt;
164
165 *skip = 0;
166
167 while ((k = strchr (k, '%')) != NULL)
168 {
169 if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' ||
170 k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a' ) {
171 *what = k[1];
172 break;
173 }
174 (*skip) ++;
175 k ++;
176 }
177 return k;
178}
179
180
181/* when debugging reiserfs we used to print out a lot of different
182 variables, like keys, item headers, buffer heads etc. The values of
183 most fields matter, so it took a long time just to write an
184 appropriate printk. With this reiserfs_warning you can use format
185 specifications for complex structures just as you would use
186 printf for integers, doubles and pointers. For instance, to print
187 out a key structure you write just:
188 reiserfs_warning ("bad key %k", key);
189 instead of
190 printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid,
191 key->k_offset, key->k_uniqueness);
192*/
193
194
195static void
196prepare_error_buf( const char *fmt, va_list args )
197{
198 char * fmt1 = fmt_buf;
199 char * k;
200 char * p = error_buf;
201 int i, j, what, skip;
202
203 strcpy (fmt1, fmt);
204
205 while( (k = is_there_reiserfs_struct( fmt1, &what, &skip )) != NULL )
206 {
207 *k = 0;
208
209 p += vsprintf (p, fmt1, args);
210
211 for (i = 0; i < skip; i ++)
212 j = va_arg (args, int);
213
214 switch (what) {
215 case 'k':
216 sprintf_le_key (p, va_arg(args, struct reiserfs_key *));
217 break;
218 case 'K':
219 sprintf_cpu_key (p, va_arg(args, struct cpu_key *));
220 break;
221 case 'h':
222 sprintf_item_head (p, va_arg(args, struct item_head *));
223 break;
224 case 't':
225 sprintf_direntry (p, va_arg(args, struct reiserfs_dir_entry *));
226 break;
227 case 'y':
228 sprintf_disk_child (p, va_arg(args, struct disk_child *));
229 break;
230 case 'z':
231 sprintf_block_head (p, va_arg(args, struct buffer_head *));
232 break;
233 case 'b':
234 sprintf_buffer_head (p, va_arg(args, struct buffer_head *));
235 break;
236 case 'a':
237 sprintf_de_head (p, va_arg(args, struct reiserfs_de_head *));
238 break;
239 }
240
241 p += strlen (p);
242 fmt1 = k + 2;
243 }
244 vsprintf (p, fmt1, args);
245
246}
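/* An illustrative walk-through (assumed example format string, not
 * from the sources): for prepare_error_buf ("bad key %k in %b", args),
 * is_there_reiserfs_struct finds "%k", the "bad key " prefix is
 * vsprintf'ed into error_buf, sprintf_le_key consumes the
 * struct reiserfs_key * argument, and the loop resumes with the
 * remainder " in %b", where %b is handled the same way via
 * sprintf_buffer_head. */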
247
248
249/* in addition to the usual conversion specifiers this accepts reiserfs
250 specific conversion specifiers:
251 %k to print a little endian key,
252 %K to print a cpu key,
253 %h to print an item_head,
254 %t to print a directory entry,
255 %z to print a block head (arg must be struct buffer_head *),
256 %b to print a buffer_head, %y to print a disk_child,
257 %a to print a reiserfs_de_head */
258
259#define do_reiserfs_warning(fmt)\
260{\
261 va_list args;\
262 va_start( args, fmt );\
263 prepare_error_buf( fmt, args );\
264 va_end( args );\
265}
266
267void reiserfs_warning (struct super_block *sb, const char * fmt, ...)
268{
269 do_reiserfs_warning(fmt);
270 if (sb)
271 printk (KERN_WARNING "ReiserFS: %s: warning: %s\n",
272 reiserfs_bdevname (sb), error_buf);
273 else
274 printk (KERN_WARNING "ReiserFS: warning: %s\n", error_buf);
275}
276
277/* No newline.. reiserfs_info calls can be followed by printk's */
278void reiserfs_info (struct super_block *sb, const char * fmt, ...)
279{
280 do_reiserfs_warning(fmt);
281 if (sb)
282 printk (KERN_NOTICE "ReiserFS: %s: %s",
283 reiserfs_bdevname (sb), error_buf);
284 else
285 printk (KERN_NOTICE "ReiserFS: %s", error_buf);
286}
287
288/* No newline.. reiserfs_printk calls can be followed by printk's */
289static void reiserfs_printk (const char * fmt, ...)
290{
291 do_reiserfs_warning(fmt);
292 printk (error_buf);
293}
294
295void reiserfs_debug (struct super_block *s, int level, const char * fmt, ...)
296{
297#ifdef CONFIG_REISERFS_CHECK
298 do_reiserfs_warning(fmt);
299 if (s)
300 printk (KERN_DEBUG "ReiserFS: %s: %s\n",
301 reiserfs_bdevname (s), error_buf);
302 else
303 printk (KERN_DEBUG "ReiserFS: %s\n", error_buf);
304#endif
305}
306
307/* The format:
308
309 maintainer-errorid: [function-name:] message
310
311 where errorid is unique to the maintainer and function-name is
312 optional, is recommended so that anyone can easily find the bug
313 with a simple grep for the short, easy-to-type string
314 maintainer-errorid. Don't bother with reusing errorids, there are
315 lots of numbers out there.
316
317 Example:
318
319 reiserfs_panic(
320 p_sb, "reiser-29: reiserfs_new_blocknrs: "
321 "one of search_start or rn(%d) is equal to MAX_B_NUM,"
322 "which means that we are optimizing location based on the bogus location of a temp buffer (%p).",
323 rn, bh
324 );
325
326 Regular panic()s sometimes clear the screen before the message can
327 be read, thus the need for the while loop.
328
329 Numbering scheme for panics used by Vladimir and Anatoly (Hans completely ignores this scheme, and considers it
330 pointless complexity):
331
332 panics in reiserfs_fs.h have numbers from 1000 to 1999
333 super.c 2000 to 2999
334 preserve.c (unused) 3000 to 3999
335 bitmap.c 4000 to 4999
336 stree.c 5000 to 5999
337 prints.c 6000 to 6999
338 namei.c 7000 to 7999
339 fix_nodes.c 8000 to 8999
340 dir.c 9000 to 9999
341 lbalance.c 10000 to 10999
342 ibalance.c 11000 to 11999 not ready
343 do_balan.c 12000 to 12999
344 inode.c 13000 to 13999
345 file.c 14000 to 14999
346 objectid.c 15000 - 15999
347 buffer.c 16000 - 16999
348 symlink.c 17000 - 17999
349
350 . */
351
352
353#ifdef CONFIG_REISERFS_CHECK
354extern struct tree_balance * cur_tb;
355#endif
356
357void reiserfs_panic (struct super_block * sb, const char * fmt, ...)
358{
359 do_reiserfs_warning(fmt);
360 printk (KERN_EMERG "REISERFS: panic (device %s): %s\n",
361 reiserfs_bdevname (sb), error_buf);
362 BUG ();
363
364 /* this is not actually called, but makes reiserfs_panic() "noreturn" */
365 panic ("REISERFS: panic (device %s): %s\n",
366 reiserfs_bdevname (sb), error_buf);
367}
368
369void
370reiserfs_abort (struct super_block *sb, int errno, const char *fmt, ...)
371{
372 do_reiserfs_warning (fmt);
373
374 if (reiserfs_error_panic (sb)) {
375 panic (KERN_CRIT "REISERFS: panic (device %s): %s\n",
376 reiserfs_bdevname (sb), error_buf);
377 }
378
379 if (sb->s_flags & MS_RDONLY)
380 return;
381
382 printk (KERN_CRIT "REISERFS: abort (device %s): %s\n",
383 reiserfs_bdevname (sb), error_buf);
384
385 sb->s_flags |= MS_RDONLY;
386 reiserfs_journal_abort (sb, errno);
387}
388
389/* this prints internal nodes (4 keys/items in line) (dc_number,
390 dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number,
391 dc_size)...*/
392static int print_internal (struct buffer_head * bh, int first, int last)
393{
394 struct reiserfs_key * key;
395 struct disk_child * dc;
396 int i;
397 int from, to;
398
399 if (!B_IS_KEYS_LEVEL (bh))
400 return 1;
401
402 check_internal (bh);
403
404 if (first == -1) {
405 from = 0;
406 to = B_NR_ITEMS (bh);
407 } else {
408 from = first;
409 to = last < B_NR_ITEMS (bh) ? last : B_NR_ITEMS (bh);
410 }
411
412 reiserfs_printk ("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh);
413
414 dc = B_N_CHILD (bh, from);
415 reiserfs_printk ("PTR %d: %y ", from, dc);
416
417 for (i = from, key = B_N_PDELIM_KEY (bh, from), dc ++; i < to; i ++, key ++, dc ++) {
418 reiserfs_printk ("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc);
419 if (i && i % 4 == 0)
420 printk ("\n");
421 }
422 printk ("\n");
423 return 0;
424}
425
426
427
428
429
430static int print_leaf (struct buffer_head * bh, int print_mode, int first, int last)
431{
432 struct block_head * blkh;
433 struct item_head * ih;
434 int i, nr;
435 int from, to;
436
437 if (!B_IS_ITEMS_LEVEL (bh))
438 return 1;
439
440 check_leaf (bh);
441
442 blkh = B_BLK_HEAD (bh);
443 ih = B_N_PITEM_HEAD (bh,0);
444 nr = blkh_nr_item(blkh);
445
446 printk ("\n===================================================================\n");
447 reiserfs_printk ("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh);
448
449 if (!(print_mode & PRINT_LEAF_ITEMS)) {
450 reiserfs_printk ("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n",
451 &(ih->ih_key), &((ih + nr - 1)->ih_key));
452 return 0;
453 }
454
455 if (first < 0 || first > nr - 1)
456 from = 0;
457 else
458 from = first;
459
460 if (last < 0 || last > nr )
461 to = nr;
462 else
463 to = last;
464
465 ih += from;
466 printk ("-------------------------------------------------------------------------------\n");
467 printk ("|##| type | key | ilen | free_space | version | loc |\n");
468 for (i = from; i < to; i++, ih ++) {
469 printk ("-------------------------------------------------------------------------------\n");
470 reiserfs_printk ("|%2d| %h |\n", i, ih);
471 if (print_mode & PRINT_LEAF_ITEMS)
472 op_print_item (ih, B_I_PITEM (bh, ih));
473 }
474
475 printk ("===================================================================\n");
476
477 return 0;
478}
479
480char * reiserfs_hashname(int code)
481{
482 if ( code == YURA_HASH)
483 return "rupasov";
484 if ( code == TEA_HASH)
485 return "tea";
486 if ( code == R5_HASH)
487 return "r5";
488
489 return "unknown";
490}
491
492/* return 1 if this is not a super block */
493static int print_super_block (struct buffer_head * bh)
494{
495 struct reiserfs_super_block * rs = (struct reiserfs_super_block *)(bh->b_data);
496 int skipped, data_blocks;
497 char *version;
498 char b[BDEVNAME_SIZE];
499
500 if (is_reiserfs_3_5(rs)) {
501 version = "3.5";
502 } else if (is_reiserfs_3_6(rs)) {
503 version = "3.6";
504 } else if (is_reiserfs_jr(rs)) {
505 version = ((sb_version(rs) == REISERFS_VERSION_2) ?
506 "3.6" : "3.5");
507 } else {
508 return 1;
509 }
510
511 printk ("%s\'s super block is in block %llu\n", bdevname (bh->b_bdev, b),
512 (unsigned long long)bh->b_blocknr);
513 printk ("Reiserfs version %s\n", version );
514 printk ("Block count %u\n", sb_block_count(rs));
515 printk ("Blocksize %d\n", sb_blocksize(rs));
516 printk ("Free blocks %u\n", sb_free_blocks(rs));
517 // FIXME: this would be confusing if
518 // someone stores reiserfs super block in some data block ;)
519// skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs);
520 skipped = bh->b_blocknr;
521 data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) -
522 (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) + 1 : sb_reserved_for_journal(rs)) -
523 sb_free_blocks(rs);
524 printk ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n"
525 "1 super block, %d data blocks\n",
526 skipped, sb_bmap_nr(rs), (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) :
527 sb_reserved_for_journal(rs)) , data_blocks);
528 printk ("Root block %u\n", sb_root_block(rs));
529 printk ("Journal block (first) %d\n", sb_jp_journal_1st_block(rs));
530 printk ("Journal dev %d\n", sb_jp_journal_dev(rs));
531 printk ("Journal orig size %d\n", sb_jp_journal_size(rs));
532 printk ("FS state %d\n", sb_fs_state(rs));
533 printk ("Hash function \"%s\"\n",
534 reiserfs_hashname(sb_hash_function_code(rs)));
535
536 printk ("Tree height %d\n", sb_tree_height(rs));
537 return 0;
538}
539
540static int print_desc_block (struct buffer_head * bh)
541{
542 struct reiserfs_journal_desc * desc;
543
544 if (memcmp(get_journal_desc_magic (bh), JOURNAL_DESC_MAGIC, 8))
545 return 1;
546
547 desc = (struct reiserfs_journal_desc *)(bh->b_data);
548 printk ("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)",
549 (unsigned long long)bh->b_blocknr, get_desc_trans_id (desc), get_desc_mount_id (desc),
550 get_desc_trans_len (desc));
551
552 return 0;
553}
554
555
556void print_block (struct buffer_head * bh, ...)//int print_mode, int first, int last)
557{
558 va_list args;
559 int mode, first, last;
560
561 va_start (args, bh);
562
563 if ( ! bh ) {
564 printk("print_block: buffer is NULL\n");
565 return;
566 }
567
568 mode = va_arg (args, int);
569 first = va_arg (args, int);
570 last = va_arg (args, int);
571 if (print_leaf (bh, mode, first, last))
572 if (print_internal (bh, first, last))
573 if (print_super_block (bh))
574 if (print_desc_block (bh))
575 printk ("Block %llu contains unformatted data\n", (unsigned long long)bh->b_blocknr);
576}
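/* print_block tries each interpreter in turn -- leaf, internal node,
   super block, journal descriptor -- relying on each print_* helper
   returning 1 when the block is not of its kind */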
577
578
579
580static char print_tb_buf[2048];
581
582/* this stores initial state of tree balance in the print_tb_buf */
583void store_print_tb (struct tree_balance * tb)
584{
585 int h = 0;
586 int i;
587 struct buffer_head * tbSh, * tbFh;
588
589 if (!tb)
590 return;
591
592 sprintf (print_tb_buf, "\n"
593 "BALANCING %d\n"
594 "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n"
595 "=====================================================================\n"
596 "* h * S * L * R * F * FL * FR * CFL * CFR *\n",
597 REISERFS_SB(tb->tb_sb)->s_do_balance,
598 tb->tb_mode, PATH_LAST_POSITION (tb->tb_path), tb->tb_path->pos_in_item);
599
600 for (h = 0; h < sizeof(tb->insert_size) / sizeof (tb->insert_size[0]); h ++) {
601 if (PATH_H_PATH_OFFSET (tb->tb_path, h) <= tb->tb_path->path_length &&
602 PATH_H_PATH_OFFSET (tb->tb_path, h) > ILLEGAL_PATH_ELEMENT_OFFSET) {
603 tbSh = PATH_H_PBUFFER (tb->tb_path, h);
604 tbFh = PATH_H_PPARENT (tb->tb_path, h);
605 } else {
606 tbSh = NULL;
607 tbFh = NULL;
608 }
609 sprintf (print_tb_buf + strlen (print_tb_buf),
610 "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n",
611 h,
612 (tbSh) ? (long long)(tbSh->b_blocknr):(-1LL),
613 (tbSh) ? atomic_read (&(tbSh->b_count)) : -1,
614 (tb->L[h]) ? (long long)(tb->L[h]->b_blocknr):(-1LL),
615 (tb->L[h]) ? atomic_read (&(tb->L[h]->b_count)) : -1,
616 (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr):(-1LL),
617 (tb->R[h]) ? atomic_read (&(tb->R[h]->b_count)) : -1,
618 (tbFh) ? (long long)(tbFh->b_blocknr):(-1LL),
619 (tb->FL[h]) ? (long long)(tb->FL[h]->b_blocknr):(-1LL),
620 (tb->FR[h]) ? (long long)(tb->FR[h]->b_blocknr):(-1LL),
621 (tb->CFL[h]) ? (long long)(tb->CFL[h]->b_blocknr):(-1LL),
622 (tb->CFR[h]) ? (long long)(tb->CFR[h]->b_blocknr):(-1LL));
623 }
624
625 sprintf (print_tb_buf + strlen (print_tb_buf),
626 "=====================================================================\n"
627 "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n"
628 "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n",
629 tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0],tb->rbytes, tb->blknum[0],
630 tb->s0num, tb->s1num,tb->s1bytes, tb->s2num, tb->s2bytes, tb->cur_blknum, tb->lkey[0], tb->rkey[0]);
631
632 /* this prints balance parameters for non-leaf levels */
633 h = 0;
634 do {
635 h++;
636 sprintf (print_tb_buf + strlen (print_tb_buf),
637 "* %d * %4d * %2d * * %2d * * %2d *\n",
638 h, tb->insert_size[h], tb->lnum[h], tb->rnum[h], tb->blknum[h]);
639 } while (tb->insert_size[h]);
640
641 sprintf (print_tb_buf + strlen (print_tb_buf),
642 "=====================================================================\n"
643 "FEB list: ");
644
645 /* print the FEB list (buffers in the form "bh (b_blocknr, b_count)" that will be used for new nodes) */
646 h = 0;
647 for (i = 0; i < sizeof (tb->FEB) / sizeof (tb->FEB[0]); i ++)
648 sprintf (print_tb_buf + strlen (print_tb_buf),
649 "%p (%llu %d)%s", tb->FEB[i], tb->FEB[i] ? (unsigned long long)tb->FEB[i]->b_blocknr : 0ULL,
650 tb->FEB[i] ? atomic_read (&(tb->FEB[i]->b_count)) : 0,
651 (i == sizeof (tb->FEB) / sizeof (tb->FEB[0]) - 1) ? "\n" : ", ");
652
653 sprintf (print_tb_buf + strlen (print_tb_buf),
654 "======================== the end ====================================\n");
655}
656
657void print_cur_tb (char * mes)
658{
659 printk ("%s\n%s", mes, print_tb_buf);
660}
661
662static void check_leaf_block_head (struct buffer_head * bh)
663{
664 struct block_head * blkh;
665 int nr;
666
667 blkh = B_BLK_HEAD (bh);
668 nr = blkh_nr_item(blkh);
669 if ( nr > (bh->b_size - BLKH_SIZE) / IH_SIZE)
670 reiserfs_panic (NULL, "vs-6010: check_leaf_block_head: invalid item number %z", bh);
671 if ( blkh_free_space(blkh) >
672 bh->b_size - BLKH_SIZE - IH_SIZE * nr )
673 reiserfs_panic (NULL, "vs-6020: check_leaf_block_head: invalid free space %z", bh);
674
675}
676
677static void check_internal_block_head (struct buffer_head * bh)
678{
679 struct block_head * blkh;
680
681 blkh = B_BLK_HEAD (bh);
682 if (!(B_LEVEL (bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL (bh) <= MAX_HEIGHT))
683 reiserfs_panic (NULL, "vs-6025: check_internal_block_head: invalid level %z", bh);
684
685 if (B_NR_ITEMS (bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE)
686 reiserfs_panic (NULL, "vs-6030: check_internal_block_head: invalid item number %z", bh);
687
688 if (B_FREE_SPACE (bh) !=
689 bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS (bh) - DC_SIZE * (B_NR_ITEMS (bh) + 1))
690 reiserfs_panic (NULL, "vs-6040: check_internal_block_head: invalid free space %z", bh);
691
692}
693
694
695void check_leaf (struct buffer_head * bh)
696{
697 int i;
698 struct item_head * ih;
699
700 if (!bh)
701 return;
702 check_leaf_block_head (bh);
703 for (i = 0, ih = B_N_PITEM_HEAD (bh, 0); i < B_NR_ITEMS (bh); i ++, ih ++)
704 op_check_item (ih, B_I_PITEM (bh, ih));
705}
706
707
708void check_internal (struct buffer_head * bh)
709{
710 if (!bh)
711 return;
712 check_internal_block_head (bh);
713}
714
715
716void print_statistics (struct super_block * s)
717{
718
719 /*
720 printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \
721bmap with search %d, without %d, dir2ind %d, ind2dir %d\n",
722 REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes,
723 REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search,
724 REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct);
725 */
726
727}
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
new file mode 100644
index 000000000000..f4ea81ae0e0f
--- /dev/null
+++ b/fs/reiserfs/procfs.c
@@ -0,0 +1,664 @@
1/* -*- linux-c -*- */
2
3/* fs/reiserfs/procfs.c */
4
5/*
6 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
7 */
8
9/* proc info support a la one created by Sizif@Botik.RU for PGC */
10
11/* $Id: procfs.c,v 1.1.8.2 2001/07/15 17:08:42 god Exp $ */
12
13#include <linux/config.h>
14#include <linux/module.h>
15#include <linux/time.h>
16#include <linux/seq_file.h>
17#include <asm/uaccess.h>
18#include <linux/reiserfs_fs.h>
19#include <linux/reiserfs_fs_sb.h>
20#include <linux/smp_lock.h>
21#include <linux/init.h>
22#include <linux/proc_fs.h>
23
24#if defined( REISERFS_PROC_INFO )
25
26/*
27 * LOCKING:
28 *
29 * We rely on Alexander Viro's new super-block locking.
30 *
31 */
32
33static int show_version(struct seq_file *m, struct super_block *sb)
34{
35 char *format;
36
37 if ( REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6) ) {
38 format = "3.6";
39 } else if ( REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5) ) {
40 format = "3.5";
41 } else {
42 format = "unknown";
43 }
44
45 seq_printf(m, "%s format\twith checks %s\n",
46 format,
47#if defined( CONFIG_REISERFS_CHECK )
48 "on"
49#else
50 "off"
51#endif
52 );
53 return 0;
54}
55
56int reiserfs_global_version_in_proc( char *buffer, char **start, off_t offset,
57 int count, int *eof, void *data )
58{
59 *start = buffer;
60 *eof = 1;
61 return 0;
62}
63
64#define SF( x ) ( r -> x )
65#define SFP( x ) SF( s_proc_info_data.x )
66#define SFPL( x ) SFP( x[ level ] )
67#define SFPF( x ) SFP( scan_bitmap.x )
68#define SFPJ( x ) SFP( journal.x )
69
70#define D2C( x ) le16_to_cpu( x )
71#define D4C( x ) le32_to_cpu( x )
72#define DF( x ) D2C( rs -> s_v1.x )
73#define DFL( x ) D4C( rs -> s_v1.x )
74
75#define objectid_map( s, rs ) (old_format_only (s) ? \
76 (__u32 *)((struct reiserfs_super_block_v1 *)rs + 1) : \
77 (__u32 *)(rs + 1))
78#define MAP( i ) D4C( objectid_map( sb, rs )[ i ] )
79
80#define DJF( x ) le32_to_cpu( rs -> x )
81#define DJV( x ) le32_to_cpu( s_v1 -> x )
82#define DJP( x ) le32_to_cpu( jp -> x )
83#define JF( x ) ( r -> s_journal -> x )
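/* example expansions (illustrative): SFPL( balance_at ) reads
   r -> s_proc_info_data.balance_at[ level ], and DJP( jp_journal_size )
   reads le32_to_cpu( jp -> jp_journal_size ) */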
84
85static int show_super(struct seq_file *m, struct super_block *sb)
86{
87 struct reiserfs_sb_info *r = REISERFS_SB(sb);
88
89 seq_printf(m, "state: \t%s\n"
90 "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
91 "gen. counter: \t%i\n"
92 "s_kmallocs: \t%i\n"
93 "s_disk_reads: \t%i\n"
94 "s_disk_writes: \t%i\n"
95 "s_fix_nodes: \t%i\n"
96 "s_do_balance: \t%i\n"
97 "s_unneeded_left_neighbor: \t%i\n"
98 "s_good_search_by_key_reada: \t%i\n"
99 "s_bmaps: \t%i\n"
100 "s_bmaps_without_search: \t%i\n"
101 "s_direct2indirect: \t%i\n"
102 "s_indirect2direct: \t%i\n"
103 "\n"
104 "max_hash_collisions: \t%i\n"
105
106 "breads: \t%lu\n"
107 "bread_misses: \t%lu\n"
108
109 "search_by_key: \t%lu\n"
110 "search_by_key_fs_changed: \t%lu\n"
111 "search_by_key_restarted: \t%lu\n"
112
113 "insert_item_restarted: \t%lu\n"
114 "paste_into_item_restarted: \t%lu\n"
115 "cut_from_item_restarted: \t%lu\n"
116 "delete_solid_item_restarted: \t%lu\n"
117 "delete_item_restarted: \t%lu\n"
118
119 "leaked_oid: \t%lu\n"
120 "leaves_removable: \t%lu\n",
121
122 SF( s_mount_state ) == REISERFS_VALID_FS ?
123 "REISERFS_VALID_FS" : "REISERFS_ERROR_FS",
124 reiserfs_r5_hash( sb ) ? "FORCE_R5 " : "",
125 reiserfs_rupasov_hash( sb ) ? "FORCE_RUPASOV " : "",
126 reiserfs_tea_hash( sb ) ? "FORCE_TEA " : "",
127 reiserfs_hash_detect( sb ) ? "DETECT_HASH " : "",
128 reiserfs_no_border( sb ) ? "NO_BORDER " : "BORDER ",
129 reiserfs_no_unhashed_relocation( sb ) ? "NO_UNHASHED_RELOCATION " : "",
130 reiserfs_hashed_relocation( sb ) ? "UNHASHED_RELOCATION " : "",
131 reiserfs_test4( sb ) ? "TEST4 " : "",
132 have_large_tails( sb ) ? "TAILS " : have_small_tails(sb)?"SMALL_TAILS ":"NO_TAILS ",
133 replay_only( sb ) ? "REPLAY_ONLY " : "",
134 convert_reiserfs( sb ) ? "CONV " : "",
135
136 atomic_read( &r -> s_generation_counter ),
137 SF( s_kmallocs ),
138 SF( s_disk_reads ),
139 SF( s_disk_writes ),
140 SF( s_fix_nodes ),
141 SF( s_do_balance ),
142 SF( s_unneeded_left_neighbor ),
143 SF( s_good_search_by_key_reada ),
144 SF( s_bmaps ),
145 SF( s_bmaps_without_search ),
146 SF( s_direct2indirect ),
147 SF( s_indirect2direct ),
148 SFP( max_hash_collisions ),
149 SFP( breads ),
150 SFP( bread_miss ),
151 SFP( search_by_key ),
152 SFP( search_by_key_fs_changed ),
153 SFP( search_by_key_restarted ),
154
155 SFP( insert_item_restarted ),
156 SFP( paste_into_item_restarted ),
157 SFP( cut_from_item_restarted ),
158 SFP( delete_solid_item_restarted ),
159 SFP( delete_item_restarted ),
160
161 SFP( leaked_oid ),
162 SFP( leaves_removable ) );
163
164 return 0;
165}
166
167static int show_per_level(struct seq_file *m, struct super_block *sb)
168{
169 struct reiserfs_sb_info *r = REISERFS_SB(sb);
170 int level;
171
172 seq_printf(m, "level\t"
173 " balances"
174 " [sbk: reads"
175 " fs_changed"
176 " restarted]"
177 " free space"
178 " items"
179 " can_remove"
180 " lnum"
181 " rnum"
182 " lbytes"
183 " rbytes"
184 " get_neig"
185 " get_neig_res"
186 " need_l_neig"
187 " need_r_neig"
188 "\n"
189
190 );
191
192 for( level = 0 ; level < MAX_HEIGHT ; ++ level ) {
193 seq_printf(m, "%i\t"
194 " %12lu"
195 " %12lu"
196 " %12lu"
197 " %12lu"
198 " %12lu"
199 " %12lu"
200 " %12lu"
201 " %12li"
202 " %12li"
203 " %12li"
204 " %12li"
205 " %12lu"
206 " %12lu"
207 " %12lu"
208 " %12lu"
209 "\n",
210 level,
211 SFPL( balance_at ),
212 SFPL( sbk_read_at ),
213 SFPL( sbk_fs_changed ),
214 SFPL( sbk_restarted ),
215 SFPL( free_at ),
216 SFPL( items_at ),
217 SFPL( can_node_be_removed ),
218 SFPL( lnum ),
219 SFPL( rnum ),
220 SFPL( lbytes ),
221 SFPL( rbytes ),
222 SFPL( get_neighbors ),
223 SFPL( get_neighbors_restart ),
224 SFPL( need_l_neighbor ),
225 SFPL( need_r_neighbor )
226 );
227 }
228 return 0;
229}
230
231static int show_bitmap(struct seq_file *m, struct super_block *sb)
232{
233 struct reiserfs_sb_info *r = REISERFS_SB(sb);
234
235 seq_printf(m, "free_block: %lu\n"
236 " scan_bitmap:"
237 " wait"
238 " bmap"
239 " retry"
240 " stolen"
241 " journal_hint"
242 "journal_nohint"
243 "\n"
244 " %14lu"
245 " %14lu"
246 " %14lu"
247 " %14lu"
248 " %14lu"
249 " %14lu"
250 " %14lu"
251 "\n",
252 SFP( free_block ),
253 SFPF( call ),
254 SFPF( wait ),
255 SFPF( bmap ),
256 SFPF( retry ),
257 SFPF( stolen ),
258 SFPF( in_journal_hint ),
259 SFPF( in_journal_nohint ) );
260
261 return 0;
262}
263
264static int show_on_disk_super(struct seq_file *m, struct super_block *sb)
265{
266 struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
267 struct reiserfs_super_block *rs = sb_info -> s_rs;
268 int hash_code = DFL( s_hash_function_code );
269 __u32 flags = DJF( s_flags );
270
271 seq_printf(m, "block_count: \t%i\n"
272 "free_blocks: \t%i\n"
273 "root_block: \t%i\n"
274 "blocksize: \t%i\n"
275 "oid_maxsize: \t%i\n"
276 "oid_cursize: \t%i\n"
277 "umount_state: \t%i\n"
278 "magic: \t%10.10s\n"
279 "fs_state: \t%i\n"
280 "hash: \t%s\n"
281 "tree_height: \t%i\n"
282 "bmap_nr: \t%i\n"
283 "version: \t%i\n"
284 "flags: \t%x[%s]\n"
285 "reserved_for_journal: \t%i\n",
286
287 DFL( s_block_count ),
288 DFL( s_free_blocks ),
289 DFL( s_root_block ),
290 DF( s_blocksize ),
291 DF( s_oid_maxsize ),
292 DF( s_oid_cursize ),
293 DF( s_umount_state ),
294 rs -> s_v1.s_magic,
295 DF( s_fs_state ),
296 hash_code == TEA_HASH ? "tea" :
297 ( hash_code == YURA_HASH ) ? "rupasov" :
298 ( hash_code == R5_HASH ) ? "r5" :
299 ( hash_code == UNSET_HASH ) ? "unset" : "unknown",
300 DF( s_tree_height ),
301 DF( s_bmap_nr ),
302 DF( s_version ),
303 flags,
304 ( flags & reiserfs_attrs_cleared )
305 ? "attrs_cleared" : "",
306 DF (s_reserved_for_journal));
307
308 return 0;
309}
310
311static int show_oidmap(struct seq_file *m, struct super_block *sb)
312{
313 struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
314 struct reiserfs_super_block *rs = sb_info -> s_rs;
315 unsigned int mapsize = le16_to_cpu( rs -> s_v1.s_oid_cursize );
316 unsigned long total_used = 0;
317 int i;
318
319 for( i = 0 ; i < mapsize ; ++i ) {
320 __u32 right;
321
322 right = ( i == mapsize - 1 ) ? MAX_KEY_OBJECTID : MAP( i + 1 );
323 seq_printf(m, "%s: [ %x .. %x )\n",
324 ( i & 1 ) ? "free" : "used", MAP( i ), right );
325 if( ! ( i & 1 ) ) {
326 total_used += right - MAP( i );
327 }
328 }
329#if defined( REISERFS_USE_OIDMAPF )
330 if( sb_info -> oidmap.use_file && ( sb_info -> oidmap.mapf != NULL ) ) {
331 loff_t size = sb_info->oidmap.mapf->f_dentry->d_inode->i_size;
332 total_used += size / sizeof( reiserfs_oidinterval_d_t );
333 }
334#endif
335 seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n",
336 mapsize,
337 mapsize, le16_to_cpu( rs -> s_v1.s_oid_maxsize ),
338 total_used);
339 return 0;
340}
341
342static int show_journal(struct seq_file *m, struct super_block *sb)
343{
344 struct reiserfs_sb_info *r = REISERFS_SB(sb);
345 struct reiserfs_super_block *rs = r -> s_rs;
346 struct journal_params *jp = &rs->s_v1.s_journal;
347 char b[BDEVNAME_SIZE];
348
349
350 seq_printf(m, /* on-disk fields */
351 "jp_journal_1st_block: \t%i\n"
352 "jp_journal_dev: \t%s[%x]\n"
353 "jp_journal_size: \t%i\n"
354 "jp_journal_trans_max: \t%i\n"
355 "jp_journal_magic: \t%i\n"
356 "jp_journal_max_batch: \t%i\n"
357 "jp_journal_max_commit_age: \t%i\n"
358 "jp_journal_max_trans_age: \t%i\n"
359 /* incore fields */
360 "j_1st_reserved_block: \t%i\n"
361 "j_state: \t%li\n"
362 "j_trans_id: \t%lu\n"
363 "j_mount_id: \t%lu\n"
364 "j_start: \t%lu\n"
365 "j_len: \t%lu\n"
366 "j_len_alloc: \t%lu\n"
367 "j_wcount: \t%i\n"
368 "j_bcount: \t%lu\n"
369 "j_first_unflushed_offset: \t%lu\n"
370 "j_last_flush_trans_id: \t%lu\n"
371 "j_trans_start_time: \t%li\n"
372 "j_list_bitmap_index: \t%i\n"
373 "j_must_wait: \t%i\n"
374 "j_next_full_flush: \t%i\n"
375 "j_next_async_flush: \t%i\n"
376 "j_cnode_used: \t%i\n"
377 "j_cnode_free: \t%i\n"
378 "\n"
379 /* reiserfs_proc_info_data_t.journal fields */
380 "in_journal: \t%12lu\n"
381 "in_journal_bitmap: \t%12lu\n"
382 "in_journal_reusable: \t%12lu\n"
383 "lock_journal: \t%12lu\n"
384 "lock_journal_wait: \t%12lu\n"
385 "journal_begin: \t%12lu\n"
386 "journal_relock_writers: \t%12lu\n"
387 "journal_relock_wcount: \t%12lu\n"
388 "mark_dirty: \t%12lu\n"
389 "mark_dirty_already: \t%12lu\n"
390 "mark_dirty_notjournal: \t%12lu\n"
391 "restore_prepared: \t%12lu\n"
392 "prepare: \t%12lu\n"
393 "prepare_retry: \t%12lu\n",
394
395 DJP( jp_journal_1st_block ),
396 bdevname(SB_JOURNAL(sb)->j_dev_bd, b),
397 DJP( jp_journal_dev ),
398 DJP( jp_journal_size ),
399 DJP( jp_journal_trans_max ),
400 DJP( jp_journal_magic ),
401 DJP( jp_journal_max_batch ),
402 SB_JOURNAL(sb)->j_max_commit_age,
403 DJP( jp_journal_max_trans_age ),
404
405 JF( j_1st_reserved_block ),
406 JF( j_state ),
407 JF( j_trans_id ),
408 JF( j_mount_id ),
409 JF( j_start ),
410 JF( j_len ),
411 JF( j_len_alloc ),
412 atomic_read( & r -> s_journal -> j_wcount ),
413 JF( j_bcount ),
414 JF( j_first_unflushed_offset ),
415 JF( j_last_flush_trans_id ),
416 JF( j_trans_start_time ),
417 JF( j_list_bitmap_index ),
418 JF( j_must_wait ),
419 JF( j_next_full_flush ),
420 JF( j_next_async_flush ),
421 JF( j_cnode_used ),
422 JF( j_cnode_free ),
423
424 SFPJ( in_journal ),
425 SFPJ( in_journal_bitmap ),
426 SFPJ( in_journal_reusable ),
427 SFPJ( lock_journal ),
428 SFPJ( lock_journal_wait ),
429 SFPJ( journal_being ),
430 SFPJ( journal_relock_writers ),
431 SFPJ( journal_relock_wcount ),
432 SFPJ( mark_dirty ),
433 SFPJ( mark_dirty_already ),
434 SFPJ( mark_dirty_notjournal ),
435 SFPJ( restore_prepared ),
436 SFPJ( prepare ),
437 SFPJ( prepare_retry )
438 );
439 return 0;
440}
441
442/* iterator */
443static int test_sb(struct super_block *sb, void *data)
444{
445 return data == sb;
446}
447
448static int set_sb(struct super_block *sb, void *data)
449{
450 return -ENOENT;
451}
452
453static void *r_start(struct seq_file *m, loff_t *pos)
454{
455 struct proc_dir_entry *de = m->private;
456 struct super_block *s = de->parent->data;
457 loff_t l = *pos;
458
459 if (l)
460 return NULL;
461
462 if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, s)))
463 return NULL;
464
465 up_write(&s->s_umount);
466
467 if (de->deleted) {
468 deactivate_super(s);
469 return NULL;
470 }
471
472 return s;
473}
474
475static void *r_next(struct seq_file *m, void *v, loff_t *pos)
476{
477 ++*pos;
478 if (v)
479 deactivate_super(v);
480 return NULL;
481}
482
483static void r_stop(struct seq_file *m, void *v)
484{
485 if (v)
486 deactivate_super(v);
487}
488
489static int r_show(struct seq_file *m, void *v)
490{
491 struct proc_dir_entry *de = m->private;
492 int (*show)(struct seq_file *, struct super_block *) = de->data;
493 return show(m, v);
494}
495
496static struct seq_operations r_ops = {
497 .start = r_start,
498 .next = r_next,
499 .stop = r_stop,
500 .show = r_show,
501};
502
503static int r_open(struct inode *inode, struct file *file)
504{
505 int ret = seq_open(file, &r_ops);
506
507 if (!ret) {
508 struct seq_file *m = file->private_data;
509 m->private = PDE(inode);
510 }
511 return ret;
512}
513
514static struct file_operations r_file_operations = {
515 .open = r_open,
516 .read = seq_read,
517 .llseek = seq_lseek,
518 .release = seq_release,
519};
520
521static struct proc_dir_entry *proc_info_root = NULL;
522static const char proc_info_root_name[] = "fs/reiserfs";
523
524static void add_file(struct super_block *sb, char *name,
525 int (*func)(struct seq_file *, struct super_block *))
526{
527 struct proc_dir_entry *de;
528 de = create_proc_entry(name, 0, REISERFS_SB(sb)->procdir);
529 if (de) {
530 de->data = func;
531 de->proc_fops = &r_file_operations;
532 }
533}
534
535int reiserfs_proc_info_init( struct super_block *sb )
536{
537 spin_lock_init( & __PINFO( sb ).lock );
538 REISERFS_SB(sb)->procdir = proc_mkdir(reiserfs_bdevname (sb), proc_info_root);
539 if( REISERFS_SB(sb)->procdir ) {
540 REISERFS_SB(sb)->procdir->owner = THIS_MODULE;
541 REISERFS_SB(sb)->procdir->data = sb;
542 add_file(sb, "version", show_version);
543 add_file(sb, "super", show_super);
544 add_file(sb, "per-level", show_per_level);
545 add_file(sb, "bitmap", show_bitmap);
546 add_file(sb, "on-disk-super", show_on_disk_super);
547 add_file(sb, "oidmap", show_oidmap);
548 add_file(sb, "journal", show_journal);
549 return 0;
550 }
551 reiserfs_warning(sb, "reiserfs: cannot create /proc/%s/%s",
552 proc_info_root_name, reiserfs_bdevname (sb) );
553 return 1;
554}
555
556int reiserfs_proc_info_done( struct super_block *sb )
557{
558 struct proc_dir_entry *de = REISERFS_SB(sb)->procdir;
559 if (de) {
560 remove_proc_entry("journal", de);
561 remove_proc_entry("oidmap", de);
562 remove_proc_entry("on-disk-super", de);
563 remove_proc_entry("bitmap", de);
564 remove_proc_entry("per-level", de);
565 remove_proc_entry("super", de);
566 remove_proc_entry("version", de);
567 }
568 spin_lock( & __PINFO( sb ).lock );
569 __PINFO( sb ).exiting = 1;
570 spin_unlock( & __PINFO( sb ).lock );
571 if ( proc_info_root ) {
572 remove_proc_entry( reiserfs_bdevname (sb), proc_info_root );
573 REISERFS_SB(sb)->procdir = NULL;
574 }
575 return 0;
576}
577
578struct proc_dir_entry *reiserfs_proc_register_global( char *name,
579 read_proc_t *func )
580{
581 return ( proc_info_root ) ? create_proc_read_entry( name, 0,
582 proc_info_root,
583 func, NULL ) : NULL;
584}
585
586void reiserfs_proc_unregister_global( const char *name )
587{
588 remove_proc_entry( name, proc_info_root );
589}
590
591int reiserfs_proc_info_global_init( void )
592{
593 if( proc_info_root == NULL ) {
594 proc_info_root = proc_mkdir(proc_info_root_name, NULL);
595 if( proc_info_root ) {
596 proc_info_root -> owner = THIS_MODULE;
597 } else {
598 reiserfs_warning (NULL,
599 "reiserfs: cannot create /proc/%s",
600 proc_info_root_name );
601 return 1;
602 }
603 }
604 return 0;
605}
606
607int reiserfs_proc_info_global_done( void )
608{
609 if ( proc_info_root != NULL ) {
610 proc_info_root = NULL;
611 remove_proc_entry(proc_info_root_name, NULL);
612 }
613 return 0;
614}
615
616/* REISERFS_PROC_INFO */
617#else
618
619int reiserfs_proc_info_init( struct super_block *sb ) { return 0; }
620int reiserfs_proc_info_done( struct super_block *sb ) { return 0; }
621
622struct proc_dir_entry *reiserfs_proc_register_global( char *name,
623 read_proc_t *func )
624{ return NULL; }
625
626void reiserfs_proc_unregister_global( const char *name ) {;}
627
628int reiserfs_proc_info_global_init( void ) { return 0; }
629int reiserfs_proc_info_global_done( void ) { return 0; }
630
631int reiserfs_global_version_in_proc( char *buffer, char **start,
632 off_t offset,
633 int count, int *eof, void *data )
634{ return 0; }
635
636/* REISERFS_PROC_INFO */
637#endif
638
639/*
640 * $Log: procfs.c,v $
641 * Revision 1.1.8.2 2001/07/15 17:08:42 god
642 * . use get_super() in procfs.c
643 * . remove remove_save_link() from reiserfs_do_truncate()
644 *
645 * I accept terms and conditions stated in the Legal Agreement
646 * (available at http://www.namesys.com/legalese.html)
647 *
648 * Revision 1.1.8.1 2001/07/11 16:48:50 god
649 * proc info support
650 *
651 * I accept terms and conditions stated in the Legal Agreement
652 * (available at http://www.namesys.com/legalese.html)
653 *
654 */
655
656/*
657 * Make Linus happy.
658 * Local variables:
659 * c-indentation-style: "K&R"
660 * mode-name: "LC"
661 * c-basic-offset: 8
662 * tab-width: 8
663 * End:
664 */
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
new file mode 100644
index 000000000000..170012078b76
--- /dev/null
+++ b/fs/reiserfs/resize.c
@@ -0,0 +1,182 @@
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5/*
6 * Written by Alexander Zarochentcev.
7 *
8 * The kernel part of the (on-line) reiserfs resizer.
9 */
10
11#include <linux/kernel.h>
12#include <linux/mm.h>
13#include <linux/vmalloc.h>
14#include <linux/string.h>
15#include <linux/errno.h>
16#include <linux/reiserfs_fs.h>
17#include <linux/reiserfs_fs_sb.h>
18#include <linux/buffer_head.h>
19
20int reiserfs_resize (struct super_block * s, unsigned long block_count_new)
21{
22 int err = 0;
23 struct reiserfs_super_block * sb;
24 struct reiserfs_bitmap_info *bitmap;
25 struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s);
26 struct buffer_head * bh;
27 struct reiserfs_transaction_handle th;
28 unsigned int bmap_nr_new, bmap_nr;
29 unsigned int block_r_new, block_r;
30
31 struct reiserfs_list_bitmap * jb;
32 struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS];
33
34 unsigned long int block_count, free_blocks;
35 int i;
36 int copy_size ;
37
38 sb = SB_DISK_SUPER_BLOCK(s);
39
40 if (SB_BLOCK_COUNT(s) >= block_count_new) {
41 printk("can\'t shrink filesystem on-line\n");
42 return -EINVAL;
43 }
44
45 /* check the device size */
46 bh = sb_bread(s, block_count_new - 1);
47 if (!bh) {
48 printk("reiserfs_resize: can\'t read last block\n");
49 return -EINVAL;
50 }
51 bforget(bh);
52
53 /* old disk layout detection; those partitions can be mounted, but
54 * cannot be resized */
55 if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size
56 != REISERFS_DISK_OFFSET_IN_BYTES ) {
57 printk("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n");
58 return -ENOTSUPP;
59 }
60
61 /* count used bits in last bitmap block */
62 block_r = SB_BLOCK_COUNT(s) -
63 (SB_BMAP_NR(s) - 1) * s->s_blocksize * 8;
64
65 /* count bitmap blocks in new fs */
66 bmap_nr_new = block_count_new / ( s->s_blocksize * 8 );
67 block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8;
68 if (block_r_new)
69 bmap_nr_new++;
70 else
71 block_r_new = s->s_blocksize * 8;
72
73 /* save old values */
74 block_count = SB_BLOCK_COUNT(s);
75 bmap_nr = SB_BMAP_NR(s);
76
77 /* resizing of reiserfs bitmaps (journal and real), if needed */
78 if (bmap_nr_new > bmap_nr) {
79 /* reallocate journal bitmaps */
80 if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) {
81 printk("reiserfs_resize: unable to allocate memory for journal bitmaps\n");
82 unlock_super(s) ;
83 return -ENOMEM ;
84 }
85 /* the new journal bitmaps are zero filled, now we copy in the bitmap
86 ** node pointers from the old journal bitmap structs, and then
87 ** transfer the new data structures into the journal struct.
88 **
89 ** using the copy_size var below allows this code to work for
90 ** both shrinking and expanding the FS.
91 */
92 copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr ;
93 copy_size = copy_size * sizeof(struct reiserfs_list_bitmap_node *) ;
94 for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) {
95 struct reiserfs_bitmap_node **node_tmp ;
96 jb = SB_JOURNAL(s)->j_list_bitmap + i ;
97 memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size) ;
98
99 /* just in case vfree schedules on us, copy the new
100 ** pointer into the journal struct before freeing the
101 ** old one
102 */
103 node_tmp = jb->bitmaps ;
104 jb->bitmaps = jbitmap[i].bitmaps ;
105 vfree(node_tmp) ;
106 }
107
108 /* allocate additional bitmap blocks, reallocate array of bitmap
109 * block pointers */
110 bitmap = vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
111 if (!bitmap) {
112 /* Journal bitmaps are still supersized, but the memory isn't
113 * leaked, so I guess it's ok */
114 printk("reiserfs_resize: unable to allocate memory.\n");
115 return -ENOMEM;
116 }
117 memset (bitmap, 0, sizeof (struct reiserfs_bitmap_info) * SB_BMAP_NR(s));
118 for (i = 0; i < bmap_nr; i++)
119 bitmap[i] = old_bitmap[i];
120
121 /* This doesn't go through the journal, but it doesn't have to.
122 * The changes are still atomic: We're synced up when the journal
123 * transaction begins, and the new bitmaps don't matter if the
124 * transaction fails. */
125 for (i = bmap_nr; i < bmap_nr_new; i++) {
126 bitmap[i].bh = sb_getblk(s, i * s->s_blocksize * 8);
127 memset(bitmap[i].bh->b_data, 0, sb_blocksize(sb));
128 reiserfs_test_and_set_le_bit(0, bitmap[i].bh->b_data);
129
130 set_buffer_uptodate(bitmap[i].bh);
131 mark_buffer_dirty(bitmap[i].bh) ;
132 sync_dirty_buffer(bitmap[i].bh);
133 // update bitmap_info stuff
134 bitmap[i].first_zero_hint=1;
135 bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
136 }
137 /* free old bitmap blocks array */
138 SB_AP_BITMAP(s) = bitmap;
139 vfree (old_bitmap);
140 }
141
142 /* begin transaction, if there was an error, it's fine. Yes, we have
143 * incorrect bitmaps now, but none of it is ever going to touch the
144 * disk anyway. */
145 err = journal_begin(&th, s, 10);
146 if (err)
147 return err;
148
149 /* correct last bitmap blocks in old and new disk layout */
150 reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[bmap_nr - 1].bh, 1);
151 for (i = block_r; i < s->s_blocksize * 8; i++)
152 reiserfs_test_and_clear_le_bit(i,
153 SB_AP_BITMAP(s)[bmap_nr - 1].bh->b_data);
154 SB_AP_BITMAP(s)[bmap_nr - 1].free_count += s->s_blocksize * 8 - block_r;
155 if ( !SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint)
156 SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint = block_r;
157
158 journal_mark_dirty(&th, s, SB_AP_BITMAP(s)[bmap_nr - 1].bh);
159
160 reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[bmap_nr_new - 1].bh, 1);
161 for (i = block_r_new; i < s->s_blocksize * 8; i++)
162 reiserfs_test_and_set_le_bit(i,
163 SB_AP_BITMAP(s)[bmap_nr_new - 1].bh->b_data);
164 journal_mark_dirty(&th, s, SB_AP_BITMAP(s)[bmap_nr_new - 1].bh);
165
166 SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count -= s->s_blocksize * 8 - block_r_new;
167 /* Extreme case where the last bitmap block is the only valid block it maps. */
168 if ( !SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count )
169 SB_AP_BITMAP(s)[bmap_nr_new - 1].first_zero_hint = 0;
170 /* update super */
171 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
172 free_blocks = SB_FREE_BLOCKS(s);
173 PUT_SB_FREE_BLOCKS(s, free_blocks + (block_count_new - block_count - (bmap_nr_new - bmap_nr)));
174 PUT_SB_BLOCK_COUNT(s, block_count_new);
175 PUT_SB_BMAP_NR(s, bmap_nr_new);
176 s->s_dirt = 1;
177
178 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
179
180 SB_JOURNAL(s)->j_must_wait = 1;
181 return journal_end(&th, s, 10);
182}
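
/*
 * Editorial sketch, not part of the original file: the bitmap-count
 * arithmetic used by reiserfs_resize() above, pulled into a standalone
 * helper.  Each bitmap block maps blocksize * 8 filesystem blocks; a
 * partial trailing group still needs a whole bitmap block, and block_r
 * reports how many bits of the last bitmap block are in use.
 */
#include <stdio.h>

static void bmap_geometry(unsigned long block_count, unsigned int blocksize,
			  unsigned int *bmap_nr, unsigned int *block_r)
{
	unsigned long bits_per_bmap = (unsigned long)blocksize * 8;

	*bmap_nr = block_count / bits_per_bmap;
	*block_r = block_count - (unsigned long)*bmap_nr * bits_per_bmap;
	if (*block_r)
		(*bmap_nr)++;
	else
		*block_r = bits_per_bmap;
}

int main(void)
{
	unsigned int nr, r;

	/* e.g. growing to 100000 blocks with a 4096-byte blocksize */
	bmap_geometry(100000, 4096, &nr, &r);
	printf("bmap_nr=%u block_r=%u\n", nr, r);	/* 4 and 1696 */
	return 0;
}
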
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
new file mode 100644
index 000000000000..73ec5212178b
--- /dev/null
+++ b/fs/reiserfs/stree.c
@@ -0,0 +1,2073 @@
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5/*
6 * Written by Anatoly P. Pinchuk pap@namesys.botik.ru
7 * Programm System Institute
8 * Pereslavl-Zalessky Russia
9 */
10
11/*
12 * This file contains functions dealing with S+tree
13 *
14 * B_IS_IN_TREE
15 * copy_item_head
16 * comp_short_keys
17 * comp_keys
18 * comp_short_le_keys
19 * le_key2cpu_key
20 * comp_le_keys
21 * bin_search
22 * get_lkey
23 * get_rkey
24 * key_in_buffer
25 * decrement_bcount
26 * decrement_counters_in_path
27 * reiserfs_check_path
28 * pathrelse_and_restore
29 * pathrelse
30 * search_by_key_reada
31 * search_by_key
32 * search_for_position_by_key
33 * comp_items
34 * prepare_for_direct_item
35 * prepare_for_direntry_item
36 * prepare_for_delete_or_cut
37 * calc_deleted_bytes_number
38 * init_tb_struct
39 * padd_item
40 * reiserfs_delete_item
41 * reiserfs_delete_solid_item
42 * reiserfs_delete_object
43 * maybe_indirect_to_direct
44 * indirect_to_direct_roll_back
45 * reiserfs_cut_from_item
46 * truncate_directory
47 * reiserfs_do_truncate
48 * reiserfs_paste_into_item
49 * reiserfs_insert_item
50 */
51
52#include <linux/config.h>
53#include <linux/time.h>
54#include <linux/string.h>
55#include <linux/pagemap.h>
56#include <linux/reiserfs_fs.h>
57#include <linux/smp_lock.h>
58#include <linux/buffer_head.h>
59#include <linux/quotaops.h>
60
61/* Does the buffer contain a disk block which is in the tree? */
62inline int B_IS_IN_TREE (const struct buffer_head * p_s_bh)
63{
64
65 RFALSE( B_LEVEL (p_s_bh) > MAX_HEIGHT,
66 "PAP-1010: block (%b) has too big level (%z)", p_s_bh, p_s_bh);
67
68 return ( B_LEVEL (p_s_bh) != FREE_LEVEL );
69}
70
71//
72// get the item head in little-endian form
73//
74inline void copy_item_head(struct item_head * p_v_to,
75 const struct item_head * p_v_from)
76{
77 memcpy (p_v_to, p_v_from, IH_SIZE);
78}
79
80
81/* k1 is a pointer to an on-disk structure stored in little-endian
82 form. k2 is a pointer to a cpu-order variable. For keys of items of
83 the same object this returns 0.
84 Returns: -1 if key1 < key2
85 0 if key1 == key2
86 1 if key1 > key2 */
87inline int comp_short_keys (const struct reiserfs_key * le_key,
88 const struct cpu_key * cpu_key)
89{
90 __u32 * p_s_le_u32, * p_s_cpu_u32;
91 int n_key_length = REISERFS_SHORT_KEY_LEN;
92
93 p_s_le_u32 = (__u32 *)le_key;
94 p_s_cpu_u32 = (__u32 *)&cpu_key->on_disk_key;
95 for( ; n_key_length--; ++p_s_le_u32, ++p_s_cpu_u32 ) {
96 if ( le32_to_cpu (*p_s_le_u32) < *p_s_cpu_u32 )
97 return -1;
98 if ( le32_to_cpu (*p_s_le_u32) > *p_s_cpu_u32 )
99 return 1;
100 }
101
102 return 0;
103}
104
105
106/* k1 is a pointer to an on-disk structure stored in little-endian
107 form. k2 is a pointer to a cpu-order variable.
108 Compare keys using all 4 key fields.
109 Returns: -1 if key1 < key2
110 0 if key1 == key2, 1 if key1 > key2 */
111static inline int comp_keys (const struct reiserfs_key * le_key, const struct cpu_key * cpu_key)
112{
113 int retval;
114
115 retval = comp_short_keys (le_key, cpu_key);
116 if (retval)
117 return retval;
118 if (le_key_k_offset (le_key_version(le_key), le_key) < cpu_key_k_offset (cpu_key))
119 return -1;
120 if (le_key_k_offset (le_key_version(le_key), le_key) > cpu_key_k_offset (cpu_key))
121 return 1;
122
123 if (cpu_key->key_length == 3)
124 return 0;
125
126 /* this part is needed only when tail conversion is in progress */
127 if (le_key_k_type (le_key_version(le_key), le_key) < cpu_key_k_type (cpu_key))
128 return -1;
129
130 if (le_key_k_type (le_key_version(le_key), le_key) > cpu_key_k_type (cpu_key))
131 return 1;
132
133 return 0;
134}
135
136
137inline int comp_short_le_keys (const struct reiserfs_key * key1, const struct reiserfs_key * key2)
138{
139 __u32 * p_s_1_u32, * p_s_2_u32;
140 int n_key_length = REISERFS_SHORT_KEY_LEN;
141
142 p_s_1_u32 = (__u32 *)key1;
143 p_s_2_u32 = (__u32 *)key2;
144 for( ; n_key_length--; ++p_s_1_u32, ++p_s_2_u32 ) {
145 if ( le32_to_cpu (*p_s_1_u32) < le32_to_cpu (*p_s_2_u32) )
146 return -1;
147 if ( le32_to_cpu (*p_s_1_u32) > le32_to_cpu (*p_s_2_u32) )
148 return 1;
149 }
150 return 0;
151}
152
153inline void le_key2cpu_key (struct cpu_key * to, const struct reiserfs_key * from)
154{
155 to->on_disk_key.k_dir_id = le32_to_cpu (from->k_dir_id);
156 to->on_disk_key.k_objectid = le32_to_cpu (from->k_objectid);
157
158 // find out version of the key
159 to->version = le_key_version (from);
160 if (to->version == KEY_FORMAT_3_5) {
161 to->on_disk_key.u.k_offset_v1.k_offset = le32_to_cpu (from->u.k_offset_v1.k_offset);
162 to->on_disk_key.u.k_offset_v1.k_uniqueness = le32_to_cpu (from->u.k_offset_v1.k_uniqueness);
163 } else {
164 to->on_disk_key.u.k_offset_v2.k_offset = offset_v2_k_offset(&from->u.k_offset_v2);
165 to->on_disk_key.u.k_offset_v2.k_type = offset_v2_k_type(&from->u.k_offset_v2);
166 }
167}
168
169
170
171// this does not say which one is bigger, it only returns 1 if keys
172// are not equal, 0 otherwise
173inline int comp_le_keys (const struct reiserfs_key * k1, const struct reiserfs_key * k2)
174{
175 return memcmp (k1, k2, sizeof (struct reiserfs_key));
176}
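
/*
 * Editorial sketch, not part of the original file: the comparison idiom
 * shared by comp_short_keys() and comp_short_le_keys() above, as a
 * standalone userspace program.  sample_le32_to_cpu() stands in for the
 * kernel's le32_to_cpu(); it decodes explicitly from bytes so the sketch
 * behaves the same on any host endianness.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t sample_le32_to_cpu(const unsigned char *p)
{
	return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
	       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

/* compare two on-disk (little-endian) short keys, word by word */
static int sample_comp_short_le_keys(const unsigned char *k1,
				     const unsigned char *k2)
{
	int i;

	for (i = 0; i < 2; i++) {	/* short key = dir_id + objectid */
		uint32_t a = sample_le32_to_cpu(k1 + 4 * i);
		uint32_t b = sample_le32_to_cpu(k2 + 4 * i);

		if (a < b)
			return -1;
		if (a > b)
			return 1;
	}
	return 0;
}

int main(void)
{
	/* dir_id = 1, objectid = 2 versus dir_id = 1, objectid = 3 */
	unsigned char k1[8] = { 1, 0, 0, 0, 2, 0, 0, 0 };
	unsigned char k2[8] = { 1, 0, 0, 0, 3, 0, 0, 0 };

	printf("%d\n", sample_comp_short_le_keys(k1, k2));	/* -1 */
	return 0;
}
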
177
178/**************************************************************************
179 * Binary search toolkit function *
180 * Search for an item in the array by the item key *
181 * Returns: 1 if found, 0 if not found; *
182 * *p_n_pos = number of the searched element if found, else the *
183 * number of the first element that is larger than p_v_key. *
184 **************************************************************************/
185/* For those not familiar with binary search: n_lbound is the leftmost item that it
186 could be, n_rbound the rightmost item that it could be. We examine the item
187 halfway between n_lbound and n_rbound, and that tells us either that we can increase
188 n_lbound, or decrease n_rbound, or that we have found it, or, once n_lbound > n_rbound,
189 that there are no possible items and we have not found it. With each examination we
190 cut the number of possible items it could be by one more than half rounded down,
191 or we find it. */
192static inline int bin_search (
193 const void * p_v_key, /* Key to search for. */
194 const void * p_v_base,/* First item in the array. */
195 int p_n_num, /* Number of items in the array. */
196 int p_n_width, /* Item size in the array
197 searched. Lest the reader be
198 confused, note that this is crafted
199 as a general function, and when it
200 is applied specifically to the array
201 of item headers in a node, p_n_width
202 is actually the item header size not
203 the item size. */
204 int * p_n_pos /* Number of the searched for element. */
205 ) {
206 int n_rbound, n_lbound, n_j;
207
208 for ( n_j = ((n_rbound = p_n_num - 1) + (n_lbound = 0))/2; n_lbound <= n_rbound; n_j = (n_rbound + n_lbound)/2 )
209 switch( comp_keys((struct reiserfs_key *)((char * )p_v_base + n_j * p_n_width), (struct cpu_key *)p_v_key) ) {
210 case -1: n_lbound = n_j + 1; continue;
211 case 1: n_rbound = n_j - 1; continue;
212 case 0: *p_n_pos = n_j; return ITEM_FOUND; /* Key found in the array. */
213 }
214
215 /* bin_search did not find given key, it returns position of key,
216 that is minimal and greater than the given one. */
217 *p_n_pos = n_lbound;
218 return ITEM_NOT_FOUND;
219}
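
/*
 * Editorial sketch, not part of the original file: bin_search() above in
 * miniature, over an array of plain ints instead of fixed-width key
 * records.  It returns 1/0 for found/not-found and always leaves *pos at
 * either the match or the first element greater than the needle, which
 * is exactly the insertion point the callers rely on.
 */
#include <stdio.h>

static int sample_bin_search(int needle, const int *base, int num, int *pos)
{
	int lo = 0, hi = num - 1;

	while (lo <= hi) {
		int mid = (lo + hi) / 2;

		if (base[mid] < needle)
			lo = mid + 1;
		else if (base[mid] > needle)
			hi = mid - 1;
		else {
			*pos = mid;
			return 1;	/* ITEM_FOUND */
		}
	}
	*pos = lo;			/* first element > needle */
	return 0;			/* ITEM_NOT_FOUND */
}

int main(void)
{
	int a[] = { 2, 4, 8, 16 };
	int pos;

	printf("%d %d\n", sample_bin_search(8, a, 4, &pos), pos);  /* 1 2 */
	printf("%d %d\n", sample_bin_search(5, a, 4, &pos), pos);  /* 0 2 */
	return 0;
}
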
220
221#ifdef CONFIG_REISERFS_CHECK
222extern struct tree_balance * cur_tb;
223#endif
224
225
226
227/* Minimal possible key. It is never in the tree. */
228const struct reiserfs_key MIN_KEY = {0, 0, {{0, 0},}};
229
230/* Maximal possible key. It is never in the tree. */
231const struct reiserfs_key MAX_KEY = {0xffffffff, 0xffffffff, {{0xffffffff, 0xffffffff},}};
232
233
234/* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom
235 of the path, and going upwards. We must check the path's validity at each step. If the key is not in
236 the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this
237 case we return a special key, either MIN_KEY or MAX_KEY. */
238static inline const struct reiserfs_key * get_lkey (
239 const struct path * p_s_chk_path,
240 const struct super_block * p_s_sb
241 ) {
242 int n_position, n_path_offset = p_s_chk_path->path_length;
243 struct buffer_head * p_s_parent;
244
245 RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET,
246 "PAP-5010: invalid offset in the path");
247
248 /* While not higher in path than first element. */
249 while ( n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) {
250
251 RFALSE( ! buffer_uptodate(PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)),
252 "PAP-5020: parent is not uptodate");
253
254 /* Parent at the path is not in the tree now. */
255 if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) )
256 return &MAX_KEY;
257 /* Check whether position in the parent is correct. */
258 if ( (n_position = PATH_OFFSET_POSITION(p_s_chk_path, n_path_offset)) > B_NR_ITEMS(p_s_parent) )
259 return &MAX_KEY;
260 /* Check whether parent at the path really points to the child. */
261 if ( B_N_CHILD_NUM(p_s_parent, n_position) !=
262 PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset + 1)->b_blocknr )
263 return &MAX_KEY;
264 /* Return delimiting key if position in the parent is not equal to zero. */
265 if ( n_position )
266 return B_N_PDELIM_KEY(p_s_parent, n_position - 1);
267 }
268 /* Return MIN_KEY if we are in the root of the buffer tree. */
269 if ( PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
270 SB_ROOT_BLOCK (p_s_sb) )
271 return &MIN_KEY;
272 return &MAX_KEY;
273}
274
275
276/* Get delimiting key of the buffer at the path and its right neighbor. */
277inline const struct reiserfs_key * get_rkey (
278 const struct path * p_s_chk_path,
279 const struct super_block * p_s_sb
280 ) {
281 int n_position,
282 n_path_offset = p_s_chk_path->path_length;
283 struct buffer_head * p_s_parent;
284
285 RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET,
286 "PAP-5030: invalid offset in the path");
287
288 while ( n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) {
289
290 RFALSE( ! buffer_uptodate(PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)),
291 "PAP-5040: parent is not uptodate");
292
293 /* Parent at the path is not in the tree now. */
294 if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) )
295 return &MIN_KEY;
296 /* Check whether position in the parent is correct. */
297 if ( (n_position = PATH_OFFSET_POSITION(p_s_chk_path, n_path_offset)) > B_NR_ITEMS(p_s_parent) )
298 return &MIN_KEY;
299 /* Check whether parent at the path really points to the child. */
300 if ( B_N_CHILD_NUM(p_s_parent, n_position) !=
301 PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset + 1)->b_blocknr )
302 return &MIN_KEY;
303 /* Return delimiting key if position in the parent is not the last one. */
304 if ( n_position != B_NR_ITEMS(p_s_parent) )
305 return B_N_PDELIM_KEY(p_s_parent, n_position);
306 }
307 /* Return MAX_KEY if we are in the root of the buffer tree. */
308 if ( PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
309 SB_ROOT_BLOCK (p_s_sb) )
310 return &MAX_KEY;
311 return &MIN_KEY;
312}
313
314
315/* Check whether a key is contained in the tree rooted from a buffer at a path. */
316/* This works by looking at the left and right delimiting keys for the buffer in the last path_element in
317 the path. These delimiting keys are stored at least one level above that buffer in the tree. If the
318 buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in
319 this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */
320static inline int key_in_buffer (
321 struct path * p_s_chk_path, /* Path which should be checked. */
322 const struct cpu_key * p_s_key, /* Key which should be checked. */
323 struct super_block * p_s_sb /* Super block pointer. */
324 ) {
325
326 RFALSE( ! p_s_key || p_s_chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET ||
327 p_s_chk_path->path_length > MAX_HEIGHT,
328 "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)",
329 p_s_key, p_s_chk_path->path_length);
330 RFALSE( !PATH_PLAST_BUFFER(p_s_chk_path)->b_bdev,
331 "PAP-5060: device must not be NODEV");
332
333 if ( comp_keys(get_lkey(p_s_chk_path, p_s_sb), p_s_key) == 1 )
334 /* left delimiting key is bigger than the key we look for */
335 return 0;
336 // if ( comp_keys(p_s_key, get_rkey(p_s_chk_path, p_s_sb)) != -1 )
337 if ( comp_keys(get_rkey(p_s_chk_path, p_s_sb), p_s_key) != 1 )
338 /* p_s_key must be less than the right delimiting key */
339 return 0;
340 return 1;
341}
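
/*
 * Editorial sketch, not part of the original file: the invariant that
 * key_in_buffer() above enforces.  A key belongs in a node exactly when
 * left_delimiting_key <= key < right_delimiting_key; at the edges of the
 * tree the missing delimiting keys are modeled by MIN_KEY/MAX_KEY
 * sentinels, played here by INT_MIN/INT_MAX over plain ints.
 */
#include <stdio.h>
#include <limits.h>

static int sample_key_in_buffer(int lkey, int key, int rkey)
{
	return lkey <= key && key < rkey;
}

int main(void)
{
	printf("%d\n", sample_key_in_buffer(10, 15, 20));	     /* 1 */
	printf("%d\n", sample_key_in_buffer(10, 20, 20));	     /* 0 */
	printf("%d\n", sample_key_in_buffer(INT_MIN, 15, INT_MAX)); /* 1 */
	return 0;
}
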
342
343
344inline void decrement_bcount(
345 struct buffer_head * p_s_bh
346 ) {
347 if ( p_s_bh ) {
348 if ( atomic_read (&(p_s_bh->b_count)) ) {
349 put_bh(p_s_bh) ;
350 return;
351 }
352 reiserfs_panic(NULL, "PAP-5070: decrement_bcount: trying to free free buffer %b", p_s_bh);
353 }
354}
355
356
357/* Decrement b_count field of the all buffers in the path. */
358void decrement_counters_in_path (
359 struct path * p_s_search_path
360 ) {
361 int n_path_offset = p_s_search_path->path_length;
362
363 RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET ||
364 n_path_offset > EXTENDED_MAX_HEIGHT - 1,
365 "PAP-5080: invalid path offset of %d", n_path_offset);
366
367 while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) {
368 struct buffer_head * bh;
369
370 bh = PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--);
371 decrement_bcount (bh);
372 }
373 p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
374}
375
376
377int reiserfs_check_path(struct path *p) {
378 RFALSE( p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET,
379 "path not properly released") ;
380 return 0 ;
381}
382
383
384/* Release all buffers in the path. Restore the dirty bits of buffers
385** that were cleaned when preparing them for the log
386**
387** only called from fix_nodes()
388*/
389void pathrelse_and_restore (
390 struct super_block *s,
391 struct path * p_s_search_path
392 ) {
393 int n_path_offset = p_s_search_path->path_length;
394
395 RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
396 "clm-4000: invalid path offset");
397
398 while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) {
399 reiserfs_restore_prepared_buffer(s, PATH_OFFSET_PBUFFER(p_s_search_path,
400 n_path_offset));
401 brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--));
402 }
403 p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
404}
405
406/* Release all buffers in the path. */
407void pathrelse (
408 struct path * p_s_search_path
409 ) {
410 int n_path_offset = p_s_search_path->path_length;
411
412 RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
413 "PAP-5090: invalid path offset");
414
415 while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET )
416 brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--));
417
418 p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
419}
420
421
422
423static int is_leaf (char * buf, int blocksize, struct buffer_head * bh)
424{
425 struct block_head * blkh;
426 struct item_head * ih;
427 int used_space;
428 int prev_location;
429 int i;
430 int nr;
431
432 blkh = (struct block_head *)buf;
433 if ( blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) {
434 reiserfs_warning (NULL, "is_leaf: this should be caught earlier");
435 return 0;
436 }
437
438 nr = blkh_nr_item(blkh);
439 if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) {
440 /* item number is too big or too small */
441 reiserfs_warning (NULL, "is_leaf: nr_item seems wrong: %z", bh);
442 return 0;
443 }
444 ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
445 used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location (ih));
446 if (used_space != blocksize - blkh_free_space(blkh)) {
447 /* free space does not match the calculated amount of used space */
448 reiserfs_warning (NULL, "is_leaf: free space seems wrong: %z", bh);
449 return 0;
450 }
451
452 // FIXME: this much checking in is_leaf will hit performance too
453 // much - we may just return 1 here
454
455 /* check tables of item heads */
456 ih = (struct item_head *)(buf + BLKH_SIZE);
457 prev_location = blocksize;
458 for (i = 0; i < nr; i ++, ih ++) {
459 if ( le_ih_k_type(ih) == TYPE_ANY) {
460 reiserfs_warning (NULL, "is_leaf: wrong item type for item %h",ih);
461 return 0;
462 }
463 if (ih_location (ih) >= blocksize || ih_location (ih) < IH_SIZE * nr) {
464 reiserfs_warning (NULL, "is_leaf: item location seems wrong: %h", ih);
465 return 0;
466 }
467 if (ih_item_len (ih) < 1 || ih_item_len (ih) > MAX_ITEM_LEN (blocksize)) {
468 reiserfs_warning (NULL, "is_leaf: item length seems wrong: %h", ih);
469 return 0;
470 }
471 if (prev_location - ih_location (ih) != ih_item_len (ih)) {
472 reiserfs_warning (NULL, "is_leaf: item location seems wrong (second one): %h", ih);
473 return 0;
474 }
475 prev_location = ih_location (ih);
476 }
477
478 // one may imagine much more checks
479 return 1;
480}
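
/*
 * Editorial sketch, not part of the original file: the core layout
 * invariant checked by the item-head loop in is_leaf() above.  Item
 * bodies are packed from the end of the block downward, so walking the
 * item heads in order, each body must end exactly where the previous
 * one began: prev_location - location == length.
 */
#include <stdio.h>

struct sample_ih {
	unsigned int location;	/* offset of the item body in the block */
	unsigned int len;	/* item body length */
};

/* returns 1 when the item bodies tile the tail of the block exactly */
static int sample_leaf_layout_ok(const struct sample_ih *ih, int nr,
				 unsigned int blocksize)
{
	unsigned int prev_location = blocksize;
	int i;

	for (i = 0; i < nr; i++) {
		if (prev_location - ih[i].location != ih[i].len)
			return 0;
		prev_location = ih[i].location;
	}
	return 1;
}

int main(void)
{
	/* two items of 100 and 50 bytes in a 4096-byte block */
	struct sample_ih ih[] = { { 3996, 100 }, { 3946, 50 } };

	printf("%d\n", sample_leaf_layout_ok(ih, 2, 4096));	/* 1 */
	return 0;
}
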
481
482
483/* returns 1 if buf looks like an internal node, 0 otherwise */
484static int is_internal (char * buf, int blocksize, struct buffer_head * bh)
485{
486 struct block_head * blkh;
487 int nr;
488 int used_space;
489
490 blkh = (struct block_head *)buf;
491 nr = blkh_level(blkh);
492 if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) {
493 /* this level is not possible for internal nodes */
494 reiserfs_warning (NULL, "is_internal: this should be caught earlier");
495 return 0;
496 }
497
498 nr = blkh_nr_item(blkh);
499 if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
500 /* for an internal node which is not the root we might check the min number of keys */
501 reiserfs_warning (NULL, "is_internal: number of keys seems wrong: %z", bh);
502 return 0;
503 }
504
505 used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1);
506 if (used_space != blocksize - blkh_free_space(blkh)) {
507 reiserfs_warning (NULL, "is_internal: free space seems wrong: %z", bh);
508 return 0;
509 }
510
511 // one may imagine much more checks
512 return 1;
513}
514
515
516// make sure that bh contains a formatted node of the reiserfs tree
517// at the 'level'-th level
518static int is_tree_node (struct buffer_head * bh, int level)
519{
520 if (B_LEVEL (bh) != level) {
521 reiserfs_warning (NULL, "is_tree_node: node level %d does not match the expected one %d",
522 B_LEVEL (bh), level);
523 return 0;
524 }
525 if (level == DISK_LEAF_NODE_LEVEL)
526 return is_leaf (bh->b_data, bh->b_size, bh);
527
528 return is_internal (bh->b_data, bh->b_size, bh);
529}
530
531
532
533#define SEARCH_BY_KEY_READA 16
534
535/* The function is NOT SCHEDULE-SAFE! */
536static void search_by_key_reada (struct super_block * s,
537 struct buffer_head **bh,
538 unsigned long *b, int num)
539{
540 int i,j;
541
542 for (i = 0 ; i < num ; i++) {
543 bh[i] = sb_getblk (s, b[i]);
544 }
545 for (j = 0 ; j < i ; j++) {
546 /*
547 * note, this needs attention if we are getting rid of the BKL
548 * you have to make sure the prepared bit isn't set on this buffer
549 */
550 if (!buffer_uptodate(bh[j]))
551 ll_rw_block(READA, 1, bh + j);
552 brelse(bh[j]);
553 }
554}
555
556/**************************************************************************
557 * Algorithm SearchByKey *
558 * look for item in the Disk S+Tree by its key *
559 * Input: p_s_sb - super block *
560 * p_s_key - pointer to the key to search *
561 * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR *
562 * p_s_search_path - path from the root to the needed leaf *
563 **************************************************************************/
564
565/* This function fills up the path from the root to the leaf as it
566 descends the tree looking for the key. It uses reiserfs_bread to
567 try to find buffers in the cache given their block number. If it
568 does not find them in the cache it reads them from disk. For each
569 node that search_by_key finds using reiserfs_bread, it then uses
570 bin_search to look through that node. bin_search will find the
571 position of the block_number of the next node if it is looking
572 through an internal node. If it is looking through a leaf node
573 bin_search will find the position of the item which has key either
574 equal to given key, or which is the maximal key less than the given
575 key. search_by_key returns a path that must be checked for the
576 correctness of the top of the path but need not be checked for the
577 correctness of the bottom of the path */
578/* The function is NOT SCHEDULE-SAFE! */
579int search_by_key (struct super_block * p_s_sb,
580 const struct cpu_key * p_s_key, /* Key to search. */
581 struct path * p_s_search_path, /* This structure was
582 allocated and initialized
583 by the calling
584 function. It is filled up
585 by this function. */
586 int n_stop_level /* How far down the tree to search. To
587 stop at leaf level - set to
588 DISK_LEAF_NODE_LEVEL */
589 ) {
590 int n_block_number;
591 int expected_level;
592 struct buffer_head * p_s_bh;
593 struct path_element * p_s_last_element;
594 int n_node_level, n_retval;
595 int right_neighbor_of_leaf_node;
596 int fs_gen;
597 struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
598 unsigned long reada_blocks[SEARCH_BY_KEY_READA];
599 int reada_count = 0;
600
601#ifdef CONFIG_REISERFS_CHECK
602 int n_repeat_counter = 0;
603#endif
604
605 PROC_INFO_INC( p_s_sb, search_by_key );
606
607 /* As we add each node to a path we increase its count. This means that
608 we must be careful to release all nodes in a path before we either
609 discard the path struct or re-use the path struct, as we do here. */
610
611 decrement_counters_in_path(p_s_search_path);
612
613 right_neighbor_of_leaf_node = 0;
614
615 /* With each iteration of this loop we search through the items in the
616 current node, and calculate the next current node(next path element)
617 for the next iteration of this loop.. */
618 n_block_number = SB_ROOT_BLOCK (p_s_sb);
619 expected_level = -1;
620 while ( 1 ) {
621
622#ifdef CONFIG_REISERFS_CHECK
623 if ( !(++n_repeat_counter % 50000) )
624 reiserfs_warning (p_s_sb, "PAP-5100: search_by_key: %s:"
625 "there were %d iterations of while loop "
626 "looking for key %K",
627 current->comm, n_repeat_counter, p_s_key);
628#endif
629
630 /* prep path to have another element added to it. */
631 p_s_last_element = PATH_OFFSET_PELEMENT(p_s_search_path, ++p_s_search_path->path_length);
632 fs_gen = get_generation (p_s_sb);
633
634 /* Read the next tree node, and set the last element in the path to
635 have a pointer to it. */
636 if ((p_s_bh = p_s_last_element->pe_buffer =
637 sb_getblk(p_s_sb, n_block_number)) ) {
638 if (!buffer_uptodate(p_s_bh) && reada_count > 1) {
639 search_by_key_reada (p_s_sb, reada_bh,
640 reada_blocks, reada_count);
641 }
642 ll_rw_block(READ, 1, &p_s_bh);
643 wait_on_buffer(p_s_bh);
644 if (!buffer_uptodate(p_s_bh))
645 goto io_error;
646 } else {
647io_error:
648 p_s_search_path->path_length --;
649 pathrelse(p_s_search_path);
650 return IO_ERROR;
651 }
652 reada_count = 0;
653 if (expected_level == -1)
654 expected_level = SB_TREE_HEIGHT (p_s_sb);
655 expected_level --;
656
657 /* It is possible that schedule occurred. We must check whether the key
658 to search is still in the tree rooted from the current buffer. If
659 not then repeat search from the root. */
660 if ( fs_changed (fs_gen, p_s_sb) &&
661 (!B_IS_IN_TREE (p_s_bh) ||
662 B_LEVEL(p_s_bh) != expected_level ||
663 !key_in_buffer(p_s_search_path, p_s_key, p_s_sb))) {
664 PROC_INFO_INC( p_s_sb, search_by_key_fs_changed );
665 PROC_INFO_INC( p_s_sb, search_by_key_restarted );
666 PROC_INFO_INC( p_s_sb, sbk_restarted[ expected_level - 1 ] );
667 decrement_counters_in_path(p_s_search_path);
668
669 /* Get the root block number so that we can repeat the search
670 starting from the root. */
671 n_block_number = SB_ROOT_BLOCK (p_s_sb);
672 expected_level = -1;
673 right_neighbor_of_leaf_node = 0;
674
675 /* repeat search from the root */
676 continue;
677 }
678
679 /* only check that the key is in the buffer if p_s_key is not
680 equal to MAX_KEY. The latter case is only possible in
681 "finish_unfinished()" processing during mount. */
682 RFALSE( comp_keys( &MAX_KEY, p_s_key ) &&
683 ! key_in_buffer(p_s_search_path, p_s_key, p_s_sb),
684 "PAP-5130: key is not in the buffer");
685#ifdef CONFIG_REISERFS_CHECK
686 if ( cur_tb ) {
687 print_cur_tb ("5140");
688 reiserfs_panic(p_s_sb, "PAP-5140: search_by_key: schedule occurred in do_balance!");
689 }
690#endif
691
692 // make sure that the node contents look like a node of the
693 // expected level
694 if (!is_tree_node (p_s_bh, expected_level)) {
695 reiserfs_warning (p_s_sb, "vs-5150: search_by_key: "
696 "invalid format found in block %ld. Fsck?",
697 p_s_bh->b_blocknr);
698 pathrelse (p_s_search_path);
699 return IO_ERROR;
700 }
701
702 /* ok, we have acquired next formatted node in the tree */
703 n_node_level = B_LEVEL (p_s_bh);
704
705 PROC_INFO_BH_STAT( p_s_sb, p_s_bh, n_node_level - 1 );
706
707 RFALSE( n_node_level < n_stop_level,
708 "vs-5152: tree level (%d) is less than stop level (%d)",
709 n_node_level, n_stop_level);
710
711 n_retval = bin_search( p_s_key, B_N_PITEM_HEAD(p_s_bh, 0),
712 B_NR_ITEMS(p_s_bh),
713 ( n_node_level == DISK_LEAF_NODE_LEVEL ) ? IH_SIZE : KEY_SIZE,
714 &(p_s_last_element->pe_position));
715 if (n_node_level == n_stop_level) {
716 return n_retval;
717 }
718
719 /* we are not in the stop level */
720 if (n_retval == ITEM_FOUND)
721 /* item has been found, so we choose the pointer which is to the right of the found one */
722 p_s_last_element->pe_position++;
723
724 /* if the item was not found we choose the position just to
725 the left of where it would be. This requires no code,
726 bin_search did it already. */
727
728 /* So we have chosen a position in the current node which is
729 an internal node. Now we calculate child block number by
730 position in the node. */
731 n_block_number = B_N_CHILD_NUM(p_s_bh, p_s_last_element->pe_position);
732
733 /* if we are going to read leaf nodes, try for read ahead as well */
734 if ((p_s_search_path->reada & PATH_READA) &&
735 n_node_level == DISK_LEAF_NODE_LEVEL + 1)
736 {
737 int pos = p_s_last_element->pe_position;
738 int limit = B_NR_ITEMS(p_s_bh);
739 struct reiserfs_key *le_key;
740
741 if (p_s_search_path->reada & PATH_READA_BACK)
742 limit = 0;
743 while(reada_count < SEARCH_BY_KEY_READA) {
744 if (pos == limit)
745 break;
746 reada_blocks[reada_count++] = B_N_CHILD_NUM(p_s_bh, pos);
747 if (p_s_search_path->reada & PATH_READA_BACK)
748 pos--;
749 else
750 pos++;
751
752 /*
753 * check to make sure we're in the same object
754 */
755 le_key = B_N_PDELIM_KEY(p_s_bh, pos);
756 if (le32_to_cpu(le_key->k_objectid) !=
757 p_s_key->on_disk_key.k_objectid)
758 {
759 break;
760 }
761 }
762 }
763 }
764}
765
766
767/* Form the path to an item and position in this item which contains
768 file byte defined by p_s_key. If there is no such item
769 corresponding to the key, we point the path to the item with
770 maximal key less than p_s_key, and *p_n_pos_in_item is set to one
771 past the last entry/byte in the item. If searching for entry in a
772 directory item, and it is not found, *p_n_pos_in_item is set to one
773 entry more than the entry with maximal key which is less than the
774 sought key.
775
776 Note that if there is no entry in this same node which is one more,
777 then we point to an imaginary entry. For direct items, the
778 position is in units of bytes, for indirect items the position is
779 in units of blocknr entries, for directory items the position is in
780 units of directory entries. */
781
782/* The function is NOT SCHEDULE-SAFE! */
783int search_for_position_by_key (struct super_block * p_s_sb, /* Pointer to the super block. */
784 const struct cpu_key * p_cpu_key, /* Key to search (cpu variable) */
785 struct path * p_s_search_path /* Filled up by this function. */
786 ) {
787 struct item_head * p_le_ih; /* pointer to on-disk structure */
788 int n_blk_size;
789 loff_t item_offset, offset;
790 struct reiserfs_dir_entry de;
791 int retval;
792
793 /* If searching for directory entry. */
794 if ( is_direntry_cpu_key (p_cpu_key) )
795 return search_by_entry_key (p_s_sb, p_cpu_key, p_s_search_path, &de);
796
797 /* If not searching for directory entry. */
798
799 /* If item is found. */
800 retval = search_item (p_s_sb, p_cpu_key, p_s_search_path);
801 if (retval == IO_ERROR)
802 return retval;
803 if ( retval == ITEM_FOUND ) {
804
805 RFALSE( ! ih_item_len(
806 B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path),
807 PATH_LAST_POSITION(p_s_search_path))),
808 "PAP-5165: item length equals zero");
809
810 pos_in_item(p_s_search_path) = 0;
811 return POSITION_FOUND;
812 }
813
814 RFALSE( ! PATH_LAST_POSITION(p_s_search_path),
815 "PAP-5170: position equals zero");
816
817 /* Item is not found. Set path to the previous item. */
818 p_le_ih = B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path), --PATH_LAST_POSITION(p_s_search_path));
819 n_blk_size = p_s_sb->s_blocksize;
820
821 if (comp_short_keys (&(p_le_ih->ih_key), p_cpu_key)) {
822 return FILE_NOT_FOUND;
823 }
824
825 // FIXME: quite ugly this far
826
827 item_offset = le_ih_k_offset (p_le_ih);
828 offset = cpu_key_k_offset (p_cpu_key);
829
830 /* Needed byte is contained in the item pointed to by the path.*/
831 if (item_offset <= offset &&
832 item_offset + op_bytes_number (p_le_ih, n_blk_size) > offset) {
833 pos_in_item (p_s_search_path) = offset - item_offset;
834 if ( is_indirect_le_ih(p_le_ih) ) {
835 pos_in_item (p_s_search_path) /= n_blk_size;
836 }
837 return POSITION_FOUND;
838 }
839
840 /* Needed byte is not contained in the item pointed to by the
841 path. Set pos_in_item out of the item. */
842 if ( is_indirect_le_ih (p_le_ih) )
843 pos_in_item (p_s_search_path) = ih_item_len(p_le_ih) / UNFM_P_SIZE;
844 else
845 pos_in_item (p_s_search_path) = ih_item_len( p_le_ih );
846
847 return POSITION_NOT_FOUND;
848}
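
/*
 * Editorial sketch, not part of the original file: the position
 * arithmetic at the end of search_for_position_by_key() above.  Given
 * the byte offset an item starts at and the byte we want, the position
 * inside a direct item is a byte index, while inside an indirect item
 * it indexes the array of block pointers, hence the divide by the
 * blocksize.  Offsets are 1-based, as in reiserfs keys.
 */
#include <stdio.h>

static unsigned long sample_pos_in_item(unsigned long item_offset,
					unsigned long wanted_offset,
					unsigned int blocksize,
					int is_indirect)
{
	unsigned long pos = wanted_offset - item_offset;

	return is_indirect ? pos / blocksize : pos;
}

int main(void)
{
	/* byte 10000 of a file, inside an indirect item starting at byte 1 */
	printf("%lu\n", sample_pos_in_item(1, 10000, 4096, 1));	/* 2 */
	/* byte 500 of the same file, if it sat in a direct item */
	printf("%lu\n", sample_pos_in_item(1, 500, 4096, 0));	/* 499 */
	return 0;
}
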
849
850
851/* Compare given item and item pointed to by the path. */
852int comp_items (const struct item_head * stored_ih, const struct path * p_s_path)
853{
854 struct buffer_head * p_s_bh;
855 struct item_head * ih;
856
857 /* Last buffer at the path is not in the tree. */
858 if ( ! B_IS_IN_TREE(p_s_bh = PATH_PLAST_BUFFER(p_s_path)) )
859 return 1;
860
861 /* Last path position is invalid. */
862 if ( PATH_LAST_POSITION(p_s_path) >= B_NR_ITEMS(p_s_bh) )
863 return 1;
864
865 /* we only need to know whether it is the same item */
866 ih = get_ih (p_s_path);
867 return memcmp (stored_ih, ih, IH_SIZE);
868}
869
870
871/* unformatted nodes are not logged anymore, ever. This is safe
872** now
873*/
874#define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1)
875
876// block can not be forgotten as it is in I/O or held by someone
877#define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh)))
878
879
880
881// prepare for delete or cut of direct item
882static inline int prepare_for_direct_item (struct path * path,
883 struct item_head * le_ih,
884 struct inode * inode,
885 loff_t new_file_length,
886 int * cut_size)
887{
888 loff_t round_len;
889
890
891 if ( new_file_length == max_reiserfs_offset (inode) ) {
892 /* item has to be deleted */
893 *cut_size = -(IH_SIZE + ih_item_len(le_ih));
894 return M_DELETE;
895 }
896
897 // new file gets truncated
898 if (get_inode_item_key_version (inode) == KEY_FORMAT_3_6) {
899 //
900 round_len = ROUND_UP (new_file_length);
901 /* this was n_new_file_length < le_ih ... */
902 if ( round_len < le_ih_k_offset (le_ih) ) {
903 *cut_size = -(IH_SIZE + ih_item_len(le_ih));
904 return M_DELETE; /* Delete this item. */
905 }
906 /* Calculate first position and size for cutting from item. */
907 pos_in_item (path) = round_len - (le_ih_k_offset (le_ih) - 1);
908 *cut_size = -(ih_item_len(le_ih) - pos_in_item(path));
909
910 return M_CUT; /* Cut from this item. */
911 }
912
913
914 // old file: items may have any length
915
916 if ( new_file_length < le_ih_k_offset (le_ih) ) {
917 *cut_size = -(IH_SIZE + ih_item_len(le_ih));
918 return M_DELETE; /* Delete this item. */
919 }
920 /* Calculate first position and size for cutting from item. */
921 *cut_size = -(ih_item_len(le_ih) -
922 (pos_in_item (path) = new_file_length + 1 - le_ih_k_offset (le_ih)));
923 return M_CUT; /* Cut from this item. */
924}
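
/*
 * Editorial sketch, not part of the original file: the truncate decision
 * made by prepare_for_direct_item() above for 3.6-format files, as
 * standalone arithmetic.  Key offsets are 1-based.  The new length is
 * first rounded up by ROUND_UP(); SAMPLE_GRANULARITY stands in for that
 * rounding (believed to be 8-byte alignment in this era of the code, but
 * treat it as an assumption of the sketch).  The item is deleted when it
 * starts past the rounded length, otherwise it is cut down to the
 * surviving prefix.
 */
#include <stdio.h>

#define SAMPLE_GRANULARITY 8ULL	/* assumed ROUND_UP() alignment */

static unsigned long long sample_round_up(unsigned long long len)
{
	return (len + SAMPLE_GRANULARITY - 1) & ~(SAMPLE_GRANULARITY - 1);
}

/* returns 'D' (delete the item) or 'C' (cut), setting *pos for the cut */
static char sample_truncate_mode(unsigned long long item_offset,
				 unsigned long long new_file_length,
				 unsigned long long *pos_in_item)
{
	unsigned long long round_len = sample_round_up(new_file_length);

	if (round_len < item_offset)
		return 'D';
	*pos_in_item = round_len - (item_offset - 1);
	return 'C';
}

int main(void)
{
	unsigned long long pos = 0;

	/* tail item starting at file byte 4097, file truncated to 4100 bytes */
	printf("%c %llu\n", sample_truncate_mode(4097, 4100, &pos), pos); /* C 8 */
	/* same item when the file is truncated to 10 bytes */
	printf("%c\n", sample_truncate_mode(4097, 10, &pos));	/* D */
	return 0;
}
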
925
926
927static inline int prepare_for_direntry_item (struct path * path,
928 struct item_head * le_ih,
929 struct inode * inode,
930 loff_t new_file_length,
931 int * cut_size)
932{
933 if (le_ih_k_offset (le_ih) == DOT_OFFSET &&
934 new_file_length == max_reiserfs_offset (inode)) {
935 RFALSE( ih_entry_count (le_ih) != 2,
936 "PAP-5220: incorrect empty directory item (%h)", le_ih);
937 *cut_size = -(IH_SIZE + ih_item_len(le_ih));
938 return M_DELETE; /* Delete the directory item containing "." and ".." entry. */
939 }
940
941 if ( ih_entry_count (le_ih) == 1 ) {
942 /* Delete the directory item, as there is only one record
943 in this item */
944 *cut_size = -(IH_SIZE + ih_item_len(le_ih));
945 return M_DELETE;
946 }
947
948 /* Cut one record from the directory item. */
949 *cut_size = -(DEH_SIZE + entry_length (get_last_bh (path), le_ih, pos_in_item (path)));
950 return M_CUT;
951}
952
953
954/* If the path points to a directory or direct item, calculate the mode and the size to cut, for balance.
955 If the path points to an indirect item, remove some number of its unformatted nodes.
956 In the case of a file truncate, calculate whether this item must be deleted/truncated or whether the last
957 unformatted node of this item will be converted to a direct item.
958 This function returns a determination of what balance mode the calling function should employ. */
959static char prepare_for_delete_or_cut(
960 struct reiserfs_transaction_handle *th,
961 struct inode * inode,
962 struct path * p_s_path,
963 const struct cpu_key * p_s_item_key,
964 int * p_n_removed, /* Number of unformatted nodes which were removed
965 from end of the file. */
966 int * p_n_cut_size,
967 unsigned long long n_new_file_length /* MAX_KEY_OFFSET in case of delete. */
968 ) {
969 struct super_block * p_s_sb = inode->i_sb;
970 struct item_head * p_le_ih = PATH_PITEM_HEAD(p_s_path);
971 struct buffer_head * p_s_bh = PATH_PLAST_BUFFER(p_s_path);
972
973 BUG_ON (!th->t_trans_id);
974
975 /* Stat_data item. */
976 if ( is_statdata_le_ih (p_le_ih) ) {
977
978 RFALSE( n_new_file_length != max_reiserfs_offset (inode),
979 "PAP-5210: mode must be M_DELETE");
980
981 *p_n_cut_size = -(IH_SIZE + ih_item_len(p_le_ih));
982 return M_DELETE;
983 }
984
985
986 /* Directory item. */
987 if ( is_direntry_le_ih (p_le_ih) )
988 return prepare_for_direntry_item (p_s_path, p_le_ih, inode, n_new_file_length, p_n_cut_size);
989
990 /* Direct item. */
991 if ( is_direct_le_ih (p_le_ih) )
992 return prepare_for_direct_item (p_s_path, p_le_ih, inode, n_new_file_length, p_n_cut_size);
993
994
995 /* Case of an indirect item. */
996 {
997 int n_unfm_number, /* Number of the item's unformatted nodes. */
998 n_counter,
999 n_blk_size;
1000 __u32 * p_n_unfm_pointer; /* Pointer to the unformatted node number. */
1001 __u32 tmp;
1002 struct item_head s_ih; /* Item header. */
1003 char c_mode; /* Returned mode of the balance. */
1004 int need_research;
1005
1006
1007 n_blk_size = p_s_sb->s_blocksize;
1008
1009 /* Search for the needed object indirect item until there are no unformatted nodes to be removed. */
1010 do {
1011 need_research = 0;
1012 p_s_bh = PATH_PLAST_BUFFER(p_s_path);
1013 /* Copy indirect item header to a temp variable. */
1014 copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
1015 /* Calculate number of unformatted nodes in this item. */
1016 n_unfm_number = I_UNFM_NUM(&s_ih);
1017
1018 RFALSE( ! is_indirect_le_ih(&s_ih) || ! n_unfm_number ||
1019 pos_in_item (p_s_path) + 1 != n_unfm_number,
1020 "PAP-5240: invalid item %h "
1021 "n_unfm_number = %d *p_n_pos_in_item = %d",
1022 &s_ih, n_unfm_number, pos_in_item (p_s_path));
1023
1024 /* Calculate balance mode and position in the item to remove unformatted nodes. */
1025 if ( n_new_file_length == max_reiserfs_offset (inode) ) {/* Case of delete. */
1026 pos_in_item (p_s_path) = 0;
1027 *p_n_cut_size = -(IH_SIZE + ih_item_len(&s_ih));
1028 c_mode = M_DELETE;
1029 }
1030 else { /* Case of truncate. */
1031 if ( n_new_file_length < le_ih_k_offset (&s_ih) ) {
1032 pos_in_item (p_s_path) = 0;
1033 *p_n_cut_size = -(IH_SIZE + ih_item_len(&s_ih));
1034 c_mode = M_DELETE; /* Delete this item. */
1035 }
1036 else {
1037 /* indirect item must be truncated starting from *p_n_pos_in_item-th position */
1038 pos_in_item (p_s_path) = (n_new_file_length + n_blk_size - le_ih_k_offset (&s_ih) ) >> p_s_sb->s_blocksize_bits;
1039
1040 RFALSE( pos_in_item (p_s_path) > n_unfm_number,
1041 "PAP-5250: invalid position in the item");
1042
1043 /* Either convert last unformatted node of indirect item to direct item or increase
1044 its free space. */
1045 if ( pos_in_item (p_s_path) == n_unfm_number ) {
1046 *p_n_cut_size = 0; /* Nothing to cut. */
1047 return M_CONVERT; /* Maybe convert last unformatted node to the direct item. */
1048 }
1049 /* Calculate size to cut. */
1050 *p_n_cut_size = -(ih_item_len(&s_ih) - pos_in_item(p_s_path) * UNFM_P_SIZE);
1051
1052 c_mode = M_CUT; /* Cut from this indirect item. */
1053 }
1054 }
1055
1056 RFALSE( n_unfm_number <= pos_in_item (p_s_path),
1057 "PAP-5260: invalid position in the indirect item");
1058
1059 /* pointers to be cut */
1060 n_unfm_number -= pos_in_item (p_s_path);
1061 /* Set pointer to the last unformatted node pointer that is to be cut. */
1062 p_n_unfm_pointer = (__u32 *)B_I_PITEM(p_s_bh, &s_ih) + I_UNFM_NUM(&s_ih) - 1 - *p_n_removed;
1063
1064
1065 /* We go through the unformatted node pointers of the indirect
1066 item and look for the unformatted nodes in the cache. If we
1067 find any of them we free them, zero the corresponding indirect
1068 item entries and log the buffer containing that indirect item.
1069 For this we need to prepare the last path element for logging.
1070 If some unformatted node has b_count > 1 we must not free that
1071 unformatted node since it is in use. */
1072 reiserfs_prepare_for_journal(p_s_sb, p_s_bh, 1);
1073 // note: path could be changed, first line in for loop takes care
1074 // of it
1075
1076 for (n_counter = *p_n_removed;
1077 n_counter < n_unfm_number; n_counter++, p_n_unfm_pointer-- ) {
1078
1079 cond_resched();
1080 if (item_moved (&s_ih, p_s_path)) {
1081 need_research = 1 ;
1082 break;
1083 }
1084 RFALSE( p_n_unfm_pointer < (__u32 *)B_I_PITEM(p_s_bh, &s_ih) ||
1085 p_n_unfm_pointer > (__u32 *)B_I_PITEM(p_s_bh, &s_ih) + I_UNFM_NUM(&s_ih) - 1,
1086 "vs-5265: pointer out of range");
1087
1088 /* Hole, nothing to remove. */
1089 if ( ! get_block_num(p_n_unfm_pointer,0) ) {
1090 (*p_n_removed)++;
1091 continue;
1092 }
1093
1094 (*p_n_removed)++;
1095
1096 tmp = get_block_num(p_n_unfm_pointer,0);
1097 put_block_num(p_n_unfm_pointer, 0, 0);
1098 journal_mark_dirty (th, p_s_sb, p_s_bh);
1099 reiserfs_free_block(th, inode, tmp, 1);
1100 if ( item_moved (&s_ih, p_s_path) ) {
1101 need_research = 1;
1102 break ;
1103 }
1104 }
1105
1106 /* a trick. If the buffer has been logged, this
1107 ** will do nothing. If we've broken the loop without
1108 ** logging it, it will restore the buffer
1109 **
1110 */
1111 reiserfs_restore_prepared_buffer(p_s_sb, p_s_bh);
1112
1113 /* This loop can be optimized. */
1114 } while ( (*p_n_removed < n_unfm_number || need_research) &&
1115 search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_FOUND );
1116
1117 RFALSE( *p_n_removed < n_unfm_number,
1118 "PAP-5310: indirect item is not found");
1119 RFALSE( item_moved (&s_ih, p_s_path),
1120 "after while, comp failed, retry") ;
1121
1122 if (c_mode == M_CUT)
1123 pos_in_item (p_s_path) *= UNFM_P_SIZE;
1124 return c_mode;
1125 }
1126}
1127
1128/* Calculate number of bytes which will be deleted or cut during balance */
1129static int calc_deleted_bytes_number(
1130 struct tree_balance * p_s_tb,
1131 char c_mode
1132 ) {
1133 int n_del_size;
1134 struct item_head * p_le_ih = PATH_PITEM_HEAD(p_s_tb->tb_path);
1135
1136 if ( is_statdata_le_ih (p_le_ih) )
1137 return 0;
1138
1139 n_del_size = ( c_mode == M_DELETE ) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0];
1140 if ( is_direntry_le_ih (p_le_ih) ) {
1141 // return EMPTY_DIR_SIZE; /* We delete empty directories only. */
1142 // we can't use EMPTY_DIR_SIZE, as old format dirs have a different
1143 // empty size. ick. FIXME, is this right?
1144 //
1145 return n_del_size ;
1146 }
1147
1148 if ( is_indirect_le_ih (p_le_ih) )
1149 n_del_size = (n_del_size/UNFM_P_SIZE)*
1150 (PATH_PLAST_BUFFER(p_s_tb->tb_path)->b_size);// - get_ih_free_space (p_le_ih);
1151 return n_del_size;
1152}
1153
1154static void init_tb_struct(
1155 struct reiserfs_transaction_handle *th,
1156 struct tree_balance * p_s_tb,
1157 struct super_block * p_s_sb,
1158 struct path * p_s_path,
1159 int n_size
1160 ) {
1161
1162 BUG_ON (!th->t_trans_id);
1163
1164 memset (p_s_tb,'\0',sizeof(struct tree_balance));
1165 p_s_tb->transaction_handle = th ;
1166 p_s_tb->tb_sb = p_s_sb;
1167 p_s_tb->tb_path = p_s_path;
1168 PATH_OFFSET_PBUFFER(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL;
1169 PATH_OFFSET_POSITION(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0;
1170 p_s_tb->insert_size[0] = n_size;
1171}
1172
1173
1174
1175void padd_item (char * item, int total_length, int length)
1176{
1177 int i;
1178
1179 for (i = total_length; i > length; )
1180 item [--i] = 0;
1181}
1182
1183#ifdef REISERQUOTA_DEBUG
1184char key2type(struct reiserfs_key *ih)
1185{
1186 if (is_direntry_le_key(2, ih))
1187 return 'd';
1188 if (is_direct_le_key(2, ih))
1189 return 'D';
1190 if (is_indirect_le_key(2, ih))
1191 return 'i';
1192 if (is_statdata_le_key(2, ih))
1193 return 's';
1194 return 'u';
1195}
1196
1197char head2type(struct item_head *ih)
1198{
1199 if (is_direntry_le_ih(ih))
1200 return 'd';
1201 if (is_direct_le_ih(ih))
1202 return 'D';
1203 if (is_indirect_le_ih(ih))
1204 return 'i';
1205 if (is_statdata_le_ih(ih))
1206 return 's';
1207 return 'u';
1208}
1209#endif
1210
1211/* Delete object item. */
1212int reiserfs_delete_item (struct reiserfs_transaction_handle *th,
1213 struct path * p_s_path, /* Path to the deleted item. */
1214 const struct cpu_key * p_s_item_key, /* Key to search for the deleted item. */
1215 struct inode * p_s_inode,/* inode is here just to update i_blocks and quotas */
1216 struct buffer_head * p_s_un_bh) /* NULL or unformatted node pointer. */
1217{
1218 struct super_block * p_s_sb = p_s_inode->i_sb;
1219 struct tree_balance s_del_balance;
1220 struct item_head s_ih;
1221 struct item_head *q_ih;
1222 int quota_cut_bytes;
1223 int n_ret_value,
1224 n_del_size,
1225 n_removed;
1226
1227#ifdef CONFIG_REISERFS_CHECK
1228 char c_mode;
1229 int n_iter = 0;
1230#endif
1231
1232 BUG_ON (!th->t_trans_id);
1233
1234 init_tb_struct(th, &s_del_balance, p_s_sb, p_s_path, 0/*size is unknown*/);
1235
1236 while ( 1 ) {
1237 n_removed = 0;
1238
1239#ifdef CONFIG_REISERFS_CHECK
1240 n_iter++;
1241 c_mode =
1242#endif
1243 prepare_for_delete_or_cut(th, p_s_inode, p_s_path, p_s_item_key, &n_removed, &n_del_size, max_reiserfs_offset (p_s_inode));
1244
1245 RFALSE( c_mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
1246
1247 copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
1248 s_del_balance.insert_size[0] = n_del_size;
1249
1250 n_ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
1251 if ( n_ret_value != REPEAT_SEARCH )
1252 break;
1253
1254 PROC_INFO_INC( p_s_sb, delete_item_restarted );
1255
1256 // file system changed, repeat search
1257 n_ret_value = search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path);
1258 if (n_ret_value == IO_ERROR)
1259 break;
1260 if (n_ret_value == FILE_NOT_FOUND) {
1261 reiserfs_warning (p_s_sb, "vs-5340: reiserfs_delete_item: "
1262 "no items of the file %K found", p_s_item_key);
1263 break;
1264 }
1265 } /* while (1) */
1266
1267 if ( n_ret_value != CARRY_ON ) {
1268 unfix_nodes(&s_del_balance);
1269 return 0;
1270 }
1271
1272    // reiserfs_delete_item returns the item length on success
1273 n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
1274 q_ih = get_ih(p_s_path) ;
1275 quota_cut_bytes = ih_item_len(q_ih) ;
1276
1277 /* hack so the quota code doesn't have to guess if the file
1278 ** has a tail. On tail insert, we allocate quota for 1 unformatted node.
1279 ** We test the offset because the tail might have been
1280 ** split into multiple items, and we only want to decrement for
1281 ** the unfm node once
1282 */
1283 if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(q_ih)) {
1284 if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) {
1285 quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE;
1286 } else {
1287 quota_cut_bytes = 0 ;
1288 }
1289 }
1290
1291 if ( p_s_un_bh ) {
1292 int off;
1293 char *data ;
1294
1295 /* We are in direct2indirect conversion, so move tail contents
1296 to the unformatted node */
1297 /* note, we do the copy before preparing the buffer because we
1298 ** don't care about the contents of the unformatted node yet.
1299    ** the only thing we really care about is that the direct item's data
1300    ** ends up in the unformatted node.
1301 **
1302 ** Otherwise, we would have to call reiserfs_prepare_for_journal on
1303 ** the unformatted node, which might schedule, meaning we'd have to
1304 ** loop all the way back up to the start of the while loop.
1305 **
1306 ** The unformatted node must be dirtied later on. We can't be
1307 ** sure here if the entire tail has been deleted yet.
1308 **
1309 ** p_s_un_bh is from the page cache (all unformatted nodes are
1310 ** from the page cache) and might be a highmem page. So, we
1311 ** can't use p_s_un_bh->b_data.
1312 ** -clm
1313 */
1314
1315 data = kmap_atomic(p_s_un_bh->b_page, KM_USER0);
1316 off = ((le_ih_k_offset (&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));
1317 memcpy(data + off,
1318 B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih), n_ret_value);
1319 kunmap_atomic(data, KM_USER0);
1320 }
1321 /* Perform balancing after all resources have been collected at once. */
1322 do_balance(&s_del_balance, NULL, NULL, M_DELETE);
1323
1324#ifdef REISERQUOTA_DEBUG
1325 reiserfs_debug (p_s_sb, REISERFS_DEBUG_CODE, "reiserquota delete_item(): freeing %u, id=%u type=%c", quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih));
1326#endif
1327 DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
1328
1329 /* Return deleted body length */
1330 return n_ret_value;
1331}
1332
1333
1334/* Summary Of Mechanisms For Handling Collisions Between Processes:
1335
1336 deletion of the body of the object is performed by iput(), with the
1337 result that if multiple processes are operating on a file, the
1338 deletion of the body of the file is deferred until the last process
1339 that has an open inode performs its iput().
1340
1341 writes and truncates are protected from collisions by use of
1342 semaphores.
1343
1344    create, link, and mknod are protected from collisions with other
1345    processes by making reiserfs_add_entry() the last step in the
1346    creation, and then rolling back all changes if there was a collision.
1347 - Hans
1348*/
1349
1350
1351/* this deletes item which never gets split */
1352void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th,
1353 struct inode *inode,
1354 struct reiserfs_key * key)
1355{
1356 struct tree_balance tb;
1357 INITIALIZE_PATH (path);
1358 int item_len = 0;
1359 int tb_init = 0 ;
1360 struct cpu_key cpu_key;
1361 int retval;
1362 int quota_cut_bytes = 0;
1363
1364 BUG_ON (!th->t_trans_id);
1365
1366 le_key2cpu_key (&cpu_key, key);
1367
1368 while (1) {
1369 retval = search_item (th->t_super, &cpu_key, &path);
1370 if (retval == IO_ERROR) {
1371 reiserfs_warning (th->t_super,
1372 "vs-5350: reiserfs_delete_solid_item: "
1373 "i/o failure occurred trying to delete %K",
1374 &cpu_key);
1375 break;
1376 }
1377 if (retval != ITEM_FOUND) {
1378 pathrelse (&path);
1379	    // No need for a warning if there was simply no free space to insert the '..' item into the newly-created subdir
1380 if ( !( (unsigned long long) GET_HASH_VALUE (le_key_k_offset (le_key_version (key), key)) == 0 && \
1381 (unsigned long long) GET_GENERATION_NUMBER (le_key_k_offset (le_key_version (key), key)) == 1 ) )
1382 reiserfs_warning (th->t_super, "vs-5355: reiserfs_delete_solid_item: %k not found", key);
1383 break;
1384 }
1385 if (!tb_init) {
1386 tb_init = 1 ;
1387 item_len = ih_item_len( PATH_PITEM_HEAD(&path) );
1388 init_tb_struct (th, &tb, th->t_super, &path, - (IH_SIZE + item_len));
1389 }
1390 quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)) ;
1391
1392 retval = fix_nodes (M_DELETE, &tb, NULL, NULL);
1393 if (retval == REPEAT_SEARCH) {
1394 PROC_INFO_INC( th -> t_super, delete_solid_item_restarted );
1395 continue;
1396 }
1397
1398 if (retval == CARRY_ON) {
1399 do_balance (&tb, NULL, NULL, M_DELETE);
1400 if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */
1401#ifdef REISERQUOTA_DEBUG
1402 reiserfs_debug (th->t_super, REISERFS_DEBUG_CODE, "reiserquota delete_solid_item(): freeing %u id=%u type=%c", quota_cut_bytes, inode->i_uid, key2type(key));
1403#endif
1404 DQUOT_FREE_SPACE_NODIRTY(inode, quota_cut_bytes);
1405 }
1406 break;
1407 }
1408
1409 // IO_ERROR, NO_DISK_SPACE, etc
1410 reiserfs_warning (th->t_super, "vs-5360: reiserfs_delete_solid_item: "
1411 "could not delete %K due to fix_nodes failure", &cpu_key);
1412 unfix_nodes (&tb);
1413 break;
1414 }
1415
1416 reiserfs_check_path(&path) ;
1417}
1418
1419
1420int reiserfs_delete_object (struct reiserfs_transaction_handle *th, struct inode * inode)
1421{
1422 int err;
1423 inode->i_size = 0;
1424 BUG_ON (!th->t_trans_id);
1425
1426 /* for directory this deletes item containing "." and ".." */
1427 err = reiserfs_do_truncate (th, inode, NULL, 0/*no timestamp updates*/);
1428 if (err)
1429 return err;
1430
1431#if defined( USE_INODE_GENERATION_COUNTER )
1432 if( !old_format_only ( th -> t_super ) )
1433 {
1434 __u32 *inode_generation;
1435
1436 inode_generation =
1437 &REISERFS_SB(th -> t_super) -> s_rs -> s_inode_generation;
1438 *inode_generation = cpu_to_le32( le32_to_cpu( *inode_generation ) + 1 );
1439 }
1440/* USE_INODE_GENERATION_COUNTER */
1441#endif
1442 reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode));
1443
1444 return err;
1445}
1446
1447static void
1448unmap_buffers(struct page *page, loff_t pos) {
1449 struct buffer_head *bh ;
1450 struct buffer_head *head ;
1451 struct buffer_head *next ;
1452 unsigned long tail_index ;
1453 unsigned long cur_index ;
1454
1455 if (page) {
1456 if (page_has_buffers(page)) {
1457 tail_index = pos & (PAGE_CACHE_SIZE - 1) ;
1458 cur_index = 0 ;
1459 head = page_buffers(page) ;
1460 bh = head ;
1461 do {
1462 next = bh->b_this_page ;
1463
1464 /* we want to unmap the buffers that contain the tail, and
1465 ** all the buffers after it (since the tail must be at the
1466 ** end of the file). We don't want to unmap file data
1467 ** before the tail, since it might be dirty and waiting to
1468 ** reach disk
1469 */
1470 cur_index += bh->b_size ;
1471 if (cur_index > tail_index) {
1472 reiserfs_unmap_buffer(bh) ;
1473 }
1474 bh = next ;
1475 } while (bh != head) ;
1476 if ( PAGE_SIZE == bh->b_size ) {
1477 clear_page_dirty(page);
1478 }
1479 }
1480 }
1481}
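/* Worked example (illustrative, not from the original source): for a
 * 4096-byte page carrying four 1024-byte buffers and pos == 2500,
 * tail_index is 2500; cur_index reaches 1024, 2048, 3072, 4096 in turn,
 * so only the third and fourth buffers (the ones containing and following
 * the tail) are unmapped, leaving earlier, possibly dirty, file data
 * untouched. */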
1482
1483static int maybe_indirect_to_direct (struct reiserfs_transaction_handle *th,
1484 struct inode * p_s_inode,
1485 struct page *page,
1486 struct path * p_s_path,
1487 const struct cpu_key * p_s_item_key,
1488 loff_t n_new_file_size,
1489 char * p_c_mode
1490 ) {
1491 struct super_block * p_s_sb = p_s_inode->i_sb;
1492 int n_block_size = p_s_sb->s_blocksize;
1493 int cut_bytes;
1494 BUG_ON (!th->t_trans_id);
1495
1496 if (n_new_file_size != p_s_inode->i_size)
1497 BUG ();
1498
1499 /* the page being sent in could be NULL if there was an i/o error
1500 ** reading in the last block. The user will hit problems trying to
1501 ** read the file, but for now we just skip the indirect2direct
1502 */
1503 if (atomic_read(&p_s_inode->i_count) > 1 ||
1504 !tail_has_to_be_packed (p_s_inode) ||
1505 !page || (REISERFS_I(p_s_inode)->i_flags & i_nopack_mask)) {
1506 // leave tail in an unformatted node
1507 *p_c_mode = M_SKIP_BALANCING;
1508 cut_bytes = n_block_size - (n_new_file_size & (n_block_size - 1));
1509 pathrelse(p_s_path);
1510 return cut_bytes;
1511 }
1512    /* Perform the conversion to a direct item. */
1513 /*return indirect_to_direct (p_s_inode, p_s_path, p_s_item_key, n_new_file_size, p_c_mode);*/
1514 return indirect2direct (th, p_s_inode, page, p_s_path, p_s_item_key, n_new_file_size, p_c_mode);
1515}
1516
1517
1518/* we did an indirect_to_direct conversion and inserted the direct
1519   item successfully, but there was no disk space to cut the unfm
1520   pointer being converted. Therefore we have to delete the inserted
1521   direct item(s) */
1522static void indirect_to_direct_roll_back (struct reiserfs_transaction_handle *th, struct inode * inode, struct path * path)
1523{
1524 struct cpu_key tail_key;
1525 int tail_len;
1526 int removed;
1527 BUG_ON (!th->t_trans_id);
1528
1529 make_cpu_key (&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);// !!!!
1530 tail_key.key_length = 4;
1531
1532 tail_len = (cpu_key_k_offset (&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1;
1533 while (tail_len) {
1534 /* look for the last byte of the tail */
1535 if (search_for_position_by_key (inode->i_sb, &tail_key, path) == POSITION_NOT_FOUND)
1536 reiserfs_panic (inode->i_sb, "vs-5615: indirect_to_direct_roll_back: found invalid item");
1537 RFALSE( path->pos_in_item != ih_item_len(PATH_PITEM_HEAD (path)) - 1,
1538 "vs-5616: appended bytes found");
1539 PATH_LAST_POSITION (path) --;
1540
1541 removed = reiserfs_delete_item (th, path, &tail_key, inode, NULL/*unbh not needed*/);
1542 RFALSE( removed <= 0 || removed > tail_len,
1543 "vs-5617: there was tail %d bytes, removed item length %d bytes",
1544 tail_len, removed);
1545 tail_len -= removed;
1546 set_cpu_key_k_offset (&tail_key, cpu_key_k_offset (&tail_key) - removed);
1547 }
1548 reiserfs_warning (inode->i_sb, "indirect_to_direct_roll_back: indirect_to_direct conversion has been rolled back due to lack of disk space");
1549 //mark_file_without_tail (inode);
1550 mark_inode_dirty (inode);
1551}
1552
1553
1554/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
1555int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th,
1556 struct path * p_s_path,
1557 struct cpu_key * p_s_item_key,
1558 struct inode * p_s_inode,
1559 struct page *page,
1560 loff_t n_new_file_size)
1561{
1562 struct super_block * p_s_sb = p_s_inode->i_sb;
1563 /* Every function which is going to call do_balance must first
1564 create a tree_balance structure. Then it must fill up this
1565 structure by using the init_tb_struct and fix_nodes functions.
1566       After that we can perform tree balancing. */
1567 struct tree_balance s_cut_balance;
1568 struct item_head *p_le_ih;
1569 int n_cut_size = 0, /* Amount to be cut. */
1570 n_ret_value = CARRY_ON,
1571 n_removed = 0, /* Number of the removed unformatted nodes. */
1572 n_is_inode_locked = 0;
1573 char c_mode; /* Mode of the balance. */
1574 int retval2 = -1;
1575 int quota_cut_bytes;
1576 loff_t tail_pos = 0;
1577
1578 BUG_ON (!th->t_trans_id);
1579
1580 init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path, n_cut_size);
1581
1582
1583 /* Repeat this loop until we either cut the item without needing
1584       to balance, or fix_nodes completes without schedule occurring */
1585 while ( 1 ) {
1586 /* Determine the balance mode, position of the first byte to
1587 be cut, and size to be cut. In case of the indirect item
1588 free unformatted nodes which are pointed to by the cut
1589 pointers. */
1590
1591 c_mode = prepare_for_delete_or_cut(th, p_s_inode, p_s_path, p_s_item_key, &n_removed,
1592 &n_cut_size, n_new_file_size);
1593 if ( c_mode == M_CONVERT ) {
1594 /* convert last unformatted node to direct item or leave
1595 tail in the unformatted node */
1596 RFALSE( n_ret_value != CARRY_ON, "PAP-5570: can not convert twice");
1597
1598 n_ret_value = maybe_indirect_to_direct (th, p_s_inode, page, p_s_path, p_s_item_key,
1599 n_new_file_size, &c_mode);
1600 if ( c_mode == M_SKIP_BALANCING )
1601 /* tail has been left in the unformatted node */
1602 return n_ret_value;
1603
1604 n_is_inode_locked = 1;
1605
1606	    /* removing the last unformatted node will change the value we
1607	       have to return to truncate. Save it */
1608 retval2 = n_ret_value;
1609 /*retval2 = p_s_sb->s_blocksize - (n_new_file_size & (p_s_sb->s_blocksize - 1));*/
1610
1611 /* So, we have performed the first part of the conversion:
1612 inserting the new direct item. Now we are removing the
1613 last unformatted node pointer. Set key to search for
1614 it. */
1615 set_cpu_key_k_type (p_s_item_key, TYPE_INDIRECT);
1616 p_s_item_key->key_length = 4;
1617 n_new_file_size -= (n_new_file_size & (p_s_sb->s_blocksize - 1));
1618 tail_pos = n_new_file_size;
1619 set_cpu_key_k_offset (p_s_item_key, n_new_file_size + 1);
1620 if ( search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND ){
1621 print_block (PATH_PLAST_BUFFER (p_s_path), 3, PATH_LAST_POSITION (p_s_path) - 1, PATH_LAST_POSITION (p_s_path) + 1);
1622 reiserfs_panic(p_s_sb, "PAP-5580: reiserfs_cut_from_item: item to convert does not exist (%K)", p_s_item_key);
1623 }
1624 continue;
1625 }
1626 if (n_cut_size == 0) {
1627 pathrelse (p_s_path);
1628 return 0;
1629 }
1630
1631 s_cut_balance.insert_size[0] = n_cut_size;
1632
1633 n_ret_value = fix_nodes(c_mode, &s_cut_balance, NULL, NULL);
1634 if ( n_ret_value != REPEAT_SEARCH )
1635 break;
1636
1637 PROC_INFO_INC( p_s_sb, cut_from_item_restarted );
1638
1639 n_ret_value = search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path);
1640 if (n_ret_value == POSITION_FOUND)
1641 continue;
1642
1643 reiserfs_warning (p_s_sb, "PAP-5610: reiserfs_cut_from_item: item %K not found", p_s_item_key);
1644 unfix_nodes (&s_cut_balance);
1645 return (n_ret_value == IO_ERROR) ? -EIO : -ENOENT;
1646 } /* while */
1647
1648 // check fix_nodes results (IO_ERROR or NO_DISK_SPACE)
1649 if ( n_ret_value != CARRY_ON ) {
1650 if ( n_is_inode_locked ) {
1651	    // FIXME: this seems to be unneeded: we are always able
1652	    // to cut the item
1653 indirect_to_direct_roll_back (th, p_s_inode, p_s_path);
1654 }
1655 if (n_ret_value == NO_DISK_SPACE)
1656 reiserfs_warning (p_s_sb, "NO_DISK_SPACE");
1657 unfix_nodes (&s_cut_balance);
1658 return -EIO;
1659 }
1660
1661 /* go ahead and perform balancing */
1662
1663 RFALSE( c_mode == M_PASTE || c_mode == M_INSERT, "invalid mode");
1664
1665 /* Calculate number of bytes that need to be cut from the item. */
1666 quota_cut_bytes = ( c_mode == M_DELETE ) ? ih_item_len(get_ih(p_s_path)) : -s_cut_balance.insert_size[0];
1667 if (retval2 == -1)
1668 n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode);
1669 else
1670 n_ret_value = retval2;
1671
1672
1673 /* For direct items, we only change the quota when deleting the last
1674 ** item.
1675 */
1676 p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
1677 if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) {
1678 if (c_mode == M_DELETE &&
1679 (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) {
1680 // FIXME: this is to keep 3.5 happy
1681 REISERFS_I(p_s_inode)->i_first_direct_byte = U32_MAX;
1682 quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE ;
1683 } else {
1684 quota_cut_bytes = 0 ;
1685 }
1686 }
1687#ifdef CONFIG_REISERFS_CHECK
1688 if (n_is_inode_locked) {
1689 struct item_head * le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
1690	/* we are going to complete the indirect2direct conversion. Make
1691	   sure that we remove exactly the last unformatted node pointer
1692	   of the item */
1693 if (!is_indirect_le_ih (le_ih))
1694 reiserfs_panic (p_s_sb, "vs-5652: reiserfs_cut_from_item: "
1695 "item must be indirect %h", le_ih);
1696
1697 if (c_mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
1698 reiserfs_panic (p_s_sb, "vs-5653: reiserfs_cut_from_item: "
1699 "completing indirect2direct conversion indirect item %h "
1700			    "being deleted must be 4 bytes long", le_ih);
1701
1702 if (c_mode == M_CUT && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) {
1703 reiserfs_panic (p_s_sb, "vs-5654: reiserfs_cut_from_item: "
1704 "can not complete indirect2direct conversion of %h (CUT, insert_size==%d)",
1705 le_ih, s_cut_balance.insert_size[0]);
1706 }
1707	/* it would be useful to make sure that the right neighboring
1708	   item is a direct item of this file */
1709 }
1710#endif
1711
1712 do_balance(&s_cut_balance, NULL, NULL, c_mode);
1713 if ( n_is_inode_locked ) {
1714 /* we've done an indirect->direct conversion. when the data block
1715 ** was freed, it was removed from the list of blocks that must
1716 ** be flushed before the transaction commits, make sure to
1717 ** unmap and invalidate it
1718 */
1719 unmap_buffers(page, tail_pos);
1720 REISERFS_I(p_s_inode)->i_flags &= ~i_pack_on_close_mask ;
1721 }
1722#ifdef REISERQUOTA_DEBUG
1723 reiserfs_debug (p_s_inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota cut_from_item(): freeing %u id=%u type=%c", quota_cut_bytes, p_s_inode->i_uid, '?');
1724#endif
1725 DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
1726 return n_ret_value;
1727}
1728
1729static void truncate_directory (struct reiserfs_transaction_handle *th, struct inode * inode)
1730{
1731 BUG_ON (!th->t_trans_id);
1732 if (inode->i_nlink)
1733 reiserfs_warning (inode->i_sb,
1734 "vs-5655: truncate_directory: link count != 0");
1735
1736 set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), DOT_OFFSET);
1737 set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_DIRENTRY);
1738 reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode));
1739 reiserfs_update_sd(th, inode) ;
1740 set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), SD_OFFSET);
1741 set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_STAT_DATA);
1742}
1743
1744
1745
1746
1747/* Truncate file to the new size. Note that this must be called with a transaction
1748 already started */
1749int reiserfs_do_truncate (struct reiserfs_transaction_handle *th,
1750 struct inode * p_s_inode, /* ->i_size contains new
1751 size */
1752 struct page *page, /* up to date for last block */
1753 int update_timestamps /* when it is called by
1754 file_release to convert
1755 the tail - no timestamps
1756 should be updated */
1757 ) {
1758 INITIALIZE_PATH (s_search_path); /* Path to the current object item. */
1759 struct item_head * p_le_ih; /* Pointer to an item header. */
1760 struct cpu_key s_item_key; /* Key to search for a previous file item. */
1761 loff_t n_file_size, /* Old file size. */
1762 n_new_file_size;/* New file size. */
1763 int n_deleted; /* Number of deleted or truncated bytes. */
1764 int retval;
1765 int err = 0;
1766
1767 BUG_ON (!th->t_trans_id);
1768 if ( ! (S_ISREG(p_s_inode->i_mode) || S_ISDIR(p_s_inode->i_mode) || S_ISLNK(p_s_inode->i_mode)) )
1769 return 0;
1770
1771 if (S_ISDIR(p_s_inode->i_mode)) {
1772 // deletion of directory - no need to update timestamps
1773 truncate_directory (th, p_s_inode);
1774 return 0;
1775 }
1776
1777 /* Get new file size. */
1778 n_new_file_size = p_s_inode->i_size;
1779
1780    // FIXME: note that the key type is unimportant here
1781 make_cpu_key (&s_item_key, p_s_inode, max_reiserfs_offset (p_s_inode), TYPE_DIRECT, 3);
1782
1783 retval = search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path);
1784 if (retval == IO_ERROR) {
1785 reiserfs_warning (p_s_inode->i_sb, "vs-5657: reiserfs_do_truncate: "
1786 "i/o failure occurred trying to truncate %K", &s_item_key);
1787 err = -EIO;
1788 goto out;
1789 }
1790 if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) {
1791 reiserfs_warning (p_s_inode->i_sb, "PAP-5660: reiserfs_do_truncate: "
1792 "wrong result %d of search for %K", retval, &s_item_key);
1793
1794 err = -EIO;
1795 goto out;
1796 }
1797
1798 s_search_path.pos_in_item --;
1799
1800 /* Get real file size (total length of all file items) */
1801 p_le_ih = PATH_PITEM_HEAD(&s_search_path);
1802 if ( is_statdata_le_ih (p_le_ih) )
1803 n_file_size = 0;
1804 else {
1805 loff_t offset = le_ih_k_offset (p_le_ih);
1806 int bytes = op_bytes_number (p_le_ih,p_s_inode->i_sb->s_blocksize);
1807
1808	/* this may not match the real file size: if the last direct item
1809	   had no padding zeros and the last unformatted node had no free
1810	   space, the file would have exactly this size */
1811 n_file_size = offset + bytes - 1;
1812 }
1813 /*
1814     * if we are doing a full truncate or delete, kick in
1815     * the read-ahead code
1816 */
1817 if (n_new_file_size == 0)
1818 s_search_path.reada = PATH_READA | PATH_READA_BACK;
1819
1820 if ( n_file_size == 0 || n_file_size < n_new_file_size ) {
1821 goto update_and_out ;
1822 }
1823
1824 /* Update key to search for the last file item. */
1825 set_cpu_key_k_offset (&s_item_key, n_file_size);
1826
1827 do {
1828 /* Cut or delete file item. */
1829 n_deleted = reiserfs_cut_from_item(th, &s_search_path, &s_item_key, p_s_inode, page, n_new_file_size);
1830 if (n_deleted < 0) {
1831 reiserfs_warning (p_s_inode->i_sb, "vs-5665: reiserfs_do_truncate: reiserfs_cut_from_item failed");
1832 reiserfs_check_path(&s_search_path) ;
1833 return 0;
1834 }
1835
1836 RFALSE( n_deleted > n_file_size,
1837 "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K",
1838 n_deleted, n_file_size, &s_item_key);
1839
1840 /* Change key to search the last file item. */
1841 n_file_size -= n_deleted;
1842
1843 set_cpu_key_k_offset (&s_item_key, n_file_size);
1844
1845	/* While there are bytes to truncate and the previous file item is present in the tree. */
1846
1847 /*
1848 ** This loop could take a really long time, and could log
1849 ** many more blocks than a transaction can hold. So, we do a polite
1850 ** journal end here, and if the transaction needs ending, we make
1851 ** sure the file is consistent before ending the current trans
1852 ** and starting a new one
1853 */
1854 if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
1855 int orig_len_alloc = th->t_blocks_allocated ;
1856 decrement_counters_in_path(&s_search_path) ;
1857
1858 if (update_timestamps) {
1859 p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC;
1860 }
1861 reiserfs_update_sd(th, p_s_inode) ;
1862
1863 err = journal_end(th, p_s_inode->i_sb, orig_len_alloc) ;
1864 if (err)
1865 goto out;
1866 err = journal_begin (th, p_s_inode->i_sb,
1867 JOURNAL_PER_BALANCE_CNT * 6);
1868 if (err)
1869 goto out;
1870 reiserfs_update_inode_transaction(p_s_inode) ;
1871 }
1872 } while ( n_file_size > ROUND_UP (n_new_file_size) &&
1873 search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path) == POSITION_FOUND ) ;
1874
1875 RFALSE( n_file_size > ROUND_UP (n_new_file_size),
1876 "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d",
1877 n_new_file_size, n_file_size, s_item_key.on_disk_key.k_objectid);
1878
1879update_and_out:
1880 if (update_timestamps) {
1881 // this is truncate, not file closing
1882 p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC;
1883 }
1884 reiserfs_update_sd (th, p_s_inode);
1885
1886out:
1887 pathrelse(&s_search_path) ;
1888 return err;
1889}
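/* A condensed sketch (hypothetical helper, not part of reiserfs) of the
 * "polite journal end" pattern used in the truncate loop above: when the
 * running transaction fills up, bring the inode to a consistent state,
 * end the transaction, and immediately open a new one. The real loop
 * also releases its search path and updates timestamps first. */
static int example_restart_transaction (struct reiserfs_transaction_handle *th,
					struct inode *inode)
{
    int orig_len_alloc = th->t_blocks_allocated;
    int err;

    if (!journal_transaction_should_end (th, th->t_blocks_allocated))
	return 0;	/* plenty of room left, keep going */

    reiserfs_update_sd (th, inode);	/* file must be consistent on disk */
    err = journal_end (th, inode->i_sb, orig_len_alloc);
    if (err)
	return err;
    err = journal_begin (th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 6);
    if (err)
	return err;
    reiserfs_update_inode_transaction (inode);
    return 0;
}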
1890
1891
1892#ifdef CONFIG_REISERFS_CHECK
1893// this makes sure that we __append__, rather than overwrite or add holes
1894static void check_research_for_paste (struct path * path,
1895 const struct cpu_key * p_s_key)
1896{
1897 struct item_head * found_ih = get_ih (path);
1898
1899 if (is_direct_le_ih (found_ih)) {
1900 if (le_ih_k_offset (found_ih) + op_bytes_number (found_ih, get_last_bh (path)->b_size) !=
1901 cpu_key_k_offset (p_s_key) ||
1902 op_bytes_number (found_ih, get_last_bh (path)->b_size) != pos_in_item (path))
1903 reiserfs_panic (NULL, "PAP-5720: check_research_for_paste: "
1904			"found direct item %h or position (%d) does not match key %K",
1905 found_ih, pos_in_item (path), p_s_key);
1906 }
1907 if (is_indirect_le_ih (found_ih)) {
1908 if (le_ih_k_offset (found_ih) + op_bytes_number (found_ih, get_last_bh (path)->b_size) != cpu_key_k_offset (p_s_key) ||
1909 I_UNFM_NUM (found_ih) != pos_in_item (path) ||
1910 get_ih_free_space (found_ih) != 0)
1911 reiserfs_panic (NULL, "PAP-5730: check_research_for_paste: "
1912			"found indirect item (%h) or position (%d) does not match key (%K)",
1913 found_ih, pos_in_item (path), p_s_key);
1914 }
1915}
1916#endif /* config reiserfs check */
1917
1918
1919/* Paste bytes to the existing item. Returns the number of bytes pasted into the item. */
1920int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th,
1921 struct path * p_s_search_path, /* Path to the pasted item. */
1922 const struct cpu_key * p_s_key, /* Key to search for the needed item.*/
1923 struct inode * inode, /* Inode item belongs to */
1924 const char * p_c_body, /* Pointer to the bytes to paste. */
1925 int n_pasted_size) /* Size of pasted bytes. */
1926{
1927 struct tree_balance s_paste_balance;
1928 int retval;
1929 int fs_gen;
1930
1931 BUG_ON (!th->t_trans_id);
1932
1933 fs_gen = get_generation(inode->i_sb) ;
1934
1935#ifdef REISERQUOTA_DEBUG
1936 reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): allocating %u id=%u type=%c", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key)));
1937#endif
1938
1939 if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) {
1940 pathrelse(p_s_search_path);
1941 return -EDQUOT;
1942 }
1943 init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, n_pasted_size);
1944#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1945 s_paste_balance.key = p_s_key->on_disk_key;
1946#endif
1947
1948 /* DQUOT_* can schedule, must check before the fix_nodes */
1949 if (fs_changed(fs_gen, inode->i_sb)) {
1950 goto search_again;
1951 }
1952
1953 while ((retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) ==
1954REPEAT_SEARCH ) {
1955search_again:
1956 /* file system changed while we were in the fix_nodes */
1957 PROC_INFO_INC( th -> t_super, paste_into_item_restarted );
1958 retval = search_for_position_by_key (th->t_super, p_s_key, p_s_search_path);
1959 if (retval == IO_ERROR) {
1960 retval = -EIO ;
1961 goto error_out ;
1962 }
1963 if (retval == POSITION_FOUND) {
1964 reiserfs_warning (inode->i_sb, "PAP-5710: reiserfs_paste_into_item: entry or pasted byte (%K) exists", p_s_key);
1965 retval = -EEXIST ;
1966 goto error_out ;
1967 }
1968
1969#ifdef CONFIG_REISERFS_CHECK
1970 check_research_for_paste (p_s_search_path, p_s_key);
1971#endif
1972 }
1973
1974 /* Perform balancing after all resources are collected by fix_nodes, and
1975 accessing them will not risk triggering schedule. */
1976 if ( retval == CARRY_ON ) {
1977 do_balance(&s_paste_balance, NULL/*ih*/, p_c_body, M_PASTE);
1978 return 0;
1979 }
1980 retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
1981error_out:
1982 /* this also releases the path */
1983 unfix_nodes(&s_paste_balance);
1984#ifdef REISERQUOTA_DEBUG
1985 reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): freeing %u id=%u type=%c", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key)));
1986#endif
1987 DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size);
1988 return retval ;
1989}
1990
1991
1992/* Insert new item into the buffer at the path. */
1993int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
1994			 struct path * p_s_path,         /* Path to the inserted item.         */
1995 const struct cpu_key * key,
1996 struct item_head * p_s_ih, /* Pointer to the item header to insert.*/
1997 struct inode * inode,
1998 const char * p_c_body) /* Pointer to the bytes to insert. */
1999{
2000 struct tree_balance s_ins_balance;
2001 int retval;
2002 int fs_gen = 0 ;
2003 int quota_bytes = 0 ;
2004
2005 BUG_ON (!th->t_trans_id);
2006
2007 if (inode) { /* Do we count quotas for item? */
2008 fs_gen = get_generation(inode->i_sb);
2009 quota_bytes = ih_item_len(p_s_ih);
2010
2011 /* hack so the quota code doesn't have to guess if the file has
2012 ** a tail, links are always tails, so there's no guessing needed
2013 */
2014 if (!S_ISLNK (inode->i_mode) && is_direct_le_ih(p_s_ih)) {
2015 quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE ;
2016 }
2017#ifdef REISERQUOTA_DEBUG
2018 reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota insert_item(): allocating %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(p_s_ih));
2019#endif
2020 /* We can't dirty inode here. It would be immediately written but
2021 * appropriate stat item isn't inserted yet... */
2022 if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) {
2023 pathrelse(p_s_path);
2024 return -EDQUOT;
2025 }
2026 }
2027 init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, IH_SIZE + ih_item_len(p_s_ih));
2028#ifdef DISPLACE_NEW_PACKING_LOCALITIES
2029 s_ins_balance.key = key->on_disk_key;
2030#endif
2031 /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */
2032 if (inode && fs_changed(fs_gen, inode->i_sb)) {
2033 goto search_again;
2034 }
2035
2036 while ( (retval = fix_nodes(M_INSERT, &s_ins_balance, p_s_ih, p_c_body)) == REPEAT_SEARCH) {
2037search_again:
2038 /* file system changed while we were in the fix_nodes */
2039 PROC_INFO_INC( th -> t_super, insert_item_restarted );
2040 retval = search_item (th->t_super, key, p_s_path);
2041 if (retval == IO_ERROR) {
2042 retval = -EIO;
2043 goto error_out ;
2044 }
2045 if (retval == ITEM_FOUND) {
2046 reiserfs_warning (th->t_super, "PAP-5760: reiserfs_insert_item: "
2047 "key %K already exists in the tree", key);
2048 retval = -EEXIST ;
2049 goto error_out;
2050 }
2051 }
2052
2053    /* perform balancing after all resources have been collected at once */
2054 if ( retval == CARRY_ON ) {
2055 do_balance (&s_ins_balance, p_s_ih, p_c_body, M_INSERT);
2056 return 0;
2057 }
2058
2059 retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
2060error_out:
2061 /* also releases the path */
2062 unfix_nodes(&s_ins_balance);
2063#ifdef REISERQUOTA_DEBUG
2064 reiserfs_debug (th->t_super, REISERFS_DEBUG_CODE, "reiserquota insert_item(): freeing %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(p_s_ih));
2065#endif
2066 if (inode)
2067 DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes) ;
2068 return retval;
2069}
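/* Usage note (illustrative): add_save_link() in super.c below is a
 * canonical caller - it passes a NULL inode so no quota is charged,
 * a prepared item_head, and the 4-byte link body:
 *
 *   retval = reiserfs_insert_item (th, &path, &key, &ih, NULL, (char *)&link);
 */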
2070
2071
2072
2073
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
new file mode 100644
index 000000000000..bcdf2438d152
--- /dev/null
+++ b/fs/reiserfs/super.c
@@ -0,0 +1,2148 @@
1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 *
4 * Trivial changes by Alan Cox to add the LFS fixes
5 *
6 * Trivial Changes:
7 * Rights granted to Hans Reiser to redistribute under other terms providing
8 * he accepts all liability including but not limited to patent, fitness
9 * for purpose, and direct or indirect claims arising from failure to perform.
10 *
11 * NO WARRANTY
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16#include <linux/vmalloc.h>
17#include <linux/time.h>
18#include <asm/uaccess.h>
19#include <linux/reiserfs_fs.h>
20#include <linux/reiserfs_acl.h>
21#include <linux/reiserfs_xattr.h>
22#include <linux/smp_lock.h>
23#include <linux/init.h>
24#include <linux/blkdev.h>
25#include <linux/buffer_head.h>
26#include <linux/vfs.h>
27#include <linux/namespace.h>
28#include <linux/mount.h>
29#include <linux/namei.h>
30#include <linux/quotaops.h>
31
32struct file_system_type reiserfs_fs_type;
33
34static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING;
35static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING;
36static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING;
37
38int is_reiserfs_3_5 (struct reiserfs_super_block * rs)
39{
40 return !strncmp (rs->s_v1.s_magic, reiserfs_3_5_magic_string,
41 strlen (reiserfs_3_5_magic_string));
42}
43
44
45int is_reiserfs_3_6 (struct reiserfs_super_block * rs)
46{
47 return !strncmp (rs->s_v1.s_magic, reiserfs_3_6_magic_string,
48 strlen (reiserfs_3_6_magic_string));
49}
50
51
52int is_reiserfs_jr (struct reiserfs_super_block * rs)
53{
54 return !strncmp (rs->s_v1.s_magic, reiserfs_jr_magic_string,
55 strlen (reiserfs_jr_magic_string));
56}
57
58
59static int is_any_reiserfs_magic_string (struct reiserfs_super_block * rs)
60{
61 return (is_reiserfs_3_5 (rs) || is_reiserfs_3_6 (rs) ||
62 is_reiserfs_jr (rs));
63}
64
65static int reiserfs_remount (struct super_block * s, int * flags, char * data);
66static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf);
67
68static int reiserfs_sync_fs (struct super_block * s, int wait)
69{
70 if (!(s->s_flags & MS_RDONLY)) {
71 struct reiserfs_transaction_handle th;
72 reiserfs_write_lock(s);
73 if (!journal_begin(&th, s, 1))
74 if (!journal_end_sync(&th, s, 1))
75 reiserfs_flush_old_commits(s);
76 s->s_dirt = 0; /* Even if it's not true.
77 * We'll loop forever in sync_supers otherwise */
78 reiserfs_write_unlock(s);
79 } else {
80 s->s_dirt = 0;
81 }
82 return 0;
83}
84
85static void reiserfs_write_super(struct super_block *s)
86{
87 reiserfs_sync_fs(s, 1);
88}
89
90static void reiserfs_write_super_lockfs (struct super_block * s)
91{
92 struct reiserfs_transaction_handle th ;
93 reiserfs_write_lock(s);
94 if (!(s->s_flags & MS_RDONLY)) {
95 int err = journal_begin(&th, s, 1) ;
96 if (err) {
97 reiserfs_block_writes(&th) ;
98 } else {
99 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
100 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
101 reiserfs_block_writes(&th) ;
102 journal_end_sync(&th, s, 1) ;
103 }
104 }
105 s->s_dirt = 0;
106 reiserfs_write_unlock(s);
107}
108
109static void reiserfs_unlockfs(struct super_block *s) {
110 reiserfs_allow_writes(s) ;
111}
112
113extern const struct reiserfs_key MAX_KEY;
114
115
116/* this is used to delete a "save link" when there are no items of the
117   file it points to. It can happen either if the unlink completed but
118   the "save link" removal did not, or if the file has both unlink and
119   truncate pending and the unlink completes first (because the key of
120   the "save link" protecting the unlink is bigger than the key of the
121   "save link" protecting the truncate), so no items are left for the
122   truncate completion to work on */
123static int remove_save_link_only (struct super_block * s, struct reiserfs_key * key, int oid_free)
124{
125 struct reiserfs_transaction_handle th;
126 int err;
127
128 /* we are going to do one balancing */
129 err = journal_begin (&th, s, JOURNAL_PER_BALANCE_CNT);
130 if (err)
131 return err;
132
133 reiserfs_delete_solid_item (&th, NULL, key);
134 if (oid_free)
135 /* removals are protected by direct items */
136 reiserfs_release_objectid (&th, le32_to_cpu (key->k_objectid));
137
138 return journal_end (&th, s, JOURNAL_PER_BALANCE_CNT);
139}
140
141#ifdef CONFIG_QUOTA
142static int reiserfs_quota_on_mount(struct super_block *, int);
143#endif
144
145/* look for uncompleted unlinks and truncates and complete them */
146static int finish_unfinished (struct super_block * s)
147{
148 INITIALIZE_PATH (path);
149 struct cpu_key max_cpu_key, obj_key;
150 struct reiserfs_key save_link_key;
151 int retval = 0;
152 struct item_head * ih;
153 struct buffer_head * bh;
154 int item_pos;
155 char * item;
156 int done;
157 struct inode * inode;
158 int truncate;
159#ifdef CONFIG_QUOTA
160 int i;
161 int ms_active_set;
162#endif
163
164
165 /* compose key to look for "save" links */
166 max_cpu_key.version = KEY_FORMAT_3_5;
167 max_cpu_key.on_disk_key = MAX_KEY;
168 max_cpu_key.key_length = 3;
169
170#ifdef CONFIG_QUOTA
171 /* Needed for iput() to work correctly and not trash data */
172 if (s->s_flags & MS_ACTIVE) {
173 ms_active_set = 0;
174 } else {
175 ms_active_set = 1;
176 s->s_flags |= MS_ACTIVE;
177 }
178 /* Turn on quotas so that they are updated correctly */
179 for (i = 0; i < MAXQUOTAS; i++) {
180 if (REISERFS_SB(s)->s_qf_names[i]) {
181 int ret = reiserfs_quota_on_mount(s, i);
182 if (ret < 0)
183 reiserfs_warning(s, "reiserfs: cannot turn on journalled quota: error %d", ret);
184 }
185 }
186#endif
187
188 done = 0;
189 REISERFS_SB(s)->s_is_unlinked_ok = 1;
190 while (!retval) {
191 retval = search_item (s, &max_cpu_key, &path);
192 if (retval != ITEM_NOT_FOUND) {
193 reiserfs_warning (s, "vs-2140: finish_unfinished: search_by_key returned %d",
194 retval);
195 break;
196 }
197
198 bh = get_last_bh (&path);
199 item_pos = get_item_pos (&path);
200 if (item_pos != B_NR_ITEMS (bh)) {
201 reiserfs_warning (s, "vs-2060: finish_unfinished: wrong position found");
202 break;
203 }
204 item_pos --;
205 ih = B_N_PITEM_HEAD (bh, item_pos);
206
207 if (le32_to_cpu (ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID)
208 /* there are no "save" links anymore */
209 break;
210
211 save_link_key = ih->ih_key;
212 if (is_indirect_le_ih (ih))
213 truncate = 1;
214 else
215 truncate = 0;
216
217 /* reiserfs_iget needs k_dirid and k_objectid only */
218 item = B_I_PITEM (bh, ih);
219 obj_key.on_disk_key.k_dir_id = le32_to_cpu (*(__u32 *)item);
220 obj_key.on_disk_key.k_objectid = le32_to_cpu (ih->ih_key.k_objectid);
221 obj_key.on_disk_key.u.k_offset_v1.k_offset = 0;
222 obj_key.on_disk_key.u.k_offset_v1.k_uniqueness = 0;
223
224 pathrelse (&path);
225
226 inode = reiserfs_iget (s, &obj_key);
227 if (!inode) {
228 /* the unlink almost completed, it just did not manage to remove
229	       the "save" link and release the objectid */
230 reiserfs_warning (s, "vs-2180: finish_unfinished: iget failed for %K",
231 &obj_key);
232 retval = remove_save_link_only (s, &save_link_key, 1);
233 continue;
234 }
235
236 if (!truncate && inode->i_nlink) {
237 /* file is not unlinked */
238 reiserfs_warning (s, "vs-2185: finish_unfinished: file %K is not unlinked",
239 &obj_key);
240 retval = remove_save_link_only (s, &save_link_key, 0);
241 continue;
242 }
243 DQUOT_INIT(inode);
244
245 if (truncate && S_ISDIR (inode->i_mode) ) {
246          /* We got a truncate request for a dir, which is impossible.
247             The only imaginable way is to execute an unfinished truncate request,
248             then boot into an old kernel, remove the file and create a dir with
249             the same key. */
250 reiserfs_warning(s, "green-2101: impossible truncate on a directory %k. Please report", INODE_PKEY (inode));
251 retval = remove_save_link_only (s, &save_link_key, 0);
252 truncate = 0;
253 iput (inode);
254 continue;
255 }
256
257 if (truncate) {
258 REISERFS_I(inode) -> i_flags |= i_link_saved_truncate_mask;
259            /* an uncompleted truncate was found. The new size was committed together
260               with the "save" link */
261 reiserfs_info (s, "Truncating %k to %Ld ..",
262 INODE_PKEY (inode), inode->i_size);
263 reiserfs_truncate_file (inode, 0/*don't update modification time*/);
264 retval = remove_save_link (inode, truncate);
265 } else {
266 REISERFS_I(inode) -> i_flags |= i_link_saved_unlink_mask;
267 /* not completed unlink (rmdir) found */
268 reiserfs_info (s, "Removing %k..", INODE_PKEY (inode));
269 /* removal gets completed in iput */
270 retval = 0;
271 }
272
273 iput (inode);
274 printk ("done\n");
275 done ++;
276 }
277 REISERFS_SB(s)->s_is_unlinked_ok = 0;
278
279#ifdef CONFIG_QUOTA
280 /* Turn quotas off */
281 for (i = 0; i < MAXQUOTAS; i++) {
282 if (sb_dqopt(s)->files[i])
283 vfs_quota_off_mount(s, i);
284 }
285 if (ms_active_set)
286 /* Restore the flag back */
287 s->s_flags &= ~MS_ACTIVE;
288#endif
289 pathrelse (&path);
290 if (done)
291 reiserfs_info (s, "There were %d uncompleted unlinks/truncates. "
292 "Completed\n", done);
293 return retval;
294}
295
296/* to protect a file being unlinked from getting lost we "save" link files
297   being unlinked. This link will be deleted in the same transaction as the last
298   item of the file. When mounting the filesystem we scan all these links and remove
299   files which almost got lost */
300void add_save_link (struct reiserfs_transaction_handle * th,
301 struct inode * inode, int truncate)
302{
303 INITIALIZE_PATH (path);
304 int retval;
305 struct cpu_key key;
306 struct item_head ih;
307 __u32 link;
308
309 BUG_ON (!th->t_trans_id);
310
311 /* file can only get one "save link" of each kind */
312 RFALSE( truncate &&
313 ( REISERFS_I(inode) -> i_flags & i_link_saved_truncate_mask ),
314 "saved link already exists for truncated inode %lx",
315 ( long ) inode -> i_ino );
316 RFALSE( !truncate &&
317 ( REISERFS_I(inode) -> i_flags & i_link_saved_unlink_mask ),
318 "saved link already exists for unlinked inode %lx",
319 ( long ) inode -> i_ino );
320
321 /* setup key of "save" link */
322 key.version = KEY_FORMAT_3_5;
323 key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID;
324 key.on_disk_key.k_objectid = inode->i_ino;
325 if (!truncate) {
326 /* unlink, rmdir, rename */
327 set_cpu_key_k_offset (&key, 1 + inode->i_sb->s_blocksize);
328 set_cpu_key_k_type (&key, TYPE_DIRECT);
329
330	/* item head of "save" link */
331 make_le_item_head (&ih, &key, key.version, 1 + inode->i_sb->s_blocksize, TYPE_DIRECT,
332 4/*length*/, 0xffff/*free space*/);
333 } else {
334 /* truncate */
335 if (S_ISDIR (inode->i_mode))
336 reiserfs_warning(inode->i_sb, "green-2102: Adding a truncate savelink for a directory %k! Please report", INODE_PKEY(inode));
337 set_cpu_key_k_offset (&key, 1);
338 set_cpu_key_k_type (&key, TYPE_INDIRECT);
339
340	/* item head of "save" link */
341 make_le_item_head (&ih, &key, key.version, 1, TYPE_INDIRECT,
342 4/*length*/, 0/*free space*/);
343 }
344 key.key_length = 3;
345
346 /* look for its place in the tree */
347 retval = search_item (inode->i_sb, &key, &path);
348 if (retval != ITEM_NOT_FOUND) {
349 if ( retval != -ENOSPC )
350 reiserfs_warning (inode->i_sb, "vs-2100: add_save_link:"
351 "search_by_key (%K) returned %d", &key, retval);
352 pathrelse (&path);
353 return;
354 }
355
356 /* body of "save" link */
357 link = INODE_PKEY (inode)->k_dir_id;
358
359    /* put "save" link into the tree, don't charge quota to anyone */
360 retval = reiserfs_insert_item (th, &path, &key, &ih, NULL, (char *)&link);
361 if (retval) {
362 if (retval != -ENOSPC)
363 reiserfs_warning (inode->i_sb, "vs-2120: add_save_link: insert_item returned %d",
364 retval);
365 } else {
366 if( truncate )
367 REISERFS_I(inode) -> i_flags |= i_link_saved_truncate_mask;
368 else
369 REISERFS_I(inode) -> i_flags |= i_link_saved_unlink_mask;
370 }
371}
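/* Illustrative summary (drawn from the code above, not a separate on-disk
 * spec): a "save" link is a 4-byte item whose key is
 *   { k_dir_id = MAX_KEY_OBJECTID, k_objectid = inode->i_ino,
 *     offset 1 + blocksize, TYPE_DIRECT }   for unlink/rmdir/rename, or
 *   { same ids, offset 1, TYPE_INDIRECT }   for truncate,
 * and whose body holds the object's real k_dir_id - which is all
 * finish_unfinished() needs to rebuild the key of the half-deleted
 * object at mount time. */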
372
373
374/* this opens transaction unlike add_save_link */
375int remove_save_link (struct inode * inode, int truncate)
376{
377 struct reiserfs_transaction_handle th;
378 struct reiserfs_key key;
379 int err;
380
381 /* we are going to do one balancing only */
382 err = journal_begin (&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
383 if (err)
384 return err;
385
386 /* setup key of "save" link */
387 key.k_dir_id = cpu_to_le32 (MAX_KEY_OBJECTID);
388 key.k_objectid = INODE_PKEY (inode)->k_objectid;
389 if (!truncate) {
390 /* unlink, rmdir, rename */
391 set_le_key_k_offset (KEY_FORMAT_3_5, &key,
392 1 + inode->i_sb->s_blocksize);
393 set_le_key_k_type (KEY_FORMAT_3_5, &key, TYPE_DIRECT);
394 } else {
395 /* truncate */
396 set_le_key_k_offset (KEY_FORMAT_3_5, &key, 1);
397 set_le_key_k_type (KEY_FORMAT_3_5, &key, TYPE_INDIRECT);
398 }
399
400 if( ( truncate &&
401 ( REISERFS_I(inode) -> i_flags & i_link_saved_truncate_mask ) ) ||
402 ( !truncate &&
403 ( REISERFS_I(inode) -> i_flags & i_link_saved_unlink_mask ) ) )
404 /* don't take quota bytes from anywhere */
405 reiserfs_delete_solid_item (&th, NULL, &key);
406 if (!truncate) {
407 reiserfs_release_objectid (&th, inode->i_ino);
408 REISERFS_I(inode) -> i_flags &= ~i_link_saved_unlink_mask;
409 } else
410 REISERFS_I(inode) -> i_flags &= ~i_link_saved_truncate_mask;
411
412 return journal_end (&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
413}
414
415
416static void reiserfs_put_super (struct super_block * s)
417{
418 int i;
419 struct reiserfs_transaction_handle th ;
420 th.t_trans_id = 0;
421
422 if (REISERFS_SB(s)->xattr_root) {
423 d_invalidate (REISERFS_SB(s)->xattr_root);
424 dput (REISERFS_SB(s)->xattr_root);
425 }
426
427 if (REISERFS_SB(s)->priv_root) {
428 d_invalidate (REISERFS_SB(s)->priv_root);
429 dput (REISERFS_SB(s)->priv_root);
430 }
431
432 /* change file system state to current state if it was mounted with read-write permissions */
433 if (!(s->s_flags & MS_RDONLY)) {
434 if (!journal_begin(&th, s, 10)) {
435 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
436 set_sb_umount_state( SB_DISK_SUPER_BLOCK(s), REISERFS_SB(s)->s_mount_state );
437 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
438 }
439 }
440
441 /* note, journal_release checks for readonly mount, and can decide not
442 ** to do a journal_end
443 */
444 journal_release(&th, s) ;
445
446 for (i = 0; i < SB_BMAP_NR (s); i ++)
447 brelse (SB_AP_BITMAP (s)[i].bh);
448
449 vfree (SB_AP_BITMAP (s));
450
451 brelse (SB_BUFFER_WITH_SB (s));
452
453 print_statistics (s);
454
455 if (REISERFS_SB(s)->s_kmallocs != 0) {
456 reiserfs_warning (s, "vs-2004: reiserfs_put_super: allocated memory left %d",
457 REISERFS_SB(s)->s_kmallocs);
458 }
459
460 if (REISERFS_SB(s)->reserved_blocks != 0) {
461 reiserfs_warning (s, "green-2005: reiserfs_put_super: reserved blocks left %d",
462 REISERFS_SB(s)->reserved_blocks);
463 }
464
465 reiserfs_proc_info_done( s );
466
467 kfree(s->s_fs_info);
468 s->s_fs_info = NULL;
469
470 return;
471}
472
473static kmem_cache_t * reiserfs_inode_cachep;
474
475static struct inode *reiserfs_alloc_inode(struct super_block *sb)
476{
477 struct reiserfs_inode_info *ei;
478 ei = (struct reiserfs_inode_info *)kmem_cache_alloc(reiserfs_inode_cachep, SLAB_KERNEL);
479 if (!ei)
480 return NULL;
481 return &ei->vfs_inode;
482}
483
484static void reiserfs_destroy_inode(struct inode *inode)
485{
486 kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
487}
488
489static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
490{
491 struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *) foo;
492
493 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
494 SLAB_CTOR_CONSTRUCTOR) {
495 INIT_LIST_HEAD(&ei->i_prealloc_list) ;
496 inode_init_once(&ei->vfs_inode);
497 ei->i_acl_access = NULL;
498 ei->i_acl_default = NULL;
499 }
500}
501
502static int init_inodecache(void)
503{
504 reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
505 sizeof(struct reiserfs_inode_info),
506 0, SLAB_RECLAIM_ACCOUNT,
507 init_once, NULL);
508 if (reiserfs_inode_cachep == NULL)
509 return -ENOMEM;
510 return 0;
511}
512
513static void destroy_inodecache(void)
514{
515 if (kmem_cache_destroy(reiserfs_inode_cachep))
516 reiserfs_warning (NULL, "reiserfs_inode_cache: not all structures were freed");
517}
518
519/* we don't mark inodes dirty, we just log them */
520static void reiserfs_dirty_inode (struct inode * inode) {
521 struct reiserfs_transaction_handle th ;
522
523 int err = 0;
524 if (inode->i_sb->s_flags & MS_RDONLY) {
525 reiserfs_warning(inode->i_sb, "clm-6006: writing inode %lu on readonly FS",
526 inode->i_ino) ;
527 return ;
528 }
529 reiserfs_write_lock(inode->i_sb);
530
531 /* this is really only used for atime updates, so they don't have
532 ** to be included in O_SYNC or fsync
533 */
534 err = journal_begin(&th, inode->i_sb, 1) ;
535 if (err) {
536 reiserfs_write_unlock (inode->i_sb);
537 return;
538 }
539 reiserfs_update_sd (&th, inode);
540 journal_end(&th, inode->i_sb, 1) ;
541 reiserfs_write_unlock(inode->i_sb);
542}
543
544static void reiserfs_clear_inode (struct inode *inode)
545{
546 struct posix_acl *acl;
547
548 acl = REISERFS_I(inode)->i_acl_access;
549 if (acl && !IS_ERR (acl))
550 posix_acl_release (acl);
551 REISERFS_I(inode)->i_acl_access = NULL;
552
553 acl = REISERFS_I(inode)->i_acl_default;
554 if (acl && !IS_ERR (acl))
555 posix_acl_release (acl);
556 REISERFS_I(inode)->i_acl_default = NULL;
557}
558
559#ifdef CONFIG_QUOTA
560static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
561static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t, loff_t);
562#endif
563
564static struct super_operations reiserfs_sops =
565{
566 .alloc_inode = reiserfs_alloc_inode,
567 .destroy_inode = reiserfs_destroy_inode,
568 .write_inode = reiserfs_write_inode,
569 .dirty_inode = reiserfs_dirty_inode,
570 .delete_inode = reiserfs_delete_inode,
571 .clear_inode = reiserfs_clear_inode,
572 .put_super = reiserfs_put_super,
573 .write_super = reiserfs_write_super,
574 .sync_fs = reiserfs_sync_fs,
575 .write_super_lockfs = reiserfs_write_super_lockfs,
576 .unlockfs = reiserfs_unlockfs,
577 .statfs = reiserfs_statfs,
578 .remount_fs = reiserfs_remount,
579#ifdef CONFIG_QUOTA
580 .quota_read = reiserfs_quota_read,
581 .quota_write = reiserfs_quota_write,
582#endif
583};
584
585#ifdef CONFIG_QUOTA
586#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
587
588static int reiserfs_dquot_initialize(struct inode *, int);
589static int reiserfs_dquot_drop(struct inode *);
590static int reiserfs_write_dquot(struct dquot *);
591static int reiserfs_acquire_dquot(struct dquot *);
592static int reiserfs_release_dquot(struct dquot *);
593static int reiserfs_mark_dquot_dirty(struct dquot *);
594static int reiserfs_write_info(struct super_block *, int);
595static int reiserfs_quota_on(struct super_block *, int, int, char *);
596
597static struct dquot_operations reiserfs_quota_operations =
598{
599 .initialize = reiserfs_dquot_initialize,
600 .drop = reiserfs_dquot_drop,
601 .alloc_space = dquot_alloc_space,
602 .alloc_inode = dquot_alloc_inode,
603 .free_space = dquot_free_space,
604 .free_inode = dquot_free_inode,
605 .transfer = dquot_transfer,
606 .write_dquot = reiserfs_write_dquot,
607 .acquire_dquot = reiserfs_acquire_dquot,
608 .release_dquot = reiserfs_release_dquot,
609 .mark_dirty = reiserfs_mark_dquot_dirty,
610 .write_info = reiserfs_write_info,
611};
612
613static struct quotactl_ops reiserfs_qctl_operations =
614{
615 .quota_on = reiserfs_quota_on,
616 .quota_off = vfs_quota_off,
617 .quota_sync = vfs_quota_sync,
618 .get_info = vfs_get_dqinfo,
619 .set_info = vfs_set_dqinfo,
620 .get_dqblk = vfs_get_dqblk,
621 .set_dqblk = vfs_set_dqblk,
622};
623#endif
624
625static struct export_operations reiserfs_export_ops = {
626 .encode_fh = reiserfs_encode_fh,
627 .decode_fh = reiserfs_decode_fh,
628 .get_parent = reiserfs_get_parent,
629 .get_dentry = reiserfs_get_dentry,
630} ;
631
632/* this struct is used in reiserfs_getopt () to describe the values for those
633   mount options that take values rather than being toggles. */
634typedef struct {
635 char * value;
636    int setmask; /* bitmask to set on the mount_options bitmask when this
637		    value is found; 0 if no bits are to be changed. */
638    int clrmask; /* bitmask to clear on the mount_options bitmask when this
639		    value is found; 0 if no bits are to be changed. This is
640		    applied BEFORE setmask */
641} arg_desc_t;
642
643/* Set this bit in arg_required to allow empty arguments */
644#define REISERFS_OPT_ALLOWEMPTY 31
645
646/* this struct is used in reiserfs_getopt() for describing the set of reiserfs
647 mount options */
648typedef struct {
649 char * option_name;
650    int arg_required; /* 0 if an argument is not required, non-zero otherwise */
651 const arg_desc_t * values; /* list of values accepted by an option */
652    int setmask; /* bitmask to set on the mount_options bitmask when this
653		    value is found; 0 if no bits are to be changed. */
654    int clrmask; /* bitmask to clear on the mount_options bitmask when this
655		    value is found; 0 if no bits are to be changed. This is
656		    applied BEFORE setmask */
657} opt_desc_t;
658
659/* possible values for -o data= */
660static const arg_desc_t logging_mode[] = {
661 {"ordered", 1<<REISERFS_DATA_ORDERED, (1<<REISERFS_DATA_LOG|1<<REISERFS_DATA_WRITEBACK)},
662 {"journal", 1<<REISERFS_DATA_LOG, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_WRITEBACK)},
663 {"writeback", 1<<REISERFS_DATA_WRITEBACK, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_LOG)},
664 {NULL, 0}
665};
666
667/* possible values for -o barrier= */
668static const arg_desc_t barrier_mode[] = {
669 {"none", 1<<REISERFS_BARRIER_NONE, 1<<REISERFS_BARRIER_FLUSH},
670 {"flush", 1<<REISERFS_BARRIER_FLUSH, 1<<REISERFS_BARRIER_NONE},
671 {NULL, 0}
672};
673
674/* possible values for "-o block-allocator=" and bits which are to be set in
675 s_mount_opt of reiserfs specific part of in-core super block */
676static const arg_desc_t balloc[] = {
677 {"noborder", 1<<REISERFS_NO_BORDER, 0},
678 {"border", 0, 1<<REISERFS_NO_BORDER},
679 {"no_unhashed_relocation", 1<<REISERFS_NO_UNHASHED_RELOCATION, 0},
680 {"hashed_relocation", 1<<REISERFS_HASHED_RELOCATION, 0},
681 {"test4", 1<<REISERFS_TEST4, 0},
682 {"notest4", 0, 1<<REISERFS_TEST4},
683 {NULL, 0, 0}
684};
685
686static const arg_desc_t tails[] = {
687 {"on", 1<<REISERFS_LARGETAIL, 1<<REISERFS_SMALLTAIL},
688 {"off", 0, (1<<REISERFS_LARGETAIL)|(1<<REISERFS_SMALLTAIL)},
689 {"small", 1<<REISERFS_SMALLTAIL, 1<<REISERFS_LARGETAIL},
690 {NULL, 0, 0}
691};
692
693static const arg_desc_t error_actions[] = {
694 {"panic", 1 << REISERFS_ERROR_PANIC,
695 (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)},
696 {"ro-remount", 1 << REISERFS_ERROR_RO,
697 (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)},
698#ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG
699 {"continue", 1 << REISERFS_ERROR_CONTINUE,
700 (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)},
701#endif
702 {NULL, 0, 0},
703};
704
705int reiserfs_default_io_size = 128 * 1024; /* Default recommended I/O size is 128k.
706 There might be broken applications that are
707 confused by this. Use nolargeio mount option
708 to get usual i/o size = PAGE_SIZE.
709 */
710
711/* process exactly one option from the list. *cur - string containing the mount options
712   opts - array of options which are accepted
713   opt_arg - if the option is found, requires an argument, and the argument is specified
714   in the input - a pointer to the argument is stored here
715   bit_flags - if the option requires a certain bit to be set - it is set here
716   returns -1 if an unknown option is found, opt->arg_required otherwise */
717static int reiserfs_getopt ( struct super_block * s, char ** cur, opt_desc_t * opts, char ** opt_arg,
718 unsigned long * bit_flags)
719{
720 char * p;
721 /* foo=bar,
722 ^ ^ ^
723 | | +-- option_end
724 | +-- arg_start
725 +-- option_start
726 */
727 const opt_desc_t * opt;
728 const arg_desc_t * arg;
729
730
731 p = *cur;
732
733 /* assume argument cannot contain commas */
734 *cur = strchr (p, ',');
735 if (*cur) {
736 *(*cur) = '\0';
737 (*cur) ++;
738 }
739
740 if ( !strncmp (p, "alloc=", 6) ) {
741 /* Ugly special case, probably we should redo options parser so that
742 it can understand several arguments for some options, also so that
743 it can fill several bitfields with option values. */
744 if ( reiserfs_parse_alloc_options( s, p + 6) ) {
745 return -1;
746 } else {
747 return 0;
748 }
749 }
750
751
752 /* for every option in the list */
753 for (opt = opts; opt->option_name; opt ++) {
754 if (!strncmp (p, opt->option_name, strlen (opt->option_name))) {
755 if (bit_flags) {
756 if (opt->clrmask == (1 << REISERFS_UNSUPPORTED_OPT))
757 reiserfs_warning (s, "%s not supported.", p);
758 else
759 *bit_flags &= ~opt->clrmask;
760 if (opt->setmask == (1 << REISERFS_UNSUPPORTED_OPT))
761 reiserfs_warning (s, "%s not supported.", p);
762 else
763 *bit_flags |= opt->setmask;
764 }
765 break;
766 }
767 }
768 if (!opt->option_name) {
769 reiserfs_warning (s, "unknown mount option \"%s\"", p);
770 return -1;
771 }
772
773 p += strlen (opt->option_name);
774 switch (*p) {
775 case '=':
776 if (!opt->arg_required) {
777 reiserfs_warning (s, "the option \"%s\" does not require an argument",
778 opt->option_name);
779 return -1;
780 }
781 break;
782
783 case 0:
784 if (opt->arg_required) {
785 reiserfs_warning (s, "the option \"%s\" requires an argument", opt->option_name);
786 return -1;
787 }
788 break;
789 default:
790 reiserfs_warning (s, "head of option \"%s\" is only correct", opt->option_name);
791 return -1;
792 }
793
794 /* move to the argument, or to the next option if no argument is required */
795 p ++;
796
797 if ( opt->arg_required && !(opt->arg_required & (1<<REISERFS_OPT_ALLOWEMPTY)) && !strlen (p) ) {
798 /* this catches "option=," if not allowed */
799 reiserfs_warning (s, "empty argument for \"%s\"", opt->option_name);
800 return -1;
801 }
802
803 if (!opt->values) {
804 /* opt->values == NULL: any argument is accepted; *opt_arg points to it */
805 *opt_arg = p;
806 return opt->arg_required & ~(1<<REISERFS_OPT_ALLOWEMPTY);
807 }
808
809 /* values possible for this option are listed in opt->values */
810 for (arg = opt->values; arg->value; arg ++) {
811 if (!strcmp (p, arg->value)) {
812 if (bit_flags) {
813 *bit_flags &= ~arg->clrmask;
814 *bit_flags |= arg->setmask;
815 }
816 return opt->arg_required;
817 }
818 }
819
820 reiserfs_warning (s, "bad value \"%s\" for option \"%s\"", p, opt->option_name);
821 return -1;
822}
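/*
 * A worked example of the contract above (an illustrative sketch, not part of
 * the original source; it assumes an opts table like the one built in
 * reiserfs_parse_options() below).
 */
#if 0
static void example_getopt_walk (struct super_block *s, opt_desc_t *opts,
                                 unsigned long *bits)
{
    char buf[] = "data=ordered,resize=1024";
    char *cur = buf, *arg = NULL;
    int c;

    /* first call: "data" matches, "ordered" is found in opt->values, the
       logging-mode bits are set/cleared in *bits, and 'd' is returned */
    c = reiserfs_getopt (s, &cur, opts, &arg, bits);
    /* second call: "resize" has opt->values == NULL, so arg is left
       pointing at "1024" and 'r' is returned for the caller to strtoul */
    c = reiserfs_getopt (s, &cur, opts, &arg, bits);
    /* cur is now NULL: the option string is exhausted */
}
#endif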
823
824/* returns 0 if something is wrong in the option string, 1 otherwise */
825static int reiserfs_parse_options (struct super_block * s, char * options, /* string given via mount's -o */
826 unsigned long * mount_options,
827 /* after the parsing phase, contains the
828 collection of bitflags defining what
829 mount options were selected. */
830 unsigned long * blocks, /* strtol-ed from NNN of resize=NNN */
831 char ** jdev_name,
832 unsigned int * commit_max_age)
833{
834 int c;
835 char * arg = NULL;
836 char * pos;
837 opt_desc_t opts[] = {
838 /* Compatibility stuff, so that -o notail for old setups still works */
839 {"tails", .arg_required = 't', .values = tails},
840 {"notail", .clrmask = (1<<REISERFS_LARGETAIL)|(1<<REISERFS_SMALLTAIL)},
841 {"conv", .setmask = 1<<REISERFS_CONVERT},
842 {"attrs", .setmask = 1<<REISERFS_ATTRS},
843 {"noattrs", .clrmask = 1<<REISERFS_ATTRS},
844#ifdef CONFIG_REISERFS_FS_XATTR
845 {"user_xattr", .setmask = 1<<REISERFS_XATTRS_USER},
846 {"nouser_xattr",.clrmask = 1<<REISERFS_XATTRS_USER},
847#else
848 {"user_xattr", .setmask = 1<<REISERFS_UNSUPPORTED_OPT},
849 {"nouser_xattr",.clrmask = 1<<REISERFS_UNSUPPORTED_OPT},
850#endif
851#ifdef CONFIG_REISERFS_FS_POSIX_ACL
852 {"acl", .setmask = 1<<REISERFS_POSIXACL},
853 {"noacl", .clrmask = 1<<REISERFS_POSIXACL},
854#else
855 {"acl", .setmask = 1<<REISERFS_UNSUPPORTED_OPT},
856 {"noacl", .clrmask = 1<<REISERFS_UNSUPPORTED_OPT},
857#endif
858 {"nolog",}, /* This is unsupported */
859 {"replayonly", .setmask = 1<<REPLAYONLY},
860 {"block-allocator", .arg_required = 'a', .values = balloc},
861 {"data", .arg_required = 'd', .values = logging_mode},
862 {"barrier", .arg_required = 'b', .values = barrier_mode},
863 {"resize", .arg_required = 'r', .values = NULL},
864 {"jdev", .arg_required = 'j', .values = NULL},
865 {"nolargeio", .arg_required = 'w', .values = NULL},
866 {"commit", .arg_required = 'c', .values = NULL},
867 {"usrquota",},
868 {"grpquota",},
869 {"errors", .arg_required = 'e', .values = error_actions},
870 {"usrjquota", .arg_required = 'u'|(1<<REISERFS_OPT_ALLOWEMPTY), .values = NULL},
871 {"grpjquota", .arg_required = 'g'|(1<<REISERFS_OPT_ALLOWEMPTY), .values = NULL},
872 {"jqfmt", .arg_required = 'f', .values = NULL},
873 {NULL,}
874 };
875
876 *blocks = 0;
877 if (!options || !*options)
878 /* use default configuration: create tails, journaling on, no
879 conversion to newest format */
880 return 1;
881
882 for (pos = options; pos; ) {
883 c = reiserfs_getopt (s, &pos, opts, &arg, mount_options);
884 if (c == -1)
885 /* wrong option is given */
886 return 0;
887
888 if (c == 'r') {
889 char * p;
890
891 p = NULL;
892 /* "resize=NNN" */
893 *blocks = simple_strtoul (arg, &p, 0);
894 if (*p != '\0') {
895 /* NNN does not look like a number */
896 reiserfs_warning (s, "reiserfs_parse_options: bad value %s", arg);
897 return 0;
898 }
899 }
900
901 if ( c == 'c' ) {
902 char *p = NULL;
903 unsigned long val = simple_strtoul (arg, &p, 0);
904 /* commit=NNN (time in seconds) */
905 if ( *p != '\0' || val >= (unsigned int)-1) {
906 reiserfs_warning (s, "reiserfs_parse_options: bad value %s", arg); return 0;
907 }
908 *commit_max_age = (unsigned int)val;
909 }
910
911 if ( c == 'w' ) {
912 char *p=NULL;
913 int val = simple_strtoul (arg, &p, 0);
914
915 if ( *p != '\0') {
916 reiserfs_warning (s, "reiserfs_parse_options: non-numeric value %s for nolargeio option", arg);
917 return 0;
918 }
919 if ( val )
920 reiserfs_default_io_size = PAGE_SIZE;
921 else
922 reiserfs_default_io_size = 128 * 1024;
923 }
924
925 if (c == 'j') {
926 if (arg && *arg && jdev_name) {
927 if ( *jdev_name ) { //Hm, already assigned?
928 reiserfs_warning (s, "reiserfs_parse_options: journal device was already specified to be %s", *jdev_name);
929 return 0;
930 }
931 *jdev_name = arg;
932 }
933 }
934
935#ifdef CONFIG_QUOTA
936 if (c == 'u' || c == 'g') {
937 int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
938
939 if (sb_any_quota_enabled(s)) {
940 reiserfs_warning(s, "reiserfs_parse_options: cannot change journalled quota options when quota turned on.");
941 return 0;
942 }
943 if (*arg) { /* Some filename specified? */
944 if (REISERFS_SB(s)->s_qf_names[qtype] && strcmp(REISERFS_SB(s)->s_qf_names[qtype], arg)) {
945 reiserfs_warning(s, "reiserfs_parse_options: %s quota file already specified.", QTYPE2NAME(qtype));
946 return 0;
947 }
948 if (strchr(arg, '/')) {
949 reiserfs_warning(s, "reiserfs_parse_options: quotafile must be on filesystem root.");
950 return 0;
951 }
952 REISERFS_SB(s)->s_qf_names[qtype] = kmalloc(strlen(arg)+1, GFP_KERNEL);
953 if (!REISERFS_SB(s)->s_qf_names[qtype]) {
954 reiserfs_warning(s, "reiserfs_parse_options: not enough memory for storing quotafile name.");
955 return 0;
956 }
957 strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg);
958 }
959 else {
960 if (REISERFS_SB(s)->s_qf_names[qtype]) {
961 kfree(REISERFS_SB(s)->s_qf_names[qtype]);
962 REISERFS_SB(s)->s_qf_names[qtype] = NULL;
963 }
964 }
965 }
966 if (c == 'f') {
967 if (!strcmp(arg, "vfsold"))
968 REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_OLD;
969 else if (!strcmp(arg, "vfsv0"))
970 REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_V0;
971 else {
972 reiserfs_warning(s, "reiserfs_parse_options: unknown quota format specified.");
973 return 0;
974 }
975 }
976#else
977 if (c == 'u' || c == 'g' || c == 'f') {
978 reiserfs_warning(s, "reiserfs_parse_options: journalled quota options not supported.");
979 return 0;
980 }
981#endif
982 }
983
984#ifdef CONFIG_QUOTA
985 if (!REISERFS_SB(s)->s_jquota_fmt && (REISERFS_SB(s)->s_qf_names[USRQUOTA] || REISERFS_SB(s)->s_qf_names[GRPQUOTA])) {
986 reiserfs_warning(s, "reiserfs_parse_options: journalled quota format not specified.");
987 return 0;
988 }
989#endif
990 return 1;
991}
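/*
 * Usage sketch (illustrative only; it mirrors how the mount and remount
 * paths in this file drive the parser, with a hard-coded option string):
 */
#if 0
    unsigned long bits = 0, blocks = 0;
    char *jdev = NULL;
    unsigned int cmage = (unsigned int)-1;

    if (!reiserfs_parse_options (s, "notail,errors=ro-remount", &bits,
                                 &blocks, &jdev, &cmage))
        return -EINVAL;  /* bad option string */
    /* bits now has REISERFS_ERROR_RO set and both tail bits cleared;
       blocks stays 0 since no resize= option was given */
#endif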
992
993static void switch_data_mode(struct super_block *s, unsigned long mode) {
994 REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) |
995 (1 << REISERFS_DATA_ORDERED) |
996 (1 << REISERFS_DATA_WRITEBACK));
997 REISERFS_SB(s)->s_mount_opt |= (1 << mode);
998}
999
1000static void handle_data_mode(struct super_block *s, unsigned long mount_options)
1001{
1002 if (mount_options & (1 << REISERFS_DATA_LOG)) {
1003 if (!reiserfs_data_log(s)) {
1004 switch_data_mode(s, REISERFS_DATA_LOG);
1005 reiserfs_info (s, "switching to journaled data mode\n");
1006 }
1007 } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) {
1008 if (!reiserfs_data_ordered(s)) {
1009 switch_data_mode(s, REISERFS_DATA_ORDERED);
1010 reiserfs_info (s, "switching to ordered data mode\n");
1011 }
1012 } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) {
1013 if (!reiserfs_data_writeback(s)) {
1014 switch_data_mode(s, REISERFS_DATA_WRITEBACK);
1015 reiserfs_info (s, "switching to writeback data mode\n");
1016 }
1017 }
1018}
1019
1020static void handle_barrier_mode(struct super_block *s, unsigned long bits) {
1021 int flush = (1 << REISERFS_BARRIER_FLUSH);
1022 int none = (1 << REISERFS_BARRIER_NONE);
1023 int all_barrier = flush | none;
1024
1025 if (bits & all_barrier) {
1026 REISERFS_SB(s)->s_mount_opt &= ~all_barrier;
1027 if (bits & flush) {
1028 REISERFS_SB(s)->s_mount_opt |= flush;
1029 printk("reiserfs: enabling write barrier flush mode\n");
1030 } else if (bits & none) {
1031 REISERFS_SB(s)->s_mount_opt |= none;
1032 printk("reiserfs: write barriers turned off\n");
1033 }
1034 }
1035}
1036
1037static void handle_attrs( struct super_block *s )
1038{
1039 struct reiserfs_super_block * rs;
1040
1041 if( reiserfs_attrs( s ) ) {
1042 rs = SB_DISK_SUPER_BLOCK (s);
1043 if( old_format_only(s) ) {
1044 reiserfs_warning(s, "reiserfs: cannot support attributes on 3.5.x disk format" );
1045 REISERFS_SB(s) -> s_mount_opt &= ~ ( 1 << REISERFS_ATTRS );
1046 return;
1047 }
1048 if( !( le32_to_cpu( rs -> s_flags ) & reiserfs_attrs_cleared ) ) {
1049 reiserfs_warning(s, "reiserfs: cannot support attributes until flag is set in super-block" );
1050 REISERFS_SB(s) -> s_mount_opt &= ~ ( 1 << REISERFS_ATTRS );
1051 }
1052 }
1053}
1054
1055static int reiserfs_remount (struct super_block * s, int * mount_flags, char * arg)
1056{
1057 struct reiserfs_super_block * rs;
1058 struct reiserfs_transaction_handle th ;
1059 unsigned long blocks;
1060 unsigned long mount_options = REISERFS_SB(s)->s_mount_opt;
1061 unsigned long safe_mask = 0;
1062 unsigned int commit_max_age = (unsigned int)-1;
1063 struct reiserfs_journal *journal = SB_JOURNAL(s);
1064 int err;
1065#ifdef CONFIG_QUOTA
1066 int i;
1067#endif
1068
1069 rs = SB_DISK_SUPER_BLOCK (s);
1070
1071 if (!reiserfs_parse_options(s, arg, &mount_options, &blocks, NULL, &commit_max_age)) {
1072#ifdef CONFIG_QUOTA
1073 for (i = 0; i < MAXQUOTAS; i++)
1074 if (REISERFS_SB(s)->s_qf_names[i]) {
1075 kfree(REISERFS_SB(s)->s_qf_names[i]);
1076 REISERFS_SB(s)->s_qf_names[i] = NULL;
1077 }
1078#endif
1079 return -EINVAL;
1080 }
1081
1082 handle_attrs(s);
1083
1084 /* Add options that are safe here */
1085 safe_mask |= 1 << REISERFS_SMALLTAIL;
1086 safe_mask |= 1 << REISERFS_LARGETAIL;
1087 safe_mask |= 1 << REISERFS_NO_BORDER;
1088 safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION;
1089 safe_mask |= 1 << REISERFS_HASHED_RELOCATION;
1090 safe_mask |= 1 << REISERFS_TEST4;
1091 safe_mask |= 1 << REISERFS_ATTRS;
1092 safe_mask |= 1 << REISERFS_XATTRS_USER;
1093 safe_mask |= 1 << REISERFS_POSIXACL;
1094 safe_mask |= 1 << REISERFS_BARRIER_FLUSH;
1095 safe_mask |= 1 << REISERFS_BARRIER_NONE;
1096 safe_mask |= 1 << REISERFS_ERROR_RO;
1097 safe_mask |= 1 << REISERFS_ERROR_CONTINUE;
1098 safe_mask |= 1 << REISERFS_ERROR_PANIC;
1099
1100 /* Update the bitmask, taking care to keep
1101 * the bits we're not allowed to change here */
1102 REISERFS_SB(s)->s_mount_opt = (REISERFS_SB(s)->s_mount_opt & ~safe_mask) | (mount_options & safe_mask);
1103
1104 if(commit_max_age != 0 && commit_max_age != (unsigned int)-1) {
1105 journal->j_max_commit_age = commit_max_age;
1106 journal->j_max_trans_age = commit_max_age;
1107 }
1108 else if(commit_max_age == 0)
1109 {
1110 /* 0 means restore defaults. */
1111 journal->j_max_commit_age = journal->j_default_max_commit_age;
1112 journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
1113 }
1114
1115 if(blocks) {
1116 int rc = reiserfs_resize(s, blocks);
1117 if (rc != 0)
1118 return rc;
1119 }
1120
1121 if (*mount_flags & MS_RDONLY) {
1122 reiserfs_xattr_init (s, *mount_flags);
1123 /* remount read-only */
1124 if (s->s_flags & MS_RDONLY)
1125 /* it is read-only already */
1126 return 0;
1127 /* try to remount file system with read-only permissions */
1128 if (sb_umount_state(rs) == REISERFS_VALID_FS || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
1129 return 0;
1130 }
1131
1132 err = journal_begin(&th, s, 10) ;
1133 if (err)
1134 return err;
1135
1136 /* Mounting a rw partition read-only. */
1137 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
1138 set_sb_umount_state( rs, REISERFS_SB(s)->s_mount_state );
1139 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
1140 } else {
1141 /* remount read-write */
1142 if (!(s->s_flags & MS_RDONLY)) {
1143 reiserfs_xattr_init (s, *mount_flags);
1144 return 0; /* We are read-write already */
1145 }
1146
1147 if (reiserfs_is_journal_aborted (journal))
1148 return journal->j_errno;
1149
1150 handle_data_mode(s, mount_options);
1151 handle_barrier_mode(s, mount_options);
1152 REISERFS_SB(s)->s_mount_state = sb_umount_state(rs) ;
1153 s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */
1154 err = journal_begin(&th, s, 10) ;
1155 if (err)
1156 return err;
1157
1158 /* Remount a read-only partition read-write */
1159 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
1160 REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
1161 s->s_flags &= ~MS_RDONLY;
1162 set_sb_umount_state( rs, REISERFS_ERROR_FS );
1163 /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
1164 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
1165 REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS ;
1166 }
1167 /* this will force a full flush of all journal lists */
1168 SB_JOURNAL(s)->j_must_wait = 1 ;
1169 err = journal_end(&th, s, 10) ;
1170 if (err)
1171 return err;
1172 s->s_dirt = 0;
1173
1174 if (!( *mount_flags & MS_RDONLY ) ) {
1175 finish_unfinished( s );
1176 reiserfs_xattr_init (s, *mount_flags);
1177 }
1178
1179 return 0;
1180}
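/*
 * The safe_mask update above, worked through (illustrative): suppose only
 * REISERFS_ATTRS were in safe_mask, the old s_mount_opt had ATTRS clear, and
 * the freshly parsed mount_options had it set. Then (old & ~safe_mask) keeps
 * every bit the remount may not touch, while (mount_options & safe_mask)
 * contributes just the ATTRS bit, so only remount-safe options can change.
 */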
1181
1182/* load_bitmap_info_data - Sets up the reiserfs_bitmap_info structure from disk.
1183 * @sb - superblock for this filesystem
1184 * @bi - the bitmap info to be loaded. Requires that bi->bh is valid.
1185 *
1186 * This routine counts how many free bits there are, finding the first zero
1187 * as a side effect. Could also be implemented as a loop of test_bit() calls, or
1188 * a loop of find_first_zero_bit() calls. This implementation is similar to
1189 * find_first_zero_bit(), but doesn't return after it finds the first bit.
1190 * Should only be called on fs mount, but should be fairly efficient anyway.
1191 *
1192 * bi->first_zero_hint is considered unset if it == 0, since the bitmap itself
1193 * will invariably occupy block 0, which is represented in the bitmap. The only
1194 * exception to this is when free_count also == 0, since there will be no
1195 * free blocks at all.
1196 */
1197
1198static void load_bitmap_info_data (struct super_block *sb,
1199 struct reiserfs_bitmap_info *bi)
1200{
1201 unsigned long *cur = (unsigned long *)bi->bh->b_data;
1202
1203 while ((char *)cur < (bi->bh->b_data + sb->s_blocksize)) {
1204
1205 /* No need to scan if all 0's or all 1's.
1206 * Since we're only counting 0's, we can simply ignore all 1's */
1207 if (*cur == 0) {
1208 if (bi->first_zero_hint == 0) {
1209 bi->first_zero_hint = ((char *)cur - bi->bh->b_data) << 3;
1210 }
1211 bi->free_count += sizeof(unsigned long)*8;
1212 } else if (*cur != ~0L) {
1213 int b;
1214 for (b = 0; b < sizeof(unsigned long)*8; b++) {
1215 if (!reiserfs_test_le_bit (b, cur)) {
1216 bi->free_count ++;
1217 if (bi->first_zero_hint == 0)
1218 bi->first_zero_hint =
1219 (((char *)cur - bi->bh->b_data) << 3) + b;
1220 }
1221 }
1222 }
1223 cur ++;
1224 }
1225
1226#ifdef CONFIG_REISERFS_CHECK
// This outputs a lot of unneeded info on big FSes
1228// reiserfs_warning ("bitmap loaded from block %d: %d free blocks",
1229// bi->bh->b_blocknr, bi->free_count);
1230#endif
1231}
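/*
 * A behaviorally equivalent word-at-a-time count (an illustrative sketch,
 * not part of the original source): the loop above is essentially popcount
 * over the inverted bitmap words, plus the first-zero bookkeeping. Ignoring
 * the hint, the free count alone could be computed with the generic helpers:
 */
#if 0
    unsigned long *word = (unsigned long *)bi->bh->b_data;
    unsigned long nwords = sb->s_blocksize / sizeof(unsigned long);
    unsigned long i, free = 0;

    for (i = 0; i < nwords; i++)  /* a set bit means "block in use" */
        free += BITS_PER_LONG - hweight_long(word[i]);
#endif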
1232
1233static int read_bitmaps (struct super_block * s)
1234{
1235 int i, bmap_nr;
1236
1237 SB_AP_BITMAP (s) = vmalloc (sizeof (struct reiserfs_bitmap_info) * SB_BMAP_NR(s));
1238 if (!SB_AP_BITMAP (s))
1239 return 1;
1240 memset (SB_AP_BITMAP (s), 0, sizeof (struct reiserfs_bitmap_info) * SB_BMAP_NR(s));
1241 for (i = 0, bmap_nr = REISERFS_DISK_OFFSET_IN_BYTES / s->s_blocksize + 1;
1242 i < SB_BMAP_NR(s); i++, bmap_nr = s->s_blocksize * 8 * i) {
1243 SB_AP_BITMAP (s)[i].bh = sb_getblk(s, bmap_nr);
1244 if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh))
1245 ll_rw_block(READ, 1, &SB_AP_BITMAP(s)[i].bh);
1246 }
1247 for (i = 0; i < SB_BMAP_NR(s); i++) {
1248 wait_on_buffer(SB_AP_BITMAP (s)[i].bh);
1249 if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) {
1250 reiserfs_warning(s,"sh-2029: reiserfs read_bitmaps: "
1251 "bitmap block (#%lu) reading failed",
1252 SB_AP_BITMAP(s)[i].bh->b_blocknr);
1253 for (i = 0; i < SB_BMAP_NR(s); i++)
1254 brelse(SB_AP_BITMAP(s)[i].bh);
1255 vfree(SB_AP_BITMAP(s));
1256 SB_AP_BITMAP(s) = NULL;
1257 return 1;
1258 }
1259 load_bitmap_info_data (s, SB_AP_BITMAP (s) + i);
1260 }
1261 return 0;
1262}
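/*
 * Worked block-number arithmetic for the loop above (illustrative, assuming
 * a 4k block size): the first bitmap block sits right behind the superblock,
 * at 64k/4k + 1 = block 17; every bitmap block maps s_blocksize * 8 = 32768
 * blocks, so bitmap i (i >= 1) lives at block 32768 * i, the first block it
 * describes.
 */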
1263
1264static int read_old_bitmaps (struct super_block * s)
1265{
1266 int i ;
1267 struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK(s);
1268 int bmp1 = (REISERFS_OLD_DISK_OFFSET_IN_BYTES / s->s_blocksize) + 1; /* first of bitmap blocks */
1269
1270 /* read true bitmap */
1271 SB_AP_BITMAP (s) = vmalloc (sizeof (struct reiserfs_bitmap_info) * sb_bmap_nr(rs));
1272 if (!SB_AP_BITMAP (s))
1273 return 1;
1274
1275 memset (SB_AP_BITMAP (s), 0, sizeof (struct reiserfs_bitmap_info) * sb_bmap_nr(rs));
1276
1277 for (i = 0; i < sb_bmap_nr(rs); i ++) {
1278 SB_AP_BITMAP (s)[i].bh = sb_bread (s, bmp1 + i);
1279 if (!SB_AP_BITMAP (s)[i].bh)
1280 return 1;
1281 load_bitmap_info_data (s, SB_AP_BITMAP (s) + i);
1282 }
1283
1284 return 0;
1285}
1286
1287static int read_super_block (struct super_block * s, int offset)
1288{
1289 struct buffer_head * bh;
1290 struct reiserfs_super_block * rs;
1291 int fs_blocksize;
1292
1293
1294 bh = sb_bread (s, offset / s->s_blocksize);
1295 if (!bh) {
1296 reiserfs_warning (s, "sh-2006: read_super_block: "
1297 "bread failed (dev %s, block %lu, size %lu)",
1298 reiserfs_bdevname (s), offset / s->s_blocksize, s->s_blocksize);
1299 return 1;
1300 }
1301
1302 rs = (struct reiserfs_super_block *)bh->b_data;
1303 if (!is_any_reiserfs_magic_string (rs)) {
1304 brelse (bh);
1305 return 1;
1306 }
1307
1308 //
1309// ok, reiserfs signature (old or new) found at the given offset
1310 //
1311 fs_blocksize = sb_blocksize(rs);
1312 brelse (bh);
1313 sb_set_blocksize (s, fs_blocksize);
1314
1315 bh = sb_bread (s, offset / s->s_blocksize);
1316 if (!bh) {
1317 reiserfs_warning (s, "sh-2007: read_super_block: "
1318 "bread failed (dev %s, block %lu, size %lu)\n",
1319 reiserfs_bdevname (s), offset / s->s_blocksize, s->s_blocksize);
1320 return 1;
1321 }
1322
1323 rs = (struct reiserfs_super_block *)bh->b_data;
1324 if (sb_blocksize(rs) != s->s_blocksize) {
1325 reiserfs_warning (s, "sh-2011: read_super_block: "
1326 "can't find a reiserfs filesystem on (dev %s, block %Lu, size %lu)\n",
1327 reiserfs_bdevname (s), (unsigned long long)bh->b_blocknr, s->s_blocksize);
1328 brelse (bh);
1329 return 1;
1330 }
1331
1332 if ( rs->s_v1.s_root_block == -1 ) {
1333 brelse(bh) ;
1334 reiserfs_warning (s, "Unfinished reiserfsck --rebuild-tree run detected. Please run\n"
1335 "reiserfsck --rebuild-tree and wait for a completion. If that fails\n"
1336 "get newer reiserfsprogs package");
1337 return 1;
1338 }
1339
1340 SB_BUFFER_WITH_SB (s) = bh;
1341 SB_DISK_SUPER_BLOCK (s) = rs;
1342
1343 if (is_reiserfs_jr (rs)) {
1344 /* the magic is that of a filesystem with a non-standard journal; look at
1345 s_version to find which format is in use */
1346 if (sb_version(rs) == REISERFS_VERSION_2)
1347 reiserfs_warning (s, "read_super_block: found reiserfs format \"3.6\""
1348 " with non-standard journal");
1349 else if (sb_version(rs) == REISERFS_VERSION_1)
1350 reiserfs_warning (s, "read_super_block: found reiserfs format \"3.5\""
1351 " with non-standard journal");
1352 else {
1353 reiserfs_warning (s, "sh-2012: read_super_block: found unknown "
1354 "format \"%u\" of reiserfs with non-standard magic",
1355 sb_version(rs));
1356 return 1;
1357 }
1358 }
1359 else
1360 /* s_version of standard format may contain incorrect information,
1361 so we just look at the magic string */
1362 reiserfs_info (s, "found reiserfs format \"%s\" with standard journal\n",
1363 is_reiserfs_3_5 (rs) ? "3.5" : "3.6");
1364
1365 s->s_op = &reiserfs_sops;
1366 s->s_export_op = &reiserfs_export_ops;
1367#ifdef CONFIG_QUOTA
1368 s->s_qcop = &reiserfs_qctl_operations;
1369 s->dq_op = &reiserfs_quota_operations;
1370#endif
1371
1372 /* the new format is limited by the 32 bit wide i_blocks field; we want to
1373 ** be one full block below that.
1374 */
1375 s->s_maxbytes = (512LL << 32) - s->s_blocksize ;
1376 return 0;
1377}
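/*
 * Offset-to-block arithmetic used above, worked through (illustrative): the
 * caller passes a byte offset - 8k for the old layout, 64k for the new one -
 * and it is divided by the *current* s_blocksize. With the initial 1k probe
 * size the old superblock is block 8 and the new one block 64; once
 * sb_set_blocksize() switches to the on-disk size (say 4k), the same 64k
 * offset is re-read as block 16.
 */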
1378
1379
1380
1381/* after journal replay, reread all bitmap and super blocks */
1382static int reread_meta_blocks(struct super_block *s) {
1383 int i ;
1384 ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))) ;
1385 wait_on_buffer(SB_BUFFER_WITH_SB(s)) ;
1386 if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
1387 reiserfs_warning (s, "reread_meta_blocks, error reading the super") ;
1388 return 1 ;
1389 }
1390
1391 for (i = 0; i < SB_BMAP_NR(s) ; i++) {
1392 ll_rw_block(READ, 1, &(SB_AP_BITMAP(s)[i].bh)) ;
1393 wait_on_buffer(SB_AP_BITMAP(s)[i].bh) ;
1394 if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) {
1395 reiserfs_warning (s, "reread_meta_blocks, error reading bitmap block number %d at %llu",
1396 i, (unsigned long long)SB_AP_BITMAP(s)[i].bh->b_blocknr) ;
1397 return 1 ;
1398 }
1399 }
1400 return 0 ;
1401
1402}
1403
1404
1405/////////////////////////////////////////////////////
1406// hash detection stuff
1407
1408
1409// if root directory is empty - we set default - Yura's - hash and
1410// warn about it
1411// FIXME: we look for only one name in a directory. If tea and yura
1412// both have the same value - we ask the user to send a report to the
1413// mailing list
1414static __u32 find_hash_out (struct super_block * s)
1415{
1416 int retval;
1417 struct inode * inode;
1418 struct cpu_key key;
1419 INITIALIZE_PATH (path);
1420 struct reiserfs_dir_entry de;
1421 __u32 hash = DEFAULT_HASH;
1422
1423 inode = s->s_root->d_inode;
1424
1425 do { // Some serious "goto"-hater was there ;)
1426 u32 teahash, r5hash, yurahash;
1427
1428 make_cpu_key (&key, inode, ~0, TYPE_DIRENTRY, 3);
1429 retval = search_by_entry_key (s, &key, &path, &de);
1430 if (retval == IO_ERROR) {
1431 pathrelse (&path);
1432 return UNSET_HASH ;
1433 }
1434 if (retval == NAME_NOT_FOUND)
1435 de.de_entry_num --;
1436 set_de_name_and_namelen (&de);
1437 if (deh_offset( &(de.de_deh[de.de_entry_num]) ) == DOT_DOT_OFFSET) {
1438 /* allow override in this case */
1439 if (reiserfs_rupasov_hash(s)) {
1440 hash = YURA_HASH ;
1441 }
1442 reiserfs_warning(s,"FS seems to be empty, autodetect "
1443 "is using the default hash");
1444 break;
1445 }
1446 r5hash=GET_HASH_VALUE (r5_hash (de.de_name, de.de_namelen));
1447 teahash=GET_HASH_VALUE (keyed_hash (de.de_name, de.de_namelen));
1448 yurahash=GET_HASH_VALUE (yura_hash (de.de_name, de.de_namelen));
1449 if ( ( (teahash == r5hash) && (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash) ) ||
1450 ( (teahash == yurahash) && (yurahash == GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])))) ) ||
1451 ( (r5hash == yurahash) && (yurahash == GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])))) ) ) {
1452 reiserfs_warning(s,"Unable to automatically detect hash function. "
1453 "Please mount with -o hash={tea,rupasov,r5}",
1454 reiserfs_bdevname (s));
1455 hash = UNSET_HASH;
1456 break;
1457 }
1458 if (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])) ) == yurahash)
1459 hash = YURA_HASH;
1460 else if (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])) ) == teahash)
1461 hash = TEA_HASH;
1462 else if (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])) ) == r5hash)
1463 hash = R5_HASH;
1464 else {
1465 reiserfs_warning (s,"Unrecognised hash function");
1466 hash = UNSET_HASH;
1467 }
1468 } while (0);
1469
1470 pathrelse (&path);
1471 return hash;
1472}
1473
1474// finds out which hash function the names are sorted with
1475static int what_hash (struct super_block * s)
1476{
1477 __u32 code;
1478
1479 code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s));
1480
1481 /* reiserfs_hash_detect() == true if any of the hash mount options
1482 ** were used. We must check them to make sure the user isn't
1483 ** using a bad hash value
1484 */
1485 if (code == UNSET_HASH || reiserfs_hash_detect(s))
1486 code = find_hash_out (s);
1487
1488 if (code != UNSET_HASH && reiserfs_hash_detect(s)) {
1489 /* detection has found the hash, and we must check against the
1490 ** mount options
1491 */
1492 if (reiserfs_rupasov_hash(s) && code != YURA_HASH) {
1493 reiserfs_warning (s, "Error, %s hash detected, "
1494 "unable to force rupasov hash", reiserfs_hashname(code)) ;
1495 code = UNSET_HASH ;
1496 } else if (reiserfs_tea_hash(s) && code != TEA_HASH) {
1497 reiserfs_warning (s, "Error, %s hash detected, "
1498 "unable to force tea hash", reiserfs_hashname(code)) ;
1499 code = UNSET_HASH ;
1500 } else if (reiserfs_r5_hash(s) && code != R5_HASH) {
1501 reiserfs_warning (s, "Error, %s hash detected, "
1502 "unable to force r5 hash", reiserfs_hashname(code)) ;
1503 code = UNSET_HASH ;
1504 }
1505 } else {
1506 /* find_hash_out was not called or could not determine the hash */
1507 if (reiserfs_rupasov_hash(s)) {
1508 code = YURA_HASH ;
1509 } else if (reiserfs_tea_hash(s)) {
1510 code = TEA_HASH ;
1511 } else if (reiserfs_r5_hash(s)) {
1512 code = R5_HASH ;
1513 }
1514 }
1515
1516 /* if we are mounted RW, and we have a new valid hash code, update
1517 ** the super
1518 */
1519 if (code != UNSET_HASH &&
1520 !(s->s_flags & MS_RDONLY) &&
1521 code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) {
1522 set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code);
1523 }
1524 return code;
1525}
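/*
 * Detection flow, summarized (illustrative): a 3.6 superblock normally
 * records the hash code, so what_hash() returns it directly; when it is
 * UNSET_HASH (old 3.5 disks) or a hash mount option forces re-detection,
 * find_hash_out() hashes an existing directory entry name with all three
 * functions and picks the one whose GET_HASH_VALUE() matches the hash bits
 * stored in that entry's offset, since those bits were generated by the
 * hash the filesystem was created with.
 */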
1526
1527// returns a pointer to the appropriate hash function
1528static hashf_t hash_function (struct super_block * s)
1529{
1530 switch (what_hash (s)) {
1531 case TEA_HASH:
1532 reiserfs_info (s, "Using tea hash to sort names\n");
1533 return keyed_hash;
1534 case YURA_HASH:
1535 reiserfs_info (s, "Using rupasov hash to sort names\n");
1536 return yura_hash;
1537 case R5_HASH:
1538 reiserfs_info (s, "Using r5 hash to sort names\n");
1539 return r5_hash;
1540 }
1541 return NULL;
1542}
1543
1544// this is used to set up the correct value for old partitions
1545static int function2code (hashf_t func)
1546{
1547 if (func == keyed_hash)
1548 return TEA_HASH;
1549 if (func == yura_hash)
1550 return YURA_HASH;
1551 if (func == r5_hash)
1552 return R5_HASH;
1553
1554 BUG() ; // should never happen
1555
1556 return 0;
1557}
1558
1559#define SWARN(silent, s, ...) \
1560 if (!(silent)) \
1561 reiserfs_warning (s, __VA_ARGS__)
1562
1563static int reiserfs_fill_super (struct super_block * s, void * data, int silent)
1564{
1565 struct inode *root_inode;
1566 int j;
1567 struct reiserfs_transaction_handle th ;
1568 int old_format = 0;
1569 unsigned long blocks;
1570 unsigned int commit_max_age = 0;
1571 int jinit_done = 0 ;
1572 struct reiserfs_iget_args args ;
1573 struct reiserfs_super_block * rs;
1574 char *jdev_name;
1575 struct reiserfs_sb_info *sbi;
1576 int errval = -EINVAL;
1577
1578 sbi = kmalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
1579 if (!sbi) {
1580 errval = -ENOMEM;
1581 goto error;
1582 }
1583 s->s_fs_info = sbi;
1584 memset (sbi, 0, sizeof (struct reiserfs_sb_info));
1585 /* Set default values for options: non-aggressive tails, RO on errors */
1586 REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
1587 REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO);
1588 /* no preallocation minimum, be smart in
1589 reiserfs_file_write instead */
1590 REISERFS_SB(s)->s_alloc_options.preallocmin = 0;
1591 /* Preallocate by 16 blocks (17-1) at once */
1592 REISERFS_SB(s)->s_alloc_options.preallocsize = 17;
1593 /* Initialize the rwsem for xattr dir */
1594 init_rwsem(&REISERFS_SB(s)->xattr_dir_sem);
1595
1596 /* setup default block allocator options */
1597 reiserfs_init_alloc_options(s);
1598
1599 jdev_name = NULL;
1600 if (reiserfs_parse_options (s, (char *) data, &(sbi->s_mount_opt), &blocks, &jdev_name, &commit_max_age) == 0) {
1601 goto error;
1602 }
1603
1604 if (blocks) {
1605 SWARN (silent, s, "jmacd-7: reiserfs_fill_super: resize option "
1606 "for remount only");
1607 goto error;
1608 }
1609
1610 /* try old format (undistributed bitmap, super block in the 8th 1k block of a device) */
1611 if (!read_super_block (s, REISERFS_OLD_DISK_OFFSET_IN_BYTES))
1612 old_format = 1;
1613 /* try new format (64th 1k block), which can contain a reiserfs super block */
1614 else if (read_super_block (s, REISERFS_DISK_OFFSET_IN_BYTES)) {
1615 SWARN(silent, s, "sh-2021: reiserfs_fill_super: can not find reiserfs on %s", reiserfs_bdevname (s));
1616 goto error;
1617 }
1618
1619 rs = SB_DISK_SUPER_BLOCK (s);
1620 /* Let's do a basic sanity check to verify that the underlying device is not
1621 smaller than the filesystem. If the check fails then abort and scream,
1622 because bad stuff will happen otherwise. */
1623 if ( s->s_bdev && s->s_bdev->bd_inode && i_size_read(s->s_bdev->bd_inode) < sb_block_count(rs)*sb_blocksize(rs)) {
1624 SWARN (silent, s, "Filesystem on %s cannot be mounted because it is bigger than the device", reiserfs_bdevname(s));
1625 SWARN(silent, s, "You may need to run fsck or increase size of your LVM partition");
1626 SWARN(silent, s, "Or may be you forgot to reboot after fdisk when it told you to");
1627 goto error;
1628 }
1629
1630 sbi->s_mount_state = SB_REISERFS_STATE(s);
1631 sbi->s_mount_state = REISERFS_VALID_FS ;
1632
1633 if (old_format ? read_old_bitmaps(s) : read_bitmaps(s)) {
1634 SWARN(silent, s, "jmacd-8: reiserfs_fill_super: unable to read bitmap");
1635 goto error;
1636 }
1637#ifdef CONFIG_REISERFS_CHECK
1638 SWARN (silent, s, "CONFIG_REISERFS_CHECK is set ON");
1639 SWARN (silent, s, "- it is slow mode for debugging.");
1640#endif
1641
1642 /* make data=ordered the default */
1643 if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
1644 !reiserfs_data_writeback(s))
1645 {
1646 REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
1647 }
1648
1649 if (reiserfs_data_log(s)) {
1650 reiserfs_info (s, "using journaled data mode\n");
1651 } else if (reiserfs_data_ordered(s)) {
1652 reiserfs_info (s, "using ordered data mode\n");
1653 } else {
1654 reiserfs_info (s, "using writeback data mode\n");
1655 }
1656 if (reiserfs_barrier_flush(s)) {
1657 printk("reiserfs: using flush barriers\n");
1658 }
1659
1660 // set_device_ro(s->s_dev, 1) ;
1661 if( journal_init(s, jdev_name, old_format, commit_max_age) ) {
1662 SWARN(silent, s, "sh-2022: reiserfs_fill_super: unable to initialize journal space") ;
1663 goto error ;
1664 } else {
1665 jinit_done = 1 ; /* once this is set, journal_release must be called
1666 ** if we error out of the mount
1667 */
1668 }
1669 if (reread_meta_blocks(s)) {
1670 SWARN(silent, s, "jmacd-9: reiserfs_fill_super: unable to reread meta blocks after journal init") ;
1671 goto error ;
1672 }
1673
1674 if (replay_only (s))
1675 goto error;
1676
1677 if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) {
1678 SWARN(silent, s, "clm-7000: Detected readonly device, marking FS readonly") ;
1679 s->s_flags |= MS_RDONLY ;
1680 }
1681 args.objectid = REISERFS_ROOT_OBJECTID ;
1682 args.dirid = REISERFS_ROOT_PARENT_OBJECTID ;
1683 root_inode = iget5_locked (s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args));
1684 if (!root_inode) {
1685 SWARN(silent, s, "jmacd-10: reiserfs_fill_super: get root inode failed");
1686 goto error;
1687 }
1688
1689 if (root_inode->i_state & I_NEW) {
1690 reiserfs_read_locked_inode(root_inode, &args);
1691 unlock_new_inode(root_inode);
1692 }
1693
1694 s->s_root = d_alloc_root(root_inode);
1695 if (!s->s_root) {
1696 iput(root_inode);
1697 goto error;
1698 }
1699
1700 // define and initialize hash function
1701 sbi->s_hash_function = hash_function (s);
1702 if (sbi->s_hash_function == NULL) {
1703 dput(s->s_root) ;
1704 s->s_root = NULL ;
1705 goto error ;
1706 }
1707
1708 if (is_reiserfs_3_5 (rs) || (is_reiserfs_jr (rs) && SB_VERSION (s) == REISERFS_VERSION_1))
1709 set_bit(REISERFS_3_5, &(sbi->s_properties));
1710 else
1711 set_bit(REISERFS_3_6, &(sbi->s_properties));
1712
1713 if (!(s->s_flags & MS_RDONLY)) {
1714
1715 errval = journal_begin(&th, s, 1) ;
1716 if (errval) {
1717 dput (s->s_root);
1718 s->s_root = NULL;
1719 goto error;
1720 }
1721 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
1722
1723 set_sb_umount_state( rs, REISERFS_ERROR_FS );
1724 set_sb_fs_state (rs, 0);
1725
1726 if (old_format_only(s)) {
1727 /* filesystem of format 3.5 either with standard or non-standard
1728 journal */
1729 if (convert_reiserfs (s)) {
1730 /* and -o conv is given */
1731 if(!silent)
1732 reiserfs_info (s,"converting 3.5 filesystem to the 3.6 format") ;
1733
1734 if (is_reiserfs_3_5 (rs))
1735 /* put magic string of 3.6 format. 2.2 will not be able to
1736 mount this filesystem anymore */
1737 memcpy (rs->s_v1.s_magic, reiserfs_3_6_magic_string,
1738 sizeof (reiserfs_3_6_magic_string));
1739
1740 set_sb_version(rs,REISERFS_VERSION_2);
1741 reiserfs_convert_objectid_map_v1(s) ;
1742 set_bit(REISERFS_3_6, &(sbi->s_properties));
1743 clear_bit(REISERFS_3_5, &(sbi->s_properties));
1744 } else if (!silent){
1745 reiserfs_info (s, "using 3.5.x disk format\n") ;
1746 }
1747 }
1748
1749 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
1750 errval = journal_end(&th, s, 1) ;
1751 if (errval) {
1752 dput (s->s_root);
1753 s->s_root = NULL;
1754 goto error;
1755 }
1756
1757 if ((errval = reiserfs_xattr_init (s, s->s_flags))) {
1758 dput (s->s_root);
1759 s->s_root = NULL;
1760 goto error;
1761 }
1762
1763 /* look for files which were to be removed in previous session */
1764 finish_unfinished (s);
1765 } else {
1766 if ( old_format_only(s) && !silent) {
1767 reiserfs_info (s, "using 3.5.x disk format\n") ;
1768 }
1769
1770 if ((errval = reiserfs_xattr_init (s, s->s_flags))) {
1771 dput (s->s_root);
1772 s->s_root = NULL;
1773 goto error;
1774 }
1775 }
1776 // mark hash in super block: it could be unset. overwrite should be ok
1777 set_sb_hash_function_code( rs, function2code(sbi->s_hash_function ) );
1778
1779 handle_attrs( s );
1780
1781 reiserfs_proc_info_init( s );
1782
1783 init_waitqueue_head (&(sbi->s_wait));
1784 spin_lock_init(&sbi->bitmap_lock);
1785
1786 return 0;
1787
1788 error:
1789 if (jinit_done) { /* kill the commit thread, free journal ram */
1790 journal_release_error(NULL, s) ;
1791 }
1792 if (SB_DISK_SUPER_BLOCK (s)) {
1793 for (j = 0; j < SB_BMAP_NR (s); j ++) {
1794 if (SB_AP_BITMAP (s))
1795 brelse (SB_AP_BITMAP (s)[j].bh);
1796 }
1797 if (SB_AP_BITMAP (s))
1798 vfree (SB_AP_BITMAP (s));
1799 }
1800 if (SB_BUFFER_WITH_SB (s))
1801 brelse(SB_BUFFER_WITH_SB (s));
1802#ifdef CONFIG_QUOTA
1803 for (j = 0; j < MAXQUOTAS; j++) {
1804 if (sbi->s_qf_names[j])
1805 kfree(sbi->s_qf_names[j]);
1806 }
1807#endif
1808 if (sbi != NULL) {
1809 kfree(sbi);
1810 }
1811
1812 s->s_fs_info = NULL;
1813 return errval;
1814}
1815
1816
1817static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf)
1818{
1819 struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s);
1820
1821 buf->f_namelen = (REISERFS_MAX_NAME (s->s_blocksize));
1822 buf->f_bfree = sb_free_blocks(rs);
1823 buf->f_bavail = buf->f_bfree;
1824 buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1;
1825 buf->f_bsize = s->s_blocksize;
1826 /* changed to accommodate gcc folks.*/
1827 buf->f_type = REISERFS_SUPER_MAGIC;
1828 return 0;
1829}
1830
1831#ifdef CONFIG_QUOTA
1832static int reiserfs_dquot_initialize(struct inode *inode, int type)
1833{
1834 struct reiserfs_transaction_handle th;
1835 int ret;
1836
1837 /* We may create a quota structure, so we need to reserve enough blocks */
1838 reiserfs_write_lock(inode->i_sb);
1839 journal_begin(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS);
1840 ret = dquot_initialize(inode, type);
1841 journal_end(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS);
1842 reiserfs_write_unlock(inode->i_sb);
1843 return ret;
1844}
1845
1846static int reiserfs_dquot_drop(struct inode *inode)
1847{
1848 struct reiserfs_transaction_handle th;
1849 int ret;
1850
1851 /* We may delete a quota structure, so we need to reserve enough blocks */
1852 reiserfs_write_lock(inode->i_sb);
1853 journal_begin(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS);
1854 ret = dquot_drop(inode);
1855 journal_end(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS);
1856 reiserfs_write_unlock(inode->i_sb);
1857 return ret;
1858}
1859
1860static int reiserfs_write_dquot(struct dquot *dquot)
1861{
1862 struct reiserfs_transaction_handle th;
1863 int ret;
1864
1865 reiserfs_write_lock(dquot->dq_sb);
1866 journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS);
1867 ret = dquot_commit(dquot);
1868 journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS);
1869 reiserfs_write_unlock(dquot->dq_sb);
1870 return ret;
1871}
1872
1873static int reiserfs_acquire_dquot(struct dquot *dquot)
1874{
1875 struct reiserfs_transaction_handle th;
1876 int ret;
1877
1878 reiserfs_write_lock(dquot->dq_sb);
1879 journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS);
1880 ret = dquot_acquire(dquot);
1881 journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS);
1882 reiserfs_write_unlock(dquot->dq_sb);
1883 return ret;
1884}
1885
1886static int reiserfs_release_dquot(struct dquot *dquot)
1887{
1888 struct reiserfs_transaction_handle th;
1889 int ret;
1890
1891 reiserfs_write_lock(dquot->dq_sb);
1892 journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS);
1893 ret = dquot_release(dquot);
1894 journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS);
1895 reiserfs_write_unlock(dquot->dq_sb);
1896 return ret;
1897}
1898
1899static int reiserfs_mark_dquot_dirty(struct dquot *dquot)
1900{
1901 /* Are we journalling quotas? */
1902 if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
1903 REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
1904 dquot_mark_dquot_dirty(dquot);
1905 return reiserfs_write_dquot(dquot);
1906 }
1907 else
1908 return dquot_mark_dquot_dirty(dquot);
1909}
1910
1911static int reiserfs_write_info(struct super_block *sb, int type)
1912{
1913 struct reiserfs_transaction_handle th;
1914 int ret;
1915
1916 /* Data block + inode block */
1917 reiserfs_write_lock(sb);
1918 journal_begin(&th, sb, 2);
1919 ret = dquot_commit_info(sb, type);
1920 journal_end(&th, sb, 2);
1921 reiserfs_write_unlock(sb);
1922 return ret;
1923}
1924
1925/*
1926 * Turn on quotas during mount time - we need to find
1927 * the quota file and such...
1928 */
1929static int reiserfs_quota_on_mount(struct super_block *sb, int type)
1930{
1931 int err;
1932 struct dentry *dentry;
1933 struct qstr name = { .name = REISERFS_SB(sb)->s_qf_names[type],
1934 .hash = 0,
1935 .len = strlen(REISERFS_SB(sb)->s_qf_names[type])};
1936
1937 dentry = lookup_hash(&name, sb->s_root);
1938 if (IS_ERR(dentry))
1939 return PTR_ERR(dentry);
1940 err = vfs_quota_on_mount(type, REISERFS_SB(sb)->s_jquota_fmt, dentry);
1941 /* Now invalidate and put the dentry - the quota code got its own reference
1942 * to the inode, and the dentry has at least a wrong hash, so we had better
1943 * throw it away */
1944 d_invalidate(dentry);
1945 dput(dentry);
1946 return err;
1947}
1948
1949/*
1950 * Standard function to be called on quota_on
1951 */
1952static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, char *path)
1953{
1954 int err;
1955 struct nameidata nd;
1956
1957 err = path_lookup(path, LOOKUP_FOLLOW, &nd);
1958 if (err)
1959 return err;
1960 /* Quotafile not on the same filesystem? */
1961 if (nd.mnt->mnt_sb != sb) {
1962 path_release(&nd);
1963 return -EXDEV;
1964 }
1965 /* We must not pack tails for quota files on reiserfs for quota IO to work */
1966 if (!(REISERFS_I(nd.dentry->d_inode)->i_flags & i_nopack_mask)) {
1967 reiserfs_warning(sb, "reiserfs: Quota file must have tail packing disabled.");
1968 path_release(&nd);
1969 return -EINVAL;
1970 }
1971 /* Not journalling quota? No more tests needed... */
1972 if (!REISERFS_SB(sb)->s_qf_names[USRQUOTA] &&
1973 !REISERFS_SB(sb)->s_qf_names[GRPQUOTA]) {
1974 path_release(&nd);
1975 return vfs_quota_on(sb, type, format_id, path);
1976 }
1977 /* Quotafile not in the fs root? */
1978 if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode)
1979 reiserfs_warning(sb, "reiserfs: Quota file not on filesystem root. "
1980 "Journalled quota will not work.");
1981 path_release(&nd);
1982 return vfs_quota_on(sb, type, format_id, path);
1983}
1984
1985/* Read data from quotafile - avoid pagecache and such because we cannot afford
1986 * acquiring the locks... As quota files are never truncated and the quota code
1987 * itself serializes the operations (and no one else should touch the files)
1988 * we don't have to be afraid of races */
1989static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
1990 size_t len, loff_t off)
1991{
1992 struct inode *inode = sb_dqopt(sb)->files[type];
1993 unsigned long blk = off >> sb->s_blocksize_bits;
1994 int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
1995 size_t toread;
1996 struct buffer_head tmp_bh, *bh;
1997 loff_t i_size = i_size_read(inode);
1998
1999 if (off > i_size)
2000 return 0;
2001 if (off+len > i_size)
2002 len = i_size-off;
2003 toread = len;
2004 while (toread > 0) {
2005 tocopy = sb->s_blocksize - offset < toread ? sb->s_blocksize - offset : toread;
2006 tmp_bh.b_state = 0;
2007 /* Quota files are without tails so we can safely use this function */
2008 reiserfs_write_lock(sb);
2009 err = reiserfs_get_block(inode, blk, &tmp_bh, 0);
2010 reiserfs_write_unlock(sb);
2011 if (err)
2012 return err;
2013 if (!buffer_mapped(&tmp_bh)) /* A hole? */
2014 memset(data, 0, tocopy);
2015 else {
2016 bh = sb_bread(sb, tmp_bh.b_blocknr);
2017 if (!bh)
2018 return -EIO;
2019 memcpy(data, bh->b_data+offset, tocopy);
2020 brelse(bh);
2021 }
2022 offset = 0;
2023 toread -= tocopy;
2024 data += tocopy;
2025 blk++;
2026 }
2027 return len;
2028}
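/*
 * A small worked example of the offset arithmetic above (illustrative,
 * assuming 4k blocks): a read of len = 100 at off = 5000 gives
 * blk = 5000 >> 12 = 1 and offset = 5000 & 4095 = 904, so tocopy =
 * min(4096 - 904, 100) = 100 and a single block read satisfies the request;
 * a read that straddles a block boundary just loops again with offset
 * reset to 0.
 */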
2029
2030/* Write to quotafile (we know the transaction is already started and has
2031 * enough credits) */
2032static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
2033 const char *data, size_t len, loff_t off)
2034{
2035 struct inode *inode = sb_dqopt(sb)->files[type];
2036 unsigned long blk = off >> sb->s_blocksize_bits;
2037 int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
2038 int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL;
2039 size_t towrite = len;
2040 struct buffer_head tmp_bh, *bh;
2041
2042 down(&inode->i_sem);
2043 while (towrite > 0) {
2044 tocopy = sb->s_blocksize - offset < towrite ?
2045 sb->s_blocksize - offset : towrite;
2046 tmp_bh.b_state = 0;
2047 err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
2048 if (err)
2049 goto out;
2050 if (offset || tocopy != sb->s_blocksize)
2051 bh = sb_bread(sb, tmp_bh.b_blocknr);
2052 else
2053 bh = sb_getblk(sb, tmp_bh.b_blocknr);
2054 if (!bh) {
2055 err = -EIO;
2056 goto out;
2057 }
2058 lock_buffer(bh);
2059 memcpy(bh->b_data+offset, data, tocopy);
2060 flush_dcache_page(bh->b_page);
2061 set_buffer_uptodate(bh);
2062 unlock_buffer(bh);
2063 reiserfs_prepare_for_journal(sb, bh, 1);
2064 journal_mark_dirty(current->journal_info, sb, bh);
2065 if (!journal_quota)
2066 reiserfs_add_ordered_list(inode, bh);
2067 brelse(bh);
2068 offset = 0;
2069 towrite -= tocopy;
2070 data += tocopy;
2071 blk++;
2072 }
2073out:
2074 if (len == towrite)
2075 return err;
2076 if (inode->i_size < off+len-towrite)
2077 i_size_write(inode, off+len-towrite);
2078 inode->i_version++;
2079 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2080 mark_inode_dirty(inode);
2081 up(&inode->i_sem);
2082 return len - towrite;
2083}
2084
2085#endif
2086
2087static struct super_block*
2088get_super_block (struct file_system_type *fs_type, int flags,
2089 const char *dev_name, void *data)
2090{
2091 return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
2092}
2093
2094static int __init
2095init_reiserfs_fs ( void )
2096{
2097 int ret;
2098
2099 if ((ret = init_inodecache ())) {
2100 return ret;
2101 }
2102
2103 if ((ret = reiserfs_xattr_register_handlers ()))
2104 goto failed_reiserfs_xattr_register_handlers;
2105
2106 reiserfs_proc_info_global_init ();
2107 reiserfs_proc_register_global ("version", reiserfs_global_version_in_proc);
2108
2109 ret = register_filesystem (& reiserfs_fs_type);
2110
2111 if (ret == 0) {
2112 return 0;
2113 }
2114
2115 reiserfs_xattr_unregister_handlers ();
2116
2117failed_reiserfs_xattr_register_handlers:
2118 reiserfs_proc_unregister_global ("version");
2119 reiserfs_proc_info_global_done ();
2120 destroy_inodecache ();
2121
2122 return ret;
2123}
2124
2125static void __exit
2126exit_reiserfs_fs ( void )
2127{
2128 reiserfs_xattr_unregister_handlers ();
2129 reiserfs_proc_unregister_global ("version");
2130 reiserfs_proc_info_global_done ();
2131 unregister_filesystem (& reiserfs_fs_type);
2132 destroy_inodecache ();
2133}
2134
2135struct file_system_type reiserfs_fs_type = {
2136 .owner = THIS_MODULE,
2137 .name = "reiserfs",
2138 .get_sb = get_super_block,
2139 .kill_sb = kill_block_super,
2140 .fs_flags = FS_REQUIRES_DEV,
2141};
2142
2143MODULE_DESCRIPTION ("ReiserFS journaled filesystem");
2144MODULE_AUTHOR ("Hans Reiser <reiser@namesys.com>");
2145MODULE_LICENSE ("GPL");
2146
2147module_init (init_reiserfs_fs);
2148module_exit (exit_reiserfs_fs);
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
new file mode 100644
index 000000000000..6191909d5165
--- /dev/null
+++ b/fs/reiserfs/tail_conversion.c
@@ -0,0 +1,276 @@
1/*
2 * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright details
3 */
4
5#include <linux/config.h>
6#include <linux/time.h>
7#include <linux/pagemap.h>
8#include <linux/buffer_head.h>
9#include <linux/reiserfs_fs.h>
10
11/* access to the tail: before reading the tail, the caller must make sure that
12 no conversion is running; direct2indirect and indirect2direct can not run concurrently */
13
14
15/* Converts direct items to an unformatted node. Panics if file has no
16 tail. -ENOSPC if no disk space for conversion */
17/* path points to the first direct item of the file regardless of how many of
18 them there are */
19int direct2indirect (struct reiserfs_transaction_handle *th, struct inode * inode,
20 struct path * path, struct buffer_head * unbh,
21 loff_t tail_offset)
22{
23 struct super_block * sb = inode->i_sb;
24 struct buffer_head *up_to_date_bh ;
25 struct item_head * p_le_ih = PATH_PITEM_HEAD (path);
26 unsigned long total_tail = 0 ;
27 struct cpu_key end_key; /* Key to search for the last byte of the
28 converted item. */
29 struct item_head ind_ih; /* new indirect item to be inserted or
30 key of unfm pointer to be pasted */
31 int n_blk_size,
32 n_retval; /* returned value for reiserfs_insert_item and clones */
33 unp_t unfm_ptr; /* Handle on an unformatted node
34 that will be inserted in the
35 tree. */
36
37 BUG_ON (!th->t_trans_id);
38
39 REISERFS_SB(sb)->s_direct2indirect ++;
40
41 n_blk_size = sb->s_blocksize;
42
43 /* and key to search for append or insert pointer to the new
44 unformatted node. */
45 copy_item_head (&ind_ih, p_le_ih);
46 set_le_ih_k_offset (&ind_ih, tail_offset);
47 set_le_ih_k_type (&ind_ih, TYPE_INDIRECT);
48
49 /* Set the key to search for the place for new unfm pointer */
50 make_cpu_key (&end_key, inode, tail_offset, TYPE_INDIRECT, 4);
51
52 // FIXME: we could avoid this
53 if ( search_for_position_by_key (sb, &end_key, path) == POSITION_FOUND ) {
54 reiserfs_warning (sb, "PAP-14030: direct2indirect: "
55 "pasted or inserted byte exists in the tree %K. "
56 "Use fsck to repair.", &end_key);
57 pathrelse(path);
58 return -EIO;
59 }
60
61 p_le_ih = PATH_PITEM_HEAD (path);
62
63 unfm_ptr = cpu_to_le32 (unbh->b_blocknr);
64
65 if ( is_statdata_le_ih (p_le_ih) ) {
66 /* Insert new indirect item. */
67 set_ih_free_space (&ind_ih, 0); /* to be deleted in the near future */
68 put_ih_item_len( &ind_ih, UNFM_P_SIZE );
69 PATH_LAST_POSITION (path)++;
70 n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih, inode,
71 (char *)&unfm_ptr);
72 } else {
73 /* Paste into last indirect item of an object. */
74 n_retval = reiserfs_paste_into_item(th, path, &end_key, inode,
75 (char *)&unfm_ptr, UNFM_P_SIZE);
76 }
77 if ( n_retval ) {
78 return n_retval;
79 }
80
81 // note: from here on there are two keys which have matching first
82 // three key components. They only differ by the fourth one.
83
84
85 /* Set the key to search for the direct items of the file */
86 make_cpu_key (&end_key, inode, max_reiserfs_offset (inode), TYPE_DIRECT, 4);
87
88 /* Move bytes from the direct items to the new unformatted node
89 and delete them. */
90 while (1) {
91 int tail_size;
92
93 /* end_key.k_offset is set so that we will always find the
94 last item of the file */
95 if ( search_for_position_by_key (sb, &end_key, path) == POSITION_FOUND )
96 reiserfs_panic (sb, "PAP-14050: direct2indirect: "
97 "direct item (%K) not found", &end_key);
98 p_le_ih = PATH_PITEM_HEAD (path);
99 RFALSE( !is_direct_le_ih (p_le_ih),
100 "vs-14055: direct item expected(%K), found %h",
101 &end_key, p_le_ih);
102 tail_size = (le_ih_k_offset (p_le_ih) & (n_blk_size - 1))
103 + ih_item_len(p_le_ih) - 1;
104
105 /* we only send the unbh pointer if the buffer is not up to date.
106 ** this avoids overwriting good data from writepage() with old data
107 ** from the disk or buffer cache
108 ** Special case: unbh->b_page will be NULL if we are coming through
109 ** DIRECT_IO handler here.
110 */
111 if (!unbh->b_page || buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) {
112 up_to_date_bh = NULL ;
113 } else {
114 up_to_date_bh = unbh ;
115 }
116 n_retval = reiserfs_delete_item (th, path, &end_key, inode,
117 up_to_date_bh) ;
118
119 total_tail += n_retval ;
120 if (tail_size == n_retval)
121 // done: file does not have direct items anymore
122 break;
123
124 }
125 /* if we've copied bytes from disk into the page, we need to zero
126 ** out the unused part of the block (it was not up to date before)
127 */
128 if (up_to_date_bh) {
129 unsigned pgoff = (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1);
130 char *kaddr=kmap_atomic(up_to_date_bh->b_page, KM_USER0);
131 memset(kaddr + pgoff, 0, n_blk_size - total_tail) ;
132 kunmap_atomic(kaddr, KM_USER0);
133 }
134
135 REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
136
137 return 0;
138}
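/*
 * Tail-size arithmetic from the loop above, worked through (illustrative,
 * assuming a 4k block and a tail that begins at a block boundary): if the
 * last direct item has key offset 4097 (offsets are 1-based) and
 * ih_item_len = 904, then tail_size = (4097 & 4095) + 904 - 1 = 904, i.e.
 * the whole 904-byte tail is deleted in one pass and the loop exits.
 */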
139
140
141/* stolen from fs/buffer.c */
142void reiserfs_unmap_buffer(struct buffer_head *bh) {
143 lock_buffer(bh) ;
144 if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
145 BUG() ;
146 }
147 clear_buffer_dirty(bh) ;
148 /* Remove the buffer from whatever list it belongs to. We are mostly
149 interested in removing it from the per-sb j_dirty_buffers list, to avoid a
150 BUG() on an attempt to write a buffer that is not mapped */
151 if ( (!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) {
152 struct inode *inode = bh->b_page->mapping->host;
153 struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
154 spin_lock(&j->j_dirty_buffers_lock);
155 list_del_init(&bh->b_assoc_buffers);
156 reiserfs_free_jh(bh);
157 spin_unlock(&j->j_dirty_buffers_lock);
158 }
159 clear_buffer_mapped(bh) ;
160 clear_buffer_req(bh) ;
161 clear_buffer_new(bh);
162 bh->b_bdev = NULL;
163 unlock_buffer(bh) ;
164}
165
166/* this first locks the inode (neither reads nor syncs are permitted),
167 reads the tail through the page cache and inserts a direct item. When the
168 direct item is inserted successfully the inode is left locked. The return
169 value is always what we expect from it (the number of cut bytes). But when
170 the tail remains in the unformatted node, we set mode to SKIP_BALANCING
171 and unlock the inode */
172int indirect2direct (struct reiserfs_transaction_handle *th,
173 struct inode * p_s_inode,
174 struct page *page,
175 struct path * p_s_path, /* path to the indirect item. */
176 const struct cpu_key * p_s_item_key, /* Key to look for unformatted node pointer to be cut. */
177 loff_t n_new_file_size, /* New file size. */
178 char * p_c_mode)
179{
180 struct super_block * p_s_sb = p_s_inode->i_sb;
181 struct item_head s_ih;
182 unsigned long n_block_size = p_s_sb->s_blocksize;
183 char * tail;
184 int tail_len, round_tail_len;
185 loff_t pos, pos1; /* position of first byte of the tail */
186 struct cpu_key key;
187
188 BUG_ON (!th->t_trans_id);
189
190 REISERFS_SB(p_s_sb)->s_indirect2direct ++;
191
192 *p_c_mode = M_SKIP_BALANCING;
193
194 /* store the item head that path points to. */
195 copy_item_head (&s_ih, PATH_PITEM_HEAD(p_s_path));
196
197 tail_len = (n_new_file_size & (n_block_size - 1));
198 if (get_inode_sd_version (p_s_inode) == STAT_DATA_V2)
199 round_tail_len = ROUND_UP (tail_len);
200 else
201 round_tail_len = tail_len;
202
203 pos = le_ih_k_offset (&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - 1) * p_s_sb->s_blocksize;
204 pos1 = pos;
205
206 // we are protected by i_sem. The tail can not disappear, nor
207 // can an append be done either.
208 // we are in truncate or packing the tail in file_release
209
210 tail = (char *)kmap(page) ; /* this can schedule */
211
212 if (path_changed (&s_ih, p_s_path)) {
213 /* re-search indirect item */
214 if ( search_for_position_by_key (p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND )
215 reiserfs_panic(p_s_sb, "PAP-5520: indirect2direct: "
216 "item to be converted %K does not exist", p_s_item_key);
217 copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
218#ifdef CONFIG_REISERFS_CHECK
219 pos = le_ih_k_offset (&s_ih) - 1 +
220 (ih_item_len(&s_ih) / UNFM_P_SIZE - 1) * p_s_sb->s_blocksize;
221 if (pos != pos1)
222 reiserfs_panic (p_s_sb, "vs-5530: indirect2direct: "
223 "tail position changed while we were reading it");
224#endif
225 }
226
227
228 /* Set direct item header to insert. */
229 make_le_item_head (&s_ih, NULL, get_inode_item_key_version (p_s_inode), pos1 + 1,
230 TYPE_DIRECT, round_tail_len, 0xffff/*ih_free_space*/);
231
232 /* we want a pointer to the first byte of the tail in the page.
233 ** the page was locked and this part of the page was up to date when
234 ** indirect2direct was called, so we know the bytes are still valid
235 */
236 tail = tail + (pos & (PAGE_CACHE_SIZE - 1)) ;
237
238 PATH_LAST_POSITION(p_s_path)++;
239
240 key = *p_s_item_key;
241 set_cpu_key_k_type (&key, TYPE_DIRECT);
242 key.key_length = 4;
243 /* Insert tail as new direct item in the tree */
244 if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih, p_s_inode,
245 tail ? tail : NULL) < 0 ) {
246 /* No disk space. So we can not convert the last unformatted node
247 to a direct item. In this case we used to adjust the
248 indirect item's ih_free_space. Now ih_free_space is not
249 used; it would be ideal to write zeros to the corresponding
250 unformatted node. For now i_size is treated as a guard against
251 reading past the end of the file */
252 kunmap(page) ;
253 return n_block_size - round_tail_len;
254 }
255 kunmap(page) ;
256
257 /* make sure to get the i_blocks changes from reiserfs_insert_item */
258 reiserfs_update_sd(th, p_s_inode);
259
260 // note: we now have the same situation as in the direct2indirect
261 // conversion above: there are two keys which have matching first three
262 // key components. They only differ by the fourth one.
263
264 /* We have inserted new direct item and must remove last
265 unformatted node. */
266 *p_c_mode = M_CUT;
267
268 /* we store position of first direct item in the in-core inode */
269 //mark_file_with_tail (p_s_inode, pos1 + 1);
270 REISERFS_I(p_s_inode)->i_first_direct_byte = pos1 + 1;
271
272 return n_block_size - round_tail_len;
273}
274
275
276
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
new file mode 100644
index 000000000000..45582fe8b466
--- /dev/null
+++ b/fs/reiserfs/xattr.c
@@ -0,0 +1,1450 @@
1/*
2 * linux/fs/reiserfs/xattr.c
3 *
4 * Copyright (c) 2002 by Jeff Mahoney, <jeffm@suse.com>
5 *
6 */
7
8/*
9 * In order to implement EA/ACLs in a clean, backwards compatible manner,
10 * they are implemented as files in a "private" directory.
11 * Each EA is in its own file, with the directory layout like so (/ is assumed
12 * to be relative to fs root). Inside the /.reiserfs_priv/xattrs directory,
13 * directories named using the capital-hex form of the objectid and
14 * generation number are used. Inside each directory are individual files
15 * named with the name of the extended attribute.
16 *
17 * So, for objectid 12648430, we could have:
18 * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access
19 * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default
20 * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type
21 * .. or similar.
22 *
23 * The file contents are the text of the EA. The size is known based on the
24 * stat data describing the file.
25 *
26 * In the case of system.posix_acl_access and system.posix_acl_default, since
27 * these are special cases for filesystem ACLs, they are interpreted by the
28 * kernel, in addition, they are negatively and positively cached and attached
29 * to the inode so that unnecessary lookups are avoided.
30 */
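/*
 * Name construction sketch (illustrative, not part of the original source):
 * the per-inode directory name is the objectid and generation in capital
 * hex, exactly as open_xa_dir() below builds it.
 */
#if 0
    char namebuf[17];
    /* objectid 12648430 == 0xC0FFEE, generation 0 */
    snprintf (namebuf, sizeof (namebuf), "%X.%X", 12648430, 0);
    /* namebuf now holds "C0FFEE.0" */
#endif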
31
32#include <linux/reiserfs_fs.h>
33#include <linux/dcache.h>
34#include <linux/namei.h>
35#include <linux/errno.h>
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/pagemap.h>
39#include <linux/xattr.h>
40#include <linux/reiserfs_xattr.h>
41#include <linux/reiserfs_acl.h>
42#include <linux/mbcache.h>
43#include <asm/uaccess.h>
44#include <asm/checksum.h>
45#include <linux/smp_lock.h>
46#include <linux/stat.h>
47#include <asm/semaphore.h>
48
49#define FL_READONLY 128
50#define FL_DIR_SEM_HELD 256
51#define PRIVROOT_NAME ".reiserfs_priv"
52#define XAROOT_NAME "xattrs"
53
54static struct reiserfs_xattr_handler *find_xattr_handler_prefix (const char *prefix);
55
56static struct dentry *
57create_xa_root (struct super_block *sb)
58{
59 struct dentry *privroot = dget (REISERFS_SB(sb)->priv_root);
60 struct dentry *xaroot;
61
62 /* This needs to be created at mount-time */
63 if (!privroot)
64 return ERR_PTR(-EOPNOTSUPP);
65
66 xaroot = lookup_one_len (XAROOT_NAME, privroot, strlen (XAROOT_NAME));
67 if (IS_ERR (xaroot)) {
68 goto out;
69 } else if (!xaroot->d_inode) {
70 int err;
71 down (&privroot->d_inode->i_sem);
72 err = privroot->d_inode->i_op->mkdir (privroot->d_inode, xaroot, 0700);
73 up (&privroot->d_inode->i_sem);
74
75 if (err) {
76 dput (xaroot);
77 dput (privroot);
78 return ERR_PTR (err);
79 }
80 REISERFS_SB(sb)->xattr_root = dget (xaroot);
81 }
82
83out:
84 dput (privroot);
85 return xaroot;
86}
87
88/* This will return a dentry, or an error, referring to the xa root directory.
89 * If the xa root doesn't exist yet, the dentry will be returned without
90 * an associated inode. This dentry can be used with ->mkdir to create
91 * the xa directory. */
92static struct dentry *
93__get_xa_root (struct super_block *s)
94{
95 struct dentry *privroot = dget (REISERFS_SB(s)->priv_root);
96 struct dentry *xaroot = NULL;
97
98 if (IS_ERR (privroot) || !privroot)
99 return privroot;
100
101 xaroot = lookup_one_len (XAROOT_NAME, privroot, strlen (XAROOT_NAME));
102 if (IS_ERR (xaroot)) {
103 goto out;
104 } else if (!xaroot->d_inode) {
105 dput (xaroot);
106 xaroot = NULL;
107 goto out;
108 }
109
110 REISERFS_SB(s)->xattr_root = dget (xaroot);
111
112out:
113 dput (privroot);
114 return xaroot;
115}
116
117/* Returns the dentry (or NULL) referring to the root of the extended
118 * attribute directory tree. If it has already been retrieved, it is used.
119 * Otherwise, we attempt to retrieve it from disk. It may also return
120 * a pointer-encoded error.
121 */
122static inline struct dentry *
123get_xa_root (struct super_block *s)
124{
125 struct dentry *dentry = dget (REISERFS_SB(s)->xattr_root);
126
127 if (!dentry)
128 dentry = __get_xa_root (s);
129
130 return dentry;
131}
132
133/* Opens the directory corresponding to the inode's extended attribute store.
134 * If flags allow, the path to the directory may be created. If creation is
135 * prohibited, -ENODATA is returned. */
136static struct dentry *
137open_xa_dir (const struct inode *inode, int flags)
138{
139 struct dentry *xaroot, *xadir;
140 char namebuf[17];
141
142 xaroot = get_xa_root (inode->i_sb);
143 if (IS_ERR (xaroot)) {
144 return xaroot;
145 } else if (!xaroot) {
146 if (flags == 0 || flags & XATTR_CREATE) {
147 xaroot = create_xa_root (inode->i_sb);
148 if (IS_ERR (xaroot))
149 return xaroot;
150 }
151 if (!xaroot)
152 return ERR_PTR (-ENODATA);
153 }
154
155 /* ok, we have xaroot open */
156
157 snprintf (namebuf, sizeof (namebuf), "%X.%X",
158 le32_to_cpu (INODE_PKEY (inode)->k_objectid),
159 inode->i_generation);
160 xadir = lookup_one_len (namebuf, xaroot, strlen (namebuf));
161 if (IS_ERR (xadir)) {
162 dput (xaroot);
163 return xadir;
164 }
165
166 if (!xadir->d_inode) {
167 int err;
168 if (flags == 0 || flags & XATTR_CREATE) {
169 /* Although there is nothing else trying to create this directory,
170 * another directory with the same hash may be created, so we need
171 * to protect against that */
172 err = xaroot->d_inode->i_op->mkdir (xaroot->d_inode, xadir, 0700);
173 if (err) {
174 dput (xaroot);
175 dput (xadir);
176 return ERR_PTR (err);
177 }
178 }
179 if (!xadir->d_inode) {
180 dput (xaroot);
181 dput (xadir);
182 return ERR_PTR (-ENODATA);
183 }
184 }
185
186 dput (xaroot);
187 return xadir;
188}
189
190/* Returns a dentry corresponding to a specific extended attribute file
191 * for the inode. If flags allow, the file is created. Otherwise, a
192 * valid or negative dentry, or an error is returned. */
193static struct dentry *
194get_xa_file_dentry (const struct inode *inode, const char *name, int flags)
195{
196 struct dentry *xadir, *xafile;
197 int err = 0;
198
199 xadir = open_xa_dir (inode, flags);
200 if (IS_ERR (xadir)) {
201 return ERR_PTR (PTR_ERR (xadir));
202 } else if (xadir && !xadir->d_inode) {
203 dput (xadir);
204 return ERR_PTR (-ENODATA);
205 }
206
207 xafile = lookup_one_len (name, xadir, strlen (name));
208 if (IS_ERR (xafile)) {
209 dput (xadir);
210 return ERR_PTR (PTR_ERR (xafile));
211 }
212
213 if (xafile->d_inode) { /* file exists */
214 if (flags & XATTR_CREATE) {
215 err = -EEXIST;
216 dput (xafile);
217 goto out;
218 }
219 } else if (flags & XATTR_REPLACE || flags & FL_READONLY) {
220 goto out;
221 } else {
222 /* inode->i_sem is down, so nothing else can try to create
223 * the same xattr */
224 err = xadir->d_inode->i_op->create (xadir->d_inode, xafile,
225 0700|S_IFREG, NULL);
226
227 if (err) {
228 dput (xafile);
229 goto out;
230 }
231 }
232
233out:
234 dput (xadir);
235 if (err)
236 xafile = ERR_PTR (err);
237 return xafile;
238}
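The interplay of XATTR_CREATE, XATTR_REPLACE, and FL_READONLY in get_xa_file_dentry above amounts to a small decision table. A hypothetical userspace model of just that decision (the XATTR_* values are the generic <linux/xattr.h> ones; xa_open_action is an invented name):

#include <stdio.h>

#define XATTR_CREATE	0x1	/* fail if the attribute already exists */
#define XATTR_REPLACE	0x2	/* fail if the attribute does not exist */
#define FL_READONLY	128	/* this file's lookup-only flag */

/* Mirror the branches in get_xa_file_dentry. */
static const char *xa_open_action(int exists, int flags)
{
	if (exists)
		return (flags & XATTR_CREATE) ? "fail with -EEXIST"
					      : "open the existing file";
	if (flags & (XATTR_REPLACE | FL_READONLY))
		return "return the negative dentry (no create)";
	return "create a 0700 regular file";
}

int main(void)
{
	printf("%s\n", xa_open_action(1, XATTR_CREATE));	/* -EEXIST */
	printf("%s\n", xa_open_action(0, 0));			/* create */
	return 0;
}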
239
240
241/* Opens a struct file for the attribute associated with the inode */
242static struct file *
243open_xa_file (const struct inode *inode, const char *name, int flags)
244{
245 struct dentry *xafile;
246 struct file *fp;
247
248 xafile = get_xa_file_dentry (inode, name, flags);
249 if (IS_ERR (xafile))
250 return ERR_PTR (PTR_ERR (xafile));
251 else if (!xafile->d_inode) {
252 dput (xafile);
253 return ERR_PTR (-ENODATA);
254 }
255
256 fp = dentry_open (xafile, NULL, O_RDWR);
257 /* dentry_open dputs the dentry if it fails */
258
259 return fp;
260}
261
262
263/*
264 * this is very similar to fs/reiserfs/dir.c:reiserfs_readdir, but
265 * we need to drop the path before calling the filldir callback. That
266 * would be a big performance hit to the non-xattr case, so I've copied
267 * the whole thing for now. --clm
268 *
269 * the big difference is that I go backwards through the directory,
270 * and don't mess with f->f_pos, but the idea is the same. Do some
271 * action on each and every entry in the directory.
272 *
273 * we're called with i_sem held, so there are no worries about the directory
274 * changing underneath us.
275 */
276static int __xattr_readdir(struct file * filp, void * dirent, filldir_t filldir)
277{
278 struct inode *inode = filp->f_dentry->d_inode;
279 struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */
280 INITIALIZE_PATH (path_to_entry);
281 struct buffer_head * bh;
282 int entry_num;
283 struct item_head * ih, tmp_ih;
284 int search_res;
285 char * local_buf;
286 loff_t next_pos;
287 char small_buf[32] ; /* avoid kmalloc if we can */
288 struct reiserfs_de_head *deh;
289 int d_reclen;
290 char * d_name;
291 off_t d_off;
292 ino_t d_ino;
293 struct reiserfs_dir_entry de;
294
295
296    /* start the walk from the largest possible directory offset; we
297       iterate backwards and do not touch f_pos (see the comment above) */
298 next_pos = max_reiserfs_offset(inode);
299
300 while (1) {
301research:
302 if (next_pos <= DOT_DOT_OFFSET)
303 break;
304 make_cpu_key (&pos_key, inode, next_pos, TYPE_DIRENTRY, 3);
305
306 search_res = search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, &de);
307 if (search_res == IO_ERROR) {
308 // FIXME: we could just skip the part of the directory
309 // which could not be read
310 pathrelse(&path_to_entry);
311 return -EIO;
312 }
313
314 if (search_res == NAME_NOT_FOUND)
315 de.de_entry_num--;
316
317 set_de_name_and_namelen(&de);
318 entry_num = de.de_entry_num;
319 deh = &(de.de_deh[entry_num]);
320
321 bh = de.de_bh;
322 ih = de.de_ih;
323
324 if (!is_direntry_le_ih(ih)) {
325 reiserfs_warning(inode->i_sb, "not direntry %h", ih);
326 break;
327 }
328 copy_item_head(&tmp_ih, ih);
329
330 /* we must have found an item, i.e. an item of this directory */
331 RFALSE( COMP_SHORT_KEYS (&(ih->ih_key), &pos_key),
332 "vs-9000: found item %h does not match to dir we readdir %K",
333 ih, &pos_key);
334
335 if (deh_offset(deh) <= DOT_DOT_OFFSET) {
336 break;
337 }
338
339 /* look for the previous entry in the directory */
340 next_pos = deh_offset (deh) - 1;
341
342 if (!de_visible (deh))
343 /* it is hidden entry */
344 continue;
345
346 d_reclen = entry_length(bh, ih, entry_num);
347 d_name = B_I_DEH_ENTRY_FILE_NAME (bh, ih, deh);
348 d_off = deh_offset (deh);
349 d_ino = deh_objectid (deh);
350
351 if (!d_name[d_reclen - 1])
352 d_reclen = strlen (d_name);
353
354 if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)){
355 /* too big to send back to VFS */
356 continue ;
357 }
358
359 /* Ignore the .reiserfs_priv entry */
360 if (reiserfs_xattrs (inode->i_sb) &&
361 !old_format_only(inode->i_sb) &&
362 deh_objectid (deh) == le32_to_cpu (INODE_PKEY(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->k_objectid))
363 continue;
364
365 if (d_reclen <= 32) {
366 local_buf = small_buf ;
367 } else {
368 local_buf = reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb) ;
369 if (!local_buf) {
370 pathrelse (&path_to_entry);
371 return -ENOMEM ;
372 }
373 if (item_moved (&tmp_ih, &path_to_entry)) {
374 reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ;
375
376 /* sigh, must retry. Do this same offset again */
377 next_pos = d_off;
378 goto research;
379 }
380 }
381
382 // Note that we copy the name to user space via a temporary
383 // buffer (local_buf) because filldir may block if the
384 // user-space buffer is swapped out; meanwhile the
385 // entry can move somewhere else
386 memcpy (local_buf, d_name, d_reclen);
387
388 /* the filldir function might need to start transactions,
389 * or do who knows what. Release the path now that we've
390 * copied all the important stuff out of the deh
391 */
392 pathrelse (&path_to_entry);
393
394 if (filldir (dirent, local_buf, d_reclen, d_off, d_ino,
395 DT_UNKNOWN) < 0) {
396 if (local_buf != small_buf) {
397 reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ;
398 }
399 goto end;
400 }
401 if (local_buf != small_buf) {
402 reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ;
403 }
404 } /* while */
405
406end:
407 pathrelse (&path_to_entry);
408 return 0;
409}
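In outline, the walk above starts at the largest possible offset, searches by key, visits the entry found, and continues from one less than its offset, so a released path never matters. A toy model of that descending walk over a plain sorted array (the offsets are made up):

#include <stdio.h>

/* Find the rightmost entry whose offset is <= pos, as the key search
 * does when NAME_NOT_FOUND steps back one entry. */
static int find_le(const unsigned *offs, int n, unsigned pos)
{
	int i, best = -1;

	for (i = 0; i < n; i++)
		if (offs[i] <= pos)
			best = i;
	return best;
}

int main(void)
{
	const unsigned offs[] = { 1, 2, 117, 4093, 88210 }; /* ".", "..", 3 names */
	unsigned pos = ~0u;	/* stands in for max_reiserfs_offset() */
	int i;

	while ((i = find_le(offs, 5, pos)) >= 0 && offs[i] > 2) {
		printf("visit entry at offset %u\n", offs[i]);
		pos = offs[i] - 1;	/* step to the previous entry */
	}
	return 0;	/* loop ends at "." and ".." (offsets 1 and 2) */
}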
410
411/*
412 * this could be done with dedicated readdir ops for the xattr files,
413 * but I want to get something working asap
414 * this is stolen from vfs_readdir
415 *
416 */
417static
418int xattr_readdir(struct file *file, filldir_t filler, void *buf)
419{
420 struct inode *inode = file->f_dentry->d_inode;
421 int res = -ENOTDIR;
422 if (!file->f_op || !file->f_op->readdir)
423 goto out;
424 down(&inode->i_sem);
425// down(&inode->i_zombie);
426 res = -ENOENT;
427 if (!IS_DEADDIR(inode)) {
428 lock_kernel();
429 res = __xattr_readdir(file, buf, filler);
430 unlock_kernel();
431 }
432// up(&inode->i_zombie);
433 up(&inode->i_sem);
434out:
435 return res;
436}
437
438
439/* Internal operations on file data */
440static inline void
441reiserfs_put_page(struct page *page)
442{
443 kunmap(page);
444 page_cache_release(page);
445}
446
447static struct page *
448reiserfs_get_page(struct inode *dir, unsigned long n)
449{
450 struct address_space *mapping = dir->i_mapping;
451 struct page *page;
452 /* We can deadlock if we try to free dentries,
453	   and an unlink/rmdir has just occurred - GFP_NOFS avoids this */
454 mapping->flags = (mapping->flags & ~__GFP_BITS_MASK) | GFP_NOFS;
455 page = read_cache_page (mapping, n,
456 (filler_t*)mapping->a_ops->readpage, NULL);
457 if (!IS_ERR(page)) {
458 wait_on_page_locked(page);
459 kmap(page);
460 if (!PageUptodate(page))
461 goto fail;
462
463 if (PageError(page))
464 goto fail;
465 }
466 return page;
467
468fail:
469 reiserfs_put_page(page);
470 return ERR_PTR(-EIO);
471}
472
473static inline __u32
474xattr_hash (const char *msg, int len)
475{
476 return csum_partial (msg, len, 0);
477}
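csum_partial, which xattr_hash wraps, computes a ones'-complement partial checksum of the same family used for IP headers. A rough userspace illustration of the folding idea -- not bit-exact with the kernel's arch-optimized routine:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Accumulate 16-bit words into a ones'-complement sum and fold the
 * carries back in; csum_partial computes the same family of checksum,
 * but with arch-specific word order and optimizations. */
static uint32_t ones_complement_sum(const uint8_t *p, size_t len, uint32_t sum)
{
	while (len > 1) {
		sum += ((uint32_t)p[0] << 8) | p[1];
		p += 2;
		len -= 2;
	}
	if (len)			/* trailing odd byte */
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)		/* fold carries */
		sum = (sum & 0xffff) + (sum >> 16);
	return sum;
}

int main(void)
{
	const char msg[] = "user.Content-Type";

	printf("0x%x\n", ones_complement_sum((const uint8_t *)msg,
					     strlen(msg), 0));
	return 0;
}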
478
479/* Generic extended attribute operations that can be used by xa plugins */
480
481/*
482 * inode->i_sem: down
483 */
484int
485reiserfs_xattr_set (struct inode *inode, const char *name, const void *buffer,
486 size_t buffer_size, int flags)
487{
488 int err = 0;
489 struct file *fp;
490 struct page *page;
491 char *data;
492 struct address_space *mapping;
493 size_t file_pos = 0;
494 size_t buffer_pos = 0;
495 struct inode *xinode;
496 struct iattr newattrs;
497 __u32 xahash = 0;
498
499 if (IS_RDONLY (inode))
500 return -EROFS;
501
502 if (IS_IMMUTABLE (inode) || IS_APPEND (inode))
503 return -EPERM;
504
505 if (get_inode_sd_version (inode) == STAT_DATA_V1)
506 return -EOPNOTSUPP;
507
508 /* Empty xattrs are ok, they're just empty files, no hash */
509 if (buffer && buffer_size)
510 xahash = xattr_hash (buffer, buffer_size);
511
512open_file:
513 fp = open_xa_file (inode, name, flags);
514 if (IS_ERR (fp)) {
515 err = PTR_ERR (fp);
516 goto out;
517 }
518
519 xinode = fp->f_dentry->d_inode;
520 REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
521
522 /* we need to copy it off.. */
523 if (xinode->i_nlink > 1) {
524 fput(fp);
525 err = reiserfs_xattr_del (inode, name);
526 if (err < 0)
527 goto out;
528 /* We just killed the old one, we're not replacing anymore */
529 if (flags & XATTR_REPLACE)
530 flags &= ~XATTR_REPLACE;
531 goto open_file;
532 }
533
534 /* Resize it so we're ok to write there */
535 newattrs.ia_size = buffer_size;
536 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
537 down (&xinode->i_sem);
538 err = notify_change(fp->f_dentry, &newattrs);
539 if (err)
540 goto out_filp;
541
542 mapping = xinode->i_mapping;
543 while (buffer_pos < buffer_size || buffer_pos == 0) {
544 size_t chunk;
545 size_t skip = 0;
546 size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1));
547 if (buffer_size - buffer_pos > PAGE_CACHE_SIZE)
548 chunk = PAGE_CACHE_SIZE;
549 else
550 chunk = buffer_size - buffer_pos;
551
552 page = reiserfs_get_page (xinode, file_pos >> PAGE_CACHE_SHIFT);
553 if (IS_ERR (page)) {
554 err = PTR_ERR (page);
555 goto out_filp;
556 }
557
558 lock_page (page);
559 data = page_address (page);
560
561 if (file_pos == 0) {
562 struct reiserfs_xattr_header *rxh;
563 skip = file_pos = sizeof (struct reiserfs_xattr_header);
564 if (chunk + skip > PAGE_CACHE_SIZE)
565 chunk = PAGE_CACHE_SIZE - skip;
566 rxh = (struct reiserfs_xattr_header *)data;
567 rxh->h_magic = cpu_to_le32 (REISERFS_XATTR_MAGIC);
568 rxh->h_hash = cpu_to_le32 (xahash);
569 }
570
571 err = mapping->a_ops->prepare_write (fp, page, page_offset,
572 page_offset + chunk + skip);
573 if (!err) {
574 if (buffer)
575 memcpy (data + skip, buffer + buffer_pos, chunk);
576 err = mapping->a_ops->commit_write (fp, page, page_offset,
577 page_offset + chunk + skip);
578 }
579 unlock_page (page);
580 reiserfs_put_page (page);
581 buffer_pos += chunk;
582 file_pos += chunk;
583 skip = 0;
584 if (err || buffer_size == 0 || !buffer)
585 break;
586 }
587
588 /* We can't mark the inode dirty if it's not hashed. This is the case
589 * when we're inheriting the default ACL. If we dirty it, the inode
590 * gets marked dirty, but won't (ever) make it onto the dirty list until
591 * it's synced explicitly to clear I_DIRTY. This is bad. */
592 if (!hlist_unhashed(&inode->i_hash)) {
593 inode->i_ctime = CURRENT_TIME_SEC;
594 mark_inode_dirty (inode);
595 }
596
597out_filp:
598 up (&xinode->i_sem);
599 fput(fp);
600
601out:
602 return err;
603}
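Each xattr file thus starts with a small header (magic plus value checksum) followed by the raw value bytes, which is what the page loop above lays down. A hedged mock-up of that layout (the struct shape and the magic constant are assumptions read off this file, not a stable format spec):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define REISERFS_XATTR_MAGIC 0x52465841	/* assumed value; treat as illustrative */

struct xa_header {		/* models struct reiserfs_xattr_header */
	uint32_t h_magic;	/* stored little-endian on disk */
	uint32_t h_hash;	/* checksum of the value bytes */
};

int main(void)
{
	const char value[] = "text/plain";
	unsigned char file[64];
	struct xa_header h = { REISERFS_XATTR_MAGIC, 0 /* xattr_hash(value) */ };

	memcpy(file, &h, sizeof(h));			    /* header first... */
	memcpy(file + sizeof(h), value, sizeof(value) - 1); /* ...then the value */
	printf("xattr file occupies %zu bytes\n", sizeof(h) + sizeof(value) - 1);
	return 0;
}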
604
605/*
606 * inode->i_sem: down
607 */
608int
609reiserfs_xattr_get (const struct inode *inode, const char *name, void *buffer,
610 size_t buffer_size)
611{
612 ssize_t err = 0;
613 struct file *fp;
614 size_t isize;
615 size_t file_pos = 0;
616 size_t buffer_pos = 0;
617 struct page *page;
618 struct inode *xinode;
619 __u32 hash = 0;
620
621 if (name == NULL)
622 return -EINVAL;
623
624 /* We can't have xattrs attached to v1 items since they don't have
625 * generation numbers */
626 if (get_inode_sd_version (inode) == STAT_DATA_V1)
627 return -EOPNOTSUPP;
628
629 fp = open_xa_file (inode, name, FL_READONLY);
630 if (IS_ERR (fp)) {
631 err = PTR_ERR (fp);
632 goto out;
633 }
634
635 xinode = fp->f_dentry->d_inode;
636 isize = xinode->i_size;
637 REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
638
639 /* Just return the size needed */
640 if (buffer == NULL) {
641 err = isize - sizeof (struct reiserfs_xattr_header);
642 goto out_dput;
643 }
644
645 if (buffer_size < isize - sizeof (struct reiserfs_xattr_header)) {
646 err = -ERANGE;
647 goto out_dput;
648 }
649
650 while (file_pos < isize) {
651 size_t chunk;
652 char *data;
653 size_t skip = 0;
654 if (isize - file_pos > PAGE_CACHE_SIZE)
655 chunk = PAGE_CACHE_SIZE;
656 else
657 chunk = isize - file_pos;
658
659 page = reiserfs_get_page (xinode, file_pos >> PAGE_CACHE_SHIFT);
660 if (IS_ERR (page)) {
661 err = PTR_ERR (page);
662 goto out_dput;
663 }
664
665 lock_page (page);
666 data = page_address (page);
667 if (file_pos == 0) {
668 struct reiserfs_xattr_header *rxh =
669 (struct reiserfs_xattr_header *)data;
670 skip = file_pos = sizeof (struct reiserfs_xattr_header);
671 chunk -= skip;
672 /* Magic doesn't match up.. */
673 if (rxh->h_magic != cpu_to_le32 (REISERFS_XATTR_MAGIC)) {
674 unlock_page (page);
675 reiserfs_put_page (page);
676 reiserfs_warning (inode->i_sb, "Invalid magic for xattr (%s) "
677 "associated with %k", name,
678 INODE_PKEY (inode));
679 err = -EIO;
680 goto out_dput;
681 }
682 hash = le32_to_cpu (rxh->h_hash);
683 }
684 memcpy (buffer + buffer_pos, data + skip, chunk);
685 unlock_page (page);
686 reiserfs_put_page (page);
687 file_pos += chunk;
688 buffer_pos += chunk;
689 skip = 0;
690 }
691 err = isize - sizeof (struct reiserfs_xattr_header);
692
693 if (xattr_hash (buffer, isize - sizeof (struct reiserfs_xattr_header)) != hash) {
694 reiserfs_warning (inode->i_sb, "Invalid hash for xattr (%s) associated "
695 "with %k", name, INODE_PKEY (inode));
696 err = -EIO;
697 }
698
699out_dput:
700 fput(fp);
701
702out:
703 return err;
704}
705
706static int
707__reiserfs_xattr_del (struct dentry *xadir, const char *name, int namelen)
708{
709 struct dentry *dentry;
710 struct inode *dir = xadir->d_inode;
711 int err = 0;
712
713 dentry = lookup_one_len (name, xadir, namelen);
714 if (IS_ERR (dentry)) {
715 err = PTR_ERR (dentry);
716 goto out;
717 } else if (!dentry->d_inode) {
718 err = -ENODATA;
719 goto out_file;
720 }
721
722 /* Skip directories.. */
723 if (S_ISDIR (dentry->d_inode->i_mode))
724 goto out_file;
725
726 if (!is_reiserfs_priv_object (dentry->d_inode)) {
727 reiserfs_warning (dir->i_sb, "OID %08x [%.*s/%.*s] doesn't have "
728 "priv flag set [parent is %sset].",
729 le32_to_cpu (INODE_PKEY (dentry->d_inode)->k_objectid),
730 xadir->d_name.len, xadir->d_name.name, namelen, name,
731 is_reiserfs_priv_object (xadir->d_inode) ? "" : "not ");
732 dput (dentry);
733 return -EIO;
734 }
735
736 err = dir->i_op->unlink (dir, dentry);
737 if (!err)
738 d_delete (dentry);
739
740out_file:
741 dput (dentry);
742
743out:
744 return err;
745}
746
747
748int
749reiserfs_xattr_del (struct inode *inode, const char *name)
750{
751 struct dentry *dir;
752 int err;
753
754 if (IS_RDONLY (inode))
755 return -EROFS;
756
757 dir = open_xa_dir (inode, FL_READONLY);
758 if (IS_ERR (dir)) {
759 err = PTR_ERR (dir);
760 goto out;
761 }
762
763 err = __reiserfs_xattr_del (dir, name, strlen (name));
764 dput (dir);
765
766 if (!err) {
767 inode->i_ctime = CURRENT_TIME_SEC;
768 mark_inode_dirty (inode);
769 }
770
771out:
772 return err;
773}
774
775/* The following are side effects of other operations that aren't explicitly
776 * modifying extended attributes. This includes operations such as permissions
777 * or ownership changes, object deletions, etc. */
778
779static int
780reiserfs_delete_xattrs_filler (void *buf, const char *name, int namelen,
781 loff_t offset, ino_t ino, unsigned int d_type)
782{
783 struct dentry *xadir = (struct dentry *)buf;
784
785 return __reiserfs_xattr_del (xadir, name, namelen);
786
787}
788
789/* This is called w/ inode->i_sem downed */
790int
791reiserfs_delete_xattrs (struct inode *inode)
792{
793 struct file *fp;
794 struct dentry *dir, *root;
795 int err = 0;
796
797 /* Skip out, an xattr has no xattrs associated with it */
798 if (is_reiserfs_priv_object (inode) ||
799 get_inode_sd_version (inode) == STAT_DATA_V1 ||
800 !reiserfs_xattrs(inode->i_sb))
801 {
802 return 0;
803 }
804 reiserfs_read_lock_xattrs (inode->i_sb);
805 dir = open_xa_dir (inode, FL_READONLY);
806 reiserfs_read_unlock_xattrs (inode->i_sb);
807 if (IS_ERR (dir)) {
808 err = PTR_ERR (dir);
809 goto out;
810 } else if (!dir->d_inode) {
811 dput (dir);
812 return 0;
813 }
814
815 fp = dentry_open (dir, NULL, O_RDWR);
816 if (IS_ERR (fp)) {
817 err = PTR_ERR (fp);
818 /* dentry_open dputs the dentry if it fails */
819 goto out;
820 }
821
822 lock_kernel ();
823 err = xattr_readdir (fp, reiserfs_delete_xattrs_filler, dir);
824 if (err) {
825 unlock_kernel ();
826 goto out_dir;
827 }
828
829 /* If only . and .. remain, remove the now-empty directory; leftovers are not good. */
830 if (dir->d_inode->i_nlink <= 2) {
831 root = get_xa_root (inode->i_sb);
832 reiserfs_write_lock_xattrs (inode->i_sb);
833 err = vfs_rmdir (root->d_inode, dir);
834 reiserfs_write_unlock_xattrs (inode->i_sb);
835 dput (root);
836 } else {
837 reiserfs_warning (inode->i_sb,
838 "Couldn't remove all entries in directory");
839 }
840 unlock_kernel ();
841
842out_dir:
843 fput(fp);
844
845out:
846 if (!err)
847 REISERFS_I(inode)->i_flags = REISERFS_I(inode)->i_flags & ~i_has_xattr_dir;
848 return err;
849}
850
851struct reiserfs_chown_buf {
852 struct inode *inode;
853 struct dentry *xadir;
854 struct iattr *attrs;
855};
856
857/* XXX: If there is a better way to do this, I'd love to hear about it */
858static int
859reiserfs_chown_xattrs_filler (void *buf, const char *name, int namelen,
860 loff_t offset, ino_t ino, unsigned int d_type)
861{
862 struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf;
863 struct dentry *xafile, *xadir = chown_buf->xadir;
864 struct iattr *attrs = chown_buf->attrs;
865 int err = 0;
866
867 xafile = lookup_one_len (name, xadir, namelen);
868 if (IS_ERR (xafile))
869 return PTR_ERR (xafile);
870 else if (!xafile->d_inode) {
871 dput (xafile);
872 return -ENODATA;
873 }
874
875 if (!S_ISDIR (xafile->d_inode->i_mode))
876 err = notify_change (xafile, attrs);
877 dput (xafile);
878
879 return err;
880}
881
882int
883reiserfs_chown_xattrs (struct inode *inode, struct iattr *attrs)
884{
885 struct file *fp;
886 struct dentry *dir;
887 int err = 0;
888 struct reiserfs_chown_buf buf;
889 unsigned int ia_valid = attrs->ia_valid;
890
891 /* Skip out, an xattr has no xattrs associated with it */
892 if (is_reiserfs_priv_object (inode) ||
893 get_inode_sd_version (inode) == STAT_DATA_V1 ||
894 !reiserfs_xattrs(inode->i_sb))
895 {
896 return 0;
897 }
898 reiserfs_read_lock_xattrs (inode->i_sb);
899 dir = open_xa_dir (inode, FL_READONLY);
900 reiserfs_read_unlock_xattrs (inode->i_sb);
901 if (IS_ERR (dir)) {
902 if (PTR_ERR (dir) != -ENODATA)
903 err = PTR_ERR (dir);
904 goto out;
905 } else if (!dir->d_inode) {
906 dput (dir);
907 goto out;
908 }
909
910 fp = dentry_open (dir, NULL, O_RDWR);
911 if (IS_ERR (fp)) {
912 err = PTR_ERR (fp);
913 /* dentry_open dputs the dentry if it fails */
914 goto out;
915 }
916
917 lock_kernel ();
918
919 attrs->ia_valid &= (ATTR_UID | ATTR_GID | ATTR_CTIME);
920 buf.xadir = dir;
921 buf.attrs = attrs;
922 buf.inode = inode;
923
924 err = xattr_readdir (fp, reiserfs_chown_xattrs_filler, &buf);
925 if (err) {
926 unlock_kernel ();
927 goto out_dir;
928 }
929
930 err = notify_change (dir, attrs);
931 unlock_kernel ();
932
933out_dir:
934 fput(fp);
935
936out:
937 attrs->ia_valid = ia_valid;
938 return err;
939}
940
941
942/* Actual operations that are exported to VFS-land */
943
944/*
945 * Inode operation getxattr()
946 * Preliminary locking: we down dentry->d_inode->i_sem
947 */
948ssize_t
949reiserfs_getxattr (struct dentry *dentry, const char *name, void *buffer,
950 size_t size)
951{
952 struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name);
953 int err;
954
955 if (!xah || !reiserfs_xattrs(dentry->d_sb) ||
956 get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1)
957 return -EOPNOTSUPP;
958
959 reiserfs_read_lock_xattr_i (dentry->d_inode);
960 reiserfs_read_lock_xattrs (dentry->d_sb);
961 err = xah->get (dentry->d_inode, name, buffer, size);
962 reiserfs_read_unlock_xattrs (dentry->d_sb);
963 reiserfs_read_unlock_xattr_i (dentry->d_inode);
964 return err;
965}
966
967
968/*
969 * Inode operation setxattr()
970 *
971 * dentry->d_inode->i_sem down
972 */
973int
974reiserfs_setxattr (struct dentry *dentry, const char *name, const void *value,
975 size_t size, int flags)
976{
977 struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name);
978 int err;
979 int lock;
980
981 if (!xah || !reiserfs_xattrs(dentry->d_sb) ||
982 get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1)
983 return -EOPNOTSUPP;
984
985 if (IS_RDONLY (dentry->d_inode))
986 return -EROFS;
987
988 if (IS_IMMUTABLE (dentry->d_inode) || IS_APPEND (dentry->d_inode))
989 		return -EPERM;
990
991 reiserfs_write_lock_xattr_i (dentry->d_inode);
992 lock = !has_xattr_dir (dentry->d_inode);
993 if (lock)
994 reiserfs_write_lock_xattrs (dentry->d_sb);
995 else
996 reiserfs_read_lock_xattrs (dentry->d_sb);
997 err = xah->set (dentry->d_inode, name, value, size, flags);
998 if (lock)
999 reiserfs_write_unlock_xattrs (dentry->d_sb);
1000 else
1001 reiserfs_read_unlock_xattrs (dentry->d_sb);
1002 reiserfs_write_unlock_xattr_i (dentry->d_inode);
1003 return err;
1004}
1005
1006/*
1007 * Inode operation removexattr()
1008 *
1009 * dentry->d_inode->i_sem down
1010 */
1011int
1012reiserfs_removexattr (struct dentry *dentry, const char *name)
1013{
1014 int err;
1015 struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name);
1016
1017 if (!xah || !reiserfs_xattrs(dentry->d_sb) ||
1018 get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1)
1019 return -EOPNOTSUPP;
1020
1021 if (IS_RDONLY (dentry->d_inode))
1022 return -EROFS;
1023
1024 if (IS_IMMUTABLE (dentry->d_inode) || IS_APPEND (dentry->d_inode))
1025 return -EPERM;
1026
1027 reiserfs_write_lock_xattr_i (dentry->d_inode);
1028 reiserfs_read_lock_xattrs (dentry->d_sb);
1029
1030 /* Deletion pre-operation */
1031 if (xah->del) {
1032 err = xah->del (dentry->d_inode, name);
1033 if (err)
1034 goto out;
1035 }
1036
1037 err = reiserfs_xattr_del (dentry->d_inode, name);
1038
1039 dentry->d_inode->i_ctime = CURRENT_TIME_SEC;
1040 mark_inode_dirty (dentry->d_inode);
1041
1042out:
1043 reiserfs_read_unlock_xattrs (dentry->d_sb);
1044 reiserfs_write_unlock_xattr_i (dentry->d_inode);
1045 return err;
1046}
1047
1048
1049/* This is what filldir will use:
1050 * r_pos will always contain the amount of space required for the entire
1051 * list. If r_pos becomes larger than r_size, we need more space and we
1052 * return an error indicating this. If r_pos is at most r_size, then we've
1053 * filled the buffer successfully and we return success */
1054struct reiserfs_listxattr_buf {
1055 int r_pos;
1056 int r_size;
1057 char *r_buf;
1058 struct inode *r_inode;
1059};
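The r_pos/r_size protocol described in the comment above is the usual listxattr contract: names are packed back to back, each NUL-terminated, and the running total doubles as the "size needed" answer when the buffer is absent or too small. A minimal sketch of the packing rule (list_add is a hypothetical helper):

#include <stdio.h>
#include <string.h>

/* Append one NUL-terminated name if it fits, but always advance *pos so
 * the caller learns the total size required (cf. r_pos vs. r_size). */
static void list_add(char *buf, int size, int *pos, const char *name)
{
	int len = (int)strlen(name);

	if (buf && *pos + len + 1 <= size)
		memcpy(buf + *pos, name, len + 1);
	*pos += len + 1;
}

int main(void)
{
	char buf[64];
	int pos = 0;

	list_add(buf, sizeof(buf), &pos, "user.Content-Type");
	list_add(buf, sizeof(buf), &pos, "system.posix_acl_access");
	if (pos > (int)sizeof(buf))
		printf("buffer too small: would return -ERANGE\n");
	else
		printf("name list takes %d bytes\n", pos);
	return 0;
}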
1060
1061static int
1062reiserfs_listxattr_filler (void *buf, const char *name, int namelen,
1063 loff_t offset, ino_t ino, unsigned int d_type)
1064{
1065 struct reiserfs_listxattr_buf *b = (struct reiserfs_listxattr_buf *)buf;
1066 int len = 0;
1067 if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) {
1068 struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name);
1069 if (!xah) return 0; /* Unsupported xattr name, skip it */
1070
1071 /* We call ->list() twice because the operation isn't required to just
1072 * return the name back - we want to make sure we have enough space */
1073 len += xah->list (b->r_inode, name, namelen, NULL);
1074
1075 if (len) {
1076 if (b->r_pos + len + 1 <= b->r_size) {
1077 char *p = b->r_buf + b->r_pos;
1078 p += xah->list (b->r_inode, name, namelen, p);
1079 *p++ = '\0';
1080 }
1081 b->r_pos += len + 1;
1082 }
1083 }
1084
1085 return 0;
1086}
1087/*
1088 * Inode operation listxattr()
1089 *
1090 * Preliminary locking: we down dentry->d_inode->i_sem
1091 */
1092ssize_t
1093reiserfs_listxattr (struct dentry *dentry, char *buffer, size_t size)
1094{
1095 struct file *fp;
1096 struct dentry *dir;
1097 int err = 0;
1098 struct reiserfs_listxattr_buf buf;
1099
1100 if (!dentry->d_inode)
1101 return -EINVAL;
1102
1103 if (!reiserfs_xattrs(dentry->d_sb) ||
1104 get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1)
1105 return -EOPNOTSUPP;
1106
1107 reiserfs_read_lock_xattr_i (dentry->d_inode);
1108 reiserfs_read_lock_xattrs (dentry->d_sb);
1109 dir = open_xa_dir (dentry->d_inode, FL_READONLY);
1110 reiserfs_read_unlock_xattrs (dentry->d_sb);
1111 if (IS_ERR (dir)) {
1112 err = PTR_ERR (dir);
1113 if (err == -ENODATA)
1114 err = 0; /* Not an error if there aren't any xattrs */
1115 goto out;
1116 }
1117
1118 fp = dentry_open (dir, NULL, O_RDWR);
1119 if (IS_ERR (fp)) {
1120 err = PTR_ERR (fp);
1121 /* dentry_open dputs the dentry if it fails */
1122 goto out;
1123 }
1124
1125 buf.r_buf = buffer;
1126 buf.r_size = buffer ? size : 0;
1127 buf.r_pos = 0;
1128 buf.r_inode = dentry->d_inode;
1129
1130 REISERFS_I(dentry->d_inode)->i_flags |= i_has_xattr_dir;
1131
1132 err = xattr_readdir (fp, reiserfs_listxattr_filler, &buf);
1133 if (err)
1134 goto out_dir;
1135
1136 if (buf.r_pos > buf.r_size && buffer != NULL)
1137 err = -ERANGE;
1138 else
1139 err = buf.r_pos;
1140
1141out_dir:
1142 fput(fp);
1143
1144out:
1145 reiserfs_read_unlock_xattr_i (dentry->d_inode);
1146 return err;
1147}
1148
1149/* This is the implementation for the xattr plugin infrastructure */
1150static struct list_head xattr_handlers = LIST_HEAD_INIT (xattr_handlers);
1151static DEFINE_RWLOCK(handler_lock);
1152
1153static struct reiserfs_xattr_handler *
1154find_xattr_handler_prefix (const char *prefix)
1155{
1156 struct reiserfs_xattr_handler *xah = NULL;
1157 struct list_head *p;
1158
1159 read_lock (&handler_lock);
1160 list_for_each (p, &xattr_handlers) {
1161 xah = list_entry (p, struct reiserfs_xattr_handler, handlers);
1162 if (strncmp (xah->prefix, prefix, strlen (xah->prefix)) == 0)
1163 break;
1164 xah = NULL;
1165 }
1166
1167 read_unlock (&handler_lock);
1168 return xah;
1169}
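Dispatch is a first-match prefix scan over the registered handlers; for example, "user.Content-Type" selects the handler registered with the "user." prefix. A stand-alone model of the same test (the prefix table mirrors the handlers registered below):

#include <stdio.h>
#include <string.h>

static const char *prefixes[] = {
	"user.", "trusted.", "security.",
	"system.posix_acl_access", "system.posix_acl_default",
};

/* Same test as find_xattr_handler_prefix: the first registered prefix
 * that matches the start of the attribute name wins. */
static const char *match_prefix(const char *name)
{
	size_t i;

	for (i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); i++)
		if (strncmp(prefixes[i], name, strlen(prefixes[i])) == 0)
			return prefixes[i];
	return NULL;	/* unsupported name -> -EOPNOTSUPP at the callers */
}

int main(void)
{
	printf("%s\n", match_prefix("user.Content-Type"));	/* user. */
	printf("%s\n", match_prefix("system.posix_acl_access"));
	return 0;
}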
1170
1171static void
1172__unregister_handlers (void)
1173{
1174 struct reiserfs_xattr_handler *xah;
1175 struct list_head *p, *tmp;
1176
1177 list_for_each_safe (p, tmp, &xattr_handlers) {
1178 xah = list_entry (p, struct reiserfs_xattr_handler, handlers);
1179 if (xah->exit)
1180 xah->exit();
1181
1182 list_del_init (p);
1183 }
1184 INIT_LIST_HEAD (&xattr_handlers);
1185}
1186
1187int __init
1188reiserfs_xattr_register_handlers (void)
1189{
1190 int err = 0;
1191 struct reiserfs_xattr_handler *xah;
1192 struct list_head *p;
1193
1194 write_lock (&handler_lock);
1195
1196 /* If we're already initialized, nothing to do */
1197 if (!list_empty (&xattr_handlers)) {
1198 write_unlock (&handler_lock);
1199 return 0;
1200 }
1201
1202 /* Add the handlers */
1203 list_add_tail (&user_handler.handlers, &xattr_handlers);
1204 list_add_tail (&trusted_handler.handlers, &xattr_handlers);
1205#ifdef CONFIG_REISERFS_FS_SECURITY
1206 list_add_tail (&security_handler.handlers, &xattr_handlers);
1207#endif
1208#ifdef CONFIG_REISERFS_FS_POSIX_ACL
1209 list_add_tail (&posix_acl_access_handler.handlers, &xattr_handlers);
1210 list_add_tail (&posix_acl_default_handler.handlers, &xattr_handlers);
1211#endif
1212
1213 /* Run initializers, if available */
1214 list_for_each (p, &xattr_handlers) {
1215 xah = list_entry (p, struct reiserfs_xattr_handler, handlers);
1216 if (xah->init) {
1217 err = xah->init ();
1218 if (err) {
1219 list_del_init (p);
1220 break;
1221 }
1222 }
1223 }
1224
1225 /* Clean up other handlers, if any failed */
1226 if (err)
1227 __unregister_handlers ();
1228
1229 write_unlock (&handler_lock);
1230 return err;
1231}
1232
1233void
1234reiserfs_xattr_unregister_handlers (void)
1235{
1236 write_lock (&handler_lock);
1237 __unregister_handlers ();
1238 write_unlock (&handler_lock);
1239}
1240
1241/* This will catch lookups from the fs root to .reiserfs_priv */
1242static int
1243xattr_lookup_poison (struct dentry *dentry, struct qstr *q1, struct qstr *name)
1244{
1245 struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root;
1246 if (name->len == priv_root->d_name.len &&
1247 name->hash == priv_root->d_name.hash &&
1248 !memcmp (name->name, priv_root->d_name.name, name->len)) {
1249 return -ENOENT;
1250 } else if (q1->len == name->len &&
1251 !memcmp(q1->name, name->name, name->len))
1252 return 0;
1253 return 1;
1254}
1255
1256static struct dentry_operations xattr_lookup_poison_ops = {
1257 .d_compare = xattr_lookup_poison,
1258};
1259
1260
1261/* We need to take a copy of the mount flags since things like
1262 * MS_RDONLY don't get set until *after* we're called.
1263 * mount_flags != mount_options */
1264int
1265reiserfs_xattr_init (struct super_block *s, int mount_flags)
1266{
1267 int err = 0;
1268
1269 /* We need generation numbers to ensure that the oid mapping is correct;
1270 * v3.5 filesystems don't have them. */
1271 if (!old_format_only (s)) {
1272 set_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt));
1273 } else if (reiserfs_xattrs_optional (s)) {
1274 /* Old format filesystem, but optional xattrs have been enabled
1275 * at mount time. Error out. */
1276 reiserfs_warning (s, "xattrs/ACLs not supported on pre v3.6 "
1277 "format filesystem. Failing mount.");
1278 err = -EOPNOTSUPP;
1279 goto error;
1280 } else {
1281 /* Old format filesystem, but no optional xattrs have been enabled. This
1282 * means we silently disable xattrs on the filesystem. */
1283 clear_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt));
1284 }
1285
1286 /* If we don't have the privroot located yet - go find it */
1287 if (reiserfs_xattrs (s) && !REISERFS_SB(s)->priv_root) {
1288 struct dentry *dentry;
1289 dentry = lookup_one_len (PRIVROOT_NAME, s->s_root,
1290 strlen (PRIVROOT_NAME));
1291 if (!IS_ERR (dentry)) {
1292 if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) {
1293 struct inode *inode = dentry->d_parent->d_inode;
1294 down (&inode->i_sem);
1295 err = inode->i_op->mkdir (inode, dentry, 0700);
1296 up (&inode->i_sem);
1297 if (err) {
1298 dput (dentry);
1299 dentry = NULL;
1300 }
1301
1302 if (dentry && dentry->d_inode)
1303 reiserfs_warning (s, "Created %s on %s - reserved for "
1304 "xattr storage.", PRIVROOT_NAME,
1305 reiserfs_bdevname (inode->i_sb));
1306 } else if (!dentry->d_inode) {
1307 dput (dentry);
1308 dentry = NULL;
1309 }
1310 } else
1311 err = PTR_ERR (dentry);
1312
1313 if (!err && dentry) {
1314 s->s_root->d_op = &xattr_lookup_poison_ops;
1315 reiserfs_mark_inode_private (dentry->d_inode);
1316 REISERFS_SB(s)->priv_root = dentry;
1317 } else if (!(mount_flags & MS_RDONLY)) { /* xattrs are unavailable */
1318 /* If we're read-only it just means that the dir hasn't been
1319 * created. Not an error -- just no xattrs on the fs. We'll
1320 * check again if we go read-write */
1321 reiserfs_warning (s, "xattrs/ACLs enabled and couldn't "
1322 "find/create .reiserfs_priv. Failing mount.");
1323 err = -EOPNOTSUPP;
1324 }
1325 }
1326
1327error:
1328 /* This is only nonzero if there was an error initializing the xattr
1329 * directory or if there is a condition where we don't support them. */
1330 if (err) {
1331 clear_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt));
1332 clear_bit (REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt));
1333 clear_bit (REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt));
1334 }
1335
1336 /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */
1337 s->s_flags = s->s_flags & ~MS_POSIXACL;
1338 if (reiserfs_posixacl (s))
1339 s->s_flags |= MS_POSIXACL;
1340
1341 return err;
1342}
1343
1344static int
1345__reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd,
1346 int need_lock)
1347{
1348 umode_t mode = inode->i_mode;
1349
1350 if (mask & MAY_WRITE) {
1351 /*
1352 * Nobody gets write access to a read-only fs.
1353 */
1354 if (IS_RDONLY(inode) &&
1355 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
1356 return -EROFS;
1357
1358 /*
1359 * Nobody gets write access to an immutable file.
1360 */
1361 if (IS_IMMUTABLE(inode))
1362 return -EACCES;
1363 }
1364
1365 /* We don't do permission checks on the internal objects.
1366 * Permissions are determined by the "owning" object. */
1367 if (is_reiserfs_priv_object (inode))
1368 return 0;
1369
1370 if (current->fsuid == inode->i_uid) {
1371 mode >>= 6;
1372#ifdef CONFIG_REISERFS_FS_POSIX_ACL
1373 } else if (reiserfs_posixacl(inode->i_sb) &&
1374 get_inode_sd_version (inode) != STAT_DATA_V1) {
1375 struct posix_acl *acl;
1376
1377 /* ACL can't contain additional permissions if
1378 the ACL_MASK entry is 0 */
1379 if (!(mode & S_IRWXG))
1380 goto check_groups;
1381
1382 if (need_lock) {
1383 reiserfs_read_lock_xattr_i (inode);
1384 reiserfs_read_lock_xattrs (inode->i_sb);
1385 }
1386 acl = reiserfs_get_acl (inode, ACL_TYPE_ACCESS);
1387 if (need_lock) {
1388 reiserfs_read_unlock_xattrs (inode->i_sb);
1389 reiserfs_read_unlock_xattr_i (inode);
1390 }
1391 if (IS_ERR (acl)) {
1392 if (PTR_ERR (acl) == -ENODATA)
1393 goto check_groups;
1394 return PTR_ERR (acl);
1395 }
1396
1397 if (acl) {
1398 int err = posix_acl_permission (inode, acl, mask);
1399 posix_acl_release (acl);
1400 if (err == -EACCES) {
1401 goto check_capabilities;
1402 }
1403 return err;
1404 } else {
1405 goto check_groups;
1406 }
1407#endif
1408 } else {
1409check_groups:
1410 if (in_group_p(inode->i_gid))
1411 mode >>= 3;
1412 }
1413
1414 /*
1415 * If the DACs are ok we don't need any capability check.
1416 */
1417 if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask))
1418 return 0;
1419
1420check_capabilities:
1421 /*
1422 * Read/write DACs are always overridable.
1423 * Executable DACs are overridable if at least one exec bit is set.
1424 */
1425 if (!(mask & MAY_EXEC) ||
1426 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
1427 if (capable(CAP_DAC_OVERRIDE))
1428 return 0;
1429
1430 /*
1431 * Searching includes executable on directories, else just read.
1432 */
1433 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
1434 if (capable(CAP_DAC_READ_SEARCH))
1435 return 0;
1436
1437 return -EACCES;
1438}
1439
1440int
1441reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd)
1442{
1443 return __reiserfs_permission (inode, mask, nd, 1);
1444}
1445
1446int
1447reiserfs_permission_locked (struct inode *inode, int mask, struct nameidata *nd)
1448{
1449 return __reiserfs_permission (inode, mask, nd, 0);
1450}
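Outside the ACL branch, __reiserfs_permission above is the classical Unix DAC walk: select the owner, group, or other rwx triplet and require every requested bit. A compact userspace restatement of that core, with the capability overrides deliberately omitted:

#include <stdio.h>

#define MAY_EXEC	1
#define MAY_WRITE	2
#define MAY_READ	4

/* Shift the relevant rwx triplet into the low bits, then require every
 * requested permission, mirroring the mode >>= 6 / mode >>= 3 logic. */
static int dac_check(unsigned mode, unsigned uid, unsigned fsuid,
		     int in_group, unsigned mask)
{
	if (fsuid == uid)
		mode >>= 6;
	else if (in_group)
		mode >>= 3;
	if ((mode & mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == mask)
		return 0;
	return -13;	/* -EACCES; capability overrides omitted here */
}

int main(void)
{
	/* the owner of a 0640 file may read and write... */
	printf("%d\n", dac_check(0640, 1000, 1000, 1, MAY_READ | MAY_WRITE));
	/* ...but "other" may not even read it */
	printf("%d\n", dac_check(0640, 1000, 1001, 0, MAY_READ));
	return 0;
}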
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
new file mode 100644
index 000000000000..e302071903a1
--- /dev/null
+++ b/fs/reiserfs/xattr_acl.c
@@ -0,0 +1,571 @@
1#include <linux/fs.h>
2#include <linux/posix_acl.h>
3#include <linux/reiserfs_fs.h>
4#include <linux/errno.h>
5#include <linux/pagemap.h>
6#include <linux/xattr.h>
7#include <linux/xattr_acl.h>
8#include <linux/reiserfs_xattr.h>
9#include <linux/reiserfs_acl.h>
10#include <asm/uaccess.h>
11
12static int reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl);
13
14static int
15xattr_set_acl(struct inode *inode, int type, const void *value, size_t size)
16{
17 struct posix_acl *acl;
18 int error;
19
20 if (!reiserfs_posixacl(inode->i_sb))
21 return -EOPNOTSUPP;
22 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
23 return -EPERM;
24
25 if (value) {
26 acl = posix_acl_from_xattr(value, size);
27 if (IS_ERR(acl)) {
28 return PTR_ERR(acl);
29 } else if (acl) {
30 error = posix_acl_valid(acl);
31 if (error)
32 goto release_and_out;
33 }
34 } else
35 acl = NULL;
36
37 error = reiserfs_set_acl (inode, type, acl);
38
39release_and_out:
40 posix_acl_release(acl);
41 return error;
42}
43
44
45static int
46xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
47{
48 struct posix_acl *acl;
49 int error;
50
51 if (!reiserfs_posixacl(inode->i_sb))
52 return -EOPNOTSUPP;
53
54 acl = reiserfs_get_acl (inode, type);
55 if (IS_ERR(acl))
56 return PTR_ERR(acl);
57 if (acl == NULL)
58 return -ENODATA;
59 error = posix_acl_to_xattr(acl, buffer, size);
60 posix_acl_release(acl);
61
62 return error;
63}
64
65
66/*
67 * Convert from filesystem to in-memory representation.
68 */
69static struct posix_acl *
70posix_acl_from_disk(const void *value, size_t size)
71{
72 const char *end = (char *)value + size;
73 int n, count;
74 struct posix_acl *acl;
75
76 if (!value)
77 return NULL;
78 if (size < sizeof(reiserfs_acl_header))
79 return ERR_PTR(-EINVAL);
80 if (((reiserfs_acl_header *)value)->a_version !=
81 cpu_to_le32(REISERFS_ACL_VERSION))
82 return ERR_PTR(-EINVAL);
83 value = (char *)value + sizeof(reiserfs_acl_header);
84 count = reiserfs_acl_count(size);
85 if (count < 0)
86 return ERR_PTR(-EINVAL);
87 if (count == 0)
88 return NULL;
89 acl = posix_acl_alloc(count, GFP_NOFS);
90 if (!acl)
91 return ERR_PTR(-ENOMEM);
92 for (n=0; n < count; n++) {
93 reiserfs_acl_entry *entry =
94 (reiserfs_acl_entry *)value;
95 if ((char *)value + sizeof(reiserfs_acl_entry_short) > end)
96 goto fail;
97 acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
98 acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
99 switch(acl->a_entries[n].e_tag) {
100 case ACL_USER_OBJ:
101 case ACL_GROUP_OBJ:
102 case ACL_MASK:
103 case ACL_OTHER:
104 value = (char *)value +
105 sizeof(reiserfs_acl_entry_short);
106 acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
107 break;
108
109 case ACL_USER:
110 case ACL_GROUP:
111 value = (char *)value + sizeof(reiserfs_acl_entry);
112 if ((char *)value > end)
113 goto fail;
114 acl->a_entries[n].e_id =
115 le32_to_cpu(entry->e_id);
116 break;
117
118 default:
119 goto fail;
120 }
121 }
122 if (value != end)
123 goto fail;
124 return acl;
125
126fail:
127 posix_acl_release(acl);
128 return ERR_PTR(-EINVAL);
129}
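The stream decoded above is a 4-byte version header followed by one record per entry, where ACL_USER and ACL_GROUP records carry an extra 32-bit id. A hedged sketch of the resulting size computation (the record sizes are assumptions matching the structs this file reads; the layout mirrors ext2's ACL format):

#include <stdio.h>
#include <stddef.h>

#define ACL_USER_OBJ	0x01
#define ACL_USER	0x02
#define ACL_GROUP_OBJ	0x04
#define ACL_GROUP	0x08
#define ACL_MASK	0x10
#define ACL_OTHER	0x20

enum { HDR = 4, ENTRY_SHORT = 4, ENTRY_LONG = 8 };	/* assumed sizes */

/* Named user/group entries carry a 32-bit id; the four "short" tags
 * (owner, owning group, mask, other) do not. */
static size_t acl_disk_size(const int *tags, int count)
{
	size_t size = HDR;	/* 32-bit version header */
	int n;

	for (n = 0; n < count; n++)
		size += (tags[n] == ACL_USER || tags[n] == ACL_GROUP)
			? ENTRY_LONG : ENTRY_SHORT;
	return size;
}

int main(void)
{
	/* u::rwx,u:1000:r-x,g::r-x,m::r-x,o::r-- */
	int tags[] = { ACL_USER_OBJ, ACL_USER, ACL_GROUP_OBJ, ACL_MASK, ACL_OTHER };

	printf("%zu bytes on disk\n", acl_disk_size(tags, 5)); /* 4 + 8 + 4*4 = 28 */
	return 0;
}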
130
131/*
132 * Convert from in-memory to filesystem representation.
133 */
134static void *
135posix_acl_to_disk(const struct posix_acl *acl, size_t *size)
136{
137 reiserfs_acl_header *ext_acl;
138 char *e;
139 int n;
140
141 *size = reiserfs_acl_size(acl->a_count);
142 ext_acl = (reiserfs_acl_header *)kmalloc(sizeof(reiserfs_acl_header) +
143 acl->a_count * sizeof(reiserfs_acl_entry), GFP_NOFS);
144 if (!ext_acl)
145 return ERR_PTR(-ENOMEM);
146 ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION);
147 e = (char *)ext_acl + sizeof(reiserfs_acl_header);
148 for (n=0; n < acl->a_count; n++) {
149 reiserfs_acl_entry *entry = (reiserfs_acl_entry *)e;
150 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
151 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
152 switch(acl->a_entries[n].e_tag) {
153 case ACL_USER:
154 case ACL_GROUP:
155 entry->e_id =
156 cpu_to_le32(acl->a_entries[n].e_id);
157 e += sizeof(reiserfs_acl_entry);
158 break;
159
160 case ACL_USER_OBJ:
161 case ACL_GROUP_OBJ:
162 case ACL_MASK:
163 case ACL_OTHER:
164 e += sizeof(reiserfs_acl_entry_short);
165 break;
166
167 default:
168 goto fail;
169 }
170 }
171 return (char *)ext_acl;
172
173fail:
174 kfree(ext_acl);
175 return ERR_PTR(-EINVAL);
176}
177
178/*
179 * Inode operation get_posix_acl().
180 *
181 * inode->i_sem: down
182 * BKL held [before 2.5.x]
183 */
184struct posix_acl *
185reiserfs_get_acl(struct inode *inode, int type)
186{
187 char *name, *value;
188 struct posix_acl *acl, **p_acl;
189 size_t size;
190 int retval;
191 struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
192
193 switch (type) {
194 case ACL_TYPE_ACCESS:
195 name = XATTR_NAME_ACL_ACCESS;
196 p_acl = &reiserfs_i->i_acl_access;
197 break;
198 case ACL_TYPE_DEFAULT:
199 name = XATTR_NAME_ACL_DEFAULT;
200 p_acl = &reiserfs_i->i_acl_default;
201 break;
202 default:
203 return ERR_PTR (-EINVAL);
204 }
205
206 if (IS_ERR (*p_acl)) {
207 if (PTR_ERR (*p_acl) == -ENODATA)
208 return NULL;
209 } else if (*p_acl != NULL)
210 return posix_acl_dup (*p_acl);
211
212 size = reiserfs_xattr_get (inode, name, NULL, 0);
213 if ((int)size < 0) {
214 if (size == -ENODATA || size == -ENOSYS) {
215 *p_acl = ERR_PTR (-ENODATA);
216 return NULL;
217 }
218 return ERR_PTR (size);
219 }
220
221 value = kmalloc (size, GFP_NOFS);
222 if (!value)
223 return ERR_PTR (-ENOMEM);
224
225 retval = reiserfs_xattr_get(inode, name, value, size);
226 if (retval == -ENODATA || retval == -ENOSYS) {
227 /* This shouldn't actually happen as it should have
228	   been caught above, but just in case */
229 acl = NULL;
230 *p_acl = ERR_PTR (-ENODATA);
231 } else if (retval < 0) {
232 acl = ERR_PTR(retval);
233 } else {
234 acl = posix_acl_from_disk(value, retval);
235 *p_acl = posix_acl_dup (acl);
236 }
237
238 kfree(value);
239 return acl;
240}
241
242/*
243 * Inode operation set_posix_acl().
244 *
245 * inode->i_sem: down
246 * BKL held [before 2.5.x]
247 */
248static int
249reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
250{
251 char *name;
252 void *value = NULL;
253 struct posix_acl **p_acl;
254 size_t size;
255 int error;
256 struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
257
258 if (S_ISLNK(inode->i_mode))
259 return -EOPNOTSUPP;
260
261 switch (type) {
262 case ACL_TYPE_ACCESS:
263 name = XATTR_NAME_ACL_ACCESS;
264 p_acl = &reiserfs_i->i_acl_access;
265 if (acl) {
266 mode_t mode = inode->i_mode;
267 error = posix_acl_equiv_mode (acl, &mode);
268 if (error < 0)
269 return error;
270 else {
271 inode->i_mode = mode;
272 if (error == 0)
273 acl = NULL;
274 }
275 }
276 break;
277 case ACL_TYPE_DEFAULT:
278 name = XATTR_NAME_ACL_DEFAULT;
279 p_acl = &reiserfs_i->i_acl_default;
280 if (!S_ISDIR (inode->i_mode))
281 return acl ? -EACCES : 0;
282 break;
283 default:
284 return -EINVAL;
285 }
286
287 if (acl) {
288 value = posix_acl_to_disk(acl, &size);
289 if (IS_ERR(value))
290 return (int)PTR_ERR(value);
291 error = reiserfs_xattr_set(inode, name, value, size, 0);
292 } else {
293 error = reiserfs_xattr_del (inode, name);
294 if (error == -ENODATA) {
295 /* This may seem odd here, but it means that the ACL was set
296 * with a value representable with mode bits. If there was
297 * an ACL before, reiserfs_xattr_del already dirtied the inode.
298 */
299 mark_inode_dirty (inode);
300 error = 0;
301 }
302 }
303
304 if (value)
305 kfree(value);
306
307 if (!error) {
308 /* Release the old one */
309 if (!IS_ERR (*p_acl) && *p_acl)
310 posix_acl_release (*p_acl);
311
312 if (acl == NULL)
313 *p_acl = ERR_PTR (-ENODATA);
314 else
315 *p_acl = posix_acl_dup (acl);
316 }
317
318 return error;
319}
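An access ACL holding only the three mandatory entries (owner, owning group, other) carries nothing beyond the mode bits, which is why a posix_acl_equiv_mode result of 0 lets the ACL_TYPE_ACCESS branch above drop the xattr entirely. A simplified sketch of that equivalence test (the real helper also folds the entries into the mode):

#include <stdio.h>

#define ACL_USER_OBJ	0x01
#define ACL_USER	0x02
#define ACL_GROUP_OBJ	0x04
#define ACL_GROUP	0x08
#define ACL_MASK	0x10
#define ACL_OTHER	0x20

/* An access ACL is representable by mode bits iff it contains no
 * named-user, named-group, or mask entries. */
static int needs_real_acl(const int *tags, int count)
{
	int n;

	for (n = 0; n < count; n++)
		if (tags[n] == ACL_USER || tags[n] == ACL_GROUP ||
		    tags[n] == ACL_MASK)
			return 1;	/* keep the xattr */
	return 0;			/* mode bits suffice; drop the xattr */
}

int main(void)
{
	int trivial[] = { ACL_USER_OBJ, ACL_GROUP_OBJ, ACL_OTHER };
	int rich[] = { ACL_USER_OBJ, ACL_USER, ACL_GROUP_OBJ, ACL_MASK, ACL_OTHER };

	printf("%d %d\n", needs_real_acl(trivial, 3), needs_real_acl(rich, 5));
	return 0;	/* prints "0 1" */
}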
320
321/* dir->i_sem: down,
322 * inode is new and not released into the wild yet */
323int
324reiserfs_inherit_default_acl (struct inode *dir, struct dentry *dentry, struct inode *inode)
325{
326 struct posix_acl *acl;
327 int err = 0;
328
329 /* ACLs only get applied to files and directories */
330 if (S_ISLNK (inode->i_mode))
331 return 0;
332
333 /* ACLs can only be used on "new" objects, so if it's an old object
334 * there is nothing to inherit from */
335 if (get_inode_sd_version (dir) == STAT_DATA_V1)
336 goto apply_umask;
337
338 /* Don't apply ACLs to objects in the .reiserfs_priv tree. This
339 * would be useless since permissions are ignored, and a pain because
340 * it introduces locking cycles */
341 if (is_reiserfs_priv_object (dir)) {
342 reiserfs_mark_inode_private (inode);
343 goto apply_umask;
344 }
345
346 acl = reiserfs_get_acl (dir, ACL_TYPE_DEFAULT);
347 if (IS_ERR (acl)) {
348 if (PTR_ERR (acl) == -ENODATA)
349 goto apply_umask;
350 return PTR_ERR (acl);
351 }
352
353 if (acl) {
354 struct posix_acl *acl_copy;
355 mode_t mode = inode->i_mode;
356 int need_acl;
357
358 /* Copy the default ACL to the default ACL of a new directory */
359 if (S_ISDIR (inode->i_mode)) {
360 err = reiserfs_set_acl (inode, ACL_TYPE_DEFAULT, acl);
361 if (err)
362 goto cleanup;
363 }
364
365 /* Now we reconcile the new ACL and the mode,
366 potentially modifying both */
367 acl_copy = posix_acl_clone (acl, GFP_NOFS);
368 if (!acl_copy) {
369 err = -ENOMEM;
370 goto cleanup;
371 }
372
373
374 need_acl = posix_acl_create_masq (acl_copy, &mode);
375 if (need_acl >= 0) {
376 if (mode != inode->i_mode) {
377 inode->i_mode = mode;
378 }
379
380 /* If we need an ACL.. */
381 if (need_acl > 0) {
382 err = reiserfs_set_acl (inode, ACL_TYPE_ACCESS, acl_copy);
383 if (err)
384 goto cleanup_copy;
385 }
386 }
387cleanup_copy:
388 posix_acl_release (acl_copy);
389cleanup:
390 posix_acl_release (acl);
391 } else {
392apply_umask:
393 /* no ACL, apply umask */
394 inode->i_mode &= ~current->fs->umask;
395 }
396
397 return err;
398}
399
400/* Looks up and caches the result of the default ACL lookup.
401 * We do this so that we don't have to carry xattr_sem into
402 * reiserfs_new_inode when it isn't needed */
403int
404reiserfs_cache_default_acl (struct inode *inode)
405{
406 int ret = 0;
407 if (reiserfs_posixacl (inode->i_sb) &&
408 !is_reiserfs_priv_object (inode)) {
409 struct posix_acl *acl;
410 reiserfs_read_lock_xattr_i (inode);
411 reiserfs_read_lock_xattrs (inode->i_sb);
412 acl = reiserfs_get_acl (inode, ACL_TYPE_DEFAULT);
413 reiserfs_read_unlock_xattrs (inode->i_sb);
414 reiserfs_read_unlock_xattr_i (inode);
415 ret = acl ? 1 : 0;
416 posix_acl_release (acl);
417 }
418
419 return ret;
420}
421
422int
423reiserfs_acl_chmod (struct inode *inode)
424{
425 struct posix_acl *acl, *clone;
426 int error;
427
428 if (S_ISLNK(inode->i_mode))
429 return -EOPNOTSUPP;
430
431 if (get_inode_sd_version (inode) == STAT_DATA_V1 ||
432 !reiserfs_posixacl(inode->i_sb))
433 {
434 return 0;
435 }
436
437 reiserfs_read_lock_xattrs (inode->i_sb);
438 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
439 reiserfs_read_unlock_xattrs (inode->i_sb);
440 if (!acl)
441 return 0;
442 if (IS_ERR(acl))
443 return PTR_ERR(acl);
444 clone = posix_acl_clone(acl, GFP_NOFS);
445 posix_acl_release(acl);
446 if (!clone)
447 return -ENOMEM;
448 error = posix_acl_chmod_masq(clone, inode->i_mode);
449 if (!error) {
450 int lock = !has_xattr_dir (inode);
451 reiserfs_write_lock_xattr_i (inode);
452 if (lock)
453 reiserfs_write_lock_xattrs (inode->i_sb);
454 else
455 reiserfs_read_lock_xattrs (inode->i_sb);
456 error = reiserfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
457 if (lock)
458 reiserfs_write_unlock_xattrs (inode->i_sb);
459 else
460 reiserfs_read_unlock_xattrs (inode->i_sb);
461 reiserfs_write_unlock_xattr_i (inode);
462 }
463 posix_acl_release(clone);
464 return error;
465}
466
467static int
468posix_acl_access_get(struct inode *inode, const char *name,
469 void *buffer, size_t size)
470{
471 if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1)
472 return -EINVAL;
473 return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
474}
475
476static int
477posix_acl_access_set(struct inode *inode, const char *name,
478 const void *value, size_t size, int flags)
479{
480 if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1)
481 return -EINVAL;
482 return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
483}
484
485static int
486posix_acl_access_del (struct inode *inode, const char *name)
487{
488 struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
489 struct posix_acl **acl = &reiserfs_i->i_acl_access;
490 if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1)
491 return -EINVAL;
492 if (!IS_ERR (*acl) && *acl) {
493 posix_acl_release (*acl);
494 *acl = ERR_PTR (-ENODATA);
495 }
496
497 return 0;
498}
499
500static int
501posix_acl_access_list (struct inode *inode, const char *name, int namelen, char *out)
502{
503 int len = namelen;
504 if (!reiserfs_posixacl (inode->i_sb))
505 return 0;
506 if (out)
507 memcpy (out, name, len);
508
509 return len;
510}
511
512struct reiserfs_xattr_handler posix_acl_access_handler = {
513 .prefix = XATTR_NAME_ACL_ACCESS,
514 .get = posix_acl_access_get,
515 .set = posix_acl_access_set,
516 .del = posix_acl_access_del,
517 .list = posix_acl_access_list,
518};
519
520static int
521posix_acl_default_get (struct inode *inode, const char *name,
522 void *buffer, size_t size)
523{
524 if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1)
525 return -EINVAL;
526 return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
527}
528
529static int
530posix_acl_default_set(struct inode *inode, const char *name,
531 const void *value, size_t size, int flags)
532{
533 if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1)
534 return -EINVAL;
535 return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
536}
537
538static int
539posix_acl_default_del (struct inode *inode, const char *name)
540{
541 struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
542 struct posix_acl **acl = &reiserfs_i->i_acl_default;
543 if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1)
544 return -EINVAL;
545 if (!IS_ERR (*acl) && *acl) {
546 posix_acl_release (*acl);
547 *acl = ERR_PTR (-ENODATA);
548 }
549
550 return 0;
551}
552
553static int
554posix_acl_default_list (struct inode *inode, const char *name, int namelen, char *out)
555{
556 int len = namelen;
557 if (!reiserfs_posixacl (inode->i_sb))
558 return 0;
559 if (out)
560 memcpy (out, name, len);
561
562 return len;
563}
564
565struct reiserfs_xattr_handler posix_acl_default_handler = {
566 .prefix = XATTR_NAME_ACL_DEFAULT,
567 .get = posix_acl_default_get,
568 .set = posix_acl_default_set,
569 .del = posix_acl_default_del,
570 .list = posix_acl_default_list,
571};
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
new file mode 100644
index 000000000000..e044d5117117
--- /dev/null
+++ b/fs/reiserfs/xattr_security.c
@@ -0,0 +1,69 @@
1#include <linux/reiserfs_fs.h>
2#include <linux/errno.h>
3#include <linux/fs.h>
4#include <linux/pagemap.h>
5#include <linux/xattr.h>
6#include <linux/reiserfs_xattr.h>
7#include <asm/uaccess.h>
8
9#define XATTR_SECURITY_PREFIX "security."
10
11static int
12security_get (struct inode *inode, const char *name, void *buffer, size_t size)
13{
14 if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
15 return -EINVAL;
16
17 if (is_reiserfs_priv_object(inode))
18 return -EPERM;
19
20 return reiserfs_xattr_get (inode, name, buffer, size);
21}
22
23static int
24security_set (struct inode *inode, const char *name, const void *buffer,
25 size_t size, int flags)
26{
27 if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
28 return -EINVAL;
29
30 if (is_reiserfs_priv_object(inode))
31 return -EPERM;
32
33 return reiserfs_xattr_set (inode, name, buffer, size, flags);
34}
35
36static int
37security_del (struct inode *inode, const char *name)
38{
39 if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
40 return -EINVAL;
41
42 if (is_reiserfs_priv_object(inode))
43 return -EPERM;
44
45 return 0;
46}
47
48static int
49security_list (struct inode *inode, const char *name, int namelen, char *out)
50{
51 int len = namelen;
52
53 if (is_reiserfs_priv_object(inode))
54 return 0;
55
56 if (out)
57 memcpy (out, name, len);
58
59 return len;
60}
61
62
63struct reiserfs_xattr_handler security_handler = {
64 .prefix = XATTR_SECURITY_PREFIX,
65 .get = security_get,
66 .set = security_set,
67 .del = security_del,
68 .list = security_list,
69};
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
new file mode 100644
index 000000000000..43762197fb0a
--- /dev/null
+++ b/fs/reiserfs/xattr_trusted.c
@@ -0,0 +1,81 @@
1#include <linux/reiserfs_fs.h>
2#include <linux/errno.h>
3#include <linux/fs.h>
4#include <linux/pagemap.h>
5#include <linux/xattr.h>
6#include <linux/reiserfs_xattr.h>
7#include <asm/uaccess.h>
8
9#define XATTR_TRUSTED_PREFIX "trusted."
10
11static int
12trusted_get (struct inode *inode, const char *name, void *buffer, size_t size)
13{
14 if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
15 return -EINVAL;
16
17 if (!reiserfs_xattrs (inode->i_sb))
18 return -EOPNOTSUPP;
19
20 if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
21 return -EPERM;
22
23 return reiserfs_xattr_get (inode, name, buffer, size);
24}
25
26static int
27trusted_set (struct inode *inode, const char *name, const void *buffer,
28 size_t size, int flags)
29{
30 if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
31 return -EINVAL;
32
33 if (!reiserfs_xattrs (inode->i_sb))
34 return -EOPNOTSUPP;
35
36 if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
37 return -EPERM;
38
39 return reiserfs_xattr_set (inode, name, buffer, size, flags);
40}
41
42static int
43trusted_del (struct inode *inode, const char *name)
44{
45 if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
46 return -EINVAL;
47
48 if (!reiserfs_xattrs (inode->i_sb))
49 return -EOPNOTSUPP;
50
51 if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
52 return -EPERM;
53
54 return 0;
55}
56
57static int
58trusted_list (struct inode *inode, const char *name, int namelen, char *out)
59{
60 int len = namelen;
61
62 if (!reiserfs_xattrs (inode->i_sb))
63 return 0;
64
65 if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
66 return 0;
67
68 if (out)
69 memcpy (out, name, len);
70
71 return len;
72}
73
74
75struct reiserfs_xattr_handler trusted_handler = {
76 .prefix = XATTR_TRUSTED_PREFIX,
77 .get = trusted_get,
78 .set = trusted_set,
79 .del = trusted_del,
80 .list = trusted_list,
81};
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
new file mode 100644
index 000000000000..0772806466a8
--- /dev/null
+++ b/fs/reiserfs/xattr_user.c
@@ -0,0 +1,99 @@
1#include <linux/reiserfs_fs.h>
2#include <linux/errno.h>
3#include <linux/fs.h>
4#include <linux/pagemap.h>
5#include <linux/xattr.h>
6#include <linux/reiserfs_xattr.h>
7#include <asm/uaccess.h>
8
9#ifdef CONFIG_REISERFS_FS_POSIX_ACL
10# include <linux/reiserfs_acl.h>
11#endif
12
13#define XATTR_USER_PREFIX "user."
14
15static int
16user_get (struct inode *inode, const char *name, void *buffer, size_t size)
17{
18
19 int error;
20
21 if (strlen(name) < sizeof(XATTR_USER_PREFIX))
22 return -EINVAL;
23
24 if (!reiserfs_xattrs_user (inode->i_sb))
25 return -EOPNOTSUPP;
26
27 error = reiserfs_permission_locked (inode, MAY_READ, NULL);
28 if (error)
29 return error;
30
31 return reiserfs_xattr_get (inode, name, buffer, size);
32}
33
34static int
35user_set (struct inode *inode, const char *name, const void *buffer,
36 size_t size, int flags)
37{
38
39 int error;
40
41 if (strlen(name) < sizeof(XATTR_USER_PREFIX))
42 return -EINVAL;
43
44 if (!reiserfs_xattrs_user (inode->i_sb))
45 return -EOPNOTSUPP;
46
47 if (!S_ISREG (inode->i_mode) &&
48 (!S_ISDIR (inode->i_mode) || inode->i_mode & S_ISVTX))
49 return -EPERM;
50
51 error = reiserfs_permission_locked (inode, MAY_WRITE, NULL);
52 if (error)
53 return error;
54
55 return reiserfs_xattr_set (inode, name, buffer, size, flags);
56}
57
58static int
59user_del (struct inode *inode, const char *name)
60{
61 int error;
62
63 if (strlen(name) < sizeof(XATTR_USER_PREFIX))
64 return -EINVAL;
65
66 if (!reiserfs_xattrs_user (inode->i_sb))
67 return -EOPNOTSUPP;
68
69 if (!S_ISREG (inode->i_mode) &&
70 (!S_ISDIR (inode->i_mode) || inode->i_mode & S_ISVTX))
71 return -EPERM;
72
73 error = reiserfs_permission_locked (inode, MAY_WRITE, NULL);
74 if (error)
75 return error;
76
77 return 0;
78}
79
80static int
81user_list (struct inode *inode, const char *name, int namelen, char *out)
82{
83 int len = namelen;
84 if (!reiserfs_xattrs_user (inode->i_sb))
85 return 0;
86
87 if (out)
88 memcpy (out, name, len);
89
90 return len;
91}
92
93struct reiserfs_xattr_handler user_handler = {
94 .prefix = XATTR_USER_PREFIX,
95 .get = user_get,
96 .set = user_set,
97 .del = user_del,
98 .list = user_list,
99};