diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/reiserfs |
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'fs/reiserfs')
-rw-r--r-- | fs/reiserfs/Makefile | 36 | ||||
-rw-r--r-- | fs/reiserfs/README | 161 | ||||
-rw-r--r-- | fs/reiserfs/bitmap.c | 1169 | ||||
-rw-r--r-- | fs/reiserfs/dir.c | 275 | ||||
-rw-r--r-- | fs/reiserfs/do_balan.c | 1597 | ||||
-rw-r--r-- | fs/reiserfs/file.c | 1408 | ||||
-rw-r--r-- | fs/reiserfs/fix_node.c | 2518 | ||||
-rw-r--r-- | fs/reiserfs/hashes.c | 209 | ||||
-rw-r--r-- | fs/reiserfs/ibalance.c | 1058 | ||||
-rw-r--r-- | fs/reiserfs/inode.c | 2846 | ||||
-rw-r--r-- | fs/reiserfs/ioctl.c | 151 | ||||
-rw-r--r-- | fs/reiserfs/item_ops.c | 788 | ||||
-rw-r--r-- | fs/reiserfs/journal.c | 3876 | ||||
-rw-r--r-- | fs/reiserfs/lbalance.c | 1222 | ||||
-rw-r--r-- | fs/reiserfs/namei.c | 1491 | ||||
-rw-r--r-- | fs/reiserfs/objectid.c | 206 | ||||
-rw-r--r-- | fs/reiserfs/prints.c | 727 | ||||
-rw-r--r-- | fs/reiserfs/procfs.c | 664 | ||||
-rw-r--r-- | fs/reiserfs/resize.c | 182 | ||||
-rw-r--r-- | fs/reiserfs/stree.c | 2073 | ||||
-rw-r--r-- | fs/reiserfs/super.c | 2148 | ||||
-rw-r--r-- | fs/reiserfs/tail_conversion.c | 276 | ||||
-rw-r--r-- | fs/reiserfs/xattr.c | 1450 | ||||
-rw-r--r-- | fs/reiserfs/xattr_acl.c | 571 | ||||
-rw-r--r-- | fs/reiserfs/xattr_security.c | 69 | ||||
-rw-r--r-- | fs/reiserfs/xattr_trusted.c | 81 | ||||
-rw-r--r-- | fs/reiserfs/xattr_user.c | 99 |
27 files changed, 27351 insertions, 0 deletions
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile new file mode 100644 index 000000000000..3a59309f3ca9 --- /dev/null +++ b/fs/reiserfs/Makefile | |||
@@ -0,0 +1,36 @@ | |||
1 | # | ||
2 | # Makefile for the linux reiser-filesystem routines. | ||
3 | # | ||
4 | |||
5 | obj-$(CONFIG_REISERFS_FS) += reiserfs.o | ||
6 | |||
7 | reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \ | ||
8 | super.o prints.o objectid.o lbalance.o ibalance.o stree.o \ | ||
9 | hashes.o tail_conversion.o journal.o resize.o \ | ||
10 | item_ops.o ioctl.o procfs.o | ||
11 | |||
12 | ifeq ($(CONFIG_REISERFS_FS_XATTR),y) | ||
13 | reiserfs-objs += xattr.o xattr_user.o xattr_trusted.o | ||
14 | endif | ||
15 | |||
16 | ifeq ($(CONFIG_REISERFS_FS_SECURITY),y) | ||
17 | reiserfs-objs += xattr_security.o | ||
18 | endif | ||
19 | |||
20 | ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y) | ||
21 | reiserfs-objs += xattr_acl.o | ||
22 | endif | ||
23 | |||
24 | # gcc -O2 (the kernel default) is overaggressive on ppc32 when many inline | ||
25 | # functions are used. This causes the compiler to advance the stack | ||
26 | # pointer out of the available stack space, corrupting kernel space, | ||
27 | # and causing a panic. Since this behavior only affects ppc32, this ifeq | ||
28 | # will work around it. If any other architecture displays this behavior, | ||
29 | # add it here. | ||
30 | ifeq ($(CONFIG_PPC32),y) | ||
31 | EXTRA_CFLAGS := -O1 | ||
32 | endif | ||
33 | |||
34 | TAGS: | ||
35 | etags *.c | ||
36 | |||
diff --git a/fs/reiserfs/README b/fs/reiserfs/README new file mode 100644 index 000000000000..90e1670e4e6f --- /dev/null +++ b/fs/reiserfs/README | |||
@@ -0,0 +1,161 @@ | |||
1 | [LICENSING] | ||
2 | |||
3 | ReiserFS is hereby licensed under the GNU General | ||
4 | Public License version 2. | ||
5 | |||
6 | Source code files that contain the phrase "licensing governed by | ||
7 | reiserfs/README" are "governed files" throughout this file. Governed | ||
8 | files are licensed under the GPL. The portions of them owned by Hans | ||
9 | Reiser, or authorized to be licensed by him, have been in the past, | ||
10 | and likely will be in the future, licensed to other parties under | ||
11 | other licenses. If you add your code to governed files, and don't | ||
12 | want it to be owned by Hans Reiser, put your copyright label on that | ||
13 | code so the poor blight and his customers can keep things straight. | ||
14 | All portions of governed files not labeled otherwise are owned by Hans | ||
15 | Reiser, and by adding your code to it, widely distributing it to | ||
16 | others or sending us a patch, and leaving the sentence in stating that | ||
17 | licensing is governed by the statement in this file, you accept this. | ||
18 | It will be a kindness if you identify whether Hans Reiser is allowed | ||
19 | to license code labeled as owned by you on your behalf other than | ||
20 | under the GPL, because he wants to know if it is okay to do so and put | ||
21 | a check in the mail to you (for non-trivial improvements) when he | ||
22 | makes his next sale. He makes no guarantees as to the amount if any, | ||
23 | though he feels motivated to motivate contributors, and you can surely | ||
24 | discuss this with him before or after contributing. You have the | ||
25 | right to decline to allow him to license your code contribution other | ||
26 | than under the GPL. | ||
27 | |||
28 | Further licensing options are available for commercial and/or other | ||
29 | interests directly from Hans Reiser: hans@reiser.to. If you interpret | ||
30 | the GPL as not allowing those additional licensing options, you read | ||
31 | it wrongly, and Richard Stallman agrees with me, when carefully read | ||
32 | you can see that those restrictions on additional terms do not apply | ||
33 | to the owner of the copyright, and my interpretation of this shall | ||
34 | govern for this license. | ||
35 | |||
36 | Finally, nothing in this license shall be interpreted to allow you to | ||
37 | fail to fairly credit me, or to remove my credits, without my | ||
38 | permission, unless you are an end user not redistributing to others. | ||
39 | If you have doubts about how to properly do that, or about what is | ||
40 | fair, ask. (Last I spoke with him Richard was contemplating how best | ||
41 | to address the fair crediting issue in the next GPL version.) | ||
42 | |||
43 | [END LICENSING] | ||
44 | |||
45 | Reiserfs is a file system based on balanced tree algorithms, which is | ||
46 | described at http://devlinux.com/namesys. | ||
47 | |||
48 | Stop reading here. Go there, then return. | ||
49 | |||
50 | Send bug reports to yura@namesys.botik.ru. | ||
51 | |||
52 | mkreiserfs and other utilities are in reiserfs/utils, or wherever your | ||
53 | Linux provider put them. There is some disagreement about how useful | ||
54 | it is for users to get their fsck and mkreiserfs out of sync with the | ||
55 | version of reiserfs that is in their kernel, with many important | ||
56 | distributors wanting them out of sync.:-) Please try to remember to | ||
57 | recompile and reinstall fsck and mkreiserfs with every update of | ||
58 | reiserfs, this is a common source of confusion. Note that some of the | ||
59 | utilities cannot be compiled without accessing the balancing code | ||
60 | which is in the kernel code, and relocating the utilities may require | ||
61 | you to specify where that code can be found. | ||
62 | |||
63 | Yes, if you update your reiserfs kernel module you do have to | ||
64 | recompile your kernel, most of the time. The errors you get will be | ||
65 | quite cryptic if you forget to do so. | ||
66 | |||
67 | Real users, as opposed to folks who want to hack and then understand | ||
68 | what went wrong, will want REISERFS_CHECK off. | ||
69 | |||
70 | Hideous Commercial Pitch: Spread your development costs across other OS | ||
71 | vendors. Select from the best in the world, not the best in your | ||
72 | building, by buying from third party OS component suppliers. Leverage | ||
73 | the software component development power of the internet. Be the most | ||
74 | aggressive in taking advantage of the commercial possibilities of | ||
75 | decentralized internet development, and add value through your branded | ||
76 | integration that you sell as an operating system. Let your competitors | ||
77 | be the ones to compete against the entire internet by themselves. Be | ||
78 | hip, get with the new economic trend, before your competitors do. Send | ||
79 | email to hans@reiser.to. | ||
80 | |||
81 | To understand the code, after reading the website, start reading the | ||
82 | code by reading reiserfs_fs.h first. | ||
83 | |||
84 | Hans Reiser was the project initiator, primary architect, source of all | ||
85 | funding for the first 5.5 years, and one of the programmers. He owns | ||
86 | the copyright. | ||
87 | |||
88 | Vladimir Saveljev was one of the programmers, and he worked long hours | ||
89 | writing the cleanest code. He always made the effort to be the best he | ||
90 | could be, and to make his code the best that it could be. What resulted | ||
91 | was quite remarkable. I don't think that money can ever motivate someone | ||
92 | to work the way he did, he is one of the most selfless men I know. | ||
93 | |||
94 | Yura helps with benchmarking, coding hashes, and block pre-allocation | ||
95 | code. | ||
96 | |||
97 | Anatoly Pinchuk is a former member of our team who worked closely with | ||
98 | Vladimir throughout the project's development. He wrote a quite | ||
99 | substantial portion of the total code. He realized that there was a | ||
100 | space problem with packing tails of files for files larger than a node | ||
101 | that start on a node aligned boundary (there are reasons to want to node | ||
102 | align files), and he invented and implemented indirect items and | ||
103 | unformatted nodes as the solution. | ||
104 | |||
105 | Konstantin Shvachko, with the help of the Russian version of a VC, | ||
106 | tried to put me in a position where I was forced into giving control | ||
107 | of the project to him. (Fortunately, as the person paying the money | ||
108 | for all salaries from my dayjob I owned all copyrights, and you can't | ||
109 | really force takeovers of sole proprietorships.) This was something | ||
110 | curious, because he never really understood the value of our project, | ||
111 | why we should do what we do, or why innovation was possible in | ||
112 | general, but he was sure that he ought to be controlling it. Every | ||
113 | innovation had to be forced past him while he was with us. He added | ||
114 | two years to the time required to complete reiserfs, and was a net | ||
115 | loss for me. Mikhail Gilula was a brilliant innovator who also left | ||
116 | in a destructive way that erased the value of his contributions, and | ||
117 | that he was shown much generosity just makes it more painful. | ||
118 | |||
119 | Grigory Zaigralin was an extremely effective system administrator for | ||
120 | our group. | ||
121 | |||
122 | Igor Krasheninnikov was wonderful at hardware procurement, repair, and | ||
123 | network installation. | ||
124 | |||
125 | Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a | ||
126 | textbook he got the algorithm from in the code. Note that his analysis | ||
127 | of how we could use the hashing code in making 32 bit NFS cookies work | ||
128 | was probably more important than the actual algorithm. Colin Plumb also | ||
129 | contributed to it. | ||
130 | |||
131 | Chris Mason dived right into our code, and in just a few months produced | ||
132 | the journaling code that dramatically increased the value of ReiserFS. | ||
133 | He is just an amazing programmer. | ||
134 | |||
135 | Igor Zagorovsky is writing much of the new item handler and extent code | ||
136 | for our next major release. | ||
137 | |||
138 | Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the | ||
139 | resizer, and is hard at work on implementing allocate on flush. SGI | ||
140 | implemented allocate on flush before us for XFS, and generously took | ||
141 | the time to convince me we should do it also. They are great people, | ||
142 | and a great company. | ||
143 | |||
144 | Yuri Shevchuk and Nikita Danilov are doing squid cache optimization. | ||
145 | |||
146 | Vitaly Fertman is doing fsck. | ||
147 | |||
148 | Jeff Mahoney, of SuSE, contributed a few cleanup fixes, most notably | ||
149 | the endian safe patches which allow ReiserFS to run on any platform | ||
150 | supported by the Linux kernel. | ||
151 | |||
152 | SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the | ||
153 | Alpha PC Company made it possible for me to not have a day job | ||
154 | anymore, and to dramatically increase our staffing. Ecila funded | ||
155 | hypertext feature development, MP3.com funded journaling, SuSE funded | ||
156 | core development, IntegratedLinux.com funded squid web cache | ||
157 | appliances, bigstorage.com funded HSM, and the alpha PC company funded | ||
158 | the alpha port. Many of these tasks were helped by sponsors other | ||
159 | than the ones just named. SuSE has helped in much more than just | ||
160 | funding.... | ||
161 | |||
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c new file mode 100644 index 000000000000..a4e2ed544bbe --- /dev/null +++ b/fs/reiserfs/bitmap.c | |||
@@ -0,0 +1,1169 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | */ | ||
4 | /* Reiserfs block (de)allocator, bitmap-based. */ | ||
5 | |||
6 | #include <linux/config.h> | ||
7 | #include <linux/time.h> | ||
8 | #include <linux/reiserfs_fs.h> | ||
9 | #include <linux/errno.h> | ||
10 | #include <linux/buffer_head.h> | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/pagemap.h> | ||
13 | #include <linux/reiserfs_fs_sb.h> | ||
14 | #include <linux/reiserfs_fs_i.h> | ||
15 | #include <linux/quotaops.h> | ||
16 | |||
17 | #define PREALLOCATION_SIZE 9 | ||
18 | |||
19 | /* Block allocator options: bit numbers inside SB_ALLOC_OPTS(s), plus helpers that set and test them. */ | ||
20 | |||
21 | #define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits) /* the option bit word, read from REISERFS_SB(s)->s_alloc_options */ | ||
22 | |||
23 | #define _ALLOC_concentrating_formatted_nodes 0 | ||
24 | #define _ALLOC_displacing_large_files 1 | ||
25 | #define _ALLOC_displacing_new_packing_localities 2 | ||
26 | #define _ALLOC_old_hashed_relocation 3 | ||
27 | #define _ALLOC_new_hashed_relocation 4 | ||
28 | #define _ALLOC_skip_busy 5 | ||
29 | #define _ALLOC_displace_based_on_dirid 6 | ||
30 | #define _ALLOC_hashed_formatted_nodes 7 | ||
31 | #define _ALLOC_old_way 8 | ||
32 | #define _ALLOC_hundredth_slices 9 | ||
33 | #define _ALLOC_dirid_groups 10 | ||
34 | #define _ALLOC_oid_groups 11 | ||
35 | #define _ALLOC_packing_groups 12 | ||
36 | |||
37 | #define concentrating_formatted_nodes(s) test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s)) | ||
38 | #define displacing_large_files(s) test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s)) | ||
39 | #define displacing_new_packing_localities(s) test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s)) | ||
40 | |||
41 | #define SET_OPTION(optname) \ | ||
42 | do { \ | ||
43 | reiserfs_warning(s, "reiserfs: option \"%s\" is set", #optname); \ | ||
44 | set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \ | ||
45 | } while(0) /* SET_OPTION logs the option name and sets its bit; it takes no superblock argument and uses a variable named `s` from the expanding scope */ | ||
46 | #define TEST_OPTION(optname, s) \ | ||
47 | test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)) /* TEST_OPTION: test_bit() on the _ALLOC_<optname> bit for superblock s */ | ||
48 | |||
49 | static inline void get_bit_address (struct super_block * s, | ||
50 | b_blocknr_t block, int * bmap_nr, int * offset) | ||
51 | { /* Split block number `block` into a bitmap block index (*bmap_nr) and a bit offset inside that bitmap block (*offset). */ | ||
52 | /* The bitmap index is the block number divided by the number | ||
53 | * of bits in one block, i.e. (s->s_blocksize << 3). */ | ||
54 | *bmap_nr = block / (s->s_blocksize << 3); | ||
55 | /* The bit offset is taken with a mask of (bits per block - 1); this equals the remainder only if that count is a power of two - presumably always true, confirm. */ | ||
56 | *offset = block & ((s->s_blocksize << 3) - 1 ); | ||
57 | return; | ||
58 | } | ||
59 | |||
60 | #ifdef CONFIG_REISERFS_CHECK | ||
61 | int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value) | ||
62 | { /* Sanity check compiled only under CONFIG_REISERFS_CHECK: returns 1 if `block` passes every test below with its bitmap bit expected to equal bit_value; otherwise logs a warning and returns 0. */ | ||
63 | int i, j; | ||
64 | |||
65 | if (block == 0 || block >= SB_BLOCK_COUNT (s)) { /* block 0 and anything at or past the block count are rejected */ | ||
66 | reiserfs_warning (s, "vs-4010: is_reusable: block number is out of range %lu (%u)", | ||
67 | block, SB_BLOCK_COUNT (s)); | ||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | /* the block must not be one of the bitmap blocks themselves */ | ||
72 | for (i = 0; i < SB_BMAP_NR (s); i ++) | ||
73 | if (block == SB_AP_BITMAP (s)[i].bh->b_blocknr) { | ||
74 | reiserfs_warning (s, "vs: 4020: is_reusable: " | ||
75 | "bitmap block %lu(%u) can't be freed or reused", | ||
76 | block, SB_BMAP_NR (s)); | ||
77 | return 0; | ||
78 | } | ||
79 | |||
80 | get_bit_address (s, block, &i, &j); /* from here on i is the bitmap index and j the bit offset of `block` */ | ||
81 | |||
82 | if (i >= SB_BMAP_NR (s)) { | ||
83 | reiserfs_warning (s, "vs-4030: is_reusable: there is no so many bitmap blocks: " | ||
84 | "block=%lu, bitmap_nr=%d", block, i); | ||
85 | return 0; | ||
86 | } | ||
87 | |||
88 | if ((bit_value == 0 && /* expected clear but the bit is set, or ... */ | ||
89 | reiserfs_test_le_bit(j, SB_AP_BITMAP(s)[i].bh->b_data)) || | ||
90 | (bit_value == 1 && /* ... expected set but the bit is clear */ | ||
91 | reiserfs_test_le_bit(j, SB_AP_BITMAP (s)[i].bh->b_data) == 0)) { | ||
92 | reiserfs_warning (s, "vs-4040: is_reusable: corresponding bit of block %lu does not " | ||
93 | "match required value (i==%d, j==%d) test_bit==%d", | ||
94 | block, i, j, reiserfs_test_le_bit (j, SB_AP_BITMAP (s)[i].bh->b_data)); | ||
95 | |||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | if (bit_value == 0 && block == SB_ROOT_BLOCK (s)) { /* the root block is never acceptable with an expected-clear bit */ | ||
100 | reiserfs_warning (s, "vs-4050: is_reusable: this is root block (%u), " | ||
101 | "it must be busy", SB_ROOT_BLOCK (s)); | ||
102 | return 0; | ||
103 | } | ||
104 | |||
105 | return 1; | ||
106 | } | ||
107 | #endif /* CONFIG_REISERFS_CHECK */ | ||
108 | |||
109 | /* Asks reiserfs_in_journal() about the block at (bmap, off). If it reports the block, stores the | ||
110 | next bit offset to try in *next and returns 1; otherwise returns 0 and leaves *next untouched. */ | ||
111 | static inline int is_block_in_journal (struct super_block * s, int bmap, int | ||
112 | off, int *next) | ||
113 | { | ||
114 | b_blocknr_t tmp; | ||
115 | |||
116 | if (reiserfs_in_journal (s, bmap, off, 1, &tmp)) { | ||
117 | if (tmp) { /* hint supplied */ | ||
118 | *next = tmp; | ||
119 | PROC_INFO_INC( s, scan_bitmap.in_journal_hint ); | ||
120 | } else { | ||
121 | (*next) = off + 1; /* no hint: step one bit forward so the caller does not loop on the same offset. */ | ||
122 | PROC_INFO_INC( s, scan_bitmap.in_journal_nohint ); | ||
123 | } | ||
124 | PROC_INFO_INC( s, scan_bitmap.retry ); | ||
125 | return 1; | ||
126 | } | ||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | /* Searches bitmap block bmap_n, from bit *beg up to (not including) bit `boundary`, for a window of | ||
131 | * zero bits at least `min` and at most `max` long, and marks the window used. */ | ||
132 | static int scan_bitmap_block (struct reiserfs_transaction_handle *th, | ||
133 | int bmap_n, int *beg, int boundary, int min, int max, int unfm) | ||
134 | { /* Returns the number of bits allocated (with *beg left at the first allocated bit), or 0 if no window was found; when unfm is non-zero, bits reported by is_block_in_journal() are skipped. */ | ||
135 | struct super_block *s = th->t_super; | ||
136 | struct reiserfs_bitmap_info *bi=&SB_AP_BITMAP(s)[bmap_n]; | ||
137 | int end, next; | ||
138 | int org = *beg; | ||
139 | |||
140 | BUG_ON (!th->t_trans_id); | ||
141 | |||
142 | RFALSE(bmap_n >= SB_BMAP_NR (s), "Bitmap %d is out of range (0..%d)",bmap_n, SB_BMAP_NR (s) - 1); | ||
143 | PROC_INFO_INC( s, scan_bitmap.bmap ); | ||
144 | /* this is unclear and lacks comments, explain how journal bitmaps | ||
145 | work here for the reader. Convey a sense of the design here. What | ||
146 | is a window? */ | ||
147 | /* - I mean `a window of zero bits' as in description of this function - Zam. */ | ||
148 | |||
149 | if ( !bi ) { /* NOTE(review): bi is the address of an array element, so this looks unable to catch a missing entry - confirm */ | ||
150 | reiserfs_warning (s, "NULL bitmap info pointer for bitmap %d", bmap_n); | ||
151 | return 0; | ||
152 | } | ||
153 | if (buffer_locked (bi->bh)) { | ||
154 | PROC_INFO_INC( s, scan_bitmap.wait ); | ||
155 | __wait_on_buffer (bi->bh); | ||
156 | } | ||
157 | |||
158 | while (1) { | ||
159 | cont: | ||
160 | if (bi->free_count < min) | ||
161 | return 0; // Fewer than `min` free blocks are recorded for this bitmap | ||
162 | |||
163 | /* search for a first zero bit -- beginning of a window */ | ||
164 | *beg = reiserfs_find_next_zero_le_bit | ||
165 | ((unsigned long*)(bi->bh->b_data), boundary, *beg); | ||
166 | |||
167 | if (*beg + min > boundary) { /* search for a zero bit fails or the rest of bitmap block | ||
168 | * cannot contain a zero window of minimum size */ | ||
169 | return 0; | ||
170 | } | ||
171 | |||
172 | if (unfm && is_block_in_journal(s,bmap_n, *beg, beg)) /* on a hit *beg has already been advanced by the helper */ | ||
173 | continue; | ||
174 | /* first zero bit found; we check next bits */ | ||
175 | for (end = *beg + 1;; end ++) { | ||
176 | if (end >= *beg + max || end >= boundary || reiserfs_test_le_bit (end, bi->bh->b_data)) { | ||
177 | next = end; | ||
178 | break; | ||
179 | } | ||
180 | /* finding the other end of zero bit window requires looking into journal structures (in | ||
181 | * case of searching for free blocks for unformatted nodes) */ | ||
182 | if (unfm && is_block_in_journal(s, bmap_n, end, &next)) | ||
183 | break; | ||
184 | } | ||
185 | |||
186 | /* now (*beg) points to beginning of zero bits window, | ||
187 | * (end) points to one bit after the window end */ | ||
188 | if (end - *beg >= min) { /* it seems we have found window of proper size */ | ||
189 | int i; | ||
190 | reiserfs_prepare_for_journal (s, bi->bh, 1); | ||
191 | /* try to mark every block in the window used, checking that each is still free */ | ||
192 | for (i = *beg; i < end; i++) { | ||
193 | /* It seems that we should not check in journal again. */ | ||
194 | if (reiserfs_test_and_set_le_bit (i, bi->bh->b_data)) { | ||
195 | /* bit was set by another process | ||
196 | * while we slept in prepare_for_journal() */ | ||
197 | PROC_INFO_INC( s, scan_bitmap.stolen ); | ||
198 | if (i >= *beg + min) { /* we can continue with smaller set of allocated blocks, | ||
199 | * if length of this set is more or equal to `min' */ | ||
200 | end = i; | ||
201 | break; | ||
202 | } | ||
203 | /* otherwise we clear all the bits that were set ... */ | ||
204 | while (--i >= *beg) | ||
205 | reiserfs_test_and_clear_le_bit (i, bi->bh->b_data); | ||
206 | reiserfs_restore_prepared_buffer (s, bi->bh); | ||
207 | *beg = org; | ||
208 | /* ... and search again in current block from the original starting bit */ | ||
209 | goto cont; | ||
210 | } | ||
211 | } | ||
212 | bi->free_count -= (end - *beg); | ||
213 | journal_mark_dirty (th, s, bi->bh); | ||
214 | |||
215 | /* subtract the allocated blocks from the super block's free block count */ | ||
216 | reiserfs_prepare_for_journal (s, SB_BUFFER_WITH_SB(s), 1); | ||
217 | PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg)); | ||
218 | journal_mark_dirty (th, s, SB_BUFFER_WITH_SB(s)); | ||
219 | |||
220 | return end - (*beg); | ||
221 | } else { | ||
222 | *beg = next; /* window too short: resume the search at the position recorded when the window ended */ | ||
223 | } | ||
224 | } | ||
225 | } | ||
226 | |||
227 | static int bmap_hash_id(struct super_block *s, u32 id) { /* Maps an id to a bitmap block index: ids <= 2 go to bitmap 1, other ids are hashed; the result is 0 only when it would otherwise be out of range. */ | ||
228 | char * hash_in = NULL; | ||
229 | unsigned long hash; | ||
230 | unsigned bm; | ||
231 | |||
232 | if (id <= 2) { | ||
233 | bm = 1; | ||
234 | } else { | ||
235 | hash_in = (char *)(&id); /* hash the 4 raw bytes of id */ | ||
236 | hash = keyed_hash(hash_in, 4); | ||
237 | bm = hash % SB_BMAP_NR(s); | ||
238 | if (!bm) /* a hashed id never selects bitmap 0 here */ | ||
239 | bm = 1; | ||
240 | } | ||
241 | /* this can only be true when SB_BMAP_NR = 1 */ | ||
242 | if (bm >= SB_BMAP_NR(s)) | ||
243 | bm = 0; | ||
244 | return bm; | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * hashes the id with bmap_hash_id() and returns 0 if more than 60% of the bits | ||
249 | * in the selected bitmap block are counted as free, 1 otherwise | ||
250 | */ | ||
251 | static inline int block_group_used(struct super_block *s, u32 id) { | ||
252 | int bm; | ||
253 | bm = bmap_hash_id(s, id); | ||
254 | if (SB_AP_BITMAP(s)[bm].free_count > ((s->s_blocksize << 3) * 60 / 100) ) { /* (s_blocksize << 3) is the number of bits in one bitmap block */ | ||
255 | return 0; | ||
256 | } | ||
257 | return 1; | ||
258 | } | ||
259 | |||
260 | /* | ||
261 | * picks a packing value from dir's key; it is returned in disk byte order (the key field is copied without conversion) | ||
262 | */ | ||
263 | u32 reiserfs_choose_packing(struct inode *dir) { | ||
264 | u32 packing; | ||
265 | if (TEST_OPTION(packing_groups, dir->i_sb)) { | ||
266 | u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id); /* converted to cpu order only for the tests below */ | ||
267 | /* | ||
268 | * some versions of reiserfsck expect packing locality 1 to be | ||
269 | * special | ||
270 | */ | ||
271 | if (parent_dir == 1 || block_group_used(dir->i_sb,parent_dir)) /* use dir's own objectid when its k_dir_id is 1 or that group counts as used */ | ||
272 | packing = INODE_PKEY(dir)->k_objectid; | ||
273 | else | ||
274 | packing = INODE_PKEY(dir)->k_dir_id; | ||
275 | } else | ||
276 | packing = INODE_PKEY(dir)->k_objectid; /* packing_groups option off: always dir's objectid */ | ||
277 | return packing; | ||
278 | } | ||
279 | |||
280 | /* Tries to find a contiguous window of zero bits (min..max long) in the bitmap region from *start to finish | ||
281 | * and allocate it. Returns the number of allocated blocks; *start is updated before returning from label ret. */ | ||
282 | static int scan_bitmap (struct reiserfs_transaction_handle *th, | ||
283 | b_blocknr_t *start, b_blocknr_t finish, | ||
284 | int min, int max, int unfm, unsigned long file_block) | ||
285 | { | ||
286 | int nr_allocated=0; | ||
287 | struct super_block * s = th->t_super; | ||
288 | /* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr | ||
289 | * - Hans, it is not a block number - Zam. */ | ||
290 | |||
291 | int bm, off; | ||
292 | int end_bm, end_off; | ||
293 | int off_max = s->s_blocksize << 3; | ||
294 | |||
295 | BUG_ON (!th->t_trans_id); | ||
296 | |||
297 | PROC_INFO_INC( s, scan_bitmap.call ); | ||
298 | if ( SB_FREE_BLOCKS(s) <= 0) | ||
299 | return 0; // No point in looking for more free blocks | ||
300 | |||
301 | get_bit_address (s, *start, &bm, &off); | ||
302 | get_bit_address (s, finish, &end_bm, &end_off); | ||
303 | if (bm > SB_BMAP_NR(s)) /* NOTE(review): bm == SB_BMAP_NR(s) passes this test and reaches the final scan_bitmap_block() call below - confirm callers never pass such a start */ | ||
304 | return 0; | ||
305 | if (end_bm > SB_BMAP_NR(s)) | ||
306 | end_bm = SB_BMAP_NR(s); | ||
307 | |||
308 | /* First pass, taken only with the skip_busy option and while more than | ||
309 | * 1/20 of all blocks are free: a bitmap is scanned only if more than | ||
310 | * 10% of its bits are free, or if the scan starts inside it (off != 0) | ||
311 | * and the request is not the first block of an unformatted file. | ||
312 | * | ||
313 | * We do this so that files that grow later still have space close to | ||
314 | * their original allocation. This improves locality, and presumably | ||
315 | * performance as a result. | ||
316 | * | ||
317 | * This is only an allocation policy and does not make up for getting a | ||
318 | * bad hint. Decent hinting must be implemented for this to work well. | ||
319 | */ | ||
320 | if ( TEST_OPTION(skip_busy, s) && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s)/20 ) { | ||
321 | for (;bm < end_bm; bm++, off = 0) { | ||
322 | if ( ( off && (!unfm || (file_block != 0))) || SB_AP_BITMAP(s)[bm].free_count > (s->s_blocksize << 3) / 10 ) | ||
323 | nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm); | ||
324 | if (nr_allocated) | ||
325 | goto ret; | ||
326 | } | ||
327 | /* nothing found: restart from *start for the unrestricted pass below */ | ||
328 | get_bit_address (s, *start, &bm, &off); | ||
329 | } | ||
330 | |||
331 | for (;bm < end_bm; bm++, off = 0) { /* second pass: every bitmap before end_bm, with no free-space threshold */ | ||
332 | nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm); | ||
333 | if (nr_allocated) | ||
334 | goto ret; | ||
335 | } | ||
336 | |||
337 | nr_allocated = scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm); /* last bitmap: scan only up to and including bit end_off */ | ||
338 | |||
339 | ret: | ||
340 | *start = bm * off_max + off; /* convert (bitmap index, bit offset) back into a block number */ | ||
341 | return nr_allocated; | ||
342 | |||
343 | } | ||
344 | |||
345 | static void _reiserfs_free_block (struct reiserfs_transaction_handle *th, | ||
346 | struct inode *inode, b_blocknr_t block, | ||
347 | int for_unformatted) | ||
348 | { /* Clears the bitmap bit of `block`, bumps the per-bitmap and super block free counters, and journals both buffers; if for_unformatted is non-zero one block is also released via DQUOT_FREE_BLOCK_NODIRTY on inode. */ | ||
349 | struct super_block * s = th->t_super; | ||
350 | struct reiserfs_super_block * rs; | ||
351 | struct buffer_head * sbh; | ||
352 | struct reiserfs_bitmap_info *apbi; | ||
353 | int nr, offset; | ||
354 | |||
355 | BUG_ON (!th->t_trans_id); | ||
356 | |||
357 | PROC_INFO_INC( s, free_block ); | ||
358 | |||
359 | rs = SB_DISK_SUPER_BLOCK (s); | ||
360 | sbh = SB_BUFFER_WITH_SB (s); | ||
361 | apbi = SB_AP_BITMAP(s); | ||
362 | |||
363 | get_bit_address (s, block, &nr, &offset); | ||
364 | |||
365 | if (nr >= sb_bmap_nr (rs)) { /* bitmap index beyond the count read from the super block: warn and do nothing */ | ||
366 | reiserfs_warning (s, "vs-4075: reiserfs_free_block: " | ||
367 | "block %lu is out of range on %s", | ||
368 | block, reiserfs_bdevname (s)); | ||
369 | return; | ||
370 | } | ||
371 | |||
372 | reiserfs_prepare_for_journal(s, apbi[nr].bh, 1 ) ; | ||
373 | |||
374 | /* clear bit for the given block in bit map; a bit that was already clear is only warned about */ | ||
375 | if (!reiserfs_test_and_clear_le_bit (offset, apbi[nr].bh->b_data)) { | ||
376 | reiserfs_warning (s, "vs-4080: reiserfs_free_block: " | ||
377 | "free_block (%s:%lu)[dev:blocknr]: bit already cleared", | ||
378 | reiserfs_bdevname (s), block); | ||
379 | } | ||
380 | apbi[nr].free_count ++; /* NOTE(review): incremented even when the bit was already clear - confirm this is intended */ | ||
381 | journal_mark_dirty (th, s, apbi[nr].bh); | ||
382 | |||
383 | reiserfs_prepare_for_journal(s, sbh, 1) ; | ||
384 | /* update super block */ | ||
385 | set_sb_free_blocks( rs, sb_free_blocks(rs) + 1 ); | ||
386 | |||
387 | journal_mark_dirty (th, s, sbh); | ||
388 | if (for_unformatted) | ||
389 | DQUOT_FREE_BLOCK_NODIRTY(inode, 1); | ||
390 | } | ||
391 | |||
392 | void reiserfs_free_block (struct reiserfs_transaction_handle *th, | ||
393 | struct inode *inode, b_blocknr_t block, | ||
394 | int for_unformatted) | ||
395 | { /* Frees `block`: records it with journal_mark_freed() and then releases it through _reiserfs_free_block(). */ | ||
396 | struct super_block * s = th->t_super; | ||
397 | |||
398 | BUG_ON (!th->t_trans_id); | ||
399 | |||
400 | RFALSE(!s, "vs-4061: trying to free block on nonexistent device"); | ||
401 | RFALSE(is_reusable (s, block, 1) == 0, "vs-4071: can not free such block"); | ||
402 | /* mark it before we clear it, just in case */ | ||
403 | journal_mark_freed(th, s, block) ; | ||
404 | _reiserfs_free_block(th, inode, block, for_unformatted) ; | ||
405 | } | ||
406 | |||
407 | /* Frees one preallocated block. Preallocated blocks don't need to be run through journal_mark_freed, so this goes straight to _reiserfs_free_block(). */ | ||
408 | static void reiserfs_free_prealloc_block (struct reiserfs_transaction_handle *th, | ||
409 | struct inode *inode, b_blocknr_t block) { | ||
410 | RFALSE(!th->t_super, "vs-4060: trying to free block on nonexistent device"); | ||
411 | RFALSE(is_reusable (th->t_super, block, 1) == 0, "vs-4070: can not free such block"); | ||
412 | BUG_ON (!th->t_trans_id); | ||
413 | _reiserfs_free_block(th, inode, block, 1) ; /* always passes for_unformatted = 1 */ | ||
414 | } | ||
415 | |||
416 | static void __discard_prealloc (struct reiserfs_transaction_handle * th, | ||
417 | struct reiserfs_inode_info *ei) | ||
418 | { /* Frees every block still preallocated for ei's inode, updates the stat data if anything was freed, and unlinks ei from the prealloc list. */ | ||
419 | unsigned long save = ei->i_prealloc_block ; | ||
420 | int dirty = 0; | ||
421 | struct inode *inode = &ei->vfs_inode; | ||
422 | BUG_ON (!th->t_trans_id); | ||
423 | #ifdef CONFIG_REISERFS_CHECK | ||
424 | if (ei->i_prealloc_count < 0) | ||
425 | reiserfs_warning (th->t_super, "zam-4001:%s: inode has negative prealloc blocks count.", __FUNCTION__ ); | ||
426 | #endif | ||
427 | while (ei->i_prealloc_count > 0) { /* free consecutive blocks starting at i_prealloc_block */ | ||
428 | reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block); | ||
429 | ei->i_prealloc_block++; | ||
430 | ei->i_prealloc_count --; | ||
431 | dirty = 1; | ||
432 | } | ||
433 | if (dirty) | ||
434 | reiserfs_update_sd(th, inode); | ||
435 | ei->i_prealloc_block = save; /* put back the starting block number that the loop advanced */ | ||
436 | list_del_init(&(ei->i_prealloc_list)); | ||
437 | } | ||
438 | |||
439 | /* Discards inode's preallocated blocks, if it has any. FIXME: It should be inline function */ | ||
440 | void reiserfs_discard_prealloc (struct reiserfs_transaction_handle *th, | ||
441 | struct inode *inode) | ||
442 | { | ||
443 | struct reiserfs_inode_info *ei = REISERFS_I(inode); | ||
444 | BUG_ON (!th->t_trans_id); | ||
445 | if (ei->i_prealloc_count) /* nothing to do when the count is zero */ | ||
446 | __discard_prealloc(th, ei); | ||
447 | } | ||
448 | |||
449 | void reiserfs_discard_all_prealloc (struct reiserfs_transaction_handle *th) | ||
450 | { /* Discards the preallocated blocks of every inode on the journal's j_prealloc_list. */ | ||
451 | struct list_head * plist = &SB_JOURNAL(th->t_super)->j_prealloc_list; | ||
452 | |||
453 | BUG_ON (!th->t_trans_id); | ||
454 | |||
455 | while (!list_empty(plist)) { /* __discard_prealloc() unlinks the entry, so the list shrinks on every pass */ | ||
456 | struct reiserfs_inode_info *ei; | ||
457 | ei = list_entry(plist->next, struct reiserfs_inode_info, i_prealloc_list); | ||
458 | #ifdef CONFIG_REISERFS_CHECK | ||
459 | if (!ei->i_prealloc_count) { | ||
460 | reiserfs_warning (th->t_super, "zam-4001:%s: inode is in prealloc list but has no preallocated blocks.", __FUNCTION__); | ||
461 | } | ||
462 | #endif | ||
463 | __discard_prealloc(th, ei); | ||
464 | } | ||
465 | } | ||
466 | |||
467 | void reiserfs_init_alloc_options (struct super_block *s) | ||
468 | { | ||
469 | set_bit (_ALLOC_skip_busy, &SB_ALLOC_OPTS(s)); | ||
470 | set_bit (_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s)); | ||
471 | set_bit (_ALLOC_packing_groups, &SB_ALLOC_OPTS(s)); | ||
472 | } | ||
473 | |||
474 | /* block allocator related options are parsed here */ | ||
475 | int reiserfs_parse_alloc_options(struct super_block * s, char * options) | ||
476 | { | ||
477 | char * this_char, * value; | ||
478 | |||
479 | REISERFS_SB(s)->s_alloc_options.bits = 0; /* clear default settings */ | ||
480 | |||
481 | while ( (this_char = strsep (&options, ":")) != NULL ) { | ||
482 | if ((value = strchr (this_char, '=')) != NULL) | ||
483 | *value++ = 0; | ||
484 | |||
485 | if (!strcmp(this_char, "concentrating_formatted_nodes")) { | ||
486 | int temp; | ||
487 | SET_OPTION(concentrating_formatted_nodes); | ||
488 | temp = (value && *value) ? simple_strtoul (value, &value, 0) : 10; | ||
489 | if (temp <= 0 || temp > 100) { | ||
490 | REISERFS_SB(s)->s_alloc_options.border = 10; | ||
491 | } else { | ||
492 | REISERFS_SB(s)->s_alloc_options.border = 100 / temp; | ||
493 | } | ||
494 | continue; | ||
495 | } | ||
496 | if (!strcmp(this_char, "displacing_large_files")) { | ||
497 | SET_OPTION(displacing_large_files); | ||
498 | REISERFS_SB(s)->s_alloc_options.large_file_size = | ||
499 | (value && *value) ? simple_strtoul (value, &value, 0) : 16; | ||
500 | continue; | ||
501 | } | ||
502 | if (!strcmp(this_char, "displacing_new_packing_localities")) { | ||
503 | SET_OPTION(displacing_new_packing_localities); | ||
504 | continue; | ||
505 | }; | ||
506 | |||
507 | if (!strcmp(this_char, "old_hashed_relocation")) { | ||
508 | SET_OPTION(old_hashed_relocation); | ||
509 | continue; | ||
510 | } | ||
511 | |||
512 | if (!strcmp(this_char, "new_hashed_relocation")) { | ||
513 | SET_OPTION(new_hashed_relocation); | ||
514 | continue; | ||
515 | } | ||
516 | |||
517 | if (!strcmp(this_char, "dirid_groups")) { | ||
518 | SET_OPTION(dirid_groups); | ||
519 | continue; | ||
520 | } | ||
521 | if (!strcmp(this_char, "oid_groups")) { | ||
522 | SET_OPTION(oid_groups); | ||
523 | continue; | ||
524 | } | ||
525 | if (!strcmp(this_char, "packing_groups")) { | ||
526 | SET_OPTION(packing_groups); | ||
527 | continue; | ||
528 | } | ||
529 | if (!strcmp(this_char, "hashed_formatted_nodes")) { | ||
530 | SET_OPTION(hashed_formatted_nodes); | ||
531 | continue; | ||
532 | } | ||
533 | |||
534 | if (!strcmp(this_char, "skip_busy")) { | ||
535 | SET_OPTION(skip_busy); | ||
536 | continue; | ||
537 | } | ||
538 | |||
539 | if (!strcmp(this_char, "hundredth_slices")) { | ||
540 | SET_OPTION(hundredth_slices); | ||
541 | continue; | ||
542 | } | ||
543 | |||
544 | if (!strcmp(this_char, "old_way")) { | ||
545 | SET_OPTION(old_way); | ||
546 | continue; | ||
547 | } | ||
548 | |||
549 | if (!strcmp(this_char, "displace_based_on_dirid")) { | ||
550 | SET_OPTION(displace_based_on_dirid); | ||
551 | continue; | ||
552 | } | ||
553 | |||
554 | if (!strcmp(this_char, "preallocmin")) { | ||
555 | REISERFS_SB(s)->s_alloc_options.preallocmin = | ||
556 | (value && *value) ? simple_strtoul (value, &value, 0) : 4; | ||
557 | continue; | ||
558 | } | ||
559 | |||
560 | if (!strcmp(this_char, "preallocsize")) { | ||
561 | REISERFS_SB(s)->s_alloc_options.preallocsize = | ||
562 | (value && *value) ? simple_strtoul (value, &value, 0) : PREALLOCATION_SIZE; | ||
563 | continue; | ||
564 | } | ||
565 | |||
566 | reiserfs_warning (s, "zam-4001: %s : unknown option - %s", | ||
567 | __FUNCTION__ , this_char); | ||
568 | return 1; | ||
569 | } | ||
570 | |||
571 | reiserfs_warning (s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s)); | ||
572 | return 0; | ||
573 | } | ||
574 | |||
575 | static inline void new_hashed_relocation (reiserfs_blocknr_hint_t * hint) | ||
576 | { | ||
577 | char * hash_in; | ||
578 | if (hint->formatted_node) { | ||
579 | hash_in = (char*)&hint->key.k_dir_id; | ||
580 | } else { | ||
581 | if (!hint->inode) { | ||
582 | //hint->search_start = hint->beg; | ||
583 | hash_in = (char*)&hint->key.k_dir_id; | ||
584 | } else | ||
585 | if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) | ||
586 | hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); | ||
587 | else | ||
588 | hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid); | ||
589 | } | ||
590 | |||
591 | hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); | ||
592 | } | ||
593 | |||
594 | /* | ||
595 | * Relocation based on dirid, hashing them into a given bitmap block | ||
596 | * files. Formatted nodes are unaffected, a seperate policy covers them | ||
597 | */ | ||
598 | static void | ||
599 | dirid_groups (reiserfs_blocknr_hint_t *hint) | ||
600 | { | ||
601 | unsigned long hash; | ||
602 | __u32 dirid = 0; | ||
603 | int bm = 0; | ||
604 | struct super_block *sb = hint->th->t_super; | ||
605 | if (hint->inode) | ||
606 | dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); | ||
607 | else if (hint->formatted_node) | ||
608 | dirid = hint->key.k_dir_id; | ||
609 | |||
610 | if (dirid) { | ||
611 | bm = bmap_hash_id(sb, dirid); | ||
612 | hash = bm * (sb->s_blocksize << 3); | ||
613 | /* give a portion of the block group to metadata */ | ||
614 | if (hint->inode) | ||
615 | hash += sb->s_blocksize/2; | ||
616 | hint->search_start = hash; | ||
617 | } | ||
618 | } | ||
619 | |||
620 | /* | ||
621 | * Relocation based on oid, hashing them into a given bitmap block | ||
622 | * files. Formatted nodes are unaffected, a seperate policy covers them | ||
623 | */ | ||
624 | static void | ||
625 | oid_groups (reiserfs_blocknr_hint_t *hint) | ||
626 | { | ||
627 | if (hint->inode) { | ||
628 | unsigned long hash; | ||
629 | __u32 oid; | ||
630 | __u32 dirid; | ||
631 | int bm; | ||
632 | |||
633 | dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); | ||
634 | |||
635 | /* keep the root dir and it's first set of subdirs close to | ||
636 | * the start of the disk | ||
637 | */ | ||
638 | if (dirid <= 2) | ||
639 | hash = (hint->inode->i_sb->s_blocksize << 3); | ||
640 | else { | ||
641 | oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid); | ||
642 | bm = bmap_hash_id(hint->inode->i_sb, oid); | ||
643 | hash = bm * (hint->inode->i_sb->s_blocksize << 3); | ||
644 | } | ||
645 | hint->search_start = hash; | ||
646 | } | ||
647 | } | ||
648 | |||
649 | /* returns 1 if it finds an indirect item and gets valid hint info | ||
650 | * from it, otherwise 0 | ||
651 | */ | ||
652 | static int get_left_neighbor(reiserfs_blocknr_hint_t *hint) | ||
653 | { | ||
654 | struct path * path; | ||
655 | struct buffer_head * bh; | ||
656 | struct item_head * ih; | ||
657 | int pos_in_item; | ||
658 | __u32 * item; | ||
659 | int ret = 0; | ||
660 | |||
661 | if (!hint->path) /* reiserfs code can call this function w/o pointer to path | ||
662 | * structure supplied; then we rely on supplied search_start */ | ||
663 | return 0; | ||
664 | |||
665 | path = hint->path; | ||
666 | bh = get_last_bh(path); | ||
667 | RFALSE( !bh, "green-4002: Illegal path specified to get_left_neighbor"); | ||
668 | ih = get_ih(path); | ||
669 | pos_in_item = path->pos_in_item; | ||
670 | item = get_item (path); | ||
671 | |||
672 | hint->search_start = bh->b_blocknr; | ||
673 | |||
674 | if (!hint->formatted_node && is_indirect_le_ih (ih)) { | ||
675 | /* for indirect item: go to left and look for the first non-hole entry | ||
676 | in the indirect item */ | ||
677 | if (pos_in_item == I_UNFM_NUM (ih)) | ||
678 | pos_in_item--; | ||
679 | // pos_in_item = I_UNFM_NUM (ih) - 1; | ||
680 | while (pos_in_item >= 0) { | ||
681 | int t=get_block_num(item,pos_in_item); | ||
682 | if (t) { | ||
683 | hint->search_start = t; | ||
684 | ret = 1; | ||
685 | break; | ||
686 | } | ||
687 | pos_in_item --; | ||
688 | } | ||
689 | } | ||
690 | |||
691 | /* does result value fit into specified region? */ | ||
692 | return ret; | ||
693 | } | ||
694 | |||
695 | /* should be, if formatted node, then try to put on first part of the device | ||
696 | specified as number of percent with mount option device, else try to put | ||
697 | on last of device. This is not to say it is good code to do so, | ||
698 | but the effect should be measured. */ | ||
699 | static inline void set_border_in_hint(struct super_block *s, reiserfs_blocknr_hint_t *hint) | ||
700 | { | ||
701 | b_blocknr_t border = SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border; | ||
702 | |||
703 | if (hint->formatted_node) | ||
704 | hint->end = border - 1; | ||
705 | else | ||
706 | hint->beg = border; | ||
707 | } | ||
708 | |||
709 | static inline void displace_large_file(reiserfs_blocknr_hint_t *hint) | ||
710 | { | ||
711 | if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) | ||
712 | hint->search_start = hint->beg + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id), 4) % (hint->end - hint->beg); | ||
713 | else | ||
714 | hint->search_start = hint->beg + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid), 4) % (hint->end - hint->beg); | ||
715 | } | ||
716 | |||
717 | static inline void hash_formatted_node(reiserfs_blocknr_hint_t *hint) | ||
718 | { | ||
719 | char * hash_in; | ||
720 | |||
721 | if (!hint->inode) | ||
722 | hash_in = (char*)&hint->key.k_dir_id; | ||
723 | else if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) | ||
724 | hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); | ||
725 | else | ||
726 | hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid); | ||
727 | |||
728 | hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); | ||
729 | } | ||
730 | |||
731 | static inline int this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t *hint) | ||
732 | { | ||
733 | return hint->block == REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size; | ||
734 | } | ||
735 | |||
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
/*
 * Clear the handle's displace_new_blocks flag and place search_start at a
 * position in [beg, end) hashed from hint->key.k_objectid.
 */
static inline void displace_new_packing_locality(reiserfs_blocknr_hint_t *hint)
{
	char *hash_in = (char *)(&hint->key.k_objectid);

	hint->th->displace_new_blocks = 0;
	hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
}
#endif
745 | |||
746 | static inline int old_hashed_relocation (reiserfs_blocknr_hint_t * hint) | ||
747 | { | ||
748 | b_blocknr_t border; | ||
749 | u32 hash_in; | ||
750 | |||
751 | if (hint->formatted_node || hint->inode == NULL) { | ||
752 | return 0; | ||
753 | } | ||
754 | |||
755 | hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id); | ||
756 | border = hint->beg + (u32) keyed_hash(((char *) (&hash_in)), 4) % (hint->end - hint->beg - 1); | ||
757 | if (border > hint->search_start) | ||
758 | hint->search_start = border; | ||
759 | |||
760 | return 1; | ||
761 | } | ||
762 | |||
763 | static inline int old_way (reiserfs_blocknr_hint_t * hint) | ||
764 | { | ||
765 | b_blocknr_t border; | ||
766 | |||
767 | if (hint->formatted_node || hint->inode == NULL) { | ||
768 | return 0; | ||
769 | } | ||
770 | |||
771 | border = hint->beg + le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end - hint->beg); | ||
772 | if (border > hint->search_start) | ||
773 | hint->search_start = border; | ||
774 | |||
775 | return 1; | ||
776 | } | ||
777 | |||
778 | static inline void hundredth_slices (reiserfs_blocknr_hint_t * hint) | ||
779 | { | ||
780 | struct reiserfs_key * key = &hint->key; | ||
781 | b_blocknr_t slice_start; | ||
782 | |||
783 | slice_start = (keyed_hash((char*)(&key->k_dir_id),4) % 100) * (hint->end / 100); | ||
784 | if ( slice_start > hint->search_start || slice_start + (hint->end / 100) <= hint->search_start) { | ||
785 | hint->search_start = slice_start; | ||
786 | } | ||
787 | } | ||
788 | |||
789 | static void determine_search_start(reiserfs_blocknr_hint_t *hint, | ||
790 | int amount_needed) | ||
791 | { | ||
792 | struct super_block *s = hint->th->t_super; | ||
793 | int unfm_hint; | ||
794 | |||
795 | hint->beg = 0; | ||
796 | hint->end = SB_BLOCK_COUNT(s) - 1; | ||
797 | |||
798 | /* This is former border algorithm. Now with tunable border offset */ | ||
799 | if (concentrating_formatted_nodes(s)) | ||
800 | set_border_in_hint(s, hint); | ||
801 | |||
802 | #ifdef DISPLACE_NEW_PACKING_LOCALITIES | ||
803 | /* whenever we create a new directory, we displace it. At first we will | ||
804 | hash for location, later we might look for a moderately empty place for | ||
805 | it */ | ||
806 | if (displacing_new_packing_localities(s) | ||
807 | && hint->th->displace_new_blocks) { | ||
808 | displace_new_packing_locality(hint); | ||
809 | |||
810 | /* we do not continue determine_search_start, | ||
811 | * if new packing locality is being displaced */ | ||
812 | return; | ||
813 | } | ||
814 | #endif | ||
815 | |||
816 | /* all persons should feel encouraged to add more special cases here and | ||
817 | * test them */ | ||
818 | |||
819 | if (displacing_large_files(s) && !hint->formatted_node | ||
820 | && this_blocknr_allocation_would_make_it_a_large_file(hint)) { | ||
821 | displace_large_file(hint); | ||
822 | return; | ||
823 | } | ||
824 | |||
825 | /* if none of our special cases is relevant, use the left neighbor in the | ||
826 | tree order of the new node we are allocating for */ | ||
827 | if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes,s)) { | ||
828 | hash_formatted_node(hint); | ||
829 | return; | ||
830 | } | ||
831 | |||
832 | unfm_hint = get_left_neighbor(hint); | ||
833 | |||
834 | /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation, | ||
835 | new blocks are displaced based on directory ID. Also, if suggested search_start | ||
836 | is less than last preallocated block, we start searching from it, assuming that | ||
837 | HDD dataflow is faster in forward direction */ | ||
838 | if ( TEST_OPTION(old_way, s)) { | ||
839 | if (!hint->formatted_node) { | ||
840 | if ( !reiserfs_hashed_relocation(s)) | ||
841 | old_way(hint); | ||
842 | else if (!reiserfs_no_unhashed_relocation(s)) | ||
843 | old_hashed_relocation(hint); | ||
844 | |||
845 | if ( hint->inode && hint->search_start < REISERFS_I(hint->inode)->i_prealloc_block) | ||
846 | hint->search_start = REISERFS_I(hint->inode)->i_prealloc_block; | ||
847 | } | ||
848 | return; | ||
849 | } | ||
850 | |||
851 | /* This is an approach proposed by Hans */ | ||
852 | if ( TEST_OPTION(hundredth_slices, s) && ! (displacing_large_files(s) && !hint->formatted_node)) { | ||
853 | hundredth_slices(hint); | ||
854 | return; | ||
855 | } | ||
856 | |||
857 | /* old_hashed_relocation only works on unformatted */ | ||
858 | if (!unfm_hint && !hint->formatted_node && | ||
859 | TEST_OPTION(old_hashed_relocation, s)) | ||
860 | { | ||
861 | old_hashed_relocation(hint); | ||
862 | } | ||
863 | /* new_hashed_relocation works with both formatted/unformatted nodes */ | ||
864 | if ((!unfm_hint || hint->formatted_node) && | ||
865 | TEST_OPTION(new_hashed_relocation, s)) | ||
866 | { | ||
867 | new_hashed_relocation(hint); | ||
868 | } | ||
869 | /* dirid grouping works only on unformatted nodes */ | ||
870 | if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups,s)) | ||
871 | { | ||
872 | dirid_groups(hint); | ||
873 | } | ||
874 | |||
875 | #ifdef DISPLACE_NEW_PACKING_LOCALITIES | ||
876 | if (hint->formatted_node && TEST_OPTION(dirid_groups,s)) | ||
877 | { | ||
878 | dirid_groups(hint); | ||
879 | } | ||
880 | #endif | ||
881 | |||
882 | /* oid grouping works only on unformatted nodes */ | ||
883 | if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups,s)) | ||
884 | { | ||
885 | oid_groups(hint); | ||
886 | } | ||
887 | return; | ||
888 | } | ||
889 | |||
890 | static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint) | ||
891 | { | ||
892 | /* make minimum size a mount option and benchmark both ways */ | ||
893 | /* we preallocate blocks only for regular files, specific size */ | ||
894 | /* benchmark preallocating always and see what happens */ | ||
895 | |||
896 | hint->prealloc_size = 0; | ||
897 | |||
898 | if (!hint->formatted_node && hint->preallocate) { | ||
899 | if (S_ISREG(hint->inode->i_mode) | ||
900 | && hint->inode->i_size >= REISERFS_SB(hint->th->t_super)->s_alloc_options.preallocmin * hint->inode->i_sb->s_blocksize) | ||
901 | hint->prealloc_size = REISERFS_SB(hint->th->t_super)->s_alloc_options.preallocsize - 1; | ||
902 | } | ||
903 | return CARRY_ON; | ||
904 | } | ||
905 | |||
906 | /* XXX I know it could be merged with upper-level function; | ||
907 | but may be result function would be too complex. */ | ||
908 | static inline int allocate_without_wrapping_disk (reiserfs_blocknr_hint_t * hint, | ||
909 | b_blocknr_t * new_blocknrs, | ||
910 | b_blocknr_t start, b_blocknr_t finish, | ||
911 | int min, | ||
912 | int amount_needed, int prealloc_size) | ||
913 | { | ||
914 | int rest = amount_needed; | ||
915 | int nr_allocated; | ||
916 | |||
917 | while (rest > 0 && start <= finish) { | ||
918 | nr_allocated = scan_bitmap (hint->th, &start, finish, min, | ||
919 | rest + prealloc_size, !hint->formatted_node, | ||
920 | hint->block); | ||
921 | |||
922 | if (nr_allocated == 0) /* no new blocks allocated, return */ | ||
923 | break; | ||
924 | |||
925 | /* fill free_blocknrs array first */ | ||
926 | while (rest > 0 && nr_allocated > 0) { | ||
927 | * new_blocknrs ++ = start ++; | ||
928 | rest --; nr_allocated --; | ||
929 | } | ||
930 | |||
931 | /* do we have something to fill prealloc. array also ? */ | ||
932 | if (nr_allocated > 0) { | ||
933 | /* it means prealloc_size was greater that 0 and we do preallocation */ | ||
934 | list_add(&REISERFS_I(hint->inode)->i_prealloc_list, | ||
935 | &SB_JOURNAL(hint->th->t_super)->j_prealloc_list); | ||
936 | REISERFS_I(hint->inode)->i_prealloc_block = start; | ||
937 | REISERFS_I(hint->inode)->i_prealloc_count = nr_allocated; | ||
938 | break; | ||
939 | } | ||
940 | } | ||
941 | |||
942 | return (amount_needed - rest); | ||
943 | } | ||
944 | |||
/*
 * Fill new_blocknrs[] with amount_needed block numbers, scanning the bitmap
 * in passes that begin at hint->search_start, and set up the inode's
 * preallocation when hint->prealloc_size allows it.
 *
 * Returns QUOTA_EXCEEDED when DQUOT_ALLOC_BLOCK_NODIRTY() reports failure for
 * an unformatted allocation, NO_DISK_SPACE when every pass has been tried
 * without collecting amount_needed blocks (blocks collected so far are freed
 * again), and CARRY_ON on success.
 */
945 | static inline int blocknrs_and_prealloc_arrays_from_search_start | ||
946 | (reiserfs_blocknr_hint_t *hint, b_blocknr_t *new_blocknrs, int amount_needed) | ||
947 | { | ||
948 | struct super_block *s = hint->th->t_super; | ||
949 | b_blocknr_t start = hint->search_start; | ||
950 | b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1; | ||
951 | int passno = 0; | ||
952 | int nr_allocated = 0; | ||
953 | int bigalloc = 0; | ||
954 | |||
955 | determine_prealloc_size(hint); | ||
/* Unformatted nodes only: the DQUOT_* calls below presumably charge quota for
 * the blocks needed and, separately, for the preallocation. A failure of the
 * preallocation call just switches preallocation off for this request. */
956 | if (!hint->formatted_node) { | ||
957 | int quota_ret; | ||
958 | #ifdef REISERQUOTA_DEBUG | ||
959 | reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: allocating %d blocks id=%u", amount_needed, hint->inode->i_uid); | ||
960 | #endif | ||
961 | quota_ret = DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed); | ||
962 | if (quota_ret) /* Quota exceeded? */ | ||
963 | return QUOTA_EXCEEDED; | ||
964 | if (hint->preallocate && hint->prealloc_size ) { | ||
965 | #ifdef REISERQUOTA_DEBUG | ||
966 | reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: allocating (prealloc) %d blocks id=%u", hint->prealloc_size, hint->inode->i_uid); | ||
967 | #endif | ||
968 | quota_ret = DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode, hint->prealloc_size); | ||
969 | if (quota_ret) | ||
970 | hint->preallocate=hint->prealloc_size=0; | ||
971 | } | ||
972 | /* for unformatted nodes, force large allocations */ | ||
973 | bigalloc = amount_needed; | ||
974 | } | ||
975 | |||
/* Each iteration selects the [start, finish] window for the current pass; the
 * allocation attempt itself sits in the loop condition at the bottom. While
 * bigalloc is non-zero it is passed as the "min" argument, otherwise 1 is. */
976 | do { | ||
977 | /* in bigalloc mode, nr_allocated should stay zero until | ||
978 | * the entire allocation is filled | ||
979 | */ | ||
980 | if (unlikely(bigalloc && nr_allocated)) { | ||
981 | reiserfs_warning(s, "bigalloc is %d, nr_allocated %d\n", | ||
982 | bigalloc, nr_allocated); | ||
983 | /* reset things to a sane value */ | ||
984 | bigalloc = amount_needed - nr_allocated; | ||
985 | } | ||
986 | /* | ||
987 | * try pass 0 and pass 1 looking for a nice big | ||
988 | * contiguous allocation. Then reset and look | ||
989 | * for anything you can find. | ||
990 | */ | ||
991 | if (passno == 2 && bigalloc) { | ||
992 | passno = 0; | ||
993 | bigalloc = 0; | ||
994 | } | ||
995 | switch (passno++) { | ||
996 | case 0: /* Search from hint->search_start to end of disk */ | ||
997 | start = hint->search_start; | ||
998 | finish = SB_BLOCK_COUNT(s) - 1; | ||
999 | break; | ||
1000 | case 1: /* Search from hint->beg to hint->search_start */ | ||
1001 | start = hint->beg; | ||
1002 | finish = hint->search_start; | ||
1003 | break; | ||
1004 | case 2: /* Last chance: Search from 0 to hint->beg */ | ||
1005 | start = 0; | ||
1006 | finish = hint->beg; | ||
1007 | break; | ||
1008 | default: /* We've tried searching everywhere, not enough space */ | ||
1009 | /* Free the blocks */ | ||
1010 | if (!hint->formatted_node) { | ||
1011 | #ifdef REISERQUOTA_DEBUG | ||
1012 | reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: freeing (nospace) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid); | ||
1013 | #endif | ||
1014 | DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); /* Free not allocated blocks */ | ||
1015 | } | ||
/* give back the blocks collected so far, last one first */
1016 | while (nr_allocated --) | ||
1017 | reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node); | ||
1018 | |||
1019 | return NO_DISK_SPACE; | ||
1020 | } | ||
1021 | } while ((nr_allocated += allocate_without_wrapping_disk (hint, | ||
1022 | new_blocknrs + nr_allocated, start, finish, | ||
1023 | bigalloc ? bigalloc : 1, | ||
1024 | amount_needed - nr_allocated, | ||
1025 | hint->prealloc_size)) | ||
1026 | < amount_needed); | ||
/* All requested blocks were obtained. If fewer blocks were preallocated than
 * hint->prealloc_size asked for, the difference is handed to
 * DQUOT_FREE_BLOCK_NODIRTY(). */
1027 | if ( !hint->formatted_node && | ||
1028 | amount_needed + hint->prealloc_size > | ||
1029 | nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) { | ||
1030 | /* Some of preallocation blocks were not allocated */ | ||
1031 | #ifdef REISERQUOTA_DEBUG | ||
1032 | reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: freeing (failed prealloc) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)->i_prealloc_count, hint->inode->i_uid); | ||
1033 | #endif | ||
1034 | DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + | ||
1035 | hint->prealloc_size - nr_allocated - | ||
1036 | REISERFS_I(hint->inode)->i_prealloc_count); | ||
1037 | } | ||
1038 | |||
1039 | return CARRY_ON; | ||
1040 | } | ||
1041 | |||
1042 | /* grab new blocknrs from preallocated list */ | ||
1043 | /* return amount still needed after using them */ | ||
1044 | static int use_preallocated_list_if_available (reiserfs_blocknr_hint_t *hint, | ||
1045 | b_blocknr_t *new_blocknrs, int amount_needed) | ||
1046 | { | ||
1047 | struct inode * inode = hint->inode; | ||
1048 | |||
1049 | if (REISERFS_I(inode)->i_prealloc_count > 0) { | ||
1050 | while (amount_needed) { | ||
1051 | |||
1052 | *new_blocknrs ++ = REISERFS_I(inode)->i_prealloc_block ++; | ||
1053 | REISERFS_I(inode)->i_prealloc_count --; | ||
1054 | |||
1055 | amount_needed --; | ||
1056 | |||
1057 | if (REISERFS_I(inode)->i_prealloc_count <= 0) { | ||
1058 | list_del(&REISERFS_I(inode)->i_prealloc_list); | ||
1059 | break; | ||
1060 | } | ||
1061 | } | ||
1062 | } | ||
1063 | /* return amount still needed after using preallocated blocks */ | ||
1064 | return amount_needed; | ||
1065 | } | ||
1066 | |||
1067 | int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint, | ||
1068 | b_blocknr_t * new_blocknrs, int amount_needed, | ||
1069 | int reserved_by_us /* Amount of blocks we have | ||
1070 | already reserved */) | ||
1071 | { | ||
1072 | int initial_amount_needed = amount_needed; | ||
1073 | int ret; | ||
1074 | struct super_block *s = hint->th->t_super; | ||
1075 | |||
1076 | /* Check if there is enough space, taking into account reserved space */ | ||
1077 | if ( SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks < | ||
1078 | amount_needed - reserved_by_us) | ||
1079 | return NO_DISK_SPACE; | ||
1080 | /* should this be if !hint->inode && hint->preallocate? */ | ||
1081 | /* do you mean hint->formatted_node can be removed ? - Zam */ | ||
1082 | /* hint->formatted_node cannot be removed because we try to access | ||
1083 | inode information here, and there is often no inode assotiated with | ||
1084 | metadata allocations - green */ | ||
1085 | |||
1086 | if (!hint->formatted_node && hint->preallocate) { | ||
1087 | amount_needed = use_preallocated_list_if_available | ||
1088 | (hint, new_blocknrs, amount_needed); | ||
1089 | if (amount_needed == 0) /* all blocknrs we need we got from | ||
1090 | prealloc. list */ | ||
1091 | return CARRY_ON; | ||
1092 | new_blocknrs += (initial_amount_needed - amount_needed); | ||
1093 | } | ||
1094 | |||
1095 | /* find search start and save it in hint structure */ | ||
1096 | determine_search_start(hint, amount_needed); | ||
1097 | if (hint->search_start >= SB_BLOCK_COUNT(s)) | ||
1098 | hint->search_start = SB_BLOCK_COUNT(s) - 1; | ||
1099 | |||
1100 | /* allocation itself; fill new_blocknrs and preallocation arrays */ | ||
1101 | ret = blocknrs_and_prealloc_arrays_from_search_start | ||
1102 | (hint, new_blocknrs, amount_needed); | ||
1103 | |||
1104 | /* we used prealloc. list to fill (partially) new_blocknrs array. If final allocation fails we | ||
1105 | * need to return blocks back to prealloc. list or just free them. -- Zam (I chose second | ||
1106 | * variant) */ | ||
1107 | |||
1108 | if (ret != CARRY_ON) { | ||
1109 | while (amount_needed ++ < initial_amount_needed) { | ||
1110 | reiserfs_free_block(hint->th, hint->inode, *(--new_blocknrs), 1); | ||
1111 | } | ||
1112 | } | ||
1113 | return ret; | ||
1114 | } | ||
1115 | |||
1116 | /* These 2 functions are here to provide blocks reservation to the rest of kernel */ | ||
1117 | /* Reserve @blocks amount of blocks in fs pointed by @sb. Caller must make sure | ||
1118 | there are actually this much blocks on the FS available */ | ||
1119 | void reiserfs_claim_blocks_to_be_allocated( | ||
1120 | struct super_block *sb, /* super block of | ||
1121 | filesystem where | ||
1122 | blocks should be | ||
1123 | reserved */ | ||
1124 | int blocks /* How much to reserve */ | ||
1125 | ) | ||
1126 | { | ||
1127 | |||
1128 | /* Fast case, if reservation is zero - exit immediately. */ | ||
1129 | if ( !blocks ) | ||
1130 | return; | ||
1131 | |||
1132 | spin_lock(&REISERFS_SB(sb)->bitmap_lock); | ||
1133 | REISERFS_SB(sb)->reserved_blocks += blocks; | ||
1134 | spin_unlock(&REISERFS_SB(sb)->bitmap_lock); | ||
1135 | } | ||
1136 | |||
1137 | /* Unreserve @blocks amount of blocks in fs pointed by @sb */ | ||
1138 | void reiserfs_release_claimed_blocks( | ||
1139 | struct super_block *sb, /* super block of | ||
1140 | filesystem where | ||
1141 | blocks should be | ||
1142 | reserved */ | ||
1143 | int blocks /* How much to unreserve */ | ||
1144 | ) | ||
1145 | { | ||
1146 | |||
1147 | /* Fast case, if unreservation is zero - exit immediately. */ | ||
1148 | if ( !blocks ) | ||
1149 | return; | ||
1150 | |||
1151 | spin_lock(&REISERFS_SB(sb)->bitmap_lock); | ||
1152 | REISERFS_SB(sb)->reserved_blocks -= blocks; | ||
1153 | spin_unlock(&REISERFS_SB(sb)->bitmap_lock); | ||
1154 | RFALSE( REISERFS_SB(sb)->reserved_blocks < 0, "amount of blocks reserved became zero?"); | ||
1155 | } | ||
1156 | |||
1157 | /* This function estimates how much pages we will be able to write to FS | ||
1158 | used for reiserfs_file_write() purposes for now. */ | ||
1159 | int reiserfs_can_fit_pages ( struct super_block *sb /* superblock of filesystem | ||
1160 | to estimate space */ ) | ||
1161 | { | ||
1162 | int space; | ||
1163 | |||
1164 | spin_lock(&REISERFS_SB(sb)->bitmap_lock); | ||
1165 | space = (SB_FREE_BLOCKS(sb) - REISERFS_SB(sb)->reserved_blocks) >> ( PAGE_CACHE_SHIFT - sb->s_blocksize_bits); | ||
1166 | spin_unlock(&REISERFS_SB(sb)->bitmap_lock); | ||
1167 | |||
1168 | return space>0?space:0; | ||
1169 | } | ||
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c new file mode 100644 index 000000000000..d1514a9b0514 --- /dev/null +++ b/fs/reiserfs/dir.c | |||
@@ -0,0 +1,275 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | */ | ||
4 | |||
5 | #include <linux/config.h> | ||
6 | #include <linux/string.h> | ||
7 | #include <linux/errno.h> | ||
8 | #include <linux/fs.h> | ||
9 | #include <linux/reiserfs_fs.h> | ||
10 | #include <linux/stat.h> | ||
11 | #include <linux/smp_lock.h> | ||
12 | #include <linux/buffer_head.h> | ||
13 | #include <asm/uaccess.h> | ||
14 | |||
15 | extern struct reiserfs_key MIN_KEY; | ||
16 | |||
17 | static int reiserfs_readdir (struct file *, void *, filldir_t); | ||
18 | static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) ; | ||
19 | |||
20 | struct file_operations reiserfs_dir_operations = { | ||
21 | .read = generic_read_dir, | ||
22 | .readdir = reiserfs_readdir, | ||
23 | .fsync = reiserfs_dir_fsync, | ||
24 | .ioctl = reiserfs_ioctl, | ||
25 | }; | ||
26 | |||
27 | static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) { | ||
28 | struct inode *inode = dentry->d_inode; | ||
29 | int err; | ||
30 | reiserfs_write_lock(inode->i_sb); | ||
31 | err = reiserfs_commit_for_inode(inode) ; | ||
32 | reiserfs_write_unlock(inode->i_sb) ; | ||
33 | if (err < 0) | ||
34 | return err; | ||
35 | return 0; | ||
36 | } | ||
37 | |||
38 | |||
39 | #define store_ih(where,what) copy_item_head (where, what) | ||
40 | |||
41 | // | ||
42 | static int reiserfs_readdir (struct file * filp, void * dirent, filldir_t filldir) | ||
43 | { | ||
44 | struct inode *inode = filp->f_dentry->d_inode; | ||
45 | struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ | ||
46 | INITIALIZE_PATH (path_to_entry); | ||
47 | struct buffer_head * bh; | ||
48 | int item_num, entry_num; | ||
49 | const struct reiserfs_key * rkey; | ||
50 | struct item_head * ih, tmp_ih; | ||
51 | int search_res; | ||
52 | char * local_buf; | ||
53 | loff_t next_pos; | ||
54 | char small_buf[32] ; /* avoid kmalloc if we can */ | ||
55 | struct reiserfs_dir_entry de; | ||
56 | int ret = 0; | ||
57 | |||
58 | reiserfs_write_lock(inode->i_sb); | ||
59 | |||
60 | reiserfs_check_lock_depth(inode->i_sb, "readdir") ; | ||
61 | |||
62 | /* form key for search the next directory entry using f_pos field of | ||
63 | file structure */ | ||
64 | make_cpu_key (&pos_key, inode, (filp->f_pos) ? (filp->f_pos) : DOT_OFFSET, | ||
65 | TYPE_DIRENTRY, 3); | ||
66 | next_pos = cpu_key_k_offset (&pos_key); | ||
67 | |||
68 | /* reiserfs_warning (inode->i_sb, "reiserfs_readdir 1: f_pos = %Ld", filp->f_pos);*/ | ||
69 | |||
70 | path_to_entry.reada = PATH_READA; | ||
71 | while (1) { | ||
72 | research: | ||
73 | /* search the directory item, containing entry with specified key */ | ||
74 | search_res = search_by_entry_key (inode->i_sb, &pos_key, &path_to_entry, &de); | ||
75 | if (search_res == IO_ERROR) { | ||
76 | // FIXME: we could just skip part of directory which could | ||
77 | // not be read | ||
78 | ret = -EIO; | ||
79 | goto out; | ||
80 | } | ||
81 | entry_num = de.de_entry_num; | ||
82 | bh = de.de_bh; | ||
83 | item_num = de.de_item_num; | ||
84 | ih = de.de_ih; | ||
85 | store_ih (&tmp_ih, ih); | ||
86 | |||
87 | /* we must have found item, that is item of this directory, */ | ||
88 | RFALSE( COMP_SHORT_KEYS (&(ih->ih_key), &pos_key), | ||
89 | "vs-9000: found item %h does not match to dir we readdir %K", | ||
90 | ih, &pos_key); | ||
91 | RFALSE( item_num > B_NR_ITEMS (bh) - 1, | ||
92 | "vs-9005 item_num == %d, item amount == %d", | ||
93 | item_num, B_NR_ITEMS (bh)); | ||
94 | |||
95 | /* and entry must be not more than number of entries in the item */ | ||
96 | RFALSE( I_ENTRY_COUNT (ih) < entry_num, | ||
97 | "vs-9010: entry number is too big %d (%d)", | ||
98 | entry_num, I_ENTRY_COUNT (ih)); | ||
99 | |||
100 | if (search_res == POSITION_FOUND || entry_num < I_ENTRY_COUNT (ih)) { | ||
101 | /* go through all entries in the directory item beginning from the entry, that has been found */ | ||
102 | struct reiserfs_de_head * deh = B_I_DEH (bh, ih) + entry_num; | ||
103 | |||
104 | for (; entry_num < I_ENTRY_COUNT (ih); entry_num ++, deh ++) { | ||
105 | int d_reclen; | ||
106 | char * d_name; | ||
107 | off_t d_off; | ||
108 | ino_t d_ino; | ||
109 | |||
110 | if (!de_visible (deh)) | ||
111 | /* it is hidden entry */ | ||
112 | continue; | ||
113 | d_reclen = entry_length (bh, ih, entry_num); | ||
114 | d_name = B_I_DEH_ENTRY_FILE_NAME (bh, ih, deh); | ||
115 | if (!d_name[d_reclen - 1]) | ||
116 | d_reclen = strlen (d_name); | ||
117 | |||
118 | if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)){ | ||
119 | /* too big to send back to VFS */ | ||
120 | continue ; | ||
121 | } | ||
122 | |||
123 | /* Ignore the .reiserfs_priv entry */ | ||
124 | if (reiserfs_xattrs (inode->i_sb) && | ||
125 | !old_format_only(inode->i_sb) && | ||
126 | filp->f_dentry == inode->i_sb->s_root && | ||
127 | REISERFS_SB(inode->i_sb)->priv_root && | ||
128 | REISERFS_SB(inode->i_sb)->priv_root->d_inode && | ||
129 | deh_objectid(deh) == le32_to_cpu (INODE_PKEY(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->k_objectid)) { | ||
130 | continue; | ||
131 | } | ||
132 | |||
133 | d_off = deh_offset (deh); | ||
134 | filp->f_pos = d_off ; | ||
135 | d_ino = deh_objectid (deh); | ||
136 | if (d_reclen <= 32) { | ||
137 | local_buf = small_buf ; | ||
138 | } else { | ||
139 | local_buf = reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb) ; | ||
140 | if (!local_buf) { | ||
141 | pathrelse (&path_to_entry); | ||
142 | ret = -ENOMEM ; | ||
143 | goto out; | ||
144 | } | ||
145 | if (item_moved (&tmp_ih, &path_to_entry)) { | ||
146 | reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; | ||
147 | goto research; | ||
148 | } | ||
149 | } | ||
150 | // Note, that we copy name to user space via temporary | ||
151 | // buffer (local_buf) because filldir will block if | ||
152 | // user space buffer is swapped out. At that time | ||
153 | // entry can move to somewhere else | ||
154 | memcpy (local_buf, d_name, d_reclen); | ||
155 | if (filldir (dirent, local_buf, d_reclen, d_off, d_ino, | ||
156 | DT_UNKNOWN) < 0) { | ||
157 | if (local_buf != small_buf) { | ||
158 | reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; | ||
159 | } | ||
160 | goto end; | ||
161 | } | ||
162 | if (local_buf != small_buf) { | ||
163 | reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; | ||
164 | } | ||
165 | |||
166 | // next entry should be looked for with such offset | ||
167 | next_pos = deh_offset (deh) + 1; | ||
168 | |||
169 | if (item_moved (&tmp_ih, &path_to_entry)) { | ||
170 | goto research; | ||
171 | } | ||
172 | } /* for */ | ||
173 | } | ||
174 | |||
175 | if (item_num != B_NR_ITEMS (bh) - 1) | ||
176 | // end of directory has been reached | ||
177 | goto end; | ||
178 | |||
179 | /* item we went through is last item of node. Using right | ||
180 | delimiting key check is it directory end */ | ||
181 | rkey = get_rkey (&path_to_entry, inode->i_sb); | ||
182 | if (! comp_le_keys (rkey, &MIN_KEY)) { | ||
183 | /* set pos_key to key, that is the smallest and greater | ||
184 | that key of the last entry in the item */ | ||
185 | set_cpu_key_k_offset (&pos_key, next_pos); | ||
186 | continue; | ||
187 | } | ||
188 | |||
189 | if ( COMP_SHORT_KEYS (rkey, &pos_key)) { | ||
190 | // end of directory has been reached | ||
191 | goto end; | ||
192 | } | ||
193 | |||
194 | /* directory continues in the right neighboring block */ | ||
195 | set_cpu_key_k_offset (&pos_key, le_key_k_offset (KEY_FORMAT_3_5, rkey)); | ||
196 | |||
197 | } /* while */ | ||
198 | |||
199 | |||
200 | end: | ||
201 | filp->f_pos = next_pos; | ||
202 | pathrelse (&path_to_entry); | ||
203 | reiserfs_check_path(&path_to_entry) ; | ||
204 | out: | ||
205 | reiserfs_write_unlock(inode->i_sb); | ||
206 | return ret; | ||
207 | } | ||
208 | |||
209 | /* compose directory item containing "." and ".." entries (entries are | ||
210 | not aligned to 4 byte boundary) */ | ||
211 | /* the last four params are LE */ | ||
212 | void make_empty_dir_item_v1 (char * body, __u32 dirid, __u32 objid, | ||
213 | __u32 par_dirid, __u32 par_objid) | ||
214 | { | ||
215 | struct reiserfs_de_head * deh; | ||
216 | |||
217 | memset (body, 0, EMPTY_DIR_SIZE_V1); | ||
218 | deh = (struct reiserfs_de_head *)body; | ||
219 | |||
220 | /* direntry header of "." */ | ||
221 | put_deh_offset( &(deh[0]), DOT_OFFSET ); | ||
222 | /* these two are from make_le_item_head, and are are LE */ | ||
223 | deh[0].deh_dir_id = dirid; | ||
224 | deh[0].deh_objectid = objid; | ||
225 | deh[0].deh_state = 0; /* Endian safe if 0 */ | ||
226 | put_deh_location( &(deh[0]), EMPTY_DIR_SIZE_V1 - strlen( "." )); | ||
227 | mark_de_visible(&(deh[0])); | ||
228 | |||
229 | /* direntry header of ".." */ | ||
230 | put_deh_offset( &(deh[1]), DOT_DOT_OFFSET); | ||
231 | /* key of ".." for the root directory */ | ||
232 | /* these two are from the inode, and are are LE */ | ||
233 | deh[1].deh_dir_id = par_dirid; | ||
234 | deh[1].deh_objectid = par_objid; | ||
235 | deh[1].deh_state = 0; /* Endian safe if 0 */ | ||
236 | put_deh_location( &(deh[1]), deh_location( &(deh[0]) ) - strlen( ".." ) ); | ||
237 | mark_de_visible(&(deh[1])); | ||
238 | |||
239 | /* copy ".." and "." */ | ||
240 | memcpy (body + deh_location( &(deh[0]) ), ".", 1); | ||
241 | memcpy (body + deh_location( &(deh[1]) ), "..", 2); | ||
242 | } | ||
243 | |||
244 | /* compose directory item containing "." and ".." entries */ | ||
245 | void make_empty_dir_item (char * body, __u32 dirid, __u32 objid, | ||
246 | __u32 par_dirid, __u32 par_objid) | ||
247 | { | ||
248 | struct reiserfs_de_head * deh; | ||
249 | |||
250 | memset (body, 0, EMPTY_DIR_SIZE); | ||
251 | deh = (struct reiserfs_de_head *)body; | ||
252 | |||
253 | /* direntry header of "." */ | ||
254 | put_deh_offset( &(deh[0]), DOT_OFFSET ); | ||
255 | /* these two are from make_le_item_head, and are are LE */ | ||
256 | deh[0].deh_dir_id = dirid; | ||
257 | deh[0].deh_objectid = objid; | ||
258 | deh[0].deh_state = 0; /* Endian safe if 0 */ | ||
259 | put_deh_location( &(deh[0]), EMPTY_DIR_SIZE - ROUND_UP( strlen( "." ) ) ); | ||
260 | mark_de_visible(&(deh[0])); | ||
261 | |||
262 | /* direntry header of ".." */ | ||
263 | put_deh_offset( &(deh[1]), DOT_DOT_OFFSET ); | ||
264 | /* key of ".." for the root directory */ | ||
265 | /* these two are from the inode, and are are LE */ | ||
266 | deh[1].deh_dir_id = par_dirid; | ||
267 | deh[1].deh_objectid = par_objid; | ||
268 | deh[1].deh_state = 0; /* Endian safe if 0 */ | ||
269 | put_deh_location( &(deh[1]), deh_location( &(deh[0])) - ROUND_UP( strlen( ".." ) ) ); | ||
270 | mark_de_visible(&(deh[1])); | ||
271 | |||
272 | /* copy ".." and "." */ | ||
273 | memcpy (body + deh_location( &(deh[0]) ), ".", 1); | ||
274 | memcpy (body + deh_location( &(deh[1]) ), "..", 2); | ||
275 | } | ||
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c new file mode 100644 index 000000000000..2118db2896c7 --- /dev/null +++ b/fs/reiserfs/do_balan.c | |||
@@ -0,0 +1,1597 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | */ | ||
4 | |||
5 | /* Now we have all buffers that must be used in balancing of the tree */ | ||
6 | /* Further calculations can not cause schedule(), and thus the buffer */ | ||
7 | /* tree will be stable until the balancing will be finished */ | ||
8 | /* balance the tree according to the analysis made before, */ | ||
9 | /* and using buffers obtained after all above. */ | ||
10 | |||
11 | |||
12 | /** | ||
13 | ** balance_leaf_when_delete | ||
14 | ** balance_leaf | ||
15 | ** do_balance | ||
16 | ** | ||
17 | **/ | ||
18 | |||
19 | #include <linux/config.h> | ||
20 | #include <asm/uaccess.h> | ||
21 | #include <linux/time.h> | ||
22 | #include <linux/reiserfs_fs.h> | ||
23 | #include <linux/buffer_head.h> | ||
24 | |||
#ifdef CONFIG_REISERFS_CHECK

/*
 * Debugging aid: detects whether more than one copy of tb exists, as a means
 * of checking whether schedule is interrupting do_balance.
 */
struct tree_balance * cur_tb = NULL;

#endif
32 | |||
33 | inline void do_balance_mark_leaf_dirty (struct tree_balance * tb, | ||
34 | struct buffer_head * bh, int flag) | ||
35 | { | ||
36 | journal_mark_dirty(tb->transaction_handle, | ||
37 | tb->transaction_handle->t_super, bh) ; | ||
38 | } | ||
39 | |||
40 | #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty | ||
41 | #define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty | ||
42 | |||
43 | |||
44 | /* summary: | ||
45 | if deleting something ( tb->insert_size[0] < 0 ) | ||
46 | return(balance_leaf_when_delete()); (flag d handled here) | ||
47 | else | ||
48 | if lnum is larger than 0 we put items into the left node | ||
49 | if rnum is larger than 0 we put items into the right node | ||
50 | if snum1 is larger than 0 we put items into the new node s1 | ||
51 | if snum2 is larger than 0 we put items into the new node s2 | ||
52 | Note that all *num* count new items being created. | ||
53 | |||
54 | It would be easier to read balance_leaf() if each of these summary | ||
55 | lines was a separate procedure rather than being inlined. I think | ||
56 | that there are many passages here and in balance_leaf_when_delete() in | ||
57 | which two calls to one procedure can replace two passages, and it | ||
58 | might save cache space and improve software maintenance costs to do so. | ||
59 | |||
60 | Vladimir made the perceptive comment that we should offload most of | ||
61 | the decision making in this function into fix_nodes/check_balance, and | ||
62 | then create some sort of structure in tb that says what actions should | ||
63 | be performed by do_balance. | ||
64 | |||
65 | -Hans */ | ||
66 | |||
67 | |||
68 | |||
69 | /* Balance leaf node in case of delete or cut: insert_size[0] < 0 | ||
70 | * | ||
71 | * lnum, rnum can have values >= -1 | ||
72 | * -1 means that the neighbor must be joined with S | ||
73 | * 0 means that nothing should be done with the neighbor | ||
74 | * >0 means to shift entirely or partly the specified number of items to the neighbor | ||
75 | */ | ||
76 | static int balance_leaf_when_delete (struct tree_balance * tb, int flag) | ||
77 | { | ||
78 | struct buffer_head * tbS0 = PATH_PLAST_BUFFER (tb->tb_path); | ||
79 | int item_pos = PATH_LAST_POSITION (tb->tb_path); | ||
80 | int pos_in_item = tb->tb_path->pos_in_item; | ||
81 | struct buffer_info bi; | ||
82 | int n; | ||
83 | struct item_head * ih; | ||
84 | |||
85 | RFALSE( tb->FR[0] && B_LEVEL (tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1, | ||
86 | "vs- 12000: level: wrong FR %z", tb->FR[0]); | ||
87 | RFALSE( tb->blknum[0] > 1, | ||
88 | "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]); | ||
89 | RFALSE( ! tb->blknum[0] && ! PATH_H_PPARENT(tb->tb_path, 0), | ||
90 | "PAP-12010: tree can not be empty"); | ||
91 | |||
92 | ih = B_N_PITEM_HEAD (tbS0, item_pos); | ||
93 | |||
94 | /* Delete or truncate the item */ | ||
95 | |||
96 | switch (flag) { | ||
97 | case M_DELETE: /* delete item in S[0] */ | ||
98 | |||
99 | RFALSE( ih_item_len(ih) + IH_SIZE != -tb->insert_size[0], | ||
100 | "vs-12013: mode Delete, insert size %d, ih to be deleted %h", | ||
101 | -tb->insert_size [0], ih); | ||
102 | |||
103 | bi.tb = tb; | ||
104 | bi.bi_bh = tbS0; | ||
105 | bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); | ||
106 | bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); | ||
107 | leaf_delete_items (&bi, 0, item_pos, 1, -1); | ||
108 | |||
109 | if ( ! item_pos && tb->CFL[0] ) { | ||
110 | if ( B_NR_ITEMS(tbS0) ) { | ||
111 | replace_key(tb, tb->CFL[0],tb->lkey[0],tbS0,0); | ||
112 | } | ||
113 | else { | ||
114 | if ( ! PATH_H_POSITION (tb->tb_path, 1) ) | ||
115 | replace_key(tb, tb->CFL[0],tb->lkey[0],PATH_H_PPARENT(tb->tb_path, 0),0); | ||
116 | } | ||
117 | } | ||
118 | |||
119 | RFALSE( ! item_pos && !tb->CFL[0], | ||
120 | "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], tb->L[0]); | ||
121 | |||
122 | break; | ||
123 | |||
124 | case M_CUT: { /* cut item in S[0] */ | ||
125 | bi.tb = tb; | ||
126 | bi.bi_bh = tbS0; | ||
127 | bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); | ||
128 | bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); | ||
129 | if (is_direntry_le_ih (ih)) { | ||
130 | |||
131 | /* UFS unlink semantics are such that you can only delete one directory entry at a time. */ | ||
132 | /* when we cut a directory tb->insert_size[0] means number of entries to be cut (always 1) */ | ||
133 | tb->insert_size[0] = -1; | ||
134 | leaf_cut_from_buffer (&bi, item_pos, pos_in_item, -tb->insert_size[0]); | ||
135 | |||
136 | RFALSE( ! item_pos && ! pos_in_item && ! tb->CFL[0], | ||
137 | "PAP-12030: can not change delimiting key. CFL[0]=%p", | ||
138 | tb->CFL[0]); | ||
139 | |||
140 | if ( ! item_pos && ! pos_in_item && tb->CFL[0] ) { | ||
141 | replace_key(tb, tb->CFL[0],tb->lkey[0],tbS0,0); | ||
142 | } | ||
143 | } else { | ||
144 | leaf_cut_from_buffer (&bi, item_pos, pos_in_item, -tb->insert_size[0]); | ||
145 | |||
146 | RFALSE( ! ih_item_len(ih), | ||
147 | "PAP-12035: cut must leave non-zero dynamic length of item"); | ||
148 | } | ||
149 | break; | ||
150 | } | ||
151 | |||
152 | default: | ||
153 | print_cur_tb ("12040"); | ||
154 | reiserfs_panic (tb->tb_sb, "PAP-12040: balance_leaf_when_delete: unexpectable mode: %s(%d)", | ||
155 | (flag == M_PASTE) ? "PASTE" : ((flag == M_INSERT) ? "INSERT" : "UNKNOWN"), flag); | ||
156 | } | ||
157 | |||
158 | /* the rule is that no shifting occurs unless by shifting a node can be freed */ | ||
159 | n = B_NR_ITEMS(tbS0); | ||
160 | if ( tb->lnum[0] ) /* L[0] takes part in balancing */ | ||
161 | { | ||
162 | if ( tb->lnum[0] == -1 ) /* L[0] must be joined with S[0] */ | ||
163 | { | ||
164 | if ( tb->rnum[0] == -1 ) /* R[0] must be also joined with S[0] */ | ||
165 | { | ||
166 | if ( tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0) ) | ||
167 | { | ||
168 | /* all contents of all the 3 buffers will be in L[0] */ | ||
169 | if ( PATH_H_POSITION (tb->tb_path, 1) == 0 && 1 < B_NR_ITEMS(tb->FR[0]) ) | ||
170 | replace_key(tb, tb->CFL[0],tb->lkey[0],tb->FR[0],1); | ||
171 | |||
172 | leaf_move_items (LEAF_FROM_S_TO_L, tb, n, -1, NULL); | ||
173 | leaf_move_items (LEAF_FROM_R_TO_L, tb, B_NR_ITEMS(tb->R[0]), -1, NULL); | ||
174 | |||
175 | reiserfs_invalidate_buffer (tb, tbS0); | ||
176 | reiserfs_invalidate_buffer (tb, tb->R[0]); | ||
177 | |||
178 | return 0; | ||
179 | } | ||
180 | /* all contents of all the 3 buffers will be in R[0] */ | ||
181 | leaf_move_items (LEAF_FROM_S_TO_R, tb, n, -1, NULL); | ||
182 | leaf_move_items (LEAF_FROM_L_TO_R, tb, B_NR_ITEMS(tb->L[0]), -1, NULL); | ||
183 | |||
184 | /* right_delimiting_key is correct in R[0] */ | ||
185 | replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); | ||
186 | |||
187 | reiserfs_invalidate_buffer (tb, tbS0); | ||
188 | reiserfs_invalidate_buffer (tb, tb->L[0]); | ||
189 | |||
190 | return -1; | ||
191 | } | ||
192 | |||
193 | RFALSE( tb->rnum[0] != 0, | ||
194 | "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]); | ||
195 | /* all contents of L[0] and S[0] will be in L[0] */ | ||
196 | leaf_shift_left(tb, n, -1); | ||
197 | |||
198 | reiserfs_invalidate_buffer (tb, tbS0); | ||
199 | |||
200 | return 0; | ||
201 | } | ||
202 | /* a part of contents of S[0] will be in L[0] and the rest part of S[0] will be in R[0] */ | ||
203 | |||
204 | RFALSE( ( tb->lnum[0] + tb->rnum[0] < n ) || | ||
205 | ( tb->lnum[0] + tb->rnum[0] > n+1 ), | ||
206 | "PAP-12050: rnum(%d) and lnum(%d) and item number(%d) in S[0] are not consistent", | ||
207 | tb->rnum[0], tb->lnum[0], n); | ||
208 | RFALSE( ( tb->lnum[0] + tb->rnum[0] == n ) && | ||
209 | (tb->lbytes != -1 || tb->rbytes != -1), | ||
210 | "PAP-12055: bad rbytes (%d)/lbytes (%d) parameters when items are not split", | ||
211 | tb->rbytes, tb->lbytes); | ||
212 | RFALSE( ( tb->lnum[0] + tb->rnum[0] == n + 1 ) && | ||
213 | (tb->lbytes < 1 || tb->rbytes != -1), | ||
214 | "PAP-12060: bad rbytes (%d)/lbytes (%d) parameters when items are split", | ||
215 | tb->rbytes, tb->lbytes); | ||
216 | |||
217 | leaf_shift_left (tb, tb->lnum[0], tb->lbytes); | ||
218 | leaf_shift_right(tb, tb->rnum[0], tb->rbytes); | ||
219 | |||
220 | reiserfs_invalidate_buffer (tb, tbS0); | ||
221 | |||
222 | return 0; | ||
223 | } | ||
224 | |||
225 | if ( tb->rnum[0] == -1 ) { | ||
226 | /* all contents of R[0] and S[0] will be in R[0] */ | ||
227 | leaf_shift_right(tb, n, -1); | ||
228 | reiserfs_invalidate_buffer (tb, tbS0); | ||
229 | return 0; | ||
230 | } | ||
231 | |||
232 | RFALSE( tb->rnum[0], | ||
233 | "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]); | ||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | |||
238 | static int balance_leaf (struct tree_balance * tb, | ||
239 | struct item_head * ih, /* item header of inserted item (this is on little endian) */ | ||
240 | const char * body, /* body of inserted item or bytes to paste */ | ||
241 | int flag, /* i - insert, d - delete, c - cut, p - paste | ||
242 | (see comment to do_balance) */ | ||
243 | struct item_head * insert_key, /* in our processing of one level we sometimes determine what | ||
244 | must be inserted into the next higher level. This insertion | ||
245 | consists of a key or two keys and their corresponding | ||
246 | pointers */ | ||
247 | struct buffer_head ** insert_ptr /* inserted node-ptrs for the next level */ | ||
248 | ) | ||
249 | { | ||
250 | struct buffer_head * tbS0 = PATH_PLAST_BUFFER (tb->tb_path); | ||
251 | int item_pos = PATH_LAST_POSITION (tb->tb_path); /* index into the array of item headers in S[0] | ||
252 | of the affected item */ | ||
253 | struct buffer_info bi; | ||
254 | struct buffer_head *S_new[2]; /* new nodes allocated to hold what could not fit into S */ | ||
255 | int snum[2]; /* number of items that will be placed | ||
256 | into S_new (includes partially shifted | ||
257 | items) */ | ||
258 | int sbytes[2]; /* if an item is partially shifted into S_new then | ||
259 | if it is a directory item | ||
260 | it is the number of entries from the item that are shifted into S_new | ||
261 | else | ||
262 | it is the number of bytes from the item that are shifted into S_new | ||
263 | */ | ||
264 | int n, i; | ||
265 | int ret_val; | ||
266 | int pos_in_item; | ||
267 | int zeros_num; | ||
268 | |||
269 | PROC_INFO_INC( tb -> tb_sb, balance_at[ 0 ] ); | ||
270 | |||
271 | /* Make balance in case insert_size[0] < 0 */ | ||
272 | if ( tb->insert_size[0] < 0 ) | ||
273 | return balance_leaf_when_delete (tb, flag); | ||
274 | |||
275 | zeros_num = 0; | ||
276 | if (flag == M_INSERT && body == 0) | ||
277 | zeros_num = ih_item_len( ih ); | ||
278 | |||
279 | pos_in_item = tb->tb_path->pos_in_item; | ||
280 | /* for indirect item pos_in_item is measured in unformatted node | ||
281 | pointers. Recalculate to bytes */ | ||
282 | if (flag != M_INSERT && is_indirect_le_ih (B_N_PITEM_HEAD (tbS0, item_pos))) | ||
283 | pos_in_item *= UNFM_P_SIZE; | ||
284 | |||
285 | if ( tb->lnum[0] > 0 ) { | ||
286 | /* Shift lnum[0] items from S[0] to the left neighbor L[0] */ | ||
287 | if ( item_pos < tb->lnum[0] ) { | ||
288 | /* new item or it part falls to L[0], shift it too */ | ||
289 | n = B_NR_ITEMS(tb->L[0]); | ||
290 | |||
291 | switch (flag) { | ||
292 | case M_INSERT: /* insert item into L[0] */ | ||
293 | |||
294 | if ( item_pos == tb->lnum[0] - 1 && tb->lbytes != -1 ) { | ||
295 | /* part of new item falls into L[0] */ | ||
296 | int new_item_len; | ||
297 | int version; | ||
298 | |||
299 | ret_val = leaf_shift_left (tb, tb->lnum[0]-1, -1); | ||
300 | |||
301 | /* Calculate item length to insert to S[0] */ | ||
302 | new_item_len = ih_item_len(ih) - tb->lbytes; | ||
303 | /* Calculate and check item length to insert to L[0] */ | ||
304 | put_ih_item_len(ih, ih_item_len(ih) - new_item_len ); | ||
305 | |||
306 | RFALSE( ih_item_len(ih) <= 0, | ||
307 | "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d", | ||
308 | ih_item_len(ih)); | ||
309 | |||
310 | /* Insert new item into L[0] */ | ||
311 | bi.tb = tb; | ||
312 | bi.bi_bh = tb->L[0]; | ||
313 | bi.bi_parent = tb->FL[0]; | ||
314 | bi.bi_position = get_left_neighbor_position (tb, 0); | ||
315 | leaf_insert_into_buf (&bi, n + item_pos - ret_val, ih, body, | ||
316 | zeros_num > ih_item_len(ih) ? ih_item_len(ih) : zeros_num); | ||
317 | |||
318 | version = ih_version (ih); | ||
319 | |||
320 | /* Calculate key component, item length and body to insert into S[0] */ | ||
321 | set_le_ih_k_offset( ih, le_ih_k_offset( ih ) + (tb->lbytes << (is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0)) ); | ||
322 | |||
323 | put_ih_item_len( ih, new_item_len ); | ||
324 | if ( tb->lbytes > zeros_num ) { | ||
325 | body += (tb->lbytes - zeros_num); | ||
326 | zeros_num = 0; | ||
327 | } | ||
328 | else | ||
329 | zeros_num -= tb->lbytes; | ||
330 | |||
331 | RFALSE( ih_item_len(ih) <= 0, | ||
332 | "PAP-12085: there is nothing to insert into S[0]: ih_item_len=%d", | ||
333 | ih_item_len(ih)); | ||
334 | } else { | ||
335 | /* new item in whole falls into L[0] */ | ||
336 | /* Shift lnum[0]-1 items to L[0] */ | ||
337 | ret_val = leaf_shift_left(tb, tb->lnum[0]-1, tb->lbytes); | ||
338 | /* Insert new item into L[0] */ | ||
339 | bi.tb = tb; | ||
340 | bi.bi_bh = tb->L[0]; | ||
341 | bi.bi_parent = tb->FL[0]; | ||
342 | bi.bi_position = get_left_neighbor_position (tb, 0); | ||
343 | leaf_insert_into_buf (&bi, n + item_pos - ret_val, ih, body, zeros_num); | ||
344 | tb->insert_size[0] = 0; | ||
345 | zeros_num = 0; | ||
346 | } | ||
347 | break; | ||
348 | |||
349 | case M_PASTE: /* append item in L[0] */ | ||
350 | |||
351 | if ( item_pos == tb->lnum[0] - 1 && tb->lbytes != -1 ) { | ||
352 | /* we must shift the part of the appended item */ | ||
353 | if ( is_direntry_le_ih (B_N_PITEM_HEAD (tbS0, item_pos))) { | ||
354 | |||
355 | RFALSE( zeros_num, | ||
356 | "PAP-12090: invalid parameter in case of a directory"); | ||
357 | /* directory item */ | ||
358 | if ( tb->lbytes > pos_in_item ) { | ||
359 | /* new directory entry falls into L[0] */ | ||
360 | struct item_head * pasted; | ||
361 | int l_pos_in_item = pos_in_item; | ||
362 | |||
363 | /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */ | ||
364 | ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1); | ||
365 | if ( ret_val && ! item_pos ) { | ||
366 | pasted = B_N_PITEM_HEAD(tb->L[0],B_NR_ITEMS(tb->L[0])-1); | ||
367 | l_pos_in_item += I_ENTRY_COUNT(pasted) - (tb->lbytes-1); | ||
368 | } | ||
369 | |||
370 | /* Append given directory entry to directory item */ | ||
371 | bi.tb = tb; | ||
372 | bi.bi_bh = tb->L[0]; | ||
373 | bi.bi_parent = tb->FL[0]; | ||
374 | bi.bi_position = get_left_neighbor_position (tb, 0); | ||
375 | leaf_paste_in_buffer (&bi, n + item_pos - ret_val, l_pos_in_item, | ||
376 | tb->insert_size[0], body, zeros_num); | ||
377 | |||
378 | /* previous string prepared space for pasting new entry, following string pastes this entry */ | ||
379 | |||
380 | /* when we have merge directory item, pos_in_item has been changed too */ | ||
381 | |||
382 | /* paste new directory entry. 1 is entry number */ | ||
383 | leaf_paste_entries (bi.bi_bh, n + item_pos - ret_val, l_pos_in_item, 1, | ||
384 | (struct reiserfs_de_head *)body, | ||
385 | body + DEH_SIZE, tb->insert_size[0] | ||
386 | ); | ||
387 | tb->insert_size[0] = 0; | ||
388 | } else { | ||
389 | /* new directory item doesn't fall into L[0] */ | ||
390 | /* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */ | ||
391 | leaf_shift_left (tb, tb->lnum[0], tb->lbytes); | ||
392 | } | ||
393 | /* Calculate new position to append in item body */ | ||
394 | pos_in_item -= tb->lbytes; | ||
395 | } | ||
396 | else { | ||
397 | /* regular object */ | ||
398 | RFALSE( tb->lbytes <= 0, | ||
399 | "PAP-12095: there is nothing to shift to L[0]. lbytes=%d", | ||
400 | tb->lbytes); | ||
401 | RFALSE( pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)), | ||
402 | "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d", | ||
403 | ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos)), pos_in_item); | ||
404 | |||
405 | if ( tb->lbytes >= pos_in_item ) { | ||
406 | /* appended item will be in L[0] in whole */ | ||
407 | int l_n; | ||
408 | |||
409 | /* this bytes number must be appended to the last item of L[h] */ | ||
410 | l_n = tb->lbytes - pos_in_item; | ||
411 | |||
412 | /* Calculate new insert_size[0] */ | ||
413 | tb->insert_size[0] -= l_n; | ||
414 | |||
415 | RFALSE( tb->insert_size[0] <= 0, | ||
416 | "PAP-12105: there is nothing to paste into L[0]. insert_size=%d", | ||
417 | tb->insert_size[0]); | ||
418 | ret_val = leaf_shift_left(tb,tb->lnum[0], | ||
419 | ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos))); | ||
420 | /* Append to body of item in L[0] */ | ||
421 | bi.tb = tb; | ||
422 | bi.bi_bh = tb->L[0]; | ||
423 | bi.bi_parent = tb->FL[0]; | ||
424 | bi.bi_position = get_left_neighbor_position (tb, 0); | ||
425 | leaf_paste_in_buffer( | ||
426 | &bi,n + item_pos - ret_val, | ||
427 | ih_item_len( B_N_PITEM_HEAD(tb->L[0],n+item_pos-ret_val)), | ||
428 | l_n,body, zeros_num > l_n ? l_n : zeros_num | ||
429 | ); | ||
430 | /* 0-th item in S0 can be only of DIRECT type when l_n != 0*/ | ||
431 | { | ||
432 | int version; | ||
433 | int temp_l = l_n; | ||
434 | |||
435 | RFALSE (ih_item_len (B_N_PITEM_HEAD (tbS0, 0)), | ||
436 | "PAP-12106: item length must be 0"); | ||
437 | RFALSE (comp_short_le_keys (B_N_PKEY (tbS0, 0), | ||
438 | B_N_PKEY (tb->L[0], | ||
439 | n + item_pos - ret_val)), | ||
440 | "PAP-12107: items must be of the same file"); | ||
441 | if (is_indirect_le_ih(B_N_PITEM_HEAD (tb->L[0], | ||
442 | n + item_pos - ret_val))) { | ||
443 | temp_l = l_n << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT); | ||
444 | } | ||
445 | /* update key of first item in S0 */ | ||
446 | version = ih_version (B_N_PITEM_HEAD (tbS0, 0)); | ||
447 | set_le_key_k_offset (version, B_N_PKEY (tbS0, 0), | ||
448 | le_key_k_offset (version, B_N_PKEY (tbS0, 0)) + temp_l); | ||
449 | /* update left delimiting key */ | ||
450 | set_le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0]), | ||
451 | le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0])) + temp_l); | ||
452 | } | ||
453 | |||
454 | /* Calculate new body, position in item and insert_size[0] */ | ||
455 | if ( l_n > zeros_num ) { | ||
456 | body += (l_n - zeros_num); | ||
457 | zeros_num = 0; | ||
458 | } | ||
459 | else | ||
460 | zeros_num -= l_n; | ||
461 | pos_in_item = 0; | ||
462 | |||
463 | RFALSE( comp_short_le_keys | ||
464 | (B_N_PKEY(tbS0,0), | ||
465 | B_N_PKEY(tb->L[0],B_NR_ITEMS(tb->L[0])-1)) || | ||
466 | |||
467 | !op_is_left_mergeable | ||
468 | (B_N_PKEY (tbS0, 0), tbS0->b_size) || | ||
469 | !op_is_left_mergeable | ||
470 | (B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0]), | ||
471 | tbS0->b_size), | ||
472 | "PAP-12120: item must be merge-able with left neighboring item"); | ||
473 | } | ||
474 | else /* only part of the appended item will be in L[0] */ | ||
475 | { | ||
476 | /* Calculate position in item for append in S[0] */ | ||
477 | pos_in_item -= tb->lbytes; | ||
478 | |||
479 | RFALSE( pos_in_item <= 0, | ||
480 | "PAP-12125: no place for paste. pos_in_item=%d", pos_in_item); | ||
481 | |||
482 | /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */ | ||
483 | leaf_shift_left(tb,tb->lnum[0],tb->lbytes); | ||
484 | } | ||
485 | } | ||
486 | } | ||
487 | else /* appended item will be in L[0] in whole */ | ||
488 | { | ||
489 | struct item_head * pasted; | ||
490 | |||
491 | if ( ! item_pos && op_is_left_mergeable (B_N_PKEY (tbS0, 0), tbS0->b_size) ) | ||
492 | { /* if we paste into first item of S[0] and it is left mergable */ | ||
493 | /* then increment pos_in_item by the size of the last item in L[0] */ | ||
494 | pasted = B_N_PITEM_HEAD(tb->L[0],n-1); | ||
495 | if ( is_direntry_le_ih (pasted) ) | ||
496 | pos_in_item += ih_entry_count(pasted); | ||
497 | else | ||
498 | pos_in_item += ih_item_len(pasted); | ||
499 | } | ||
500 | |||
501 | /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */ | ||
502 | ret_val = leaf_shift_left(tb,tb->lnum[0],tb->lbytes); | ||
503 | /* Append to body of item in L[0] */ | ||
504 | bi.tb = tb; | ||
505 | bi.bi_bh = tb->L[0]; | ||
506 | bi.bi_parent = tb->FL[0]; | ||
507 | bi.bi_position = get_left_neighbor_position (tb, 0); | ||
508 | leaf_paste_in_buffer (&bi, n + item_pos - ret_val, pos_in_item, tb->insert_size[0], | ||
509 | body, zeros_num); | ||
510 | |||
511 | /* if appended item is directory, paste entry */ | ||
512 | pasted = B_N_PITEM_HEAD (tb->L[0], n + item_pos - ret_val); | ||
513 | if (is_direntry_le_ih (pasted)) | ||
514 | leaf_paste_entries ( | ||
515 | bi.bi_bh, n + item_pos - ret_val, pos_in_item, 1, | ||
516 | (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0] | ||
517 | ); | ||
518 | /* if appended item is indirect item, put unformatted node into un list */ | ||
519 | if (is_indirect_le_ih (pasted)) | ||
520 | set_ih_free_space (pasted, 0); | ||
521 | tb->insert_size[0] = 0; | ||
522 | zeros_num = 0; | ||
523 | } | ||
524 | break; | ||
525 | default: /* cases d and t */ | ||
526 | reiserfs_panic (tb->tb_sb, "PAP-12130: balance_leaf: lnum > 0: unexpectable mode: %s(%d)", | ||
527 | (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag); | ||
528 | } | ||
529 | } else { | ||
530 | /* new item doesn't fall into L[0] */ | ||
531 | leaf_shift_left(tb,tb->lnum[0],tb->lbytes); | ||
532 | } | ||
533 | } /* tb->lnum[0] > 0 */ | ||
534 | |||
535 | /* Calculate new item position */ | ||
536 | item_pos -= ( tb->lnum[0] - (( tb->lbytes != -1 ) ? 1 : 0)); | ||
537 | |||
538 | if ( tb->rnum[0] > 0 ) { | ||
539 | /* shift rnum[0] items from S[0] to the right neighbor R[0] */ | ||
540 | n = B_NR_ITEMS(tbS0); | ||
541 | switch ( flag ) { | ||
542 | |||
543 | case M_INSERT: /* insert item */ | ||
544 | if ( n - tb->rnum[0] < item_pos ) | ||
545 | { /* new item or its part falls to R[0] */ | ||
546 | if ( item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1 ) | ||
547 | { /* part of new item falls into R[0] */ | ||
548 | loff_t old_key_comp, old_len, r_zeros_number; | ||
549 | const char * r_body; | ||
550 | int version; | ||
551 | loff_t offset; | ||
552 | |||
553 | leaf_shift_right(tb,tb->rnum[0]-1,-1); | ||
554 | |||
555 | version = ih_version(ih); | ||
556 | /* Remember key component and item length */ | ||
557 | old_key_comp = le_ih_k_offset( ih ); | ||
558 | old_len = ih_item_len(ih); | ||
559 | |||
560 | /* Calculate key component and item length to insert into R[0] */ | ||
561 | offset = le_ih_k_offset( ih ) + ((old_len - tb->rbytes )<<(is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0)); | ||
562 | set_le_ih_k_offset( ih, offset ); | ||
563 | put_ih_item_len( ih, tb->rbytes); | ||
564 | /* Insert part of the item into R[0] */ | ||
565 | bi.tb = tb; | ||
566 | bi.bi_bh = tb->R[0]; | ||
567 | bi.bi_parent = tb->FR[0]; | ||
568 | bi.bi_position = get_right_neighbor_position (tb, 0); | ||
569 | if ( (old_len - tb->rbytes) > zeros_num ) { | ||
570 | r_zeros_number = 0; | ||
571 | r_body = body + (old_len - tb->rbytes) - zeros_num; | ||
572 | } | ||
573 | else { | ||
574 | r_body = body; | ||
575 | r_zeros_number = zeros_num - (old_len - tb->rbytes); | ||
576 | zeros_num -= r_zeros_number; | ||
577 | } | ||
578 | |||
579 | leaf_insert_into_buf (&bi, 0, ih, r_body, r_zeros_number); | ||
580 | |||
581 | /* Replace right delimiting key by first key in R[0] */ | ||
582 | replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); | ||
583 | |||
584 | /* Calculate key component and item length to insert into S[0] */ | ||
585 | set_le_ih_k_offset( ih, old_key_comp ); | ||
586 | put_ih_item_len( ih, old_len - tb->rbytes ); | ||
587 | |||
588 | tb->insert_size[0] -= tb->rbytes; | ||
589 | |||
590 | } | ||
591 | else /* whole new item falls into R[0] */ | ||
592 | { | ||
593 | /* Shift rnum[0]-1 items to R[0] */ | ||
594 | ret_val = leaf_shift_right(tb,tb->rnum[0]-1,tb->rbytes); | ||
595 | /* Insert new item into R[0] */ | ||
596 | bi.tb = tb; | ||
597 | bi.bi_bh = tb->R[0]; | ||
598 | bi.bi_parent = tb->FR[0]; | ||
599 | bi.bi_position = get_right_neighbor_position (tb, 0); | ||
600 | leaf_insert_into_buf (&bi, item_pos - n + tb->rnum[0] - 1, ih, body, zeros_num); | ||
601 | |||
602 | if ( item_pos - n + tb->rnum[0] - 1 == 0 ) { | ||
603 | replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); | ||
604 | |||
605 | } | ||
606 | zeros_num = tb->insert_size[0] = 0; | ||
607 | } | ||
608 | } | ||
609 | else /* new item or part of it doesn't fall into R[0] */ | ||
610 | { | ||
611 | leaf_shift_right(tb,tb->rnum[0],tb->rbytes); | ||
612 | } | ||
613 | break; | ||
614 | |||
615 | case M_PASTE: /* append item */ | ||
616 | |||
617 | if ( n - tb->rnum[0] <= item_pos ) /* pasted item or part of it falls to R[0] */ | ||
618 | { | ||
619 | if ( item_pos == n - tb->rnum[0] && tb->rbytes != -1 ) | ||
620 | { /* we must shift the part of the appended item */ | ||
621 | if ( is_direntry_le_ih (B_N_PITEM_HEAD(tbS0, item_pos))) | ||
622 | { /* we append to directory item */ | ||
623 | int entry_count; | ||
624 | |||
625 | RFALSE( zeros_num, | ||
626 | "PAP-12145: invalid parameter in case of a directory"); | ||
627 | entry_count = I_ENTRY_COUNT(B_N_PITEM_HEAD(tbS0, item_pos)); | ||
628 | if ( entry_count - tb->rbytes < pos_in_item ) | ||
629 | /* new directory entry falls into R[0] */ | ||
630 | { | ||
631 | int paste_entry_position; | ||
632 | |||
633 | RFALSE( tb->rbytes - 1 >= entry_count || | ||
634 | ! tb->insert_size[0], | ||
635 | "PAP-12150: no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d", | ||
636 | tb->rbytes, entry_count); | ||
637 | /* Shift rnum[0]-1 items in whole. Shift rbytes-1 directory entries from directory item number rnum[0] */ | ||
638 | leaf_shift_right(tb,tb->rnum[0],tb->rbytes - 1); | ||
639 | /* Paste given directory entry to directory item */ | ||
640 | paste_entry_position = pos_in_item - entry_count + tb->rbytes - 1; | ||
641 | bi.tb = tb; | ||
642 | bi.bi_bh = tb->R[0]; | ||
643 | bi.bi_parent = tb->FR[0]; | ||
644 | bi.bi_position = get_right_neighbor_position (tb, 0); | ||
645 | leaf_paste_in_buffer (&bi, 0, paste_entry_position, | ||
646 | tb->insert_size[0],body,zeros_num); | ||
647 | /* paste entry */ | ||
648 | leaf_paste_entries ( | ||
649 | bi.bi_bh, 0, paste_entry_position, 1, (struct reiserfs_de_head *)body, | ||
650 | body + DEH_SIZE, tb->insert_size[0] | ||
651 | ); | ||
652 | |||
653 | if ( paste_entry_position == 0 ) { | ||
654 | /* change delimiting keys */ | ||
655 | replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); | ||
656 | } | ||
657 | |||
658 | tb->insert_size[0] = 0; | ||
659 | pos_in_item++; | ||
660 | } | ||
661 | else /* new directory entry doesn't fall into R[0] */ | ||
662 | { | ||
663 | leaf_shift_right(tb,tb->rnum[0],tb->rbytes); | ||
664 | } | ||
665 | } | ||
666 | else /* regular object */ | ||
667 | { | ||
668 | int n_shift, n_rem, r_zeros_number; | ||
669 | const char * r_body; | ||
670 | |||
671 | /* Calculate number of bytes which must be shifted from appended item */ | ||
672 | if ( (n_shift = tb->rbytes - tb->insert_size[0]) < 0 ) | ||
673 | n_shift = 0; | ||
674 | |||
675 | RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD (tbS0, item_pos)), | ||
676 | "PAP-12155: invalid position to paste. ih_item_len=%d, pos_in_item=%d", | ||
677 | pos_in_item, ih_item_len( B_N_PITEM_HEAD(tbS0,item_pos))); | ||
678 | |||
679 | leaf_shift_right(tb,tb->rnum[0],n_shift); | ||
680 | /* Calculate number of bytes which must remain in body after appending to R[0] */ | ||
681 | if ( (n_rem = tb->insert_size[0] - tb->rbytes) < 0 ) | ||
682 | n_rem = 0; | ||
683 | |||
684 | { | ||
685 | int version; | ||
686 | unsigned long temp_rem = n_rem; | ||
687 | |||
688 | version = ih_version (B_N_PITEM_HEAD (tb->R[0],0)); | ||
689 | if (is_indirect_le_key(version,B_N_PKEY(tb->R[0],0))){ | ||
690 | temp_rem = n_rem << (tb->tb_sb->s_blocksize_bits - | ||
691 | UNFM_P_SHIFT); | ||
692 | } | ||
693 | set_le_key_k_offset (version, B_N_PKEY(tb->R[0],0), | ||
694 | le_key_k_offset (version, B_N_PKEY(tb->R[0],0)) + temp_rem); | ||
695 | set_le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0]), | ||
696 | le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) + temp_rem); | ||
697 | } | ||
698 | /* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem; | ||
699 | k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/ | ||
700 | do_balance_mark_internal_dirty (tb, tb->CFR[0], 0); | ||
701 | |||
702 | /* Append part of body into R[0] */ | ||
703 | bi.tb = tb; | ||
704 | bi.bi_bh = tb->R[0]; | ||
705 | bi.bi_parent = tb->FR[0]; | ||
706 | bi.bi_position = get_right_neighbor_position (tb, 0); | ||
707 | if ( n_rem > zeros_num ) { | ||
708 | r_zeros_number = 0; | ||
709 | r_body = body + n_rem - zeros_num; | ||
710 | } | ||
711 | else { | ||
712 | r_body = body; | ||
713 | r_zeros_number = zeros_num - n_rem; | ||
714 | zeros_num -= r_zeros_number; | ||
715 | } | ||
716 | |||
717 | leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, r_body, r_zeros_number); | ||
718 | |||
719 | if (is_indirect_le_ih (B_N_PITEM_HEAD(tb->R[0],0))) { | ||
720 | #if 0 | ||
721 | RFALSE( n_rem, | ||
722 | "PAP-12160: paste more than one unformatted node pointer"); | ||
723 | #endif | ||
724 | set_ih_free_space (B_N_PITEM_HEAD(tb->R[0],0), 0); | ||
725 | } | ||
726 | tb->insert_size[0] = n_rem; | ||
727 | if ( ! n_rem ) | ||
728 | pos_in_item ++; | ||
729 | } | ||
730 | } | ||
731 | else /* pasted item in whole falls into R[0] */ | ||
732 | { | ||
733 | struct item_head * pasted; | ||
734 | |||
735 | ret_val = leaf_shift_right(tb,tb->rnum[0],tb->rbytes); | ||
736 | /* append item in R[0] */ | ||
737 | if ( pos_in_item >= 0 ) { | ||
738 | bi.tb = tb; | ||
739 | bi.bi_bh = tb->R[0]; | ||
740 | bi.bi_parent = tb->FR[0]; | ||
741 | bi.bi_position = get_right_neighbor_position (tb, 0); | ||
742 | leaf_paste_in_buffer(&bi,item_pos - n + tb->rnum[0], pos_in_item, | ||
743 | tb->insert_size[0],body, zeros_num); | ||
744 | } | ||
745 | |||
746 | /* paste new entry, if item is directory item */ | ||
747 | pasted = B_N_PITEM_HEAD(tb->R[0], item_pos - n + tb->rnum[0]); | ||
748 | if (is_direntry_le_ih (pasted) && pos_in_item >= 0 ) { | ||
749 | leaf_paste_entries ( | ||
750 | bi.bi_bh, item_pos - n + tb->rnum[0], pos_in_item, 1, | ||
751 | (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0] | ||
752 | ); | ||
753 | if ( ! pos_in_item ) { | ||
754 | |||
755 | RFALSE( item_pos - n + tb->rnum[0], | ||
756 | "PAP-12165: directory item must be first item of node when pasting is in 0th position"); | ||
757 | |||
758 | /* update delimiting keys */ | ||
759 | replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); | ||
760 | } | ||
761 | } | ||
762 | |||
763 | if (is_indirect_le_ih (pasted)) | ||
764 | set_ih_free_space (pasted, 0); | ||
765 | zeros_num = tb->insert_size[0] = 0; | ||
766 | } | ||
767 | } | ||
768 | else /* new item doesn't fall into R[0] */ | ||
769 | { | ||
770 | leaf_shift_right(tb,tb->rnum[0],tb->rbytes); | ||
771 | } | ||
772 | break; | ||
773 | default: /* cases d and t */ | ||
774 | reiserfs_panic (tb->tb_sb, "PAP-12175: balance_leaf: rnum > 0: unexpectable mode: %s(%d)", | ||
775 | (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag); | ||
776 | } | ||
777 | |||
778 | } /* tb->rnum[0] > 0 */ | ||
779 | |||
780 | |||
781 | RFALSE( tb->blknum[0] > 3, | ||
782 | "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]); | ||
783 | RFALSE( tb->blknum[0] < 0, | ||
784 | "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]); | ||
785 | |||
786 | /* if while adding to a node we discover that it is possible to split | ||
787 | it in two, and merge the left part into the left neighbor and the | ||
788 | right part into the right neighbor, eliminating the node */ | ||
789 | if ( tb->blknum[0] == 0 ) { /* node S[0] is empty now */ | ||
790 | |||
791 | RFALSE( ! tb->lnum[0] || ! tb->rnum[0], | ||
792 | "PAP-12190: lnum and rnum must not be zero"); | ||
793 | /* if insertion was done before 0-th position in R[0], right | ||
794 | delimiting key of the tb->L[0]'s and left delimiting key are | ||
795 | not set correctly */ | ||
796 | if (tb->CFL[0]) { | ||
797 | if (!tb->CFR[0]) | ||
798 | reiserfs_panic (tb->tb_sb, "vs-12195: balance_leaf: CFR not initialized"); | ||
799 | copy_key (B_N_PDELIM_KEY (tb->CFL[0], tb->lkey[0]), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0])); | ||
800 | do_balance_mark_internal_dirty (tb, tb->CFL[0], 0); | ||
801 | } | ||
802 | |||
803 | reiserfs_invalidate_buffer(tb,tbS0); | ||
804 | return 0; | ||
805 | } | ||
806 | |||
807 | |||
808 | /* Fill new nodes that appear in place of S[0] */ | ||
809 | |||
810 | /* I am told that this copying is because we need an array to enable | ||
811 | the looping code. -Hans */ | ||
812 | snum[0] = tb->s1num, | ||
813 | snum[1] = tb->s2num; | ||
814 | sbytes[0] = tb->s1bytes; | ||
815 | sbytes[1] = tb->s2bytes; | ||
816 | for( i = tb->blknum[0] - 2; i >= 0; i-- ) { | ||
817 | |||
818 | RFALSE( !snum[i], "PAP-12200: snum[%d] == %d. Must be > 0", i, snum[i]); | ||
819 | |||
820 | /* here we shift from S to S_new nodes */ | ||
821 | |||
822 | S_new[i] = get_FEB(tb); | ||
823 | |||
824 | /* initialized block type and tree level */ | ||
825 | set_blkh_level( B_BLK_HEAD(S_new[i]), DISK_LEAF_NODE_LEVEL ); | ||
826 | |||
827 | |||
828 | n = B_NR_ITEMS(tbS0); | ||
829 | |||
830 | switch (flag) { | ||
831 | case M_INSERT: /* insert item */ | ||
832 | |||
833 | if ( n - snum[i] < item_pos ) | ||
834 | { /* new item or it's part falls to first new node S_new[i]*/ | ||
835 | if ( item_pos == n - snum[i] + 1 && sbytes[i] != -1 ) | ||
836 | { /* part of new item falls into S_new[i] */ | ||
837 | int old_key_comp, old_len, r_zeros_number; | ||
838 | const char * r_body; | ||
839 | int version; | ||
840 | |||
841 | /* Move snum[i]-1 items from S[0] to S_new[i] */ | ||
842 | leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i] - 1, -1, S_new[i]); | ||
843 | /* Remember key component and item length */ | ||
844 | version = ih_version (ih); | ||
845 | old_key_comp = le_ih_k_offset( ih ); | ||
846 | old_len = ih_item_len(ih); | ||
847 | |||
848 | /* Calculate key component and item length to insert into S_new[i] */ | ||
849 | set_le_ih_k_offset( ih, | ||
850 | le_ih_k_offset(ih) + ((old_len - sbytes[i] )<<(is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0)) ); | ||
851 | |||
852 | put_ih_item_len( ih, sbytes[i] ); | ||
853 | |||
854 | /* Insert part of the item into S_new[i] before 0-th item */ | ||
855 | bi.tb = tb; | ||
856 | bi.bi_bh = S_new[i]; | ||
857 | bi.bi_parent = NULL; | ||
858 | bi.bi_position = 0; | ||
859 | |||
860 | if ( (old_len - sbytes[i]) > zeros_num ) { | ||
861 | r_zeros_number = 0; | ||
862 | r_body = body + (old_len - sbytes[i]) - zeros_num; | ||
863 | } | ||
864 | else { | ||
865 | r_body = body; | ||
866 | r_zeros_number = zeros_num - (old_len - sbytes[i]); | ||
867 | zeros_num -= r_zeros_number; | ||
868 | } | ||
869 | |||
870 | leaf_insert_into_buf (&bi, 0, ih, r_body, r_zeros_number); | ||
871 | |||
872 | /* Calculate key component and item length to insert into S[i] */ | ||
873 | set_le_ih_k_offset( ih, old_key_comp ); | ||
874 | put_ih_item_len( ih, old_len - sbytes[i] ); | ||
875 | tb->insert_size[0] -= sbytes[i]; | ||
876 | } | ||
877 | else /* whole new item falls into S_new[i] */ | ||
878 | { | ||
879 | /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */ | ||
880 | leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i] - 1, sbytes[i], S_new[i]); | ||
881 | |||
882 | /* Insert new item into S_new[i] */ | ||
883 | bi.tb = tb; | ||
884 | bi.bi_bh = S_new[i]; | ||
885 | bi.bi_parent = NULL; | ||
886 | bi.bi_position = 0; | ||
887 | leaf_insert_into_buf (&bi, item_pos - n + snum[i] - 1, ih, body, zeros_num); | ||
888 | |||
889 | zeros_num = tb->insert_size[0] = 0; | ||
890 | } | ||
891 | } | ||
892 | |||
893 | else /* new item or it part don't falls into S_new[i] */ | ||
894 | { | ||
895 | leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); | ||
896 | } | ||
897 | break; | ||
898 | |||
899 | case M_PASTE: /* append item */ | ||
900 | |||
901 | if ( n - snum[i] <= item_pos ) /* pasted item or part if it falls to S_new[i] */ | ||
902 | { | ||
903 | if ( item_pos == n - snum[i] && sbytes[i] != -1 ) | ||
904 | { /* we must shift part of the appended item */ | ||
905 | struct item_head * aux_ih; | ||
906 | |||
907 | RFALSE( ih, "PAP-12210: ih must be 0"); | ||
908 | |||
909 | if ( is_direntry_le_ih (aux_ih = B_N_PITEM_HEAD(tbS0,item_pos))) { | ||
910 | /* we append to directory item */ | ||
911 | |||
912 | int entry_count; | ||
913 | |||
914 | entry_count = ih_entry_count(aux_ih); | ||
915 | |||
916 | if ( entry_count - sbytes[i] < pos_in_item && pos_in_item <= entry_count ) { | ||
917 | /* new directory entry falls into S_new[i] */ | ||
918 | |||
919 | RFALSE( ! tb->insert_size[0], | ||
920 | "PAP-12215: insert_size is already 0"); | ||
921 | RFALSE( sbytes[i] - 1 >= entry_count, | ||
922 | "PAP-12220: there are no so much entries (%d), only %d", | ||
923 | sbytes[i] - 1, entry_count); | ||
924 | |||
925 | /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */ | ||
926 | leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i]-1, S_new[i]); | ||
927 | /* Paste given directory entry to directory item */ | ||
928 | bi.tb = tb; | ||
929 | bi.bi_bh = S_new[i]; | ||
930 | bi.bi_parent = NULL; | ||
931 | bi.bi_position = 0; | ||
932 | leaf_paste_in_buffer (&bi, 0, pos_in_item - entry_count + sbytes[i] - 1, | ||
933 | tb->insert_size[0], body,zeros_num); | ||
934 | /* paste new directory entry */ | ||
935 | leaf_paste_entries ( | ||
936 | bi.bi_bh, 0, pos_in_item - entry_count + sbytes[i] - 1, | ||
937 | 1, (struct reiserfs_de_head *)body, body + DEH_SIZE, | ||
938 | tb->insert_size[0] | ||
939 | ); | ||
940 | tb->insert_size[0] = 0; | ||
941 | pos_in_item++; | ||
942 | } else { /* new directory entry doesn't fall into S_new[i] */ | ||
943 | leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); | ||
944 | } | ||
945 | } | ||
946 | else /* regular object */ | ||
947 | { | ||
948 | int n_shift, n_rem, r_zeros_number; | ||
949 | const char * r_body; | ||
950 | |||
951 | RFALSE( pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos)) || | ||
952 | tb->insert_size[0] <= 0, | ||
953 | "PAP-12225: item too short or insert_size <= 0"); | ||
954 | |||
955 | /* Calculate number of bytes which must be shifted from appended item */ | ||
956 | n_shift = sbytes[i] - tb->insert_size[0]; | ||
957 | if ( n_shift < 0 ) | ||
958 | n_shift = 0; | ||
959 | leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], n_shift, S_new[i]); | ||
960 | |||
961 | /* Calculate number of bytes which must remain in body after append to S_new[i] */ | ||
962 | n_rem = tb->insert_size[0] - sbytes[i]; | ||
963 | if ( n_rem < 0 ) | ||
964 | n_rem = 0; | ||
965 | /* Append part of body into S_new[0] */ | ||
966 | bi.tb = tb; | ||
967 | bi.bi_bh = S_new[i]; | ||
968 | bi.bi_parent = NULL; | ||
969 | bi.bi_position = 0; | ||
970 | |||
971 | if ( n_rem > zeros_num ) { | ||
972 | r_zeros_number = 0; | ||
973 | r_body = body + n_rem - zeros_num; | ||
974 | } | ||
975 | else { | ||
976 | r_body = body; | ||
977 | r_zeros_number = zeros_num - n_rem; | ||
978 | zeros_num -= r_zeros_number; | ||
979 | } | ||
980 | |||
981 | leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0]-n_rem, r_body,r_zeros_number); | ||
982 | { | ||
983 | struct item_head * tmp; | ||
984 | |||
985 | tmp = B_N_PITEM_HEAD(S_new[i],0); | ||
986 | if (is_indirect_le_ih (tmp)) { | ||
987 | set_ih_free_space (tmp, 0); | ||
988 | set_le_ih_k_offset( tmp, le_ih_k_offset(tmp) + | ||
989 | (n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT))); | ||
990 | } else { | ||
991 | set_le_ih_k_offset( tmp, le_ih_k_offset(tmp) + | ||
992 | n_rem ); | ||
993 | } | ||
994 | } | ||
995 | |||
996 | tb->insert_size[0] = n_rem; | ||
997 | if ( ! n_rem ) | ||
998 | pos_in_item++; | ||
999 | } | ||
1000 | } | ||
1001 | else | ||
1002 | /* item falls wholly into S_new[i] */ | ||
1003 | { | ||
1004 | int ret_val; | ||
1005 | struct item_head * pasted; | ||
1006 | |||
1007 | #ifdef CONFIG_REISERFS_CHECK | ||
1008 | struct item_head * ih = B_N_PITEM_HEAD(tbS0,item_pos); | ||
1009 | |||
1010 | if ( ! is_direntry_le_ih(ih) && (pos_in_item != ih_item_len(ih) || | ||
1011 | tb->insert_size[0] <= 0) ) | ||
1012 | reiserfs_panic (tb->tb_sb, "PAP-12235: balance_leaf: pos_in_item must be equal to ih_item_len"); | ||
1013 | #endif /* CONFIG_REISERFS_CHECK */ | ||
1014 | |||
1015 | ret_val = leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); | ||
1016 | |||
1017 | RFALSE( ret_val, | ||
1018 | "PAP-12240: unexpected value returned by leaf_move_items (%d)", | ||
1019 | ret_val); | ||
1020 | |||
1021 | /* paste into item */ | ||
1022 | bi.tb = tb; | ||
1023 | bi.bi_bh = S_new[i]; | ||
1024 | bi.bi_parent = NULL; | ||
1025 | bi.bi_position = 0; | ||
1026 | leaf_paste_in_buffer(&bi, item_pos - n + snum[i], pos_in_item, tb->insert_size[0], body, zeros_num); | ||
1027 | |||
1028 | pasted = B_N_PITEM_HEAD(S_new[i], item_pos - n + snum[i]); | ||
1029 | if (is_direntry_le_ih (pasted)) | ||
1030 | { | ||
1031 | leaf_paste_entries ( | ||
1032 | bi.bi_bh, item_pos - n + snum[i], pos_in_item, 1, | ||
1033 | (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0] | ||
1034 | ); | ||
1035 | } | ||
1036 | |||
1037 | /* if we paste to indirect item update ih_free_space */ | ||
1038 | if (is_indirect_le_ih (pasted)) | ||
1039 | set_ih_free_space (pasted, 0); | ||
1040 | zeros_num = tb->insert_size[0] = 0; | ||
1041 | } | ||
1042 | } | ||
1043 | |||
1044 | else /* pasted item doesn't fall into S_new[i] */ | ||
1045 | { | ||
1046 | leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); | ||
1047 | } | ||
1048 | break; | ||
1049 | default: /* cases d and t */ | ||
1050 | reiserfs_panic (tb->tb_sb, "PAP-12245: balance_leaf: blknum > 2: unexpectable mode: %s(%d)", | ||
1051 | (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag); | ||
1052 | } | ||
1053 | |||
1054 | memcpy (insert_key + i,B_N_PKEY(S_new[i],0),KEY_SIZE); | ||
1055 | insert_ptr[i] = S_new[i]; | ||
1056 | |||
1057 | RFALSE (!buffer_journaled (S_new [i]) || buffer_journal_dirty (S_new [i]) || | ||
1058 | buffer_dirty (S_new [i]), | ||
1059 | "PAP-12247: S_new[%d] : (%b)", i, S_new[i]); | ||
1060 | } | ||
1061 | |||
1062 | /* if the affected item was not wholly shifted then we perform all necessary operations on that part or whole of the | ||
1063 | affected item which remains in S */ | ||
1064 | if ( 0 <= item_pos && item_pos < tb->s0num ) | ||
1065 | { /* if we must insert or append into buffer S[0] */ | ||
1066 | |||
1067 | switch (flag) | ||
1068 | { | ||
1069 | case M_INSERT: /* insert item into S[0] */ | ||
1070 | bi.tb = tb; | ||
1071 | bi.bi_bh = tbS0; | ||
1072 | bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); | ||
1073 | bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); | ||
1074 | leaf_insert_into_buf (&bi, item_pos, ih, body, zeros_num); | ||
1075 | |||
1076 | /* If we insert the first key change the delimiting key */ | ||
1077 | if( item_pos == 0 ) { | ||
1078 | if (tb->CFL[0]) /* can be 0 in reiserfsck */ | ||
1079 | replace_key(tb, tb->CFL[0], tb->lkey[0],tbS0,0); | ||
1080 | |||
1081 | } | ||
1082 | break; | ||
1083 | |||
1084 | case M_PASTE: { /* append item in S[0] */ | ||
1085 | struct item_head * pasted; | ||
1086 | |||
1087 | pasted = B_N_PITEM_HEAD (tbS0, item_pos); | ||
1088 | /* when directory, may be new entry already pasted */ | ||
1089 | if (is_direntry_le_ih (pasted)) { | ||
1090 | if ( pos_in_item >= 0 && | ||
1091 | pos_in_item <= ih_entry_count(pasted) ) { | ||
1092 | |||
1093 | RFALSE( ! tb->insert_size[0], | ||
1094 | "PAP-12260: insert_size is 0 already"); | ||
1095 | |||
1096 | /* prepare space */ | ||
1097 | bi.tb = tb; | ||
1098 | bi.bi_bh = tbS0; | ||
1099 | bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); | ||
1100 | bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); | ||
1101 | leaf_paste_in_buffer(&bi, item_pos, pos_in_item, tb->insert_size[0], body, zeros_num); | ||
1102 | |||
1103 | /* paste entry */ | ||
1104 | leaf_paste_entries ( | ||
1105 | bi.bi_bh, item_pos, pos_in_item, 1, (struct reiserfs_de_head *)body, | ||
1106 | body + DEH_SIZE, tb->insert_size[0] | ||
1107 | ); | ||
1108 | if ( ! item_pos && ! pos_in_item ) { | ||
1109 | RFALSE( !tb->CFL[0] || !tb->L[0], | ||
1110 | "PAP-12270: CFL[0]/L[0] must be specified"); | ||
1111 | if (tb->CFL[0]) { | ||
1112 | replace_key(tb, tb->CFL[0], tb->lkey[0],tbS0,0); | ||
1113 | |||
1114 | } | ||
1115 | } | ||
1116 | tb->insert_size[0] = 0; | ||
1117 | } | ||
1118 | } else { /* regular object */ | ||
1119 | if ( pos_in_item == ih_item_len(pasted) ) { | ||
1120 | |||
1121 | RFALSE( tb->insert_size[0] <= 0, | ||
1122 | "PAP-12275: insert size must not be %d", | ||
1123 | tb->insert_size[0]); | ||
1124 | bi.tb = tb; | ||
1125 | bi.bi_bh = tbS0; | ||
1126 | bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); | ||
1127 | bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); | ||
1128 | leaf_paste_in_buffer (&bi, item_pos, pos_in_item, tb->insert_size[0], body, zeros_num); | ||
1129 | |||
1130 | if (is_indirect_le_ih (pasted)) { | ||
1131 | #if 0 | ||
1132 | RFALSE( tb->insert_size[0] != UNFM_P_SIZE, | ||
1133 | "PAP-12280: insert_size for indirect item must be %d, not %d", | ||
1134 | UNFM_P_SIZE, tb->insert_size[0]); | ||
1135 | #endif | ||
1136 | set_ih_free_space (pasted, 0); | ||
1137 | } | ||
1138 | tb->insert_size[0] = 0; | ||
1139 | } | ||
1140 | |||
1141 | #ifdef CONFIG_REISERFS_CHECK | ||
1142 | else { | ||
1143 | if ( tb->insert_size[0] ) { | ||
1144 | print_cur_tb ("12285"); | ||
1145 | reiserfs_panic (tb->tb_sb, "PAP-12285: balance_leaf: insert_size must be 0 (%d)", tb->insert_size[0]); | ||
1146 | } | ||
1147 | } | ||
1148 | #endif /* CONFIG_REISERFS_CHECK */ | ||
1149 | |||
1150 | } | ||
1151 | } /* case M_PASTE: */ | ||
1152 | } | ||
1153 | } | ||
1154 | |||
1155 | #ifdef CONFIG_REISERFS_CHECK | ||
1156 | if ( flag == M_PASTE && tb->insert_size[0] ) { | ||
1157 | print_cur_tb ("12290"); | ||
1158 | reiserfs_panic (tb->tb_sb, "PAP-12290: balance_leaf: insert_size is still not 0 (%d)", tb->insert_size[0]); | ||
1159 | } | ||
1160 | #endif /* CONFIG_REISERFS_CHECK */ | ||
1161 | |||
1162 | return 0; | ||
1163 | } /* Leaf level of the tree is balanced (end of balance_leaf) */ | ||
1164 | |||
1165 | |||
1166 | |||
1167 | /* Make empty node */ | ||
1168 | void make_empty_node (struct buffer_info * bi) | ||
1169 | { | ||
1170 | struct block_head * blkh; | ||
1171 | |||
1172 | RFALSE( bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL"); | ||
1173 | |||
1174 | blkh = B_BLK_HEAD(bi->bi_bh); | ||
1175 | set_blkh_nr_item( blkh, 0 ); | ||
1176 | set_blkh_free_space( blkh, MAX_CHILD_SIZE(bi->bi_bh) ); | ||
1177 | |||
1178 | if (bi->bi_parent) | ||
1179 | B_N_CHILD (bi->bi_parent, bi->bi_position)->dc_size = 0; /* Endian safe if 0 */ | ||
1180 | } | ||
1181 | |||
1182 | |||
1183 | /* Get first empty buffer */ | ||
1184 | struct buffer_head * get_FEB (struct tree_balance * tb) | ||
1185 | { | ||
1186 | int i; | ||
1187 | struct buffer_head * first_b; | ||
1188 | struct buffer_info bi; | ||
1189 | |||
1190 | for (i = 0; i < MAX_FEB_SIZE; i ++) | ||
1191 | if (tb->FEB[i] != 0) | ||
1192 | break; | ||
1193 | |||
1194 | if (i == MAX_FEB_SIZE) | ||
1195 | reiserfs_panic(tb->tb_sb, "vs-12300: get_FEB: FEB list is empty"); | ||
1196 | |||
1197 | bi.tb = tb; | ||
1198 | bi.bi_bh = first_b = tb->FEB[i]; | ||
1199 | bi.bi_parent = NULL; | ||
1200 | bi.bi_position = 0; | ||
1201 | make_empty_node (&bi); | ||
1202 | set_buffer_uptodate(first_b); | ||
1203 | tb->FEB[i] = NULL; | ||
1204 | tb->used[i] = first_b; | ||
1205 | |||
1206 | return(first_b); | ||
1207 | } | ||
1208 | |||
1209 | |||
1210 | /* This is now used because reiserfs_free_block has to be able to | ||
1211 | ** schedule. | ||
1212 | */ | ||
1213 | static void store_thrown (struct tree_balance * tb, struct buffer_head * bh) | ||
1214 | { | ||
1215 | int i; | ||
1216 | |||
1217 | if (buffer_dirty (bh)) | ||
1218 | reiserfs_warning (tb->tb_sb, "store_thrown deals with dirty buffer"); | ||
1219 | for (i = 0; i < sizeof (tb->thrown)/sizeof (tb->thrown[0]); i ++) | ||
1220 | if (!tb->thrown[i]) { | ||
1221 | tb->thrown[i] = bh; | ||
1222 | get_bh(bh) ; /* free_thrown puts this */ | ||
1223 | return; | ||
1224 | } | ||
1225 | reiserfs_warning (tb->tb_sb, "store_thrown: too many thrown buffers"); | ||
1226 | } | ||
1227 | |||
1228 | static void free_thrown(struct tree_balance *tb) { | ||
1229 | int i ; | ||
1230 | b_blocknr_t blocknr ; | ||
1231 | for (i = 0; i < sizeof (tb->thrown)/sizeof (tb->thrown[0]); i++) { | ||
1232 | if (tb->thrown[i]) { | ||
1233 | blocknr = tb->thrown[i]->b_blocknr ; | ||
1234 | if (buffer_dirty (tb->thrown[i])) | ||
1235 | reiserfs_warning (tb->tb_sb, | ||
1236 | "free_thrown deals with dirty buffer %d", | ||
1237 | blocknr); | ||
1238 | brelse(tb->thrown[i]) ; /* incremented in store_thrown */ | ||
1239 | reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0); | ||
1240 | } | ||
1241 | } | ||
1242 | } | ||
1243 | |||
1244 | void reiserfs_invalidate_buffer (struct tree_balance * tb, struct buffer_head * bh) | ||
1245 | { | ||
1246 | struct block_head *blkh; | ||
1247 | blkh = B_BLK_HEAD(bh); | ||
1248 | set_blkh_level( blkh, FREE_LEVEL ); | ||
1249 | set_blkh_nr_item( blkh, 0 ); | ||
1250 | |||
1251 | clear_buffer_dirty(bh); | ||
1252 | store_thrown (tb, bh); | ||
1253 | } | ||
1254 | |||
1255 | /* Replace n_dest'th key in buffer dest by n_src'th key of buffer src.*/ | ||
1256 | void replace_key (struct tree_balance * tb, struct buffer_head * dest, int n_dest, | ||
1257 | struct buffer_head * src, int n_src) | ||
1258 | { | ||
1259 | |||
1260 | RFALSE( dest == NULL || src == NULL, | ||
1261 | "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)", | ||
1262 | src, dest); | ||
1263 | RFALSE( ! B_IS_KEYS_LEVEL (dest), | ||
1264 | "vs-12310: invalid level (%z) for destination buffer. dest must be leaf", | ||
1265 | dest); | ||
1266 | RFALSE( n_dest < 0 || n_src < 0, | ||
1267 | "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest); | ||
1268 | RFALSE( n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src), | ||
1269 | "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big", | ||
1270 | n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest)); | ||
1271 | |||
1272 | if (B_IS_ITEMS_LEVEL (src)) | ||
1273 | /* source buffer contains leaf node */ | ||
1274 | memcpy (B_N_PDELIM_KEY(dest,n_dest), B_N_PITEM_HEAD(src,n_src), KEY_SIZE); | ||
1275 | else | ||
1276 | memcpy (B_N_PDELIM_KEY(dest,n_dest), B_N_PDELIM_KEY(src,n_src), KEY_SIZE); | ||
1277 | |||
1278 | do_balance_mark_internal_dirty (tb, dest, 0); | ||
1279 | } | ||
1280 | |||
1281 | |||
1282 | int get_left_neighbor_position ( | ||
1283 | struct tree_balance * tb, | ||
1284 | int h | ||
1285 | ) | ||
1286 | { | ||
1287 | int Sh_position = PATH_H_POSITION (tb->tb_path, h + 1); | ||
1288 | |||
1289 | RFALSE( PATH_H_PPARENT (tb->tb_path, h) == 0 || tb->FL[h] == 0, | ||
1290 | "vs-12325: FL[%d](%p) or F[%d](%p) does not exist", | ||
1291 | h, tb->FL[h], h, PATH_H_PPARENT (tb->tb_path, h)); | ||
1292 | |||
1293 | if (Sh_position == 0) | ||
1294 | return B_NR_ITEMS (tb->FL[h]); | ||
1295 | else | ||
1296 | return Sh_position - 1; | ||
1297 | } | ||
1298 | |||
1299 | |||
1300 | int get_right_neighbor_position (struct tree_balance * tb, int h) | ||
1301 | { | ||
1302 | int Sh_position = PATH_H_POSITION (tb->tb_path, h + 1); | ||
1303 | |||
1304 | RFALSE( PATH_H_PPARENT (tb->tb_path, h) == 0 || tb->FR[h] == 0, | ||
1305 | "vs-12330: F[%d](%p) or FR[%d](%p) does not exist", | ||
1306 | h, PATH_H_PPARENT (tb->tb_path, h), h, tb->FR[h]); | ||
1307 | |||
1308 | if (Sh_position == B_NR_ITEMS (PATH_H_PPARENT (tb->tb_path, h))) | ||
1309 | return 0; | ||
1310 | else | ||
1311 | return Sh_position + 1; | ||
1312 | } | ||
1313 | |||
1314 | |||
1315 | #ifdef CONFIG_REISERFS_CHECK | ||
1316 | |||
1317 | int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value); | ||
1318 | static void check_internal_node (struct super_block * s, struct buffer_head * bh, char * mes) | ||
1319 | { | ||
1320 | struct disk_child * dc; | ||
1321 | int i; | ||
1322 | |||
1323 | RFALSE( !bh, "PAP-12336: bh == 0"); | ||
1324 | |||
1325 | if (!bh || !B_IS_IN_TREE (bh)) | ||
1326 | return; | ||
1327 | |||
1328 | RFALSE( !buffer_dirty (bh) && | ||
1329 | !(buffer_journaled(bh) || buffer_journal_dirty(bh)), | ||
1330 | "PAP-12337: buffer (%b) must be dirty", bh); | ||
1331 | dc = B_N_CHILD (bh, 0); | ||
1332 | |||
1333 | for (i = 0; i <= B_NR_ITEMS (bh); i ++, dc ++) { | ||
1334 | if (!is_reusable (s, dc_block_number(dc), 1) ) { | ||
1335 | print_cur_tb (mes); | ||
1336 | reiserfs_panic (s, "PAP-12338: check_internal_node: invalid child pointer %y in %b", dc, bh); | ||
1337 | } | ||
1338 | } | ||
1339 | } | ||
1340 | |||
1341 | |||
1342 | static int locked_or_not_in_tree (struct buffer_head * bh, char * which) | ||
1343 | { | ||
1344 | if ( (!buffer_journal_prepared (bh) && buffer_locked (bh)) || | ||
1345 | !B_IS_IN_TREE (bh) ) { | ||
1346 | reiserfs_warning (NULL, "vs-12339: locked_or_not_in_tree: %s (%b)", | ||
1347 | which, bh); | ||
1348 | return 1; | ||
1349 | } | ||
1350 | return 0; | ||
1351 | } | ||
1352 | |||
1353 | |||
1354 | static int check_before_balancing (struct tree_balance * tb) | ||
1355 | { | ||
1356 | int retval = 0; | ||
1357 | |||
1358 | if ( cur_tb ) { | ||
1359 | reiserfs_panic (tb->tb_sb, "vs-12335: check_before_balancing: " | ||
1360 | "suspect that schedule occurred based on cur_tb not being null at this point in code. " | ||
1361 | "do_balance cannot properly handle schedule occurring while it runs."); | ||
1362 | } | ||
1363 | |||
1364 | /* double check that buffers that we will modify are unlocked. (fix_nodes should already have | ||
1365 | prepped all of these for us). */ | ||
1366 | if ( tb->lnum[0] ) { | ||
1367 | retval |= locked_or_not_in_tree (tb->L[0], "L[0]"); | ||
1368 | retval |= locked_or_not_in_tree (tb->FL[0], "FL[0]"); | ||
1369 | retval |= locked_or_not_in_tree (tb->CFL[0], "CFL[0]"); | ||
1370 | check_leaf (tb->L[0]); | ||
1371 | } | ||
1372 | if ( tb->rnum[0] ) { | ||
1373 | retval |= locked_or_not_in_tree (tb->R[0], "R[0]"); | ||
1374 | retval |= locked_or_not_in_tree (tb->FR[0], "FR[0]"); | ||
1375 | retval |= locked_or_not_in_tree (tb->CFR[0], "CFR[0]"); | ||
1376 | check_leaf (tb->R[0]); | ||
1377 | } | ||
1378 | retval |= locked_or_not_in_tree (PATH_PLAST_BUFFER (tb->tb_path), "S[0]"); | ||
1379 | check_leaf (PATH_PLAST_BUFFER (tb->tb_path)); | ||
1380 | |||
1381 | return retval; | ||
1382 | } | ||
1383 | |||
1384 | |||
1385 | static void check_after_balance_leaf (struct tree_balance * tb) | ||
1386 | { | ||
1387 | if (tb->lnum[0]) { | ||
1388 | if (B_FREE_SPACE (tb->L[0]) != | ||
1389 | MAX_CHILD_SIZE (tb->L[0]) - dc_size(B_N_CHILD (tb->FL[0], get_left_neighbor_position (tb, 0)))) { | ||
1390 | print_cur_tb ("12221"); | ||
1391 | reiserfs_panic (tb->tb_sb, "PAP-12355: check_after_balance_leaf: shift to left was incorrect"); | ||
1392 | } | ||
1393 | } | ||
1394 | if (tb->rnum[0]) { | ||
1395 | if (B_FREE_SPACE (tb->R[0]) != | ||
1396 | MAX_CHILD_SIZE (tb->R[0]) - dc_size(B_N_CHILD (tb->FR[0], get_right_neighbor_position (tb, 0)))) { | ||
1397 | print_cur_tb ("12222"); | ||
1398 | reiserfs_panic (tb->tb_sb, "PAP-12360: check_after_balance_leaf: shift to right was incorrect"); | ||
1399 | } | ||
1400 | } | ||
1401 | if (PATH_H_PBUFFER(tb->tb_path,1) && | ||
1402 | (B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) != | ||
1403 | (MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)) - | ||
1404 | dc_size(B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1), | ||
1405 | PATH_H_POSITION (tb->tb_path, 1)))) )) { | ||
1406 | int left = B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)); | ||
1407 | int right = (MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)) - | ||
1408 | dc_size(B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1), | ||
1409 | PATH_H_POSITION (tb->tb_path, 1)))); | ||
1410 | print_cur_tb ("12223"); | ||
1411 | reiserfs_warning (tb->tb_sb, | ||
1412 | "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; " | ||
1413 | "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d", | ||
1414 | left, | ||
1415 | MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)), | ||
1416 | PATH_H_PBUFFER(tb->tb_path,1), | ||
1417 | PATH_H_POSITION (tb->tb_path, 1), | ||
1418 | dc_size(B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1), PATH_H_POSITION (tb->tb_path, 1 )) ), | ||
1419 | right ); | ||
1420 | reiserfs_panic (tb->tb_sb, "PAP-12365: check_after_balance_leaf: S is incorrect"); | ||
1421 | } | ||
1422 | } | ||
1423 | |||
1424 | |||
1425 | static void check_leaf_level (struct tree_balance * tb) | ||
1426 | { | ||
1427 | check_leaf (tb->L[0]); | ||
1428 | check_leaf (tb->R[0]); | ||
1429 | check_leaf (PATH_PLAST_BUFFER (tb->tb_path)); | ||
1430 | } | ||
1431 | |||
1432 | static void check_internal_levels (struct tree_balance * tb) | ||
1433 | { | ||
1434 | int h; | ||
1435 | |||
1436 | /* check all internal nodes */ | ||
1437 | for (h = 1; tb->insert_size[h]; h ++) { | ||
1438 | check_internal_node (tb->tb_sb, PATH_H_PBUFFER (tb->tb_path, h), "BAD BUFFER ON PATH"); | ||
1439 | if (tb->lnum[h]) | ||
1440 | check_internal_node (tb->tb_sb, tb->L[h], "BAD L"); | ||
1441 | if (tb->rnum[h]) | ||
1442 | check_internal_node (tb->tb_sb, tb->R[h], "BAD R"); | ||
1443 | } | ||
1444 | |||
1445 | } | ||
1446 | |||
1447 | #endif | ||
1448 | |||
1449 | |||
1450 | |||
1451 | |||
1452 | |||
1453 | |||
1454 | /* Now we have all of the buffers that must be used in balancing of | ||
1455 | the tree. We rely on the assumption that schedule() will not occur | ||
1456 | while do_balance works. ( Only interrupt handlers are acceptable.) | ||
1457 | We balance the tree according to the analysis made before this, | ||
1458 | using buffers already obtained. For SMP support it will someday be | ||
1459 | necessary to add ordered locking of tb. */ | ||
1460 | |||
1461 | /* Some interesting rules of balancing: | ||
1462 | |||
1463 | we delete a maximum of two nodes per level per balancing: we never | ||
1464 | delete R, when we delete two of three nodes L, S, R then we move | ||
1465 | them into R. | ||
1466 | |||
1467 | we only delete L if we are deleting two nodes, if we delete only | ||
1468 | one node we delete S | ||
1469 | |||
1470 | if we shift leaves then we shift as much as we can: this is a | ||
1471 | deliberate policy of extremism in node packing which results in | ||
1472 | higher average utilization after repeated random balance operations | ||
1473 | at the cost of more memory copies and more balancing as a result of | ||
1474 | small insertions to full nodes. | ||
1475 | |||
1476 | if we shift internal nodes we try to evenly balance the node | ||
1477 | utilization, with consequent less balancing at the cost of lower | ||
1478 | utilization. | ||
1479 | |||
1480 | one could argue that the policy for directories in leaves should be | ||
1481 | that of internal nodes, but we will wait until another day to | ||
1482 | evaluate this.... It would be nice to someday measure and prove | ||
1483 | these assumptions as to what is optimal.... | ||
1484 | |||
1485 | */ | ||
1486 | |||
/*
 * Entry hook of do_balance().
 *
 * In CONFIG_REISERFS_CHECK builds the buffers are validated through
 * check_before_balancing() and tb is then recorded in cur_tb. The RFALSE
 * call is left exactly as a macro argument because check_before_balancing()
 * is only defined in checking builds.
 */
static inline void do_balance_starts(struct tree_balance *tb)
{
	/*
	 * Debugging aids (use print_cur_tb() to see the initial state of
	 * struct tree_balance):
	 *
	 *   store_print_tb (tb);
	 *
	 * do not delete, just comment it out:
	 *
	 *   print_tb(flag, PATH_LAST_POSITION(tb->tb_path),
	 *            tb->tb_path->pos_in_item, tb, "check");
	 */
	RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");

#ifdef CONFIG_REISERFS_CHECK
	cur_tb = tb;
#endif
}
1502 | |||
1503 | |||
1504 | static inline void do_balance_completed (struct tree_balance * tb) | ||
1505 | { | ||
1506 | |||
1507 | #ifdef CONFIG_REISERFS_CHECK | ||
1508 | check_leaf_level (tb); | ||
1509 | check_internal_levels (tb); | ||
1510 | cur_tb = NULL; | ||
1511 | #endif | ||
1512 | |||
1513 | /* reiserfs_free_block is no longer schedule safe. So, we need to | ||
1514 | ** put the buffers we want freed on the thrown list during do_balance, | ||
1515 | ** and then free them now | ||
1516 | */ | ||
1517 | |||
1518 | REISERFS_SB(tb->tb_sb)->s_do_balance ++; | ||
1519 | |||
1520 | |||
1521 | /* release all nodes hold to perform the balancing */ | ||
1522 | unfix_nodes(tb); | ||
1523 | |||
1524 | free_thrown(tb) ; | ||
1525 | } | ||
1526 | |||
1527 | |||
1528 | |||
1529 | |||
1530 | |||
1531 | void do_balance (struct tree_balance * tb, /* tree_balance structure */ | ||
1532 | struct item_head * ih, /* item header of inserted item */ | ||
1533 | const char * body, /* body of inserted item or bytes to paste */ | ||
1534 | int flag) /* i - insert, d - delete | ||
1535 | c - cut, p - paste | ||
1536 | |||
1537 | Cut means delete part of an item | ||
1538 | (includes removing an entry from a | ||
1539 | directory). | ||
1540 | |||
1541 | Delete means delete whole item. | ||
1542 | |||
1543 | Insert means add a new item into the | ||
1544 | tree. | ||
1545 | |||
1546 | Paste means to append to the end of an | ||
1547 | existing file or to insert a directory | ||
1548 | entry. */ | ||
1549 | { | ||
1550 | int child_pos, /* position of a child node in its parent */ | ||
1551 | h; /* level of the tree being processed */ | ||
1552 | struct item_head insert_key[2]; /* in our processing of one level | ||
1553 | we sometimes determine what | ||
1554 | must be inserted into the next | ||
1555 | higher level. This insertion | ||
1556 | consists of a key or two keys | ||
1557 | and their corresponding | ||
1558 | pointers */ | ||
1559 | struct buffer_head *insert_ptr[2]; /* inserted node-ptrs for the next | ||
1560 | level */ | ||
1561 | |||
1562 | tb->tb_mode = flag; | ||
1563 | tb->need_balance_dirty = 0; | ||
1564 | |||
1565 | if (FILESYSTEM_CHANGED_TB(tb)) { | ||
1566 | reiserfs_panic(tb->tb_sb, "clm-6000: do_balance, fs generation has changed\n") ; | ||
1567 | } | ||
1568 | /* if we have no real work to do */ | ||
1569 | if ( ! tb->insert_size[0] ) { | ||
1570 | reiserfs_warning (tb->tb_sb, | ||
1571 | "PAP-12350: do_balance: insert_size == 0, mode == %c", | ||
1572 | flag); | ||
1573 | unfix_nodes(tb); | ||
1574 | return; | ||
1575 | } | ||
1576 | |||
1577 | atomic_inc (&(fs_generation (tb->tb_sb))); | ||
1578 | do_balance_starts (tb); | ||
1579 | |||
1580 | /* balance leaf returns 0 except if combining L R and S into | ||
1581 | one node. see balance_internal() for explanation of this | ||
1582 | line of code.*/ | ||
1583 | child_pos = PATH_H_B_ITEM_ORDER (tb->tb_path, 0) + | ||
1584 | balance_leaf (tb, ih, body, flag, insert_key, insert_ptr); | ||
1585 | |||
1586 | #ifdef CONFIG_REISERFS_CHECK | ||
1587 | check_after_balance_leaf (tb); | ||
1588 | #endif | ||
1589 | |||
1590 | /* Balance internal level of the tree. */ | ||
1591 | for ( h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++ ) | ||
1592 | child_pos = balance_internal (tb, h, child_pos, insert_key, insert_ptr); | ||
1593 | |||
1594 | |||
1595 | do_balance_completed (tb); | ||
1596 | |||
1597 | } | ||
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c new file mode 100644 index 000000000000..26950113af8c --- /dev/null +++ b/fs/reiserfs/file.c | |||
@@ -0,0 +1,1408 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | */ | ||
4 | |||
5 | |||
6 | #include <linux/time.h> | ||
7 | #include <linux/reiserfs_fs.h> | ||
8 | #include <linux/reiserfs_acl.h> | ||
9 | #include <linux/reiserfs_xattr.h> | ||
10 | #include <linux/smp_lock.h> | ||
11 | #include <asm/uaccess.h> | ||
12 | #include <linux/pagemap.h> | ||
13 | #include <linux/swap.h> | ||
14 | #include <linux/writeback.h> | ||
15 | #include <linux/blkdev.h> | ||
16 | #include <linux/buffer_head.h> | ||
17 | #include <linux/quotaops.h> | ||
18 | |||
19 | /* | ||
20 | ** We pack the tails of files on file close, not at the time they are written. | ||
21 | ** This implies an unnecessary copy of the tail and an unnecessary indirect item | ||
22 | ** insertion/balancing, for files that are written in one write. | ||
23 | ** It avoids unnecessary tail packings (balances) for files that are written in | ||
24 | ** multiple writes and are small enough to have tails. | ||
25 | ** | ||
26 | ** file_release is called by the VFS layer when the file is closed. If | ||
27 | ** this is the last open file descriptor, and the file | ||
28 | ** small enough to have a tail, and the tail is currently in an | ||
29 | ** unformatted node, the tail is converted back into a direct item. | ||
30 | ** | ||
31 | ** We use reiserfs_truncate_file to pack the tail, since it already has | ||
32 | ** all the conditions coded. | ||
33 | */ | ||
34 | static int reiserfs_file_release (struct inode * inode, struct file * filp) | ||
35 | { | ||
36 | |||
37 | struct reiserfs_transaction_handle th ; | ||
38 | int err; | ||
39 | int jbegin_failure = 0; | ||
40 | |||
41 | if (!S_ISREG (inode->i_mode)) | ||
42 | BUG (); | ||
43 | |||
44 | /* fast out for when nothing needs to be done */ | ||
45 | if ((atomic_read(&inode->i_count) > 1 || | ||
46 | !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) || | ||
47 | !tail_has_to_be_packed(inode)) && | ||
48 | REISERFS_I(inode)->i_prealloc_count <= 0) { | ||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | reiserfs_write_lock(inode->i_sb); | ||
53 | down (&inode->i_sem); | ||
54 | /* freeing preallocation only involves relogging blocks that | ||
55 | * are already in the current transaction. preallocation gets | ||
56 | * freed at the end of each transaction, so it is impossible for | ||
57 | * us to log any additional blocks (including quota blocks) | ||
58 | */ | ||
59 | err = journal_begin(&th, inode->i_sb, 1); | ||
60 | if (err) { | ||
61 | /* uh oh, we can't allow the inode to go away while there | ||
62 | * is still preallocation blocks pending. Try to join the | ||
63 | * aborted transaction | ||
64 | */ | ||
65 | jbegin_failure = err; | ||
66 | err = journal_join_abort(&th, inode->i_sb, 1); | ||
67 | |||
68 | if (err) { | ||
69 | /* hmpf, our choices here aren't good. We can pin the inode | ||
70 | * which will disallow unmount from every happening, we can | ||
71 | * do nothing, which will corrupt random memory on unmount, | ||
72 | * or we can forcibly remove the file from the preallocation | ||
73 | * list, which will leak blocks on disk. Lets pin the inode | ||
74 | * and let the admin know what is going on. | ||
75 | */ | ||
76 | igrab(inode); | ||
77 | reiserfs_warning(inode->i_sb, "pinning inode %lu because the " | ||
78 | "preallocation can't be freed"); | ||
79 | goto out; | ||
80 | } | ||
81 | } | ||
82 | reiserfs_update_inode_transaction(inode) ; | ||
83 | |||
84 | #ifdef REISERFS_PREALLOCATE | ||
85 | reiserfs_discard_prealloc (&th, inode); | ||
86 | #endif | ||
87 | err = journal_end(&th, inode->i_sb, 1); | ||
88 | |||
89 | /* copy back the error code from journal_begin */ | ||
90 | if (!err) | ||
91 | err = jbegin_failure; | ||
92 | |||
93 | if (!err && atomic_read(&inode->i_count) <= 1 && | ||
94 | (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) && | ||
95 | tail_has_to_be_packed (inode)) { | ||
96 | /* if regular file is released by last holder and it has been | ||
97 | appended (we append by unformatted node only) or its direct | ||
98 | item(s) had to be converted, then it may have to be | ||
99 | indirect2direct converted */ | ||
100 | err = reiserfs_truncate_file(inode, 0) ; | ||
101 | } | ||
102 | out: | ||
103 | up (&inode->i_sem); | ||
104 | reiserfs_write_unlock(inode->i_sb); | ||
105 | return err; | ||
106 | } | ||
107 | |||
/*
 * Truncate entry point with the single-argument signature: forwards to
 * reiserfs_truncate_file() with its second argument set to 1.
 */
static void reiserfs_vfs_truncate_file(struct inode *inode)
{
	reiserfs_truncate_file(inode, 1);
}
111 | |||
112 | /* Sync a reiserfs file. */ | ||
113 | |||
114 | /* | ||
115 | * FIXME: sync_mapping_buffers() never has anything to sync. Can | ||
116 | * be removed... | ||
117 | */ | ||
118 | |||
119 | static int reiserfs_sync_file( | ||
120 | struct file * p_s_filp, | ||
121 | struct dentry * p_s_dentry, | ||
122 | int datasync | ||
123 | ) { | ||
124 | struct inode * p_s_inode = p_s_dentry->d_inode; | ||
125 | int n_err; | ||
126 | int barrier_done; | ||
127 | |||
128 | if (!S_ISREG(p_s_inode->i_mode)) | ||
129 | BUG (); | ||
130 | n_err = sync_mapping_buffers(p_s_inode->i_mapping) ; | ||
131 | reiserfs_write_lock(p_s_inode->i_sb); | ||
132 | barrier_done = reiserfs_commit_for_inode(p_s_inode); | ||
133 | reiserfs_write_unlock(p_s_inode->i_sb); | ||
134 | if (barrier_done != 1) | ||
135 | blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL); | ||
136 | if (barrier_done < 0) | ||
137 | return barrier_done; | ||
138 | return ( n_err < 0 ) ? -EIO : 0; | ||
139 | } | ||
140 | |||
/* I really do not want to play with memory shortage right now, so
   to simplify the code, we are not going to write more than this many pages at
   a time. This should still considerably improve performance compared to the
   4k-at-a-time case. With 4k pages this is 32 pages (128k in total).
   The expansion is fully parenthesized so the macro can be used safely
   inside larger expressions. */
#define REISERFS_WRITE_PAGES_AT_A_TIME ((128 * 1024) / PAGE_CACHE_SIZE)
146 | |||
147 | /* Allocates blocks for a file to fulfil write request. | ||
148 | Maps all unmapped but prepared pages from the list. | ||
149 | Updates metadata with newly allocated blocknumbers as needed */ | ||
150 | static int reiserfs_allocate_blocks_for_region( | ||
151 | struct reiserfs_transaction_handle *th, | ||
152 | struct inode *inode, /* Inode we work with */ | ||
153 | loff_t pos, /* Writing position */ | ||
154 | int num_pages, /* number of pages write going | ||
155 | to touch */ | ||
156 | int write_bytes, /* amount of bytes to write */ | ||
157 | struct page **prepared_pages, /* array of | ||
158 | prepared pages | ||
159 | */ | ||
160 | int blocks_to_allocate /* Amount of blocks we | ||
161 | need to allocate to | ||
162 | fit the data into file | ||
163 | */ | ||
164 | ) | ||
165 | { | ||
166 | struct cpu_key key; // cpu key of item that we are going to deal with | ||
167 | struct item_head *ih; // pointer to item head that we are going to deal with | ||
168 | struct buffer_head *bh; // Buffer head that contains items that we are going to deal with | ||
169 | __u32 * item; // pointer to item we are going to deal with | ||
170 | INITIALIZE_PATH(path); // path to item, that we are going to deal with. | ||
171 | b_blocknr_t *allocated_blocks; // Pointer to a place where allocated blocknumbers would be stored. | ||
172 | reiserfs_blocknr_hint_t hint; // hint structure for block allocator. | ||
173 | size_t res; // return value of various functions that we call. | ||
174 | int curr_block; // current block used to keep track of unmapped blocks. | ||
175 | int i; // loop counter | ||
176 | int itempos; // position in item | ||
177 | unsigned int from = (pos & (PAGE_CACHE_SIZE - 1)); // writing position in | ||
178 | // first page | ||
179 | unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1; /* last modified byte offset in last page */ | ||
180 | __u64 hole_size ; // amount of blocks for a file hole, if it needed to be created. | ||
181 | int modifying_this_item = 0; // Flag for items traversal code to keep track | ||
182 | // of the fact that we already prepared | ||
183 | // current block for journal | ||
184 | int will_prealloc = 0; | ||
185 | RFALSE(!blocks_to_allocate, "green-9004: tried to allocate zero blocks?"); | ||
186 | |||
187 | /* only preallocate if this is a small write */ | ||
188 | if (REISERFS_I(inode)->i_prealloc_count || | ||
189 | (!(write_bytes & (inode->i_sb->s_blocksize -1)) && | ||
190 | blocks_to_allocate < | ||
191 | REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize)) | ||
192 | will_prealloc = REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize; | ||
193 | |||
194 | allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) * | ||
195 | sizeof(b_blocknr_t), GFP_NOFS); | ||
196 | |||
197 | /* First we compose a key to point at the writing position, we want to do | ||
198 | that outside of any locking region. */ | ||
199 | make_cpu_key (&key, inode, pos+1, TYPE_ANY, 3/*key length*/); | ||
200 | |||
201 | /* If we came here, it means we absolutely need to open a transaction, | ||
202 | since we need to allocate some blocks */ | ||
203 | reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that. | ||
204 | res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS); // Wish I know if this number enough | ||
205 | if (res) | ||
206 | goto error_exit; | ||
207 | reiserfs_update_inode_transaction(inode) ; | ||
208 | |||
209 | /* Look for the in-tree position of our write, need path for block allocator */ | ||
210 | res = search_for_position_by_key(inode->i_sb, &key, &path); | ||
211 | if ( res == IO_ERROR ) { | ||
212 | res = -EIO; | ||
213 | goto error_exit; | ||
214 | } | ||
215 | |||
216 | /* Allocate blocks */ | ||
217 | /* First fill in "hint" structure for block allocator */ | ||
218 | hint.th = th; // transaction handle. | ||
219 | hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine. | ||
220 | hint.inode = inode; // Inode is needed by block allocator too. | ||
221 | hint.search_start = 0; // We have no hint on where to search free blocks for block allocator. | ||
222 | hint.key = key.on_disk_key; // on disk key of file. | ||
223 | hint.block = inode->i_blocks>>(inode->i_sb->s_blocksize_bits-9); // Number of disk blocks this file occupies already. | ||
224 | hint.formatted_node = 0; // We are allocating blocks for unformatted node. | ||
225 | hint.preallocate = will_prealloc; | ||
226 | |||
227 | /* Call block allocator to allocate blocks */ | ||
228 | res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate); | ||
229 | if ( res != CARRY_ON ) { | ||
230 | if ( res == NO_DISK_SPACE ) { | ||
231 | /* We flush the transaction in case of no space. This way some | ||
232 | blocks might become free */ | ||
233 | SB_JOURNAL(inode->i_sb)->j_must_wait = 1; | ||
234 | res = restart_transaction(th, inode, &path); | ||
235 | if (res) | ||
236 | goto error_exit; | ||
237 | |||
238 | /* We might have scheduled, so search again */ | ||
239 | res = search_for_position_by_key(inode->i_sb, &key, &path); | ||
240 | if ( res == IO_ERROR ) { | ||
241 | res = -EIO; | ||
242 | goto error_exit; | ||
243 | } | ||
244 | |||
245 | /* update changed info for hint structure. */ | ||
246 | res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate); | ||
247 | if ( res != CARRY_ON ) { | ||
248 | res = -ENOSPC; | ||
249 | pathrelse(&path); | ||
250 | goto error_exit; | ||
251 | } | ||
252 | } else { | ||
253 | res = -ENOSPC; | ||
254 | pathrelse(&path); | ||
255 | goto error_exit; | ||
256 | } | ||
257 | } | ||
258 | |||
259 | #ifdef __BIG_ENDIAN | ||
260 | // Too bad, I have not found any way to convert a given region from | ||
261 | // cpu format to little endian format | ||
262 | { | ||
263 | int i; | ||
264 | for ( i = 0; i < blocks_to_allocate ; i++) | ||
265 | allocated_blocks[i]=cpu_to_le32(allocated_blocks[i]); | ||
266 | } | ||
267 | #endif | ||
268 | |||
269 | /* Blocks allocating well might have scheduled and tree might have changed, | ||
270 | let's search the tree again */ | ||
271 | /* find where in the tree our write should go */ | ||
272 | res = search_for_position_by_key(inode->i_sb, &key, &path); | ||
273 | if ( res == IO_ERROR ) { | ||
274 | res = -EIO; | ||
275 | goto error_exit_free_blocks; | ||
276 | } | ||
277 | |||
278 | bh = get_last_bh( &path ); // Get a bufferhead for last element in path. | ||
279 | ih = get_ih( &path ); // Get a pointer to last item head in path. | ||
280 | item = get_item( &path ); // Get a pointer to last item in path | ||
281 | |||
282 | /* Let's see what we have found */ | ||
283 | if ( res != POSITION_FOUND ) { /* position not found, this means that we | ||
284 | might need to append file with holes | ||
285 | first */ | ||
286 | // Since we are writing past the file's end, we need to find out if | ||
287 | // there is a hole that needs to be inserted before our writing | ||
288 | // position, and how many blocks it is going to cover (we need to | ||
289 | // populate pointers to file blocks representing the hole with zeros) | ||
290 | |||
291 | { | ||
292 | int item_offset = 1; | ||
293 | /* | ||
294 | * if ih is stat data, its offset is 0 and we don't want to | ||
295 | * add 1 to pos in the hole_size calculation | ||
296 | */ | ||
297 | if (is_statdata_le_ih(ih)) | ||
298 | item_offset = 0; | ||
299 | hole_size = (pos + item_offset - | ||
300 | (le_key_k_offset( get_inode_item_key_version(inode), | ||
301 | &(ih->ih_key)) + | ||
302 | op_bytes_number(ih, inode->i_sb->s_blocksize))) >> | ||
303 | inode->i_sb->s_blocksize_bits; | ||
304 | } | ||
305 | |||
306 | if ( hole_size > 0 ) { | ||
307 | int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE ); // How much data to insert first time. | ||
308 | /* area filled with zeroes, to supply as list of zero blocknumbers | ||
309 | We allocate it outside of loop just in case loop would spin for | ||
310 | several iterations. */ | ||
311 | char *zeros = kmalloc(to_paste*UNFM_P_SIZE, GFP_ATOMIC); // We cannot insert more than MAX_ITEM_LEN bytes anyway. | ||
312 | if ( !zeros ) { | ||
313 | res = -ENOMEM; | ||
314 | goto error_exit_free_blocks; | ||
315 | } | ||
316 | memset ( zeros, 0, to_paste*UNFM_P_SIZE); | ||
317 | do { | ||
318 | to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE ); | ||
319 | if ( is_indirect_le_ih(ih) ) { | ||
320 | /* Ok, there is existing indirect item already. Need to append it */ | ||
321 | /* Calculate position past inserted item */ | ||
322 | make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3); | ||
323 | res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)zeros, UNFM_P_SIZE*to_paste); | ||
324 | if ( res ) { | ||
325 | kfree(zeros); | ||
326 | goto error_exit_free_blocks; | ||
327 | } | ||
328 | } else if ( is_statdata_le_ih(ih) ) { | ||
329 | /* No existing item, create it */ | ||
330 | /* item head for new item */ | ||
331 | struct item_head ins_ih; | ||
332 | |||
333 | /* create a key for our new item */ | ||
334 | make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3); | ||
335 | |||
336 | /* Create new item head for our new item */ | ||
337 | make_le_item_head (&ins_ih, &key, key.version, 1, | ||
338 | TYPE_INDIRECT, to_paste*UNFM_P_SIZE, | ||
339 | 0 /* free space */); | ||
340 | |||
341 | /* Find where such item should live in the tree */ | ||
342 | res = search_item (inode->i_sb, &key, &path); | ||
343 | if ( res != ITEM_NOT_FOUND ) { | ||
344 | /* item should not exist, otherwise we have error */ | ||
345 | if ( res != -ENOSPC ) { | ||
346 | reiserfs_warning (inode->i_sb, | ||
347 | "green-9008: search_by_key (%K) returned %d", | ||
348 | &key, res); | ||
349 | } | ||
350 | res = -EIO; | ||
351 | kfree(zeros); | ||
352 | goto error_exit_free_blocks; | ||
353 | } | ||
354 | res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)zeros); | ||
355 | } else { | ||
356 | reiserfs_panic(inode->i_sb, "green-9011: Unexpected key type %K\n", &key); | ||
357 | } | ||
358 | if ( res ) { | ||
359 | kfree(zeros); | ||
360 | goto error_exit_free_blocks; | ||
361 | } | ||
362 | /* Now we want to check if transaction is too full, and if it is | ||
363 | we restart it. This will also free the path. */ | ||
364 | if (journal_transaction_should_end(th, th->t_blocks_allocated)) { | ||
365 | res = restart_transaction(th, inode, &path); | ||
366 | if (res) { | ||
367 | pathrelse (&path); | ||
368 | kfree(zeros); | ||
369 | goto error_exit; | ||
370 | } | ||
371 | } | ||
372 | |||
373 | /* Well, need to recalculate path and stuff */ | ||
374 | set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + (to_paste << inode->i_blkbits)); | ||
375 | res = search_for_position_by_key(inode->i_sb, &key, &path); | ||
376 | if ( res == IO_ERROR ) { | ||
377 | res = -EIO; | ||
378 | kfree(zeros); | ||
379 | goto error_exit_free_blocks; | ||
380 | } | ||
381 | bh=get_last_bh(&path); | ||
382 | ih=get_ih(&path); | ||
383 | item = get_item(&path); | ||
384 | hole_size -= to_paste; | ||
385 | } while ( hole_size ); | ||
386 | kfree(zeros); | ||
387 | } | ||
388 | } | ||
389 | |||
390 | // Go through existing indirect items first | ||
391 | // replace all zeroes with blocknumbers from list | ||
392 | // Note that if no corresponding item was found, by previous search, | ||
393 | // it means there are no existing in-tree representation for file area | ||
394 | // we are going to overwrite, so there is nothing to scan through for holes. | ||
395 | for ( curr_block = 0, itempos = path.pos_in_item ; curr_block < blocks_to_allocate && res == POSITION_FOUND ; ) { | ||
396 | retry: | ||
397 | |||
398 | if ( itempos >= ih_item_len(ih)/UNFM_P_SIZE ) { | ||
399 | /* We run out of data in this indirect item, let's look for another | ||
400 | one. */ | ||
401 | /* First if we are already modifying current item, log it */ | ||
402 | if ( modifying_this_item ) { | ||
403 | journal_mark_dirty (th, inode->i_sb, bh); | ||
404 | modifying_this_item = 0; | ||
405 | } | ||
406 | /* Then set the key to look for a new indirect item (offset of old | ||
407 | item is added to old item length */ | ||
408 | set_cpu_key_k_offset( &key, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize)); | ||
409 | /* Search ofor position of new key in the tree. */ | ||
410 | res = search_for_position_by_key(inode->i_sb, &key, &path); | ||
411 | if ( res == IO_ERROR) { | ||
412 | res = -EIO; | ||
413 | goto error_exit_free_blocks; | ||
414 | } | ||
415 | bh=get_last_bh(&path); | ||
416 | ih=get_ih(&path); | ||
417 | item = get_item(&path); | ||
418 | itempos = path.pos_in_item; | ||
419 | continue; // loop to check all kinds of conditions and so on. | ||
420 | } | ||
421 | /* Ok, we have correct position in item now, so let's see if it is | ||
422 | representing file hole (blocknumber is zero) and fill it if needed */ | ||
423 | if ( !item[itempos] ) { | ||
424 | /* Ok, a hole. Now we need to check if we already prepared this | ||
425 | block to be journaled */ | ||
426 | while ( !modifying_this_item ) { // loop until succeed | ||
427 | /* Well, this item is not journaled yet, so we must prepare | ||
428 | it for journal first, before we can change it */ | ||
429 | struct item_head tmp_ih; // We copy item head of found item, | ||
430 | // here to detect if fs changed under | ||
431 | // us while we were preparing for | ||
432 | // journal. | ||
433 | int fs_gen; // We store fs generation here to find if someone | ||
434 | // changes fs under our feet | ||
435 | |||
436 | copy_item_head (&tmp_ih, ih); // Remember itemhead | ||
437 | fs_gen = get_generation (inode->i_sb); // remember fs generation | ||
438 | reiserfs_prepare_for_journal(inode->i_sb, bh, 1); // Prepare a buffer within which indirect item is stored for changing. | ||
439 | if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { | ||
440 | // Sigh, fs was changed under us, we need to look for new | ||
441 | // location of item we are working with | ||
442 | |||
443 | /* unmark prepaerd area as journaled and search for it's | ||
444 | new position */ | ||
445 | reiserfs_restore_prepared_buffer(inode->i_sb, bh); | ||
446 | res = search_for_position_by_key(inode->i_sb, &key, &path); | ||
447 | if ( res == IO_ERROR) { | ||
448 | res = -EIO; | ||
449 | goto error_exit_free_blocks; | ||
450 | } | ||
451 | bh=get_last_bh(&path); | ||
452 | ih=get_ih(&path); | ||
453 | item = get_item(&path); | ||
454 | itempos = path.pos_in_item; | ||
455 | goto retry; | ||
456 | } | ||
457 | modifying_this_item = 1; | ||
458 | } | ||
459 | item[itempos] = allocated_blocks[curr_block]; // Assign new block | ||
460 | curr_block++; | ||
461 | } | ||
462 | itempos++; | ||
463 | } | ||
464 | |||
465 | if ( modifying_this_item ) { // We need to log last-accessed block, if it | ||
466 | // was modified, but not logged yet. | ||
467 | journal_mark_dirty (th, inode->i_sb, bh); | ||
468 | } | ||
469 | |||
470 | if ( curr_block < blocks_to_allocate ) { | ||
471 | // Oh, well need to append to indirect item, or to create indirect item | ||
472 | // if there weren't any | ||
473 | if ( is_indirect_le_ih(ih) ) { | ||
474 | // Existing indirect item - append. First calculate key for append | ||
475 | // position. We do not need to recalculate path as it should | ||
476 | // already point to correct place. | ||
477 | make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3); | ||
478 | res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block)); | ||
479 | if ( res ) { | ||
480 | goto error_exit_free_blocks; | ||
481 | } | ||
482 | } else if (is_statdata_le_ih(ih) ) { | ||
483 | // Last found item was statdata. That means we need to create indirect item. | ||
484 | struct item_head ins_ih; /* itemhead for new item */ | ||
485 | |||
486 | /* create a key for our new item */ | ||
487 | make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3); // Position one, | ||
488 | // because that's | ||
489 | // where first | ||
490 | // indirect item | ||
491 | // begins | ||
492 | /* Create new item head for our new item */ | ||
493 | make_le_item_head (&ins_ih, &key, key.version, 1, TYPE_INDIRECT, | ||
494 | (blocks_to_allocate-curr_block)*UNFM_P_SIZE, | ||
495 | 0 /* free space */); | ||
496 | /* Find where such item should live in the tree */ | ||
497 | res = search_item (inode->i_sb, &key, &path); | ||
498 | if ( res != ITEM_NOT_FOUND ) { | ||
499 | /* Well, if we have found such item already, or some error | ||
500 | occured, we need to warn user and return error */ | ||
501 | if ( res != -ENOSPC ) { | ||
502 | reiserfs_warning (inode->i_sb, | ||
503 | "green-9009: search_by_key (%K) " | ||
504 | "returned %d", &key, res); | ||
505 | } | ||
506 | res = -EIO; | ||
507 | goto error_exit_free_blocks; | ||
508 | } | ||
509 | /* Insert item into the tree with the data as its body */ | ||
510 | res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)(allocated_blocks+curr_block)); | ||
511 | } else { | ||
512 | reiserfs_panic(inode->i_sb, "green-9010: unexpected item type for key %K\n",&key); | ||
513 | } | ||
514 | } | ||
515 | |||
516 | // the caller is responsible for closing the transaction | ||
517 | // unless we return an error, they are also responsible for logging | ||
518 | // the inode. | ||
519 | // | ||
520 | pathrelse(&path); | ||
521 | /* | ||
522 | * cleanup prellocation from previous writes | ||
523 | * if this is a partial block write | ||
524 | */ | ||
525 | if (write_bytes & (inode->i_sb->s_blocksize -1)) | ||
526 | reiserfs_discard_prealloc(th, inode); | ||
527 | reiserfs_write_unlock(inode->i_sb); | ||
528 | |||
529 | // go through all the pages/buffers and map the buffers to newly allocated | ||
530 | // blocks (so that system knows where to write these pages later). | ||
531 | curr_block = 0; | ||
532 | for ( i = 0; i < num_pages ; i++ ) { | ||
533 | struct page *page=prepared_pages[i]; //current page | ||
534 | struct buffer_head *head = page_buffers(page);// first buffer for a page | ||
535 | int block_start, block_end; // in-page offsets for buffers. | ||
536 | |||
537 | if (!page_buffers(page)) | ||
538 | reiserfs_panic(inode->i_sb, "green-9005: No buffers for prepared page???"); | ||
539 | |||
540 | /* For each buffer in page */ | ||
541 | for(bh = head, block_start = 0; bh != head || !block_start; | ||
542 | block_start=block_end, bh = bh->b_this_page) { | ||
543 | if (!bh) | ||
544 | reiserfs_panic(inode->i_sb, "green-9006: Allocated but absent buffer for a page?"); | ||
545 | block_end = block_start+inode->i_sb->s_blocksize; | ||
546 | if (i == 0 && block_end <= from ) | ||
547 | /* if this buffer is before requested data to map, skip it */ | ||
548 | continue; | ||
549 | if (i == num_pages - 1 && block_start >= to) | ||
550 | /* If this buffer is after requested data to map, abort | ||
551 | processing of current page */ | ||
552 | break; | ||
553 | |||
554 | if ( !buffer_mapped(bh) ) { // Ok, unmapped buffer, need to map it | ||
555 | map_bh( bh, inode->i_sb, le32_to_cpu(allocated_blocks[curr_block])); | ||
556 | curr_block++; | ||
557 | set_buffer_new(bh); | ||
558 | } | ||
559 | } | ||
560 | } | ||
561 | |||
562 | RFALSE( curr_block > blocks_to_allocate, "green-9007: Used too many blocks? weird"); | ||
563 | |||
564 | kfree(allocated_blocks); | ||
565 | return 0; | ||
566 | |||
567 | // Need to deal with transaction here. | ||
568 | error_exit_free_blocks: | ||
569 | pathrelse(&path); | ||
570 | // free blocks | ||
571 | for( i = 0; i < blocks_to_allocate; i++ ) | ||
572 | reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]), 1); | ||
573 | |||
574 | error_exit: | ||
575 | if (th->t_trans_id) { | ||
576 | int err; | ||
577 | // update any changes we made to blk count | ||
578 | reiserfs_update_sd(th, inode); | ||
579 | err = journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS); | ||
580 | if (err) | ||
581 | res = err; | ||
582 | } | ||
583 | reiserfs_write_unlock(inode->i_sb); | ||
584 | kfree(allocated_blocks); | ||
585 | |||
586 | return res; | ||
587 | } | ||
588 | |||
589 | /* Unlock pages prepared by reiserfs_prepare_file_region_for_write */ | ||
590 | static void reiserfs_unprepare_pages(struct page **prepared_pages, /* list of locked pages */ | ||
591 | size_t num_pages /* amount of pages */) { | ||
592 | int i; // loop counter | ||
593 | |||
594 | for (i=0; i < num_pages ; i++) { | ||
595 | struct page *page = prepared_pages[i]; | ||
596 | |||
597 | try_to_free_buffers(page); | ||
598 | unlock_page(page); | ||
599 | page_cache_release(page); | ||
600 | } | ||
601 | } | ||
602 | |||
603 | /* This function will copy data from userspace to specified pages within | ||
604 | supplied byte range */ | ||
605 | static int reiserfs_copy_from_user_to_file_region( | ||
606 | loff_t pos, /* In-file position */ | ||
607 | int num_pages, /* Number of pages affected */ | ||
608 | int write_bytes, /* Amount of bytes to write */ | ||
609 | struct page **prepared_pages, /* pointer to | ||
610 | array to | ||
611 | prepared pages | ||
612 | */ | ||
613 | const char __user *buf /* Pointer to user-supplied | ||
614 | data*/ | ||
615 | ) | ||
616 | { | ||
617 | long page_fault=0; // status of copy_from_user. | ||
618 | int i; // loop counter. | ||
619 | int offset; // offset in page | ||
620 | |||
621 | for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) { | ||
622 | size_t count = min_t(size_t,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page | ||
623 | struct page *page=prepared_pages[i]; // Current page we process. | ||
624 | |||
625 | fault_in_pages_readable( buf, count); | ||
626 | |||
627 | /* Copy data from userspace to the current page */ | ||
628 | kmap(page); | ||
629 | page_fault = __copy_from_user(page_address(page)+offset, buf, count); // Copy the data. | ||
630 | /* Flush processor's dcache for this page */ | ||
631 | flush_dcache_page(page); | ||
632 | kunmap(page); | ||
633 | buf+=count; | ||
634 | write_bytes-=count; | ||
635 | |||
636 | if (page_fault) | ||
637 | break; // Was there a fault? abort. | ||
638 | } | ||
639 | |||
640 | return page_fault?-EFAULT:0; | ||
641 | } | ||
642 | |||
643 | /* taken fs/buffer.c:__block_commit_write */ | ||
644 | int reiserfs_commit_page(struct inode *inode, struct page *page, | ||
645 | unsigned from, unsigned to) | ||
646 | { | ||
647 | unsigned block_start, block_end; | ||
648 | int partial = 0; | ||
649 | unsigned blocksize; | ||
650 | struct buffer_head *bh, *head; | ||
651 | unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT; | ||
652 | int new; | ||
653 | int logit = reiserfs_file_data_log(inode); | ||
654 | struct super_block *s = inode->i_sb; | ||
655 | int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; | ||
656 | struct reiserfs_transaction_handle th; | ||
657 | int ret = 0; | ||
658 | |||
659 | th.t_trans_id = 0; | ||
660 | blocksize = 1 << inode->i_blkbits; | ||
661 | |||
662 | if (logit) { | ||
663 | reiserfs_write_lock(s); | ||
664 | ret = journal_begin(&th, s, bh_per_page + 1); | ||
665 | if (ret) | ||
666 | goto drop_write_lock; | ||
667 | reiserfs_update_inode_transaction(inode); | ||
668 | } | ||
669 | for(bh = head = page_buffers(page), block_start = 0; | ||
670 | bh != head || !block_start; | ||
671 | block_start=block_end, bh = bh->b_this_page) | ||
672 | { | ||
673 | |||
674 | new = buffer_new(bh); | ||
675 | clear_buffer_new(bh); | ||
676 | block_end = block_start + blocksize; | ||
677 | if (block_end <= from || block_start >= to) { | ||
678 | if (!buffer_uptodate(bh)) | ||
679 | partial = 1; | ||
680 | } else { | ||
681 | set_buffer_uptodate(bh); | ||
682 | if (logit) { | ||
683 | reiserfs_prepare_for_journal(s, bh, 1); | ||
684 | journal_mark_dirty(&th, s, bh); | ||
685 | } else if (!buffer_dirty(bh)) { | ||
686 | mark_buffer_dirty(bh); | ||
687 | /* do data=ordered on any page past the end | ||
688 | * of file and any buffer marked BH_New. | ||
689 | */ | ||
690 | if (reiserfs_data_ordered(inode->i_sb) && | ||
691 | (new || page->index >= i_size_index)) { | ||
692 | reiserfs_add_ordered_list(inode, bh); | ||
693 | } | ||
694 | } | ||
695 | } | ||
696 | } | ||
697 | if (logit) { | ||
698 | ret = journal_end(&th, s, bh_per_page + 1); | ||
699 | drop_write_lock: | ||
700 | reiserfs_write_unlock(s); | ||
701 | } | ||
702 | /* | ||
703 | * If this is a partial write which happened to make all buffers | ||
704 | * uptodate then we can optimize away a bogus readpage() for | ||
705 | * the next read(). Here we 'discover' whether the page went | ||
706 | * uptodate as a result of this (potentially partial) write. | ||
707 | */ | ||
708 | if (!partial) | ||
709 | SetPageUptodate(page); | ||
710 | return ret; | ||
711 | } | ||
712 | |||
713 | |||
714 | /* Submit pages for write. This was separated from actual file copying | ||
715 | because we might want to allocate block numbers in-between. | ||
716 | This function assumes that caller will adjust file size to correct value. */ | ||
717 | static int reiserfs_submit_file_region_for_write( | ||
718 | struct reiserfs_transaction_handle *th, | ||
719 | struct inode *inode, | ||
720 | loff_t pos, /* Writing position offset */ | ||
721 | size_t num_pages, /* Number of pages to write */ | ||
722 | size_t write_bytes, /* number of bytes to write */ | ||
723 | struct page **prepared_pages /* list of pages */ | ||
724 | ) | ||
725 | { | ||
726 | int status; // return status of block_commit_write. | ||
727 | int retval = 0; // Return value we are going to return. | ||
728 | int i; // loop counter | ||
729 | int offset; // Writing offset in page. | ||
730 | int orig_write_bytes = write_bytes; | ||
731 | int sd_update = 0; | ||
732 | |||
733 | for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) { | ||
734 | int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page | ||
735 | struct page *page=prepared_pages[i]; // Current page we process. | ||
736 | |||
737 | status = reiserfs_commit_page(inode, page, offset, offset+count); | ||
738 | if ( status ) | ||
739 | retval = status; // To not overcomplicate matters We are going to | ||
740 | // submit all the pages even if there was error. | ||
741 | // we only remember error status to report it on | ||
742 | // exit. | ||
743 | write_bytes-=count; | ||
744 | } | ||
745 | /* now that we've gotten all the ordered buffers marked dirty, | ||
746 | * we can safely update i_size and close any running transaction | ||
747 | */ | ||
748 | if ( pos + orig_write_bytes > inode->i_size) { | ||
749 | inode->i_size = pos + orig_write_bytes; // Set new size | ||
750 | /* If the file have grown so much that tail packing is no | ||
751 | * longer possible, reset "need to pack" flag */ | ||
752 | if ( (have_large_tails (inode->i_sb) && | ||
753 | inode->i_size > i_block_size (inode)*4) || | ||
754 | (have_small_tails (inode->i_sb) && | ||
755 | inode->i_size > i_block_size(inode)) ) | ||
756 | REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ; | ||
757 | else if ( (have_large_tails (inode->i_sb) && | ||
758 | inode->i_size < i_block_size (inode)*4) || | ||
759 | (have_small_tails (inode->i_sb) && | ||
760 | inode->i_size < i_block_size(inode)) ) | ||
761 | REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; | ||
762 | |||
763 | if (th->t_trans_id) { | ||
764 | reiserfs_write_lock(inode->i_sb); | ||
765 | reiserfs_update_sd(th, inode); // And update on-disk metadata | ||
766 | reiserfs_write_unlock(inode->i_sb); | ||
767 | } else | ||
768 | inode->i_sb->s_op->dirty_inode(inode); | ||
769 | |||
770 | sd_update = 1; | ||
771 | } | ||
772 | if (th->t_trans_id) { | ||
773 | reiserfs_write_lock(inode->i_sb); | ||
774 | if (!sd_update) | ||
775 | reiserfs_update_sd(th, inode); | ||
776 | status = journal_end(th, th->t_super, th->t_blocks_allocated); | ||
777 | if (status) | ||
778 | retval = status; | ||
779 | reiserfs_write_unlock(inode->i_sb); | ||
780 | } | ||
781 | th->t_trans_id = 0; | ||
782 | |||
783 | /* | ||
784 | * we have to unlock the pages after updating i_size, otherwise | ||
785 | * we race with writepage | ||
786 | */ | ||
787 | for ( i = 0; i < num_pages ; i++) { | ||
788 | struct page *page=prepared_pages[i]; | ||
789 | unlock_page(page); | ||
790 | mark_page_accessed(page); | ||
791 | page_cache_release(page); | ||
792 | } | ||
793 | return retval; | ||
794 | } | ||
795 | |||
796 | /* Look if passed writing region is going to touch file's tail | ||
797 | (if it is present). And if it is, convert the tail to unformatted node */ | ||
798 | static int reiserfs_check_for_tail_and_convert( struct inode *inode, /* inode to deal with */ | ||
799 | loff_t pos, /* Writing position */ | ||
800 | int write_bytes /* amount of bytes to write */ | ||
801 | ) | ||
802 | { | ||
803 | INITIALIZE_PATH(path); // needed for search_for_position | ||
804 | struct cpu_key key; // Key that would represent last touched writing byte. | ||
805 | struct item_head *ih; // item header of found block; | ||
806 | int res; // Return value of various functions we call. | ||
807 | int cont_expand_offset; // We will put offset for generic_cont_expand here | ||
808 | // This can be int just because tails are created | ||
809 | // only for small files. | ||
810 | |||
811 | /* this embodies a dependency on a particular tail policy */ | ||
812 | if ( inode->i_size >= inode->i_sb->s_blocksize*4 ) { | ||
813 | /* such a big files do not have tails, so we won't bother ourselves | ||
814 | to look for tails, simply return */ | ||
815 | return 0; | ||
816 | } | ||
817 | |||
818 | reiserfs_write_lock(inode->i_sb); | ||
819 | /* find the item containing the last byte to be written, or if | ||
820 | * writing past the end of the file then the last item of the | ||
821 | * file (and then we check its type). */ | ||
822 | make_cpu_key (&key, inode, pos+write_bytes+1, TYPE_ANY, 3/*key length*/); | ||
823 | res = search_for_position_by_key(inode->i_sb, &key, &path); | ||
824 | if ( res == IO_ERROR ) { | ||
825 | reiserfs_write_unlock(inode->i_sb); | ||
826 | return -EIO; | ||
827 | } | ||
828 | ih = get_ih(&path); | ||
829 | res = 0; | ||
830 | if ( is_direct_le_ih(ih) ) { | ||
831 | /* Ok, closest item is file tail (tails are stored in "direct" | ||
832 | * items), so we need to unpack it. */ | ||
833 | /* To not overcomplicate matters, we just call generic_cont_expand | ||
834 | which will in turn call other stuff and finally will boil down to | ||
835 | reiserfs_get_block() that would do necessary conversion. */ | ||
836 | cont_expand_offset = le_key_k_offset(get_inode_item_key_version(inode), &(ih->ih_key)); | ||
837 | pathrelse(&path); | ||
838 | res = generic_cont_expand( inode, cont_expand_offset); | ||
839 | } else | ||
840 | pathrelse(&path); | ||
841 | |||
842 | reiserfs_write_unlock(inode->i_sb); | ||
843 | return res; | ||
844 | } | ||
845 | |||
846 | /* This function locks pages starting from @pos for @inode. | ||
847 | @num_pages pages are locked and stored in | ||
848 | @prepared_pages array. Also buffers are allocated for these pages. | ||
849 | First and last page of the region is read if it is overwritten only | ||
850 | partially. If last page did not exist before write (file hole or file | ||
851 | append), it is zeroed, then. | ||
852 | Returns number of unallocated blocks that should be allocated to cover | ||
853 | new file data.*/ | ||
854 | static int reiserfs_prepare_file_region_for_write( | ||
855 | struct inode *inode /* Inode of the file */, | ||
856 | loff_t pos, /* position in the file */ | ||
857 | size_t num_pages, /* number of pages to | ||
858 | prepare */ | ||
859 | size_t write_bytes, /* Amount of bytes to be | ||
860 | overwritten from | ||
861 | @pos */ | ||
862 | struct page **prepared_pages /* pointer to array | ||
863 | where to store | ||
864 | prepared pages */ | ||
865 | ) | ||
866 | { | ||
867 | int res=0; // Return values of different functions we call. | ||
868 | unsigned long index = pos >> PAGE_CACHE_SHIFT; // Offset in file in pages. | ||
869 | int from = (pos & (PAGE_CACHE_SIZE - 1)); // Writing offset in first page | ||
870 | int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1; | ||
871 | /* offset of last modified byte in last | ||
872 | page */ | ||
873 | struct address_space *mapping = inode->i_mapping; // Pages are mapped here. | ||
874 | int i; // Simple counter | ||
875 | int blocks = 0; /* Return value (blocks that should be allocated) */ | ||
876 | struct buffer_head *bh, *head; // Current bufferhead and first bufferhead | ||
877 | // of a page. | ||
878 | unsigned block_start, block_end; // Starting and ending offsets of current | ||
879 | // buffer in the page. | ||
880 | struct buffer_head *wait[2], **wait_bh=wait; // Buffers for page, if | ||
881 | // Page appeared to be not up | ||
882 | // to date. Note how we have | ||
883 | // at most 2 buffers, this is | ||
884 | // because we at most may | ||
885 | // partially overwrite two | ||
886 | // buffers for one page. One at // the beginning of write area | ||
887 | // and one at the end. | ||
888 | // Everything inthe middle gets // overwritten totally. | ||
889 | |||
890 | struct cpu_key key; // cpu key of item that we are going to deal with | ||
891 | struct item_head *ih = NULL; // pointer to item head that we are going to deal with | ||
892 | struct buffer_head *itembuf=NULL; // Buffer head that contains items that we are going to deal with | ||
893 | INITIALIZE_PATH(path); // path to item, that we are going to deal with. | ||
894 | __u32 * item=NULL; // pointer to item we are going to deal with | ||
895 | int item_pos=-1; /* Position in indirect item */ | ||
896 | |||
897 | |||
898 | if ( num_pages < 1 ) { | ||
899 | reiserfs_warning (inode->i_sb, | ||
900 | "green-9001: reiserfs_prepare_file_region_for_write " | ||
901 | "called with zero number of pages to process"); | ||
902 | return -EFAULT; | ||
903 | } | ||
904 | |||
905 | /* We have 2 loops for pages. In first loop we grab and lock the pages, so | ||
906 | that nobody would touch these until we release the pages. Then | ||
907 | we'd start to deal with mapping buffers to blocks. */ | ||
908 | for ( i = 0; i < num_pages; i++) { | ||
909 | prepared_pages[i] = grab_cache_page(mapping, index + i); // locks the page | ||
910 | if ( !prepared_pages[i]) { | ||
911 | res = -ENOMEM; | ||
912 | goto failed_page_grabbing; | ||
913 | } | ||
914 | if (!page_has_buffers(prepared_pages[i])) | ||
915 | create_empty_buffers(prepared_pages[i], inode->i_sb->s_blocksize, 0); | ||
916 | } | ||
917 | |||
918 | /* Let's count amount of blocks for a case where all the blocks | ||
919 | overwritten are new (we will substract already allocated blocks later)*/ | ||
920 | if ( num_pages > 2 ) | ||
921 | /* These are full-overwritten pages so we count all the blocks in | ||
922 | these pages are counted as needed to be allocated */ | ||
923 | blocks = (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
924 | |||
925 | /* count blocks needed for first page (possibly partially written) */ | ||
926 | blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) + | ||
927 | !!(from & (inode->i_sb->s_blocksize-1)); /* roundup */ | ||
928 | |||
929 | /* Now we account for last page. If last page == first page (we | ||
930 | overwrite only one page), we substract all the blocks past the | ||
931 | last writing position in a page out of already calculated number | ||
932 | of blocks */ | ||
933 | blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT-inode->i_blkbits)) - | ||
934 | ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits); | ||
935 | /* Note how we do not roundup here since partial blocks still | ||
936 | should be allocated */ | ||
937 | |||
938 | /* Now if all the write area lies past the file end, no point in | ||
939 | maping blocks, since there is none, so we just zero out remaining | ||
940 | parts of first and last pages in write area (if needed) */ | ||
941 | if ( (pos & ~((loff_t)PAGE_CACHE_SIZE - 1)) > inode->i_size ) { | ||
942 | if ( from != 0 ) {/* First page needs to be partially zeroed */ | ||
943 | char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0); | ||
944 | memset(kaddr, 0, from); | ||
945 | kunmap_atomic( kaddr, KM_USER0); | ||
946 | } | ||
947 | if ( to != PAGE_CACHE_SIZE ) { /* Last page needs to be partially zeroed */ | ||
948 | char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0); | ||
949 | memset(kaddr+to, 0, PAGE_CACHE_SIZE - to); | ||
950 | kunmap_atomic( kaddr, KM_USER0); | ||
951 | } | ||
952 | |||
953 | /* Since all blocks are new - use already calculated value */ | ||
954 | return blocks; | ||
955 | } | ||
956 | |||
957 | /* Well, since we write somewhere into the middle of a file, there is | ||
958 | possibility we are writing over some already allocated blocks, so | ||
959 | let's map these blocks and substract number of such blocks out of blocks | ||
960 | we need to allocate (calculated above) */ | ||
961 | /* Mask write position to start on blocksize, we do it out of the | ||
962 | loop for performance reasons */ | ||
963 | pos &= ~((loff_t) inode->i_sb->s_blocksize - 1); | ||
964 | /* Set cpu key to the starting position in a file (on left block boundary)*/ | ||
965 | make_cpu_key (&key, inode, 1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)), TYPE_ANY, 3/*key length*/); | ||
966 | |||
967 | reiserfs_write_lock(inode->i_sb); // We need that for at least search_by_key() | ||
968 | for ( i = 0; i < num_pages ; i++ ) { | ||
969 | |||
970 | head = page_buffers(prepared_pages[i]); | ||
971 | /* For each buffer in the page */ | ||
972 | for(bh = head, block_start = 0; bh != head || !block_start; | ||
973 | block_start=block_end, bh = bh->b_this_page) { | ||
974 | if (!bh) | ||
975 | reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?"); | ||
976 | /* Find where this buffer ends */ | ||
977 | block_end = block_start+inode->i_sb->s_blocksize; | ||
978 | if (i == 0 && block_end <= from ) | ||
979 | /* if this buffer is before requested data to map, skip it*/ | ||
980 | continue; | ||
981 | |||
982 | if (i == num_pages - 1 && block_start >= to) { | ||
983 | /* If this buffer is after requested data to map, abort | ||
984 | processing of current page */ | ||
985 | break; | ||
986 | } | ||
987 | |||
988 | if ( buffer_mapped(bh) && bh->b_blocknr !=0 ) { | ||
989 | /* This is optimisation for a case where buffer is mapped | ||
990 | and have blocknumber assigned. In case significant amount | ||
991 | of such buffers are present, we may avoid some amount | ||
992 | of search_by_key calls. | ||
993 | Probably it would be possible to move parts of this code | ||
994 | out of BKL, but I afraid that would overcomplicate code | ||
995 | without any noticeable benefit. | ||
996 | */ | ||
997 | item_pos++; | ||
998 | /* Update the key */ | ||
999 | set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize); | ||
1000 | blocks--; // Decrease the amount of blocks that need to be | ||
1001 | // allocated | ||
1002 | continue; // Go to the next buffer | ||
1003 | } | ||
1004 | |||
1005 | if ( !itembuf || /* if first iteration */ | ||
1006 | item_pos >= ih_item_len(ih)/UNFM_P_SIZE) | ||
1007 | { /* or if we progressed past the | ||
1008 | current unformatted_item */ | ||
1009 | /* Try to find next item */ | ||
1010 | res = search_for_position_by_key(inode->i_sb, &key, &path); | ||
1011 | /* Abort if no more items */ | ||
1012 | if ( res != POSITION_FOUND ) { | ||
1013 | /* make sure later loops don't use this item */ | ||
1014 | itembuf = NULL; | ||
1015 | item = NULL; | ||
1016 | break; | ||
1017 | } | ||
1018 | |||
1019 | /* Update information about current indirect item */ | ||
1020 | itembuf = get_last_bh( &path ); | ||
1021 | ih = get_ih( &path ); | ||
1022 | item = get_item( &path ); | ||
1023 | item_pos = path.pos_in_item; | ||
1024 | |||
1025 | RFALSE( !is_indirect_le_ih (ih), "green-9003: indirect item expected"); | ||
1026 | } | ||
1027 | |||
1028 | /* See if there is some block associated with the file | ||
1029 | at that position, map the buffer to this block */ | ||
1030 | if ( get_block_num(item,item_pos) ) { | ||
1031 | map_bh(bh, inode->i_sb, get_block_num(item,item_pos)); | ||
1032 | blocks--; // Decrease the amount of blocks that need to be | ||
1033 | // allocated | ||
1034 | } | ||
1035 | item_pos++; | ||
1036 | /* Update the key */ | ||
1037 | set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize); | ||
1038 | } | ||
1039 | } | ||
1040 | pathrelse(&path); // Free the path | ||
1041 | reiserfs_write_unlock(inode->i_sb); | ||
1042 | |||
1043 | /* Now zero out unmappend buffers for the first and last pages of | ||
1044 | write area or issue read requests if page is mapped. */ | ||
1045 | /* First page, see if it is not uptodate */ | ||
1046 | if ( !PageUptodate(prepared_pages[0]) ) { | ||
1047 | head = page_buffers(prepared_pages[0]); | ||
1048 | |||
1049 | /* For each buffer in page */ | ||
1050 | for(bh = head, block_start = 0; bh != head || !block_start; | ||
1051 | block_start=block_end, bh = bh->b_this_page) { | ||
1052 | |||
1053 | if (!bh) | ||
1054 | reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?"); | ||
1055 | /* Find where this buffer ends */ | ||
1056 | block_end = block_start+inode->i_sb->s_blocksize; | ||
1057 | if ( block_end <= from ) | ||
1058 | /* if this buffer is before requested data to map, skip it*/ | ||
1059 | continue; | ||
1060 | if ( block_start < from ) { /* Aha, our partial buffer */ | ||
1061 | if ( buffer_mapped(bh) ) { /* If it is mapped, we need to | ||
1062 | issue READ request for it to | ||
1063 | not loose data */ | ||
1064 | ll_rw_block(READ, 1, &bh); | ||
1065 | *wait_bh++=bh; | ||
1066 | } else { /* Not mapped, zero it */ | ||
1067 | char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0); | ||
1068 | memset(kaddr+block_start, 0, from-block_start); | ||
1069 | kunmap_atomic( kaddr, KM_USER0); | ||
1070 | set_buffer_uptodate(bh); | ||
1071 | } | ||
1072 | } | ||
1073 | } | ||
1074 | } | ||
1075 | |||
1076 | /* Last page, see if it is not uptodate, or if the last page is past the end of the file. */ | ||
1077 | if ( !PageUptodate(prepared_pages[num_pages-1]) || | ||
1078 | ((pos+write_bytes)>>PAGE_CACHE_SHIFT) > (inode->i_size>>PAGE_CACHE_SHIFT) ) { | ||
1079 | head = page_buffers(prepared_pages[num_pages-1]); | ||
1080 | |||
1081 | /* for each buffer in page */ | ||
1082 | for(bh = head, block_start = 0; bh != head || !block_start; | ||
1083 | block_start=block_end, bh = bh->b_this_page) { | ||
1084 | |||
1085 | if (!bh) | ||
1086 | reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?"); | ||
1087 | /* Find where this buffer ends */ | ||
1088 | block_end = block_start+inode->i_sb->s_blocksize; | ||
1089 | if ( block_start >= to ) | ||
1090 | /* if this buffer is after requested data to map, skip it*/ | ||
1091 | break; | ||
1092 | if ( block_end > to ) { /* Aha, our partial buffer */ | ||
1093 | if ( buffer_mapped(bh) ) { /* If it is mapped, we need to | ||
1094 | issue READ request for it to | ||
1095 | not loose data */ | ||
1096 | ll_rw_block(READ, 1, &bh); | ||
1097 | *wait_bh++=bh; | ||
1098 | } else { /* Not mapped, zero it */ | ||
1099 | char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0); | ||
1100 | memset(kaddr+to, 0, block_end-to); | ||
1101 | kunmap_atomic( kaddr, KM_USER0); | ||
1102 | set_buffer_uptodate(bh); | ||
1103 | } | ||
1104 | } | ||
1105 | } | ||
1106 | } | ||
1107 | |||
1108 | /* Wait for read requests we made to happen, if necessary */ | ||
1109 | while(wait_bh > wait) { | ||
1110 | wait_on_buffer(*--wait_bh); | ||
1111 | if (!buffer_uptodate(*wait_bh)) { | ||
1112 | res = -EIO; | ||
1113 | goto failed_read; | ||
1114 | } | ||
1115 | } | ||
1116 | |||
1117 | return blocks; | ||
1118 | failed_page_grabbing: | ||
1119 | num_pages = i; | ||
1120 | failed_read: | ||
1121 | reiserfs_unprepare_pages(prepared_pages, num_pages); | ||
1122 | return res; | ||
1123 | } | ||
1124 | |||
1125 | /* Write @count bytes at position @ppos in a file indicated by @file | ||
1126 | from the buffer @buf. | ||
1127 | |||
1128 | generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want | ||
1129 | something simple that works. It is not for serious use by general purpose filesystems, excepting the one that it was | ||
1130 | written for (ext2/3). This is for several reasons: | ||
1131 | |||
1132 | * It has no understanding of any filesystem specific optimizations. | ||
1133 | |||
1134 | * It enters the filesystem repeatedly for each page that is written. | ||
1135 | |||
1136 | * It depends on the reiserfs_get_block() function which, if implemented by reiserfs, performs a costly search_by_key | ||
1137 | operation for each page it is supplied with. By contrast, reiserfs_file_write() feeds as much as possible at a time | ||
1138 | to reiserfs, which allows for fewer tree traversals. | ||
1139 | |||
1140 | * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks. | ||
1141 | |||
1142 | * Asking the block allocation code for blocks one at a time is slightly less efficient. | ||
1143 | |||
1144 | All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to | ||
1145 | use it, but we were in a hurry to make code freeze, and so it couldn't be revised then. This new code should make | ||
1146 | things right finally. | ||
1147 | |||
1148 | Future Features: providing search_by_key with hints. | ||
1149 | |||
1150 | */ | ||
1151 | static ssize_t reiserfs_file_write( struct file *file, /* the file we are going to write into */ | ||
1152 | const char __user *buf, /* pointer to user supplied data | ||
1153 | (in userspace) */ | ||
1154 | size_t count, /* amount of bytes to write */ | ||
1155 | loff_t *ppos /* pointer to position in file that we start writing at. Should be updated to | ||
1156 | * new current position before returning. */ ) | ||
1157 | { | ||
1158 | size_t already_written = 0; // Number of bytes already written to the file. | ||
1159 | loff_t pos; // Current position in the file. | ||
1160 | ssize_t res; // return value of various functions that we call. | ||
1161 | int err = 0; | ||
1162 | struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to. | ||
1163 | /* To simplify coding at this time, we store | ||
1164 | locked pages in array for now */ | ||
1165 | struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME]; | ||
1166 | struct reiserfs_transaction_handle th; | ||
1167 | th.t_trans_id = 0; | ||
1168 | |||
1169 | if ( file->f_flags & O_DIRECT) { // Direct IO needs treatment | ||
1170 | ssize_t result, after_file_end = 0; | ||
1171 | if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) { | ||
1172 | /* If we are appending a file, we need to put this savelink in here. | ||
1173 | If we will crash while doing direct io, finish_unfinished will | ||
1174 | cut the garbage from the file end. */ | ||
1175 | reiserfs_write_lock(inode->i_sb); | ||
1176 | err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT ); | ||
1177 | if (err) { | ||
1178 | reiserfs_write_unlock (inode->i_sb); | ||
1179 | return err; | ||
1180 | } | ||
1181 | reiserfs_update_inode_transaction(inode); | ||
1182 | add_save_link (&th, inode, 1 /* Truncate */); | ||
1183 | after_file_end = 1; | ||
1184 | err = journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT ); | ||
1185 | reiserfs_write_unlock(inode->i_sb); | ||
1186 | if (err) | ||
1187 | return err; | ||
1188 | } | ||
1189 | result = generic_file_write(file, buf, count, ppos); | ||
1190 | |||
1191 | if ( after_file_end ) { /* Now update i_size and remove the savelink */ | ||
1192 | struct reiserfs_transaction_handle th; | ||
1193 | reiserfs_write_lock(inode->i_sb); | ||
1194 | err = journal_begin(&th, inode->i_sb, 1); | ||
1195 | if (err) { | ||
1196 | reiserfs_write_unlock (inode->i_sb); | ||
1197 | return err; | ||
1198 | } | ||
1199 | reiserfs_update_inode_transaction(inode); | ||
1200 | reiserfs_update_sd(&th, inode); | ||
1201 | err = journal_end(&th, inode->i_sb, 1); | ||
1202 | if (err) { | ||
1203 | reiserfs_write_unlock (inode->i_sb); | ||
1204 | return err; | ||
1205 | } | ||
1206 | err = remove_save_link (inode, 1/* truncate */); | ||
1207 | reiserfs_write_unlock(inode->i_sb); | ||
1208 | if (err) | ||
1209 | return err; | ||
1210 | } | ||
1211 | |||
1212 | return result; | ||
1213 | } | ||
1214 | |||
1215 | if ( unlikely((ssize_t) count < 0 )) | ||
1216 | return -EINVAL; | ||
1217 | |||
1218 | if (unlikely(!access_ok(VERIFY_READ, buf, count))) | ||
1219 | return -EFAULT; | ||
1220 | |||
1221 | down(&inode->i_sem); // locks the entire file for just us | ||
1222 | |||
1223 | pos = *ppos; | ||
1224 | |||
1225 | /* Check if we can write to specified region of file, file | ||
1226 | is not overly big and this kind of stuff. Adjust pos and | ||
1227 | count, if needed */ | ||
1228 | res = generic_write_checks(file, &pos, &count, 0); | ||
1229 | if (res) | ||
1230 | goto out; | ||
1231 | |||
1232 | if ( count == 0 ) | ||
1233 | goto out; | ||
1234 | |||
1235 | res = remove_suid(file->f_dentry); | ||
1236 | if (res) | ||
1237 | goto out; | ||
1238 | |||
1239 | inode_update_time(inode, 1); /* Both mtime and ctime */ | ||
1240 | |||
1241 | // Ok, we are done with all the checks. | ||
1242 | |||
1243 | // Now we should start real work | ||
1244 | |||
1245 | /* If we are going to write past the file's packed tail or if we are going | ||
1246 | to overwrite part of the tail, we need that tail to be converted into | ||
1247 | unformatted node */ | ||
1248 | res = reiserfs_check_for_tail_and_convert( inode, pos, count); | ||
1249 | if (res) | ||
1250 | goto out; | ||
1251 | |||
1252 | while ( count > 0) { | ||
1253 | /* This is the main loop in which we running until some error occures | ||
1254 | or until we write all of the data. */ | ||
1255 | size_t num_pages;/* amount of pages we are going to write this iteration */ | ||
1256 | size_t write_bytes; /* amount of bytes to write during this iteration */ | ||
1257 | size_t blocks_to_allocate; /* how much blocks we need to allocate for this iteration */ | ||
1258 | |||
1259 | /* (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos*/ | ||
1260 | num_pages = !!((pos+count) & (PAGE_CACHE_SIZE - 1)) + /* round up partial | ||
1261 | pages */ | ||
1262 | ((count + (pos & (PAGE_CACHE_SIZE-1))) >> PAGE_CACHE_SHIFT); | ||
1263 | /* convert size to amount of | ||
1264 | pages */ | ||
1265 | reiserfs_write_lock(inode->i_sb); | ||
1266 | if ( num_pages > REISERFS_WRITE_PAGES_AT_A_TIME | ||
1267 | || num_pages > reiserfs_can_fit_pages(inode->i_sb) ) { | ||
1268 | /* If we were asked to write more data than we want to or if there | ||
1269 | is not that much space, then we shorten amount of data to write | ||
1270 | for this iteration. */ | ||
1271 | num_pages = min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME, reiserfs_can_fit_pages(inode->i_sb)); | ||
1272 | /* Also we should not forget to set size in bytes accordingly */ | ||
1273 | write_bytes = (num_pages << PAGE_CACHE_SHIFT) - | ||
1274 | (pos & (PAGE_CACHE_SIZE-1)); | ||
1275 | /* If position is not on the | ||
1276 | start of the page, we need | ||
1277 | to substract the offset | ||
1278 | within page */ | ||
1279 | } else | ||
1280 | write_bytes = count; | ||
1281 | |||
1282 | /* reserve the blocks to be allocated later, so that later on | ||
1283 | we still have the space to write the blocks to */ | ||
1284 | reiserfs_claim_blocks_to_be_allocated(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits)); | ||
1285 | reiserfs_write_unlock(inode->i_sb); | ||
1286 | |||
1287 | if ( !num_pages ) { /* If we do not have enough space even for */ | ||
1288 | res = -ENOSPC; /* single page, return -ENOSPC */ | ||
1289 | if ( pos > (inode->i_size & (inode->i_sb->s_blocksize-1))) | ||
1290 | break; // In case we are writing past the file end, break. | ||
1291 | // Otherwise we are possibly overwriting the file, so | ||
1292 | // let's set write size to be equal or less than blocksize. | ||
1293 | // This way we get it correctly for file holes. | ||
1294 | // But overwriting files on absolutelly full volumes would not | ||
1295 | // be very efficient. Well, people are not supposed to fill | ||
1296 | // 100% of disk space anyway. | ||
1297 | write_bytes = min_t(size_t, count, inode->i_sb->s_blocksize - (pos & (inode->i_sb->s_blocksize - 1))); | ||
1298 | num_pages = 1; | ||
1299 | // No blocks were claimed before, so do it now. | ||
1300 | reiserfs_claim_blocks_to_be_allocated(inode->i_sb, 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)); | ||
1301 | } | ||
1302 | |||
1303 | /* Prepare for writing into the region, read in all the | ||
1304 | partially overwritten pages, if needed. And lock the pages, | ||
1305 | so that nobody else can access these until we are done. | ||
1306 | We get number of actual blocks needed as a result.*/ | ||
1307 | blocks_to_allocate = reiserfs_prepare_file_region_for_write(inode, pos, num_pages, write_bytes, prepared_pages); | ||
1308 | if ( blocks_to_allocate < 0 ) { | ||
1309 | res = blocks_to_allocate; | ||
1310 | reiserfs_release_claimed_blocks(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits)); | ||
1311 | break; | ||
1312 | } | ||
1313 | |||
1314 | /* First we correct our estimate of how many blocks we need */ | ||
1315 | reiserfs_release_claimed_blocks(inode->i_sb, (num_pages << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)) - blocks_to_allocate ); | ||
1316 | |||
1317 | if ( blocks_to_allocate > 0) {/*We only allocate blocks if we need to*/ | ||
1318 | /* Fill in all the possible holes and append the file if needed */ | ||
1319 | res = reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate); | ||
1320 | } | ||
1321 | |||
1322 | /* well, we have allocated the blocks, so it is time to free | ||
1323 | the reservation we made earlier. */ | ||
1324 | reiserfs_release_claimed_blocks(inode->i_sb, blocks_to_allocate); | ||
1325 | if ( res ) { | ||
1326 | reiserfs_unprepare_pages(prepared_pages, num_pages); | ||
1327 | break; | ||
1328 | } | ||
1329 | |||
1330 | /* NOTE that allocating blocks and filling blocks can be done in reverse order | ||
1331 | and probably we would do that just to get rid of garbage in files after a | ||
1332 | crash */ | ||
1333 | |||
1334 | /* Copy data from user-supplied buffer to file's pages */ | ||
1335 | res = reiserfs_copy_from_user_to_file_region(pos, num_pages, write_bytes, prepared_pages, buf); | ||
1336 | if ( res ) { | ||
1337 | reiserfs_unprepare_pages(prepared_pages, num_pages); | ||
1338 | break; | ||
1339 | } | ||
1340 | |||
1341 | /* Send the pages to disk and unlock them. */ | ||
1342 | res = reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages, | ||
1343 | write_bytes,prepared_pages); | ||
1344 | if ( res ) | ||
1345 | break; | ||
1346 | |||
1347 | already_written += write_bytes; | ||
1348 | buf += write_bytes; | ||
1349 | *ppos = pos += write_bytes; | ||
1350 | count -= write_bytes; | ||
1351 | balance_dirty_pages_ratelimited(inode->i_mapping); | ||
1352 | } | ||
1353 | |||
1354 | /* this is only true on error */ | ||
1355 | if (th.t_trans_id) { | ||
1356 | reiserfs_write_lock(inode->i_sb); | ||
1357 | err = journal_end(&th, th.t_super, th.t_blocks_allocated); | ||
1358 | reiserfs_write_unlock(inode->i_sb); | ||
1359 | if (err) { | ||
1360 | res = err; | ||
1361 | goto out; | ||
1362 | } | ||
1363 | } | ||
1364 | |||
1365 | if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) | ||
1366 | res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA); | ||
1367 | |||
1368 | up(&inode->i_sem); | ||
1369 | reiserfs_async_progress_wait(inode->i_sb); | ||
1370 | return (already_written != 0)?already_written:res; | ||
1371 | |||
1372 | out: | ||
1373 | up(&inode->i_sem); // unlock the file on exit. | ||
1374 | return res; | ||
1375 | } | ||
1376 | |||
1377 | static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user *buf, | ||
1378 | size_t count, loff_t pos) | ||
1379 | { | ||
1380 | return generic_file_aio_write(iocb, buf, count, pos); | ||
1381 | } | ||
1382 | |||
1383 | |||
1384 | |||
1385 | struct file_operations reiserfs_file_operations = { | ||
1386 | .read = generic_file_read, | ||
1387 | .write = reiserfs_file_write, | ||
1388 | .ioctl = reiserfs_ioctl, | ||
1389 | .mmap = generic_file_mmap, | ||
1390 | .release = reiserfs_file_release, | ||
1391 | .fsync = reiserfs_sync_file, | ||
1392 | .sendfile = generic_file_sendfile, | ||
1393 | .aio_read = generic_file_aio_read, | ||
1394 | .aio_write = reiserfs_aio_write, | ||
1395 | }; | ||
1396 | |||
1397 | |||
1398 | struct inode_operations reiserfs_file_inode_operations = { | ||
1399 | .truncate = reiserfs_vfs_truncate_file, | ||
1400 | .setattr = reiserfs_setattr, | ||
1401 | .setxattr = reiserfs_setxattr, | ||
1402 | .getxattr = reiserfs_getxattr, | ||
1403 | .listxattr = reiserfs_listxattr, | ||
1404 | .removexattr = reiserfs_removexattr, | ||
1405 | .permission = reiserfs_permission, | ||
1406 | }; | ||
1407 | |||
1408 | |||
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c new file mode 100644 index 000000000000..e4f64be9e15b --- /dev/null +++ b/fs/reiserfs/fix_node.c | |||
@@ -0,0 +1,2518 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | */ | ||
4 | |||
5 | /** | ||
6 | ** old_item_num | ||
7 | ** old_entry_num | ||
8 | ** set_entry_sizes | ||
9 | ** create_virtual_node | ||
10 | ** check_left | ||
11 | ** check_right | ||
12 | ** directory_part_size | ||
13 | ** get_num_ver | ||
14 | ** set_parameters | ||
15 | ** is_leaf_removable | ||
16 | ** are_leaves_removable | ||
17 | ** get_empty_nodes | ||
18 | ** get_lfree | ||
19 | ** get_rfree | ||
20 | ** is_left_neighbor_in_cache | ||
21 | ** decrement_key | ||
22 | ** get_far_parent | ||
23 | ** get_parents | ||
24 | ** can_node_be_removed | ||
25 | ** ip_check_balance | ||
26 | ** dc_check_balance_internal | ||
27 | ** dc_check_balance_leaf | ||
28 | ** dc_check_balance | ||
29 | ** check_balance | ||
30 | ** get_direct_parent | ||
31 | ** get_neighbors | ||
32 | ** fix_nodes | ||
33 | ** | ||
34 | ** | ||
35 | **/ | ||
36 | |||
37 | |||
38 | #include <linux/config.h> | ||
39 | #include <linux/time.h> | ||
40 | #include <linux/string.h> | ||
41 | #include <linux/reiserfs_fs.h> | ||
42 | #include <linux/buffer_head.h> | ||
43 | |||
44 | |||
45 | /* To make any changes in the tree we find a node, that contains item | ||
46 | to be changed/deleted or position in the node we insert a new item | ||
47 | to. We call this node S. To do balancing we need to decide what we | ||
48 | will shift to left/right neighbor, or to a new node, where new item | ||
49 | will be etc. To make this analysis simpler we build virtual | ||
50 | node. Virtual node is an array of items, that will replace items of | ||
51 | node S. (For instance if we are going to delete an item, virtual | ||
52 | node does not contain it). Virtual node keeps information about | ||
53 | item sizes and types, mergeability of first and last items, sizes | ||
54 | of all entries in directory item. We use this array of items when | ||
55 | calculating what we can shift to neighbors and how many nodes we | ||
56 | have to have if we do not do any shifting, if we shift to left/right | ||
57 | neighbor or to both. */ | ||
58 | |||
59 | |||
60 | /* taking item number in virtual node, returns number of item, that it has in source buffer */ | ||
61 | static inline int old_item_num (int new_num, int affected_item_num, int mode) | ||
62 | { | ||
63 | if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num) | ||
64 | return new_num; | ||
65 | |||
66 | if (mode == M_INSERT) { | ||
67 | |||
68 | RFALSE( new_num == 0, | ||
69 | "vs-8005: for INSERT mode and item number of inserted item"); | ||
70 | |||
71 | return new_num - 1; | ||
72 | } | ||
73 | |||
74 | RFALSE( mode != M_DELETE, | ||
75 | "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", mode); | ||
76 | /* delete mode */ | ||
77 | return new_num + 1; | ||
78 | } | ||
79 | |||
80 | static void create_virtual_node (struct tree_balance * tb, int h) | ||
81 | { | ||
82 | struct item_head * ih; | ||
83 | struct virtual_node * vn = tb->tb_vn; | ||
84 | int new_num; | ||
85 | struct buffer_head * Sh; /* this comes from tb->S[h] */ | ||
86 | |||
87 | Sh = PATH_H_PBUFFER (tb->tb_path, h); | ||
88 | |||
89 | /* size of changed node */ | ||
90 | vn->vn_size = MAX_CHILD_SIZE (Sh) - B_FREE_SPACE (Sh) + tb->insert_size[h]; | ||
91 | |||
92 | /* for internal nodes array if virtual items is not created */ | ||
93 | if (h) { | ||
94 | vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE); | ||
95 | return; | ||
96 | } | ||
97 | |||
98 | /* number of items in virtual node */ | ||
99 | vn->vn_nr_item = B_NR_ITEMS (Sh) + ((vn->vn_mode == M_INSERT)? 1 : 0) - ((vn->vn_mode == M_DELETE)? 1 : 0); | ||
100 | |||
101 | /* first virtual item */ | ||
102 | vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1); | ||
103 | memset (vn->vn_vi, 0, vn->vn_nr_item * sizeof (struct virtual_item)); | ||
104 | vn->vn_free_ptr += vn->vn_nr_item * sizeof (struct virtual_item); | ||
105 | |||
106 | |||
107 | /* first item in the node */ | ||
108 | ih = B_N_PITEM_HEAD (Sh, 0); | ||
109 | |||
110 | /* define the mergeability for 0-th item (if it is not being deleted) */ | ||
111 | if (op_is_left_mergeable (&(ih->ih_key), Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num)) | ||
112 | vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE; | ||
113 | |||
114 | /* go through all items those remain in the virtual node (except for the new (inserted) one) */ | ||
115 | for (new_num = 0; new_num < vn->vn_nr_item; new_num ++) { | ||
116 | int j; | ||
117 | struct virtual_item * vi = vn->vn_vi + new_num; | ||
118 | int is_affected = ((new_num != vn->vn_affected_item_num) ? 0 : 1); | ||
119 | |||
120 | |||
121 | if (is_affected && vn->vn_mode == M_INSERT) | ||
122 | continue; | ||
123 | |||
124 | /* get item number in source node */ | ||
125 | j = old_item_num (new_num, vn->vn_affected_item_num, vn->vn_mode); | ||
126 | |||
127 | vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE; | ||
128 | vi->vi_ih = ih + j; | ||
129 | vi->vi_item = B_I_PITEM (Sh, ih + j); | ||
130 | vi->vi_uarea = vn->vn_free_ptr; | ||
131 | |||
132 | // FIXME: there is no check, that item operation did not | ||
133 | // consume too much memory | ||
134 | vn->vn_free_ptr += op_create_vi (vn, vi, is_affected, tb->insert_size [0]); | ||
135 | if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr) | ||
136 | reiserfs_panic (tb->tb_sb, "vs-8030: create_virtual_node: " | ||
137 | "virtual node space consumed"); | ||
138 | |||
139 | if (!is_affected) | ||
140 | /* this is not being changed */ | ||
141 | continue; | ||
142 | |||
143 | if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) { | ||
144 | vn->vn_vi[new_num].vi_item_len += tb->insert_size[0]; | ||
145 | vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted | ||
146 | } | ||
147 | } | ||
148 | |||
149 | |||
150 | /* virtual inserted item is not defined yet */ | ||
151 | if (vn->vn_mode == M_INSERT) { | ||
152 | struct virtual_item * vi = vn->vn_vi + vn->vn_affected_item_num; | ||
153 | |||
154 | RFALSE( vn->vn_ins_ih == 0, | ||
155 | "vs-8040: item header of inserted item is not specified"); | ||
156 | vi->vi_item_len = tb->insert_size[0]; | ||
157 | vi->vi_ih = vn->vn_ins_ih; | ||
158 | vi->vi_item = vn->vn_data; | ||
159 | vi->vi_uarea = vn->vn_free_ptr; | ||
160 | |||
161 | op_create_vi (vn, vi, 0/*not pasted or cut*/, tb->insert_size [0]); | ||
162 | } | ||
163 | |||
164 | /* set right merge flag we take right delimiting key and check whether it is a mergeable item */ | ||
165 | if (tb->CFR[0]) { | ||
166 | struct reiserfs_key * key; | ||
167 | |||
168 | key = B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]); | ||
169 | if (op_is_left_mergeable (key, Sh->b_size) && (vn->vn_mode != M_DELETE || | ||
170 | vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1)) | ||
171 | vn->vn_vi[vn->vn_nr_item-1].vi_type |= VI_TYPE_RIGHT_MERGEABLE; | ||
172 | |||
173 | #ifdef CONFIG_REISERFS_CHECK | ||
174 | if (op_is_left_mergeable (key, Sh->b_size) && | ||
175 | !(vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1) ) { | ||
176 | /* we delete last item and it could be merged with right neighbor's first item */ | ||
177 | if (!(B_NR_ITEMS (Sh) == 1 && is_direntry_le_ih (B_N_PITEM_HEAD (Sh, 0)) && | ||
178 | I_ENTRY_COUNT (B_N_PITEM_HEAD (Sh, 0)) == 1)) { | ||
179 | /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */ | ||
180 | print_block (Sh, 0, -1, -1); | ||
181 | reiserfs_panic (tb->tb_sb, "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c", | ||
182 | key, vn->vn_affected_item_num, vn->vn_mode, M_DELETE); | ||
183 | } else | ||
184 | /* we can delete directory item, that has only one directory entry in it */ | ||
185 | ; | ||
186 | } | ||
187 | #endif | ||
188 | |||
189 | } | ||
190 | } | ||
191 | |||
192 | |||
193 | /* using virtual node check, how many items can be shifted to left | ||
194 | neighbor */ | ||
195 | static void check_left (struct tree_balance * tb, int h, int cur_free) | ||
196 | { | ||
197 | int i; | ||
198 | struct virtual_node * vn = tb->tb_vn; | ||
199 | struct virtual_item * vi; | ||
200 | int d_size, ih_size; | ||
201 | |||
202 | RFALSE( cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free); | ||
203 | |||
204 | /* internal level */ | ||
205 | if (h > 0) { | ||
206 | tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE); | ||
207 | return; | ||
208 | } | ||
209 | |||
210 | /* leaf level */ | ||
211 | |||
212 | if (!cur_free || !vn->vn_nr_item) { | ||
213 | /* no free space or nothing to move */ | ||
214 | tb->lnum[h] = 0; | ||
215 | tb->lbytes = -1; | ||
216 | return; | ||
217 | } | ||
218 | |||
219 | RFALSE( !PATH_H_PPARENT (tb->tb_path, 0), | ||
220 | "vs-8055: parent does not exist or invalid"); | ||
221 | |||
222 | vi = vn->vn_vi; | ||
223 | if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) { | ||
224 | /* all contents of S[0] fits into L[0] */ | ||
225 | |||
226 | RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, | ||
227 | "vs-8055: invalid mode or balance condition failed"); | ||
228 | |||
229 | tb->lnum[0] = vn->vn_nr_item; | ||
230 | tb->lbytes = -1; | ||
231 | return; | ||
232 | } | ||
233 | |||
234 | |||
235 | d_size = 0, ih_size = IH_SIZE; | ||
236 | |||
237 | /* first item may be merge with last item in left neighbor */ | ||
238 | if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE) | ||
239 | d_size = -((int)IH_SIZE), ih_size = 0; | ||
240 | |||
241 | tb->lnum[0] = 0; | ||
242 | for (i = 0; i < vn->vn_nr_item; i ++, ih_size = IH_SIZE, d_size = 0, vi ++) { | ||
243 | d_size += vi->vi_item_len; | ||
244 | if (cur_free >= d_size) { | ||
245 | /* the item can be shifted entirely */ | ||
246 | cur_free -= d_size; | ||
247 | tb->lnum[0] ++; | ||
248 | continue; | ||
249 | } | ||
250 | |||
251 | /* the item cannot be shifted entirely, try to split it */ | ||
252 | /* check whether L[0] can hold ih and at least one byte of the item body */ | ||
253 | if (cur_free <= ih_size) { | ||
254 | /* cannot shift even a part of the current item */ | ||
255 | tb->lbytes = -1; | ||
256 | return; | ||
257 | } | ||
258 | cur_free -= ih_size; | ||
259 | |||
260 | tb->lbytes = op_check_left (vi, cur_free, 0, 0); | ||
261 | if (tb->lbytes != -1) | ||
262 | /* count partially shifted item */ | ||
263 | tb->lnum[0] ++; | ||
264 | |||
265 | break; | ||
266 | } | ||
267 | |||
268 | return; | ||
269 | } | ||
270 | |||
271 | |||
272 | /* using virtual node check, how many items can be shifted to right | ||
273 | neighbor */ | ||
274 | static void check_right (struct tree_balance * tb, int h, int cur_free) | ||
275 | { | ||
276 | int i; | ||
277 | struct virtual_node * vn = tb->tb_vn; | ||
278 | struct virtual_item * vi; | ||
279 | int d_size, ih_size; | ||
280 | |||
281 | RFALSE( cur_free < 0, "vs-8070: cur_free < 0"); | ||
282 | |||
283 | /* internal level */ | ||
284 | if (h > 0) { | ||
285 | tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE); | ||
286 | return; | ||
287 | } | ||
288 | |||
289 | /* leaf level */ | ||
290 | |||
291 | if (!cur_free || !vn->vn_nr_item) { | ||
292 | /* no free space */ | ||
293 | tb->rnum[h] = 0; | ||
294 | tb->rbytes = -1; | ||
295 | return; | ||
296 | } | ||
297 | |||
298 | RFALSE( !PATH_H_PPARENT (tb->tb_path, 0), | ||
299 | "vs-8075: parent does not exist or invalid"); | ||
300 | |||
301 | vi = vn->vn_vi + vn->vn_nr_item - 1; | ||
302 | if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) { | ||
303 | /* all contents of S[0] fits into R[0] */ | ||
304 | |||
305 | RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, | ||
306 | "vs-8080: invalid mode or balance condition failed"); | ||
307 | |||
308 | tb->rnum[h] = vn->vn_nr_item; | ||
309 | tb->rbytes = -1; | ||
310 | return; | ||
311 | } | ||
312 | |||
313 | d_size = 0, ih_size = IH_SIZE; | ||
314 | |||
315 | /* last item may be merge with first item in right neighbor */ | ||
316 | if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) | ||
317 | d_size = -(int)IH_SIZE, ih_size = 0; | ||
318 | |||
319 | tb->rnum[0] = 0; | ||
320 | for (i = vn->vn_nr_item - 1; i >= 0; i --, d_size = 0, ih_size = IH_SIZE, vi --) { | ||
321 | d_size += vi->vi_item_len; | ||
322 | if (cur_free >= d_size) { | ||
323 | /* the item can be shifted entirely */ | ||
324 | cur_free -= d_size; | ||
325 | tb->rnum[0] ++; | ||
326 | continue; | ||
327 | } | ||
328 | |||
329 | /* check whether R[0] can hold ih and at least one byte of the item body */ | ||
330 | if ( cur_free <= ih_size ) { /* cannot shift even a part of the current item */ | ||
331 | tb->rbytes = -1; | ||
332 | return; | ||
333 | } | ||
334 | |||
335 | /* R[0] can hold the header of the item and at least one byte of its body */ | ||
336 | cur_free -= ih_size; /* cur_free is still > 0 */ | ||
337 | |||
338 | tb->rbytes = op_check_right (vi, cur_free); | ||
339 | if (tb->rbytes != -1) | ||
340 | /* count partially shifted item */ | ||
341 | tb->rnum[0] ++; | ||
342 | |||
343 | break; | ||
344 | } | ||
345 | |||
346 | return; | ||
347 | } | ||
348 | |||
349 | |||
350 | /* | ||
351 | * from - number of items, which are shifted to left neighbor entirely | ||
352 | * to - number of item, which are shifted to right neighbor entirely | ||
353 | * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor | ||
354 | * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */ | ||
355 | static int get_num_ver (int mode, struct tree_balance * tb, int h, | ||
356 | int from, int from_bytes, | ||
357 | int to, int to_bytes, | ||
358 | short * snum012, int flow | ||
359 | ) | ||
360 | { | ||
361 | int i; | ||
362 | int cur_free; | ||
363 | // int bytes; | ||
364 | int units; | ||
365 | struct virtual_node * vn = tb->tb_vn; | ||
366 | // struct virtual_item * vi; | ||
367 | |||
368 | int total_node_size, max_node_size, current_item_size; | ||
369 | int needed_nodes; | ||
370 | int start_item, /* position of item we start filling node from */ | ||
371 | end_item, /* position of item we finish filling node by */ | ||
372 | start_bytes,/* number of first bytes (entries for directory) of start_item-th item | ||
373 | we do not include into node that is being filled */ | ||
374 | end_bytes; /* number of last bytes (entries for directory) of end_item-th item | ||
375 | we do node include into node that is being filled */ | ||
376 | int split_item_positions[2]; /* these are positions in virtual item of | ||
377 | items, that are split between S[0] and | ||
378 | S1new and S1new and S2new */ | ||
379 | |||
380 | split_item_positions[0] = -1; | ||
381 | split_item_positions[1] = -1; | ||
382 | |||
383 | /* We only create additional nodes if we are in insert or paste mode | ||
384 | or we are in replace mode at the internal level. If h is 0 and | ||
385 | the mode is M_REPLACE then in fix_nodes we change the mode to | ||
386 | paste or insert before we get here in the code. */ | ||
387 | RFALSE( tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE), | ||
388 | "vs-8100: insert_size < 0 in overflow"); | ||
389 | |||
390 | max_node_size = MAX_CHILD_SIZE (PATH_H_PBUFFER (tb->tb_path, h)); | ||
391 | |||
392 | /* snum012 [0-2] - number of items, that lay | ||
393 | to S[0], first new node and second new node */ | ||
394 | snum012[3] = -1; /* s1bytes */ | ||
395 | snum012[4] = -1; /* s2bytes */ | ||
396 | |||
397 | /* internal level */ | ||
398 | if (h > 0) { | ||
399 | i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE); | ||
400 | if (i == max_node_size) | ||
401 | return 1; | ||
402 | return (i / max_node_size + 1); | ||
403 | } | ||
404 | |||
405 | /* leaf level */ | ||
406 | needed_nodes = 1; | ||
407 | total_node_size = 0; | ||
408 | cur_free = max_node_size; | ||
409 | |||
410 | // start from 'from'-th item | ||
411 | start_item = from; | ||
412 | // skip its first 'start_bytes' units | ||
413 | start_bytes = ((from_bytes != -1) ? from_bytes : 0); | ||
414 | |||
415 | // last included item is the 'end_item'-th one | ||
416 | end_item = vn->vn_nr_item - to - 1; | ||
417 | // do not count last 'end_bytes' units of 'end_item'-th item | ||
418 | end_bytes = (to_bytes != -1) ? to_bytes : 0; | ||
419 | |||
420 | /* go through all item beginning from the start_item-th item and ending by | ||
421 | the end_item-th item. Do not count first 'start_bytes' units of | ||
422 | 'start_item'-th item and last 'end_bytes' of 'end_item'-th item */ | ||
423 | |||
424 | for (i = start_item; i <= end_item; i ++) { | ||
425 | struct virtual_item * vi = vn->vn_vi + i; | ||
426 | int skip_from_end = ((i == end_item) ? end_bytes : 0); | ||
427 | |||
428 | RFALSE( needed_nodes > 3, "vs-8105: too many nodes are needed"); | ||
429 | |||
430 | /* get size of current item */ | ||
431 | current_item_size = vi->vi_item_len; | ||
432 | |||
433 | /* do not take in calculation head part (from_bytes) of from-th item */ | ||
434 | current_item_size -= op_part_size (vi, 0/*from start*/, start_bytes); | ||
435 | |||
436 | /* do not take in calculation tail part of last item */ | ||
437 | current_item_size -= op_part_size (vi, 1/*from end*/, skip_from_end); | ||
438 | |||
439 | /* if item fits into current node entierly */ | ||
440 | if (total_node_size + current_item_size <= max_node_size) { | ||
441 | snum012[needed_nodes - 1] ++; | ||
442 | total_node_size += current_item_size; | ||
443 | start_bytes = 0; | ||
444 | continue; | ||
445 | } | ||
446 | |||
447 | if (current_item_size > max_node_size) { | ||
448 | /* virtual item length is longer, than max size of item in | ||
449 | a node. It is impossible for direct item */ | ||
450 | RFALSE( is_direct_le_ih (vi->vi_ih), | ||
451 | "vs-8110: " | ||
452 | "direct item length is %d. It can not be longer than %d", | ||
453 | current_item_size, max_node_size); | ||
454 | /* we will try to split it */ | ||
455 | flow = 1; | ||
456 | } | ||
457 | |||
458 | if (!flow) { | ||
459 | /* as we do not split items, take new node and continue */ | ||
460 | needed_nodes ++; i --; total_node_size = 0; | ||
461 | continue; | ||
462 | } | ||
463 | |||
464 | // calculate number of item units which fit into node being | ||
465 | // filled | ||
466 | { | ||
467 | int free_space; | ||
468 | |||
469 | free_space = max_node_size - total_node_size - IH_SIZE; | ||
470 | units = op_check_left (vi, free_space, start_bytes, skip_from_end); | ||
471 | if (units == -1) { | ||
472 | /* nothing fits into current node, take new node and continue */ | ||
473 | needed_nodes ++, i--, total_node_size = 0; | ||
474 | continue; | ||
475 | } | ||
476 | } | ||
477 | |||
478 | /* something fits into the current node */ | ||
479 | //if (snum012[3] != -1 || needed_nodes != 1) | ||
480 | // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required"); | ||
481 | //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units; | ||
482 | start_bytes += units; | ||
483 | snum012[needed_nodes - 1 + 3] = units; | ||
484 | |||
485 | if (needed_nodes > 2) | ||
486 | reiserfs_warning (tb->tb_sb, "vs-8111: get_num_ver: " | ||
487 | "split_item_position is out of boundary"); | ||
488 | snum012[needed_nodes - 1] ++; | ||
489 | split_item_positions[needed_nodes - 1] = i; | ||
490 | needed_nodes ++; | ||
491 | /* continue from the same item with start_bytes != -1 */ | ||
492 | start_item = i; | ||
493 | i --; | ||
494 | total_node_size = 0; | ||
495 | } | ||
496 | |||
497 | // sum012[4] (if it is not -1) contains number of units of which | ||
498 | // are to be in S1new, snum012[3] - to be in S0. They are supposed | ||
499 | // to be S1bytes and S2bytes correspondingly, so recalculate | ||
500 | if (snum012[4] > 0) { | ||
501 | int split_item_num; | ||
502 | int bytes_to_r, bytes_to_l; | ||
503 | int bytes_to_S1new; | ||
504 | |||
505 | split_item_num = split_item_positions[1]; | ||
506 | bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0); | ||
507 | bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0); | ||
508 | bytes_to_S1new = ((split_item_positions[0] == split_item_positions[1]) ? snum012[3] : 0); | ||
509 | |||
510 | // s2bytes | ||
511 | snum012[4] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[4] - bytes_to_r - bytes_to_l - bytes_to_S1new; | ||
512 | |||
513 | if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY && | ||
514 | vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT) | ||
515 | reiserfs_warning (tb->tb_sb, "vs-8115: get_num_ver: not " | ||
516 | "directory or indirect item"); | ||
517 | } | ||
518 | |||
519 | /* now we know S2bytes, calculate S1bytes */ | ||
520 | if (snum012[3] > 0) { | ||
521 | int split_item_num; | ||
522 | int bytes_to_r, bytes_to_l; | ||
523 | int bytes_to_S2new; | ||
524 | |||
525 | split_item_num = split_item_positions[0]; | ||
526 | bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0); | ||
527 | bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0); | ||
528 | bytes_to_S2new = ((split_item_positions[0] == split_item_positions[1] && snum012[4] != -1) ? snum012[4] : 0); | ||
529 | |||
530 | // s1bytes | ||
531 | snum012[3] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[3] - bytes_to_r - bytes_to_l - bytes_to_S2new; | ||
532 | } | ||
533 | |||
534 | return needed_nodes; | ||
535 | } | ||
536 | |||
537 | |||
538 | #ifdef CONFIG_REISERFS_CHECK | ||
539 | extern struct tree_balance * cur_tb; | ||
540 | #endif | ||
541 | |||
542 | |||
543 | /* Set parameters for balancing. | ||
544 | * Performs write of results of analysis of balancing into structure tb, | ||
545 | * where it will later be used by the functions that actually do the balancing. | ||
546 | * Parameters: | ||
547 | * tb tree_balance structure; | ||
548 | * h current level of the node; | ||
549 | * lnum number of items from S[h] that must be shifted to L[h]; | ||
550 | * rnum number of items from S[h] that must be shifted to R[h]; | ||
551 | * blk_num number of blocks that S[h] will be splitted into; | ||
552 | * s012 number of items that fall into splitted nodes. | ||
553 | * lbytes number of bytes which flow to the left neighbor from the item that is not | ||
554 | * not shifted entirely | ||
555 | * rbytes number of bytes which flow to the right neighbor from the item that is not | ||
556 | * not shifted entirely | ||
557 | * s1bytes number of bytes which flow to the first new node when S[0] splits (this number is contained in s012 array) | ||
558 | */ | ||
559 | |||
560 | static void set_parameters (struct tree_balance * tb, int h, int lnum, | ||
561 | int rnum, int blk_num, short * s012, int lb, int rb) | ||
562 | { | ||
563 | |||
564 | tb->lnum[h] = lnum; | ||
565 | tb->rnum[h] = rnum; | ||
566 | tb->blknum[h] = blk_num; | ||
567 | |||
568 | if (h == 0) | ||
569 | { /* only for leaf level */ | ||
570 | if (s012 != NULL) | ||
571 | { | ||
572 | tb->s0num = * s012 ++, | ||
573 | tb->s1num = * s012 ++, | ||
574 | tb->s2num = * s012 ++; | ||
575 | tb->s1bytes = * s012 ++; | ||
576 | tb->s2bytes = * s012; | ||
577 | } | ||
578 | tb->lbytes = lb; | ||
579 | tb->rbytes = rb; | ||
580 | } | ||
581 | PROC_INFO_ADD( tb -> tb_sb, lnum[ h ], lnum ); | ||
582 | PROC_INFO_ADD( tb -> tb_sb, rnum[ h ], rnum ); | ||
583 | |||
584 | PROC_INFO_ADD( tb -> tb_sb, lbytes[ h ], lb ); | ||
585 | PROC_INFO_ADD( tb -> tb_sb, rbytes[ h ], rb ); | ||
586 | } | ||
587 | |||
588 | |||
589 | |||
590 | /* check, does node disappear if we shift tb->lnum[0] items to left | ||
591 | neighbor and tb->rnum[0] to the right one. */ | ||
592 | static int is_leaf_removable (struct tree_balance * tb) | ||
593 | { | ||
594 | struct virtual_node * vn = tb->tb_vn; | ||
595 | int to_left, to_right; | ||
596 | int size; | ||
597 | int remain_items; | ||
598 | |||
599 | /* number of items, that will be shifted to left (right) neighbor | ||
600 | entirely */ | ||
601 | to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0); | ||
602 | to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0); | ||
603 | remain_items = vn->vn_nr_item; | ||
604 | |||
605 | /* how many items remain in S[0] after shiftings to neighbors */ | ||
606 | remain_items -= (to_left + to_right); | ||
607 | |||
608 | if (remain_items < 1) { | ||
609 | /* all content of node can be shifted to neighbors */ | ||
610 | set_parameters (tb, 0, to_left, vn->vn_nr_item - to_left, 0, NULL, -1, -1); | ||
611 | return 1; | ||
612 | } | ||
613 | |||
614 | if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1) | ||
615 | /* S[0] is not removable */ | ||
616 | return 0; | ||
617 | |||
618 | /* check, whether we can divide 1 remaining item between neighbors */ | ||
619 | |||
620 | /* get size of remaining item (in item units) */ | ||
621 | size = op_unit_num (&(vn->vn_vi[to_left])); | ||
622 | |||
623 | if (tb->lbytes + tb->rbytes >= size) { | ||
624 | set_parameters (tb, 0, to_left + 1, to_right + 1, 0, NULL, tb->lbytes, -1); | ||
625 | return 1; | ||
626 | } | ||
627 | |||
628 | return 0; | ||
629 | } | ||
630 | |||
631 | |||
632 | /* check whether L, S, R can be joined in one node */ | ||
633 | static int are_leaves_removable (struct tree_balance * tb, int lfree, int rfree) | ||
634 | { | ||
635 | struct virtual_node * vn = tb->tb_vn; | ||
636 | int ih_size; | ||
637 | struct buffer_head *S0; | ||
638 | |||
639 | S0 = PATH_H_PBUFFER (tb->tb_path, 0); | ||
640 | |||
641 | ih_size = 0; | ||
642 | if (vn->vn_nr_item) { | ||
643 | if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE) | ||
644 | ih_size += IH_SIZE; | ||
645 | |||
646 | if (vn->vn_vi[vn->vn_nr_item-1].vi_type & VI_TYPE_RIGHT_MERGEABLE) | ||
647 | ih_size += IH_SIZE; | ||
648 | } else { | ||
649 | /* there was only one item and it will be deleted */ | ||
650 | struct item_head * ih; | ||
651 | |||
652 | RFALSE( B_NR_ITEMS (S0) != 1, | ||
653 | "vs-8125: item number must be 1: it is %d", B_NR_ITEMS(S0)); | ||
654 | |||
655 | ih = B_N_PITEM_HEAD (S0, 0); | ||
656 | if (tb->CFR[0] && !comp_short_le_keys (&(ih->ih_key), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]))) | ||
657 | if (is_direntry_le_ih (ih)) { | ||
658 | /* Directory must be in correct state here: that is | ||
659 | somewhere at the left side should exist first directory | ||
660 | item. But the item being deleted can not be that first | ||
661 | one because its right neighbor is item of the same | ||
662 | directory. (But first item always gets deleted in last | ||
663 | turn). So, neighbors of deleted item can be merged, so | ||
664 | we can save ih_size */ | ||
665 | ih_size = IH_SIZE; | ||
666 | |||
667 | /* we might check that left neighbor exists and is of the | ||
668 | same directory */ | ||
669 | RFALSE(le_ih_k_offset (ih) == DOT_OFFSET, | ||
670 | "vs-8130: first directory item can not be removed until directory is not empty"); | ||
671 | } | ||
672 | |||
673 | } | ||
674 | |||
675 | if (MAX_CHILD_SIZE (S0) + vn->vn_size <= rfree + lfree + ih_size) { | ||
676 | set_parameters (tb, 0, -1, -1, -1, NULL, -1, -1); | ||
677 | PROC_INFO_INC( tb -> tb_sb, leaves_removable ); | ||
678 | return 1; | ||
679 | } | ||
680 | return 0; | ||
681 | |||
682 | } | ||
683 | |||
684 | |||
685 | |||
/*
 * Store the balance parameters for a shift to the left only.
 * When we do not split an item, lnum and rnum are numbers of entire items.
 *
 * The macro takes no arguments; it uses tb, h, Sh, vn, lpar, lnver, lset
 * and snum012 from the scope it is expanded in.  For h != 0 the number of
 * items to move is computed from MAX_NR_KEY(Sh), lpar and vn->vn_nr_item.
 * At the leaf level tb->lbytes is passed on only when lset is
 * LEFT_SHIFT_FLOW; otherwise a partially shifted item is taken out of lpar.
 *
 * The body is wrapped in do { } while (0) so that the expansion is one
 * statement and can not capture a following "else"; use it with a trailing
 * semicolon.
 */
#define SET_PAR_SHIFT_LEFT \
do {\
	if (h) {\
		int to_l;\
\
		to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
			(MAX_NR_KEY(Sh) + 1 - lpar);\
\
		set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
	} else {\
		if (lset==LEFT_SHIFT_FLOW)\
			set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
					tb->lbytes, -1);\
		else\
			set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
					-1, -1);\
	}\
} while (0)
706 | |||
707 | |||
/*
 * Store the balance parameters for a shift to the right only.
 *
 * The macro takes no arguments; it uses tb, h, Sh, vn, rpar, rnver, rset
 * and snum012 from the scope it is expanded in.  For h != 0 the number of
 * items to move is computed from MAX_NR_KEY(Sh), rpar and vn->vn_nr_item.
 * At the leaf level tb->rbytes is passed on only when rset is
 * RIGHT_SHIFT_FLOW; otherwise a partially shifted item is taken out of rpar.
 *
 * The body is wrapped in do { } while (0) so that the expansion is one
 * statement and can not capture a following "else"; use it with a trailing
 * semicolon.
 */
#define SET_PAR_SHIFT_RIGHT \
do {\
	if (h) {\
		int to_r;\
\
		to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
\
		set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
	} else {\
		if (rset==RIGHT_SHIFT_FLOW)\
			set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
					-1, tb->rbytes);\
		else\
			set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
					-1, -1);\
	}\
} while (0)
726 | |||
727 | |||
728 | static void free_buffers_in_tb ( | ||
729 | struct tree_balance * p_s_tb | ||
730 | ) { | ||
731 | int n_counter; | ||
732 | |||
733 | decrement_counters_in_path(p_s_tb->tb_path); | ||
734 | |||
735 | for ( n_counter = 0; n_counter < MAX_HEIGHT; n_counter++ ) { | ||
736 | decrement_bcount(p_s_tb->L[n_counter]); | ||
737 | p_s_tb->L[n_counter] = NULL; | ||
738 | decrement_bcount(p_s_tb->R[n_counter]); | ||
739 | p_s_tb->R[n_counter] = NULL; | ||
740 | decrement_bcount(p_s_tb->FL[n_counter]); | ||
741 | p_s_tb->FL[n_counter] = NULL; | ||
742 | decrement_bcount(p_s_tb->FR[n_counter]); | ||
743 | p_s_tb->FR[n_counter] = NULL; | ||
744 | decrement_bcount(p_s_tb->CFL[n_counter]); | ||
745 | p_s_tb->CFL[n_counter] = NULL; | ||
746 | decrement_bcount(p_s_tb->CFR[n_counter]); | ||
747 | p_s_tb->CFR[n_counter] = NULL; | ||
748 | } | ||
749 | } | ||
750 | |||
751 | |||
752 | /* Get new buffers for storing new nodes that are created while balancing. | ||
753 | * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; | ||
754 | * CARRY_ON - schedule didn't occur while the function worked; | ||
755 | * NO_DISK_SPACE - no disk space. | ||
756 | */ | ||
757 | /* The function is NOT SCHEDULE-SAFE! */ | ||
758 | static int get_empty_nodes( | ||
759 | struct tree_balance * p_s_tb, | ||
760 | int n_h | ||
761 | ) { | ||
762 | struct buffer_head * p_s_new_bh, | ||
763 | * p_s_Sh = PATH_H_PBUFFER (p_s_tb->tb_path, n_h); | ||
764 | b_blocknr_t * p_n_blocknr, | ||
765 | a_n_blocknrs[MAX_AMOUNT_NEEDED] = {0, }; | ||
766 | int n_counter, | ||
767 | n_number_of_freeblk, | ||
768 | n_amount_needed,/* number of needed empty blocks */ | ||
769 | n_retval = CARRY_ON; | ||
770 | struct super_block * p_s_sb = p_s_tb->tb_sb; | ||
771 | |||
772 | |||
773 | /* number_of_freeblk is the number of empty blocks which have been | ||
774 | acquired for use by the balancing algorithm minus the number of | ||
775 | empty blocks used in the previous levels of the analysis, | ||
776 | number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs | ||
777 | after empty blocks are acquired, and the balancing analysis is | ||
778 | then restarted, amount_needed is the number needed by this level | ||
779 | (n_h) of the balancing analysis. | ||
780 | |||
781 | Note that for systems with many processes writing, it would be | ||
782 | more layout optimal to calculate the total number needed by all | ||
783 | levels and then to run reiserfs_new_blocks to get all of them at once. */ | ||
784 | |||
785 | /* Initiate number_of_freeblk to the amount acquired prior to the restart of | ||
786 | the analysis or 0 if not restarted, then subtract the amount needed | ||
787 | by all of the levels of the tree below n_h. */ | ||
788 | /* blknum includes S[n_h], so we subtract 1 in this calculation */ | ||
789 | for ( n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum; n_counter < n_h; n_counter++ ) | ||
790 | n_number_of_freeblk -= ( p_s_tb->blknum[n_counter] ) ? (p_s_tb->blknum[n_counter] - 1) : 0; | ||
791 | |||
792 | /* Allocate missing empty blocks. */ | ||
793 | /* if p_s_Sh == 0 then we are getting a new root */ | ||
794 | n_amount_needed = ( p_s_Sh ) ? (p_s_tb->blknum[n_h] - 1) : 1; | ||
795 | /* Amount_needed = the amount that we need more than the amount that we have. */ | ||
796 | if ( n_amount_needed > n_number_of_freeblk ) | ||
797 | n_amount_needed -= n_number_of_freeblk; | ||
798 | else /* If we have enough already then there is nothing to do. */ | ||
799 | return CARRY_ON; | ||
800 | |||
801 | /* No need to check quota - is not allocated for blocks used for formatted nodes */ | ||
802 | if (reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs, | ||
803 | n_amount_needed) == NO_DISK_SPACE) | ||
804 | return NO_DISK_SPACE; | ||
805 | |||
806 | /* for each blocknumber we just got, get a buffer and stick it on FEB */ | ||
807 | for ( p_n_blocknr = a_n_blocknrs, n_counter = 0; n_counter < n_amount_needed; | ||
808 | p_n_blocknr++, n_counter++ ) { | ||
809 | |||
810 | RFALSE( ! *p_n_blocknr, | ||
811 | "PAP-8135: reiserfs_new_blocknrs failed when got new blocks"); | ||
812 | |||
813 | p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr); | ||
814 | RFALSE (buffer_dirty (p_s_new_bh) || | ||
815 | buffer_journaled (p_s_new_bh) || | ||
816 | buffer_journal_dirty (p_s_new_bh), | ||
817 | "PAP-8140: journlaled or dirty buffer %b for the new block", | ||
818 | p_s_new_bh); | ||
819 | |||
820 | /* Put empty buffers into the array. */ | ||
821 | RFALSE (p_s_tb->FEB[p_s_tb->cur_blknum], | ||
822 | "PAP-8141: busy slot for new buffer"); | ||
823 | |||
824 | set_buffer_journal_new (p_s_new_bh); | ||
825 | p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh; | ||
826 | } | ||
827 | |||
828 | if ( n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB (p_s_tb) ) | ||
829 | n_retval = REPEAT_SEARCH ; | ||
830 | |||
831 | return n_retval; | ||
832 | } | ||
833 | |||
834 | |||
835 | /* Get free space of the left neighbor, which is stored in the parent | ||
836 | * node of the left neighbor. */ | ||
837 | static int get_lfree (struct tree_balance * tb, int h) | ||
838 | { | ||
839 | struct buffer_head * l, * f; | ||
840 | int order; | ||
841 | |||
842 | if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0) | ||
843 | return 0; | ||
844 | |||
845 | if (f == l) | ||
846 | order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) - 1; | ||
847 | else { | ||
848 | order = B_NR_ITEMS (l); | ||
849 | f = l; | ||
850 | } | ||
851 | |||
852 | return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f,order))); | ||
853 | } | ||
854 | |||
855 | |||
856 | /* Get free space of the right neighbor, | ||
857 | * which is stored in the parent node of the right neighbor. | ||
858 | */ | ||
859 | static int get_rfree (struct tree_balance * tb, int h) | ||
860 | { | ||
861 | struct buffer_head * r, * f; | ||
862 | int order; | ||
863 | |||
864 | if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0) | ||
865 | return 0; | ||
866 | |||
867 | if (f == r) | ||
868 | order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) + 1; | ||
869 | else { | ||
870 | order = 0; | ||
871 | f = r; | ||
872 | } | ||
873 | |||
874 | return (MAX_CHILD_SIZE(f) - dc_size( B_N_CHILD(f,order))); | ||
875 | |||
876 | } | ||
877 | |||
878 | |||
879 | /* Check whether left neighbor is in memory. */ | ||
880 | static int is_left_neighbor_in_cache( | ||
881 | struct tree_balance * p_s_tb, | ||
882 | int n_h | ||
883 | ) { | ||
884 | struct buffer_head * p_s_father, * left; | ||
885 | struct super_block * p_s_sb = p_s_tb->tb_sb; | ||
886 | b_blocknr_t n_left_neighbor_blocknr; | ||
887 | int n_left_neighbor_position; | ||
888 | |||
889 | if ( ! p_s_tb->FL[n_h] ) /* Father of the left neighbor does not exist. */ | ||
890 | return 0; | ||
891 | |||
892 | /* Calculate father of the node to be balanced. */ | ||
893 | p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1); | ||
894 | |||
895 | RFALSE( ! p_s_father || | ||
896 | ! B_IS_IN_TREE (p_s_father) || | ||
897 | ! B_IS_IN_TREE (p_s_tb->FL[n_h]) || | ||
898 | ! buffer_uptodate (p_s_father) || | ||
899 | ! buffer_uptodate (p_s_tb->FL[n_h]), | ||
900 | "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", | ||
901 | p_s_father, p_s_tb->FL[n_h]); | ||
902 | |||
903 | |||
904 | /* Get position of the pointer to the left neighbor into the left father. */ | ||
905 | n_left_neighbor_position = ( p_s_father == p_s_tb->FL[n_h] ) ? | ||
906 | p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]); | ||
907 | /* Get left neighbor block number. */ | ||
908 | n_left_neighbor_blocknr = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position); | ||
909 | /* Look for the left neighbor in the cache. */ | ||
910 | if ( (left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr)) ) { | ||
911 | |||
912 | RFALSE( buffer_uptodate (left) && ! B_IS_IN_TREE(left), | ||
913 | "vs-8170: left neighbor (%b %z) is not in the tree", left, left); | ||
914 | put_bh(left) ; | ||
915 | return 1; | ||
916 | } | ||
917 | |||
918 | return 0; | ||
919 | } | ||
920 | |||
921 | |||
/*
 * Direction selectors passed as c_lr_par to get_far_parent(): look up the
 * parents belonging to the left or to the right neighbor.
 */
#define LEFT_PARENTS	'l'
#define RIGHT_PARENTS	'r'
924 | |||
925 | |||
926 | static void decrement_key (struct cpu_key * p_s_key) | ||
927 | { | ||
928 | // call item specific function for this key | ||
929 | item_ops[cpu_key_k_type (p_s_key)]->decrement_key (p_s_key); | ||
930 | } | ||
931 | |||
932 | |||
933 | |||
934 | |||
935 | /* Calculate far left/right parent of the left/right neighbor of the current node, that | ||
936 | * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h]. | ||
937 | * Calculate left/right common parent of the current node and L[h]/R[h]. | ||
938 | * Calculate left/right delimiting key position. | ||
939 | * Returns: PATH_INCORRECT - path in the tree is not correct; | ||
940 | SCHEDULE_OCCURRED - schedule occurred while the function worked; | ||
941 | * CARRY_ON - schedule didn't occur while the function worked; | ||
942 | */ | ||
943 | static int get_far_parent (struct tree_balance * p_s_tb, | ||
944 | int n_h, | ||
945 | struct buffer_head ** pp_s_father, | ||
946 | struct buffer_head ** pp_s_com_father, | ||
947 | char c_lr_par) | ||
948 | { | ||
949 | struct buffer_head * p_s_parent; | ||
950 | INITIALIZE_PATH (s_path_to_neighbor_father); | ||
951 | struct path * p_s_path = p_s_tb->tb_path; | ||
952 | struct cpu_key s_lr_father_key; | ||
953 | int n_counter, | ||
954 | n_position = INT_MAX, | ||
955 | n_first_last_position = 0, | ||
956 | n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h); | ||
957 | |||
958 | /* Starting from F[n_h] go upwards in the tree, and look for the common | ||
959 | ancestor of F[n_h], and its neighbor l/r, that should be obtained. */ | ||
960 | |||
961 | n_counter = n_path_offset; | ||
962 | |||
963 | RFALSE( n_counter < FIRST_PATH_ELEMENT_OFFSET, | ||
964 | "PAP-8180: invalid path length"); | ||
965 | |||
966 | |||
967 | for ( ; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter-- ) { | ||
968 | /* Check whether parent of the current buffer in the path is really parent in the tree. */ | ||
969 | if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)) ) | ||
970 | return REPEAT_SEARCH; | ||
971 | /* Check whether position in the parent is correct. */ | ||
972 | if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_counter - 1)) > B_NR_ITEMS(p_s_parent) ) | ||
973 | return REPEAT_SEARCH; | ||
974 | /* Check whether parent at the path really points to the child. */ | ||
975 | if ( B_N_CHILD_NUM(p_s_parent, n_position) != | ||
976 | PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr ) | ||
977 | return REPEAT_SEARCH; | ||
978 | /* Return delimiting key if position in the parent is not equal to first/last one. */ | ||
979 | if ( c_lr_par == RIGHT_PARENTS ) | ||
980 | n_first_last_position = B_NR_ITEMS (p_s_parent); | ||
981 | if ( n_position != n_first_last_position ) { | ||
982 | *pp_s_com_father = p_s_parent; | ||
983 | get_bh(*pp_s_com_father) ; | ||
984 | /*(*pp_s_com_father = p_s_parent)->b_count++;*/ | ||
985 | break; | ||
986 | } | ||
987 | } | ||
988 | |||
989 | /* if we are in the root of the tree, then there is no common father */ | ||
990 | if ( n_counter == FIRST_PATH_ELEMENT_OFFSET ) { | ||
991 | /* Check whether first buffer in the path is the root of the tree. */ | ||
992 | if ( PATH_OFFSET_PBUFFER(p_s_tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == | ||
993 | SB_ROOT_BLOCK (p_s_tb->tb_sb) ) { | ||
994 | *pp_s_father = *pp_s_com_father = NULL; | ||
995 | return CARRY_ON; | ||
996 | } | ||
997 | return REPEAT_SEARCH; | ||
998 | } | ||
999 | |||
1000 | RFALSE( B_LEVEL (*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL, | ||
1001 | "PAP-8185: (%b %z) level too small", | ||
1002 | *pp_s_com_father, *pp_s_com_father); | ||
1003 | |||
1004 | /* Check whether the common parent is locked. */ | ||
1005 | |||
1006 | if ( buffer_locked (*pp_s_com_father) ) { | ||
1007 | __wait_on_buffer(*pp_s_com_father); | ||
1008 | if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { | ||
1009 | decrement_bcount(*pp_s_com_father); | ||
1010 | return REPEAT_SEARCH; | ||
1011 | } | ||
1012 | } | ||
1013 | |||
1014 | /* So, we got common parent of the current node and its left/right neighbor. | ||
1015 | Now we are geting the parent of the left/right neighbor. */ | ||
1016 | |||
1017 | /* Form key to get parent of the left/right neighbor. */ | ||
1018 | le_key2cpu_key (&s_lr_father_key, B_N_PDELIM_KEY(*pp_s_com_father, ( c_lr_par == LEFT_PARENTS ) ? | ||
1019 | (p_s_tb->lkey[n_h - 1] = n_position - 1) : (p_s_tb->rkey[n_h - 1] = n_position))); | ||
1020 | |||
1021 | |||
1022 | if ( c_lr_par == LEFT_PARENTS ) | ||
1023 | decrement_key(&s_lr_father_key); | ||
1024 | |||
1025 | if (search_by_key(p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, n_h + 1) == IO_ERROR) | ||
1026 | // path is released | ||
1027 | return IO_ERROR; | ||
1028 | |||
1029 | if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { | ||
1030 | decrement_counters_in_path(&s_path_to_neighbor_father); | ||
1031 | decrement_bcount(*pp_s_com_father); | ||
1032 | return REPEAT_SEARCH; | ||
1033 | } | ||
1034 | |||
1035 | *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father); | ||
1036 | |||
1037 | RFALSE( B_LEVEL (*pp_s_father) != n_h + 1, | ||
1038 | "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father); | ||
1039 | RFALSE( s_path_to_neighbor_father.path_length < FIRST_PATH_ELEMENT_OFFSET, | ||
1040 | "PAP-8192: path length is too small"); | ||
1041 | |||
1042 | s_path_to_neighbor_father.path_length--; | ||
1043 | decrement_counters_in_path(&s_path_to_neighbor_father); | ||
1044 | return CARRY_ON; | ||
1045 | } | ||
1046 | |||
1047 | |||
1048 | /* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of | ||
1049 | * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset], | ||
1050 | * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset]. | ||
1051 | * Calculate numbers of left and right delimiting keys position: lkey[n_path_offset], rkey[n_path_offset]. | ||
1052 | * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; | ||
1053 | * CARRY_ON - schedule didn't occur while the function worked; | ||
1054 | */ | ||
1055 | static int get_parents (struct tree_balance * p_s_tb, int n_h) | ||
1056 | { | ||
1057 | struct path * p_s_path = p_s_tb->tb_path; | ||
1058 | int n_position, | ||
1059 | n_ret_value, | ||
1060 | n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); | ||
1061 | struct buffer_head * p_s_curf, | ||
1062 | * p_s_curcf; | ||
1063 | |||
1064 | /* Current node is the root of the tree or will be root of the tree */ | ||
1065 | if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) { | ||
1066 | /* The root can not have parents. | ||
1067 | Release nodes which previously were obtained as parents of the current node neighbors. */ | ||
1068 | decrement_bcount(p_s_tb->FL[n_h]); | ||
1069 | decrement_bcount(p_s_tb->CFL[n_h]); | ||
1070 | decrement_bcount(p_s_tb->FR[n_h]); | ||
1071 | decrement_bcount(p_s_tb->CFR[n_h]); | ||
1072 | p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] = p_s_tb->CFR[n_h] = NULL; | ||
1073 | return CARRY_ON; | ||
1074 | } | ||
1075 | |||
1076 | /* Get parent FL[n_path_offset] of L[n_path_offset]. */ | ||
1077 | if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) ) { | ||
1078 | /* Current node is not the first child of its parent. */ | ||
1079 | /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/ | ||
1080 | p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); | ||
1081 | get_bh(p_s_curf) ; | ||
1082 | get_bh(p_s_curf) ; | ||
1083 | p_s_tb->lkey[n_h] = n_position - 1; | ||
1084 | } | ||
1085 | else { | ||
1086 | /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node. | ||
1087 | Calculate current common parent of L[n_path_offset] and the current node. Note that | ||
1088 | CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset]. | ||
1089 | Calculate lkey[n_path_offset]. */ | ||
1090 | if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, | ||
1091 | &p_s_curcf, LEFT_PARENTS)) != CARRY_ON ) | ||
1092 | return n_ret_value; | ||
1093 | } | ||
1094 | |||
1095 | decrement_bcount(p_s_tb->FL[n_h]); | ||
1096 | p_s_tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */ | ||
1097 | decrement_bcount(p_s_tb->CFL[n_h]); | ||
1098 | p_s_tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */ | ||
1099 | |||
1100 | RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) || | ||
1101 | (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)), | ||
1102 | "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf); | ||
1103 | |||
1104 | /* Get parent FR[n_h] of R[n_h]. */ | ||
1105 | |||
1106 | /* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */ | ||
1107 | if ( n_position == B_NR_ITEMS (PATH_H_PBUFFER(p_s_path, n_h + 1)) ) { | ||
1108 | /* Calculate current parent of R[n_h], which is the right neighbor of F[n_h]. | ||
1109 | Calculate current common parent of R[n_h] and current node. Note that CFR[n_h] | ||
1110 | not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */ | ||
1111 | if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf, RIGHT_PARENTS)) != CARRY_ON ) | ||
1112 | return n_ret_value; | ||
1113 | } | ||
1114 | else { | ||
1115 | /* Current node is not the last child of its parent F[n_h]. */ | ||
1116 | /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/ | ||
1117 | p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); | ||
1118 | get_bh(p_s_curf) ; | ||
1119 | get_bh(p_s_curf) ; | ||
1120 | p_s_tb->rkey[n_h] = n_position; | ||
1121 | } | ||
1122 | |||
1123 | decrement_bcount(p_s_tb->FR[n_h]); | ||
1124 | p_s_tb->FR[n_h] = p_s_curf; /* New initialization of FR[n_path_offset]. */ | ||
1125 | |||
1126 | decrement_bcount(p_s_tb->CFR[n_h]); | ||
1127 | p_s_tb->CFR[n_h] = p_s_curcf; /* New initialization of CFR[n_path_offset]. */ | ||
1128 | |||
1129 | RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) || | ||
1130 | (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)), | ||
1131 | "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf); | ||
1132 | |||
1133 | return CARRY_ON; | ||
1134 | } | ||
1135 | |||
1136 | |||
1137 | /* it is possible to remove node as result of shiftings to | ||
1138 | neighbors even when we insert or paste item. */ | ||
1139 | static inline int can_node_be_removed (int mode, int lfree, int sfree, int rfree, struct tree_balance * tb, int h) | ||
1140 | { | ||
1141 | struct buffer_head * Sh = PATH_H_PBUFFER (tb->tb_path, h); | ||
1142 | int levbytes = tb->insert_size[h]; | ||
1143 | struct item_head * ih; | ||
1144 | struct reiserfs_key * r_key = NULL; | ||
1145 | |||
1146 | ih = B_N_PITEM_HEAD (Sh, 0); | ||
1147 | if ( tb->CFR[h] ) | ||
1148 | r_key = B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]); | ||
1149 | |||
1150 | if ( | ||
1151 | lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes | ||
1152 | /* shifting may merge items which might save space */ | ||
1153 | - (( ! h && op_is_left_mergeable (&(ih->ih_key), Sh->b_size) ) ? IH_SIZE : 0) | ||
1154 | - (( ! h && r_key && op_is_left_mergeable (r_key, Sh->b_size) ) ? IH_SIZE : 0) | ||
1155 | + (( h ) ? KEY_SIZE : 0)) | ||
1156 | { | ||
1157 | /* node can not be removed */ | ||
1158 | if (sfree >= levbytes ) { /* new item fits into node S[h] without any shifting */ | ||
1159 | if ( ! h ) | ||
1160 | tb->s0num = B_NR_ITEMS(Sh) + ((mode == M_INSERT ) ? 1 : 0); | ||
1161 | set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); | ||
1162 | return NO_BALANCING_NEEDED; | ||
1163 | } | ||
1164 | } | ||
1165 | PROC_INFO_INC( tb -> tb_sb, can_node_be_removed[ h ] ); | ||
1166 | return !NO_BALANCING_NEEDED; | ||
1167 | } | ||
1168 | |||
1169 | |||
1170 | |||
1171 | /* Check whether current node S[h] is balanced when increasing its size by | ||
1172 | * Inserting or Pasting. | ||
1173 | * Calculate parameters for balancing for current level h. | ||
1174 | * Parameters: | ||
1175 | * tb tree_balance structure; | ||
1176 | * h current level of the node; | ||
1177 | * inum item number in S[h]; | ||
1178 | * mode i - insert, p - paste; | ||
1179 | * Returns: 1 - schedule occurred; | ||
1180 | * 0 - balancing for higher levels needed; | ||
1181 | * -1 - no balancing for higher levels needed; | ||
1182 | * -2 - no disk space. | ||
1183 | */ | ||
1184 | /* ip means Inserting or Pasting */ | ||
1185 | static int ip_check_balance (struct tree_balance * tb, int h) | ||
1186 | { | ||
1187 | struct virtual_node * vn = tb->tb_vn; | ||
1188 | int levbytes, /* Number of bytes that must be inserted into (value | ||
1189 | is negative if bytes are deleted) buffer which | ||
1190 | contains node being balanced. The mnemonic is | ||
1191 | that the attempted change in node space used level | ||
1192 | is levbytes bytes. */ | ||
1193 | n_ret_value; | ||
1194 | |||
1195 | int lfree, sfree, rfree /* free space in L, S and R */; | ||
1196 | |||
1197 | /* nver is short for number of vertixes, and lnver is the number if | ||
1198 | we shift to the left, rnver is the number if we shift to the | ||
1199 | right, and lrnver is the number if we shift in both directions. | ||
1200 | The goal is to minimize first the number of vertixes, and second, | ||
1201 | the number of vertixes whose contents are changed by shifting, | ||
1202 | and third the number of uncached vertixes whose contents are | ||
1203 | changed by shifting and must be read from disk. */ | ||
1204 | int nver, lnver, rnver, lrnver; | ||
1205 | |||
1206 | /* used at leaf level only, S0 = S[0] is the node being balanced, | ||
1207 | sInum [ I = 0,1,2 ] is the number of items that will | ||
1208 | remain in node SI after balancing. S1 and S2 are new | ||
1209 | nodes that might be created. */ | ||
1210 | |||
1211 | /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters. | ||
1212 | where 4th parameter is s1bytes and 5th - s2bytes | ||
1213 | */ | ||
1214 | short snum012[40] = {0,}; /* s0num, s1num, s2num for 8 cases | ||
1215 | 0,1 - do not shift and do not shift but bottle | ||
1216 | 2 - shift only whole item to left | ||
1217 | 3 - shift to left and bottle as much as possible | ||
1218 | 4,5 - shift to right (whole items and as much as possible | ||
1219 | 6,7 - shift to both directions (whole items and as much as possible) | ||
1220 | */ | ||
1221 | |||
1222 | /* Sh is the node whose balance is currently being checked */ | ||
1223 | struct buffer_head * Sh; | ||
1224 | |||
1225 | Sh = PATH_H_PBUFFER (tb->tb_path, h); | ||
1226 | levbytes = tb->insert_size[h]; | ||
1227 | |||
1228 | /* Calculate balance parameters for creating new root. */ | ||
1229 | if ( ! Sh ) { | ||
1230 | if ( ! h ) | ||
1231 | reiserfs_panic (tb->tb_sb, "vs-8210: ip_check_balance: S[0] can not be 0"); | ||
1232 | switch ( n_ret_value = get_empty_nodes (tb, h) ) { | ||
1233 | case CARRY_ON: | ||
1234 | set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); | ||
1235 | return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ | ||
1236 | |||
1237 | case NO_DISK_SPACE: | ||
1238 | case REPEAT_SEARCH: | ||
1239 | return n_ret_value; | ||
1240 | default: | ||
1241 | reiserfs_panic(tb->tb_sb, "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes"); | ||
1242 | } | ||
1243 | } | ||
1244 | |||
1245 | if ( (n_ret_value = get_parents (tb, h)) != CARRY_ON ) /* get parents of S[h] neighbors. */ | ||
1246 | return n_ret_value; | ||
1247 | |||
1248 | sfree = B_FREE_SPACE (Sh); | ||
1249 | |||
1250 | /* get free space of neighbors */ | ||
1251 | rfree = get_rfree (tb, h); | ||
1252 | lfree = get_lfree (tb, h); | ||
1253 | |||
1254 | if (can_node_be_removed (vn->vn_mode, lfree, sfree, rfree, tb, h) == NO_BALANCING_NEEDED) | ||
1255 | /* and new item fits into node S[h] without any shifting */ | ||
1256 | return NO_BALANCING_NEEDED; | ||
1257 | |||
1258 | create_virtual_node (tb, h); | ||
1259 | |||
1260 | /* | ||
1261 | determine maximal number of items we can shift to the left neighbor (in tb structure) | ||
1262 | and the maximal number of bytes that can flow to the left neighbor | ||
1263 | from the left most liquid item that cannot be shifted from S[0] entirely (returned value) | ||
1264 | */ | ||
1265 | check_left (tb, h, lfree); | ||
1266 | |||
1267 | /* | ||
1268 | determine maximal number of items we can shift to the right neighbor (in tb structure) | ||
1269 | and the maximal number of bytes that can flow to the right neighbor | ||
1270 | from the right most liquid item that cannot be shifted from S[0] entirely (returned value) | ||
1271 | */ | ||
1272 | check_right (tb, h, rfree); | ||
1273 | |||
1274 | |||
1275 | /* all contents of internal node S[h] can be moved into its | ||
1276 | neighbors, S[h] will be removed after balancing */ | ||
1277 | if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) { | ||
1278 | int to_r; | ||
1279 | |||
1280 | /* Since we are working on internal nodes, and our internal | ||
1281 | nodes have fixed size entries, then we can balance by the | ||
1282 | number of items rather than the space they consume. In this | ||
1283 | routine we set the left node equal to the right node, | ||
1284 | allowing a difference of less than or equal to 1 child | ||
1285 | pointer. */ | ||
1286 | to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 - | ||
1287 | (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); | ||
1288 | set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1); | ||
1289 | return CARRY_ON; | ||
1290 | } | ||
1291 | |||
1292 | /* this checks balance condition, that any two neighboring nodes can not fit in one node */ | ||
1293 | RFALSE( h && | ||
1294 | ( tb->lnum[h] >= vn->vn_nr_item + 1 || | ||
1295 | tb->rnum[h] >= vn->vn_nr_item + 1), | ||
1296 | "vs-8220: tree is not balanced on internal level"); | ||
1297 | RFALSE( ! h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) || | ||
1298 | (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1)) ), | ||
1299 | "vs-8225: tree is not balanced on leaf level"); | ||
1300 | |||
1301 | /* all contents of S[0] can be moved into its neighbors | ||
1302 | S[0] will be removed after balancing. */ | ||
1303 | if (!h && is_leaf_removable (tb)) | ||
1304 | return CARRY_ON; | ||
1305 | |||
1306 | |||
1307 | /* why do we perform this check here rather than earlier?? | ||
1308 | Answer: we can win 1 node in some cases above. Moreover we | ||
1309 | checked it above, when we checked, that S[0] is not removable | ||
1310 | in principle */ | ||
1311 | if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */ | ||
1312 | if ( ! h ) | ||
1313 | tb->s0num = vn->vn_nr_item; | ||
1314 | set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); | ||
1315 | return NO_BALANCING_NEEDED; | ||
1316 | } | ||
1317 | |||
1318 | |||
1319 | { | ||
1320 | int lpar, rpar, nset, lset, rset, lrset; | ||
1321 | /* | ||
1322 | * regular overflowing of the node | ||
1323 | */ | ||
1324 | |||
1325 | /* get_num_ver works in 2 modes (FLOW & NO_FLOW) | ||
1326 | lpar, rpar - number of items we can shift to left/right neighbor (including splitting item) | ||
1327 | nset, lset, rset, lrset - shows, whether flowing items give better packing | ||
1328 | */ | ||
1329 | #define FLOW 1 | ||
1330 | #define NO_FLOW 0 /* do not any splitting */ | ||
1331 | |||
1332 | /* we choose one of the following */ | ||
1333 | #define NOTHING_SHIFT_NO_FLOW 0 | ||
1334 | #define NOTHING_SHIFT_FLOW 5 | ||
1335 | #define LEFT_SHIFT_NO_FLOW 10 | ||
1336 | #define LEFT_SHIFT_FLOW 15 | ||
1337 | #define RIGHT_SHIFT_NO_FLOW 20 | ||
1338 | #define RIGHT_SHIFT_FLOW 25 | ||
1339 | #define LR_SHIFT_NO_FLOW 30 | ||
1340 | #define LR_SHIFT_FLOW 35 | ||
1341 | |||
1342 | |||
1343 | lpar = tb->lnum[h]; | ||
1344 | rpar = tb->rnum[h]; | ||
1345 | |||
1346 | |||
1347 | /* calculate number of blocks S[h] must be split into when | ||
1348 | nothing is shifted to the neighbors, | ||
1349 | as well as number of items in each part of the split node (s012 numbers), | ||
1350 | and number of bytes (s1bytes) of the shared drop which flow to S1 if any */ | ||
1351 | nset = NOTHING_SHIFT_NO_FLOW; | ||
1352 | nver = get_num_ver (vn->vn_mode, tb, h, | ||
1353 | 0, -1, h?vn->vn_nr_item:0, -1, | ||
1354 | snum012, NO_FLOW); | ||
1355 | |||
1356 | if (!h) | ||
1357 | { | ||
1358 | int nver1; | ||
1359 | |||
1360 | /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */ | ||
1361 | nver1 = get_num_ver (vn->vn_mode, tb, h, | ||
1362 | 0, -1, 0, -1, | ||
1363 | snum012 + NOTHING_SHIFT_FLOW, FLOW); | ||
1364 | if (nver > nver1) | ||
1365 | nset = NOTHING_SHIFT_FLOW, nver = nver1; | ||
1366 | } | ||
1367 | |||
1368 | |||
1369 | /* calculate number of blocks S[h] must be split into when | ||
1370 | l_shift_num first items and l_shift_bytes of the right most | ||
1371 | liquid item to be shifted are shifted to the left neighbor, | ||
1372 | as well as number of items in each part of the splitted node (s012 numbers), | ||
1373 | and number of bytes (s1bytes) of the shared drop which flow to S1 if any | ||
1374 | */ | ||
1375 | lset = LEFT_SHIFT_NO_FLOW; | ||
1376 | lnver = get_num_ver (vn->vn_mode, tb, h, | ||
1377 | lpar - (( h || tb->lbytes == -1 ) ? 0 : 1), -1, h ? vn->vn_nr_item:0, -1, | ||
1378 | snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW); | ||
1379 | if (!h) | ||
1380 | { | ||
1381 | int lnver1; | ||
1382 | |||
1383 | lnver1 = get_num_ver (vn->vn_mode, tb, h, | ||
1384 | lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, 0, -1, | ||
1385 | snum012 + LEFT_SHIFT_FLOW, FLOW); | ||
1386 | if (lnver > lnver1) | ||
1387 | lset = LEFT_SHIFT_FLOW, lnver = lnver1; | ||
1388 | } | ||
1389 | |||
1390 | |||
1391 | /* calculate number of blocks S[h] must be split into when | ||
1392 | r_shift_num first items and r_shift_bytes of the left most | ||
1393 | liquid item to be shifted are shifted to the right neighbor, | ||
1394 | as well as number of items in each part of the splitted node (s012 numbers), | ||
1395 | and number of bytes (s1bytes) of the shared drop which flow to S1 if any | ||
1396 | */ | ||
1397 | rset = RIGHT_SHIFT_NO_FLOW; | ||
1398 | rnver = get_num_ver (vn->vn_mode, tb, h, | ||
1399 | 0, -1, h ? (vn->vn_nr_item-rpar) : (rpar - (( tb->rbytes != -1 ) ? 1 : 0)), -1, | ||
1400 | snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW); | ||
1401 | if (!h) | ||
1402 | { | ||
1403 | int rnver1; | ||
1404 | |||
1405 | rnver1 = get_num_ver (vn->vn_mode, tb, h, | ||
1406 | 0, -1, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes, | ||
1407 | snum012 + RIGHT_SHIFT_FLOW, FLOW); | ||
1408 | |||
1409 | if (rnver > rnver1) | ||
1410 | rset = RIGHT_SHIFT_FLOW, rnver = rnver1; | ||
1411 | } | ||
1412 | |||
1413 | |||
1414 | /* calculate number of blocks S[h] must be split into when | ||
1415 | items are shifted in both directions, | ||
1416 | as well as number of items in each part of the splitted node (s012 numbers), | ||
1417 | and number of bytes (s1bytes) of the shared drop which flow to S1 if any | ||
1418 | */ | ||
1419 | lrset = LR_SHIFT_NO_FLOW; | ||
1420 | lrnver = get_num_ver (vn->vn_mode, tb, h, | ||
1421 | lpar - ((h || tb->lbytes == -1) ? 0 : 1), -1, h ? (vn->vn_nr_item-rpar):(rpar - ((tb->rbytes != -1) ? 1 : 0)), -1, | ||
1422 | snum012 + LR_SHIFT_NO_FLOW, NO_FLOW); | ||
1423 | if (!h) | ||
1424 | { | ||
1425 | int lrnver1; | ||
1426 | |||
1427 | lrnver1 = get_num_ver (vn->vn_mode, tb, h, | ||
1428 | lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes, | ||
1429 | snum012 + LR_SHIFT_FLOW, FLOW); | ||
1430 | if (lrnver > lrnver1) | ||
1431 | lrset = LR_SHIFT_FLOW, lrnver = lrnver1; | ||
1432 | } | ||
1433 | |||
1434 | |||
1435 | |||
1436 | /* Our general shifting strategy is: | ||
1437 | 1) to minimize the number of new nodes; | ||
1438 | 2) to minimize the number of neighbors involved in shifting; | ||
1439 | 3) to minimize the number of disk reads; */ | ||
1440 | |||
1441 | /* we can win TWO or ONE nodes by shifting in both directions */ | ||
1442 | if (lrnver < lnver && lrnver < rnver) | ||
1443 | { | ||
1444 | RFALSE( h && | ||
1445 | (tb->lnum[h] != 1 || | ||
1446 | tb->rnum[h] != 1 || | ||
1447 | lrnver != 1 || rnver != 2 || lnver != 2 || h != 1), | ||
1448 | "vs-8230: bad h"); | ||
1449 | if (lrset == LR_SHIFT_FLOW) | ||
1450 | set_parameters (tb, h, tb->lnum[h], tb->rnum[h], lrnver, snum012 + lrset, | ||
1451 | tb->lbytes, tb->rbytes); | ||
1452 | else | ||
1453 | set_parameters (tb, h, tb->lnum[h] - ((tb->lbytes == -1) ? 0 : 1), | ||
1454 | tb->rnum[h] - ((tb->rbytes == -1) ? 0 : 1), lrnver, snum012 + lrset, -1, -1); | ||
1455 | |||
1456 | return CARRY_ON; | ||
1457 | } | ||
1458 | |||
1459 | /* if shifting doesn't lead to better packing then don't shift */ | ||
1460 | if (nver == lrnver) | ||
1461 | { | ||
1462 | set_parameters (tb, h, 0, 0, nver, snum012 + nset, -1, -1); | ||
1463 | return CARRY_ON; | ||
1464 | } | ||
1465 | |||
1466 | |||
1467 | /* now we know that for better packing shifting in only one | ||
1468 | direction either to the left or to the right is required */ | ||
1469 | |||
1470 | /* if shifting to the left is better than shifting to the right */ | ||
1471 | if (lnver < rnver) | ||
1472 | { | ||
1473 | SET_PAR_SHIFT_LEFT; | ||
1474 | return CARRY_ON; | ||
1475 | } | ||
1476 | |||
1477 | /* if shifting to the right is better than shifting to the left */ | ||
1478 | if (lnver > rnver) | ||
1479 | { | ||
1480 | SET_PAR_SHIFT_RIGHT; | ||
1481 | return CARRY_ON; | ||
1482 | } | ||
1483 | |||
1484 | |||
1485 | /* now shifting in either direction gives the same number | ||
1486 | of nodes and we can make use of the cached neighbors */ | ||
1487 | if (is_left_neighbor_in_cache (tb,h)) | ||
1488 | { | ||
1489 | SET_PAR_SHIFT_LEFT; | ||
1490 | return CARRY_ON; | ||
1491 | } | ||
1492 | |||
1493 | /* shift to the right independently on whether the right neighbor in cache or not */ | ||
1494 | SET_PAR_SHIFT_RIGHT; | ||
1495 | return CARRY_ON; | ||
1496 | } | ||
1497 | } | ||
1498 | |||
1499 | |||
1500 | /* Check whether current node S[h] is balanced when Decreasing its size by | ||
1501 | * Deleting or Cutting for INTERNAL node of S+tree. | ||
1502 | * Calculate parameters for balancing for current level h. | ||
1503 | * Parameters: | ||
1504 | * tb tree_balance structure; | ||
1505 | * h current level of the node; | ||
1506 | * inum item number in S[h]; | ||
1507 | * mode i - insert, p - paste; | ||
1508 | * Returns: 1 - schedule occurred; | ||
1509 | * 0 - balancing for higher levels needed; | ||
1510 | * -1 - no balancing for higher levels needed; | ||
1511 | * -2 - no disk space. | ||
1512 | * | ||
1513 | * Note: Items of internal nodes have fixed size, so the balance condition for | ||
1514 | * the internal part of S+tree is as for the B-trees. | ||
1515 | */ | ||
1516 | static int dc_check_balance_internal (struct tree_balance * tb, int h) | ||
1517 | { | ||
1518 | struct virtual_node * vn = tb->tb_vn; | ||
1519 | |||
1520 | /* Sh is the node whose balance is currently being checked, | ||
1521 | and Fh is its father. */ | ||
1522 | struct buffer_head * Sh, * Fh; | ||
1523 | int maxsize, | ||
1524 | n_ret_value; | ||
1525 | int lfree, rfree /* free space in L and R */; | ||
1526 | |||
1527 | Sh = PATH_H_PBUFFER (tb->tb_path, h); | ||
1528 | Fh = PATH_H_PPARENT (tb->tb_path, h); | ||
1529 | |||
1530 | maxsize = MAX_CHILD_SIZE(Sh); | ||
1531 | |||
1532 | /* using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */ | ||
1533 | /* new_nr_item = number of items node would have if operation is */ | ||
1534 | /* performed without balancing (new_nr_item); */ | ||
1535 | create_virtual_node (tb, h); | ||
1536 | |||
1537 | if ( ! Fh ) | ||
1538 | { /* S[h] is the root. */ | ||
1539 | if ( vn->vn_nr_item > 0 ) | ||
1540 | { | ||
1541 | set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); | ||
1542 | return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ | ||
1543 | } | ||
1544 | /* new_nr_item == 0. | ||
1545 | * Current root will be deleted resulting in | ||
1546 | * decrementing the tree height. */ | ||
1547 | set_parameters (tb, h, 0, 0, 0, NULL, -1, -1); | ||
1548 | return CARRY_ON; | ||
1549 | } | ||
1550 | |||
1551 | if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON ) | ||
1552 | return n_ret_value; | ||
1553 | |||
1554 | |||
1555 | /* get free space of neighbors */ | ||
1556 | rfree = get_rfree (tb, h); | ||
1557 | lfree = get_lfree (tb, h); | ||
1558 | |||
1559 | /* determine maximal number of items we can fit into neighbors */ | ||
1560 | check_left (tb, h, lfree); | ||
1561 | check_right (tb, h, rfree); | ||
1562 | |||
1563 | |||
1564 | if ( vn->vn_nr_item >= MIN_NR_KEY(Sh) ) | ||
1565 | { /* Balance condition for the internal node is valid. | ||
1566 | * In this case we balance only if it leads to better packing. */ | ||
1567 | if ( vn->vn_nr_item == MIN_NR_KEY(Sh) ) | ||
1568 | { /* Here we join S[h] with one of its neighbors, | ||
1569 | * which is impossible with greater values of new_nr_item. */ | ||
1570 | if ( tb->lnum[h] >= vn->vn_nr_item + 1 ) | ||
1571 | { | ||
1572 | /* All contents of S[h] can be moved to L[h]. */ | ||
1573 | int n; | ||
1574 | int order_L; | ||
1575 | |||
1576 | order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; | ||
1577 | n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE); | ||
1578 | set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1); | ||
1579 | return CARRY_ON; | ||
1580 | } | ||
1581 | |||
1582 | if ( tb->rnum[h] >= vn->vn_nr_item + 1 ) | ||
1583 | { | ||
1584 | /* All contents of S[h] can be moved to R[h]. */ | ||
1585 | int n; | ||
1586 | int order_R; | ||
1587 | |||
1588 | order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : n + 1; | ||
1589 | n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE); | ||
1590 | set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1); | ||
1591 | return CARRY_ON; | ||
1592 | } | ||
1593 | } | ||
1594 | |||
1595 | if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) | ||
1596 | { | ||
1597 | /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ | ||
1598 | int to_r; | ||
1599 | |||
1600 | to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 - | ||
1601 | (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); | ||
1602 | set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1); | ||
1603 | return CARRY_ON; | ||
1604 | } | ||
1605 | |||
1606 | /* Balancing does not lead to better packing. */ | ||
1607 | set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); | ||
1608 | return NO_BALANCING_NEEDED; | ||
1609 | } | ||
1610 | |||
1611 | /* Current node contain insufficient number of items. Balancing is required. */ | ||
1612 | /* Check whether we can merge S[h] with left neighbor. */ | ||
1613 | if (tb->lnum[h] >= vn->vn_nr_item + 1) | ||
1614 | if (is_left_neighbor_in_cache (tb,h) || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) | ||
1615 | { | ||
1616 | int n; | ||
1617 | int order_L; | ||
1618 | |||
1619 | order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; | ||
1620 | n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE); | ||
1621 | set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1); | ||
1622 | return CARRY_ON; | ||
1623 | } | ||
1624 | |||
1625 | /* Check whether we can merge S[h] with right neighbor. */ | ||
1626 | if (tb->rnum[h] >= vn->vn_nr_item + 1) | ||
1627 | { | ||
1628 | int n; | ||
1629 | int order_R; | ||
1630 | |||
1631 | order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : (n + 1); | ||
1632 | n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE); | ||
1633 | set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1); | ||
1634 | return CARRY_ON; | ||
1635 | } | ||
1636 | |||
1637 | /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ | ||
1638 | if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) | ||
1639 | { | ||
1640 | int to_r; | ||
1641 | |||
1642 | to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 - | ||
1643 | (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); | ||
1644 | set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1); | ||
1645 | return CARRY_ON; | ||
1646 | } | ||
1647 | |||
1648 | /* For internal nodes try to borrow item from a neighbor */ | ||
1649 | RFALSE( !tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root"); | ||
1650 | |||
1651 | /* Borrow one or two items from caching neighbor */ | ||
1652 | if (is_left_neighbor_in_cache (tb,h) || !tb->FR[h]) | ||
1653 | { | ||
1654 | int from_l; | ||
1655 | |||
1656 | from_l = (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + 1) / 2 - (vn->vn_nr_item + 1); | ||
1657 | set_parameters (tb, h, -from_l, 0, 1, NULL, -1, -1); | ||
1658 | return CARRY_ON; | ||
1659 | } | ||
1660 | |||
1661 | set_parameters (tb, h, 0, -((MAX_NR_KEY(Sh)+1-tb->rnum[h]+vn->vn_nr_item+1)/2-(vn->vn_nr_item+1)), 1, | ||
1662 | NULL, -1, -1); | ||
1663 | return CARRY_ON; | ||
1664 | } | ||
1665 | |||
1666 | |||
1667 | /* Check whether current node S[h] is balanced when Decreasing its size by | ||
1668 | * Deleting or Truncating for LEAF node of S+tree. | ||
1669 | * Calculate parameters for balancing for current level h. | ||
1670 | * Parameters: | ||
1671 | * tb tree_balance structure; | ||
1672 | * h current level of the node; | ||
1673 | * inum item number in S[h]; | ||
1674 | * mode i - insert, p - paste; | ||
1675 | * Returns: 1 - schedule occurred; | ||
1676 | * 0 - balancing for higher levels needed; | ||
1677 | * -1 - no balancing for higher levels needed; | ||
1678 | * -2 - no disk space. | ||
1679 | */ | ||
1680 | static int dc_check_balance_leaf (struct tree_balance * tb, int h) | ||
1681 | { | ||
1682 | struct virtual_node * vn = tb->tb_vn; | ||
1683 | |||
1684 | /* Number of bytes that must be deleted from | ||
1685 | (value is negative if bytes are deleted) buffer which | ||
1686 | contains node being balanced. The mnemonic is that the | ||
1687 | attempted change in node space used level is levbytes bytes. */ | ||
1688 | int levbytes; | ||
1689 | /* the maximal item size */ | ||
1690 | int maxsize, | ||
1691 | n_ret_value; | ||
1692 | /* S0 is the node whose balance is currently being checked, | ||
1693 | and F0 is its father. */ | ||
1694 | struct buffer_head * S0, * F0; | ||
1695 | int lfree, rfree /* free space in L and R */; | ||
1696 | |||
1697 | S0 = PATH_H_PBUFFER (tb->tb_path, 0); | ||
1698 | F0 = PATH_H_PPARENT (tb->tb_path, 0); | ||
1699 | |||
1700 | levbytes = tb->insert_size[h]; | ||
1701 | |||
1702 | maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */ | ||
1703 | |||
1704 | if ( ! F0 ) | ||
1705 | { /* S[0] is the root now. */ | ||
1706 | |||
1707 | RFALSE( -levbytes >= maxsize - B_FREE_SPACE (S0), | ||
1708 | "vs-8240: attempt to create empty buffer tree"); | ||
1709 | |||
1710 | set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); | ||
1711 | return NO_BALANCING_NEEDED; | ||
1712 | } | ||
1713 | |||
1714 | if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON ) | ||
1715 | return n_ret_value; | ||
1716 | |||
1717 | /* get free space of neighbors */ | ||
1718 | rfree = get_rfree (tb, h); | ||
1719 | lfree = get_lfree (tb, h); | ||
1720 | |||
1721 | create_virtual_node (tb, h); | ||
1722 | |||
1723 | /* if 3 leaves can be merge to one, set parameters and return */ | ||
1724 | if (are_leaves_removable (tb, lfree, rfree)) | ||
1725 | return CARRY_ON; | ||
1726 | |||
1727 | /* determine maximal number of items we can shift to the left/right neighbor | ||
1728 | and the maximal number of bytes that can flow to the left/right neighbor | ||
1729 | from the left/right most liquid item that cannot be shifted from S[0] entirely | ||
1730 | */ | ||
1731 | check_left (tb, h, lfree); | ||
1732 | check_right (tb, h, rfree); | ||
1733 | |||
1734 | /* check whether we can merge S with left neighbor. */ | ||
1735 | if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1) | ||
1736 | if (is_left_neighbor_in_cache (tb,h) || | ||
1737 | ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */ | ||
1738 | !tb->FR[h]) { | ||
1739 | |||
1740 | RFALSE( !tb->FL[h], "vs-8245: dc_check_balance_leaf: FL[h] must exist"); | ||
1741 | |||
1742 | /* set parameter to merge S[0] with its left neighbor */ | ||
1743 | set_parameters (tb, h, -1, 0, 0, NULL, -1, -1); | ||
1744 | return CARRY_ON; | ||
1745 | } | ||
1746 | |||
1747 | /* check whether we can merge S[0] with right neighbor. */ | ||
1748 | if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) { | ||
1749 | set_parameters (tb, h, 0, -1, 0, NULL, -1, -1); | ||
1750 | return CARRY_ON; | ||
1751 | } | ||
1752 | |||
1753 | /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */ | ||
1754 | if (is_leaf_removable (tb)) | ||
1755 | return CARRY_ON; | ||
1756 | |||
1757 | /* Balancing is not required. */ | ||
1758 | tb->s0num = vn->vn_nr_item; | ||
1759 | set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); | ||
1760 | return NO_BALANCING_NEEDED; | ||
1761 | } | ||
1762 | |||
1763 | |||
1764 | |||
1765 | /* Check whether current node S[h] is balanced when Decreasing its size by | ||
1766 | * Deleting or Cutting. | ||
1767 | * Calculate parameters for balancing for current level h. | ||
1768 | * Parameters: | ||
1769 | * tb tree_balance structure; | ||
1770 | * h current level of the node; | ||
1771 | * inum item number in S[h]; | ||
1772 | * mode d - delete, c - cut. | ||
1773 | * Returns: 1 - schedule occurred; | ||
1774 | * 0 - balancing for higher levels needed; | ||
1775 | * -1 - no balancing for higher levels needed; | ||
1776 | * -2 - no disk space. | ||
1777 | */ | ||
1778 | static int dc_check_balance (struct tree_balance * tb, int h) | ||
1779 | { | ||
1780 | RFALSE( ! (PATH_H_PBUFFER (tb->tb_path, h)), "vs-8250: S is not initialized"); | ||
1781 | |||
1782 | if ( h ) | ||
1783 | return dc_check_balance_internal (tb, h); | ||
1784 | else | ||
1785 | return dc_check_balance_leaf (tb, h); | ||
1786 | } | ||
1787 | |||
1788 | |||
1789 | |||
1790 | /* Check whether current node S[h] is balanced. | ||
1791 | * Calculate parameters for balancing for current level h. | ||
1792 | * Parameters: | ||
1793 | * | ||
1794 | * tb tree_balance structure: | ||
1795 | * | ||
1796 | * tb is a large structure that must be read about in the header file | ||
1797 | * at the same time as this procedure if the reader is to successfully | ||
1798 | * understand this procedure | ||
1799 | * | ||
1800 | * h current level of the node; | ||
1801 | * inum item number in S[h]; | ||
1802 | * mode i - insert, p - paste, d - delete, c - cut. | ||
1803 | * Returns: 1 - schedule occurred; | ||
1804 | * 0 - balancing for higher levels needed; | ||
1805 | * -1 - no balancing for higher levels needed; | ||
1806 | * -2 - no disk space. | ||
1807 | */ | ||
1808 | static int check_balance (int mode, | ||
1809 | struct tree_balance * tb, | ||
1810 | int h, | ||
1811 | int inum, | ||
1812 | int pos_in_item, | ||
1813 | struct item_head * ins_ih, | ||
1814 | const void * data | ||
1815 | ) | ||
1816 | { | ||
1817 | struct virtual_node * vn; | ||
1818 | |||
1819 | vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf); | ||
1820 | vn->vn_free_ptr = (char *)(tb->tb_vn + 1); | ||
1821 | vn->vn_mode = mode; | ||
1822 | vn->vn_affected_item_num = inum; | ||
1823 | vn->vn_pos_in_item = pos_in_item; | ||
1824 | vn->vn_ins_ih = ins_ih; | ||
1825 | vn->vn_data = data; | ||
1826 | |||
1827 | RFALSE( mode == M_INSERT && !vn->vn_ins_ih, | ||
1828 | "vs-8255: ins_ih can not be 0 in insert mode"); | ||
1829 | |||
1830 | if ( tb->insert_size[h] > 0 ) | ||
1831 | /* Calculate balance parameters when size of node is increasing. */ | ||
1832 | return ip_check_balance (tb, h); | ||
1833 | |||
1834 | /* Calculate balance parameters when size of node is decreasing. */ | ||
1835 | return dc_check_balance (tb, h); | ||
1836 | } | ||
1837 | |||
1838 | |||
1839 | |||
1840 | /* Check whether parent at the path is the really parent of the current node.*/ | ||
1841 | static int get_direct_parent( | ||
1842 | struct tree_balance * p_s_tb, | ||
1843 | int n_h | ||
1844 | ) { | ||
1845 | struct buffer_head * p_s_bh; | ||
1846 | struct path * p_s_path = p_s_tb->tb_path; | ||
1847 | int n_position, | ||
1848 | n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); | ||
1849 | |||
1850 | /* We are in the root or in the new root. */ | ||
1851 | if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) { | ||
1852 | |||
1853 | RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1, | ||
1854 | "PAP-8260: invalid offset in the path"); | ||
1855 | |||
1856 | if ( PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == | ||
1857 | SB_ROOT_BLOCK (p_s_tb->tb_sb) ) { | ||
1858 | /* Root is not changed. */ | ||
1859 | PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL; | ||
1860 | PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0; | ||
1861 | return CARRY_ON; | ||
1862 | } | ||
1863 | return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */ | ||
1864 | } | ||
1865 | |||
1866 | if ( ! B_IS_IN_TREE(p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)) ) | ||
1867 | return REPEAT_SEARCH; /* Parent in the path is not in the tree. */ | ||
1868 | |||
1869 | if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) > B_NR_ITEMS(p_s_bh) ) | ||
1870 | return REPEAT_SEARCH; | ||
1871 | |||
1872 | if ( B_N_CHILD_NUM(p_s_bh, n_position) != PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr ) | ||
1873 | /* Parent in the path is not parent of the current node in the tree. */ | ||
1874 | return REPEAT_SEARCH; | ||
1875 | |||
1876 | if ( buffer_locked(p_s_bh) ) { | ||
1877 | __wait_on_buffer(p_s_bh); | ||
1878 | if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) | ||
1879 | return REPEAT_SEARCH; | ||
1880 | } | ||
1881 | |||
1882 | return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */ | ||
1883 | } | ||
1884 | |||
1885 | |||
1886 | /* Using lnum[n_h] and rnum[n_h] we should determine what neighbors | ||
1887 | * of S[n_h] we | ||
1888 | * need in order to balance S[n_h], and get them if necessary. | ||
1889 | * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; | ||
1890 | * CARRY_ON - schedule didn't occur while the function worked; | ||
1891 | */ | ||
1892 | static int get_neighbors( | ||
1893 | struct tree_balance * p_s_tb, | ||
1894 | int n_h | ||
1895 | ) { | ||
1896 | int n_child_position, | ||
1897 | n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1); | ||
1898 | unsigned long n_son_number; | ||
1899 | struct super_block * p_s_sb = p_s_tb->tb_sb; | ||
1900 | struct buffer_head * p_s_bh; | ||
1901 | |||
1902 | |||
1903 | PROC_INFO_INC( p_s_sb, get_neighbors[ n_h ] ); | ||
1904 | |||
1905 | if ( p_s_tb->lnum[n_h] ) { | ||
1906 | /* We need left neighbor to balance S[n_h]. */ | ||
1907 | PROC_INFO_INC( p_s_sb, need_l_neighbor[ n_h ] ); | ||
1908 | p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); | ||
1909 | |||
1910 | RFALSE( p_s_bh == p_s_tb->FL[n_h] && | ||
1911 | ! PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset), | ||
1912 | "PAP-8270: invalid position in the parent"); | ||
1913 | |||
1914 | n_child_position = ( p_s_bh == p_s_tb->FL[n_h] ) ? p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]); | ||
1915 | n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position); | ||
1916 | p_s_bh = sb_bread(p_s_sb, n_son_number); | ||
1917 | if (!p_s_bh) | ||
1918 | return IO_ERROR; | ||
1919 | if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { | ||
1920 | decrement_bcount(p_s_bh); | ||
1921 | PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] ); | ||
1922 | return REPEAT_SEARCH; | ||
1923 | } | ||
1924 | |||
1925 | RFALSE( ! B_IS_IN_TREE(p_s_tb->FL[n_h]) || | ||
1926 | n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) || | ||
1927 | B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) != | ||
1928 | p_s_bh->b_blocknr, "PAP-8275: invalid parent"); | ||
1929 | RFALSE( ! B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child"); | ||
1930 | RFALSE( ! n_h && | ||
1931 | B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FL[0],n_child_position)), | ||
1932 | "PAP-8290: invalid child size of left neighbor"); | ||
1933 | |||
1934 | decrement_bcount(p_s_tb->L[n_h]); | ||
1935 | p_s_tb->L[n_h] = p_s_bh; | ||
1936 | } | ||
1937 | |||
1938 | |||
1939 | if ( p_s_tb->rnum[n_h] ) { /* We need right neighbor to balance S[n_path_offset]. */ | ||
1940 | PROC_INFO_INC( p_s_sb, need_r_neighbor[ n_h ] ); | ||
1941 | p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); | ||
1942 | |||
1943 | RFALSE( p_s_bh == p_s_tb->FR[n_h] && | ||
1944 | PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset) >= B_NR_ITEMS(p_s_bh), | ||
1945 | "PAP-8295: invalid position in the parent"); | ||
1946 | |||
1947 | n_child_position = ( p_s_bh == p_s_tb->FR[n_h] ) ? p_s_tb->rkey[n_h] + 1 : 0; | ||
1948 | n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position); | ||
1949 | p_s_bh = sb_bread(p_s_sb, n_son_number); | ||
1950 | if (!p_s_bh) | ||
1951 | return IO_ERROR; | ||
1952 | if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { | ||
1953 | decrement_bcount(p_s_bh); | ||
1954 | PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] ); | ||
1955 | return REPEAT_SEARCH; | ||
1956 | } | ||
1957 | decrement_bcount(p_s_tb->R[n_h]); | ||
1958 | p_s_tb->R[n_h] = p_s_bh; | ||
1959 | |||
1960 | RFALSE( ! n_h && B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)), | ||
1961 | "PAP-8300: invalid child size of right neighbor (%d != %d - %d)", | ||
1962 | B_FREE_SPACE (p_s_bh), MAX_CHILD_SIZE (p_s_bh), | ||
1963 | dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position))); | ||
1964 | |||
1965 | } | ||
1966 | return CARRY_ON; | ||
1967 | } | ||
1968 | |||
1969 | #ifdef CONFIG_REISERFS_CHECK | ||
1970 | void * reiserfs_kmalloc (size_t size, int flags, struct super_block * s) | ||
1971 | { | ||
1972 | void * vp; | ||
1973 | static size_t malloced; | ||
1974 | |||
1975 | |||
1976 | vp = kmalloc (size, flags); | ||
1977 | if (vp) { | ||
1978 | REISERFS_SB(s)->s_kmallocs += size; | ||
1979 | if (REISERFS_SB(s)->s_kmallocs > malloced + 200000) { | ||
1980 | reiserfs_warning (s, | ||
1981 | "vs-8301: reiserfs_kmalloc: allocated memory %d", | ||
1982 | REISERFS_SB(s)->s_kmallocs); | ||
1983 | malloced = REISERFS_SB(s)->s_kmallocs; | ||
1984 | } | ||
1985 | } | ||
1986 | return vp; | ||
1987 | } | ||
1988 | |||
1989 | void reiserfs_kfree (const void * vp, size_t size, struct super_block * s) | ||
1990 | { | ||
1991 | kfree (vp); | ||
1992 | |||
1993 | REISERFS_SB(s)->s_kmallocs -= size; | ||
1994 | if (REISERFS_SB(s)->s_kmallocs < 0) | ||
1995 | reiserfs_warning (s, "vs-8302: reiserfs_kfree: allocated memory %d", | ||
1996 | REISERFS_SB(s)->s_kmallocs); | ||
1997 | |||
1998 | } | ||
1999 | #endif | ||
2000 | |||
2001 | |||
2002 | static int get_virtual_node_size (struct super_block * sb, struct buffer_head * bh) | ||
2003 | { | ||
2004 |     /* Upper bound, in bytes, for a virtual node at this block size: the | ||
2005 |        header plus the larger of (a) one virtual_item per item a block can hold | ||
2006 |        and (b) one virtual_item with a directory area for the most entries. bh is not used. */ | ||
2007 | |||
2008 | #define MIN_NAME_LEN 1 | ||
2009 | |||
2010 |     unsigned long blocksize = sb->s_blocksize; | ||
2011 |     int max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN); | ||
2012 |     int max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) / (DEH_SIZE + MIN_NAME_LEN); | ||
2013 |     size_t all_items = max_num_of_items * sizeof (struct virtual_item); | ||
2014 |     size_t one_dir_item = sizeof (struct virtual_item) + sizeof(struct direntry_uarea) + | ||
2015 |                           (max_num_of_entries - 1) * sizeof (__u16); | ||
2016 | |||
2017 |     return sizeof(struct virtual_node) + max(all_items, one_dir_item); | ||
2018 | } | ||
2019 | |||
2020 | |||
2021 | |||
2022 | /* Make sure tb->vn_buf can hold the virtual node size computed for this block size. | ||
2023 |    maybe we should fail balancing we are going to perform when kmalloc | ||
2024 |    fails several times. But now it will loop until kmalloc gets | ||
2025 |    required memory (the caller repeats on REPEAT_SEARCH). | ||
2026 |    Returns CARRY_ON when the buffer is ready; REPEAT_SEARCH when an old | ||
2027 |    buffer was freed and the filesystem changed meanwhile, or whenever the | ||
2028 |    first allocation attempt failed (collected buffers are released then). */ | ||
2029 | static int get_mem_for_virtual_node (struct tree_balance * tb) | ||
2030 | { | ||
2031 |     int freed_old = 0; | ||
2032 |     int size = get_virtual_node_size (tb->tb_sb, PATH_PLAST_BUFFER (tb->tb_path)); | ||
2033 |     char * buf; | ||
2034 | |||
2035 |     if (size <= tb->vn_buf_size) | ||
2036 |         return CARRY_ON; /* the buffer we already have is large enough */ | ||
2037 | |||
2038 |     if (tb->vn_buf) { | ||
2039 |         /* free memory allocated before */ | ||
2040 |         reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb); | ||
2041 |         /* this is not needed if kfree is atomic */ | ||
2042 |         freed_old = 1; | ||
2043 |     } | ||
2044 | |||
2045 |     /* virtual node requires now more memory */ | ||
2046 |     tb->vn_buf_size = size; | ||
2047 | |||
2048 |     /* get memory for virtual item */ | ||
2049 |     buf = reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN, tb->tb_sb); | ||
2050 |     if ( buf ) { | ||
2051 |         tb->vn_buf = buf; | ||
2052 |         if ( freed_old && FILESYSTEM_CHANGED_TB (tb) ) | ||
2053 |             return REPEAT_SEARCH; | ||
2054 |         return CARRY_ON; | ||
2055 |     } | ||
2056 | |||
2057 |     /* getting memory with GFP_KERNEL priority may involve | ||
2058 |        balancing now (due to indirect_to_direct conversion on | ||
2059 |        dcache shrinking). So, release path and collected | ||
2060 |        resources here */ | ||
2061 |     free_buffers_in_tb (tb); | ||
2062 |     buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb); | ||
2063 |     if ( !buf ) { | ||
2064 | #ifdef CONFIG_REISERFS_CHECK | ||
2065 |         reiserfs_warning (tb->tb_sb, | ||
2066 |                           "vs-8345: get_mem_for_virtual_node: " | ||
2067 |                           "kmalloc failed. reiserfs kmalloced %d bytes", | ||
2068 |                           REISERFS_SB(tb->tb_sb)->s_kmallocs); | ||
2069 | #endif | ||
2070 |         tb->vn_buf_size = 0; | ||
2071 |     } | ||
2072 |     tb->vn_buf = buf; | ||
2073 |     schedule() ; | ||
2074 |     return REPEAT_SEARCH; | ||
2075 | } | ||
2076 | |||
2077 | |||
2078 | #ifdef CONFIG_REISERFS_CHECK | ||
2079 | /* Debug-build check of one buffer collected for balancing: panics unless the | ||
2080 |    buffer has a positive reference count, is up to date, is in the tree, | ||
2081 |    belongs to this superblock's device, has its block size and a block number | ||
2082 |    not above the block count. descr and level only label the message. NULL is ignored. */ | ||
2083 | static void tb_buffer_sanity_check (struct super_block * p_s_sb, | ||
2084 |                                     struct buffer_head * p_s_bh, | ||
2085 |                                     const char *descr, int level) | ||
2086 | { | ||
2087 |     if (!p_s_bh) | ||
2088 |         return; | ||
2089 | |||
2090 |     if (atomic_read (&(p_s_bh->b_count)) <= 0) | ||
2091 |         reiserfs_panic (p_s_sb, "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n", descr, level, p_s_bh); | ||
2092 | |||
2093 |     if ( ! buffer_uptodate (p_s_bh) ) | ||
2094 |         reiserfs_panic (p_s_sb, "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n", descr, level, p_s_bh); | ||
2095 | |||
2096 |     if ( ! B_IS_IN_TREE (p_s_bh) ) | ||
2097 |         reiserfs_panic (p_s_sb, "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n", descr, level, p_s_bh); | ||
2098 | |||
2099 |     if (p_s_bh->b_bdev != p_s_sb->s_bdev) | ||
2100 |         reiserfs_panic (p_s_sb, "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n", descr, level, p_s_bh); | ||
2101 | |||
2102 |     if (p_s_bh->b_size != p_s_sb->s_blocksize) | ||
2103 |         reiserfs_panic (p_s_sb, "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n", descr, level, p_s_bh); | ||
2104 | |||
2105 |     if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) | ||
2106 |         reiserfs_panic (p_s_sb, "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n", descr, level, p_s_bh); | ||
2107 | } | ||
2108 | #else | ||
2109 | /* Without CONFIG_REISERFS_CHECK the check is an empty function. */ | ||
2110 | static void tb_buffer_sanity_check (struct super_block * p_s_sb, | ||
2111 |                                     struct buffer_head * p_s_bh, | ||
2112 |                                     const char *descr, int level) | ||
2113 | {;} | ||
2114 | #endif | ||
2115 | |||
2116 | static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh) | ||
2117 | { /* hands bh to reiserfs_prepare_for_journal() with a last argument of 0 and passes the result through */ | ||
2118 |     return reiserfs_prepare_for_journal(s, bh, 0) ; | ||
2119 | } | ||
2120 | |||
2121 | /* Prepare every buffer collected in p_s_tb for the journal and wait until none | ||
2122 |    of them is locked.  Each pass walks the path buffers, then, for every level | ||
2123 |    with a non-zero insert_size, L/FL/CFL (when lnum is set) and R/FR/CFR (when | ||
2124 |    rnum is set), then the FEB list; it stops at the first buffer for which | ||
2125 |    clear_all_dirty_bits() returns 0, waits on that buffer and starts over. | ||
2126 |    Returns CARRY_ON after a pass that found nothing locked and REPEAT_SEARCH if | ||
2127 |    FILESYSTEM_CHANGED_TB() is true after a wait.  With CONFIG_REISERFS_CHECK it | ||
2128 |    gives up after 10000 waits and returns according to FILESYSTEM_CHANGED_TB(). */ | ||
2129 | static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb) | ||
2130 | { | ||
2131 |     struct buffer_head * locked; | ||
2132 | #ifdef CONFIG_REISERFS_CHECK | ||
2133 |     int repeat_counter = 0; | ||
2134 | #endif | ||
2135 |     int i; | ||
2136 | |||
2137 |     do { | ||
2138 |         locked = NULL; | ||
2139 |         /* buffers of the search path, from path_length downwards */ | ||
2140 |         for ( i = p_s_tb->tb_path->path_length; !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i-- ) { | ||
2141 |             if ( PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i) ) { | ||
2142 |                 /* if I understand correctly, we can only be sure the last buffer | ||
2143 |                 ** in the path is in the tree --clm | ||
2144 |                 */ | ||
2145 | #ifdef CONFIG_REISERFS_CHECK | ||
2146 |                 if (PATH_PLAST_BUFFER(p_s_tb->tb_path) == | ||
2147 |                     PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) { | ||
2148 |                     tb_buffer_sanity_check (p_s_tb->tb_sb, | ||
2149 |                                             PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i), | ||
2150 |                                             "S", | ||
2151 |                                             p_s_tb->tb_path->path_length - i); | ||
2152 |                 } | ||
2153 | #endif | ||
2154 |                 if (!clear_all_dirty_bits(p_s_tb->tb_sb, | ||
2155 |                                           PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i))) | ||
2156 |                 { | ||
2157 |                     locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i); | ||
2158 |                 } | ||
2159 |             } | ||
2160 |         } | ||
2161 | |||
2162 |         /* neighbors and their parents on every level with a non-zero insert_size */ | ||
2163 |         for ( i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i]; i++ ) { | ||
2164 |             if (p_s_tb->lnum[i] ) { | ||
2165 |                 if ( p_s_tb->L[i] ) { | ||
2166 |                     tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i); | ||
2167 |                     if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i])) | ||
2168 |                         locked = p_s_tb->L[i]; | ||
2169 |                 } | ||
2170 | |||
2171 |                 if ( !locked && p_s_tb->FL[i] ) { | ||
2172 |                     tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i); | ||
2173 |                     if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i])) | ||
2174 |                         locked = p_s_tb->FL[i]; | ||
2175 |                 } | ||
2176 | |||
2177 |                 if ( !locked && p_s_tb->CFL[i] ) { | ||
2178 |                     tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFL[i], "CFL", i); | ||
2179 |                     if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i])) | ||
2180 |                         locked = p_s_tb->CFL[i]; | ||
2181 |                 } | ||
2182 |             } | ||
2183 | |||
2184 |             if ( !locked && (p_s_tb->rnum[i]) ) { | ||
2185 |                 if ( p_s_tb->R[i] ) { | ||
2186 |                     tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i); | ||
2187 |                     if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i])) | ||
2188 |                         locked = p_s_tb->R[i]; | ||
2189 |                 } | ||
2190 | |||
2191 |                 if ( !locked && p_s_tb->FR[i] ) { | ||
2192 |                     tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i); | ||
2193 |                     if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i])) | ||
2194 |                         locked = p_s_tb->FR[i]; | ||
2195 |                 } | ||
2196 | |||
2197 |                 if ( !locked && p_s_tb->CFR[i] ) { | ||
2198 |                     tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i); | ||
2199 |                     if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i])) | ||
2200 |                         locked = p_s_tb->CFR[i]; | ||
2201 |                 } | ||
2202 |             } | ||
2203 |         } | ||
2204 | |||
2205 |         /* as far as I can tell, this is not required. The FEB list seems | ||
2206 |         ** to be full of newly allocated nodes, which will never be locked, | ||
2207 |         ** dirty, or anything else. | ||
2208 |         ** To be safe, I'm putting in the checks and waits in. For the moment, | ||
2209 |         ** they are needed to keep the code in journal.c from complaining | ||
2210 |         ** about the buffer. That code is inside CONFIG_REISERFS_CHECK as well. | ||
2211 |         ** --clm | ||
2212 |         */ | ||
2213 |         for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) { | ||
2214 |             if ( p_s_tb->FEB[i] ) { | ||
2215 |                 if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i])) | ||
2216 |                     locked = p_s_tb->FEB[i] ; | ||
2217 |             } | ||
2218 |         } | ||
2219 | |||
2220 |         if (locked) { | ||
2221 | #ifdef CONFIG_REISERFS_CHECK | ||
2222 |             repeat_counter++; | ||
2223 |             if ( (repeat_counter % 10000) == 0) { | ||
2224 |                 reiserfs_warning (p_s_tb->tb_sb, | ||
2225 |                                   "wait_tb_buffers_until_unlocked(): too many " | ||
2226 |                                   "iterations waiting for buffer to unlock " | ||
2227 |                                   "(%b)", locked); | ||
2228 |                 /* Don't loop forever. Try to recover from possible error. */ | ||
2229 |                 return ( FILESYSTEM_CHANGED_TB (p_s_tb) ) ? REPEAT_SEARCH : CARRY_ON; | ||
2230 |             } | ||
2231 | #endif | ||
2232 |             __wait_on_buffer (locked); | ||
2233 |             if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { | ||
2234 |                 return REPEAT_SEARCH; | ||
2235 |             } | ||
2236 |         } | ||
2237 |     } while (locked); | ||
2238 |     return CARRY_ON; | ||
2239 | } | ||
2240 | |||
2241 | |||
2242 | /* Prepare for balancing, that is | ||
2243 | * get all necessary parents, and neighbors; | ||
2244 | * analyze what and where should be moved; | ||
2245 | * get sufficient number of new nodes; | ||
2246 | * Balancing will start only after all resources have been collected. | ||
2247 | * | ||
2248 | * When ported to SMP kernels, only at the last moment after all needed nodes | ||
2249 | * are collected in cache, will the resources be locked using the usual | ||
2250 | * textbook ordered lock acquisition algorithms. Note that ensuring that | ||
2251 | * this code neither write locks what it does not need to write lock nor locks out of order | ||
2252 | * will be a pain in the butt that could have been avoided. Grumble grumble. -Hans | ||
2253 | * | ||
2254 | * "fix" is meant in the sense of "render unchanging" | ||
2255 | * | ||
2256 | * Latency might be improved by first gathering a list of what buffers are needed | ||
2257 | * and then getting as many of them in parallel as possible? -Hans | ||
2258 | * | ||
2259 | * Parameters: | ||
2260 | * n_op_mode i - insert, d - delete, c - cut (truncate), p - paste (append) | ||
2261 | * p_s_tb tree_balance structure; the item number in S[0] and the position in the item are read from its path | ||
2262 | * p_s_ins_ih item head of the item being inserted | ||
2263 | * data inserted item or data to be pasted | ||
2264 | * Returns: CARRY_ON - everything needed for balancing was collected; | ||
2265 | * REPEAT_SEARCH - the filesystem changed under us, or get_mem_for_virtual_node asked for it: repeat the search; | ||
2266 | * any other non-CARRY_ON result of a failed step (e.g. no disk space) is returned unchanged. | ||
2267 | * Once the per-level loop has started, a failure releases the path and the collected buffers before returning. | ||
2268 | */ | ||
2269 | |||
2270 | |||
2271 | int fix_nodes (int n_op_mode, | ||
2272 | struct tree_balance * p_s_tb, | ||
2273 | struct item_head * p_s_ins_ih, // item head of item being inserted | ||
2274 | const void * data // inserted item or data to be pasted | ||
2275 | ) { | ||
2276 | int n_ret_value, | ||
2277 | n_h, | ||
2278 | n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path); | ||
2279 | int n_pos_in_item; | ||
2280 | |||
2281 | /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared | ||
2282 | ** during wait_tb_buffers_until_unlocked | ||
2283 | */ | ||
2284 | int wait_tb_buffers_run = 0 ; | ||
2285 | struct buffer_head * p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path); | ||
2286 | |||
2287 | ++ REISERFS_SB(p_s_tb -> tb_sb) -> s_fix_nodes; | ||
2288 | |||
2289 | n_pos_in_item = p_s_tb->tb_path->pos_in_item; | ||
2290 | |||
2291 | |||
2292 | p_s_tb->fs_gen = get_generation (p_s_tb->tb_sb); /* generation snapshot; presumably what FILESYSTEM_CHANGED_TB compares against - confirm */ | ||
2293 | |||
2294 | /* we prepare and log the super here so it will already be in the | ||
2295 | ** transaction when do_balance needs to change it. | ||
2296 | ** This way do_balance won't have to schedule when trying to prepare | ||
2297 | ** the super for logging | ||
2298 | */ | ||
2299 | reiserfs_prepare_for_journal(p_s_tb->tb_sb, | ||
2300 | SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1) ; | ||
2301 | journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb, | ||
2302 | SB_BUFFER_WITH_SB(p_s_tb->tb_sb)) ; | ||
2303 | if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) | ||
2304 | return REPEAT_SEARCH; | ||
2305 | |||
2306 | /* S[0] may be locked here; it is possible in indirect_to_direct conversion */ | ||
2307 | if (buffer_locked (p_s_tbS0)) { | ||
2308 | __wait_on_buffer (p_s_tbS0); | ||
2309 | if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) | ||
2310 | return REPEAT_SEARCH; | ||
2311 | } | ||
2312 | |||
2313 | #ifdef CONFIG_REISERFS_CHECK | ||
2314 | if ( cur_tb ) { | ||
2315 | print_cur_tb ("fix_nodes"); | ||
2316 | reiserfs_panic(p_s_tb->tb_sb,"PAP-8305: fix_nodes: there is pending do_balance"); | ||
2317 | } | ||
2318 | |||
2319 | if (!buffer_uptodate (p_s_tbS0) || !B_IS_IN_TREE (p_s_tbS0)) { | ||
2320 | reiserfs_panic (p_s_tb->tb_sb, "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate " | ||
2321 | "at the beginning of fix_nodes or not in tree (mode %c)", p_s_tbS0, p_s_tbS0, n_op_mode); | ||
2322 | } | ||
2323 | |||
2324 | /* Check parameters: the item number must fit S[0] for the requested mode. */ | ||
2325 | switch (n_op_mode) { | ||
2326 | case M_INSERT: | ||
2327 | if ( n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0) ) | ||
2328 | reiserfs_panic(p_s_tb->tb_sb,"PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert", | ||
2329 | n_item_num, B_NR_ITEMS(p_s_tbS0)); | ||
2330 | break; | ||
2331 | case M_PASTE: | ||
2332 | case M_DELETE: | ||
2333 | case M_CUT: | ||
2334 | if ( n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0) ) { | ||
2335 | print_block (p_s_tbS0, 0, -1, -1); | ||
2336 | reiserfs_panic(p_s_tb->tb_sb,"PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n", n_item_num, n_op_mode, p_s_tb->insert_size[0]); | ||
2337 | } | ||
2338 | break; | ||
2339 | default: | ||
2340 | reiserfs_panic(p_s_tb->tb_sb,"PAP-8340: fix_nodes: Incorrect mode of operation"); | ||
2341 | } | ||
2342 | #endif | ||
2343 | |||
2344 | if (get_mem_for_virtual_node (p_s_tb) == REPEAT_SEARCH) | ||
2345 | // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat | ||
2346 | return REPEAT_SEARCH; | ||
2347 | |||
2348 | |||
2349 | /* Starting from the leaf level; for all levels n_h of the tree. */ | ||
2350 | for ( n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++ ) { | ||
2351 | if ( (n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON ) { | ||
2352 | goto repeat; | ||
2353 | } | ||
2354 | |||
2355 | if ( (n_ret_value = check_balance (n_op_mode, p_s_tb, n_h, n_item_num, | ||
2356 | n_pos_in_item, p_s_ins_ih, data)) != CARRY_ON ) { | ||
2357 | if ( n_ret_value == NO_BALANCING_NEEDED ) { | ||
2358 | /* No balancing for higher levels needed. */ | ||
2359 | if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) { | ||
2360 | goto repeat; | ||
2361 | } | ||
2362 | if ( n_h != MAX_HEIGHT - 1 ) | ||
2363 | p_s_tb->insert_size[n_h + 1] = 0; | ||
2364 | /* ok, analysis and resource gathering are complete */ | ||
2365 | break; | ||
2366 | } | ||
2367 | goto repeat; | ||
2368 | } | ||
2369 | |||
2370 | if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) { | ||
2371 | goto repeat; | ||
2372 | } | ||
2373 | |||
2374 | if ( (n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON ) { | ||
2375 | goto repeat; /* No disk space, or schedule occurred and | ||
2376 | analysis may be invalid and needs to be redone. */ | ||
2377 | } | ||
2378 | |||
2379 | if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h) ) { | ||
2380 | /* We have a positive insert size but no nodes exist on this | ||
2381 | level, this means that we are creating a new root. */ | ||
2382 | |||
2383 | RFALSE( p_s_tb->blknum[n_h] != 1, | ||
2384 | "PAP-8350: creating new empty root"); | ||
2385 | |||
2386 | if ( n_h < MAX_HEIGHT - 1 ) | ||
2387 | p_s_tb->insert_size[n_h + 1] = 0; | ||
2388 | } | ||
2389 | else | ||
2390 | if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1) ) { | ||
2391 | if ( p_s_tb->blknum[n_h] > 1 ) { | ||
2392 | /* The tree needs to be grown, so this node S[n_h] | ||
2393 | which is the root node is split into two nodes, | ||
2394 | and a new node (S[n_h+1]) will be created to | ||
2395 | become the root node. */ | ||
2396 | |||
2397 | RFALSE( n_h == MAX_HEIGHT - 1, | ||
2398 | "PAP-8355: attempt to create too high of a tree"); | ||
2399 | |||
2400 | p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) + DC_SIZE; | ||
2401 | } | ||
2402 | else | ||
2403 | if ( n_h < MAX_HEIGHT - 1 ) | ||
2404 | p_s_tb->insert_size[n_h + 1] = 0; | ||
2405 | } | ||
2406 | else | ||
2407 | p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1); /* NOTE(review): unlike the branches above, n_h + 1 is not bounded by MAX_HEIGHT here - confirm it cannot be reached with n_h == MAX_HEIGHT - 1 */ | ||
2408 | } | ||
2409 | |||
2410 | if ((n_ret_value = wait_tb_buffers_until_unlocked (p_s_tb)) == CARRY_ON) { | ||
2411 | if (FILESYSTEM_CHANGED_TB(p_s_tb)) { | ||
2412 | wait_tb_buffers_run = 1 ; /* the repeat path must restore what the wait prepared */ | ||
2413 | n_ret_value = REPEAT_SEARCH ; | ||
2414 | goto repeat; | ||
2415 | } else { | ||
2416 | return CARRY_ON; | ||
2417 | } | ||
2418 | } else { | ||
2419 | wait_tb_buffers_run = 1 ; | ||
2420 | goto repeat; | ||
2421 | } | ||
2422 | |||
2423 | repeat: | ||
2424 | // fix_nodes was unable to perform its calculation due to | ||
2425 | // filesystem got changed under us, lack of free disk space or i/o | ||
2426 | // failure. If the first is the case - the search will be | ||
2427 | // repeated. For now - free all resources acquired so far except | ||
2428 | // for the new allocated nodes | ||
2429 | { | ||
2430 | int i; | ||
2431 | |||
2432 | /* Release path buffers. */ | ||
2433 | if (wait_tb_buffers_run) { | ||
2434 | pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path) ; | ||
2435 | } else { | ||
2436 | pathrelse (p_s_tb->tb_path); | ||
2437 | } | ||
2438 | /* brelse all resources collected for balancing */ | ||
2439 | for ( i = 0; i < MAX_HEIGHT; i++ ) { | ||
2440 | if (wait_tb_buffers_run) { | ||
2441 | reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->L[i]); | ||
2442 | reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->R[i]); | ||
2443 | reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FL[i]); | ||
2444 | reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FR[i]); | ||
2445 | reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFL[i]); | ||
2446 | reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFR[i]); | ||
2447 | } | ||
2448 | |||
2449 | brelse (p_s_tb->L[i]);p_s_tb->L[i] = NULL; | ||
2450 | brelse (p_s_tb->R[i]);p_s_tb->R[i] = NULL; | ||
2451 | brelse (p_s_tb->FL[i]);p_s_tb->FL[i] = NULL; | ||
2452 | brelse (p_s_tb->FR[i]);p_s_tb->FR[i] = NULL; | ||
2453 | brelse (p_s_tb->CFL[i]);p_s_tb->CFL[i] = NULL; | ||
2454 | brelse (p_s_tb->CFR[i]);p_s_tb->CFR[i] = NULL; | ||
2455 | } | ||
2456 | |||
2457 | if (wait_tb_buffers_run) { | ||
2458 | for ( i = 0; i < MAX_FEB_SIZE; i++ ) { | ||
2459 | if ( p_s_tb->FEB[i] ) { | ||
2460 | reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, | ||
2461 | p_s_tb->FEB[i]) ; | ||
2462 | } | ||
2463 | } | ||
2464 | } | ||
2465 | return n_ret_value; | ||
2466 | } | ||
2467 | |||
2468 | } | ||
2469 | |||
2470 | |||
2471 | /* Release everything held in tb: path, neighbor and parent buffers, FEB and used lists, vn_buf. (Anatoly | ||
2472 | will probably forgive me renaming p_s_tb to tb. I just wanted to make lines shorter) */ | ||
2473 | void unfix_nodes (struct tree_balance * tb) | ||
2474 | { | ||
2475 |     struct super_block * sb = tb->tb_sb; | ||
2476 |     struct buffer_head * held[6]; | ||
2477 |     int h, k; | ||
2478 | |||
2479 |     /* Release path buffers. */ | ||
2480 |     pathrelse_and_restore (sb, tb->tb_path); | ||
2481 | |||
2482 |     /* restore, then brelse, all resources collected for balancing */ | ||
2483 |     for ( h = 0; h < MAX_HEIGHT; h++ ) { | ||
2484 |         held[0] = tb->L[h]; | ||
2485 |         held[1] = tb->R[h]; | ||
2486 |         held[2] = tb->FL[h]; | ||
2487 |         held[3] = tb->FR[h]; | ||
2488 |         held[4] = tb->CFL[h]; | ||
2489 |         held[5] = tb->CFR[h]; | ||
2490 | |||
2491 |         for ( k = 0; k < 6; k++ ) | ||
2492 |             reiserfs_restore_prepared_buffer (sb, held[k]); | ||
2493 |         for ( k = 0; k < 6; k++ ) | ||
2494 |             brelse (held[k]); | ||
2495 |     } | ||
2496 | |||
2497 |     /* deal with list of allocated (used and unused) nodes */ | ||
2498 |     for ( k = 0; k < MAX_FEB_SIZE; k++ ) { | ||
2499 |         struct buffer_head * unused = tb->FEB[k]; | ||
2500 | |||
2501 |         if ( unused ) { | ||
2502 |             /* block was not used by balancing: release its buffer, then de-allocate the block */ | ||
2503 |             b_blocknr_t blocknr = unused->b_blocknr ; | ||
2504 |             brelse (unused); | ||
2505 |             reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0); | ||
2506 |         } | ||
2507 |         /* release used as new nodes including a new root */ | ||
2508 |         if (tb->used[k]) | ||
2509 |             brelse (tb->used[k]); | ||
2510 |     } | ||
2511 | |||
2512 |     if (tb->vn_buf) | ||
2513 |         reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb); | ||
2514 | |||
2515 | } | ||
2516 | |||
2517 | |||
2518 | |||
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c new file mode 100644 index 000000000000..08d0508c2d39 --- /dev/null +++ b/fs/reiserfs/hashes.c | |||
@@ -0,0 +1,209 @@ | |||
1 | |||
2 | /* | ||
3 | * Keyed 32-bit hash function using TEA in a Davies-Meyer function | ||
4 | * H0 = Key | ||
5 | * Hi = E Mi(Hi-1) + Hi-1 | ||
6 | * | ||
7 | * (see Applied Cryptography, 2nd edition, p448). | ||
8 | * | ||
9 | * Jeremy Fitzhardinge <jeremy@zip.com.au> 1998 | ||
10 | * | ||
11 | * Jeremy has agreed to the contents of reiserfs/README. -Hans | ||
12 | * Yura's function is added (04/07/2000) | ||
13 | */ | ||
14 | |||
15 | // | ||
16 | // keyed_hash | ||
17 | // yura_hash | ||
18 | // r5_hash | ||
19 | // | ||
20 | |||
21 | #include <linux/kernel.h> | ||
22 | #include <asm/types.h> | ||
23 | #include <asm/bug.h> | ||
24 | |||
25 | |||
26 | #define DELTA 0x9E3779B9 | ||
27 | #define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ | ||
28 | #define PARTROUNDS 6 /* 6 gets complete mixing */ | ||
29 | |||
30 | /* TEACORE(rounds): mixes copies of h0, h1 with a, b, c, d for `rounds` iterations and adds the result back into h0, h1. All six names come from the caller's scope: a, b, c, d - data; h0, h1 - accumulated hash */ | ||
31 | #define TEACORE(rounds) \ | ||
32 | do { \ | ||
33 | u32 sum = 0; \ | ||
34 | int n = rounds; \ | ||
35 | u32 b0, b1; \ | ||
36 | \ | ||
37 | b0 = h0; \ | ||
38 | b1 = h1; \ | ||
39 | \ | ||
40 | do \ | ||
41 | { \ | ||
42 | sum += DELTA; \ | ||
43 | b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \ | ||
44 | b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \ | ||
45 | } while(--n); /* body runs before the test, so rounds must be at least 1 */ \ | ||
46 | \ | ||
47 | h0 += b0; \ | ||
48 | h1 += b1; \ | ||
49 | } while(0) | ||
50 | |||
51 | |||
52 | u32 keyed_hash(const signed char *msg, int len) | ||
53 | { /* Hashes len bytes of msg: whole 16-byte blocks go through TEACORE(PARTROUNDS), the padded remainder through TEACORE(FULLROUNDS); returns h0^h1. */ | ||
54 | u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3}; /* only k[0] and k[1] are read below */ | ||
55 | |||
56 | u32 h0 = k[0], h1 = k[1]; | ||
57 | u32 a, b, c, d; | ||
58 | u32 pad; | ||
59 | int i; | ||
60 | |||
61 | // NOTE(review): the disabled assert suggests 0 <= len < 256 is assumed: assert(len >= 0 && len < 256); | ||
62 | |||
63 | pad = (u32)len | ((u32)len << 8); /* with len < 256, pad ends up holding len in each of its four bytes */ | ||
64 | pad |= pad << 16; | ||
65 | |||
66 | while(len >= 16) | ||
67 | { /* msg is signed char, so each (u32) cast below sign-extends bytes >= 0x80 before the shift */ | ||
68 | a = (u32)msg[ 0] | | ||
69 | (u32)msg[ 1] << 8 | | ||
70 | (u32)msg[ 2] << 16| | ||
71 | (u32)msg[ 3] << 24; | ||
72 | b = (u32)msg[ 4] | | ||
73 | (u32)msg[ 5] << 8 | | ||
74 | (u32)msg[ 6] << 16| | ||
75 | (u32)msg[ 7] << 24; | ||
76 | c = (u32)msg[ 8] | | ||
77 | (u32)msg[ 9] << 8 | | ||
78 | (u32)msg[10] << 16| | ||
79 | (u32)msg[11] << 24; | ||
80 | d = (u32)msg[12] | | ||
81 | (u32)msg[13] << 8 | | ||
82 | (u32)msg[14] << 16| | ||
83 | (u32)msg[15] << 24; | ||
84 | |||
85 | TEACORE(PARTROUNDS); | ||
86 | |||
87 | len -= 16; | ||
88 | msg += 16; | ||
89 | } | ||
90 | |||
91 | if (len >= 12) | ||
92 | { /* 12..15 bytes left: a, b, c are full words, d starts as pad and takes the rest */ | ||
93 | a = (u32)msg[ 0] | | ||
94 | (u32)msg[ 1] << 8 | | ||
95 | (u32)msg[ 2] << 16| | ||
96 | (u32)msg[ 3] << 24; | ||
97 | b = (u32)msg[ 4] | | ||
98 | (u32)msg[ 5] << 8 | | ||
99 | (u32)msg[ 6] << 16| | ||
100 | (u32)msg[ 7] << 24; | ||
101 | c = (u32)msg[ 8] | | ||
102 | (u32)msg[ 9] << 8 | | ||
103 | (u32)msg[10] << 16| | ||
104 | (u32)msg[11] << 24; | ||
105 | |||
106 | d = pad; | ||
107 | for(i = 12; i < len; i++) | ||
108 | { | ||
109 | d <<= 8; | ||
110 | d |= msg[i]; | ||
111 | } | ||
112 | } | ||
113 | else if (len >= 8) | ||
114 | { /* 8..11 bytes left: a, b are full words, c takes the rest, d is pad */ | ||
115 | a = (u32)msg[ 0] | | ||
116 | (u32)msg[ 1] << 8 | | ||
117 | (u32)msg[ 2] << 16| | ||
118 | (u32)msg[ 3] << 24; | ||
119 | b = (u32)msg[ 4] | | ||
120 | (u32)msg[ 5] << 8 | | ||
121 | (u32)msg[ 6] << 16| | ||
122 | (u32)msg[ 7] << 24; | ||
123 | |||
124 | c = d = pad; | ||
125 | for(i = 8; i < len; i++) | ||
126 | { | ||
127 | c <<= 8; | ||
128 | c |= msg[i]; | ||
129 | } | ||
130 | } | ||
131 | else if (len >= 4) | ||
132 | { /* 4..7 bytes left: a is a full word, b takes the rest, c and d are pad */ | ||
133 | a = (u32)msg[ 0] | | ||
134 | (u32)msg[ 1] << 8 | | ||
135 | (u32)msg[ 2] << 16| | ||
136 | (u32)msg[ 3] << 24; | ||
137 | |||
138 | b = c = d = pad; | ||
139 | for(i = 4; i < len; i++) | ||
140 | { | ||
141 | b <<= 8; | ||
142 | b |= msg[i]; | ||
143 | } | ||
144 | } | ||
145 | else | ||
146 | { /* 0..3 bytes left: a takes them, b, c and d are pad */ | ||
147 | a = b = c = d = pad; | ||
148 | for(i = 0; i < len; i++) | ||
149 | { | ||
150 | a <<= 8; | ||
151 | a |= msg[i]; | ||
152 | } | ||
153 | } | ||
154 | |||
155 | TEACORE(FULLROUNDS); | ||
156 | |||
157 | /* combine both accumulated words into the 32-bit result */ | ||
158 | return h0^h1; | ||
159 | } | ||
160 | |||
161 | /* What follows in this file is copyright 2000 by Hans Reiser, and the | ||
162 | * licensing of what follows is governed by reiserfs/README */ | ||
163 | |||
164 | u32 yura_hash (const signed char *msg, int len) | ||
165 | { | ||
166 |     /* Decimal-style hash: every byte minus 48 ('0') is weighted by a power of | ||
167 |        ten that shrinks towards the end of the name, fixed terms are added for | ||
168 |        the positions up to 255, and the sum is shifted left by 7. | ||
169 |        pow is u32, not int: for longer names the powers of ten and the products | ||
170 |        built from them exceed INT_MAX, which is undefined for signed int. | ||
171 |        Unsigned arithmetic wraps modulo 2^32, giving the same bits as two's-complement wrap-around. */ | ||
172 |     int i, j; | ||
173 |     u32 a, c, pow; | ||
174 |     for (pow=1,i=1; i < len; i++) pow = pow * 10; | ||
175 |     if (len == 1) | ||
176 |         a = msg[0]-48; | ||
177 |     else | ||
178 |         a = (msg[0] - 48) * pow; | ||
179 |     for (i=1; i < len; i++) { | ||
180 |         c = msg[i] - 48; | ||
181 |         for (pow=1,j=i; j < len-1; j++) pow = pow * 10; | ||
182 |         a = a + c * pow; | ||
183 |     } | ||
184 |     /* positions below 40 are padded with '0' digits: c is 0, so nothing is added */ | ||
185 |     for (; i < 40; i++) { | ||
186 |         c = '0' - 48; | ||
187 |         for (pow=1,j=i; j < len-1; j++) pow = pow * 10; | ||
188 |         a = a + c * pow; | ||
189 |     } | ||
190 |     /* the remaining positions up to 255 contribute their own index */ | ||
191 |     for (; i < 256; i++) { | ||
192 |         c = i; | ||
193 |         for (pow=1,j=i; j < len-1; j++) pow = pow * 10; | ||
194 |         a = a + c * pow; | ||
195 |     } | ||
196 |     return a << 7; | ||
197 | } | ||
198 | |||
199 | u32 r5_hash (const signed char *msg, int len) | ||
200 | { | ||
201 |     /* Folds every byte up to the terminating NUL into the hash; len is not used. */ | ||
202 |     u32 hash = 0; | ||
203 |     for (; *msg; msg++) { | ||
204 |         hash += *msg << 4; | ||
205 |         hash += *msg >> 4; | ||
206 |         hash *= 11; | ||
207 |     } | ||
208 |     return hash; | ||
209 | } | ||
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c new file mode 100644 index 000000000000..a362125da0d8 --- /dev/null +++ b/fs/reiserfs/ibalance.c | |||
@@ -0,0 +1,1058 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | */ | ||
4 | |||
5 | #include <linux/config.h> | ||
6 | #include <asm/uaccess.h> | ||
7 | #include <linux/string.h> | ||
8 | #include <linux/time.h> | ||
9 | #include <linux/reiserfs_fs.h> | ||
10 | #include <linux/buffer_head.h> | ||
11 | |||
12 | /* this is the one and only function that is used outside (do_balance.c) */ | ||
13 | int balance_internal ( | ||
14 | struct tree_balance * , | ||
15 | int, | ||
16 | int, | ||
17 | struct item_head * , | ||
18 | struct buffer_head ** | ||
19 | ); | ||
20 | |||
21 | /* modes of internal_shift_left, internal_shift_right and internal_insert_childs */ | ||
22 | #define INTERNAL_SHIFT_FROM_S_TO_L 0 | ||
23 | #define INTERNAL_SHIFT_FROM_R_TO_S 1 | ||
24 | #define INTERNAL_SHIFT_FROM_L_TO_S 2 | ||
25 | #define INTERNAL_SHIFT_FROM_S_TO_R 3 | ||
26 | #define INTERNAL_INSERT_TO_S 4 | ||
27 | #define INTERNAL_INSERT_TO_L 5 | ||
28 | #define INTERNAL_INSERT_TO_R 6 | ||
29 | |||
30 | static void internal_define_dest_src_infos ( | ||
31 | int shift_mode, | ||
32 | struct tree_balance * tb, | ||
33 | int h, | ||
34 | struct buffer_info * dest_bi, | ||
35 | struct buffer_info * src_bi, | ||
36 | int * d_key, | ||
37 | struct buffer_head ** cf | ||
38 | ) | ||
39 | { | ||
40 | memset (dest_bi, 0, sizeof (struct buffer_info)); | ||
41 | memset (src_bi, 0, sizeof (struct buffer_info)); | ||
42 | /* define dest, src, dest parent, dest position */ | ||
43 | switch (shift_mode) { | ||
44 | case INTERNAL_SHIFT_FROM_S_TO_L: /* used in internal_shift_left */ | ||
45 | src_bi->tb = tb; | ||
46 | src_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); | ||
47 | src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); | ||
48 | src_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); | ||
49 | dest_bi->tb = tb; | ||
50 | dest_bi->bi_bh = tb->L[h]; | ||
51 | dest_bi->bi_parent = tb->FL[h]; | ||
52 | dest_bi->bi_position = get_left_neighbor_position (tb, h); | ||
53 | *d_key = tb->lkey[h]; | ||
54 | *cf = tb->CFL[h]; | ||
55 | break; | ||
56 | case INTERNAL_SHIFT_FROM_L_TO_S: | ||
57 | src_bi->tb = tb; | ||
58 | src_bi->bi_bh = tb->L[h]; | ||
59 | src_bi->bi_parent = tb->FL[h]; | ||
60 | src_bi->bi_position = get_left_neighbor_position (tb, h); | ||
61 | dest_bi->tb = tb; | ||
62 | dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); | ||
63 | dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); | ||
64 | dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); /* dest position is analog of dest->b_item_order */ | ||
65 | *d_key = tb->lkey[h]; | ||
66 | *cf = tb->CFL[h]; | ||
67 | break; | ||
68 | |||
69 | case INTERNAL_SHIFT_FROM_R_TO_S: /* used in internal_shift_left */ | ||
70 | src_bi->tb = tb; | ||
71 | src_bi->bi_bh = tb->R[h]; | ||
72 | src_bi->bi_parent = tb->FR[h]; | ||
73 | src_bi->bi_position = get_right_neighbor_position (tb, h); | ||
74 | dest_bi->tb = tb; | ||
75 | dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); | ||
76 | dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); | ||
77 | dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); | ||
78 | *d_key = tb->rkey[h]; | ||
79 | *cf = tb->CFR[h]; | ||
80 | break; | ||
81 | |||
82 | case INTERNAL_SHIFT_FROM_S_TO_R: | ||
83 | src_bi->tb = tb; | ||
84 | src_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); | ||
85 | src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); | ||
86 | src_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); | ||
87 | dest_bi->tb = tb; | ||
88 | dest_bi->bi_bh = tb->R[h]; | ||
89 | dest_bi->bi_parent = tb->FR[h]; | ||
90 | dest_bi->bi_position = get_right_neighbor_position (tb, h); | ||
91 | *d_key = tb->rkey[h]; | ||
92 | *cf = tb->CFR[h]; | ||
93 | break; | ||
94 | |||
95 | case INTERNAL_INSERT_TO_L: | ||
96 | dest_bi->tb = tb; | ||
97 | dest_bi->bi_bh = tb->L[h]; | ||
98 | dest_bi->bi_parent = tb->FL[h]; | ||
99 | dest_bi->bi_position = get_left_neighbor_position (tb, h); | ||
100 | break; | ||
101 | |||
102 | case INTERNAL_INSERT_TO_S: | ||
103 | dest_bi->tb = tb; | ||
104 | dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); | ||
105 | dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); | ||
106 | dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); | ||
107 | break; | ||
108 | |||
109 | case INTERNAL_INSERT_TO_R: | ||
110 | dest_bi->tb = tb; | ||
111 | dest_bi->bi_bh = tb->R[h]; | ||
112 | dest_bi->bi_parent = tb->FR[h]; | ||
113 | dest_bi->bi_position = get_right_neighbor_position (tb, h); | ||
114 | break; | ||
115 | |||
116 | default: | ||
117 | reiserfs_panic (tb->tb_sb, "internal_define_dest_src_infos: shift type is unknown (%d)", shift_mode); | ||
118 | } | ||
119 | } | ||
120 | |||
121 | |||
122 | |||
123 | /* Insert count node pointers into buffer cur before position to + 1. | ||
124 | * Insert count items into buffer cur before position to. | ||
125 | * Items and node pointers are specified by inserted and bh respectively. | ||
126 | */ | ||
127 | static void internal_insert_childs (struct buffer_info * cur_bi, | ||
128 | int to, int count, | ||
129 | struct item_head * inserted, | ||
130 | struct buffer_head ** bh | ||
131 | ) | ||
132 | { | ||
133 | struct buffer_head * cur = cur_bi->bi_bh; | ||
134 | struct block_head * blkh; | ||
135 | int nr; | ||
136 | struct reiserfs_key * ih; | ||
137 | struct disk_child new_dc[2]; | ||
138 | struct disk_child * dc; | ||
139 | int i; | ||
140 | |||
141 | if (count <= 0) | ||
142 | return; | ||
143 | |||
144 | blkh = B_BLK_HEAD(cur); | ||
145 | nr = blkh_nr_item(blkh); | ||
146 | |||
147 | RFALSE( count > 2, | ||
148 | "too many children (%d) are to be inserted", count); | ||
149 | RFALSE( B_FREE_SPACE (cur) < count * (KEY_SIZE + DC_SIZE), | ||
150 | "no enough free space (%d), needed %d bytes", | ||
151 | B_FREE_SPACE (cur), count * (KEY_SIZE + DC_SIZE)); | ||
152 | |||
153 | /* prepare space for count disk_child */ | ||
154 | dc = B_N_CHILD(cur,to+1); | ||
155 | |||
156 | memmove (dc + count, dc, (nr+1-(to+1)) * DC_SIZE); | ||
157 | |||
158 | /* copy to_be_insert disk children */ | ||
159 | for (i = 0; i < count; i ++) { | ||
160 | put_dc_size( &(new_dc[i]), MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i])); | ||
161 | put_dc_block_number( &(new_dc[i]), bh[i]->b_blocknr ); | ||
162 | } | ||
163 | memcpy (dc, new_dc, DC_SIZE * count); | ||
164 | |||
165 | |||
166 | /* prepare space for count items */ | ||
167 | ih = B_N_PDELIM_KEY (cur, ((to == -1) ? 0 : to)); | ||
168 | |||
169 | memmove (ih + count, ih, (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE); | ||
170 | |||
171 | /* copy item headers (keys) */ | ||
172 | memcpy (ih, inserted, KEY_SIZE); | ||
173 | if ( count > 1 ) | ||
174 | memcpy (ih + 1, inserted + 1, KEY_SIZE); | ||
175 | |||
176 | /* sizes, item number */ | ||
177 | set_blkh_nr_item( blkh, blkh_nr_item(blkh) + count ); | ||
178 | set_blkh_free_space( blkh, | ||
179 | blkh_free_space(blkh) - count * (DC_SIZE + KEY_SIZE ) ); | ||
180 | |||
181 | do_balance_mark_internal_dirty (cur_bi->tb, cur,0); | ||
182 | |||
183 | /*&&&&&&&&&&&&&&&&&&&&&&&&*/ | ||
184 | check_internal (cur); | ||
185 | /*&&&&&&&&&&&&&&&&&&&&&&&&*/ | ||
186 | |||
187 | if (cur_bi->bi_parent) { | ||
188 | struct disk_child *t_dc = B_N_CHILD (cur_bi->bi_parent,cur_bi->bi_position); | ||
189 | put_dc_size( t_dc, dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE))); | ||
190 | do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, 0); | ||
191 | |||
192 | /*&&&&&&&&&&&&&&&&&&&&&&&&*/ | ||
193 | check_internal (cur_bi->bi_parent); | ||
194 | /*&&&&&&&&&&&&&&&&&&&&&&&&*/ | ||
195 | } | ||
196 | |||
197 | } | ||
198 | |||
199 | |||
200 | /* Delete del_num items and node pointers from buffer cur starting from * | ||
201 | * the first_i'th item and first_p'th pointers respectively. */ | ||
202 | static void internal_delete_pointers_items ( | ||
203 | struct buffer_info * cur_bi, | ||
204 | int first_p, | ||
205 | int first_i, | ||
206 | int del_num | ||
207 | ) | ||
208 | { | ||
209 | struct buffer_head * cur = cur_bi->bi_bh; | ||
210 | int nr; | ||
211 | struct block_head * blkh; | ||
212 | struct reiserfs_key * key; | ||
213 | struct disk_child * dc; | ||
214 | |||
215 | RFALSE( cur == NULL, "buffer is 0"); | ||
216 | RFALSE( del_num < 0, | ||
217 | "negative number of items (%d) can not be deleted", del_num); | ||
218 | RFALSE( first_p < 0 || first_p + del_num > B_NR_ITEMS (cur) + 1 || first_i < 0, | ||
219 | "first pointer order (%d) < 0 or " | ||
220 | "no so many pointers (%d), only (%d) or " | ||
221 | "first key order %d < 0", first_p, | ||
222 | first_p + del_num, B_NR_ITEMS (cur) + 1, first_i); | ||
223 | if ( del_num == 0 ) | ||
224 | return; | ||
225 | |||
226 | blkh = B_BLK_HEAD(cur); | ||
227 | nr = blkh_nr_item(blkh); | ||
228 | |||
229 | if ( first_p == 0 && del_num == nr + 1 ) { | ||
230 | RFALSE( first_i != 0, "1st deleted key must have order 0, not %d", first_i); | ||
231 | make_empty_node (cur_bi); | ||
232 | return; | ||
233 | } | ||
234 | |||
235 | RFALSE( first_i + del_num > B_NR_ITEMS (cur), | ||
236 | "first_i = %d del_num = %d " | ||
237 | "no so many keys (%d) in the node (%b)(%z)", | ||
238 | first_i, del_num, first_i + del_num, cur, cur); | ||
239 | |||
240 | |||
241 | /* deleting */ | ||
242 | dc = B_N_CHILD (cur, first_p); | ||
243 | |||
244 | memmove (dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE); | ||
245 | key = B_N_PDELIM_KEY (cur, first_i); | ||
246 | memmove (key, key + del_num, (nr - first_i - del_num) * KEY_SIZE + (nr + 1 - del_num) * DC_SIZE); | ||
247 | |||
248 | |||
249 | /* sizes, item number */ | ||
250 | set_blkh_nr_item( blkh, blkh_nr_item(blkh) - del_num ); | ||
251 | set_blkh_free_space( blkh, | ||
252 | blkh_free_space(blkh) + (del_num * (KEY_SIZE + DC_SIZE) ) ); | ||
253 | |||
254 | do_balance_mark_internal_dirty (cur_bi->tb, cur, 0); | ||
255 | /*&&&&&&&&&&&&&&&&&&&&&&&*/ | ||
256 | check_internal (cur); | ||
257 | /*&&&&&&&&&&&&&&&&&&&&&&&*/ | ||
258 | |||
259 | if (cur_bi->bi_parent) { | ||
260 | struct disk_child *t_dc; | ||
261 | t_dc = B_N_CHILD (cur_bi->bi_parent, cur_bi->bi_position); | ||
262 | put_dc_size( t_dc, dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE) ) ); | ||
263 | |||
264 | do_balance_mark_internal_dirty (cur_bi->tb, cur_bi->bi_parent,0); | ||
265 | /*&&&&&&&&&&&&&&&&&&&&&&&&*/ | ||
266 | check_internal (cur_bi->bi_parent); | ||
267 | /*&&&&&&&&&&&&&&&&&&&&&&&&*/ | ||
268 | } | ||
269 | } | ||
270 | |||
271 | |||
/*
 * Delete n node pointers starting at pointer index `from', together with
 * n keys.  The first deleted key is the one at index from - 1, or at
 * index 0 when from is 0.
 */
static void internal_delete_childs (struct buffer_info * cur_bi,
				    int from, int n)
{
	int first_key = (from != 0) ? from - 1 : 0;

	internal_delete_pointers_items (cur_bi, from, first_key, n);
}
285 | |||
286 | |||
287 | /* copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest | ||
288 | * last_first == FIRST_TO_LAST means, that we copy first items from src to tail of dest | ||
289 | * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest | ||
290 | */ | ||
291 | static void internal_copy_pointers_items ( | ||
292 | struct buffer_info * dest_bi, | ||
293 | struct buffer_head * src, | ||
294 | int last_first, int cpy_num | ||
295 | ) | ||
296 | { | ||
297 | /* ATTENTION! Number of node pointers in DEST is equal to number of items in DEST * | ||
298 | * as delimiting key have already inserted to buffer dest.*/ | ||
299 | struct buffer_head * dest = dest_bi->bi_bh; | ||
300 | int nr_dest, nr_src; | ||
301 | int dest_order, src_order; | ||
302 | struct block_head * blkh; | ||
303 | struct reiserfs_key * key; | ||
304 | struct disk_child * dc; | ||
305 | |||
306 | nr_src = B_NR_ITEMS (src); | ||
307 | |||
308 | RFALSE( dest == NULL || src == NULL, | ||
309 | "src (%p) or dest (%p) buffer is 0", src, dest); | ||
310 | RFALSE( last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, | ||
311 | "invalid last_first parameter (%d)", last_first); | ||
312 | RFALSE( nr_src < cpy_num - 1, | ||
313 | "no so many items (%d) in src (%d)", cpy_num, nr_src); | ||
314 | RFALSE( cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num); | ||
315 | RFALSE( cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest), | ||
316 | "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)", | ||
317 | cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest)); | ||
318 | |||
319 | if ( cpy_num == 0 ) | ||
320 | return; | ||
321 | |||
322 | /* coping */ | ||
323 | blkh = B_BLK_HEAD(dest); | ||
324 | nr_dest = blkh_nr_item(blkh); | ||
325 | |||
326 | /*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest;*/ | ||
327 | /*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0;*/ | ||
328 | (last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order = nr_src - cpy_num + 1) : | ||
329 | (dest_order = nr_dest, src_order = 0); | ||
330 | |||
331 | /* prepare space for cpy_num pointers */ | ||
332 | dc = B_N_CHILD (dest, dest_order); | ||
333 | |||
334 | memmove (dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE); | ||
335 | |||
336 | /* insert pointers */ | ||
337 | memcpy (dc, B_N_CHILD (src, src_order), DC_SIZE * cpy_num); | ||
338 | |||
339 | |||
340 | /* prepare space for cpy_num - 1 item headers */ | ||
341 | key = B_N_PDELIM_KEY(dest, dest_order); | ||
342 | memmove (key + cpy_num - 1, key, | ||
343 | KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest + cpy_num)); | ||
344 | |||
345 | |||
346 | /* insert headers */ | ||
347 | memcpy (key, B_N_PDELIM_KEY (src, src_order), KEY_SIZE * (cpy_num - 1)); | ||
348 | |||
349 | /* sizes, item number */ | ||
350 | set_blkh_nr_item( blkh, blkh_nr_item(blkh) + (cpy_num - 1 ) ); | ||
351 | set_blkh_free_space( blkh, | ||
352 | blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) + DC_SIZE * cpy_num ) ); | ||
353 | |||
354 | do_balance_mark_internal_dirty (dest_bi->tb, dest, 0); | ||
355 | |||
356 | /*&&&&&&&&&&&&&&&&&&&&&&&&*/ | ||
357 | check_internal (dest); | ||
358 | /*&&&&&&&&&&&&&&&&&&&&&&&&*/ | ||
359 | |||
360 | if (dest_bi->bi_parent) { | ||
361 | struct disk_child *t_dc; | ||
362 | t_dc = B_N_CHILD(dest_bi->bi_parent,dest_bi->bi_position); | ||
363 | put_dc_size( t_dc, dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) + DC_SIZE * cpy_num) ); | ||
364 | |||
365 | do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent,0); | ||
366 | /*&&&&&&&&&&&&&&&&&&&&&&&&*/ | ||
367 | check_internal (dest_bi->bi_parent); | ||
368 | /*&&&&&&&&&&&&&&&&&&&&&&&&*/ | ||
369 | } | ||
370 | |||
371 | } | ||
372 | |||
373 | |||
374 | /* Copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest. | ||
375 | * Delete cpy_num - del_par items and node pointers from buffer src. | ||
376 | * last_first == FIRST_TO_LAST means, that we copy/delete first items from src. | ||
377 | * last_first == LAST_TO_FIRST means, that we copy/delete last items from src. | ||
378 | */ | ||
379 | static void internal_move_pointers_items (struct buffer_info * dest_bi, | ||
380 | struct buffer_info * src_bi, | ||
381 | int last_first, int cpy_num, int del_par) | ||
382 | { | ||
383 | int first_pointer; | ||
384 | int first_item; | ||
385 | |||
386 | internal_copy_pointers_items (dest_bi, src_bi->bi_bh, last_first, cpy_num); | ||
387 | |||
388 | if (last_first == FIRST_TO_LAST) { /* shift_left occurs */ | ||
389 | first_pointer = 0; | ||
390 | first_item = 0; | ||
391 | /* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer, | ||
392 | for key - with first_item */ | ||
393 | internal_delete_pointers_items (src_bi, first_pointer, first_item, cpy_num - del_par); | ||
394 | } else { /* shift_right occurs */ | ||
395 | int i, j; | ||
396 | |||
397 | i = ( cpy_num - del_par == ( j = B_NR_ITEMS(src_bi->bi_bh)) + 1 ) ? 0 : j - cpy_num + del_par; | ||
398 | |||
399 | internal_delete_pointers_items (src_bi, j + 1 - cpy_num + del_par, i, cpy_num - del_par); | ||
400 | } | ||
401 | } | ||
402 | |||
403 | /* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */ | ||
404 | static void internal_insert_key (struct buffer_info * dest_bi, | ||
405 | int dest_position_before, /* insert key before key with n_dest number */ | ||
406 | struct buffer_head * src, | ||
407 | int src_position) | ||
408 | { | ||
409 | struct buffer_head * dest = dest_bi->bi_bh; | ||
410 | int nr; | ||
411 | struct block_head * blkh; | ||
412 | struct reiserfs_key * key; | ||
413 | |||
414 | RFALSE( dest == NULL || src == NULL, | ||
415 | "source(%p) or dest(%p) buffer is 0", src, dest); | ||
416 | RFALSE( dest_position_before < 0 || src_position < 0, | ||
417 | "source(%d) or dest(%d) key number less than 0", | ||
418 | src_position, dest_position_before); | ||
419 | RFALSE( dest_position_before > B_NR_ITEMS (dest) || | ||
420 | src_position >= B_NR_ITEMS(src), | ||
421 | "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))", | ||
422 | dest_position_before, B_NR_ITEMS (dest), | ||
423 | src_position, B_NR_ITEMS(src)); | ||
424 | RFALSE( B_FREE_SPACE (dest) < KEY_SIZE, | ||
425 | "no enough free space (%d) in dest buffer", B_FREE_SPACE (dest)); | ||
426 | |||
427 | blkh = B_BLK_HEAD(dest); | ||
428 | nr = blkh_nr_item(blkh); | ||
429 | |||
430 | /* prepare space for inserting key */ | ||
431 | key = B_N_PDELIM_KEY (dest, dest_position_before); | ||
432 | memmove (key + 1, key, (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE); | ||
433 | |||
434 | /* insert key */ | ||
435 | memcpy (key, B_N_PDELIM_KEY(src, src_position), KEY_SIZE); | ||
436 | |||
437 | /* Change dirt, free space, item number fields. */ | ||
438 | |||
439 | set_blkh_nr_item( blkh, blkh_nr_item(blkh) + 1 ); | ||
440 | set_blkh_free_space( blkh, blkh_free_space(blkh) - KEY_SIZE ); | ||
441 | |||
442 | do_balance_mark_internal_dirty (dest_bi->tb, dest, 0); | ||
443 | |||
444 | if (dest_bi->bi_parent) { | ||
445 | struct disk_child *t_dc; | ||
446 | t_dc = B_N_CHILD(dest_bi->bi_parent,dest_bi->bi_position); | ||
447 | put_dc_size( t_dc, dc_size(t_dc) + KEY_SIZE ); | ||
448 | |||
449 | do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent,0); | ||
450 | } | ||
451 | } | ||
452 | |||
453 | |||
454 | |||
455 | /* Insert d_key'th (delimiting) key from buffer cfl to tail of dest. | ||
456 | * Copy pointer_amount node pointers and pointer_amount - 1 items from buffer src to buffer dest. | ||
457 | * Replace d_key'th key in buffer cfl. | ||
458 | * Delete pointer_amount items and node pointers from buffer src. | ||
459 | */ | ||
460 | /* this can be invoked both to shift from S to L and from R to S */ | ||
461 | static void internal_shift_left ( | ||
462 | int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S */ | ||
463 | struct tree_balance * tb, | ||
464 | int h, | ||
465 | int pointer_amount | ||
466 | ) | ||
467 | { | ||
468 | struct buffer_info dest_bi, src_bi; | ||
469 | struct buffer_head * cf; | ||
470 | int d_key_position; | ||
471 | |||
472 | internal_define_dest_src_infos (mode, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); | ||
473 | |||
474 | /*printk("pointer_amount = %d\n",pointer_amount);*/ | ||
475 | |||
476 | if (pointer_amount) { | ||
477 | /* insert delimiting key from common father of dest and src to node dest into position B_NR_ITEM(dest) */ | ||
478 | internal_insert_key (&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position); | ||
479 | |||
480 | if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) { | ||
481 | if (src_bi.bi_position/*src->b_item_order*/ == 0) | ||
482 | replace_key (tb, cf, d_key_position, src_bi.bi_parent/*src->b_parent*/, 0); | ||
483 | } else | ||
484 | replace_key (tb, cf, d_key_position, src_bi.bi_bh, pointer_amount - 1); | ||
485 | } | ||
486 | /* last parameter is del_parameter */ | ||
487 | internal_move_pointers_items (&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 0); | ||
488 | |||
489 | } | ||
490 | |||
491 | /* Insert delimiting key to L[h]. | ||
492 | * Copy n node pointers and n - 1 items from buffer S[h] to L[h]. | ||
493 | * Delete n - 1 items and node pointers from buffer S[h]. | ||
494 | */ | ||
495 | /* it always shifts from S[h] to L[h] */ | ||
496 | static void internal_shift1_left ( | ||
497 | struct tree_balance * tb, | ||
498 | int h, | ||
499 | int pointer_amount | ||
500 | ) | ||
501 | { | ||
502 | struct buffer_info dest_bi, src_bi; | ||
503 | struct buffer_head * cf; | ||
504 | int d_key_position; | ||
505 | |||
506 | internal_define_dest_src_infos (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); | ||
507 | |||
508 | if ( pointer_amount > 0 ) /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */ | ||
509 | internal_insert_key (&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position); | ||
510 | /* internal_insert_key (tb->L[h], B_NR_ITEM(tb->L[h]), tb->CFL[h], tb->lkey[h]);*/ | ||
511 | |||
512 | /* last parameter is del_parameter */ | ||
513 | internal_move_pointers_items (&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 1); | ||
514 | /* internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1);*/ | ||
515 | } | ||
516 | |||
517 | |||
518 | /* Insert d_key'th (delimiting) key from buffer cfr to head of dest. | ||
519 | * Copy n node pointers and n - 1 items from buffer src to buffer dest. | ||
520 | * Replace d_key'th key in buffer cfr. | ||
521 | * Delete n items and node pointers from buffer src. | ||
522 | */ | ||
523 | static void internal_shift_right ( | ||
524 | int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S */ | ||
525 | struct tree_balance * tb, | ||
526 | int h, | ||
527 | int pointer_amount | ||
528 | ) | ||
529 | { | ||
530 | struct buffer_info dest_bi, src_bi; | ||
531 | struct buffer_head * cf; | ||
532 | int d_key_position; | ||
533 | int nr; | ||
534 | |||
535 | |||
536 | internal_define_dest_src_infos (mode, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); | ||
537 | |||
538 | nr = B_NR_ITEMS (src_bi.bi_bh); | ||
539 | |||
540 | if (pointer_amount > 0) { | ||
541 | /* insert delimiting key from common father of dest and src to dest node into position 0 */ | ||
542 | internal_insert_key (&dest_bi, 0, cf, d_key_position); | ||
543 | if (nr == pointer_amount - 1) { | ||
544 | RFALSE( src_bi.bi_bh != PATH_H_PBUFFER (tb->tb_path, h)/*tb->S[h]*/ || | ||
545 | dest_bi.bi_bh != tb->R[h], | ||
546 | "src (%p) must be == tb->S[h](%p) when it disappears", | ||
547 | src_bi.bi_bh, PATH_H_PBUFFER (tb->tb_path, h)); | ||
548 | /* when S[h] disappers replace left delemiting key as well */ | ||
549 | if (tb->CFL[h]) | ||
550 | replace_key (tb, cf, d_key_position, tb->CFL[h], tb->lkey[h]); | ||
551 | } else | ||
552 | replace_key (tb, cf, d_key_position, src_bi.bi_bh, nr - pointer_amount); | ||
553 | } | ||
554 | |||
555 | /* last parameter is del_parameter */ | ||
556 | internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 0); | ||
557 | } | ||
558 | |||
559 | /* Insert delimiting key to R[h]. | ||
560 | * Copy n node pointers and n - 1 items from buffer S[h] to R[h]. | ||
561 | * Delete n - 1 items and node pointers from buffer S[h]. | ||
562 | */ | ||
563 | /* it always shift from S[h] to R[h] */ | ||
564 | static void internal_shift1_right ( | ||
565 | struct tree_balance * tb, | ||
566 | int h, | ||
567 | int pointer_amount | ||
568 | ) | ||
569 | { | ||
570 | struct buffer_info dest_bi, src_bi; | ||
571 | struct buffer_head * cf; | ||
572 | int d_key_position; | ||
573 | |||
574 | internal_define_dest_src_infos (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); | ||
575 | |||
576 | if (pointer_amount > 0) /* insert rkey from CFR[h] to right neighbor R[h] */ | ||
577 | internal_insert_key (&dest_bi, 0, cf, d_key_position); | ||
578 | /* internal_insert_key (tb->R[h], 0, tb->CFR[h], tb->rkey[h]);*/ | ||
579 | |||
580 | /* last parameter is del_parameter */ | ||
581 | internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 1); | ||
582 | /* internal_move_pointers_items (tb->R[h], tb->S[h], LAST_TO_FIRST, pointer_amount, 1);*/ | ||
583 | } | ||
584 | |||
585 | |||
586 | /* Delete insert_num node pointers together with their left items | ||
587 | * and balance current node.*/ | ||
588 | static void balance_internal_when_delete (struct tree_balance * tb, | ||
589 | int h, int child_pos) | ||
590 | { | ||
591 | int insert_num; | ||
592 | int n; | ||
593 | struct buffer_head * tbSh = PATH_H_PBUFFER (tb->tb_path, h); | ||
594 | struct buffer_info bi; | ||
595 | |||
596 | insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE)); | ||
597 | |||
598 | /* delete child-node-pointer(s) together with their left item(s) */ | ||
599 | bi.tb = tb; | ||
600 | bi.bi_bh = tbSh; | ||
601 | bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h); | ||
602 | bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1); | ||
603 | |||
604 | internal_delete_childs (&bi, child_pos, -insert_num); | ||
605 | |||
606 | RFALSE( tb->blknum[h] > 1, | ||
607 | "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]); | ||
608 | |||
609 | n = B_NR_ITEMS(tbSh); | ||
610 | |||
611 | if ( tb->lnum[h] == 0 && tb->rnum[h] == 0 ) { | ||
612 | if ( tb->blknum[h] == 0 ) { | ||
613 | /* node S[h] (root of the tree) is empty now */ | ||
614 | struct buffer_head *new_root; | ||
615 | |||
616 | RFALSE( n || B_FREE_SPACE (tbSh) != MAX_CHILD_SIZE(tbSh) - DC_SIZE, | ||
617 | "buffer must have only 0 keys (%d)", n); | ||
618 | RFALSE( bi.bi_parent, "root has parent (%p)", bi.bi_parent); | ||
619 | |||
620 | /* choose a new root */ | ||
621 | if ( ! tb->L[h-1] || ! B_NR_ITEMS(tb->L[h-1]) ) | ||
622 | new_root = tb->R[h-1]; | ||
623 | else | ||
624 | new_root = tb->L[h-1]; | ||
625 | /* switch super block's tree root block number to the new value */ | ||
626 | PUT_SB_ROOT_BLOCK( tb->tb_sb, new_root->b_blocknr ); | ||
627 | //REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; | ||
628 | PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) - 1 ); | ||
629 | |||
630 | do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); | ||
631 | /*&&&&&&&&&&&&&&&&&&&&&&*/ | ||
632 | if (h > 1) | ||
633 | /* use check_internal if new root is an internal node */ | ||
634 | check_internal (new_root); | ||
635 | /*&&&&&&&&&&&&&&&&&&&&&&*/ | ||
636 | |||
637 | /* do what is needed for buffer thrown from tree */ | ||
638 | reiserfs_invalidate_buffer(tb, tbSh); | ||
639 | return; | ||
640 | } | ||
641 | return; | ||
642 | } | ||
643 | |||
644 | if ( tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1 ) { /* join S[h] with L[h] */ | ||
645 | |||
646 | RFALSE( tb->rnum[h] != 0, | ||
647 | "invalid tb->rnum[%d]==%d when joining S[h] with L[h]", | ||
648 | h, tb->rnum[h]); | ||
649 | |||
650 | internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1); | ||
651 | reiserfs_invalidate_buffer(tb, tbSh); | ||
652 | |||
653 | return; | ||
654 | } | ||
655 | |||
656 | if ( tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1 ) { /* join S[h] with R[h] */ | ||
657 | RFALSE( tb->lnum[h] != 0, | ||
658 | "invalid tb->lnum[%d]==%d when joining S[h] with R[h]", | ||
659 | h, tb->lnum[h]); | ||
660 | |||
661 | internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1); | ||
662 | |||
663 | reiserfs_invalidate_buffer(tb,tbSh); | ||
664 | return; | ||
665 | } | ||
666 | |||
667 | if ( tb->lnum[h] < 0 ) { /* borrow from left neighbor L[h] */ | ||
668 | RFALSE( tb->rnum[h] != 0, | ||
669 | "wrong tb->rnum[%d]==%d when borrow from L[h]", h, tb->rnum[h]); | ||
670 | /*internal_shift_right (tb, h, tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], -tb->lnum[h]);*/ | ||
671 | internal_shift_right (INTERNAL_SHIFT_FROM_L_TO_S, tb, h, -tb->lnum[h]); | ||
672 | return; | ||
673 | } | ||
674 | |||
675 | if ( tb->rnum[h] < 0 ) { /* borrow from right neighbor R[h] */ | ||
676 | RFALSE( tb->lnum[h] != 0, | ||
677 | "invalid tb->lnum[%d]==%d when borrow from R[h]", | ||
678 | h, tb->lnum[h]); | ||
679 | internal_shift_left (INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]);/*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]);*/ | ||
680 | return; | ||
681 | } | ||
682 | |||
683 | if ( tb->lnum[h] > 0 ) { /* split S[h] into two parts and put them into neighbors */ | ||
684 | RFALSE( tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1, | ||
685 | "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them", | ||
686 | h, tb->lnum[h], h, tb->rnum[h], n); | ||
687 | |||
688 | internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]);/*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]);*/ | ||
689 | internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]); | ||
690 | |||
691 | reiserfs_invalidate_buffer (tb, tbSh); | ||
692 | |||
693 | return; | ||
694 | } | ||
695 | reiserfs_panic (tb->tb_sb, "balance_internal_when_delete: unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d", | ||
696 | h, tb->lnum[h], h, tb->rnum[h]); | ||
697 | } | ||
698 | |||
699 | |||
700 | /* Replace delimiting key of buffers L[h] and S[h] by the given key.*/ | ||
701 | static void replace_lkey ( | ||
702 | struct tree_balance * tb, | ||
703 | int h, | ||
704 | struct item_head * key | ||
705 | ) | ||
706 | { | ||
707 | RFALSE( tb->L[h] == NULL || tb->CFL[h] == NULL, | ||
708 | "L[h](%p) and CFL[h](%p) must exist in replace_lkey", | ||
709 | tb->L[h], tb->CFL[h]); | ||
710 | |||
711 | if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0) | ||
712 | return; | ||
713 | |||
714 | memcpy (B_N_PDELIM_KEY(tb->CFL[h],tb->lkey[h]), key, KEY_SIZE); | ||
715 | |||
716 | do_balance_mark_internal_dirty (tb, tb->CFL[h],0); | ||
717 | } | ||
718 | |||
719 | |||
720 | /* Replace delimiting key of buffers S[h] and R[h] by the given key.*/ | ||
721 | static void replace_rkey ( | ||
722 | struct tree_balance * tb, | ||
723 | int h, | ||
724 | struct item_head * key | ||
725 | ) | ||
726 | { | ||
727 | RFALSE( tb->R[h] == NULL || tb->CFR[h] == NULL, | ||
728 | "R[h](%p) and CFR[h](%p) must exist in replace_rkey", | ||
729 | tb->R[h], tb->CFR[h]); | ||
730 | RFALSE( B_NR_ITEMS(tb->R[h]) == 0, | ||
731 | "R[h] can not be empty if it exists (item number=%d)", | ||
732 | B_NR_ITEMS(tb->R[h])); | ||
733 | |||
734 | memcpy (B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]), key, KEY_SIZE); | ||
735 | |||
736 | do_balance_mark_internal_dirty (tb, tb->CFR[h], 0); | ||
737 | } | ||
738 | |||
739 | |||
/*
 * balance_internal - carry out the balancing planned in tb for the internal
 * node S[h] at tree level h.
 *
 * The number of key/pointer pairs to add or remove (insert_num) is derived
 * from tb->insert_size[h].  A negative insert_num hands the whole job to
 * balance_internal_when_delete().  Otherwise the new pairs taken from
 * insert_key/insert_ptr are distributed, according to tb->lnum[h],
 * tb->rnum[h] and tb->blknum[h], among L[h], R[h], S[h] and (when
 * tb->blknum[h] == 2) a fresh node S_new obtained from get_FEB().
 * When the path has no buffer at level h, a new root is built first.
 *
 * On the path that reaches the end of the function, new_insert_key is
 * copied to the address insert_key had on entry and new_insert_ptr
 * (S_new, or NULL when no new node was made) is stored in insert_ptr[0].
 * The two early returns (delete case, S[h] emptied) skip that step.
 *
 * Returns 0 when the path has no buffer at level h, otherwise
 * PATH_H_POSITION(tb->tb_path, h + 1).
 */
740 | int balance_internal (struct tree_balance * tb, /* tree_balance structure */ | ||
741 | int h, /* level of the tree */ | ||
742 | int child_pos, | ||
743 | struct item_head * insert_key, /* key for insertion on higher level */ | ||
744 | struct buffer_head ** insert_ptr /* node for insertion on higher level*/ | ||
745 | ) | ||
746 | /* if inserting/pasting | ||
747 | { | ||
748 | child_pos is the position of the node-pointer in S[h] that * | ||
749 | pointed to S[h-1] before balancing of the h-1 level; * | ||
750 | this means that new pointers and items must be inserted AFTER * | ||
751 | child_pos | ||
752 | } | ||
753 | else | ||
754 | { | ||
755 | it is the position of the leftmost pointer that must be deleted (together with | ||
756 | its corresponding key to the left of the pointer) | ||
757 | as a result of the previous level's balancing. | ||
758 | } | ||
759 | */ | ||
760 | { | ||
761 | struct buffer_head * tbSh = PATH_H_PBUFFER (tb->tb_path, h); | ||
762 | struct buffer_info bi; | ||
763 | int order; /* we return this: it is 0 if there is no S[h], else it is tb->S[h]->b_item_order */ | ||
764 | int insert_num, n, k; | ||
765 | struct buffer_head * S_new; | ||
766 | struct item_head new_insert_key; | ||
767 | struct buffer_head * new_insert_ptr = NULL; | ||
768 | struct item_head * new_insert_key_addr = insert_key; | ||
769 | |||
770 | RFALSE( h < 1, "h (%d) can not be < 1 on internal level", h); | ||
771 | |||
772 | PROC_INFO_INC( tb -> tb_sb, balance_at[ h ] ); | ||
773 | |||
774 | order = ( tbSh ) ? PATH_H_POSITION (tb->tb_path, h + 1)/*tb->S[h]->b_item_order*/ : 0; | ||
775 | |||
776 | /* Using insert_size[h] calculate the number insert_num of items | ||
777 | that must be inserted to or deleted from S[h]. */ | ||
778 | insert_num = tb->insert_size[h]/((int)(KEY_SIZE + DC_SIZE)); | ||
779 | |||
780 | /* Check whether insert_num is proper **/ | ||
781 | RFALSE( insert_num < -2 || insert_num > 2, | ||
782 | "incorrect number of items inserted to the internal node (%d)", | ||
783 | insert_num); | ||
784 | RFALSE( h > 1 && (insert_num > 1 || insert_num < -1), | ||
785 | "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level", | ||
786 | insert_num, h); | ||
787 | |||
788 | /* Make balance in case insert_num < 0 */ | ||
789 | if ( insert_num < 0 ) { | ||
790 | balance_internal_when_delete (tb, h, child_pos); | ||
791 | return order; | ||
792 | } | ||
793 | |||
794 | k = 0; | ||
795 | if ( tb->lnum[h] > 0 ) { | ||
796 | /* shift lnum[h] items from S[h] to the left neighbor L[h]. | ||
797 | check how many of new items fall into L[h] or CFL[h] after | ||
798 | shifting */ | ||
799 | n = B_NR_ITEMS (tb->L[h]); /* number of items in L[h] */ | ||
800 | if ( tb->lnum[h] <= child_pos ) { | ||
801 | /* new items don't fall into L[h] or CFL[h] */ | ||
802 | internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]); | ||
803 | /*internal_shift_left (tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,tb->lnum[h]);*/ | ||
804 | child_pos -= tb->lnum[h]; | ||
805 | } else if ( tb->lnum[h] > child_pos + insert_num ) { | ||
806 | /* all new items fall into L[h] */ | ||
807 | internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h] - insert_num); | ||
808 | /* internal_shift_left(tb->L[h],tb->CFL[h],tb->lkey[h],tbSh, | ||
809 | tb->lnum[h]-insert_num); | ||
810 | */ | ||
811 | /* insert insert_num keys and node-pointers into L[h] */ | ||
812 | bi.tb = tb; | ||
813 | bi.bi_bh = tb->L[h]; | ||
814 | bi.bi_parent = tb->FL[h]; | ||
815 | bi.bi_position = get_left_neighbor_position (tb, h); | ||
816 | internal_insert_childs (&bi,/*tb->L[h], tb->S[h-1]->b_next*/ n + child_pos + 1, | ||
817 | insert_num,insert_key,insert_ptr); | ||
818 | |||
819 | insert_num = 0; | ||
820 | } else { | ||
821 | struct disk_child * dc; | ||
822 | |||
823 | /* some items fall into L[h] or CFL[h], but some don't fall */ | ||
824 | internal_shift1_left(tb,h,child_pos+1); | ||
825 | /* calculate number of new items that fall into L[h] */ | ||
826 | k = tb->lnum[h] - child_pos - 1; | ||
827 | bi.tb = tb; | ||
828 | bi.bi_bh = tb->L[h]; | ||
829 | bi.bi_parent = tb->FL[h]; | ||
830 | bi.bi_position = get_left_neighbor_position (tb, h); | ||
831 | internal_insert_childs (&bi,/*tb->L[h], tb->S[h-1]->b_next,*/ n + child_pos + 1,k, | ||
832 | insert_key,insert_ptr); | ||
833 | |||
834 | replace_lkey(tb,h,insert_key + k); | ||
835 | |||
836 | /* replace the first node-ptr in S[h] by node-ptr to insert_ptr[k] */ | ||
837 | dc = B_N_CHILD(tbSh, 0); | ||
838 | put_dc_size( dc, MAX_CHILD_SIZE(insert_ptr[k]) - B_FREE_SPACE (insert_ptr[k])); | ||
839 | put_dc_block_number( dc, insert_ptr[k]->b_blocknr ); | ||
840 | |||
841 | do_balance_mark_internal_dirty (tb, tbSh, 0); | ||
842 | |||
/* k + 1 of the new pairs are consumed here (k inserted into L[h], one used
   for the delimiting key and first pointer of S[h]); step past them */
843 | k++; | ||
844 | insert_key += k; | ||
845 | insert_ptr += k; | ||
846 | insert_num -= k; | ||
847 | child_pos = 0; | ||
848 | } | ||
849 | } /* tb->lnum[h] > 0 */ | ||
850 | |||
851 | if ( tb->rnum[h] > 0 ) { | ||
852 | /*shift rnum[h] items from S[h] to the right neighbor R[h]*/ | ||
853 | /* check how many of new items fall into R or CFR after shifting */ | ||
854 | n = B_NR_ITEMS (tbSh); /* number of items in S[h] */ | ||
855 | if ( n - tb->rnum[h] >= child_pos ) | ||
856 | /* new items fall into S[h] */ | ||
857 | /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],tb->rnum[h]);*/ | ||
858 | internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]); | ||
859 | else | ||
860 | if ( n + insert_num - tb->rnum[h] < child_pos ) | ||
861 | { | ||
862 | /* all new items fall into R[h] */ | ||
863 | /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h], | ||
864 | tb->rnum[h] - insert_num);*/ | ||
865 | internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h] - insert_num); | ||
866 | |||
867 | /* insert insert_num keys and node-pointers into R[h] */ | ||
868 | bi.tb = tb; | ||
869 | bi.bi_bh = tb->R[h]; | ||
870 | bi.bi_parent = tb->FR[h]; | ||
871 | bi.bi_position = get_right_neighbor_position (tb, h); | ||
872 | internal_insert_childs (&bi, /*tb->R[h],tb->S[h-1]->b_next*/ child_pos - n - insert_num + tb->rnum[h] - 1, | ||
873 | insert_num,insert_key,insert_ptr); | ||
874 | insert_num = 0; | ||
875 | } | ||
876 | else | ||
877 | { | ||
878 | struct disk_child * dc; | ||
879 | |||
880 | /* one of the items falls into CFR[h] */ | ||
881 | internal_shift1_right(tb,h,n - child_pos + 1); | ||
882 | /* calculate number of new items that fall into R[h] */ | ||
883 | k = tb->rnum[h] - n + child_pos - 1; | ||
884 | bi.tb = tb; | ||
885 | bi.bi_bh = tb->R[h]; | ||
886 | bi.bi_parent = tb->FR[h]; | ||
887 | bi.bi_position = get_right_neighbor_position (tb, h); | ||
888 | internal_insert_childs (&bi, /*tb->R[h], tb->R[h]->b_child,*/ 0, k, insert_key + 1, insert_ptr + 1); | ||
889 | |||
890 | replace_rkey(tb,h,insert_key + insert_num - k - 1); | ||
891 | |||
892 | /* replace the first node-ptr in R[h] by node-ptr insert_ptr[insert_num-k-1]*/ | ||
893 | dc = B_N_CHILD(tb->R[h], 0); | ||
894 | put_dc_size( dc, MAX_CHILD_SIZE(insert_ptr[insert_num-k-1]) - | ||
895 | B_FREE_SPACE (insert_ptr[insert_num-k-1])); | ||
896 | put_dc_block_number( dc, insert_ptr[insert_num-k-1]->b_blocknr ); | ||
897 | |||
898 | do_balance_mark_internal_dirty (tb, tb->R[h],0); | ||
899 | |||
900 | insert_num -= (k + 1); | ||
901 | } | ||
902 | } | ||
903 | |||
904 | /** Fill new node that appears instead of S[h] **/ | ||
905 | RFALSE( tb->blknum[h] > 2, "blknum can not be > 2 for internal level"); | ||
906 | RFALSE( tb->blknum[h] < 0, "blknum can not be < 0"); | ||
907 | |||
908 | if ( ! tb->blknum[h] ) | ||
909 | { /* node S[h] is empty now */ | ||
910 | RFALSE( ! tbSh, "S[h] is equal NULL"); | ||
911 | |||
912 | /* do what is needed for buffer thrown from tree */ | ||
913 | reiserfs_invalidate_buffer(tb,tbSh); | ||
914 | return order; | ||
915 | } | ||
916 | |||
917 | if ( ! tbSh ) { | ||
918 | /* create new root */ | ||
919 | struct disk_child * dc; | ||
920 | struct buffer_head * tbSh_1 = PATH_H_PBUFFER (tb->tb_path, h - 1); | ||
921 | struct block_head * blkh; | ||
922 | |||
923 | |||
924 | if ( tb->blknum[h] != 1 ) | ||
925 | reiserfs_panic(NULL, "balance_internal: One new node required for creating the new root"); | ||
926 | /* S[h] = empty buffer from the list FEB. */ | ||
927 | tbSh = get_FEB (tb); | ||
928 | blkh = B_BLK_HEAD(tbSh); | ||
929 | set_blkh_level( blkh, h + 1 ); | ||
930 | |||
931 | /* Put the unique node-pointer to S[h] that points to S[h-1]. */ | ||
932 | |||
933 | dc = B_N_CHILD(tbSh, 0); | ||
934 | put_dc_block_number( dc, tbSh_1->b_blocknr ); | ||
935 | put_dc_size( dc, (MAX_CHILD_SIZE (tbSh_1) - B_FREE_SPACE (tbSh_1))); | ||
936 | |||
937 | tb->insert_size[h] -= DC_SIZE; | ||
938 | set_blkh_free_space( blkh, blkh_free_space(blkh) - DC_SIZE ); | ||
939 | |||
940 | do_balance_mark_internal_dirty (tb, tbSh, 0); | ||
941 | |||
942 | /*&&&&&&&&&&&&&&&&&&&&&&&&*/ | ||
943 | check_internal (tbSh); | ||
944 | /*&&&&&&&&&&&&&&&&&&&&&&&&*/ | ||
945 | |||
946 | /* put new root into path structure */ | ||
947 | PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) = tbSh; | ||
948 | |||
949 | /* Change root in structure super block. */ | ||
950 | PUT_SB_ROOT_BLOCK( tb->tb_sb, tbSh->b_blocknr ); | ||
951 | PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1 ); | ||
952 | do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); | ||
953 | } | ||
954 | |||
955 | if ( tb->blknum[h] == 2 ) { | ||
956 | int snum; | ||
957 | struct buffer_info dest_bi, src_bi; | ||
958 | |||
959 | |||
960 | /* S_new = free buffer from list FEB */ | ||
961 | S_new = get_FEB(tb); | ||
962 | |||
963 | set_blkh_level( B_BLK_HEAD(S_new), h + 1 ); | ||
964 | |||
965 | dest_bi.tb = tb; | ||
966 | dest_bi.bi_bh = S_new; | ||
967 | dest_bi.bi_parent = NULL; | ||
968 | dest_bi.bi_position = 0; | ||
969 | src_bi.tb = tb; | ||
970 | src_bi.bi_bh = tbSh; | ||
971 | src_bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h); | ||
972 | src_bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1); | ||
973 | |||
974 | n = B_NR_ITEMS (tbSh); /* number of items in S[h] */ | ||
/* snum: how many items S_new is to receive - half (rounded up) of the
   n existing items plus the insert_num still-pending new ones */
975 | snum = (insert_num + n + 1)/2; | ||
976 | if ( n - snum >= child_pos ) { | ||
977 | /* new items don't fall into S_new */ | ||
978 | /* store the delimiting key for the next level */ | ||
979 | /* new_insert_key = (n - snum)'th key in S[h] */ | ||
980 | memcpy (&new_insert_key,B_N_PDELIM_KEY(tbSh,n - snum), | ||
981 | KEY_SIZE); | ||
982 | /* last parameter is del_par */ | ||
983 | internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, snum, 0); | ||
984 | /* internal_move_pointers_items(S_new, tbSh, LAST_TO_FIRST, snum, 0);*/ | ||
985 | } else if ( n + insert_num - snum < child_pos ) { | ||
986 | /* all new items fall into S_new */ | ||
987 | /* store the delimiting key for the next level */ | ||
988 | /* new_insert_key = (n + insert_item - snum)'th key in S[h] */ | ||
989 | memcpy(&new_insert_key,B_N_PDELIM_KEY(tbSh,n + insert_num - snum), | ||
990 | KEY_SIZE); | ||
991 | /* last parameter is del_par */ | ||
992 | internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, snum - insert_num, 0); | ||
993 | /* internal_move_pointers_items(S_new,tbSh,1,snum - insert_num,0);*/ | ||
994 | |||
995 | /* insert insert_num keys and node-pointers into S_new */ | ||
996 | internal_insert_childs (&dest_bi, /*S_new,tb->S[h-1]->b_next,*/child_pos - n - insert_num + snum - 1, | ||
997 | insert_num,insert_key,insert_ptr); | ||
998 | |||
999 | insert_num = 0; | ||
1000 | } else { | ||
1001 | struct disk_child * dc; | ||
1002 | |||
1003 | /* some items fall into S_new, but some don't fall */ | ||
1004 | /* last parameter is del_par */ | ||
1005 | internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, n - child_pos + 1, 1); | ||
1006 | /* internal_move_pointers_items(S_new,tbSh,1,n - child_pos + 1,1);*/ | ||
1007 | /* calculate number of new items that fall into S_new */ | ||
1008 | k = snum - n + child_pos - 1; | ||
1009 | |||
1010 | internal_insert_childs (&dest_bi, /*S_new,*/ 0, k, insert_key + 1, insert_ptr+1); | ||
1011 | |||
1012 | /* new_insert_key = insert_key[insert_num - k - 1] */ | ||
1013 | memcpy(&new_insert_key,insert_key + insert_num - k - 1, | ||
1014 | KEY_SIZE); | ||
1015 | /* replace first node-ptr in S_new by node-ptr to insert_ptr[insert_num-k-1] */ | ||
1016 | |||
1017 | dc = B_N_CHILD(S_new,0); | ||
1018 | put_dc_size( dc, (MAX_CHILD_SIZE(insert_ptr[insert_num-k-1]) - | ||
1019 | B_FREE_SPACE(insert_ptr[insert_num-k-1])) ); | ||
1020 | put_dc_block_number( dc, insert_ptr[insert_num-k-1]->b_blocknr ); | ||
1021 | |||
1022 | do_balance_mark_internal_dirty (tb, S_new,0); | ||
1023 | |||
1024 | insert_num -= (k + 1); | ||
1025 | } | ||
1026 | /* new_insert_ptr = node_pointer to S_new */ | ||
1027 | new_insert_ptr = S_new; | ||
1028 | |||
1029 | RFALSE (!buffer_journaled(S_new) || buffer_journal_dirty(S_new) || | ||
1030 | buffer_dirty (S_new), | ||
1031 | "cm-00001: bad S_new (%b)", S_new); | ||
1032 | |||
1033 | // S_new is released in unfix_nodes | ||
1034 | } | ||
1035 | |||
1036 | n = B_NR_ITEMS (tbSh); /*number of items in S[h] */ | ||
1037 | |||
/* whatever is still pending after the L[h]/R[h]/S_new steps goes into S[h] */
1038 | if ( 0 <= child_pos && child_pos <= n && insert_num > 0 ) { | ||
1039 | bi.tb = tb; | ||
1040 | bi.bi_bh = tbSh; | ||
1041 | bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h); | ||
1042 | bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1); | ||
1043 | internal_insert_childs ( | ||
1044 | &bi,/*tbSh,*/ | ||
1045 | /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? tb->S[h-1]->b_next : tb->S[h]->b_child->b_next,*/ | ||
1046 | child_pos,insert_num,insert_key,insert_ptr | ||
1047 | ); | ||
1048 | } | ||
1049 | |||
1050 | |||
/* Hand the key/pointer for the next level back through the caller's arrays.
   The key goes to the address insert_key had on entry; the pointer goes to
   insert_ptr[0] using the current insert_ptr, which the lnum[h] branch may
   have advanced.  NOTE(review): new_insert_key is only assigned when
   tb->blknum[h] == 2; otherwise this copies an unset key alongside a NULL
   pointer - presumably the caller ignores the key then; confirm. */
1051 | memcpy (new_insert_key_addr,&new_insert_key,KEY_SIZE); | ||
1052 | insert_ptr[0] = new_insert_ptr; | ||
1053 | |||
1054 | return order; | ||
1055 | } | ||
1056 | |||
1057 | |||
1058 | |||
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c new file mode 100644 index 000000000000..7543031396f4 --- /dev/null +++ b/fs/reiserfs/inode.c | |||
@@ -0,0 +1,2846 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | */ | ||
4 | |||
5 | #include <linux/config.h> | ||
6 | #include <linux/time.h> | ||
7 | #include <linux/fs.h> | ||
8 | #include <linux/reiserfs_fs.h> | ||
9 | #include <linux/reiserfs_acl.h> | ||
10 | #include <linux/reiserfs_xattr.h> | ||
11 | #include <linux/smp_lock.h> | ||
12 | #include <linux/pagemap.h> | ||
13 | #include <linux/highmem.h> | ||
14 | #include <asm/uaccess.h> | ||
15 | #include <asm/unaligned.h> | ||
16 | #include <linux/buffer_head.h> | ||
17 | #include <linux/mpage.h> | ||
18 | #include <linux/writeback.h> | ||
19 | #include <linux/quotaops.h> | ||
20 | |||
21 | extern int reiserfs_default_io_size; /* default io size defined in super.c */ | ||
22 | |||
23 | static int reiserfs_commit_write(struct file *f, struct page *page, | ||
24 | unsigned from, unsigned to); | ||
25 | static int reiserfs_prepare_write(struct file *f, struct page *page, | ||
26 | unsigned from, unsigned to); | ||
27 | |||
28 | void reiserfs_delete_inode (struct inode * inode) | ||
29 | { | ||
30 | /* We need blocks for transaction + (user+group) quota update (possibly delete) */ | ||
31 | int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS; | ||
32 | struct reiserfs_transaction_handle th ; | ||
33 | |||
34 | reiserfs_write_lock(inode->i_sb); | ||
35 | |||
36 | /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ | ||
37 | if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ | ||
38 | down (&inode->i_sem); | ||
39 | |||
40 | reiserfs_delete_xattrs (inode); | ||
41 | |||
42 | if (journal_begin(&th, inode->i_sb, jbegin_count)) { | ||
43 | up (&inode->i_sem); | ||
44 | goto out; | ||
45 | } | ||
46 | reiserfs_update_inode_transaction(inode) ; | ||
47 | |||
48 | if (reiserfs_delete_object (&th, inode)) { | ||
49 | up (&inode->i_sem); | ||
50 | goto out; | ||
51 | } | ||
52 | |||
53 | /* Do quota update inside a transaction for journaled quotas. We must do that | ||
54 | * after delete_object so that quota updates go into the same transaction as | ||
55 | * stat data deletion */ | ||
56 | DQUOT_FREE_INODE(inode); | ||
57 | |||
58 | if (journal_end(&th, inode->i_sb, jbegin_count)) { | ||
59 | up (&inode->i_sem); | ||
60 | goto out; | ||
61 | } | ||
62 | |||
63 | up (&inode->i_sem); | ||
64 | |||
65 | /* all items of file are deleted, so we can remove "save" link */ | ||
66 | remove_save_link (inode, 0/* not truncate */); /* we can't do anything | ||
67 | * about an error here */ | ||
68 | } else { | ||
69 | /* no object items are in the tree */ | ||
70 | ; | ||
71 | } | ||
72 | out: | ||
73 | clear_inode (inode); /* note this must go after the journal_end to prevent deadlock */ | ||
74 | inode->i_blocks = 0; | ||
75 | reiserfs_write_unlock(inode->i_sb); | ||
76 | } | ||
77 | |||
78 | static void _make_cpu_key (struct cpu_key * key, int version, __u32 dirid, __u32 objectid, | ||
79 | loff_t offset, int type, int length ) | ||
80 | { | ||
81 | key->version = version; | ||
82 | |||
83 | key->on_disk_key.k_dir_id = dirid; | ||
84 | key->on_disk_key.k_objectid = objectid; | ||
85 | set_cpu_key_k_offset (key, offset); | ||
86 | set_cpu_key_k_type (key, type); | ||
87 | key->key_length = length; | ||
88 | } | ||
89 | |||
90 | |||
91 | /* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set | ||
92 | offset and type of key */ | ||
93 | void make_cpu_key (struct cpu_key * key, struct inode * inode, loff_t offset, | ||
94 | int type, int length ) | ||
95 | { | ||
96 | _make_cpu_key (key, get_inode_item_key_version (inode), le32_to_cpu (INODE_PKEY (inode)->k_dir_id), | ||
97 | le32_to_cpu (INODE_PKEY (inode)->k_objectid), | ||
98 | offset, type, length); | ||
99 | } | ||
100 | |||
101 | |||
102 | // | ||
103 | // when key is 0, do not set version and short key | ||
104 | // | ||
105 | inline void make_le_item_head (struct item_head * ih, const struct cpu_key * key, | ||
106 | int version, | ||
107 | loff_t offset, int type, int length, | ||
108 | int entry_count/*or ih_free_space*/) | ||
109 | { | ||
110 | if (key) { | ||
111 | ih->ih_key.k_dir_id = cpu_to_le32 (key->on_disk_key.k_dir_id); | ||
112 | ih->ih_key.k_objectid = cpu_to_le32 (key->on_disk_key.k_objectid); | ||
113 | } | ||
114 | put_ih_version( ih, version ); | ||
115 | set_le_ih_k_offset (ih, offset); | ||
116 | set_le_ih_k_type (ih, type); | ||
117 | put_ih_item_len( ih, length ); | ||
118 | /* set_ih_free_space (ih, 0);*/ | ||
119 | // for directory items it is entry count, for directs and stat | ||
120 | // datas - 0xffff, for indirects - 0 | ||
121 | put_ih_entry_count( ih, entry_count ); | ||
122 | } | ||
123 | |||
124 | // | ||
125 | // FIXME: we might cache recently accessed indirect item | ||
126 | |||
127 | // Ugh. Not too eager for that.... | ||
128 | // I cut the code until such time as I see a convincing argument (benchmark). | ||
129 | // I don't want a bloated inode struct..., and I don't like code complexity.... | ||
130 | |||
131 | /* cutting the code is fine, since it really isn't in use yet and is easy | ||
132 | ** to add back in. But, Vladimir has a really good idea here. Think | ||
133 | ** about what happens for reading a file. For each page, | ||
134 | ** The VFS layer calls reiserfs_readpage, who searches the tree to find | ||
135 | ** an indirect item. This indirect item has X number of pointers, where | ||
136 | ** X is a big number if we've done the block allocation right. But, | ||
137 | ** we only use one or two of these pointers during each call to readpage, | ||
138 | ** needlessly researching again later on. | ||
139 | ** | ||
140 | ** The size of the cache could be dynamic based on the size of the file. | ||
141 | ** | ||
142 | ** I'd also like to see us cache the location the stat data item, since | ||
143 | ** we are needlessly researching for that frequently. | ||
144 | ** | ||
145 | ** --chris | ||
146 | */ | ||
147 | |||
148 | /* If this page has a file tail in it, and | ||
149 | ** it was read in by get_block_create_0, the page data is valid, | ||
150 | ** but tail is still sitting in a direct item, and we can't write to | ||
151 | ** it. So, look through this page, and check all the mapped buffers | ||
152 | ** to make sure they have valid block numbers. Any that don't need | ||
153 | ** to be unmapped, so that block_prepare_write will correctly call | ||
154 | ** reiserfs_get_block to convert the tail into an unformatted node | ||
155 | */ | ||
156 | static inline void fix_tail_page_for_writing(struct page *page) { | ||
157 | struct buffer_head *head, *next, *bh ; | ||
158 | |||
159 | if (page && page_has_buffers(page)) { | ||
160 | head = page_buffers(page) ; | ||
161 | bh = head ; | ||
162 | do { | ||
163 | next = bh->b_this_page ; | ||
164 | if (buffer_mapped(bh) && bh->b_blocknr == 0) { | ||
165 | reiserfs_unmap_buffer(bh) ; | ||
166 | } | ||
167 | bh = next ; | ||
168 | } while (bh != head) ; | ||
169 | } | ||
170 | } | ||
171 | |||
172 | /* reiserfs_get_block does not need to allocate a block only if it has been | ||
173 | done already or non-hole position has been found in the indirect item */ | ||
174 | static inline int allocation_needed (int retval, b_blocknr_t allocated, | ||
175 | struct item_head * ih, | ||
176 | __u32 * item, int pos_in_item) | ||
177 | { | ||
178 | if (allocated) | ||
179 | return 0; | ||
180 | if (retval == POSITION_FOUND && is_indirect_le_ih (ih) && | ||
181 | get_block_num(item, pos_in_item)) | ||
182 | return 0; | ||
183 | return 1; | ||
184 | } | ||
185 | |||
186 | static inline int indirect_item_found (int retval, struct item_head * ih) | ||
187 | { | ||
188 | return (retval == POSITION_FOUND) && is_indirect_le_ih (ih); | ||
189 | } | ||
190 | |||
191 | |||
192 | static inline void set_block_dev_mapped (struct buffer_head * bh, | ||
193 | b_blocknr_t block, struct inode * inode) | ||
194 | { | ||
195 | map_bh(bh, inode->i_sb, block); | ||
196 | } | ||
197 | |||
198 | |||
199 | // | ||
200 | // files which were created in the earlier version can not be longer, | ||
201 | // than 2 gb | ||
202 | // | ||
203 | static int file_capable (struct inode * inode, long block) | ||
204 | { | ||
205 | if (get_inode_item_key_version (inode) != KEY_FORMAT_3_5 || // it is new file. | ||
206 | block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb | ||
207 | return 1; | ||
208 | |||
209 | return 0; | ||
210 | } | ||
211 | |||
212 | /*static*/ int restart_transaction(struct reiserfs_transaction_handle *th, | ||
213 | struct inode *inode, struct path *path) { | ||
214 | struct super_block *s = th->t_super ; | ||
215 | int len = th->t_blocks_allocated ; | ||
216 | int err; | ||
217 | |||
218 | BUG_ON (!th->t_trans_id); | ||
219 | BUG_ON (!th->t_refcount); | ||
220 | |||
221 | /* we cannot restart while nested */ | ||
222 | if (th->t_refcount > 1) { | ||
223 | return 0 ; | ||
224 | } | ||
225 | pathrelse(path) ; | ||
226 | reiserfs_update_sd(th, inode) ; | ||
227 | err = journal_end(th, s, len) ; | ||
228 | if (!err) { | ||
229 | err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6) ; | ||
230 | if (!err) | ||
231 | reiserfs_update_inode_transaction(inode) ; | ||
232 | } | ||
233 | return err; | ||
234 | } | ||
235 | |||
236 | // Called by get_block when create == 0. Looks up the 'block'-th | ||
237 | // logical block of the file and maps bh_result to its block number. | ||
238 | // When the block lies in a direct item, it either leaves bh_result unmapped (the bmap | ||
239 | // case, so bmap reports block 0) or reads the direct item into its piece of the page (bh_result) | ||
240 | |||
241 | // Please improve the english/clarity in the comment above, as it is | ||
242 | // hard to understand. | ||
243 | |||
/*
 * _get_block_create_0 - look up logical block 'block' of 'inode' without
 * allocating anything.
 *
 * Key not found: returns -ENOENT if GET_BLOCK_NO_HOLE is set in 'args' and
 * bh_result's page is not uptodate, otherwise 0.
 *
 * Indirect item: when the stored block number is non-zero, bh_result is
 * mapped to it (and flagged as a boundary buffer if it is the item's last
 * pointer) and 0 is returned.  A zero pointer is reported as -ENOENT under
 * the same NO_HOLE / page-not-uptodate condition, otherwise 0.
 *
 * Direct item: without GET_BLOCK_READ_DIRECT in 'args' returns -ENOENT.
 * With it, unless the buffer or page is already uptodate, the tail bytes
 * are copied from the direct item(s) into bh_result's page; bh_result is
 * then mapped to block 0, marked uptodate, and 0 is returned.
 */
244 | static int _get_block_create_0 (struct inode * inode, long block, | ||
245 | struct buffer_head * bh_result, | ||
246 | int args) | ||
247 | { | ||
248 | INITIALIZE_PATH (path); | ||
249 | struct cpu_key key; | ||
250 | struct buffer_head * bh; | ||
251 | struct item_head * ih, tmp_ih; | ||
252 | int fs_gen ; | ||
253 | int blocknr; | ||
254 | char * p = NULL; | ||
255 | int chars; | ||
256 | int ret ; | ||
257 | int done = 0 ; | ||
258 | unsigned long offset ; | ||
259 | |||
260 | // prepare the key to look for the 'block'-th block of file | ||
261 | make_cpu_key (&key, inode, | ||
262 | (loff_t)block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 3); | ||
263 | |||
/* Re-entered from below when the item moved while the page was being
   kmapped; in that case p is already non-NULL and every exit taken before
   the copy loop has to kunmap the page. */
264 | research: | ||
265 | if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND) { | ||
266 | pathrelse (&path); | ||
267 | if (p) | ||
268 | kunmap(bh_result->b_page) ; | ||
269 | // We do not return -ENOENT if there is a hole but page is uptodate, because it means | ||
270 | // That there is some MMAPED data associated with it that is yet to be written to disk. | ||
271 | if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) { | ||
272 | return -ENOENT ; | ||
273 | } | ||
274 | return 0 ; | ||
275 | } | ||
276 | |||
277 | // | ||
278 | bh = get_last_bh (&path); | ||
279 | ih = get_ih (&path); | ||
280 | if (is_indirect_le_ih (ih)) { | ||
281 | __u32 * ind_item = (__u32 *)B_I_PITEM (bh, ih); | ||
282 | |||
283 | /* FIXME: here we could cache indirect item or part of it in | ||
284 | the inode to avoid search_by_key in case of subsequent | ||
285 | access to file */ | ||
286 | blocknr = get_block_num(ind_item, path.pos_in_item) ; | ||
287 | ret = 0 ; | ||
288 | if (blocknr) { | ||
289 | map_bh(bh_result, inode->i_sb, blocknr); | ||
290 | if (path.pos_in_item == ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) { | ||
291 | set_buffer_boundary(bh_result); | ||
292 | } | ||
293 | } else | ||
294 | // We do not return -ENOENT if there is a hole but page is uptodate, because it means | ||
295 | // That there is some MMAPED data associated with it that is yet to be written to disk. | ||
296 | if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) { | ||
297 | ret = -ENOENT ; | ||
298 | } | ||
299 | |||
300 | pathrelse (&path); | ||
301 | if (p) | ||
302 | kunmap(bh_result->b_page) ; | ||
303 | return ret ; | ||
304 | } | ||
305 | |||
306 | // requested data are in direct item(s) | ||
307 | if (!(args & GET_BLOCK_READ_DIRECT)) { | ||
308 | // we are called by bmap. FIXME: we can not map block of file | ||
309 | // when it is stored in direct item(s) | ||
310 | pathrelse (&path); | ||
311 | if (p) | ||
312 | kunmap(bh_result->b_page) ; | ||
313 | return -ENOENT; | ||
314 | } | ||
315 | |||
316 | /* if we've got a direct item, and the buffer or page was uptodate, | ||
317 | ** we don't want to pull data off disk again. skip to the | ||
318 | ** end, where we map the buffer and return | ||
319 | */ | ||
320 | if (buffer_uptodate(bh_result)) { | ||
321 | goto finished ; | ||
322 | } else | ||
323 | /* | ||
324 | ** grab_tail_page can trigger calls to reiserfs_get_block on up to date | ||
325 | ** pages without any buffers. If the page is up to date, we don't want | ||
326 | ** read old data off disk. Set the up to date bit on the buffer instead | ||
327 | ** and jump to the end | ||
328 | */ | ||
329 | if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { | ||
330 | set_buffer_uptodate(bh_result); | ||
331 | goto finished ; | ||
332 | } | ||
333 | |||
334 | // read file tail into part of page | ||
335 | offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1) ; | ||
336 | fs_gen = get_generation(inode->i_sb) ; | ||
337 | copy_item_head (&tmp_ih, ih); | ||
338 | |||
339 | /* we only want to kmap if we are reading the tail into the page. | ||
340 | ** this is not the common case, so we don't kmap until we are | ||
341 | ** sure we need to. But, this means the item might move if | ||
342 | ** kmap schedules | ||
343 | */ | ||
344 | if (!p) { | ||
345 | p = (char *)kmap(bh_result->b_page) ; | ||
346 | if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { | ||
347 | goto research; | ||
348 | } | ||
349 | } | ||
/* p: start of the mapped page -> this block's position within the page;
   the whole block is zeroed first so bytes past the tail read as 0 */
350 | p += offset ; | ||
351 | memset (p, 0, inode->i_sb->s_blocksize); | ||
352 | do { | ||
353 | if (!is_direct_le_ih (ih)) { | ||
354 | BUG (); | ||
355 | } | ||
356 | /* make sure we don't read more bytes than actually exist in | ||
357 | ** the file. This can happen in odd cases where i_size isn't | ||
358 | ** correct, and when direct item padding results in a few | ||
359 | ** extra bytes at the end of the direct item | ||
360 | */ | ||
361 | if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) | ||
362 | break ; | ||
363 | if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) { | ||
364 | chars = inode->i_size - (le_ih_k_offset(ih) - 1) - path.pos_in_item; | ||
365 | done = 1 ; | ||
366 | } else { | ||
367 | chars = ih_item_len(ih) - path.pos_in_item; | ||
368 | } | ||
369 | memcpy (p, B_I_PITEM (bh, ih) + path.pos_in_item, chars); | ||
370 | |||
371 | if (done) | ||
372 | break ; | ||
373 | |||
374 | p += chars; | ||
375 | |||
376 | if (PATH_LAST_POSITION (&path) != (B_NR_ITEMS (bh) - 1)) | ||
377 | // we done, if read direct item is not the last item of | ||
378 | // node FIXME: we could try to check right delimiting key | ||
379 | // to see whether direct item continues in the right | ||
380 | // neighbor or rely on i_size | ||
381 | break; | ||
382 | |||
383 | // update key to look for the next piece | ||
384 | set_cpu_key_k_offset (&key, cpu_key_k_offset (&key) + chars); | ||
385 | if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND) | ||
386 | // we read something from tail, even if now we got IO_ERROR | ||
387 | break; | ||
388 | bh = get_last_bh (&path); | ||
389 | ih = get_ih (&path); | ||
390 | } while (1); | ||
391 | |||
392 | flush_dcache_page(bh_result->b_page) ; | ||
393 | kunmap(bh_result->b_page) ; | ||
394 | |||
395 | finished: | ||
396 | pathrelse (&path); | ||
397 | /* this buffer has valid data, but isn't valid for io. mapping it to | ||
398 | * block #0 tells the rest of reiserfs it just has a tail in it | ||
399 | */ | ||
400 | map_bh(bh_result, inode->i_sb, 0); | ||
401 | set_buffer_uptodate (bh_result); | ||
402 | return 0; | ||
403 | } | ||
404 | |||
405 | |||
406 | // this is called to create file map. So, _get_block_create_0 will not | ||
407 | // read direct item | ||
408 | static int reiserfs_bmap (struct inode * inode, sector_t block, | ||
409 | struct buffer_head * bh_result, int create) | ||
410 | { | ||
411 | if (!file_capable (inode, block)) | ||
412 | return -EFBIG; | ||
413 | |||
414 | reiserfs_write_lock(inode->i_sb); | ||
415 | /* do not read the direct item */ | ||
416 | _get_block_create_0 (inode, block, bh_result, 0) ; | ||
417 | reiserfs_write_unlock(inode->i_sb); | ||
418 | return 0; | ||
419 | } | ||
420 | |||
421 | /* special version of get_block that is only used by grab_tail_page right | ||
422 | ** now. It is sent to block_prepare_write, and when you try to get a | ||
423 | ** block past the end of the file (or a block from a hole) it returns | ||
424 | ** -ENOENT instead of a valid buffer. block_prepare_write expects to | ||
425 | ** be able to do i/o on the buffers returned, unless an error value | ||
426 | ** is also returned. | ||
427 | ** | ||
428 | ** So, this allows block_prepare_write to be used for reading a single block | ||
429 | ** in a page. Where it does not produce a valid page for holes, or past the | ||
430 | ** end of the file. This turns out to be exactly what we need for reading | ||
431 | ** tails for conversion. | ||
432 | ** | ||
433 | ** The point of the wrapper is forcing a certain value for create, even | ||
434 | ** though the VFS layer is calling this function with create==1. If you | ||
435 | ** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, | ||
436 | ** don't use this function. | ||
437 | */ | ||
438 | static int reiserfs_get_block_create_0 (struct inode * inode, sector_t block, | ||
439 | struct buffer_head * bh_result, int create) { | ||
440 | return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE) ; | ||
441 | } | ||
442 | |||
443 | /* This is special helper for reiserfs_get_block in case we are executing | ||
444 | direct_IO request. */ | ||
445 | static int reiserfs_get_blocks_direct_io(struct inode *inode, | ||
446 | sector_t iblock, | ||
447 | unsigned long max_blocks, | ||
448 | struct buffer_head *bh_result, | ||
449 | int create) | ||
450 | { | ||
451 | int ret ; | ||
452 | |||
453 | bh_result->b_page = NULL; | ||
454 | |||
455 | /* We set the b_size before reiserfs_get_block call since it is | ||
456 | referenced in convert_tail_for_hole() that may be called from | ||
457 | reiserfs_get_block() */ | ||
458 | bh_result->b_size = (1 << inode->i_blkbits); | ||
459 | |||
460 | ret = reiserfs_get_block(inode, iblock, bh_result, | ||
461 | create | GET_BLOCK_NO_DANGLE) ; | ||
462 | if (ret) | ||
463 | goto out; | ||
464 | |||
465 | /* don't allow direct io onto tail pages */ | ||
466 | if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { | ||
467 | /* make sure future calls to the direct io funcs for this offset | ||
468 | ** in the file fail by unmapping the buffer | ||
469 | */ | ||
470 | clear_buffer_mapped(bh_result); | ||
471 | ret = -EINVAL ; | ||
472 | } | ||
473 | /* Possible unpacked tail. Flush the data before pages have | ||
474 | disappeared */ | ||
475 | if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { | ||
476 | int err; | ||
477 | lock_kernel(); | ||
478 | err = reiserfs_commit_for_inode(inode); | ||
479 | REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; | ||
480 | unlock_kernel(); | ||
481 | if (err < 0) | ||
482 | ret = err; | ||
483 | } | ||
484 | out: | ||
485 | return ret ; | ||
486 | } | ||
487 | |||
488 | |||
489 | /* | ||
490 | ** helper function for when reiserfs_get_block is called for a hole | ||
491 | ** but the file tail is still in a direct item | ||
492 | ** bh_result is the buffer head for the hole | ||
493 | ** tail_offset is the offset of the start of the tail in the file | ||
494 | ** | ||
495 | ** This calls prepare_write, which will start a new transaction | ||
496 | ** you should not be in a transaction, or have any paths held when you | ||
497 | ** call this. | ||
498 | */ | ||
499 | static int convert_tail_for_hole(struct inode *inode, | ||
500 | struct buffer_head *bh_result, | ||
501 | loff_t tail_offset) { | ||
502 | unsigned long index ; | ||
503 | unsigned long tail_end ; | ||
504 | unsigned long tail_start ; | ||
505 | struct page * tail_page ; | ||
506 | struct page * hole_page = bh_result->b_page ; | ||
507 | int retval = 0 ; | ||
508 | |||
509 | if ((tail_offset & (bh_result->b_size - 1)) != 1) | ||
510 | return -EIO ; | ||
511 | |||
512 | /* always try to read until the end of the block */ | ||
513 | tail_start = tail_offset & (PAGE_CACHE_SIZE - 1) ; | ||
514 | tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ; | ||
515 | |||
516 | index = tail_offset >> PAGE_CACHE_SHIFT ; | ||
517 | /* hole_page can be zero in case of direct_io, we are sure | ||
518 | that we cannot get here if we write with O_DIRECT into | ||
519 | tail page */ | ||
520 | if (!hole_page || index != hole_page->index) { | ||
521 | tail_page = grab_cache_page(inode->i_mapping, index) ; | ||
522 | retval = -ENOMEM; | ||
523 | if (!tail_page) { | ||
524 | goto out ; | ||
525 | } | ||
526 | } else { | ||
527 | tail_page = hole_page ; | ||
528 | } | ||
529 | |||
530 | /* we don't have to make sure the conversion did not happen while | ||
531 | ** we were locking the page because anyone that could convert | ||
532 | ** must first take i_sem. | ||
533 | ** | ||
534 | ** We must fix the tail page for writing because it might have buffers | ||
535 | ** that are mapped, but have a block number of 0. This indicates tail | ||
536 | ** data that has been read directly into the page, and block_prepare_write | ||
537 | ** won't trigger a get_block in this case. | ||
538 | */ | ||
539 | fix_tail_page_for_writing(tail_page) ; | ||
540 | retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end); | ||
541 | if (retval) | ||
542 | goto unlock ; | ||
543 | |||
544 | /* tail conversion might change the data in the page */ | ||
545 | flush_dcache_page(tail_page) ; | ||
546 | |||
547 | retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end) ; | ||
548 | |||
549 | unlock: | ||
550 | if (tail_page != hole_page) { | ||
551 | unlock_page(tail_page) ; | ||
552 | page_cache_release(tail_page) ; | ||
553 | } | ||
554 | out: | ||
555 | return retval ; | ||
556 | } | ||
557 | |||
558 | static inline int _allocate_block(struct reiserfs_transaction_handle *th, | ||
559 | long block, | ||
560 | struct inode *inode, | ||
561 | b_blocknr_t *allocated_block_nr, | ||
562 | struct path * path, | ||
563 | int flags) { | ||
564 | BUG_ON (!th->t_trans_id); | ||
565 | |||
566 | #ifdef REISERFS_PREALLOCATE | ||
567 | if (!(flags & GET_BLOCK_NO_ISEM)) { | ||
568 | return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, path, block); | ||
569 | } | ||
570 | #endif | ||
571 | return reiserfs_new_unf_blocknrs (th, inode, allocated_block_nr, path, block); | ||
572 | } | ||
573 | |||
574 | int reiserfs_get_block (struct inode * inode, sector_t block, | ||
575 | struct buffer_head * bh_result, int create) | ||
576 | { | ||
577 | int repeat, retval = 0; | ||
578 | b_blocknr_t allocated_block_nr = 0;// b_blocknr_t is (unsigned) 32 bit int | ||
579 | INITIALIZE_PATH(path); | ||
580 | int pos_in_item; | ||
581 | struct cpu_key key; | ||
582 | struct buffer_head * bh, * unbh = NULL; | ||
583 | struct item_head * ih, tmp_ih; | ||
584 | __u32 * item; | ||
585 | int done; | ||
586 | int fs_gen; | ||
587 | struct reiserfs_transaction_handle *th = NULL; | ||
588 | /* space reserved in transaction batch: | ||
589 | . 3 balancings in direct->indirect conversion | ||
590 | . 1 block involved into reiserfs_update_sd() | ||
591 | XXX in practically impossible worst case direct2indirect() | ||
592 | can incur (much) more than 3 balancings. | ||
593 | quota update for user, group */ | ||
594 | int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS; | ||
595 | int version; | ||
596 | int dangle = 1; | ||
597 | loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ; | ||
598 | |||
599 | /* bad.... */ | ||
600 | reiserfs_write_lock(inode->i_sb); | ||
601 | version = get_inode_item_key_version (inode); | ||
602 | |||
603 | if (block < 0) { | ||
604 | reiserfs_write_unlock(inode->i_sb); | ||
605 | return -EIO; | ||
606 | } | ||
607 | |||
608 | if (!file_capable (inode, block)) { | ||
609 | reiserfs_write_unlock(inode->i_sb); | ||
610 | return -EFBIG; | ||
611 | } | ||
612 | |||
613 | /* if !create, we aren't changing the FS, so we don't need to | ||
614 | ** log anything, so we don't need to start a transaction | ||
615 | */ | ||
616 | if (!(create & GET_BLOCK_CREATE)) { | ||
617 | int ret ; | ||
618 | /* find number of block-th logical block of the file */ | ||
619 | ret = _get_block_create_0 (inode, block, bh_result, | ||
620 | create | GET_BLOCK_READ_DIRECT) ; | ||
621 | reiserfs_write_unlock(inode->i_sb); | ||
622 | return ret; | ||
623 | } | ||
624 | /* | ||
625 | * if we're already in a transaction, make sure to close | ||
626 | * any new transactions we start in this func | ||
627 | */ | ||
628 | if ((create & GET_BLOCK_NO_DANGLE) || | ||
629 | reiserfs_transaction_running(inode->i_sb)) | ||
630 | dangle = 0; | ||
631 | |||
632 | /* If file is of such a size, that it might have a tail and tails are enabled | ||
633 | ** we should mark it as possibly needing tail packing on close | ||
634 | */ | ||
635 | if ( (have_large_tails (inode->i_sb) && inode->i_size < i_block_size (inode)*4) || | ||
636 | (have_small_tails (inode->i_sb) && inode->i_size < i_block_size(inode)) ) | ||
637 | REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; | ||
638 | |||
639 | /* set the key of the first byte in the 'block'-th block of file */ | ||
640 | make_cpu_key (&key, inode, new_offset, | ||
641 | TYPE_ANY, 3/*key length*/); | ||
642 | if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { | ||
643 | start_trans: | ||
644 | th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); | ||
645 | if (!th) { | ||
646 | retval = -ENOMEM; | ||
647 | goto failure; | ||
648 | } | ||
649 | reiserfs_update_inode_transaction(inode) ; | ||
650 | } | ||
651 | research: | ||
652 | |||
653 | retval = search_for_position_by_key (inode->i_sb, &key, &path); | ||
654 | if (retval == IO_ERROR) { | ||
655 | retval = -EIO; | ||
656 | goto failure; | ||
657 | } | ||
658 | |||
659 | bh = get_last_bh (&path); | ||
660 | ih = get_ih (&path); | ||
661 | item = get_item (&path); | ||
662 | pos_in_item = path.pos_in_item; | ||
663 | |||
664 | fs_gen = get_generation (inode->i_sb); | ||
665 | copy_item_head (&tmp_ih, ih); | ||
666 | |||
667 | if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) { | ||
668 | /* we have to allocate block for the unformatted node */ | ||
669 | if (!th) { | ||
670 | pathrelse(&path) ; | ||
671 | goto start_trans; | ||
672 | } | ||
673 | |||
674 | repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create); | ||
675 | |||
676 | if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { | ||
677 | /* restart the transaction to give the journal a chance to free | ||
678 | ** some blocks. releases the path, so we have to go back to | ||
679 | ** research if we succeed on the second try | ||
680 | */ | ||
681 | SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; | ||
682 | retval = restart_transaction(th, inode, &path) ; | ||
683 | if (retval) | ||
684 | goto failure; | ||
685 | repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create); | ||
686 | |||
687 | if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) { | ||
688 | goto research ; | ||
689 | } | ||
690 | if (repeat == QUOTA_EXCEEDED) | ||
691 | retval = -EDQUOT; | ||
692 | else | ||
693 | retval = -ENOSPC; | ||
694 | goto failure; | ||
695 | } | ||
696 | |||
697 | if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { | ||
698 | goto research; | ||
699 | } | ||
700 | } | ||
701 | |||
702 | if (indirect_item_found (retval, ih)) { | ||
703 | b_blocknr_t unfm_ptr; | ||
704 | /* 'block'-th block is in the file already (there is | ||
705 | corresponding cell in some indirect item). But it may be | ||
706 | zero unformatted node pointer (hole) */ | ||
707 | unfm_ptr = get_block_num (item, pos_in_item); | ||
708 | if (unfm_ptr == 0) { | ||
709 | /* use allocated block to plug the hole */ | ||
710 | reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ; | ||
711 | if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { | ||
712 | reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; | ||
713 | goto research; | ||
714 | } | ||
715 | set_buffer_new(bh_result); | ||
716 | if (buffer_dirty(bh_result) && reiserfs_data_ordered(inode->i_sb)) | ||
717 | reiserfs_add_ordered_list(inode, bh_result); | ||
718 | put_block_num(item, pos_in_item, allocated_block_nr) ; | ||
719 | unfm_ptr = allocated_block_nr; | ||
720 | journal_mark_dirty (th, inode->i_sb, bh); | ||
721 | reiserfs_update_sd(th, inode) ; | ||
722 | } | ||
723 | set_block_dev_mapped(bh_result, unfm_ptr, inode); | ||
724 | pathrelse (&path); | ||
725 | retval = 0; | ||
726 | if (!dangle && th) | ||
727 | retval = reiserfs_end_persistent_transaction(th); | ||
728 | |||
729 | reiserfs_write_unlock(inode->i_sb); | ||
730 | |||
731 | /* the item was found, so new blocks were not added to the file | ||
732 | ** there is no need to make sure the inode is updated with this | ||
733 | ** transaction | ||
734 | */ | ||
735 | return retval; | ||
736 | } | ||
737 | |||
738 | if (!th) { | ||
739 | pathrelse(&path) ; | ||
740 | goto start_trans; | ||
741 | } | ||
742 | |||
743 | /* desired position is not found or is in the direct item. We have | ||
744 | to append file with holes up to 'block'-th block converting | ||
745 | direct items to indirect one if necessary */ | ||
746 | done = 0; | ||
747 | do { | ||
748 | if (is_statdata_le_ih (ih)) { | ||
749 | __u32 unp = 0; | ||
750 | struct cpu_key tmp_key; | ||
751 | |||
752 | /* indirect item has to be inserted */ | ||
753 | make_le_item_head (&tmp_ih, &key, version, 1, TYPE_INDIRECT, | ||
754 | UNFM_P_SIZE, 0/* free_space */); | ||
755 | |||
756 | if (cpu_key_k_offset (&key) == 1) { | ||
757 | /* we are going to add 'block'-th block to the file. Use | ||
758 | allocated block for that */ | ||
759 | unp = cpu_to_le32 (allocated_block_nr); | ||
760 | set_block_dev_mapped (bh_result, allocated_block_nr, inode); | ||
761 | set_buffer_new(bh_result); | ||
762 | done = 1; | ||
763 | } | ||
764 | tmp_key = key; // ;) | ||
765 | set_cpu_key_k_offset (&tmp_key, 1); | ||
766 | PATH_LAST_POSITION(&path) ++; | ||
767 | |||
768 | retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, inode, (char *)&unp); | ||
769 | if (retval) { | ||
770 | reiserfs_free_block (th, inode, allocated_block_nr, 1); | ||
771 | goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST | ||
772 | } | ||
773 | //mark_tail_converted (inode); | ||
774 | } else if (is_direct_le_ih (ih)) { | ||
775 | /* direct item has to be converted */ | ||
776 | loff_t tail_offset; | ||
777 | |||
778 | tail_offset = ((le_ih_k_offset (ih) - 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; | ||
779 | if (tail_offset == cpu_key_k_offset (&key)) { | ||
780 | /* direct item we just found fits into block we have | ||
781 | to map. Convert it into unformatted node: use | ||
782 | bh_result for the conversion */ | ||
783 | set_block_dev_mapped (bh_result, allocated_block_nr, inode); | ||
784 | unbh = bh_result; | ||
785 | done = 1; | ||
786 | } else { | ||
787 | /* we have to padd file tail stored in direct item(s) | ||
788 | up to block size and convert it to unformatted | ||
789 | node. FIXME: this should also get into page cache */ | ||
790 | |||
791 | pathrelse(&path) ; | ||
792 | /* | ||
793 | * ugly, but we can only end the transaction if | ||
794 | * we aren't nested | ||
795 | */ | ||
796 | BUG_ON (!th->t_refcount); | ||
797 | if (th->t_refcount == 1) { | ||
798 | retval = reiserfs_end_persistent_transaction(th); | ||
799 | th = NULL; | ||
800 | if (retval) | ||
801 | goto failure; | ||
802 | } | ||
803 | |||
804 | retval = convert_tail_for_hole(inode, bh_result, tail_offset) ; | ||
805 | if (retval) { | ||
806 | if ( retval != -ENOSPC ) | ||
807 | reiserfs_warning (inode->i_sb, "clm-6004: convert tail failed inode %lu, error %d", inode->i_ino, retval) ; | ||
808 | if (allocated_block_nr) { | ||
809 | /* the bitmap, the super, and the stat data == 3 */ | ||
810 | if (!th) | ||
811 | th = reiserfs_persistent_transaction(inode->i_sb,3); | ||
812 | if (th) | ||
813 | reiserfs_free_block (th,inode,allocated_block_nr,1); | ||
814 | } | ||
815 | goto failure ; | ||
816 | } | ||
817 | goto research ; | ||
818 | } | ||
819 | retval = direct2indirect (th, inode, &path, unbh, tail_offset); | ||
820 | if (retval) { | ||
821 | reiserfs_unmap_buffer(unbh); | ||
822 | reiserfs_free_block (th, inode, allocated_block_nr, 1); | ||
823 | goto failure; | ||
824 | } | ||
825 | /* it is important the set_buffer_uptodate is done after | ||
826 | ** the direct2indirect. The buffer might contain valid | ||
827 | ** data newer than the data on disk (read by readpage, changed, | ||
828 | ** and then sent here by writepage). direct2indirect needs | ||
829 | ** to know if unbh was already up to date, so it can decide | ||
830 | ** if the data in unbh needs to be replaced with data from | ||
831 | ** the disk | ||
832 | */ | ||
833 | set_buffer_uptodate (unbh); | ||
834 | |||
835 | /* unbh->b_page == NULL in case of DIRECT_IO request, this means | ||
836 | buffer will disappear shortly, so it should not be added to | ||
837 | */ | ||
838 | if ( unbh->b_page ) { | ||
839 | /* we've converted the tail, so we must | ||
840 | ** flush unbh before the transaction commits | ||
841 | */ | ||
842 | reiserfs_add_tail_list(inode, unbh) ; | ||
843 | |||
844 | /* mark it dirty now to prevent commit_write from adding | ||
845 | ** this buffer to the inode's dirty buffer list | ||
846 | */ | ||
847 | /* | ||
848 | * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty(). | ||
849 | * It's still atomic, but it sets the page dirty too, | ||
850 | * which makes it eligible for writeback at any time by the | ||
851 | * VM (which was also the case with __mark_buffer_dirty()) | ||
852 | */ | ||
853 | mark_buffer_dirty(unbh) ; | ||
854 | } | ||
855 | } else { | ||
856 | /* append indirect item with holes if needed, when appending | ||
857 | pointer to 'block'-th block use block, which is already | ||
858 | allocated */ | ||
859 | struct cpu_key tmp_key; | ||
860 | unp_t unf_single=0; // We use this in case we need to allocate only | ||
861 | // one block which is a fastpath | ||
862 | unp_t *un; | ||
863 | __u64 max_to_insert=MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE; | ||
864 | __u64 blocks_needed; | ||
865 | |||
866 | RFALSE( pos_in_item != ih_item_len(ih) / UNFM_P_SIZE, | ||
867 | "vs-804: invalid position for append"); | ||
868 | /* indirect item has to be appended, set up key of that position */ | ||
869 | make_cpu_key (&tmp_key, inode, | ||
870 | le_key_k_offset (version, &(ih->ih_key)) + op_bytes_number (ih, inode->i_sb->s_blocksize), | ||
871 | //pos_in_item * inode->i_sb->s_blocksize, | ||
872 | TYPE_INDIRECT, 3);// key type is unimportant | ||
873 | |||
874 | blocks_needed = 1 + ((cpu_key_k_offset (&key) - cpu_key_k_offset (&tmp_key)) >> inode->i_sb->s_blocksize_bits); | ||
875 | RFALSE( blocks_needed < 0, "green-805: invalid offset"); | ||
876 | |||
877 | if ( blocks_needed == 1 ) { | ||
878 | un = &unf_single; | ||
879 | } else { | ||
880 | un=kmalloc( min(blocks_needed,max_to_insert)*UNFM_P_SIZE, | ||
881 | GFP_ATOMIC); // We need to avoid scheduling. | ||
882 | if ( !un) { | ||
883 | un = &unf_single; | ||
884 | blocks_needed = 1; | ||
885 | max_to_insert = 0; | ||
886 | } else | ||
887 | memset(un, 0, UNFM_P_SIZE * min(blocks_needed,max_to_insert)); | ||
888 | } | ||
889 | if ( blocks_needed <= max_to_insert) { | ||
890 | /* we are going to add target block to the file. Use allocated | ||
891 | block for that */ | ||
892 | un[blocks_needed-1] = cpu_to_le32 (allocated_block_nr); | ||
893 | set_block_dev_mapped (bh_result, allocated_block_nr, inode); | ||
894 | set_buffer_new(bh_result); | ||
895 | done = 1; | ||
896 | } else { | ||
897 | /* paste hole to the indirect item */ | ||
898 | /* If kmalloc failed, max_to_insert becomes zero and it means we | ||
899 | only have space for one block */ | ||
900 | blocks_needed=max_to_insert?max_to_insert:1; | ||
901 | } | ||
902 | retval = reiserfs_paste_into_item (th, &path, &tmp_key, inode, (char *)un, UNFM_P_SIZE * blocks_needed); | ||
903 | |||
904 | if (blocks_needed != 1) | ||
905 | kfree(un); | ||
906 | |||
907 | if (retval) { | ||
908 | reiserfs_free_block (th, inode, allocated_block_nr, 1); | ||
909 | goto failure; | ||
910 | } | ||
911 | if (!done) { | ||
912 | /* We need to mark new file size in case this function will be | ||
913 | interrupted/aborted later on. And we may do this only for | ||
914 | holes. */ | ||
915 | inode->i_size += inode->i_sb->s_blocksize * blocks_needed; | ||
916 | } | ||
917 | } | ||
918 | |||
919 | if (done == 1) | ||
920 | break; | ||
921 | |||
922 | /* this loop could log more blocks than we had originally asked | ||
923 | ** for. So, we have to allow the transaction to end if it is | ||
924 | ** too big or too full. Update the inode so things are | ||
925 | ** consistent if we crash before the function returns | ||
926 | ** | ||
927 | ** release the path so that anybody waiting on the path before | ||
928 | ** ending their transaction will be able to continue. | ||
929 | */ | ||
930 | if (journal_transaction_should_end(th, th->t_blocks_allocated)) { | ||
931 | retval = restart_transaction(th, inode, &path) ; | ||
932 | if (retval) | ||
933 | goto failure; | ||
934 | } | ||
935 | /* inserting indirect pointers for a hole can take a | ||
936 | ** long time. reschedule if needed | ||
937 | */ | ||
938 | cond_resched(); | ||
939 | |||
940 | retval = search_for_position_by_key (inode->i_sb, &key, &path); | ||
941 | if (retval == IO_ERROR) { | ||
942 | retval = -EIO; | ||
943 | goto failure; | ||
944 | } | ||
945 | if (retval == POSITION_FOUND) { | ||
946 | reiserfs_warning (inode->i_sb, "vs-825: reiserfs_get_block: " | ||
947 | "%K should not be found", &key); | ||
948 | retval = -EEXIST; | ||
949 | if (allocated_block_nr) | ||
950 | reiserfs_free_block (th, inode, allocated_block_nr, 1); | ||
951 | pathrelse(&path) ; | ||
952 | goto failure; | ||
953 | } | ||
954 | bh = get_last_bh (&path); | ||
955 | ih = get_ih (&path); | ||
956 | item = get_item (&path); | ||
957 | pos_in_item = path.pos_in_item; | ||
958 | } while (1); | ||
959 | |||
960 | |||
961 | retval = 0; | ||
962 | |||
963 | failure: | ||
964 | if (th && (!dangle || (retval && !th->t_trans_id))) { | ||
965 | int err; | ||
966 | if (th->t_trans_id) | ||
967 | reiserfs_update_sd(th, inode); | ||
968 | err = reiserfs_end_persistent_transaction(th); | ||
969 | if (err) | ||
970 | retval = err; | ||
971 | } | ||
972 | |||
973 | reiserfs_write_unlock(inode->i_sb); | ||
974 | reiserfs_check_path(&path) ; | ||
975 | return retval; | ||
976 | } | ||
977 | |||
978 | static int | ||
979 | reiserfs_readpages(struct file *file, struct address_space *mapping, | ||
980 | struct list_head *pages, unsigned nr_pages) | ||
981 | { | ||
982 | return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); | ||
983 | } | ||
984 | |||
985 | /* Compute real number of used bytes by file | ||
986 | * Following three functions can go away when we'll have enough space in stat item | ||
987 | */ | ||
988 | static int real_space_diff(struct inode *inode, int sd_size) | ||
989 | { | ||
990 | int bytes; | ||
991 | loff_t blocksize = inode->i_sb->s_blocksize ; | ||
992 | |||
993 | if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) | ||
994 | return sd_size ; | ||
995 | |||
996 | /* End of file is also in full block with indirect reference, so round | ||
997 | ** up to the next block. | ||
998 | ** | ||
999 | ** there is just no way to know if the tail is actually packed | ||
1000 | ** on the file, so we have to assume it isn't. When we pack the | ||
1001 | ** tail, we add 4 bytes to pretend there really is an unformatted | ||
1002 | ** node pointer | ||
1003 | */ | ||
1004 | bytes = ((inode->i_size + (blocksize-1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + sd_size; | ||
1005 | return bytes ; | ||
1006 | } | ||
1007 | |||
1008 | static inline loff_t to_real_used_space(struct inode *inode, ulong blocks, | ||
1009 | int sd_size) | ||
1010 | { | ||
1011 | if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { | ||
1012 | return inode->i_size + (loff_t)(real_space_diff(inode, sd_size)) ; | ||
1013 | } | ||
1014 | return ((loff_t)real_space_diff(inode, sd_size)) + (((loff_t)blocks) << 9); | ||
1015 | } | ||
1016 | |||
1017 | /* Compute number of blocks used by file in ReiserFS counting */ | ||
1018 | static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) | ||
1019 | { | ||
1020 | loff_t bytes = inode_get_bytes(inode) ; | ||
1021 | loff_t real_space = real_space_diff(inode, sd_size) ; | ||
1022 | |||
1023 | /* keeps fsck and non-quota versions of reiserfs happy */ | ||
1024 | if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { | ||
1025 | bytes += (loff_t)511 ; | ||
1026 | } | ||
1027 | |||
1028 | /* files from before the quota patch might i_blocks such that | ||
1029 | ** bytes < real_space. Deal with that here to prevent it from | ||
1030 | ** going negative. | ||
1031 | */ | ||
1032 | if (bytes < real_space) | ||
1033 | return 0 ; | ||
1034 | return (bytes - real_space) >> 9; | ||
1035 | } | ||
1036 | |||
//
// BAD: new directories have stat data of the new type while all their other
// items are of the old type. The version stored in the inode describes the
// body items, so update_stat_data cannot rely on the inode and has to check
// the item version directly.
//
1043 | |||
1044 | // called by read_locked_inode | ||
1045 | static void init_inode (struct inode * inode, struct path * path) | ||
1046 | { | ||
1047 | struct buffer_head * bh; | ||
1048 | struct item_head * ih; | ||
1049 | __u32 rdev; | ||
1050 | //int version = ITEM_VERSION_1; | ||
1051 | |||
1052 | bh = PATH_PLAST_BUFFER (path); | ||
1053 | ih = PATH_PITEM_HEAD (path); | ||
1054 | |||
1055 | |||
1056 | copy_key (INODE_PKEY (inode), &(ih->ih_key)); | ||
1057 | inode->i_blksize = reiserfs_default_io_size; | ||
1058 | |||
1059 | INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list )); | ||
1060 | REISERFS_I(inode)->i_flags = 0; | ||
1061 | REISERFS_I(inode)->i_prealloc_block = 0; | ||
1062 | REISERFS_I(inode)->i_prealloc_count = 0; | ||
1063 | REISERFS_I(inode)->i_trans_id = 0; | ||
1064 | REISERFS_I(inode)->i_jl = NULL; | ||
1065 | REISERFS_I(inode)->i_acl_access = NULL; | ||
1066 | REISERFS_I(inode)->i_acl_default = NULL; | ||
1067 | init_rwsem (&REISERFS_I(inode)->xattr_sem); | ||
1068 | |||
1069 | if (stat_data_v1 (ih)) { | ||
1070 | struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih); | ||
1071 | unsigned long blocks; | ||
1072 | |||
1073 | set_inode_item_key_version (inode, KEY_FORMAT_3_5); | ||
1074 | set_inode_sd_version (inode, STAT_DATA_V1); | ||
1075 | inode->i_mode = sd_v1_mode(sd); | ||
1076 | inode->i_nlink = sd_v1_nlink(sd); | ||
1077 | inode->i_uid = sd_v1_uid(sd); | ||
1078 | inode->i_gid = sd_v1_gid(sd); | ||
1079 | inode->i_size = sd_v1_size(sd); | ||
1080 | inode->i_atime.tv_sec = sd_v1_atime(sd); | ||
1081 | inode->i_mtime.tv_sec = sd_v1_mtime(sd); | ||
1082 | inode->i_ctime.tv_sec = sd_v1_ctime(sd); | ||
1083 | inode->i_atime.tv_nsec = 0; | ||
1084 | inode->i_ctime.tv_nsec = 0; | ||
1085 | inode->i_mtime.tv_nsec = 0; | ||
1086 | |||
1087 | inode->i_blocks = sd_v1_blocks(sd); | ||
1088 | inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id); | ||
1089 | blocks = (inode->i_size + 511) >> 9; | ||
1090 | blocks = _ROUND_UP (blocks, inode->i_sb->s_blocksize >> 9); | ||
1091 | if (inode->i_blocks > blocks) { | ||
1092 | // there was a bug in <=3.5.23 when i_blocks could take negative | ||
1093 | // values. Starting from 3.5.17 this value could even be stored in | ||
1094 | // stat data. For such files we set i_blocks based on file | ||
1095 | // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be | ||
1096 | // only updated if file's inode will ever change | ||
1097 | inode->i_blocks = blocks; | ||
1098 | } | ||
1099 | |||
1100 | rdev = sd_v1_rdev(sd); | ||
1101 | REISERFS_I(inode)->i_first_direct_byte = sd_v1_first_direct_byte(sd); | ||
1102 | /* an early bug in the quota code can give us an odd number for the | ||
1103 | ** block count. This is incorrect, fix it here. | ||
1104 | */ | ||
1105 | if (inode->i_blocks & 1) { | ||
1106 | inode->i_blocks++ ; | ||
1107 | } | ||
1108 | inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks, | ||
1109 | SD_V1_SIZE)); | ||
1110 | /* nopack is initially zero for v1 objects. For v2 objects, | ||
1111 | nopack is initialised from sd_attrs */ | ||
1112 | REISERFS_I(inode)->i_flags &= ~i_nopack_mask; | ||
1113 | } else { | ||
1114 | // new stat data found, but object may have old items | ||
1115 | // (directories and symlinks) | ||
1116 | struct stat_data * sd = (struct stat_data *)B_I_PITEM (bh, ih); | ||
1117 | |||
1118 | inode->i_mode = sd_v2_mode(sd); | ||
1119 | inode->i_nlink = sd_v2_nlink(sd); | ||
1120 | inode->i_uid = sd_v2_uid(sd); | ||
1121 | inode->i_size = sd_v2_size(sd); | ||
1122 | inode->i_gid = sd_v2_gid(sd); | ||
1123 | inode->i_mtime.tv_sec = sd_v2_mtime(sd); | ||
1124 | inode->i_atime.tv_sec = sd_v2_atime(sd); | ||
1125 | inode->i_ctime.tv_sec = sd_v2_ctime(sd); | ||
1126 | inode->i_ctime.tv_nsec = 0; | ||
1127 | inode->i_mtime.tv_nsec = 0; | ||
1128 | inode->i_atime.tv_nsec = 0; | ||
1129 | inode->i_blocks = sd_v2_blocks(sd); | ||
1130 | rdev = sd_v2_rdev(sd); | ||
1131 | if( S_ISCHR( inode -> i_mode ) || S_ISBLK( inode -> i_mode ) ) | ||
1132 | inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id); | ||
1133 | else | ||
1134 | inode->i_generation = sd_v2_generation(sd); | ||
1135 | |||
1136 | if (S_ISDIR (inode->i_mode) || S_ISLNK (inode->i_mode)) | ||
1137 | set_inode_item_key_version (inode, KEY_FORMAT_3_5); | ||
1138 | else | ||
1139 | set_inode_item_key_version (inode, KEY_FORMAT_3_6); | ||
1140 | REISERFS_I(inode)->i_first_direct_byte = 0; | ||
1141 | set_inode_sd_version (inode, STAT_DATA_V2); | ||
1142 | inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks, | ||
1143 | SD_V2_SIZE)); | ||
1144 | /* read persistent inode attributes from sd and initalise | ||
1145 | generic inode flags from them */ | ||
1146 | REISERFS_I(inode)->i_attrs = sd_v2_attrs( sd ); | ||
1147 | sd_attrs_to_i_attrs( sd_v2_attrs( sd ), inode ); | ||
1148 | } | ||
1149 | |||
1150 | pathrelse (path); | ||
1151 | if (S_ISREG (inode->i_mode)) { | ||
1152 | inode->i_op = &reiserfs_file_inode_operations; | ||
1153 | inode->i_fop = &reiserfs_file_operations; | ||
1154 | inode->i_mapping->a_ops = &reiserfs_address_space_operations ; | ||
1155 | } else if (S_ISDIR (inode->i_mode)) { | ||
1156 | inode->i_op = &reiserfs_dir_inode_operations; | ||
1157 | inode->i_fop = &reiserfs_dir_operations; | ||
1158 | } else if (S_ISLNK (inode->i_mode)) { | ||
1159 | inode->i_op = &reiserfs_symlink_inode_operations; | ||
1160 | inode->i_mapping->a_ops = &reiserfs_address_space_operations; | ||
1161 | } else { | ||
1162 | inode->i_blocks = 0; | ||
1163 | inode->i_op = &reiserfs_special_inode_operations; | ||
1164 | init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); | ||
1165 | } | ||
1166 | } | ||
1167 | |||
1168 | |||
1169 | // update new stat data with inode fields | ||
1170 | static void inode2sd (void * sd, struct inode * inode, loff_t size) | ||
1171 | { | ||
1172 | struct stat_data * sd_v2 = (struct stat_data *)sd; | ||
1173 | __u16 flags; | ||
1174 | |||
1175 | set_sd_v2_mode(sd_v2, inode->i_mode ); | ||
1176 | set_sd_v2_nlink(sd_v2, inode->i_nlink ); | ||
1177 | set_sd_v2_uid(sd_v2, inode->i_uid ); | ||
1178 | set_sd_v2_size(sd_v2, size ); | ||
1179 | set_sd_v2_gid(sd_v2, inode->i_gid ); | ||
1180 | set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec ); | ||
1181 | set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec ); | ||
1182 | set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec ); | ||
1183 | set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); | ||
1184 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) | ||
1185 | set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); | ||
1186 | else | ||
1187 | set_sd_v2_generation(sd_v2, inode->i_generation); | ||
1188 | flags = REISERFS_I(inode)->i_attrs; | ||
1189 | i_attrs_to_sd_attrs( inode, &flags ); | ||
1190 | set_sd_v2_attrs( sd_v2, flags ); | ||
1191 | } | ||
1192 | |||
1193 | |||
1194 | // used to copy inode's fields to old stat data | ||
1195 | static void inode2sd_v1 (void * sd, struct inode * inode, loff_t size) | ||
1196 | { | ||
1197 | struct stat_data_v1 * sd_v1 = (struct stat_data_v1 *)sd; | ||
1198 | |||
1199 | set_sd_v1_mode(sd_v1, inode->i_mode ); | ||
1200 | set_sd_v1_uid(sd_v1, inode->i_uid ); | ||
1201 | set_sd_v1_gid(sd_v1, inode->i_gid ); | ||
1202 | set_sd_v1_nlink(sd_v1, inode->i_nlink ); | ||
1203 | set_sd_v1_size(sd_v1, size ); | ||
1204 | set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec ); | ||
1205 | set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec ); | ||
1206 | set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec ); | ||
1207 | |||
1208 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) | ||
1209 | set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); | ||
1210 | else | ||
1211 | set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); | ||
1212 | |||
1213 | // Sigh. i_first_direct_byte is back | ||
1214 | set_sd_v1_first_direct_byte(sd_v1, REISERFS_I(inode)->i_first_direct_byte); | ||
1215 | } | ||
1216 | |||
1217 | |||
1218 | /* NOTE, you must prepare the buffer head before sending it here, | ||
1219 | ** and then log it after the call | ||
1220 | */ | ||
1221 | static void update_stat_data (struct path * path, struct inode * inode, | ||
1222 | loff_t size) | ||
1223 | { | ||
1224 | struct buffer_head * bh; | ||
1225 | struct item_head * ih; | ||
1226 | |||
1227 | bh = PATH_PLAST_BUFFER (path); | ||
1228 | ih = PATH_PITEM_HEAD (path); | ||
1229 | |||
1230 | if (!is_statdata_le_ih (ih)) | ||
1231 | reiserfs_panic (inode->i_sb, "vs-13065: update_stat_data: key %k, found item %h", | ||
1232 | INODE_PKEY (inode), ih); | ||
1233 | |||
1234 | if (stat_data_v1 (ih)) { | ||
1235 | // path points to old stat data | ||
1236 | inode2sd_v1 (B_I_PITEM (bh, ih), inode, size); | ||
1237 | } else { | ||
1238 | inode2sd (B_I_PITEM (bh, ih), inode, size); | ||
1239 | } | ||
1240 | |||
1241 | return; | ||
1242 | } | ||
1243 | |||
1244 | |||
1245 | void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th, | ||
1246 | struct inode * inode, loff_t size) | ||
1247 | { | ||
1248 | struct cpu_key key; | ||
1249 | INITIALIZE_PATH(path); | ||
1250 | struct buffer_head *bh ; | ||
1251 | int fs_gen ; | ||
1252 | struct item_head *ih, tmp_ih ; | ||
1253 | int retval; | ||
1254 | |||
1255 | BUG_ON (!th->t_trans_id); | ||
1256 | |||
1257 | make_cpu_key (&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);//key type is unimportant | ||
1258 | |||
1259 | for(;;) { | ||
1260 | int pos; | ||
1261 | /* look for the object's stat data */ | ||
1262 | retval = search_item (inode->i_sb, &key, &path); | ||
1263 | if (retval == IO_ERROR) { | ||
1264 | reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: " | ||
1265 | "i/o failure occurred trying to update %K stat data", | ||
1266 | &key); | ||
1267 | return; | ||
1268 | } | ||
1269 | if (retval == ITEM_NOT_FOUND) { | ||
1270 | pos = PATH_LAST_POSITION (&path); | ||
1271 | pathrelse(&path) ; | ||
1272 | if (inode->i_nlink == 0) { | ||
1273 | /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found");*/ | ||
1274 | return; | ||
1275 | } | ||
1276 | reiserfs_warning (inode->i_sb, "vs-13060: reiserfs_update_sd: " | ||
1277 | "stat data of object %k (nlink == %d) not found (pos %d)", | ||
1278 | INODE_PKEY (inode), inode->i_nlink, pos); | ||
1279 | reiserfs_check_path(&path) ; | ||
1280 | return; | ||
1281 | } | ||
1282 | |||
1283 | /* sigh, prepare_for_journal might schedule. When it schedules the | ||
1284 | ** FS might change. We have to detect that, and loop back to the | ||
1285 | ** search if the stat data item has moved | ||
1286 | */ | ||
1287 | bh = get_last_bh(&path) ; | ||
1288 | ih = get_ih(&path) ; | ||
1289 | copy_item_head (&tmp_ih, ih); | ||
1290 | fs_gen = get_generation (inode->i_sb); | ||
1291 | reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ; | ||
1292 | if (fs_changed (fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) { | ||
1293 | reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; | ||
1294 | continue ; /* Stat_data item has been moved after scheduling. */ | ||
1295 | } | ||
1296 | break; | ||
1297 | } | ||
1298 | update_stat_data (&path, inode, size); | ||
1299 | journal_mark_dirty(th, th->t_super, bh) ; | ||
1300 | pathrelse (&path); | ||
1301 | return; | ||
1302 | } | ||
1303 | |||
1304 | /* reiserfs_read_locked_inode is called to read the inode off disk, and it | ||
1305 | ** does a make_bad_inode when things go wrong. But, we need to make sure | ||
1306 | ** and clear the key in the private portion of the inode, otherwise a | ||
1307 | ** corresponding iput might try to delete whatever object the inode last | ||
1308 | ** represented. | ||
1309 | */ | ||
1310 | static void reiserfs_make_bad_inode(struct inode *inode) { | ||
1311 | memset(INODE_PKEY(inode), 0, KEY_SIZE); | ||
1312 | make_bad_inode(inode); | ||
1313 | } | ||
1314 | |||
1315 | // | ||
1316 | // initially this function was derived from minix or ext2's analog and | ||
1317 | // evolved as the prototype did | ||
1318 | // | ||
1319 | |||
1320 | int reiserfs_init_locked_inode (struct inode * inode, void *p) | ||
1321 | { | ||
1322 | struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p ; | ||
1323 | inode->i_ino = args->objectid; | ||
1324 | INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid); | ||
1325 | return 0; | ||
1326 | } | ||
1327 | |||
1328 | /* looks for stat data in the tree, and fills up the fields of in-core | ||
1329 | inode stat data fields */ | ||
1330 | void reiserfs_read_locked_inode (struct inode * inode, struct reiserfs_iget_args *args) | ||
1331 | { | ||
1332 | INITIALIZE_PATH (path_to_sd); | ||
1333 | struct cpu_key key; | ||
1334 | unsigned long dirino; | ||
1335 | int retval; | ||
1336 | |||
1337 | dirino = args->dirid ; | ||
1338 | |||
1339 | /* set version 1, version 2 could be used too, because stat data | ||
1340 | key is the same in both versions */ | ||
1341 | key.version = KEY_FORMAT_3_5; | ||
1342 | key.on_disk_key.k_dir_id = dirino; | ||
1343 | key.on_disk_key.k_objectid = inode->i_ino; | ||
1344 | key.on_disk_key.u.k_offset_v1.k_offset = SD_OFFSET; | ||
1345 | key.on_disk_key.u.k_offset_v1.k_uniqueness = SD_UNIQUENESS; | ||
1346 | |||
1347 | /* look for the object's stat data */ | ||
1348 | retval = search_item (inode->i_sb, &key, &path_to_sd); | ||
1349 | if (retval == IO_ERROR) { | ||
1350 | reiserfs_warning (inode->i_sb, "vs-13070: reiserfs_read_locked_inode: " | ||
1351 | "i/o failure occurred trying to find stat data of %K", | ||
1352 | &key); | ||
1353 | reiserfs_make_bad_inode(inode) ; | ||
1354 | return; | ||
1355 | } | ||
1356 | if (retval != ITEM_FOUND) { | ||
1357 | /* a stale NFS handle can trigger this without it being an error */ | ||
1358 | pathrelse (&path_to_sd); | ||
1359 | reiserfs_make_bad_inode(inode) ; | ||
1360 | inode->i_nlink = 0; | ||
1361 | return; | ||
1362 | } | ||
1363 | |||
1364 | init_inode (inode, &path_to_sd); | ||
1365 | |||
1366 | /* It is possible that knfsd is trying to access inode of a file | ||
1367 | that is being removed from the disk by some other thread. As we | ||
1368 | update sd on unlink all that is required is to check for nlink | ||
1369 | here. This bug was first found by Sizif when debugging | ||
1370 | SquidNG/Butterfly, forgotten, and found again after Philippe | ||
1371 | Gramoulle <philippe.gramoulle@mmania.com> reproduced it. | ||
1372 | |||
1373 | More logical fix would require changes in fs/inode.c:iput() to | ||
1374 | remove inode from hash-table _after_ fs cleaned disk stuff up and | ||
1375 | in iget() to return NULL if I_FREEING inode is found in | ||
1376 | hash-table. */ | ||
1377 | /* Currently there is one place where it's ok to meet inode with | ||
1378 | nlink==0: processing of open-unlinked and half-truncated files | ||
1379 | during mount (fs/reiserfs/super.c:finish_unfinished()). */ | ||
1380 | if( ( inode -> i_nlink == 0 ) && | ||
1381 | ! REISERFS_SB(inode -> i_sb) -> s_is_unlinked_ok ) { | ||
1382 | reiserfs_warning (inode->i_sb, | ||
1383 | "vs-13075: reiserfs_read_locked_inode: " | ||
1384 | "dead inode read from disk %K. " | ||
1385 | "This is likely to be race with knfsd. Ignore", | ||
1386 | &key ); | ||
1387 | reiserfs_make_bad_inode( inode ); | ||
1388 | } | ||
1389 | |||
1390 | reiserfs_check_path(&path_to_sd) ; /* init inode should be relsing */ | ||
1391 | |||
1392 | } | ||
1393 | |||
1394 | /** | ||
1395 | * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked(). | ||
1396 | * | ||
1397 | * @inode: inode from hash table to check | ||
1398 | * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args. | ||
1399 | * | ||
1400 | * This function is called by iget5_locked() to distinguish reiserfs inodes | ||
1401 | * having the same inode numbers. Such inodes can only exist due to some | ||
1402 | * error condition. One of them should be bad. Inodes with identical | ||
1403 | * inode numbers (objectids) are distinguished by parent directory ids. | ||
1404 | * | ||
1405 | */ | ||
1406 | int reiserfs_find_actor( struct inode *inode, void *opaque ) | ||
1407 | { | ||
1408 | struct reiserfs_iget_args *args; | ||
1409 | |||
1410 | args = opaque; | ||
1411 | /* args is already in CPU order */ | ||
1412 | return (inode->i_ino == args->objectid) && | ||
1413 | (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid); | ||
1414 | } | ||
1415 | |||
1416 | struct inode * reiserfs_iget (struct super_block * s, const struct cpu_key * key) | ||
1417 | { | ||
1418 | struct inode * inode; | ||
1419 | struct reiserfs_iget_args args ; | ||
1420 | |||
1421 | args.objectid = key->on_disk_key.k_objectid ; | ||
1422 | args.dirid = key->on_disk_key.k_dir_id ; | ||
1423 | inode = iget5_locked (s, key->on_disk_key.k_objectid, | ||
1424 | reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); | ||
1425 | if (!inode) | ||
1426 | return ERR_PTR(-ENOMEM) ; | ||
1427 | |||
1428 | if (inode->i_state & I_NEW) { | ||
1429 | reiserfs_read_locked_inode(inode, &args); | ||
1430 | unlock_new_inode(inode); | ||
1431 | } | ||
1432 | |||
1433 | if (comp_short_keys (INODE_PKEY (inode), key) || is_bad_inode (inode)) { | ||
1434 | /* either due to i/o error or a stale NFS handle */ | ||
1435 | iput (inode); | ||
1436 | inode = NULL; | ||
1437 | } | ||
1438 | return inode; | ||
1439 | } | ||
1440 | |||
1441 | struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp) | ||
1442 | { | ||
1443 | __u32 *data = vobjp; | ||
1444 | struct cpu_key key ; | ||
1445 | struct dentry *result; | ||
1446 | struct inode *inode; | ||
1447 | |||
1448 | key.on_disk_key.k_objectid = data[0] ; | ||
1449 | key.on_disk_key.k_dir_id = data[1] ; | ||
1450 | reiserfs_write_lock(sb); | ||
1451 | inode = reiserfs_iget(sb, &key) ; | ||
1452 | if (inode && !IS_ERR(inode) && data[2] != 0 && | ||
1453 | data[2] != inode->i_generation) { | ||
1454 | iput(inode) ; | ||
1455 | inode = NULL ; | ||
1456 | } | ||
1457 | reiserfs_write_unlock(sb); | ||
1458 | if (!inode) | ||
1459 | inode = ERR_PTR(-ESTALE); | ||
1460 | if (IS_ERR(inode)) | ||
1461 | return ERR_PTR(PTR_ERR(inode)); | ||
1462 | result = d_alloc_anon(inode); | ||
1463 | if (!result) { | ||
1464 | iput(inode); | ||
1465 | return ERR_PTR(-ENOMEM); | ||
1466 | } | ||
1467 | return result; | ||
1468 | } | ||
1469 | |||
1470 | struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 *data, | ||
1471 | int len, int fhtype, | ||
1472 | int (*acceptable)(void *contect, struct dentry *de), | ||
1473 | void *context) { | ||
1474 | __u32 obj[3], parent[3]; | ||
1475 | |||
1476 | /* fhtype happens to reflect the number of u32s encoded. | ||
1477 | * due to a bug in earlier code, fhtype might indicate there | ||
1478 | * are more u32s then actually fitted. | ||
1479 | * so if fhtype seems to be more than len, reduce fhtype. | ||
1480 | * Valid types are: | ||
1481 | * 2 - objectid + dir_id - legacy support | ||
1482 | * 3 - objectid + dir_id + generation | ||
1483 | * 4 - objectid + dir_id + objectid and dirid of parent - legacy | ||
1484 | * 5 - objectid + dir_id + generation + objectid and dirid of parent | ||
1485 | * 6 - as above plus generation of directory | ||
1486 | * 6 does not fit in NFSv2 handles | ||
1487 | */ | ||
1488 | if (fhtype > len) { | ||
1489 | if (fhtype != 6 || len != 5) | ||
1490 | reiserfs_warning (sb, "nfsd/reiserfs, fhtype=%d, len=%d - odd", | ||
1491 | fhtype, len); | ||
1492 | fhtype = 5; | ||
1493 | } | ||
1494 | |||
1495 | obj[0] = data[0]; | ||
1496 | obj[1] = data[1]; | ||
1497 | if (fhtype == 3 || fhtype >= 5) | ||
1498 | obj[2] = data[2]; | ||
1499 | else obj[2] = 0; /* generation number */ | ||
1500 | |||
1501 | if (fhtype >= 4) { | ||
1502 | parent[0] = data[fhtype>=5?3:2] ; | ||
1503 | parent[1] = data[fhtype>=5?4:3] ; | ||
1504 | if (fhtype == 6) | ||
1505 | parent[2] = data[5]; | ||
1506 | else parent[2] = 0; | ||
1507 | } | ||
1508 | return sb->s_export_op->find_exported_dentry(sb, obj, fhtype < 4 ? NULL : parent, | ||
1509 | acceptable, context); | ||
1510 | } | ||
1511 | |||
1512 | int reiserfs_encode_fh(struct dentry *dentry, __u32 *data, int *lenp, int need_parent) { | ||
1513 | struct inode *inode = dentry->d_inode ; | ||
1514 | int maxlen = *lenp; | ||
1515 | |||
1516 | if (maxlen < 3) | ||
1517 | return 255 ; | ||
1518 | |||
1519 | data[0] = inode->i_ino ; | ||
1520 | data[1] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ; | ||
1521 | data[2] = inode->i_generation ; | ||
1522 | *lenp = 3 ; | ||
1523 | /* no room for directory info? return what we've stored so far */ | ||
1524 | if (maxlen < 5 || ! need_parent) | ||
1525 | return 3 ; | ||
1526 | |||
1527 | spin_lock(&dentry->d_lock); | ||
1528 | inode = dentry->d_parent->d_inode ; | ||
1529 | data[3] = inode->i_ino ; | ||
1530 | data[4] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ; | ||
1531 | *lenp = 5 ; | ||
1532 | if (maxlen >= 6) { | ||
1533 | data[5] = inode->i_generation ; | ||
1534 | *lenp = 6 ; | ||
1535 | } | ||
1536 | spin_unlock(&dentry->d_lock); | ||
1537 | return *lenp ; | ||
1538 | } | ||
1539 | |||
1540 | |||
1541 | /* looks for stat data, then copies fields to it, marks the buffer | ||
1542 | containing stat data as dirty */ | ||
1543 | /* reiserfs inodes are never really dirty, since the dirty inode call | ||
1544 | ** always logs them. This call allows the VFS inode marking routines | ||
1545 | ** to properly mark inodes for datasync and such, but only actually | ||
1546 | ** does something when called for a synchronous update. | ||
1547 | */ | ||
1548 | int reiserfs_write_inode (struct inode * inode, int do_sync) { | ||
1549 | struct reiserfs_transaction_handle th ; | ||
1550 | int jbegin_count = 1 ; | ||
1551 | |||
1552 | if (inode->i_sb->s_flags & MS_RDONLY) | ||
1553 | return -EROFS; | ||
1554 | /* memory pressure can sometimes initiate write_inode calls with sync == 1, | ||
1555 | ** these cases are just when the system needs ram, not when the | ||
1556 | ** inode needs to reach disk for safety, and they can safely be | ||
1557 | ** ignored because the altered inode has already been logged. | ||
1558 | */ | ||
1559 | if (do_sync && !(current->flags & PF_MEMALLOC)) { | ||
1560 | reiserfs_write_lock(inode->i_sb); | ||
1561 | if (!journal_begin(&th, inode->i_sb, jbegin_count)) { | ||
1562 | reiserfs_update_sd (&th, inode); | ||
1563 | journal_end_sync(&th, inode->i_sb, jbegin_count) ; | ||
1564 | } | ||
1565 | reiserfs_write_unlock(inode->i_sb); | ||
1566 | } | ||
1567 | return 0; | ||
1568 | } | ||
1569 | |||
1570 | /* stat data of new object is inserted already, this inserts the item | ||
1571 | containing "." and ".." entries */ | ||
1572 | static int reiserfs_new_directory (struct reiserfs_transaction_handle *th, | ||
1573 | struct inode *inode, | ||
1574 | struct item_head * ih, struct path * path, | ||
1575 | struct inode * dir) | ||
1576 | { | ||
1577 | struct super_block * sb = th->t_super; | ||
1578 | char empty_dir [EMPTY_DIR_SIZE]; | ||
1579 | char * body = empty_dir; | ||
1580 | struct cpu_key key; | ||
1581 | int retval; | ||
1582 | |||
1583 | BUG_ON (!th->t_trans_id); | ||
1584 | |||
1585 | _make_cpu_key (&key, KEY_FORMAT_3_5, le32_to_cpu (ih->ih_key.k_dir_id), | ||
1586 | le32_to_cpu (ih->ih_key.k_objectid), DOT_OFFSET, TYPE_DIRENTRY, 3/*key length*/); | ||
1587 | |||
1588 | /* compose item head for new item. Directories consist of items of | ||
1589 | old type (ITEM_VERSION_1). Do not set key (second arg is 0), it | ||
1590 | is done by reiserfs_new_inode */ | ||
1591 | if (old_format_only (sb)) { | ||
1592 | make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); | ||
1593 | |||
1594 | make_empty_dir_item_v1 (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid, | ||
1595 | INODE_PKEY (dir)->k_dir_id, | ||
1596 | INODE_PKEY (dir)->k_objectid ); | ||
1597 | } else { | ||
1598 | make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2); | ||
1599 | |||
1600 | make_empty_dir_item (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid, | ||
1601 | INODE_PKEY (dir)->k_dir_id, | ||
1602 | INODE_PKEY (dir)->k_objectid ); | ||
1603 | } | ||
1604 | |||
1605 | /* look for place in the tree for new item */ | ||
1606 | retval = search_item (sb, &key, path); | ||
1607 | if (retval == IO_ERROR) { | ||
1608 | reiserfs_warning (sb, "vs-13080: reiserfs_new_directory: " | ||
1609 | "i/o failure occurred creating new directory"); | ||
1610 | return -EIO; | ||
1611 | } | ||
1612 | if (retval == ITEM_FOUND) { | ||
1613 | pathrelse (path); | ||
1614 | reiserfs_warning (sb, "vs-13070: reiserfs_new_directory: " | ||
1615 | "object with this key exists (%k)", &(ih->ih_key)); | ||
1616 | return -EEXIST; | ||
1617 | } | ||
1618 | |||
1619 | /* insert item, that is empty directory item */ | ||
1620 | return reiserfs_insert_item (th, path, &key, ih, inode, body); | ||
1621 | } | ||
1622 | |||
1623 | |||
1624 | /* stat data of object has been inserted, this inserts the item | ||
1625 | containing the body of symlink */ | ||
1626 | static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th, | ||
1627 | struct inode *inode, /* Inode of symlink */ | ||
1628 | struct item_head * ih, | ||
1629 | struct path * path, const char * symname, int item_len) | ||
1630 | { | ||
1631 | struct super_block * sb = th->t_super; | ||
1632 | struct cpu_key key; | ||
1633 | int retval; | ||
1634 | |||
1635 | BUG_ON (!th->t_trans_id); | ||
1636 | |||
1637 | _make_cpu_key (&key, KEY_FORMAT_3_5, | ||
1638 | le32_to_cpu (ih->ih_key.k_dir_id), | ||
1639 | le32_to_cpu (ih->ih_key.k_objectid), | ||
1640 | 1, TYPE_DIRECT, 3/*key length*/); | ||
1641 | |||
1642 | make_le_item_head (ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, 0/*free_space*/); | ||
1643 | |||
1644 | /* look for place in the tree for new item */ | ||
1645 | retval = search_item (sb, &key, path); | ||
1646 | if (retval == IO_ERROR) { | ||
1647 | reiserfs_warning (sb, "vs-13080: reiserfs_new_symlinik: " | ||
1648 | "i/o failure occurred creating new symlink"); | ||
1649 | return -EIO; | ||
1650 | } | ||
1651 | if (retval == ITEM_FOUND) { | ||
1652 | pathrelse (path); | ||
1653 | reiserfs_warning (sb, "vs-13080: reiserfs_new_symlink: " | ||
1654 | "object with this key exists (%k)", &(ih->ih_key)); | ||
1655 | return -EEXIST; | ||
1656 | } | ||
1657 | |||
1658 | /* insert item, that is body of symlink */ | ||
1659 | return reiserfs_insert_item (th, path, &key, ih, inode, symname); | ||
1660 | } | ||
1661 | |||
1662 | |||
/* inserts the stat data into the tree, and then calls
   reiserfs_new_directory (to insert ".", ".." item if new object is
   directory) or reiserfs_new_symlink (to insert symlink body if new
   object is symlink) or nothing (if new object is regular file)

   th      - running transaction (th->t_trans_id must be non-zero)
   dir     - parent directory; must be non-NULL and have i_nlink != 0
   mode    - mode of the new object, decides directory/symlink handling
   symname - symlink body, passed to reiserfs_new_symlink
   i_size  - initial size, see the comment at the parameter
   dentry  - passed on to reiserfs_inherit_default_acl
   inode   - the fresh in-core inode to fill in and insert

   Returns 0 on success and a negative errno otherwise.

   NOTE! uid and gid must already be set in the inode.  If we return
   non-zero due to an error, we have to drop the quota previously allocated
   for the fresh inode.  This can only be done outside a transaction, so
   if we return non-zero, we also end the transaction.  On every error
   path th->t_trans_id is zeroed and the inode is released with iput(). */
int reiserfs_new_inode (struct reiserfs_transaction_handle *th,
			struct inode * dir, int mode,
			const char * symname,
			/* 0 for regular, EMPTY_DIR_SIZE for dirs,
			   strlen (symname) for symlinks)*/
			loff_t i_size, struct dentry *dentry,
			struct inode *inode)
{
	struct super_block * sb;
	INITIALIZE_PATH (path_to_key);
	struct cpu_key key;
	struct item_head ih;
	struct stat_data sd;
	int retval;
	int err;

	BUG_ON (!th->t_trans_id);

	/* quota allocation failed: nothing to free, just end the transaction */
	if (DQUOT_ALLOC_INODE(inode)) {
		err = -EDQUOT;
		goto out_end_trans;
	}
	/* refuse to create anything without a parent or in a parent that
	   has no links left */
	if (!dir || !dir->i_nlink) {
		err = -EPERM;
		goto out_bad_inode;
	}

	sb = dir->i_sb;

	/* item head of new item */
	ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
	ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th));
	/* an objectid of 0 means no objectid could be obtained */
	if (!ih.ih_key.k_objectid) {
		err = -ENOMEM;
		goto out_bad_inode ;
	}
	if (old_format_only (sb))
		/* not a perfect generation count, as object ids can be reused, but
		** this is as good as reiserfs can do right now.
		** note that the private part of inode isn't filled in yet, we have
		** to use the directory.
		*/
		inode->i_generation = le32_to_cpu (INODE_PKEY (dir)->k_objectid);
	else
#if defined( USE_INODE_GENERATION_COUNTER )
		inode->i_generation = le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
#else
		inode->i_generation = ++event;
#endif

	/* fill stat data */
	inode->i_nlink = (S_ISDIR (mode) ? 2 : 1);

	/* uid and gid must already be set by the caller for quota init */

	/* symlink cannot be immutable or append only, right? */
	if( S_ISLNK( inode -> i_mode ) )
		inode -> i_flags &= ~ ( S_IMMUTABLE | S_APPEND );

	inode->i_mtime = inode->i_atime = inode->i_ctime =
		CURRENT_TIME_SEC;
	inode->i_size = i_size;
	inode->i_blocks = 0;
	inode->i_bytes = 0;
	REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
		U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/;

	/* reset the reiserfs private part of the inode */
	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list ));
	REISERFS_I(inode)->i_flags = 0;
	REISERFS_I(inode)->i_prealloc_block = 0;
	REISERFS_I(inode)->i_prealloc_count = 0;
	REISERFS_I(inode)->i_trans_id = 0;
	REISERFS_I(inode)->i_jl = NULL;
	/* the attributes selected by REISERFS_INHERIT_MASK are taken over
	   from the parent directory */
	REISERFS_I(inode)->i_attrs =
		REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
	sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode );
	REISERFS_I(inode)->i_acl_access = NULL;
	REISERFS_I(inode)->i_acl_default = NULL;
	init_rwsem (&REISERFS_I(inode)->xattr_sem);

	/* the stat data item is v1-sized on old format filesystems */
	if (old_format_only (sb))
		make_le_item_head (&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
	else
		make_le_item_head (&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);

	/* key to search for correct place for new stat data */
	_make_cpu_key (&key, KEY_FORMAT_3_6, le32_to_cpu (ih.ih_key.k_dir_id),
		       le32_to_cpu (ih.ih_key.k_objectid), SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/);

	/* find proper place for inserting of stat data */
	retval = search_item (sb, &key, &path_to_key);
	if (retval == IO_ERROR) {
		err = -EIO;
		goto out_bad_inode;
	}
	/* stat data with this key must not exist yet */
	if (retval == ITEM_FOUND) {
		pathrelse (&path_to_key);
		err = -EEXIST;
		goto out_bad_inode;
	}
	if (old_format_only (sb)) {
		if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
			pathrelse (&path_to_key);
			/* i_uid or i_gid is too big to be stored in stat data v3.5 */
			err = -EINVAL;
			goto out_bad_inode;
		}
		inode2sd_v1 (&sd, inode, inode->i_size);
	} else {
		inode2sd (&sd, inode, inode->i_size);
	}
	// these do not go to on-disk stat data
	inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid);
	inode->i_blksize = reiserfs_default_io_size;

	// store in in-core inode the key of stat data and version all
	// object items will have (directory items will have old offset
	// format, other new objects will consist of new items)
	memcpy (INODE_PKEY (inode), &(ih.ih_key), KEY_SIZE);
	if (old_format_only (sb) || S_ISDIR(mode) || S_ISLNK(mode))
		set_inode_item_key_version (inode, KEY_FORMAT_3_5);
	else
		set_inode_item_key_version (inode, KEY_FORMAT_3_6);
	if (old_format_only (sb))
		set_inode_sd_version (inode, STAT_DATA_V1);
	else
		set_inode_sd_version (inode, STAT_DATA_V2);

	/* insert the stat data into the tree */
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
	if (REISERFS_I(dir)->new_packing_locality)
		th->displace_new_blocks = 1;
#endif
	retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, inode, (char *)(&sd));
	if (retval) {
		err = retval;
		reiserfs_check_path(&path_to_key) ;
		goto out_bad_inode;
	}

#ifdef DISPLACE_NEW_PACKING_LOCALITIES
	if (!th->displace_new_blocks)
		REISERFS_I(dir)->new_packing_locality = 0;
#endif
	/* retval is 0 here; it is only overwritten for directories and
	   symlinks, so regular files skip the error branch below */
	if (S_ISDIR(mode)) {
		/* insert item with "." and ".." */
		retval = reiserfs_new_directory (th, inode, &ih, &path_to_key, dir);
	}

	if (S_ISLNK(mode)) {
		/* insert body of symlink */
		if (!old_format_only (sb))
			i_size = ROUND_UP(i_size);
		retval = reiserfs_new_symlink (th, inode, &ih, &path_to_key, symname, i_size);
	}
	/* from here on the stat data is in the tree, so failures end the
	   transaction themselves and leave via out_inserted_sd */
	if (retval) {
		err = retval;
		reiserfs_check_path(&path_to_key) ;
		journal_end(th, th->t_super, th->t_blocks_allocated);
		goto out_inserted_sd;
	}

	/* XXX CHECK THIS */
	if (reiserfs_posixacl (inode->i_sb)) {
		retval = reiserfs_inherit_default_acl (dir, dentry, inode);
		if (retval) {
			err = retval;
			reiserfs_check_path(&path_to_key) ;
			journal_end(th, th->t_super, th->t_blocks_allocated);
			goto out_inserted_sd;
		}
	} else if (inode->i_sb->s_flags & MS_POSIXACL) {
		reiserfs_warning (inode->i_sb, "ACLs aren't enabled in the fs, "
				  "but vfs thinks they are!");
	} else if (is_reiserfs_priv_object (dir)) {
		reiserfs_mark_inode_private (inode);
	}

	insert_inode_hash (inode);
	reiserfs_update_sd(th, inode);
	reiserfs_check_path(&path_to_key) ;

	return 0;

/* it looks like you can easily compress these two goto targets into
 * one.  Keeping it like this doesn't actually hurt anything, and they
 * are place holders for what the quota code actually needs.
 */
out_bad_inode:
	/* Invalidate the object, nothing was inserted yet */
	INODE_PKEY(inode)->k_objectid = 0;

	/* Quota change must be inside a transaction for journaling */
	DQUOT_FREE_INODE(inode);

out_end_trans:
	journal_end(th, th->t_super, th->t_blocks_allocated) ;
	/* Drop can be outside and it needs more credits so it's better to have it outside */
	DQUOT_DROP(inode);
	inode->i_flags |= S_NOQUOTA;
	make_bad_inode(inode);

out_inserted_sd:
	/* common tail of all error paths: drop the only link and release
	   the inode */
	inode->i_nlink = 0;
	th->t_trans_id = 0; /* so the caller can't use this handle later */
	iput(inode);
	return err;
}
1880 | |||
1881 | /* | ||
1882 | ** finds the tail page in the page cache, | ||
1883 | ** reads the last block in. | ||
1884 | ** | ||
1885 | ** On success, page_result is set to a locked, pinned page, and bh_result | ||
1886 | ** is set to an up to date buffer for the last block in the file. returns 0. | ||
1887 | ** | ||
1888 | ** tail conversion is not done, so bh_result might not be valid for writing | ||
1889 | ** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before | ||
1890 | ** trying to write the block. | ||
1891 | ** | ||
1892 | ** on failure, nonzero is returned, page_result and bh_result are untouched. | ||
1893 | */ | ||
1894 | static int grab_tail_page(struct inode *p_s_inode, | ||
1895 | struct page **page_result, | ||
1896 | struct buffer_head **bh_result) { | ||
1897 | |||
1898 | /* we want the page with the last byte in the file, | ||
1899 | ** not the page that will hold the next byte for appending | ||
1900 | */ | ||
1901 | unsigned long index = (p_s_inode->i_size-1) >> PAGE_CACHE_SHIFT ; | ||
1902 | unsigned long pos = 0 ; | ||
1903 | unsigned long start = 0 ; | ||
1904 | unsigned long blocksize = p_s_inode->i_sb->s_blocksize ; | ||
1905 | unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1) ; | ||
1906 | struct buffer_head *bh ; | ||
1907 | struct buffer_head *head ; | ||
1908 | struct page * page ; | ||
1909 | int error ; | ||
1910 | |||
1911 | /* we know that we are only called with inode->i_size > 0. | ||
1912 | ** we also know that a file tail can never be as big as a block | ||
1913 | ** If i_size % blocksize == 0, our file is currently block aligned | ||
1914 | ** and it won't need converting or zeroing after a truncate. | ||
1915 | */ | ||
1916 | if ((offset & (blocksize - 1)) == 0) { | ||
1917 | return -ENOENT ; | ||
1918 | } | ||
1919 | page = grab_cache_page(p_s_inode->i_mapping, index) ; | ||
1920 | error = -ENOMEM ; | ||
1921 | if (!page) { | ||
1922 | goto out ; | ||
1923 | } | ||
1924 | /* start within the page of the last block in the file */ | ||
1925 | start = (offset / blocksize) * blocksize ; | ||
1926 | |||
1927 | error = block_prepare_write(page, start, offset, | ||
1928 | reiserfs_get_block_create_0) ; | ||
1929 | if (error) | ||
1930 | goto unlock ; | ||
1931 | |||
1932 | head = page_buffers(page) ; | ||
1933 | bh = head; | ||
1934 | do { | ||
1935 | if (pos >= start) { | ||
1936 | break ; | ||
1937 | } | ||
1938 | bh = bh->b_this_page ; | ||
1939 | pos += blocksize ; | ||
1940 | } while(bh != head) ; | ||
1941 | |||
1942 | if (!buffer_uptodate(bh)) { | ||
1943 | /* note, this should never happen, prepare_write should | ||
1944 | ** be taking care of this for us. If the buffer isn't up to date, | ||
1945 | ** I've screwed up the code to find the buffer, or the code to | ||
1946 | ** call prepare_write | ||
1947 | */ | ||
1948 | reiserfs_warning (p_s_inode->i_sb, | ||
1949 | "clm-6000: error reading block %lu on dev %s", | ||
1950 | bh->b_blocknr, | ||
1951 | reiserfs_bdevname (p_s_inode->i_sb)) ; | ||
1952 | error = -EIO ; | ||
1953 | goto unlock ; | ||
1954 | } | ||
1955 | *bh_result = bh ; | ||
1956 | *page_result = page ; | ||
1957 | |||
1958 | out: | ||
1959 | return error ; | ||
1960 | |||
1961 | unlock: | ||
1962 | unlock_page(page) ; | ||
1963 | page_cache_release(page) ; | ||
1964 | return error ; | ||
1965 | } | ||
1966 | |||
1967 | /* | ||
1968 | ** vfs version of truncate file. Must NOT be called with | ||
1969 | ** a transaction already started. | ||
1970 | ** | ||
1971 | ** some code taken from block_truncate_page | ||
1972 | */ | ||
1973 | int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) { | ||
1974 | struct reiserfs_transaction_handle th ; | ||
1975 | /* we want the offset for the first byte after the end of the file */ | ||
1976 | unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1) ; | ||
1977 | unsigned blocksize = p_s_inode->i_sb->s_blocksize ; | ||
1978 | unsigned length ; | ||
1979 | struct page *page = NULL ; | ||
1980 | int error ; | ||
1981 | struct buffer_head *bh = NULL ; | ||
1982 | |||
1983 | reiserfs_write_lock(p_s_inode->i_sb); | ||
1984 | |||
1985 | if (p_s_inode->i_size > 0) { | ||
1986 | if ((error = grab_tail_page(p_s_inode, &page, &bh))) { | ||
1987 | // -ENOENT means we truncated past the end of the file, | ||
1988 | // and get_block_create_0 could not find a block to read in, | ||
1989 | // which is ok. | ||
1990 | if (error != -ENOENT) | ||
1991 | reiserfs_warning (p_s_inode->i_sb, | ||
1992 | "clm-6001: grab_tail_page failed %d", | ||
1993 | error); | ||
1994 | page = NULL ; | ||
1995 | bh = NULL ; | ||
1996 | } | ||
1997 | } | ||
1998 | |||
1999 | /* so, if page != NULL, we have a buffer head for the offset at | ||
2000 | ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0, | ||
2001 | ** then we have an unformatted node. Otherwise, we have a direct item, | ||
2002 | ** and no zeroing is required on disk. We zero after the truncate, | ||
2003 | ** because the truncate might pack the item anyway | ||
2004 | ** (it will unmap bh if it packs). | ||
2005 | */ | ||
2006 | /* it is enough to reserve space in transaction for 2 balancings: | ||
2007 | one for "save" link adding and another for the first | ||
2008 | cut_from_item. 1 is for update_sd */ | ||
2009 | error = journal_begin (&th, p_s_inode->i_sb, | ||
2010 | JOURNAL_PER_BALANCE_CNT * 2 + 1); | ||
2011 | if (error) | ||
2012 | goto out; | ||
2013 | reiserfs_update_inode_transaction(p_s_inode) ; | ||
2014 | if (update_timestamps) | ||
2015 | /* we are doing real truncate: if the system crashes before the last | ||
2016 | transaction of truncating gets committed - on reboot the file | ||
2017 | either appears truncated properly or not truncated at all */ | ||
2018 | add_save_link (&th, p_s_inode, 1); | ||
2019 | error = reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ; | ||
2020 | if (error) | ||
2021 | goto out; | ||
2022 | error = journal_end (&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1); | ||
2023 | if (error) | ||
2024 | goto out; | ||
2025 | |||
2026 | if (update_timestamps) { | ||
2027 | error = remove_save_link (p_s_inode, 1/* truncate */); | ||
2028 | if (error) | ||
2029 | goto out; | ||
2030 | } | ||
2031 | |||
2032 | if (page) { | ||
2033 | length = offset & (blocksize - 1) ; | ||
2034 | /* if we are not on a block boundary */ | ||
2035 | if (length) { | ||
2036 | char *kaddr; | ||
2037 | |||
2038 | length = blocksize - length ; | ||
2039 | kaddr = kmap_atomic(page, KM_USER0) ; | ||
2040 | memset(kaddr + offset, 0, length) ; | ||
2041 | flush_dcache_page(page) ; | ||
2042 | kunmap_atomic(kaddr, KM_USER0) ; | ||
2043 | if (buffer_mapped(bh) && bh->b_blocknr != 0) { | ||
2044 | mark_buffer_dirty(bh) ; | ||
2045 | } | ||
2046 | } | ||
2047 | unlock_page(page) ; | ||
2048 | page_cache_release(page) ; | ||
2049 | } | ||
2050 | |||
2051 | reiserfs_write_unlock(p_s_inode->i_sb); | ||
2052 | return 0; | ||
2053 | out: | ||
2054 | if (page) { | ||
2055 | unlock_page (page); | ||
2056 | page_cache_release (page); | ||
2057 | } | ||
2058 | reiserfs_write_unlock(p_s_inode->i_sb); | ||
2059 | return error; | ||
2060 | } | ||
2061 | |||
2062 | static int map_block_for_writepage(struct inode *inode, | ||
2063 | struct buffer_head *bh_result, | ||
2064 | unsigned long block) { | ||
2065 | struct reiserfs_transaction_handle th ; | ||
2066 | int fs_gen ; | ||
2067 | struct item_head tmp_ih ; | ||
2068 | struct item_head *ih ; | ||
2069 | struct buffer_head *bh ; | ||
2070 | __u32 *item ; | ||
2071 | struct cpu_key key ; | ||
2072 | INITIALIZE_PATH(path) ; | ||
2073 | int pos_in_item ; | ||
2074 | int jbegin_count = JOURNAL_PER_BALANCE_CNT ; | ||
2075 | loff_t byte_offset = (block << inode->i_sb->s_blocksize_bits) + 1 ; | ||
2076 | int retval ; | ||
2077 | int use_get_block = 0 ; | ||
2078 | int bytes_copied = 0 ; | ||
2079 | int copy_size ; | ||
2080 | int trans_running = 0; | ||
2081 | |||
2082 | /* catch places below that try to log something without starting a trans */ | ||
2083 | th.t_trans_id = 0; | ||
2084 | |||
2085 | if (!buffer_uptodate(bh_result)) { | ||
2086 | return -EIO; | ||
2087 | } | ||
2088 | |||
2089 | kmap(bh_result->b_page) ; | ||
2090 | start_over: | ||
2091 | reiserfs_write_lock(inode->i_sb); | ||
2092 | make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3) ; | ||
2093 | |||
2094 | research: | ||
2095 | retval = search_for_position_by_key(inode->i_sb, &key, &path) ; | ||
2096 | if (retval != POSITION_FOUND) { | ||
2097 | use_get_block = 1; | ||
2098 | goto out ; | ||
2099 | } | ||
2100 | |||
2101 | bh = get_last_bh(&path) ; | ||
2102 | ih = get_ih(&path) ; | ||
2103 | item = get_item(&path) ; | ||
2104 | pos_in_item = path.pos_in_item ; | ||
2105 | |||
2106 | /* we've found an unformatted node */ | ||
2107 | if (indirect_item_found(retval, ih)) { | ||
2108 | if (bytes_copied > 0) { | ||
2109 | reiserfs_warning (inode->i_sb, "clm-6002: bytes_copied %d", | ||
2110 | bytes_copied) ; | ||
2111 | } | ||
2112 | if (!get_block_num(item, pos_in_item)) { | ||
2113 | /* crap, we are writing to a hole */ | ||
2114 | use_get_block = 1; | ||
2115 | goto out ; | ||
2116 | } | ||
2117 | set_block_dev_mapped(bh_result, get_block_num(item,pos_in_item),inode); | ||
2118 | } else if (is_direct_le_ih(ih)) { | ||
2119 | char *p ; | ||
2120 | p = page_address(bh_result->b_page) ; | ||
2121 | p += (byte_offset -1) & (PAGE_CACHE_SIZE - 1) ; | ||
2122 | copy_size = ih_item_len(ih) - pos_in_item; | ||
2123 | |||
2124 | fs_gen = get_generation(inode->i_sb) ; | ||
2125 | copy_item_head(&tmp_ih, ih) ; | ||
2126 | |||
2127 | if (!trans_running) { | ||
2128 | /* vs-3050 is gone, no need to drop the path */ | ||
2129 | retval = journal_begin(&th, inode->i_sb, jbegin_count) ; | ||
2130 | if (retval) | ||
2131 | goto out; | ||
2132 | reiserfs_update_inode_transaction(inode) ; | ||
2133 | trans_running = 1; | ||
2134 | if (fs_changed(fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) { | ||
2135 | reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; | ||
2136 | goto research; | ||
2137 | } | ||
2138 | } | ||
2139 | |||
2140 | reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ; | ||
2141 | |||
2142 | if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { | ||
2143 | reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; | ||
2144 | goto research; | ||
2145 | } | ||
2146 | |||
2147 | memcpy( B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, copy_size) ; | ||
2148 | |||
2149 | journal_mark_dirty(&th, inode->i_sb, bh) ; | ||
2150 | bytes_copied += copy_size ; | ||
2151 | set_block_dev_mapped(bh_result, 0, inode); | ||
2152 | |||
2153 | /* are there still bytes left? */ | ||
2154 | if (bytes_copied < bh_result->b_size && | ||
2155 | (byte_offset + bytes_copied) < inode->i_size) { | ||
2156 | set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + copy_size) ; | ||
2157 | goto research ; | ||
2158 | } | ||
2159 | } else { | ||
2160 | reiserfs_warning (inode->i_sb, | ||
2161 | "clm-6003: bad item inode %lu, device %s", | ||
2162 | inode->i_ino, reiserfs_bdevname (inode->i_sb)) ; | ||
2163 | retval = -EIO ; | ||
2164 | goto out ; | ||
2165 | } | ||
2166 | retval = 0 ; | ||
2167 | |||
2168 | out: | ||
2169 | pathrelse(&path) ; | ||
2170 | if (trans_running) { | ||
2171 | int err = journal_end(&th, inode->i_sb, jbegin_count) ; | ||
2172 | if (err) | ||
2173 | retval = err; | ||
2174 | trans_running = 0; | ||
2175 | } | ||
2176 | reiserfs_write_unlock(inode->i_sb); | ||
2177 | |||
2178 | /* this is where we fill in holes in the file. */ | ||
2179 | if (use_get_block) { | ||
2180 | retval = reiserfs_get_block(inode, block, bh_result, | ||
2181 | GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM | | ||
2182 | GET_BLOCK_NO_DANGLE); | ||
2183 | if (!retval) { | ||
2184 | if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) { | ||
2185 | /* get_block failed to find a mapped unformatted node. */ | ||
2186 | use_get_block = 0 ; | ||
2187 | goto start_over ; | ||
2188 | } | ||
2189 | } | ||
2190 | } | ||
2191 | kunmap(bh_result->b_page) ; | ||
2192 | |||
2193 | if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { | ||
2194 | /* we've copied data from the page into the direct item, so the | ||
2195 | * buffer in the page is now clean, mark it to reflect that. | ||
2196 | */ | ||
2197 | lock_buffer(bh_result); | ||
2198 | clear_buffer_dirty(bh_result); | ||
2199 | unlock_buffer(bh_result); | ||
2200 | } | ||
2201 | return retval ; | ||
2202 | } | ||
2203 | |||
2204 | /* | ||
2205 | * mason@suse.com: updated in 2.5.54 to follow the same general io | ||
2206 | * start/recovery path as __block_write_full_page, along with special | ||
2207 | * code to handle reiserfs tails. | ||
2208 | */ | ||
2209 | static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) { | ||
2210 | struct inode *inode = page->mapping->host ; | ||
2211 | unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ; | ||
2212 | int error = 0; | ||
2213 | unsigned long block ; | ||
2214 | struct buffer_head *head, *bh; | ||
2215 | int partial = 0 ; | ||
2216 | int nr = 0; | ||
2217 | int checked = PageChecked(page); | ||
2218 | struct reiserfs_transaction_handle th; | ||
2219 | struct super_block *s = inode->i_sb; | ||
2220 | int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; | ||
2221 | th.t_trans_id = 0; | ||
2222 | |||
2223 | /* The page dirty bit is cleared before writepage is called, which | ||
2224 | * means we have to tell create_empty_buffers to make dirty buffers | ||
2225 | * The page really should be up to date at this point, so tossing | ||
2226 | * in the BH_Uptodate is just a sanity check. | ||
2227 | */ | ||
2228 | if (!page_has_buffers(page)) { | ||
2229 | create_empty_buffers(page, s->s_blocksize, | ||
2230 | (1 << BH_Dirty) | (1 << BH_Uptodate)); | ||
2231 | } | ||
2232 | head = page_buffers(page) ; | ||
2233 | |||
2234 | /* last page in the file, zero out any contents past the | ||
2235 | ** last byte in the file | ||
2236 | */ | ||
2237 | if (page->index >= end_index) { | ||
2238 | char *kaddr; | ||
2239 | unsigned last_offset; | ||
2240 | |||
2241 | last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ; | ||
2242 | /* no file contents in this page */ | ||
2243 | if (page->index >= end_index + 1 || !last_offset) { | ||
2244 | unlock_page(page); | ||
2245 | return 0; | ||
2246 | } | ||
2247 | kaddr = kmap_atomic(page, KM_USER0); | ||
2248 | memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE-last_offset) ; | ||
2249 | flush_dcache_page(page) ; | ||
2250 | kunmap_atomic(kaddr, KM_USER0) ; | ||
2251 | } | ||
2252 | bh = head ; | ||
2253 | block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits) ; | ||
2254 | /* first map all the buffers, logging any direct items we find */ | ||
2255 | do { | ||
2256 | if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) || | ||
2257 | (buffer_mapped(bh) && bh->b_blocknr == 0))) { | ||
2258 | /* not mapped yet, or it points to a direct item, search | ||
2259 | * the btree for the mapping info, and log any direct | ||
2260 | * items found | ||
2261 | */ | ||
2262 | if ((error = map_block_for_writepage(inode, bh, block))) { | ||
2263 | goto fail ; | ||
2264 | } | ||
2265 | } | ||
2266 | bh = bh->b_this_page; | ||
2267 | block++; | ||
2268 | } while(bh != head) ; | ||
2269 | |||
2270 | /* | ||
2271 | * we start the transaction after map_block_for_writepage, | ||
2272 | * because it can create holes in the file (an unbounded operation). | ||
2273 | * starting it here, we can make a reliable estimate for how many | ||
2274 | * blocks we're going to log | ||
2275 | */ | ||
2276 | if (checked) { | ||
2277 | ClearPageChecked(page); | ||
2278 | reiserfs_write_lock(s); | ||
2279 | error = journal_begin(&th, s, bh_per_page + 1); | ||
2280 | if (error) { | ||
2281 | reiserfs_write_unlock(s); | ||
2282 | goto fail; | ||
2283 | } | ||
2284 | reiserfs_update_inode_transaction(inode); | ||
2285 | } | ||
2286 | /* now go through and lock any dirty buffers on the page */ | ||
2287 | do { | ||
2288 | get_bh(bh); | ||
2289 | if (!buffer_mapped(bh)) | ||
2290 | continue; | ||
2291 | if (buffer_mapped(bh) && bh->b_blocknr == 0) | ||
2292 | continue; | ||
2293 | |||
2294 | if (checked) { | ||
2295 | reiserfs_prepare_for_journal(s, bh, 1); | ||
2296 | journal_mark_dirty(&th, s, bh); | ||
2297 | continue; | ||
2298 | } | ||
2299 | /* from this point on, we know the buffer is mapped to a | ||
2300 | * real block and not a direct item | ||
2301 | */ | ||
2302 | if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { | ||
2303 | lock_buffer(bh); | ||
2304 | } else { | ||
2305 | if (test_set_buffer_locked(bh)) { | ||
2306 | redirty_page_for_writepage(wbc, page); | ||
2307 | continue; | ||
2308 | } | ||
2309 | } | ||
2310 | if (test_clear_buffer_dirty(bh)) { | ||
2311 | mark_buffer_async_write(bh); | ||
2312 | } else { | ||
2313 | unlock_buffer(bh); | ||
2314 | } | ||
2315 | } while((bh = bh->b_this_page) != head); | ||
2316 | |||
2317 | if (checked) { | ||
2318 | error = journal_end(&th, s, bh_per_page + 1); | ||
2319 | reiserfs_write_unlock(s); | ||
2320 | if (error) | ||
2321 | goto fail; | ||
2322 | } | ||
2323 | BUG_ON(PageWriteback(page)); | ||
2324 | set_page_writeback(page); | ||
2325 | unlock_page(page); | ||
2326 | |||
2327 | /* | ||
2328 | * since any buffer might be the only dirty buffer on the page, | ||
2329 | * the first submit_bh can bring the page out of writeback. | ||
2330 | * be careful with the buffers. | ||
2331 | */ | ||
2332 | do { | ||
2333 | struct buffer_head *next = bh->b_this_page; | ||
2334 | if (buffer_async_write(bh)) { | ||
2335 | submit_bh(WRITE, bh); | ||
2336 | nr++; | ||
2337 | } | ||
2338 | put_bh(bh); | ||
2339 | bh = next; | ||
2340 | } while(bh != head); | ||
2341 | |||
2342 | error = 0; | ||
2343 | done: | ||
2344 | if (nr == 0) { | ||
2345 | /* | ||
2346 | * if this page only had a direct item, it is very possible for | ||
2347 | * no io to be required without there being an error. Or, | ||
2348 | * someone else could have locked them and sent them down the | ||
2349 | * pipe without locking the page | ||
2350 | */ | ||
2351 | bh = head ; | ||
2352 | do { | ||
2353 | if (!buffer_uptodate(bh)) { | ||
2354 | partial = 1; | ||
2355 | break; | ||
2356 | } | ||
2357 | bh = bh->b_this_page; | ||
2358 | } while(bh != head); | ||
2359 | if (!partial) | ||
2360 | SetPageUptodate(page); | ||
2361 | end_page_writeback(page); | ||
2362 | } | ||
2363 | return error; | ||
2364 | |||
2365 | fail: | ||
2366 | /* catches various errors, we need to make sure any valid dirty blocks | ||
2367 | * get to the media. The page is currently locked and not marked for | ||
2368 | * writeback | ||
2369 | */ | ||
2370 | ClearPageUptodate(page); | ||
2371 | bh = head; | ||
2372 | do { | ||
2373 | get_bh(bh); | ||
2374 | if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) { | ||
2375 | lock_buffer(bh); | ||
2376 | mark_buffer_async_write(bh); | ||
2377 | } else { | ||
2378 | /* | ||
2379 | * clear any dirty bits that might have come from getting | ||
2380 | * attached to a dirty page | ||
2381 | */ | ||
2382 | clear_buffer_dirty(bh); | ||
2383 | } | ||
2384 | bh = bh->b_this_page; | ||
2385 | } while(bh != head); | ||
2386 | SetPageError(page); | ||
2387 | BUG_ON(PageWriteback(page)); | ||
2388 | set_page_writeback(page); | ||
2389 | unlock_page(page); | ||
2390 | do { | ||
2391 | struct buffer_head *next = bh->b_this_page; | ||
2392 | if (buffer_async_write(bh)) { | ||
2393 | clear_buffer_dirty(bh); | ||
2394 | submit_bh(WRITE, bh); | ||
2395 | nr++; | ||
2396 | } | ||
2397 | put_bh(bh); | ||
2398 | bh = next; | ||
2399 | } while(bh != head); | ||
2400 | goto done; | ||
2401 | } | ||
2402 | |||
2403 | |||
2404 | static int reiserfs_readpage (struct file *f, struct page * page) | ||
2405 | { | ||
2406 | return block_read_full_page (page, reiserfs_get_block); | ||
2407 | } | ||
2408 | |||
2409 | |||
2410 | static int reiserfs_writepage (struct page * page, struct writeback_control *wbc) | ||
2411 | { | ||
2412 | struct inode *inode = page->mapping->host ; | ||
2413 | reiserfs_wait_on_write_block(inode->i_sb) ; | ||
2414 | return reiserfs_write_full_page(page, wbc) ; | ||
2415 | } | ||
2416 | |||
2417 | static int reiserfs_prepare_write(struct file *f, struct page *page, | ||
2418 | unsigned from, unsigned to) { | ||
2419 | struct inode *inode = page->mapping->host ; | ||
2420 | int ret; | ||
2421 | int old_ref = 0; | ||
2422 | |||
2423 | reiserfs_wait_on_write_block(inode->i_sb) ; | ||
2424 | fix_tail_page_for_writing(page) ; | ||
2425 | if (reiserfs_transaction_running(inode->i_sb)) { | ||
2426 | struct reiserfs_transaction_handle *th; | ||
2427 | th = (struct reiserfs_transaction_handle *)current->journal_info; | ||
2428 | BUG_ON (!th->t_refcount); | ||
2429 | BUG_ON (!th->t_trans_id); | ||
2430 | old_ref = th->t_refcount; | ||
2431 | th->t_refcount++; | ||
2432 | } | ||
2433 | |||
2434 | ret = block_prepare_write(page, from, to, reiserfs_get_block) ; | ||
2435 | if (ret && reiserfs_transaction_running(inode->i_sb)) { | ||
2436 | struct reiserfs_transaction_handle *th = current->journal_info; | ||
2437 | /* this gets a little ugly. If reiserfs_get_block returned an | ||
2438 | * error and left a transacstion running, we've got to close it, | ||
2439 | * and we've got to free handle if it was a persistent transaction. | ||
2440 | * | ||
2441 | * But, if we had nested into an existing transaction, we need | ||
2442 | * to just drop the ref count on the handle. | ||
2443 | * | ||
2444 | * If old_ref == 0, the transaction is from reiserfs_get_block, | ||
2445 | * and it was a persistent trans. Otherwise, it was nested above. | ||
2446 | */ | ||
2447 | if (th->t_refcount > old_ref) { | ||
2448 | if (old_ref) | ||
2449 | th->t_refcount--; | ||
2450 | else { | ||
2451 | int err; | ||
2452 | reiserfs_write_lock(inode->i_sb); | ||
2453 | err = reiserfs_end_persistent_transaction(th); | ||
2454 | reiserfs_write_unlock(inode->i_sb); | ||
2455 | if (err) | ||
2456 | ret = err; | ||
2457 | } | ||
2458 | } | ||
2459 | } | ||
2460 | return ret; | ||
2461 | |||
2462 | } | ||
2463 | |||
2464 | |||
2465 | static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) { | ||
2466 | return generic_block_bmap(as, block, reiserfs_bmap) ; | ||
2467 | } | ||
2468 | |||
2469 | static int reiserfs_commit_write(struct file *f, struct page *page, | ||
2470 | unsigned from, unsigned to) { | ||
2471 | struct inode *inode = page->mapping->host ; | ||
2472 | loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; | ||
2473 | int ret = 0; | ||
2474 | int update_sd = 0; | ||
2475 | struct reiserfs_transaction_handle *th = NULL; | ||
2476 | |||
2477 | reiserfs_wait_on_write_block(inode->i_sb) ; | ||
2478 | if (reiserfs_transaction_running(inode->i_sb)) { | ||
2479 | th = current->journal_info; | ||
2480 | } | ||
2481 | reiserfs_commit_page(inode, page, from, to); | ||
2482 | |||
2483 | /* generic_commit_write does this for us, but does not update the | ||
2484 | ** transaction tracking stuff when the size changes. So, we have | ||
2485 | ** to do the i_size updates here. | ||
2486 | */ | ||
2487 | if (pos > inode->i_size) { | ||
2488 | struct reiserfs_transaction_handle myth ; | ||
2489 | reiserfs_write_lock(inode->i_sb); | ||
2490 | /* If the file have grown beyond the border where it | ||
2491 | can have a tail, unmark it as needing a tail | ||
2492 | packing */ | ||
2493 | if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) || | ||
2494 | (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) ) | ||
2495 | REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ; | ||
2496 | |||
2497 | ret = journal_begin(&myth, inode->i_sb, 1) ; | ||
2498 | if (ret) { | ||
2499 | reiserfs_write_unlock(inode->i_sb); | ||
2500 | goto journal_error; | ||
2501 | } | ||
2502 | reiserfs_update_inode_transaction(inode) ; | ||
2503 | inode->i_size = pos ; | ||
2504 | reiserfs_update_sd(&myth, inode) ; | ||
2505 | update_sd = 1; | ||
2506 | ret = journal_end(&myth, inode->i_sb, 1) ; | ||
2507 | reiserfs_write_unlock(inode->i_sb); | ||
2508 | if (ret) | ||
2509 | goto journal_error; | ||
2510 | } | ||
2511 | if (th) { | ||
2512 | reiserfs_write_lock(inode->i_sb); | ||
2513 | if (!update_sd) | ||
2514 | reiserfs_update_sd(th, inode) ; | ||
2515 | ret = reiserfs_end_persistent_transaction(th); | ||
2516 | reiserfs_write_unlock(inode->i_sb); | ||
2517 | if (ret) | ||
2518 | goto out; | ||
2519 | } | ||
2520 | |||
2521 | /* we test for O_SYNC here so we can commit the transaction | ||
2522 | ** for any packed tails the file might have had | ||
2523 | */ | ||
2524 | if (f && (f->f_flags & O_SYNC)) { | ||
2525 | reiserfs_write_lock(inode->i_sb); | ||
2526 | ret = reiserfs_commit_for_inode(inode) ; | ||
2527 | reiserfs_write_unlock(inode->i_sb); | ||
2528 | } | ||
2529 | out: | ||
2530 | return ret ; | ||
2531 | |||
2532 | journal_error: | ||
2533 | if (th) { | ||
2534 | reiserfs_write_lock(inode->i_sb); | ||
2535 | if (!update_sd) | ||
2536 | reiserfs_update_sd(th, inode) ; | ||
2537 | ret = reiserfs_end_persistent_transaction(th); | ||
2538 | reiserfs_write_unlock(inode->i_sb); | ||
2539 | } | ||
2540 | |||
2541 | return ret; | ||
2542 | } | ||
2543 | |||
2544 | void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode ) | ||
2545 | { | ||
2546 | if( reiserfs_attrs( inode -> i_sb ) ) { | ||
2547 | if( sd_attrs & REISERFS_SYNC_FL ) | ||
2548 | inode -> i_flags |= S_SYNC; | ||
2549 | else | ||
2550 | inode -> i_flags &= ~S_SYNC; | ||
2551 | if( sd_attrs & REISERFS_IMMUTABLE_FL ) | ||
2552 | inode -> i_flags |= S_IMMUTABLE; | ||
2553 | else | ||
2554 | inode -> i_flags &= ~S_IMMUTABLE; | ||
2555 | if( sd_attrs & REISERFS_APPEND_FL ) | ||
2556 | inode -> i_flags |= S_APPEND; | ||
2557 | else | ||
2558 | inode -> i_flags &= ~S_APPEND; | ||
2559 | if( sd_attrs & REISERFS_NOATIME_FL ) | ||
2560 | inode -> i_flags |= S_NOATIME; | ||
2561 | else | ||
2562 | inode -> i_flags &= ~S_NOATIME; | ||
2563 | if( sd_attrs & REISERFS_NOTAIL_FL ) | ||
2564 | REISERFS_I(inode)->i_flags |= i_nopack_mask; | ||
2565 | else | ||
2566 | REISERFS_I(inode)->i_flags &= ~i_nopack_mask; | ||
2567 | } | ||
2568 | } | ||
2569 | |||
2570 | void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs ) | ||
2571 | { | ||
2572 | if( reiserfs_attrs( inode -> i_sb ) ) { | ||
2573 | if( inode -> i_flags & S_IMMUTABLE ) | ||
2574 | *sd_attrs |= REISERFS_IMMUTABLE_FL; | ||
2575 | else | ||
2576 | *sd_attrs &= ~REISERFS_IMMUTABLE_FL; | ||
2577 | if( inode -> i_flags & S_SYNC ) | ||
2578 | *sd_attrs |= REISERFS_SYNC_FL; | ||
2579 | else | ||
2580 | *sd_attrs &= ~REISERFS_SYNC_FL; | ||
2581 | if( inode -> i_flags & S_NOATIME ) | ||
2582 | *sd_attrs |= REISERFS_NOATIME_FL; | ||
2583 | else | ||
2584 | *sd_attrs &= ~REISERFS_NOATIME_FL; | ||
2585 | if( REISERFS_I(inode)->i_flags & i_nopack_mask ) | ||
2586 | *sd_attrs |= REISERFS_NOTAIL_FL; | ||
2587 | else | ||
2588 | *sd_attrs &= ~REISERFS_NOTAIL_FL; | ||
2589 | } | ||
2590 | } | ||
2591 | |||
2592 | /* decide if this buffer needs to stay around for data logging or ordered | ||
2593 | ** write purposes | ||
2594 | */ | ||
2595 | static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) | ||
2596 | { | ||
2597 | int ret = 1 ; | ||
2598 | struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ; | ||
2599 | |||
2600 | spin_lock(&j->j_dirty_buffers_lock) ; | ||
2601 | if (!buffer_mapped(bh)) { | ||
2602 | goto free_jh; | ||
2603 | } | ||
2604 | /* the page is locked, and the only places that log a data buffer | ||
2605 | * also lock the page. | ||
2606 | */ | ||
2607 | if (reiserfs_file_data_log(inode)) { | ||
2608 | /* | ||
2609 | * very conservative, leave the buffer pinned if | ||
2610 | * anyone might need it. | ||
2611 | */ | ||
2612 | if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { | ||
2613 | ret = 0 ; | ||
2614 | } | ||
2615 | } else | ||
2616 | if (buffer_dirty(bh) || buffer_locked(bh)) { | ||
2617 | struct reiserfs_journal_list *jl; | ||
2618 | struct reiserfs_jh *jh = bh->b_private; | ||
2619 | |||
2620 | /* why is this safe? | ||
2621 | * reiserfs_setattr updates i_size in the on disk | ||
2622 | * stat data before allowing vmtruncate to be called. | ||
2623 | * | ||
2624 | * If buffer was put onto the ordered list for this | ||
2625 | * transaction, we know for sure either this transaction | ||
2626 | * or an older one already has updated i_size on disk, | ||
2627 | * and this ordered data won't be referenced in the file | ||
2628 | * if we crash. | ||
2629 | * | ||
2630 | * if the buffer was put onto the ordered list for an older | ||
2631 | * transaction, we need to leave it around | ||
2632 | */ | ||
2633 | if (jh && (jl = jh->jl) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl) | ||
2634 | ret = 0; | ||
2635 | } | ||
2636 | free_jh: | ||
2637 | if (ret && bh->b_private) { | ||
2638 | reiserfs_free_jh(bh); | ||
2639 | } | ||
2640 | spin_unlock(&j->j_dirty_buffers_lock) ; | ||
2641 | return ret ; | ||
2642 | } | ||
2643 | |||
2644 | /* clm -- taken from fs/buffer.c:block_invalidate_page */ | ||
2645 | static int reiserfs_invalidatepage(struct page *page, unsigned long offset) | ||
2646 | { | ||
2647 | struct buffer_head *head, *bh, *next; | ||
2648 | struct inode *inode = page->mapping->host; | ||
2649 | unsigned int curr_off = 0; | ||
2650 | int ret = 1; | ||
2651 | |||
2652 | BUG_ON(!PageLocked(page)); | ||
2653 | |||
2654 | if (offset == 0) | ||
2655 | ClearPageChecked(page); | ||
2656 | |||
2657 | if (!page_has_buffers(page)) | ||
2658 | goto out; | ||
2659 | |||
2660 | head = page_buffers(page); | ||
2661 | bh = head; | ||
2662 | do { | ||
2663 | unsigned int next_off = curr_off + bh->b_size; | ||
2664 | next = bh->b_this_page; | ||
2665 | |||
2666 | /* | ||
2667 | * is this block fully invalidated? | ||
2668 | */ | ||
2669 | if (offset <= curr_off) { | ||
2670 | if (invalidatepage_can_drop(inode, bh)) | ||
2671 | reiserfs_unmap_buffer(bh); | ||
2672 | else | ||
2673 | ret = 0; | ||
2674 | } | ||
2675 | curr_off = next_off; | ||
2676 | bh = next; | ||
2677 | } while (bh != head); | ||
2678 | |||
2679 | /* | ||
2680 | * We release buffers only if the entire page is being invalidated. | ||
2681 | * The get_block cached value has been unconditionally invalidated, | ||
2682 | * so real IO is not possible anymore. | ||
2683 | */ | ||
2684 | if (!offset && ret) | ||
2685 | ret = try_to_release_page(page, 0); | ||
2686 | out: | ||
2687 | return ret; | ||
2688 | } | ||
2689 | |||
2690 | static int reiserfs_set_page_dirty(struct page *page) { | ||
2691 | struct inode *inode = page->mapping->host; | ||
2692 | if (reiserfs_file_data_log(inode)) { | ||
2693 | SetPageChecked(page); | ||
2694 | return __set_page_dirty_nobuffers(page); | ||
2695 | } | ||
2696 | return __set_page_dirty_buffers(page); | ||
2697 | } | ||
2698 | |||
2699 | /* | ||
2700 | * Returns 1 if the page's buffers were dropped. The page is locked. | ||
2701 | * | ||
2702 | * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads | ||
2703 | * in the buffers at page_buffers(page). | ||
2704 | * | ||
2705 | * even in -o notail mode, we can't be sure an old mount without -o notail | ||
2706 | * didn't create files with tails. | ||
2707 | */ | ||
2708 | static int reiserfs_releasepage(struct page *page, int unused_gfp_flags) | ||
2709 | { | ||
2710 | struct inode *inode = page->mapping->host ; | ||
2711 | struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ; | ||
2712 | struct buffer_head *head ; | ||
2713 | struct buffer_head *bh ; | ||
2714 | int ret = 1 ; | ||
2715 | |||
2716 | WARN_ON(PageChecked(page)); | ||
2717 | spin_lock(&j->j_dirty_buffers_lock) ; | ||
2718 | head = page_buffers(page) ; | ||
2719 | bh = head ; | ||
2720 | do { | ||
2721 | if (bh->b_private) { | ||
2722 | if (!buffer_dirty(bh) && !buffer_locked(bh)) { | ||
2723 | reiserfs_free_jh(bh); | ||
2724 | } else { | ||
2725 | ret = 0 ; | ||
2726 | break ; | ||
2727 | } | ||
2728 | } | ||
2729 | bh = bh->b_this_page ; | ||
2730 | } while (bh != head) ; | ||
2731 | if (ret) | ||
2732 | ret = try_to_free_buffers(page) ; | ||
2733 | spin_unlock(&j->j_dirty_buffers_lock) ; | ||
2734 | return ret ; | ||
2735 | } | ||
2736 | |||
2737 | /* We thank Mingming Cao for helping us understand in great detail what | ||
2738 | to do in this section of the code. */ | ||
2739 | static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, | ||
2740 | const struct iovec *iov, loff_t offset, unsigned long nr_segs) | ||
2741 | { | ||
2742 | struct file *file = iocb->ki_filp; | ||
2743 | struct inode *inode = file->f_mapping->host; | ||
2744 | |||
2745 | return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, | ||
2746 | offset, nr_segs, reiserfs_get_blocks_direct_io, NULL); | ||
2747 | } | ||
2748 | |||
2749 | int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { | ||
2750 | struct inode *inode = dentry->d_inode ; | ||
2751 | int error ; | ||
2752 | unsigned int ia_valid = attr->ia_valid; | ||
2753 | reiserfs_write_lock(inode->i_sb); | ||
2754 | if (attr->ia_valid & ATTR_SIZE) { | ||
2755 | /* version 2 items will be caught by the s_maxbytes check | ||
2756 | ** done for us in vmtruncate | ||
2757 | */ | ||
2758 | if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && | ||
2759 | attr->ia_size > MAX_NON_LFS) { | ||
2760 | error = -EFBIG ; | ||
2761 | goto out; | ||
2762 | } | ||
2763 | /* fill in hole pointers in the expanding truncate case. */ | ||
2764 | if (attr->ia_size > inode->i_size) { | ||
2765 | error = generic_cont_expand(inode, attr->ia_size) ; | ||
2766 | if (REISERFS_I(inode)->i_prealloc_count > 0) { | ||
2767 | int err; | ||
2768 | struct reiserfs_transaction_handle th ; | ||
2769 | /* we're changing at most 2 bitmaps, inode + super */ | ||
2770 | err = journal_begin(&th, inode->i_sb, 4) ; | ||
2771 | if (!err) { | ||
2772 | reiserfs_discard_prealloc (&th, inode); | ||
2773 | err = journal_end(&th, inode->i_sb, 4) ; | ||
2774 | } | ||
2775 | if (err) | ||
2776 | error = err; | ||
2777 | } | ||
2778 | if (error) | ||
2779 | goto out; | ||
2780 | } | ||
2781 | } | ||
2782 | |||
2783 | if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) || | ||
2784 | ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) && | ||
2785 | (get_inode_sd_version (inode) == STAT_DATA_V1)) { | ||
2786 | /* stat data of format v3.5 has 16 bit uid and gid */ | ||
2787 | error = -EINVAL; | ||
2788 | goto out; | ||
2789 | } | ||
2790 | |||
2791 | error = inode_change_ok(inode, attr) ; | ||
2792 | if (!error) { | ||
2793 | if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || | ||
2794 | (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { | ||
2795 | error = reiserfs_chown_xattrs (inode, attr); | ||
2796 | |||
2797 | if (!error) { | ||
2798 | struct reiserfs_transaction_handle th; | ||
2799 | |||
2800 | /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */ | ||
2801 | journal_begin(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2); | ||
2802 | error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; | ||
2803 | if (error) { | ||
2804 | journal_end(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2); | ||
2805 | goto out; | ||
2806 | } | ||
2807 | /* Update corresponding info in inode so that everything is in | ||
2808 | * one transaction */ | ||
2809 | if (attr->ia_valid & ATTR_UID) | ||
2810 | inode->i_uid = attr->ia_uid; | ||
2811 | if (attr->ia_valid & ATTR_GID) | ||
2812 | inode->i_gid = attr->ia_gid; | ||
2813 | mark_inode_dirty(inode); | ||
2814 | journal_end(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2); | ||
2815 | } | ||
2816 | } | ||
2817 | if (!error) | ||
2818 | error = inode_setattr(inode, attr) ; | ||
2819 | } | ||
2820 | |||
2821 | |||
2822 | if (!error && reiserfs_posixacl (inode->i_sb)) { | ||
2823 | if (attr->ia_valid & ATTR_MODE) | ||
2824 | error = reiserfs_acl_chmod (inode); | ||
2825 | } | ||
2826 | |||
2827 | out: | ||
2828 | reiserfs_write_unlock(inode->i_sb); | ||
2829 | return error ; | ||
2830 | } | ||
2831 | |||
2832 | |||
2833 | |||
2834 | struct address_space_operations reiserfs_address_space_operations = { | ||
2835 | .writepage = reiserfs_writepage, | ||
2836 | .readpage = reiserfs_readpage, | ||
2837 | .readpages = reiserfs_readpages, | ||
2838 | .releasepage = reiserfs_releasepage, | ||
2839 | .invalidatepage = reiserfs_invalidatepage, | ||
2840 | .sync_page = block_sync_page, | ||
2841 | .prepare_write = reiserfs_prepare_write, | ||
2842 | .commit_write = reiserfs_commit_write, | ||
2843 | .bmap = reiserfs_aop_bmap, | ||
2844 | .direct_IO = reiserfs_direct_IO, | ||
2845 | .set_page_dirty = reiserfs_set_page_dirty, | ||
2846 | } ; | ||
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c new file mode 100644 index 000000000000..94dc42475a04 --- /dev/null +++ b/fs/reiserfs/ioctl.c | |||
@@ -0,0 +1,151 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | */ | ||
4 | |||
5 | #include <linux/fs.h> | ||
6 | #include <linux/reiserfs_fs.h> | ||
7 | #include <linux/time.h> | ||
8 | #include <asm/uaccess.h> | ||
9 | #include <linux/pagemap.h> | ||
10 | #include <linux/smp_lock.h> | ||
11 | |||
12 | static int reiserfs_unpack (struct inode * inode, struct file * filp); | ||
13 | |||
14 | /* | ||
15 | ** reiserfs_ioctl - handler for ioctl for inode | ||
16 | ** supported commands: | ||
17 | ** 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect | ||
18 | ** and prevent packing file (argument arg has to be non-zero) | ||
19 | ** 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION | ||
20 | ** 3) That's all for a while ... | ||
21 | */ | ||
22 | int reiserfs_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, | ||
23 | unsigned long arg) | ||
24 | { | ||
25 | unsigned int flags; | ||
26 | |||
27 | switch (cmd) { | ||
28 | case REISERFS_IOC_UNPACK: | ||
29 | if( S_ISREG( inode -> i_mode ) ) { | ||
30 | if (arg) | ||
31 | return reiserfs_unpack (inode, filp); | ||
32 | else | ||
33 | return 0; | ||
34 | } else | ||
35 | return -ENOTTY; | ||
36 | /* following two cases are taken from fs/ext2/ioctl.c by Remy | ||
37 | Card (card@masi.ibp.fr) */ | ||
38 | case REISERFS_IOC_GETFLAGS: | ||
39 | flags = REISERFS_I(inode) -> i_attrs; | ||
40 | i_attrs_to_sd_attrs( inode, ( __u16 * ) &flags ); | ||
41 | return put_user(flags, (int __user *) arg); | ||
42 | case REISERFS_IOC_SETFLAGS: { | ||
43 | if (IS_RDONLY(inode)) | ||
44 | return -EROFS; | ||
45 | |||
46 | if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) | ||
47 | return -EPERM; | ||
48 | |||
49 | if (get_user(flags, (int __user *) arg)) | ||
50 | return -EFAULT; | ||
51 | |||
52 | if ( ( ( flags ^ REISERFS_I(inode) -> i_attrs) & ( REISERFS_IMMUTABLE_FL | REISERFS_APPEND_FL)) && | ||
53 | !capable( CAP_LINUX_IMMUTABLE ) ) | ||
54 | return -EPERM; | ||
55 | |||
56 | if( ( flags & REISERFS_NOTAIL_FL ) && | ||
57 | S_ISREG( inode -> i_mode ) ) { | ||
58 | int result; | ||
59 | |||
60 | result = reiserfs_unpack( inode, filp ); | ||
61 | if( result ) | ||
62 | return result; | ||
63 | } | ||
64 | sd_attrs_to_i_attrs( flags, inode ); | ||
65 | REISERFS_I(inode) -> i_attrs = flags; | ||
66 | inode->i_ctime = CURRENT_TIME_SEC; | ||
67 | mark_inode_dirty(inode); | ||
68 | return 0; | ||
69 | } | ||
70 | case REISERFS_IOC_GETVERSION: | ||
71 | return put_user(inode->i_generation, (int __user *) arg); | ||
72 | case REISERFS_IOC_SETVERSION: | ||
73 | if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) | ||
74 | return -EPERM; | ||
75 | if (IS_RDONLY(inode)) | ||
76 | return -EROFS; | ||
77 | if (get_user(inode->i_generation, (int __user *) arg)) | ||
78 | return -EFAULT; | ||
79 | inode->i_ctime = CURRENT_TIME_SEC; | ||
80 | mark_inode_dirty(inode); | ||
81 | return 0; | ||
82 | default: | ||
83 | return -ENOTTY; | ||
84 | } | ||
85 | } | ||
86 | |||
87 | /* | ||
88 | ** reiserfs_unpack | ||
89 | ** Function try to convert tail from direct item into indirect. | ||
90 | ** It set up nopack attribute in the REISERFS_I(inode)->nopack | ||
91 | */ | ||
92 | static int reiserfs_unpack (struct inode * inode, struct file * filp) | ||
93 | { | ||
94 | int retval = 0; | ||
95 | int index ; | ||
96 | struct page *page ; | ||
97 | struct address_space *mapping ; | ||
98 | unsigned long write_from ; | ||
99 | unsigned long blocksize = inode->i_sb->s_blocksize ; | ||
100 | |||
101 | if (inode->i_size == 0) { | ||
102 | REISERFS_I(inode)->i_flags |= i_nopack_mask; | ||
103 | return 0 ; | ||
104 | } | ||
105 | /* ioctl already done */ | ||
106 | if (REISERFS_I(inode)->i_flags & i_nopack_mask) { | ||
107 | return 0 ; | ||
108 | } | ||
109 | reiserfs_write_lock(inode->i_sb); | ||
110 | |||
111 | /* we need to make sure nobody is changing the file size beneath | ||
112 | ** us | ||
113 | */ | ||
114 | down(&inode->i_sem) ; | ||
115 | |||
116 | write_from = inode->i_size & (blocksize - 1) ; | ||
117 | /* if we are on a block boundary, we are already unpacked. */ | ||
118 | if ( write_from == 0) { | ||
119 | REISERFS_I(inode)->i_flags |= i_nopack_mask; | ||
120 | goto out ; | ||
121 | } | ||
122 | |||
123 | /* we unpack by finding the page with the tail, and calling | ||
124 | ** reiserfs_prepare_write on that page. This will force a | ||
125 | ** reiserfs_get_block to unpack the tail for us. | ||
126 | */ | ||
127 | index = inode->i_size >> PAGE_CACHE_SHIFT ; | ||
128 | mapping = inode->i_mapping ; | ||
129 | page = grab_cache_page(mapping, index) ; | ||
130 | retval = -ENOMEM; | ||
131 | if (!page) { | ||
132 | goto out ; | ||
133 | } | ||
134 | retval = mapping->a_ops->prepare_write(NULL, page, write_from, write_from) ; | ||
135 | if (retval) | ||
136 | goto out_unlock ; | ||
137 | |||
138 | /* conversion can change page contents, must flush */ | ||
139 | flush_dcache_page(page) ; | ||
140 | retval = mapping->a_ops->commit_write(NULL, page, write_from, write_from) ; | ||
141 | REISERFS_I(inode)->i_flags |= i_nopack_mask; | ||
142 | |||
143 | out_unlock: | ||
144 | unlock_page(page) ; | ||
145 | page_cache_release(page) ; | ||
146 | |||
147 | out: | ||
148 | up(&inode->i_sem) ; | ||
149 | reiserfs_write_unlock(inode->i_sb); | ||
150 | return retval; | ||
151 | } | ||
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c new file mode 100644 index 000000000000..9cf7c13b120d --- /dev/null +++ b/fs/reiserfs/item_ops.c | |||
@@ -0,0 +1,788 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | */ | ||
4 | |||
5 | #include <linux/time.h> | ||
6 | #include <linux/reiserfs_fs.h> | ||
7 | |||
8 | // this contains item handlers for old item types: sd, direct, | ||
9 | // indirect, directory | ||
10 | |||
11 | /* and where are the comments? how about saying where we can find an | ||
12 | explanation of each item handler method? -Hans */ | ||
13 | |||
14 | ////////////////////////////////////////////////////////////////////////////// | ||
15 | // stat data functions | ||
16 | // | ||
/* Always reports 0 bytes for a stat data item; both arguments are ignored. */
static int sd_bytes_number (struct item_head * ih, int block_size)
{
    const int bytes = 0;

    return bytes;
}
21 | |||
22 | static void sd_decrement_key (struct cpu_key * key) | ||
23 | { | ||
24 | key->on_disk_key.k_objectid --; | ||
25 | set_cpu_key_k_type (key, TYPE_ANY); | ||
26 | set_cpu_key_k_offset(key, (loff_t)(-1)); | ||
27 | } | ||
28 | |||
/* A stat data item is never reported as left-mergeable: always returns 0. */
static int sd_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
{
    const int mergeable = 0;

    return mergeable;
}
33 | |||
34 | |||
35 | |||
/*
 * Format a time value as a decimal number of seconds.
 *
 * Returns a pointer to a static buffer, so the result is overwritten by
 * the next call and the function is not reentrant.
 */
static char * print_time (time_t t)
{
    static char timebuf[256];

    /* bounded write; the cast keeps "%ld" correct however time_t is defined */
    snprintf (timebuf, sizeof (timebuf), "%ld", (long) t);
    return timebuf;
}
43 | |||
44 | |||
/*
 * Print one line describing a stat data item.  The item body is read as
 * struct stat_data_v1 when stat_data_v1(ih) is true, otherwise as
 * struct stat_data.
 */
static void sd_print_item (struct item_head * ih, char * item)
{
    printk ("\tmode | size | nlinks | first direct | mtime\n");

    if (!stat_data_v1 (ih)) {
        /* newer layout: 64-bit size, fourth column shows sd_v2_rdev */
        struct stat_data * sd_new = (struct stat_data *)item;

        printk ("\t0%-6o | %6Lu | %2u | %d | %s\n", sd_v2_mode(sd_new),
                (unsigned long long)sd_v2_size(sd_new), sd_v2_nlink(sd_new),
                sd_v2_rdev(sd_new), print_time(sd_v2_mtime(sd_new)));
    } else {
        struct stat_data_v1 * sd_old = (struct stat_data_v1 *)item;

        printk ("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd_old),
                sd_v1_size(sd_old), sd_v1_nlink(sd_old), sd_v1_first_direct_byte(sd_old),
                print_time( sd_v1_mtime(sd_old) ) );
    }
}
62 | |||
/* Consistency check hook for stat data items; currently performs no checks. */
static void sd_check_item (struct item_head * ih, char * item)
{
    /* FIXME: type something here! */
    return;
}
67 | |||
68 | |||
69 | static int sd_create_vi (struct virtual_node * vn, | ||
70 | struct virtual_item * vi, | ||
71 | int is_affected, | ||
72 | int insert_size) | ||
73 | { | ||
74 | vi->vi_index = TYPE_STAT_DATA; | ||
75 | //vi->vi_type |= VI_TYPE_STAT_DATA;// not needed? | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | |||
/*
 * Always returns -1 for a stat data item.  Calling it with a non-zero
 * start_skip or end_skip triggers BUG().
 */
static int sd_check_left (struct virtual_item * vi, int free,
                          int start_skip, int end_skip)
{
    if (start_skip != 0 || end_skip != 0) {
        BUG ();
    }

    return -1;
}
87 | |||
88 | |||
/* Always returns -1 for a stat data item; the arguments are ignored. */
static int sd_check_right (struct virtual_item * vi, int free)
{
    const int nothing_fits = -1;

    return nothing_fits;
}
93 | |||
/* Returns 0; asking for the size of a non-zero count triggers BUG(). */
static int sd_part_size (struct virtual_item * vi, int first, int count)
{
    if (count != 0) {
        BUG ();
    }

    return 0;
}
100 | |||
101 | static int sd_unit_num (struct virtual_item * vi) | ||
102 | { | ||
103 | return vi->vi_item_len - IH_SIZE; | ||
104 | } | ||
105 | |||
106 | |||
107 | static void sd_print_vi (struct virtual_item * vi) | ||
108 | { | ||
109 | reiserfs_warning (NULL, "STATDATA, index %d, type 0x%x, %h", | ||
110 | vi->vi_index, vi->vi_type, vi->vi_ih); | ||
111 | } | ||
112 | |||
113 | static struct item_operations stat_data_ops = { | ||
114 | .bytes_number = sd_bytes_number, | ||
115 | .decrement_key = sd_decrement_key, | ||
116 | .is_left_mergeable = sd_is_left_mergeable, | ||
117 | .print_item = sd_print_item, | ||
118 | .check_item = sd_check_item, | ||
119 | |||
120 | .create_vi = sd_create_vi, | ||
121 | .check_left = sd_check_left, | ||
122 | .check_right = sd_check_right, | ||
123 | .part_size = sd_part_size, | ||
124 | .unit_num = sd_unit_num, | ||
125 | .print_vi = sd_print_vi | ||
126 | }; | ||
127 | |||
128 | |||
129 | |||
130 | ////////////////////////////////////////////////////////////////////////////// | ||
131 | // direct item functions | ||
132 | // | ||
/* Byte count of a direct item: the item length from its head; block_size is unused. */
static int direct_bytes_number (struct item_head * ih, int block_size)
{
    int bytes = ih_item_len(ih);

    return bytes;
}
137 | |||
138 | |||
139 | // FIXME: this should probably switch to indirect as well | ||
140 | static void direct_decrement_key (struct cpu_key * key) | ||
141 | { | ||
142 | cpu_key_k_offset_dec (key); | ||
143 | if (cpu_key_k_offset (key) == 0) | ||
144 | set_cpu_key_k_type (key, TYPE_STAT_DATA); | ||
145 | } | ||
146 | |||
147 | |||
/*
 * Returns 0 when the key offset, taken modulo bsize (via a mask with
 * bsize - 1), equals 1; returns 1 otherwise.
 */
static int direct_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
{
    int key_format = le_key_version (key);
    int starts_block = ((le_key_k_offset (key_format, key) & (bsize - 1)) == 1);

    return !starts_block;
}
153 | |||
154 | |||
/* Print the body of a direct item as a double-quoted run of characters. */
static void direct_print_item (struct item_head * ih, char * item)
{
    int pos;

    printk ("\"");
    for (pos = 0; pos < ih_item_len(ih); pos ++)
        printk ("%c", item[pos]);
    printk ("\"\n");
}
165 | |||
166 | |||
/* Consistency check hook for direct items; currently performs no checks. */
static void direct_check_item (struct item_head * ih, char * item)
{
    /* FIXME: type something here! */
    return;
}
171 | |||
172 | |||
173 | static int direct_create_vi (struct virtual_node * vn, | ||
174 | struct virtual_item * vi, | ||
175 | int is_affected, | ||
176 | int insert_size) | ||
177 | { | ||
178 | vi->vi_index = TYPE_DIRECT; | ||
179 | //vi->vi_type |= VI_TYPE_DIRECT; | ||
180 | return 0; | ||
181 | } | ||
182 | |||
/*
 * Round 'free' down to a multiple of 8 and return it, or -1 when the
 * rounded value is 0.  vi, start_skip and end_skip are not used.
 */
static int direct_check_left (struct virtual_item * vi, int free,
                              int start_skip, int end_skip)
{
    int usable = free - free % 8;

    if (usable == 0)
        return -1;

    return usable;
}
191 | |||
192 | |||
/* Same computation as direct_check_left() with both skip counts set to 0. */
static int direct_check_right (struct virtual_item * vi, int free)
{
    int usable = direct_check_left (vi, free, 0, 0);

    return usable;
}
197 | |||
/* Size of 'count' units of a direct item: returns count unchanged. */
static int direct_part_size (struct virtual_item * vi, int first, int count)
{
    int bytes = count;

    return bytes;
}
202 | |||
203 | |||
204 | static int direct_unit_num (struct virtual_item * vi) | ||
205 | { | ||
206 | return vi->vi_item_len - IH_SIZE; | ||
207 | } | ||
208 | |||
209 | |||
210 | static void direct_print_vi (struct virtual_item * vi) | ||
211 | { | ||
212 | reiserfs_warning (NULL, "DIRECT, index %d, type 0x%x, %h", | ||
213 | vi->vi_index, vi->vi_type, vi->vi_ih); | ||
214 | } | ||
215 | |||
216 | static struct item_operations direct_ops = { | ||
217 | .bytes_number = direct_bytes_number, | ||
218 | .decrement_key = direct_decrement_key, | ||
219 | .is_left_mergeable = direct_is_left_mergeable, | ||
220 | .print_item = direct_print_item, | ||
221 | .check_item = direct_check_item, | ||
222 | |||
223 | .create_vi = direct_create_vi, | ||
224 | .check_left = direct_check_left, | ||
225 | .check_right = direct_check_right, | ||
226 | .part_size = direct_part_size, | ||
227 | .unit_num = direct_unit_num, | ||
228 | .print_vi = direct_print_vi | ||
229 | }; | ||
230 | |||
231 | |||
232 | |||
233 | ////////////////////////////////////////////////////////////////////////////// | ||
234 | // indirect item functions | ||
235 | // | ||
236 | |||
237 | static int indirect_bytes_number (struct item_head * ih, int block_size) | ||
238 | { | ||
239 | return ih_item_len(ih) / UNFM_P_SIZE * block_size; //- get_ih_free_space (ih); | ||
240 | } | ||
241 | |||
242 | |||
243 | // decrease offset, if it becomes 0, change type to stat data | ||
244 | static void indirect_decrement_key (struct cpu_key * key) | ||
245 | { | ||
246 | cpu_key_k_offset_dec (key); | ||
247 | if (cpu_key_k_offset (key) == 0) | ||
248 | set_cpu_key_k_type (key, TYPE_STAT_DATA); | ||
249 | } | ||
250 | |||
251 | |||
/* if it is not first item of the body (key offset != 1), then it is mergeable */
static int indirect_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
{
    int key_format = le_key_version (key);
    int is_first_item = (le_key_k_offset (key_format, key) == 1);

    return !is_first_item;
}
258 | |||
259 | |||
/*
 * printing of indirect item
 *
 * Begin a new run: it starts at block 'new' and has length 1.
 */
static void start_new_sequence (__u32 * start, int * len, __u32 new)
{
    *len = 1;
    *start = new;
}
266 | |||
267 | |||
/*
 * Decide whether block 'new' continues the run that begins at 'start' and
 * currently has length *len.
 *
 * A run starting at 0 continues only with another 0; any other run
 * continues only with the next consecutive number (start + *len).
 * start == INT_MAX means "no run yet" and always counts as finished.
 *
 * Returns 0 and increments *len when the run continues, 1 otherwise.
 */
static int sequence_finished (__u32 start, int * len, __u32 new)
{
    int continues;

    if (start == INT_MAX)
        return 1;

    if (start == 0)
        continues = (new == 0);
    else
        continues = ((start + *len) == new);

    if (!continues)
        return 1;

    (*len) ++;
    return 0;
}
283 | |||
284 | static void print_sequence (__u32 start, int len) | ||
285 | { | ||
286 | if (start == INT_MAX) | ||
287 | return; | ||
288 | |||
289 | if (len == 1) | ||
290 | printk (" %d", start); | ||
291 | else | ||
292 | printk (" %d(%d)", start, len); | ||
293 | } | ||
294 | |||
295 | |||
296 | static void indirect_print_item (struct item_head * ih, char * item) | ||
297 | { | ||
298 | int j; | ||
299 | __u32 * unp, prev = INT_MAX; | ||
300 | int num; | ||
301 | |||
302 | unp = (__u32 *)item; | ||
303 | |||
304 | if (ih_item_len(ih) % UNFM_P_SIZE) | ||
305 | reiserfs_warning (NULL, "indirect_print_item: invalid item len"); | ||
306 | |||
307 | printk ("%d pointers\n[ ", (int)I_UNFM_NUM (ih)); | ||
308 | for (j = 0; j < I_UNFM_NUM (ih); j ++) { | ||
309 | if (sequence_finished (prev, &num, get_block_num(unp, j))) { | ||
310 | print_sequence (prev, num); | ||
311 | start_new_sequence (&prev, &num, get_block_num(unp, j)); | ||
312 | } | ||
313 | } | ||
314 | print_sequence (prev, num); | ||
315 | printk ("]\n"); | ||
316 | } | ||
317 | |||
/* Consistency check hook for indirect items; currently performs no checks. */
static void indirect_check_item (struct item_head * ih, char * item)
{
    /* FIXME: type something here! */
    return;
}
322 | |||
323 | |||
324 | static int indirect_create_vi (struct virtual_node * vn, | ||
325 | struct virtual_item * vi, | ||
326 | int is_affected, | ||
327 | int insert_size) | ||
328 | { | ||
329 | vi->vi_index = TYPE_INDIRECT; | ||
330 | //vi->vi_type |= VI_TYPE_INDIRECT; | ||
331 | return 0; | ||
332 | } | ||
333 | |||
334 | static int indirect_check_left (struct virtual_item * vi, int free, | ||
335 | int start_skip, int end_skip) | ||
336 | { | ||
337 | int bytes; | ||
338 | |||
339 | bytes = free - free % UNFM_P_SIZE; | ||
340 | return bytes ?: -1; | ||
341 | } | ||
342 | |||
343 | |||
/* Same computation as indirect_check_left() with both skip counts set to 0. */
static int indirect_check_right (struct virtual_item * vi, int free)
{
    int usable = indirect_check_left (vi, free, 0, 0);

    return usable;
}
348 | |||
349 | |||
350 | |||
/*
 * return size in bytes of 'units' units. If first == 0 - calculate from
 * the head (left), otherwise - from tail (right).
 * unit of indirect item is byte (yet), so the answer is 'units' itself.
 */
static int indirect_part_size (struct virtual_item * vi, int first, int units)
{
    int bytes = units;

    return bytes;
}
357 | |||
358 | static int indirect_unit_num (struct virtual_item * vi) | ||
359 | { | ||
360 | // unit of indirect item is byte (yet) | ||
361 | return vi->vi_item_len - IH_SIZE; | ||
362 | } | ||
363 | |||
364 | static void indirect_print_vi (struct virtual_item * vi) | ||
365 | { | ||
366 | reiserfs_warning (NULL, "INDIRECT, index %d, type 0x%x, %h", | ||
367 | vi->vi_index, vi->vi_type, vi->vi_ih); | ||
368 | } | ||
369 | |||
370 | static struct item_operations indirect_ops = { | ||
371 | .bytes_number = indirect_bytes_number, | ||
372 | .decrement_key = indirect_decrement_key, | ||
373 | .is_left_mergeable = indirect_is_left_mergeable, | ||
374 | .print_item = indirect_print_item, | ||
375 | .check_item = indirect_check_item, | ||
376 | |||
377 | .create_vi = indirect_create_vi, | ||
378 | .check_left = indirect_check_left, | ||
379 | .check_right = indirect_check_right, | ||
380 | .part_size = indirect_part_size, | ||
381 | .unit_num = indirect_unit_num, | ||
382 | .print_vi = indirect_print_vi | ||
383 | }; | ||
384 | |||
385 | |||
386 | ////////////////////////////////////////////////////////////////////////////// | ||
387 | // direntry functions | ||
388 | // | ||
389 | |||
390 | |||
391 | static int direntry_bytes_number (struct item_head * ih, int block_size) | ||
392 | { | ||
393 | reiserfs_warning (NULL, "vs-16090: direntry_bytes_number: " | ||
394 | "bytes number is asked for direntry"); | ||
395 | return 0; | ||
396 | } | ||
397 | |||
398 | static void direntry_decrement_key (struct cpu_key * key) | ||
399 | { | ||
400 | cpu_key_k_offset_dec (key); | ||
401 | if (cpu_key_k_offset (key) == 0) | ||
402 | set_cpu_key_k_type (key, TYPE_STAT_DATA); | ||
403 | } | ||
404 | |||
405 | |||
406 | static int direntry_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize) | ||
407 | { | ||
408 | if (le32_to_cpu (key->u.k_offset_v1.k_offset) == DOT_OFFSET) | ||
409 | return 0; | ||
410 | return 1; | ||
411 | |||
412 | } | ||
413 | |||
414 | |||
415 | static void direntry_print_item (struct item_head * ih, char * item) | ||
416 | { | ||
417 | int i; | ||
418 | int namelen; | ||
419 | struct reiserfs_de_head * deh; | ||
420 | char * name; | ||
421 | static char namebuf [80]; | ||
422 | |||
423 | |||
424 | printk ("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name", "Key of pointed object", "Hash", "Gen number", "Status"); | ||
425 | |||
426 | deh = (struct reiserfs_de_head *)item; | ||
427 | |||
428 | for (i = 0; i < I_ENTRY_COUNT (ih); i ++, deh ++) { | ||
429 | namelen = (i ? (deh_location(deh - 1)) : ih_item_len(ih)) - deh_location(deh); | ||
430 | name = item + deh_location(deh); | ||
431 | if (name[namelen-1] == 0) | ||
432 | namelen = strlen (name); | ||
433 | namebuf[0] = '"'; | ||
434 | if (namelen > sizeof (namebuf) - 3) { | ||
435 | strncpy (namebuf + 1, name, sizeof (namebuf) - 3); | ||
436 | namebuf[sizeof (namebuf) - 2] = '"'; | ||
437 | namebuf[sizeof (namebuf) - 1] = 0; | ||
438 | } else { | ||
439 | memcpy (namebuf + 1, name, namelen); | ||
440 | namebuf[namelen + 1] = '"'; | ||
441 | namebuf[namelen + 2] = 0; | ||
442 | } | ||
443 | |||
444 | printk ("%d: %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n", | ||
445 | i, namebuf, | ||
446 | deh_dir_id(deh), deh_objectid(deh), | ||
447 | GET_HASH_VALUE (deh_offset (deh)), GET_GENERATION_NUMBER ((deh_offset (deh))), | ||
448 | (de_hidden (deh)) ? "HIDDEN" : "VISIBLE"); | ||
449 | } | ||
450 | } | ||
451 | |||
452 | |||
453 | static void direntry_check_item (struct item_head * ih, char * item) | ||
454 | { | ||
455 | int i; | ||
456 | struct reiserfs_de_head * deh; | ||
457 | |||
458 | // FIXME: type something here! | ||
459 | deh = (struct reiserfs_de_head *)item; | ||
460 | for (i = 0; i < I_ENTRY_COUNT (ih); i ++, deh ++) { | ||
461 | ; | ||
462 | } | ||
463 | } | ||
464 | |||
465 | |||
466 | |||
467 | #define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1 | ||
468 | |||
469 | /* | ||
470 | * function returns old entry number in directory item in real node | ||
471 | * using new entry number in virtual item in virtual node */ | ||
472 | static inline int old_entry_num (int is_affected, int virtual_entry_num, int pos_in_item, int mode) | ||
473 | { | ||
474 | if ( mode == M_INSERT || mode == M_DELETE) | ||
475 | return virtual_entry_num; | ||
476 | |||
477 | if (!is_affected) | ||
478 | /* cut or paste is applied to another item */ | ||
479 | return virtual_entry_num; | ||
480 | |||
481 | if (virtual_entry_num < pos_in_item) | ||
482 | return virtual_entry_num; | ||
483 | |||
484 | if (mode == M_CUT) | ||
485 | return virtual_entry_num + 1; | ||
486 | |||
487 | RFALSE( mode != M_PASTE || virtual_entry_num == 0, | ||
488 | "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'", mode); | ||
489 | |||
490 | return virtual_entry_num - 1; | ||
491 | } | ||
492 | |||
493 | |||
494 | |||
495 | |||
496 | /* Create an array of sizes of directory entries for virtual | ||
497 | item. Return space used by an item. FIXME: no control over | ||
498 | consuming of space used by this item handler */ | ||
499 | static int direntry_create_vi (struct virtual_node * vn, | ||
500 | struct virtual_item * vi, | ||
501 | int is_affected, | ||
502 | int insert_size) | ||
503 | { | ||
504 | struct direntry_uarea * dir_u = vi->vi_uarea; | ||
505 | int i, j; | ||
506 | int size = sizeof (struct direntry_uarea); | ||
507 | struct reiserfs_de_head * deh; | ||
508 | |||
509 | vi->vi_index = TYPE_DIRENTRY; | ||
510 | |||
511 | if (!(vi->vi_ih) || !vi->vi_item) | ||
512 | BUG (); | ||
513 | |||
514 | |||
515 | dir_u->flags = 0; | ||
516 | if (le_ih_k_offset (vi->vi_ih) == DOT_OFFSET) | ||
517 | dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM; | ||
518 | |||
519 | deh = (struct reiserfs_de_head *)(vi->vi_item); | ||
520 | |||
521 | |||
522 | /* virtual directory item have this amount of entry after */ | ||
523 | dir_u->entry_count = ih_entry_count (vi->vi_ih) + | ||
524 | ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 : | ||
525 | (vn->vn_mode == M_PASTE ? 1 : 0)) : 0); | ||
526 | |||
527 | for (i = 0; i < dir_u->entry_count; i ++) { | ||
528 | j = old_entry_num (is_affected, i, vn->vn_pos_in_item, vn->vn_mode); | ||
529 | dir_u->entry_sizes[i] = (j ? deh_location( &(deh[j - 1]) ) : | ||
530 | ih_item_len (vi->vi_ih)) - | ||
531 | deh_location( &(deh[j])) + DEH_SIZE; | ||
532 | } | ||
533 | |||
534 | size += (dir_u->entry_count * sizeof (short)); | ||
535 | |||
536 | /* set size of pasted entry */ | ||
537 | if (is_affected && vn->vn_mode == M_PASTE) | ||
538 | dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size; | ||
539 | |||
540 | |||
541 | #ifdef CONFIG_REISERFS_CHECK | ||
542 | /* compare total size of entries with item length */ | ||
543 | { | ||
544 | int k, l; | ||
545 | |||
546 | l = 0; | ||
547 | for (k = 0; k < dir_u->entry_count; k ++) | ||
548 | l += dir_u->entry_sizes[k]; | ||
549 | |||
550 | if (l + IH_SIZE != vi->vi_item_len + | ||
551 | ((is_affected && (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT)) ? insert_size : 0) ) { | ||
552 | reiserfs_panic (NULL, "vs-8025: set_entry_sizes: (mode==%c, insert_size==%d), invalid length of directory item", | ||
553 | vn->vn_mode, insert_size); | ||
554 | } | ||
555 | } | ||
556 | #endif | ||
557 | |||
558 | return size; | ||
559 | |||
560 | |||
561 | } | ||
562 | |||
563 | |||
564 | // | ||
565 | // return number of entries which may fit into specified amount of | ||
566 | // free space, or -1 if free space is not enough even for 1 entry | ||
567 | // | ||
568 | static int direntry_check_left (struct virtual_item * vi, int free, | ||
569 | int start_skip, int end_skip) | ||
570 | { | ||
571 | int i; | ||
572 | int entries = 0; | ||
573 | struct direntry_uarea * dir_u = vi->vi_uarea; | ||
574 | |||
575 | for (i = start_skip; i < dir_u->entry_count - end_skip; i ++) { | ||
576 | if (dir_u->entry_sizes[i] > free) | ||
577 | /* i-th entry doesn't fit into the remaining free space */ | ||
578 | break; | ||
579 | |||
580 | free -= dir_u->entry_sizes[i]; | ||
581 | entries ++; | ||
582 | } | ||
583 | |||
584 | if (entries == dir_u->entry_count) { | ||
585 | reiserfs_panic (NULL, "free space %d, entry_count %d\n", free, dir_u->entry_count); | ||
586 | } | ||
587 | |||
588 | /* "." and ".." can not be separated from each other */ | ||
589 | if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) && entries < 2) | ||
590 | entries = 0; | ||
591 | |||
592 | return entries ?: -1; | ||
593 | } | ||
594 | |||
595 | |||
596 | static int direntry_check_right (struct virtual_item * vi, int free) | ||
597 | { | ||
598 | int i; | ||
599 | int entries = 0; | ||
600 | struct direntry_uarea * dir_u = vi->vi_uarea; | ||
601 | |||
602 | for (i = dir_u->entry_count - 1; i >= 0; i --) { | ||
603 | if (dir_u->entry_sizes[i] > free) | ||
604 | /* i-th entry doesn't fit into the remaining free space */ | ||
605 | break; | ||
606 | |||
607 | free -= dir_u->entry_sizes[i]; | ||
608 | entries ++; | ||
609 | } | ||
610 | if (entries == dir_u->entry_count) | ||
611 | BUG (); | ||
612 | |||
613 | /* "." and ".." can not be separated from each other */ | ||
614 | if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) && entries > dir_u->entry_count - 2) | ||
615 | entries = dir_u->entry_count - 2; | ||
616 | |||
617 | return entries ?: -1; | ||
618 | } | ||
619 | |||
620 | |||
621 | /* sum of entry sizes between from-th and to-th entries including both edges */ | ||
622 | static int direntry_part_size (struct virtual_item * vi, int first, int count) | ||
623 | { | ||
624 | int i, retval; | ||
625 | int from, to; | ||
626 | struct direntry_uarea * dir_u = vi->vi_uarea; | ||
627 | |||
628 | retval = 0; | ||
629 | if (first == 0) | ||
630 | from = 0; | ||
631 | else | ||
632 | from = dir_u->entry_count - count; | ||
633 | to = from + count - 1; | ||
634 | |||
635 | for (i = from; i <= to; i ++) | ||
636 | retval += dir_u->entry_sizes[i]; | ||
637 | |||
638 | return retval; | ||
639 | } | ||
640 | |||
641 | static int direntry_unit_num (struct virtual_item * vi) | ||
642 | { | ||
643 | struct direntry_uarea * dir_u = vi->vi_uarea; | ||
644 | |||
645 | return dir_u->entry_count; | ||
646 | } | ||
647 | |||
648 | |||
649 | |||
650 | static void direntry_print_vi (struct virtual_item * vi) | ||
651 | { | ||
652 | int i; | ||
653 | struct direntry_uarea * dir_u = vi->vi_uarea; | ||
654 | |||
655 | reiserfs_warning (NULL, "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x", | ||
656 | vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags); | ||
657 | printk ("%d entries: ", dir_u->entry_count); | ||
658 | for (i = 0; i < dir_u->entry_count; i ++) | ||
659 | printk ("%d ", dir_u->entry_sizes[i]); | ||
660 | printk ("\n"); | ||
661 | } | ||
662 | |||
663 | static struct item_operations direntry_ops = { | ||
664 | .bytes_number = direntry_bytes_number, | ||
665 | .decrement_key = direntry_decrement_key, | ||
666 | .is_left_mergeable = direntry_is_left_mergeable, | ||
667 | .print_item = direntry_print_item, | ||
668 | .check_item = direntry_check_item, | ||
669 | |||
670 | .create_vi = direntry_create_vi, | ||
671 | .check_left = direntry_check_left, | ||
672 | .check_right = direntry_check_right, | ||
673 | .part_size = direntry_part_size, | ||
674 | .unit_num = direntry_unit_num, | ||
675 | .print_vi = direntry_print_vi | ||
676 | }; | ||
677 | |||
678 | |||
679 | ////////////////////////////////////////////////////////////////////////////// | ||
680 | // Error catching functions to catch errors caused by incorrect item types. | ||
681 | // | ||
682 | static int errcatch_bytes_number (struct item_head * ih, int block_size) | ||
683 | { | ||
684 | reiserfs_warning (NULL, "green-16001: Invalid item type observed, run fsck ASAP"); | ||
685 | return 0; | ||
686 | } | ||
687 | |||
688 | static void errcatch_decrement_key (struct cpu_key * key) | ||
689 | { | ||
690 | reiserfs_warning (NULL, "green-16002: Invalid item type observed, run fsck ASAP"); | ||
691 | } | ||
692 | |||
693 | |||
694 | static int errcatch_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize) | ||
695 | { | ||
696 | reiserfs_warning (NULL, "green-16003: Invalid item type observed, run fsck ASAP"); | ||
697 | return 0; | ||
698 | } | ||
699 | |||
700 | |||
701 | static void errcatch_print_item (struct item_head * ih, char * item) | ||
702 | { | ||
703 | reiserfs_warning (NULL, "green-16004: Invalid item type observed, run fsck ASAP"); | ||
704 | } | ||
705 | |||
706 | |||
707 | static void errcatch_check_item (struct item_head * ih, char * item) | ||
708 | { | ||
709 | reiserfs_warning (NULL, "green-16005: Invalid item type observed, run fsck ASAP"); | ||
710 | } | ||
711 | |||
712 | static int errcatch_create_vi (struct virtual_node * vn, | ||
713 | struct virtual_item * vi, | ||
714 | int is_affected, | ||
715 | int insert_size) | ||
716 | { | ||
717 | reiserfs_warning (NULL, "green-16006: Invalid item type observed, run fsck ASAP"); | ||
718 | return 0; // We might return -1 here as well, but it won't help as create_virtual_node() from where | ||
719 | // this operation is called from is of return type void. | ||
720 | } | ||
721 | |||
722 | static int errcatch_check_left (struct virtual_item * vi, int free, | ||
723 | int start_skip, int end_skip) | ||
724 | { | ||
725 | reiserfs_warning (NULL, "green-16007: Invalid item type observed, run fsck ASAP"); | ||
726 | return -1; | ||
727 | } | ||
728 | |||
729 | |||
730 | static int errcatch_check_right (struct virtual_item * vi, int free) | ||
731 | { | ||
732 | reiserfs_warning (NULL, "green-16008: Invalid item type observed, run fsck ASAP"); | ||
733 | return -1; | ||
734 | } | ||
735 | |||
736 | static int errcatch_part_size (struct virtual_item * vi, int first, int count) | ||
737 | { | ||
738 | reiserfs_warning (NULL, "green-16009: Invalid item type observed, run fsck ASAP"); | ||
739 | return 0; | ||
740 | } | ||
741 | |||
742 | static int errcatch_unit_num (struct virtual_item * vi) | ||
743 | { | ||
744 | reiserfs_warning (NULL, "green-16010: Invalid item type observed, run fsck ASAP"); | ||
745 | return 0; | ||
746 | } | ||
747 | |||
748 | static void errcatch_print_vi (struct virtual_item * vi) | ||
749 | { | ||
750 | reiserfs_warning (NULL, "green-16011: Invalid item type observed, run fsck ASAP"); | ||
751 | } | ||
752 | |||
753 | static struct item_operations errcatch_ops = { | ||
754 | errcatch_bytes_number, | ||
755 | errcatch_decrement_key, | ||
756 | errcatch_is_left_mergeable, | ||
757 | errcatch_print_item, | ||
758 | errcatch_check_item, | ||
759 | |||
760 | errcatch_create_vi, | ||
761 | errcatch_check_left, | ||
762 | errcatch_check_right, | ||
763 | errcatch_part_size, | ||
764 | errcatch_unit_num, | ||
765 | errcatch_print_vi | ||
766 | }; | ||
767 | |||
768 | |||
769 | |||
770 | ////////////////////////////////////////////////////////////////////////////// | ||
771 | // | ||
772 | // | ||
773 | #if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) | ||
774 | do not compile | ||
775 | #endif | ||
776 | |||
777 | struct item_operations * item_ops [TYPE_ANY + 1] = { | ||
778 | &stat_data_ops, | ||
779 | &indirect_ops, | ||
780 | &direct_ops, | ||
781 | &direntry_ops, | ||
782 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | ||
783 | &errcatch_ops /* This is to catch errors with invalid type (15th entry for TYPE_ANY) */ | ||
784 | }; | ||
785 | |||
786 | |||
787 | |||
788 | |||
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c new file mode 100644 index 000000000000..c9ad3a7849f4 --- /dev/null +++ b/fs/reiserfs/journal.c | |||
@@ -0,0 +1,3876 @@ | |||
1 | /* | ||
2 | ** Write ahead logging implementation copyright Chris Mason 2000 | ||
3 | ** | ||
4 | ** The background commits make this code very interrelated, and | ||
5 | ** overly complex. I need to rethink things a bit....The major players: | ||
6 | ** | ||
7 | ** journal_begin -- call with the number of blocks you expect to log. | ||
8 | ** If the current transaction is too | ||
9 | ** old, it will block until the current transaction is | ||
10 | ** finished, and then start a new one. | ||
11 | ** Usually, your transaction will get joined in with | ||
12 | ** previous ones for speed. | ||
13 | ** | ||
14 | ** journal_join -- same as journal_begin, but won't block on the current | ||
15 | ** transaction regardless of age. Don't ever call | ||
16 | ** this. Ever. There are only two places it should be | ||
17 | ** called from, and they are both inside this file. | ||
18 | ** | ||
19 | ** journal_mark_dirty -- adds blocks into this transaction. clears any flags | ||
20 | ** that might make them get sent to disk | ||
21 | ** and then marks them BH_JDirty. Puts the buffer head | ||
22 | ** into the current transaction hash. | ||
23 | ** | ||
24 | ** journal_end -- if the current transaction is batchable, it does nothing | ||
25 | ** otherwise, it could do an async/synchronous commit, or | ||
26 | ** a full flush of all log and real blocks in the | ||
27 | ** transaction. | ||
28 | ** | ||
29 | ** flush_old_commits -- if the current transaction is too old, it is ended and | ||
30 | ** commit blocks are sent to disk. Forces commit blocks | ||
31 | ** to disk for all backgrounded commits that have been | ||
32 | ** around too long. | ||
33 | ** -- Note, if you call this as an immediate flush | ||
34 | ** from within kupdate, it will ignore the immediate flag | ||
35 | */ | ||
36 | |||
37 | #include <linux/config.h> | ||
38 | #include <asm/uaccess.h> | ||
39 | #include <asm/system.h> | ||
40 | |||
41 | #include <linux/time.h> | ||
42 | #include <asm/semaphore.h> | ||
43 | |||
44 | #include <linux/vmalloc.h> | ||
45 | #include <linux/reiserfs_fs.h> | ||
46 | |||
47 | #include <linux/kernel.h> | ||
48 | #include <linux/errno.h> | ||
49 | #include <linux/fcntl.h> | ||
50 | #include <linux/stat.h> | ||
51 | #include <linux/string.h> | ||
52 | #include <linux/smp_lock.h> | ||
53 | #include <linux/buffer_head.h> | ||
54 | #include <linux/workqueue.h> | ||
55 | #include <linux/writeback.h> | ||
56 | #include <linux/blkdev.h> | ||
57 | |||
58 | |||
/* gets a struct reiserfs_journal_list * from one of its embedded list heads */
#define JOURNAL_LIST_ENTRY(h) \
    (list_entry((h), struct reiserfs_journal_list, j_list))
#define JOURNAL_WORK_ENTRY(h) \
    (list_entry((h), struct reiserfs_journal_list, j_working_list))

/*
 * the number of mounted filesystems.  This is used to decide when to
 * start and kill the commit workqueue
 */
static int reiserfs_mounted_fs_count;

static struct workqueue_struct *commit_wq;

/* must be correct to keep the desc and commit structs at 4k */
#define JOURNAL_TRANS_HALF 1018
#define BUFNR 64 /* read ahead */

/* cnode stat bits.  Move these into reiserfs_fs.h */

/* this block was freed, and can't be written. */
#define BLOCK_FREED 2
/* this block was freed during this transaction, and can't be written */
#define BLOCK_FREED_HOLDER 3

/* used in flush_journal_list */
#define BLOCK_NEEDS_FLUSH 4
#define BLOCK_DIRTIED 5

/* journal list state bits */
#define LIST_TOUCHED 1
#define LIST_DIRTY 2
#define LIST_COMMIT_PENDING 4 /* someone will commit this list */

/* flags for do_journal_end */
#define FLUSH_ALL 1  /* flush commit and real blocks */
#define COMMIT_NOW 2 /* end and commit this transaction */
#define WAIT 4       /* wait for the log blocks to hit the disk */

/* forward declarations for functions defined further down in this file */
static int do_journal_end(struct reiserfs_transaction_handle *,
                          struct super_block *,
                          unsigned long nblocks, int flags);
static int flush_journal_list(struct super_block *s,
                              struct reiserfs_journal_list *jl, int flushall);
static int flush_commit_list(struct super_block *s,
                             struct reiserfs_journal_list *jl, int flushall);
static int can_dirty(struct reiserfs_journal_cnode *cn);
static int journal_join(struct reiserfs_transaction_handle *th,
                        struct super_block *p_s_sb, unsigned long nblocks);
static int release_journal_dev(struct super_block *super,
                               struct reiserfs_journal *journal);
static int dirty_one_transaction(struct super_block *s,
                                 struct reiserfs_journal_list *jl);
static void flush_async_commits(void *p);
static void queue_log_writer(struct super_block *s);

/* values for join in do_journal_begin_r */
enum {
    JBEGIN_REG = 0,   /* regular journal begin */
    JBEGIN_JOIN = 1,  /* join the running transaction if at all possible */
    JBEGIN_ABORT = 2, /* called from cleanup code, ignores aborted flag */
};

static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
                              struct super_block *p_s_sb,
                              unsigned long nblocks, int join);
117 | |||
118 | static void init_journal_hash(struct super_block *p_s_sb) { | ||
119 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
120 | memset(journal->j_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ; | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | ** clears BH_Dirty and sticks the buffer on the clean list. Called because I can't allow refile_buffer to | ||
125 | ** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for | ||
126 | ** more details. | ||
127 | */ | ||
/*
 * Clears the dirty and journal_test bits of @bh.  A NULL @bh is accepted
 * and ignored.  Always returns 0.
 */
static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
{
    if (!bh)
        return 0;

    clear_buffer_dirty(bh);
    clear_buffer_journal_test(bh);
    return 0;
}
135 | |||
136 | static void disable_barrier(struct super_block *s) | ||
137 | { | ||
138 | REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH); | ||
139 | printk("reiserfs: disabling flush barriers on %s\n", reiserfs_bdevname(s)); | ||
140 | } | ||
141 | |||
142 | static struct reiserfs_bitmap_node * | ||
143 | allocate_bitmap_node(struct super_block *p_s_sb) { | ||
144 | struct reiserfs_bitmap_node *bn ; | ||
145 | static int id; | ||
146 | |||
147 | bn = reiserfs_kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS, p_s_sb) ; | ||
148 | if (!bn) { | ||
149 | return NULL ; | ||
150 | } | ||
151 | bn->data = reiserfs_kmalloc(p_s_sb->s_blocksize, GFP_NOFS, p_s_sb) ; | ||
152 | if (!bn->data) { | ||
153 | reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ; | ||
154 | return NULL ; | ||
155 | } | ||
156 | bn->id = id++ ; | ||
157 | memset(bn->data, 0, p_s_sb->s_blocksize) ; | ||
158 | INIT_LIST_HEAD(&bn->list) ; | ||
159 | return bn ; | ||
160 | } | ||
161 | |||
162 | static struct reiserfs_bitmap_node * | ||
163 | get_bitmap_node(struct super_block *p_s_sb) { | ||
164 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
165 | struct reiserfs_bitmap_node *bn = NULL; | ||
166 | struct list_head *entry = journal->j_bitmap_nodes.next ; | ||
167 | |||
168 | journal->j_used_bitmap_nodes++ ; | ||
169 | repeat: | ||
170 | |||
171 | if(entry != &journal->j_bitmap_nodes) { | ||
172 | bn = list_entry(entry, struct reiserfs_bitmap_node, list) ; | ||
173 | list_del(entry) ; | ||
174 | memset(bn->data, 0, p_s_sb->s_blocksize) ; | ||
175 | journal->j_free_bitmap_nodes-- ; | ||
176 | return bn ; | ||
177 | } | ||
178 | bn = allocate_bitmap_node(p_s_sb) ; | ||
179 | if (!bn) { | ||
180 | yield(); | ||
181 | goto repeat ; | ||
182 | } | ||
183 | return bn ; | ||
184 | } | ||
185 | static inline void free_bitmap_node(struct super_block *p_s_sb, | ||
186 | struct reiserfs_bitmap_node *bn) { | ||
187 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
188 | journal->j_used_bitmap_nodes-- ; | ||
189 | if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { | ||
190 | reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb) ; | ||
191 | reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ; | ||
192 | } else { | ||
193 | list_add(&bn->list, &journal->j_bitmap_nodes) ; | ||
194 | journal->j_free_bitmap_nodes++ ; | ||
195 | } | ||
196 | } | ||
197 | |||
198 | static void allocate_bitmap_nodes(struct super_block *p_s_sb) { | ||
199 | int i ; | ||
200 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
201 | struct reiserfs_bitmap_node *bn = NULL ; | ||
202 | for (i = 0 ; i < REISERFS_MIN_BITMAP_NODES ; i++) { | ||
203 | bn = allocate_bitmap_node(p_s_sb) ; | ||
204 | if (bn) { | ||
205 | list_add(&bn->list, &journal->j_bitmap_nodes) ; | ||
206 | journal->j_free_bitmap_nodes++ ; | ||
207 | } else { | ||
208 | break ; // this is ok, we'll try again when more are needed | ||
209 | } | ||
210 | } | ||
211 | } | ||
212 | |||
213 | static int set_bit_in_list_bitmap(struct super_block *p_s_sb, int block, | ||
214 | struct reiserfs_list_bitmap *jb) { | ||
215 | int bmap_nr = block / (p_s_sb->s_blocksize << 3) ; | ||
216 | int bit_nr = block % (p_s_sb->s_blocksize << 3) ; | ||
217 | |||
218 | if (!jb->bitmaps[bmap_nr]) { | ||
219 | jb->bitmaps[bmap_nr] = get_bitmap_node(p_s_sb) ; | ||
220 | } | ||
221 | set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data) ; | ||
222 | return 0 ; | ||
223 | } | ||
224 | |||
225 | static void cleanup_bitmap_list(struct super_block *p_s_sb, | ||
226 | struct reiserfs_list_bitmap *jb) { | ||
227 | int i; | ||
228 | if (jb->bitmaps == NULL) | ||
229 | return; | ||
230 | |||
231 | for (i = 0 ; i < SB_BMAP_NR(p_s_sb) ; i++) { | ||
232 | if (jb->bitmaps[i]) { | ||
233 | free_bitmap_node(p_s_sb, jb->bitmaps[i]) ; | ||
234 | jb->bitmaps[i] = NULL ; | ||
235 | } | ||
236 | } | ||
237 | } | ||
238 | |||
239 | /* | ||
240 | ** only call this on FS unmount. | ||
241 | */ | ||
242 | static int free_list_bitmaps(struct super_block *p_s_sb, | ||
243 | struct reiserfs_list_bitmap *jb_array) { | ||
244 | int i ; | ||
245 | struct reiserfs_list_bitmap *jb ; | ||
246 | for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) { | ||
247 | jb = jb_array + i ; | ||
248 | jb->journal_list = NULL ; | ||
249 | cleanup_bitmap_list(p_s_sb, jb) ; | ||
250 | vfree(jb->bitmaps) ; | ||
251 | jb->bitmaps = NULL ; | ||
252 | } | ||
253 | return 0; | ||
254 | } | ||
255 | |||
256 | static int free_bitmap_nodes(struct super_block *p_s_sb) { | ||
257 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
258 | struct list_head *next = journal->j_bitmap_nodes.next ; | ||
259 | struct reiserfs_bitmap_node *bn ; | ||
260 | |||
261 | while(next != &journal->j_bitmap_nodes) { | ||
262 | bn = list_entry(next, struct reiserfs_bitmap_node, list) ; | ||
263 | list_del(next) ; | ||
264 | reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb) ; | ||
265 | reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ; | ||
266 | next = journal->j_bitmap_nodes.next ; | ||
267 | journal->j_free_bitmap_nodes-- ; | ||
268 | } | ||
269 | |||
270 | return 0 ; | ||
271 | } | ||
272 | |||
273 | /* | ||
274 | ** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. | ||
275 | ** jb_array is the array to be filled in. | ||
276 | */ | ||
277 | int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb, | ||
278 | struct reiserfs_list_bitmap *jb_array, | ||
279 | int bmap_nr) { | ||
280 | int i ; | ||
281 | int failed = 0 ; | ||
282 | struct reiserfs_list_bitmap *jb ; | ||
283 | int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *) ; | ||
284 | |||
285 | for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) { | ||
286 | jb = jb_array + i ; | ||
287 | jb->journal_list = NULL ; | ||
288 | jb->bitmaps = vmalloc( mem ) ; | ||
289 | if (!jb->bitmaps) { | ||
290 | reiserfs_warning(p_s_sb, "clm-2000, unable to allocate bitmaps for journal lists") ; | ||
291 | failed = 1; | ||
292 | break ; | ||
293 | } | ||
294 | memset(jb->bitmaps, 0, mem) ; | ||
295 | } | ||
296 | if (failed) { | ||
297 | free_list_bitmaps(p_s_sb, jb_array) ; | ||
298 | return -1 ; | ||
299 | } | ||
300 | return 0 ; | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | ** find an available list bitmap. If you can't find one, flush a commit list | ||
305 | ** and try again | ||
306 | */ | ||
307 | static struct reiserfs_list_bitmap * | ||
308 | get_list_bitmap(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) { | ||
309 | int i,j ; | ||
310 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
311 | struct reiserfs_list_bitmap *jb = NULL ; | ||
312 | |||
313 | for (j = 0 ; j < (JOURNAL_NUM_BITMAPS * 3) ; j++) { | ||
314 | i = journal->j_list_bitmap_index ; | ||
315 | journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS ; | ||
316 | jb = journal->j_list_bitmap + i ; | ||
317 | if (journal->j_list_bitmap[i].journal_list) { | ||
318 | flush_commit_list(p_s_sb, journal->j_list_bitmap[i].journal_list, 1) ; | ||
319 | if (!journal->j_list_bitmap[i].journal_list) { | ||
320 | break ; | ||
321 | } | ||
322 | } else { | ||
323 | break ; | ||
324 | } | ||
325 | } | ||
326 | if (jb->journal_list) { /* double check to make sure if flushed correctly */ | ||
327 | return NULL ; | ||
328 | } | ||
329 | jb->journal_list = jl ; | ||
330 | return jb ; | ||
331 | } | ||
332 | |||
333 | /* | ||
334 | ** allocates a new chunk of X nodes, and links them all together as a list. | ||
335 | ** Uses the cnode->next and cnode->prev pointers | ||
336 | ** returns NULL on failure | ||
337 | */ | ||
338 | static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) { | ||
339 | struct reiserfs_journal_cnode *head ; | ||
340 | int i ; | ||
341 | if (num_cnodes <= 0) { | ||
342 | return NULL ; | ||
343 | } | ||
344 | head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)) ; | ||
345 | if (!head) { | ||
346 | return NULL ; | ||
347 | } | ||
348 | memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)) ; | ||
349 | head[0].prev = NULL ; | ||
350 | head[0].next = head + 1 ; | ||
351 | for (i = 1 ; i < num_cnodes; i++) { | ||
352 | head[i].prev = head + (i - 1) ; | ||
353 | head[i].next = head + (i + 1) ; /* if last one, overwrite it after the if */ | ||
354 | } | ||
355 | head[num_cnodes -1].next = NULL ; | ||
356 | return head ; | ||
357 | } | ||
358 | |||
359 | /* | ||
360 | ** pulls a cnode off the free list, or returns NULL on failure | ||
361 | */ | ||
362 | static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb) { | ||
363 | struct reiserfs_journal_cnode *cn ; | ||
364 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
365 | |||
366 | reiserfs_check_lock_depth(p_s_sb, "get_cnode") ; | ||
367 | |||
368 | if (journal->j_cnode_free <= 0) { | ||
369 | return NULL ; | ||
370 | } | ||
371 | journal->j_cnode_used++ ; | ||
372 | journal->j_cnode_free-- ; | ||
373 | cn = journal->j_cnode_free_list ; | ||
374 | if (!cn) { | ||
375 | return cn ; | ||
376 | } | ||
377 | if (cn->next) { | ||
378 | cn->next->prev = NULL ; | ||
379 | } | ||
380 | journal->j_cnode_free_list = cn->next ; | ||
381 | memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; | ||
382 | return cn ; | ||
383 | } | ||
384 | |||
385 | /* | ||
386 | ** returns a cnode to the free list | ||
387 | */ | ||
388 | static void free_cnode(struct super_block *p_s_sb, struct reiserfs_journal_cnode *cn) { | ||
389 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
390 | |||
391 | reiserfs_check_lock_depth(p_s_sb, "free_cnode") ; | ||
392 | |||
393 | journal->j_cnode_used-- ; | ||
394 | journal->j_cnode_free++ ; | ||
395 | /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */ | ||
396 | cn->next = journal->j_cnode_free_list ; | ||
397 | if (journal->j_cnode_free_list) { | ||
398 | journal->j_cnode_free_list->prev = cn ; | ||
399 | } | ||
400 | cn->prev = NULL ; /* not needed with the memset, but I might kill the memset, and forget to do this */ | ||
401 | journal->j_cnode_free_list = cn ; | ||
402 | } | ||
403 | |||
/* Drop both the journal_prepared and journal_restore_dirty bits of @bh. */
static void clear_prepared_bits(struct buffer_head *bh)
{
    clear_buffer_journal_prepared(bh);
    clear_buffer_journal_restore_dirty(bh);
}
408 | |||
409 | /* utility function to force a BUG if it is called without the big | ||
410 | ** kernel lock held. caller is the string printed just before calling BUG() | ||
411 | */ | ||
/*
 * On CONFIG_SMP builds, panic (naming @caller) when the current task's
 * lock_depth is negative.  On other builds this is a no-op.
 */
void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
{
#ifdef CONFIG_SMP
    if (current->lock_depth >= 0)
        return;

    reiserfs_panic(sb, "%s called without kernel lock held", caller);
#endif
}
421 | |||
422 | /* return a cnode with same dev, block number and size in table, or null if not found */ | ||
423 | static inline struct reiserfs_journal_cnode * | ||
424 | get_journal_hash_dev(struct super_block *sb, | ||
425 | struct reiserfs_journal_cnode **table, | ||
426 | long bl) | ||
427 | { | ||
428 | struct reiserfs_journal_cnode *cn ; | ||
429 | cn = journal_hash(table, sb, bl) ; | ||
430 | while(cn) { | ||
431 | if (cn->blocknr == bl && cn->sb == sb) | ||
432 | return cn ; | ||
433 | cn = cn->hnext ; | ||
434 | } | ||
435 | return (struct reiserfs_journal_cnode *)0 ; | ||
436 | } | ||
437 | |||
438 | /* | ||
439 | ** this actually means 'can this block be reallocated yet?'. If you set search_all, a block can only be allocated | ||
440 | ** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever | ||
441 | ** being overwritten by a replay after crashing. | ||
442 | ** | ||
443 | ** If you don't set search_all, a block can only be allocated if it is not in the current transaction. Since deleting | ||
444 | ** a block removes it from the current transaction, this case should never happen. If you don't set search_all, make | ||
445 | ** sure you never write the block without logging it. | ||
446 | ** | ||
447 | ** next_zero_bit is a suggestion about the next block to try for find_forward. | ||
448 | ** when bl is rejected because it is set in a journal list bitmap, we search | ||
449 | ** for the next zero bit in the bitmap that rejected bl. Then, we return that | ||
450 | ** through next_zero_bit for find_forward to try. | ||
451 | ** | ||
452 | ** Just because we return something in next_zero_bit does not mean we won't | ||
453 | ** reject it on the next call to reiserfs_in_journal | ||
454 | ** | ||
455 | */ | ||
456 | int reiserfs_in_journal(struct super_block *p_s_sb, | ||
457 | int bmap_nr, int bit_nr, int search_all, | ||
458 | b_blocknr_t *next_zero_bit) { | ||
459 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
460 | struct reiserfs_journal_cnode *cn ; | ||
461 | struct reiserfs_list_bitmap *jb ; | ||
462 | int i ; | ||
463 | unsigned long bl; | ||
464 | |||
465 | *next_zero_bit = 0 ; /* always start this at zero. */ | ||
466 | |||
467 | PROC_INFO_INC( p_s_sb, journal.in_journal ); | ||
468 | /* If we aren't doing a search_all, this is a metablock, and it will be logged before use. | ||
469 | ** if we crash before the transaction that freed it commits, this transaction won't | ||
470 | ** have committed either, and the block will never be written | ||
471 | */ | ||
472 | if (search_all) { | ||
473 | for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) { | ||
474 | PROC_INFO_INC( p_s_sb, journal.in_journal_bitmap ); | ||
475 | jb = journal->j_list_bitmap + i ; | ||
476 | if (jb->journal_list && jb->bitmaps[bmap_nr] && | ||
477 | test_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data)) { | ||
478 | *next_zero_bit = find_next_zero_bit((unsigned long *) | ||
479 | (jb->bitmaps[bmap_nr]->data), | ||
480 | p_s_sb->s_blocksize << 3, bit_nr+1) ; | ||
481 | return 1 ; | ||
482 | } | ||
483 | } | ||
484 | } | ||
485 | |||
486 | bl = bmap_nr * (p_s_sb->s_blocksize << 3) + bit_nr; | ||
487 | /* is it in any old transactions? */ | ||
488 | if (search_all && (cn = get_journal_hash_dev(p_s_sb, journal->j_list_hash_table, bl))) { | ||
489 | return 1; | ||
490 | } | ||
491 | |||
492 | /* is it in the current transaction. This should never happen */ | ||
493 | if ((cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, bl))) { | ||
494 | BUG(); | ||
495 | return 1; | ||
496 | } | ||
497 | |||
498 | PROC_INFO_INC( p_s_sb, journal.in_journal_reusable ); | ||
499 | /* safe for reuse */ | ||
500 | return 0 ; | ||
501 | } | ||
502 | |||
503 | /* insert cn into table | ||
504 | */ | ||
505 | static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, struct reiserfs_journal_cnode *cn) { | ||
506 | struct reiserfs_journal_cnode *cn_orig ; | ||
507 | |||
508 | cn_orig = journal_hash(table, cn->sb, cn->blocknr) ; | ||
509 | cn->hnext = cn_orig ; | ||
510 | cn->hprev = NULL ; | ||
511 | if (cn_orig) { | ||
512 | cn_orig->hprev = cn ; | ||
513 | } | ||
514 | journal_hash(table, cn->sb, cn->blocknr) = cn ; | ||
515 | } | ||
516 | |||
517 | /* lock the current transaction */ | ||
518 | inline static void lock_journal(struct super_block *p_s_sb) { | ||
519 | PROC_INFO_INC( p_s_sb, journal.lock_journal ); | ||
520 | down(&SB_JOURNAL(p_s_sb)->j_lock); | ||
521 | } | ||
522 | |||
523 | /* unlock the current transaction */ | ||
524 | inline static void unlock_journal(struct super_block *p_s_sb) { | ||
525 | up(&SB_JOURNAL(p_s_sb)->j_lock); | ||
526 | } | ||
527 | |||
528 | static inline void get_journal_list(struct reiserfs_journal_list *jl) | ||
529 | { | ||
530 | jl->j_refcount++; | ||
531 | } | ||
532 | |||
533 | static inline void put_journal_list(struct super_block *s, | ||
534 | struct reiserfs_journal_list *jl) | ||
535 | { | ||
536 | if (jl->j_refcount < 1) { | ||
537 | reiserfs_panic (s, "trans id %lu, refcount at %d", jl->j_trans_id, | ||
538 | jl->j_refcount); | ||
539 | } | ||
540 | if (--jl->j_refcount == 0) | ||
541 | reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s); | ||
542 | } | ||
543 | |||
544 | /* | ||
545 | ** this used to be much more involved, and I'm keeping it just in case things get ugly again. | ||
546 | ** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a | ||
547 | ** transaction. | ||
548 | */ | ||
549 | static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) { | ||
550 | |||
551 | struct reiserfs_list_bitmap *jb = jl->j_list_bitmap ; | ||
552 | if (jb) { | ||
553 | cleanup_bitmap_list(p_s_sb, jb) ; | ||
554 | } | ||
555 | jl->j_list_bitmap->journal_list = NULL ; | ||
556 | jl->j_list_bitmap = NULL ; | ||
557 | } | ||
558 | |||
559 | static int journal_list_still_alive(struct super_block *s, | ||
560 | unsigned long trans_id) | ||
561 | { | ||
562 | struct reiserfs_journal *journal = SB_JOURNAL (s); | ||
563 | struct list_head *entry = &journal->j_journal_list; | ||
564 | struct reiserfs_journal_list *jl; | ||
565 | |||
566 | if (!list_empty(entry)) { | ||
567 | jl = JOURNAL_LIST_ENTRY(entry->next); | ||
568 | if (jl->j_trans_id <= trans_id) { | ||
569 | return 1; | ||
570 | } | ||
571 | } | ||
572 | return 0; | ||
573 | } | ||
574 | |||
575 | static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { | ||
576 | char b[BDEVNAME_SIZE]; | ||
577 | |||
578 | if (buffer_journaled(bh)) { | ||
579 | reiserfs_warning(NULL, "clm-2084: pinned buffer %lu:%s sent to disk", | ||
580 | bh->b_blocknr, bdevname(bh->b_bdev, b)) ; | ||
581 | } | ||
582 | if (uptodate) | ||
583 | set_buffer_uptodate(bh) ; | ||
584 | else | ||
585 | clear_buffer_uptodate(bh) ; | ||
586 | unlock_buffer(bh) ; | ||
587 | put_bh(bh) ; | ||
588 | } | ||
589 | |||
/*
 * b_end_io handler installed by submit_ordered_buffer() and
 * submit_barrier_buffer().  Records the I/O result in the uptodate bit,
 * then unlocks the buffer and drops one reference.
 */
static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate)
{
    if (!uptodate)
        clear_buffer_uptodate(bh);
    else
        set_buffer_uptodate(bh);

    unlock_buffer(bh);
    put_bh(bh);
}
598 | |||
599 | static void submit_logged_buffer(struct buffer_head *bh) { | ||
600 | get_bh(bh) ; | ||
601 | bh->b_end_io = reiserfs_end_buffer_io_sync ; | ||
602 | clear_buffer_journal_new (bh); | ||
603 | clear_buffer_dirty(bh) ; | ||
604 | if (!test_clear_buffer_journal_test (bh)) | ||
605 | BUG(); | ||
606 | if (!buffer_uptodate(bh)) | ||
607 | BUG(); | ||
608 | submit_bh(WRITE, bh) ; | ||
609 | } | ||
610 | |||
611 | static void submit_ordered_buffer(struct buffer_head *bh) { | ||
612 | get_bh(bh) ; | ||
613 | bh->b_end_io = reiserfs_end_ordered_io; | ||
614 | clear_buffer_dirty(bh) ; | ||
615 | if (!buffer_uptodate(bh)) | ||
616 | BUG(); | ||
617 | submit_bh(WRITE, bh) ; | ||
618 | } | ||
619 | |||
620 | static int submit_barrier_buffer(struct buffer_head *bh) { | ||
621 | get_bh(bh) ; | ||
622 | bh->b_end_io = reiserfs_end_ordered_io; | ||
623 | clear_buffer_dirty(bh) ; | ||
624 | if (!buffer_uptodate(bh)) | ||
625 | BUG(); | ||
626 | return submit_bh(WRITE_BARRIER, bh) ; | ||
627 | } | ||
628 | |||
/*
 * Fallback for a barrier write that came back flagged "operation not
 * supported": clear that flag, turn barriers off for @s, then mark the
 * buffer uptodate and dirty again and write it out with sync_dirty_buffer().
 * Buffers without the flag are left alone.
 */
static void check_barrier_completion(struct super_block *s,
                                     struct buffer_head *bh)
{
    if (!buffer_eopnotsupp(bh))
        return;

    clear_buffer_eopnotsupp(bh);
    disable_barrier(s);

    set_buffer_uptodate(bh);
    set_buffer_dirty(bh);
    sync_dirty_buffer(bh);
}
639 | |||
/* capacity of one struct buffer_chunk */
#define CHUNK_SIZE 32

/*
 * Small batch of buffer heads collected by add_to_chunk() and submitted in
 * one go by write_chunk() / write_ordered_chunk().
 */
struct buffer_chunk {
    struct buffer_head *bh[CHUNK_SIZE]; /* collected buffers */
    int nr;                             /* number of valid entries in bh[] */
};
645 | |||
646 | static void write_chunk(struct buffer_chunk *chunk) { | ||
647 | int i; | ||
648 | for (i = 0; i < chunk->nr ; i++) { | ||
649 | submit_logged_buffer(chunk->bh[i]) ; | ||
650 | } | ||
651 | chunk->nr = 0; | ||
652 | } | ||
653 | |||
654 | static void write_ordered_chunk(struct buffer_chunk *chunk) { | ||
655 | int i; | ||
656 | for (i = 0; i < chunk->nr ; i++) { | ||
657 | submit_ordered_buffer(chunk->bh[i]) ; | ||
658 | } | ||
659 | chunk->nr = 0; | ||
660 | } | ||
661 | |||
662 | static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, | ||
663 | spinlock_t *lock, | ||
664 | void (fn)(struct buffer_chunk *)) | ||
665 | { | ||
666 | int ret = 0; | ||
667 | if (chunk->nr >= CHUNK_SIZE) | ||
668 | BUG(); | ||
669 | chunk->bh[chunk->nr++] = bh; | ||
670 | if (chunk->nr >= CHUNK_SIZE) { | ||
671 | ret = 1; | ||
672 | if (lock) | ||
673 | spin_unlock(lock); | ||
674 | fn(chunk); | ||
675 | if (lock) | ||
676 | spin_lock(lock); | ||
677 | } | ||
678 | return ret; | ||
679 | } | ||
680 | |||
681 | |||
682 | static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0); | ||
683 | static struct reiserfs_jh *alloc_jh(void) { | ||
684 | struct reiserfs_jh *jh; | ||
685 | while(1) { | ||
686 | jh = kmalloc(sizeof(*jh), GFP_NOFS); | ||
687 | if (jh) { | ||
688 | atomic_inc(&nr_reiserfs_jh); | ||
689 | return jh; | ||
690 | } | ||
691 | yield(); | ||
692 | } | ||
693 | } | ||
694 | |||
695 | /* | ||
696 | * we want to free the jh when the buffer has been written | ||
697 | * and waited on | ||
698 | */ | ||
699 | void reiserfs_free_jh(struct buffer_head *bh) { | ||
700 | struct reiserfs_jh *jh; | ||
701 | |||
702 | jh = bh->b_private; | ||
703 | if (jh) { | ||
704 | bh->b_private = NULL; | ||
705 | jh->bh = NULL; | ||
706 | list_del_init(&jh->list); | ||
707 | kfree(jh); | ||
708 | if (atomic_read(&nr_reiserfs_jh) <= 0) | ||
709 | BUG(); | ||
710 | atomic_dec(&nr_reiserfs_jh); | ||
711 | put_bh(bh); | ||
712 | } | ||
713 | } | ||
714 | |||
715 | static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, | ||
716 | int tail) | ||
717 | { | ||
718 | struct reiserfs_jh *jh; | ||
719 | |||
720 | if (bh->b_private) { | ||
721 | spin_lock(&j->j_dirty_buffers_lock); | ||
722 | if (!bh->b_private) { | ||
723 | spin_unlock(&j->j_dirty_buffers_lock); | ||
724 | goto no_jh; | ||
725 | } | ||
726 | jh = bh->b_private; | ||
727 | list_del_init(&jh->list); | ||
728 | } else { | ||
729 | no_jh: | ||
730 | get_bh(bh); | ||
731 | jh = alloc_jh(); | ||
732 | spin_lock(&j->j_dirty_buffers_lock); | ||
733 | /* buffer must be locked for __add_jh, should be able to have | ||
734 | * two adds at the same time | ||
735 | */ | ||
736 | if (bh->b_private) | ||
737 | BUG(); | ||
738 | jh->bh = bh; | ||
739 | bh->b_private = jh; | ||
740 | } | ||
741 | jh->jl = j->j_current_jl; | ||
742 | if (tail) | ||
743 | list_add_tail(&jh->list, &jh->jl->j_tail_bh_list); | ||
744 | else { | ||
745 | list_add_tail(&jh->list, &jh->jl->j_bh_list); | ||
746 | } | ||
747 | spin_unlock(&j->j_dirty_buffers_lock); | ||
748 | return 0; | ||
749 | } | ||
750 | |||
751 | int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) { | ||
752 | return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1); | ||
753 | } | ||
754 | int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) { | ||
755 | return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0); | ||
756 | } | ||
757 | |||
758 | #define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list) | ||
759 | static int write_ordered_buffers(spinlock_t *lock, | ||
760 | struct reiserfs_journal *j, | ||
761 | struct reiserfs_journal_list *jl, | ||
762 | struct list_head *list) | ||
763 | { | ||
764 | struct buffer_head *bh; | ||
765 | struct reiserfs_jh *jh; | ||
766 | int ret = j->j_errno; | ||
767 | struct buffer_chunk chunk; | ||
768 | struct list_head tmp; | ||
769 | INIT_LIST_HEAD(&tmp); | ||
770 | |||
771 | chunk.nr = 0; | ||
772 | spin_lock(lock); | ||
773 | while(!list_empty(list)) { | ||
774 | jh = JH_ENTRY(list->next); | ||
775 | bh = jh->bh; | ||
776 | get_bh(bh); | ||
777 | if (test_set_buffer_locked(bh)) { | ||
778 | if (!buffer_dirty(bh)) { | ||
779 | list_del_init(&jh->list); | ||
780 | list_add(&jh->list, &tmp); | ||
781 | goto loop_next; | ||
782 | } | ||
783 | spin_unlock(lock); | ||
784 | if (chunk.nr) | ||
785 | write_ordered_chunk(&chunk); | ||
786 | wait_on_buffer(bh); | ||
787 | cond_resched(); | ||
788 | spin_lock(lock); | ||
789 | goto loop_next; | ||
790 | } | ||
791 | if (buffer_dirty(bh)) { | ||
792 | list_del_init(&jh->list); | ||
793 | list_add(&jh->list, &tmp); | ||
794 | add_to_chunk(&chunk, bh, lock, write_ordered_chunk); | ||
795 | } else { | ||
796 | reiserfs_free_jh(bh); | ||
797 | unlock_buffer(bh); | ||
798 | } | ||
799 | loop_next: | ||
800 | put_bh(bh); | ||
801 | cond_resched_lock(lock); | ||
802 | } | ||
803 | if (chunk.nr) { | ||
804 | spin_unlock(lock); | ||
805 | write_ordered_chunk(&chunk); | ||
806 | spin_lock(lock); | ||
807 | } | ||
808 | while(!list_empty(&tmp)) { | ||
809 | jh = JH_ENTRY(tmp.prev); | ||
810 | bh = jh->bh; | ||
811 | get_bh(bh); | ||
812 | reiserfs_free_jh(bh); | ||
813 | |||
814 | if (buffer_locked(bh)) { | ||
815 | spin_unlock(lock); | ||
816 | wait_on_buffer(bh); | ||
817 | spin_lock(lock); | ||
818 | } | ||
819 | if (!buffer_uptodate(bh)) { | ||
820 | ret = -EIO; | ||
821 | } | ||
822 | put_bh(bh); | ||
823 | cond_resched_lock(lock); | ||
824 | } | ||
825 | spin_unlock(lock); | ||
826 | return ret; | ||
827 | } | ||
828 | |||
829 | static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) { | ||
830 | struct reiserfs_journal *journal = SB_JOURNAL (s); | ||
831 | struct reiserfs_journal_list *other_jl; | ||
832 | struct reiserfs_journal_list *first_jl; | ||
833 | struct list_head *entry; | ||
834 | unsigned long trans_id = jl->j_trans_id; | ||
835 | unsigned long other_trans_id; | ||
836 | unsigned long first_trans_id; | ||
837 | |||
838 | find_first: | ||
839 | /* | ||
840 | * first we walk backwards to find the oldest uncommitted transation | ||
841 | */ | ||
842 | first_jl = jl; | ||
843 | entry = jl->j_list.prev; | ||
844 | while(1) { | ||
845 | other_jl = JOURNAL_LIST_ENTRY(entry); | ||
846 | if (entry == &journal->j_journal_list || | ||
847 | atomic_read(&other_jl->j_older_commits_done)) | ||
848 | break; | ||
849 | |||
850 | first_jl = other_jl; | ||
851 | entry = other_jl->j_list.prev; | ||
852 | } | ||
853 | |||
854 | /* if we didn't find any older uncommitted transactions, return now */ | ||
855 | if (first_jl == jl) { | ||
856 | return 0; | ||
857 | } | ||
858 | |||
859 | first_trans_id = first_jl->j_trans_id; | ||
860 | |||
861 | entry = &first_jl->j_list; | ||
862 | while(1) { | ||
863 | other_jl = JOURNAL_LIST_ENTRY(entry); | ||
864 | other_trans_id = other_jl->j_trans_id; | ||
865 | |||
866 | if (other_trans_id < trans_id) { | ||
867 | if (atomic_read(&other_jl->j_commit_left) != 0) { | ||
868 | flush_commit_list(s, other_jl, 0); | ||
869 | |||
870 | /* list we were called with is gone, return */ | ||
871 | if (!journal_list_still_alive(s, trans_id)) | ||
872 | return 1; | ||
873 | |||
874 | /* the one we just flushed is gone, this means all | ||
875 | * older lists are also gone, so first_jl is no longer | ||
876 | * valid either. Go back to the beginning. | ||
877 | */ | ||
878 | if (!journal_list_still_alive(s, other_trans_id)) { | ||
879 | goto find_first; | ||
880 | } | ||
881 | } | ||
882 | entry = entry->next; | ||
883 | if (entry == &journal->j_journal_list) | ||
884 | return 0; | ||
885 | } else { | ||
886 | return 0; | ||
887 | } | ||
888 | } | ||
889 | return 0; | ||
890 | } | ||
891 | int reiserfs_async_progress_wait(struct super_block *s) { | ||
892 | DEFINE_WAIT(wait); | ||
893 | struct reiserfs_journal *j = SB_JOURNAL(s); | ||
894 | if (atomic_read(&j->j_async_throttle)) | ||
895 | blk_congestion_wait(WRITE, HZ/10); | ||
896 | return 0; | ||
897 | } | ||
898 | |||
899 | /* | ||
900 | ** if this journal list still has commit blocks unflushed, send them to disk. | ||
901 | ** | ||
902 | ** log areas must be flushed in order (transaction 2 can't commit before transaction 1) | ||
903 | ** Before the commit block can be written, every other log block must be safely on disk | ||
904 | ** | ||
905 | */ | ||
/*
 * s:        super block owning the journal
 * jl:       journal list to commit
 * flushall: when non-zero, flush_older_commits() is run first and
 *           j_older_commits_done is set on jl once the commit is finished
 *
 * Returns 0 when there was nothing to do or the commit succeeded, -EIO if a
 * log or commit buffer was not uptodate after being written, or
 * journal->j_errno if that is already set.  Any non-zero result is also
 * passed to reiserfs_abort() before returning.
 */
906 | static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { | ||
907 | int i; | ||
908 | int bn ; | ||
909 | struct buffer_head *tbh = NULL ; | ||
910 | unsigned long trans_id = jl->j_trans_id; | ||
911 | struct reiserfs_journal *journal = SB_JOURNAL (s); | ||
912 | int barrier = 0; | ||
913 | int retval = 0; | ||
914 | |||
915 | reiserfs_check_lock_depth(s, "flush_commit_list") ; | ||
916 | |||
917 | if (atomic_read(&jl->j_older_commits_done)) { | ||
918 | return 0 ; | ||
919 | } | ||
920 | |||
921 | /* before we can put our commit blocks on disk, we have to make sure everyone older than | ||
922 | ** us is on disk too | ||
923 | */ | ||
924 | BUG_ON (jl->j_len <= 0); | ||
925 | BUG_ON (trans_id == journal->j_trans_id); | ||
926 | |||
/* take a reference on jl; it is dropped again at put_jl */
927 | get_journal_list(jl); | ||
928 | if (flushall) { | ||
929 | if (flush_older_commits(s, jl) == 1) { | ||
930 | /* list disappeared during flush_older_commits. return */ | ||
931 | goto put_jl; | ||
932 | } | ||
933 | } | ||
934 | |||
935 | /* make sure nobody is trying to flush this one at the same time */ | ||
936 | down(&jl->j_commit_lock); | ||
937 | if (!journal_list_still_alive(s, trans_id)) { | ||
938 | up(&jl->j_commit_lock); | ||
939 | goto put_jl; | ||
940 | } | ||
941 | BUG_ON (jl->j_trans_id == 0); | ||
942 | |||
943 | /* this commit is done, exit */ | ||
944 | if (atomic_read(&(jl->j_commit_left)) <= 0) { | ||
945 | if (flushall) { | ||
946 | atomic_set(&(jl->j_older_commits_done), 1) ; | ||
947 | } | ||
948 | up(&jl->j_commit_lock); | ||
949 | goto put_jl; | ||
950 | } | ||
951 | |||
/* the big kernel lock is dropped while the ordered buffers are written */
952 | if (!list_empty(&jl->j_bh_list)) { | ||
953 | unlock_kernel(); | ||
954 | write_ordered_buffers(&journal->j_dirty_buffers_lock, | ||
955 | journal, jl, &jl->j_bh_list); | ||
956 | lock_kernel(); | ||
957 | } | ||
958 | BUG_ON (!list_empty(&jl->j_bh_list)); | ||
959 | /* | ||
960 | * for the description block and all the log blocks, submit any buffers | ||
961 | * that haven't already reached the disk | ||
962 | */ | ||
/* j_async_throttle stays raised for the whole submission loop; it is read by reiserfs_async_progress_wait() */
963 | atomic_inc(&journal->j_async_throttle); | ||
964 | for (i = 0 ; i < (jl->j_len + 1) ; i++) { | ||
965 | bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) % | ||
966 | SB_ONDISK_JOURNAL_SIZE(s); | ||
967 | tbh = journal_find_get_block(s, bn) ; | ||
968 | if (buffer_dirty(tbh)) /* redundant, ll_rw_block() checks */ | ||
969 | ll_rw_block(WRITE, 1, &tbh) ; | ||
970 | put_bh(tbh) ; | ||
971 | } | ||
972 | atomic_dec(&journal->j_async_throttle); | ||
973 | |||
974 | /* wait on everything written so far before writing the commit | ||
975 | * if we are in barrier mode, send the commit down now | ||
976 | */ | ||
977 | barrier = reiserfs_barrier_flush(s); | ||
978 | if (barrier) { | ||
979 | int ret; | ||
980 | lock_buffer(jl->j_commit_bh); | ||
981 | ret = submit_barrier_buffer(jl->j_commit_bh); | ||
/* barrier writes unsupported: turn barriers off and fall back to the plain synchronous commit write below */
982 | if (ret == -EOPNOTSUPP) { | ||
983 | set_buffer_uptodate(jl->j_commit_bh); | ||
984 | disable_barrier(s); | ||
985 | barrier = 0; | ||
986 | } | ||
987 | } | ||
/* second pass over the same blocks: wait for each write and record any failure */
988 | for (i = 0 ; i < (jl->j_len + 1) ; i++) { | ||
989 | bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + | ||
990 | (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ; | ||
991 | tbh = journal_find_get_block(s, bn) ; | ||
992 | wait_on_buffer(tbh) ; | ||
993 | // since we're using ll_rw_blk above, it might have skipped over | ||
994 | // a locked buffer. Double check here | ||
995 | // | ||
996 | if (buffer_dirty(tbh)) /* redundant, sync_dirty_buffer() checks */ | ||
997 | sync_dirty_buffer(tbh); | ||
998 | if (unlikely (!buffer_uptodate(tbh))) { | ||
999 | #ifdef CONFIG_REISERFS_CHECK | ||
1000 | reiserfs_warning(s, "journal-601, buffer write failed") ; | ||
1001 | #endif | ||
1002 | retval = -EIO; | ||
1003 | } | ||
1004 | put_bh(tbh) ; /* once for journal_find_get_block */ | ||
1005 | put_bh(tbh) ; /* once due to original getblk in do_journal_end */ | ||
1006 | atomic_dec(&(jl->j_commit_left)) ; | ||
1007 | } | ||
1008 | |||
/* NOTE(review): presumably the one remaining count stands for the commit block itself -- confirm against do_journal_end */
1009 | BUG_ON (atomic_read(&(jl->j_commit_left)) != 1); | ||
1010 | |||
1011 | if (!barrier) { | ||
1012 | if (buffer_dirty(jl->j_commit_bh)) | ||
1013 | BUG(); | ||
1014 | mark_buffer_dirty(jl->j_commit_bh) ; | ||
1015 | sync_dirty_buffer(jl->j_commit_bh) ; | ||
1016 | } else | ||
1017 | wait_on_buffer(jl->j_commit_bh); | ||
1018 | |||
1019 | check_barrier_completion(s, jl->j_commit_bh); | ||
1020 | |||
1021 | /* If there was a write error in the journal - we can't commit this | ||
1022 | * transaction - it will be invalid and, if successful, will just end | ||
1023 | * up propagating the write error out to the filesystem. */ | ||
1024 | if (unlikely (!buffer_uptodate(jl->j_commit_bh))) { | ||
1025 | #ifdef CONFIG_REISERFS_CHECK | ||
1026 | reiserfs_warning(s, "journal-615: buffer write failed") ; | ||
1027 | #endif | ||
1028 | retval = -EIO; | ||
1029 | } | ||
1030 | bforget(jl->j_commit_bh) ; | ||
/* commits are expected in consecutive trans_id order; only a warning is emitted otherwise */
1031 | if (journal->j_last_commit_id != 0 && | ||
1032 | (jl->j_trans_id - journal->j_last_commit_id) != 1) { | ||
1033 | reiserfs_warning(s, "clm-2200: last commit %lu, current %lu", | ||
1034 | journal->j_last_commit_id, | ||
1035 | jl->j_trans_id); | ||
1036 | } | ||
1037 | journal->j_last_commit_id = jl->j_trans_id; | ||
1038 | |||
1039 | /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */ | ||
1040 | cleanup_freed_for_journal_list(s, jl) ; | ||
1041 | |||
/* a local write error takes precedence; otherwise report an error already recorded in the journal */
1042 | retval = retval ? retval : journal->j_errno; | ||
1043 | |||
1044 | /* mark the metadata dirty */ | ||
1045 | if (!retval) | ||
1046 | dirty_one_transaction(s, jl); | ||
1047 | atomic_dec(&(jl->j_commit_left)) ; | ||
1048 | |||
1049 | if (flushall) { | ||
1050 | atomic_set(&(jl->j_older_commits_done), 1) ; | ||
1051 | } | ||
1052 | up(&jl->j_commit_lock); | ||
1053 | put_jl: | ||
1054 | put_journal_list(s, jl); | ||
1055 | |||
1056 | if (retval) | ||
1057 | reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__); | ||
1058 | return retval; | ||
1059 | } | ||
1060 | |||
1061 | /* | ||
1062 | ** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or | ||
1063 | ** returns NULL if it can't find anything | ||
1064 | */ | ||
1065 | static struct reiserfs_journal_list *find_newer_jl_for_cn(struct reiserfs_journal_cnode *cn) { | ||
1066 | struct super_block *sb = cn->sb; | ||
1067 | b_blocknr_t blocknr = cn->blocknr ; | ||
1068 | |||
1069 | cn = cn->hprev ; | ||
1070 | while(cn) { | ||
1071 | if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) { | ||
1072 | return cn->jlist ; | ||
1073 | } | ||
1074 | cn = cn->hprev ; | ||
1075 | } | ||
1076 | return NULL ; | ||
1077 | } | ||
1078 | |||
1079 | static void remove_journal_hash(struct super_block *, struct reiserfs_journal_cnode **, | ||
1080 | struct reiserfs_journal_list *, unsigned long, int); | ||
1081 | |||
1082 | /* | ||
1083 | ** once all the real blocks have been flushed, it is safe to remove them from the | ||
1084 | ** journal list for this transaction. Aside from freeing the cnode, this also allows the | ||
1085 | ** block to be reallocated for data blocks if it had been deleted. | ||
1086 | */ | ||
1087 | static void remove_all_from_journal_list(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, int debug) { | ||
1088 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
1089 | struct reiserfs_journal_cnode *cn, *last ; | ||
1090 | cn = jl->j_realblock ; | ||
1091 | |||
1092 | /* which is better, to lock once around the whole loop, or | ||
1093 | ** to lock for each call to remove_journal_hash? | ||
1094 | */ | ||
1095 | while(cn) { | ||
1096 | if (cn->blocknr != 0) { | ||
1097 | if (debug) { | ||
1098 | reiserfs_warning (p_s_sb, "block %u, bh is %d, state %ld", cn->blocknr, | ||
1099 | cn->bh ? 1: 0, cn->state) ; | ||
1100 | } | ||
1101 | cn->state = 0 ; | ||
1102 | remove_journal_hash(p_s_sb, journal->j_list_hash_table, jl, cn->blocknr, 1) ; | ||
1103 | } | ||
1104 | last = cn ; | ||
1105 | cn = cn->next ; | ||
1106 | free_cnode(p_s_sb, last) ; | ||
1107 | } | ||
1108 | jl->j_realblock = NULL ; | ||
1109 | } | ||
1110 | |||
1111 | /* | ||
1112 | ** if this timestamp is greater than the timestamp we wrote last to the header block, write it to the header block. | ||
1113 | ** once this is done, I can safely say the log area for this transaction won't ever be replayed, and I can start | ||
1114 | ** releasing blocks in this transaction for reuse as data blocks. | ||
1115 | ** called by flush_journal_list, before it calls remove_all_from_journal_list | ||
1116 | ** | ||
1117 | */ | ||
1118 | static int _update_journal_header_block(struct super_block *p_s_sb, unsigned long offset, unsigned long trans_id) { | ||
1119 | struct reiserfs_journal_header *jh ; | ||
1120 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
1121 | |||
1122 | if (reiserfs_is_journal_aborted (journal)) | ||
1123 | return -EIO; | ||
1124 | |||
1125 | if (trans_id >= journal->j_last_flush_trans_id) { | ||
1126 | if (buffer_locked((journal->j_header_bh))) { | ||
1127 | wait_on_buffer((journal->j_header_bh)) ; | ||
1128 | if (unlikely (!buffer_uptodate(journal->j_header_bh))) { | ||
1129 | #ifdef CONFIG_REISERFS_CHECK | ||
1130 | reiserfs_warning (p_s_sb, "journal-699: buffer write failed") ; | ||
1131 | #endif | ||
1132 | return -EIO; | ||
1133 | } | ||
1134 | } | ||
1135 | journal->j_last_flush_trans_id = trans_id ; | ||
1136 | journal->j_first_unflushed_offset = offset ; | ||
1137 | jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data) ; | ||
1138 | jh->j_last_flush_trans_id = cpu_to_le32(trans_id) ; | ||
1139 | jh->j_first_unflushed_offset = cpu_to_le32(offset) ; | ||
1140 | jh->j_mount_id = cpu_to_le32(journal->j_mount_id) ; | ||
1141 | |||
1142 | if (reiserfs_barrier_flush(p_s_sb)) { | ||
1143 | int ret; | ||
1144 | lock_buffer(journal->j_header_bh); | ||
1145 | ret = submit_barrier_buffer(journal->j_header_bh); | ||
1146 | if (ret == -EOPNOTSUPP) { | ||
1147 | set_buffer_uptodate(journal->j_header_bh); | ||
1148 | disable_barrier(p_s_sb); | ||
1149 | goto sync; | ||
1150 | } | ||
1151 | wait_on_buffer(journal->j_header_bh); | ||
1152 | check_barrier_completion(p_s_sb, journal->j_header_bh); | ||
1153 | } else { | ||
1154 | sync: | ||
1155 | set_buffer_dirty(journal->j_header_bh) ; | ||
1156 | sync_dirty_buffer(journal->j_header_bh) ; | ||
1157 | } | ||
1158 | if (!buffer_uptodate(journal->j_header_bh)) { | ||
1159 | reiserfs_warning (p_s_sb, "journal-837: IO error during journal replay"); | ||
1160 | return -EIO ; | ||
1161 | } | ||
1162 | } | ||
1163 | return 0 ; | ||
1164 | } | ||
1165 | |||
/*
 * Thin wrapper: forwards its arguments unchanged to
 * _update_journal_header_block() and returns that function's result.
 */
static int update_journal_header_block(struct super_block *p_s_sb,
				       unsigned long offset,
				       unsigned long trans_id)
{
	int status;

	status = _update_journal_header_block(p_s_sb, offset, trans_id);
	return status;
}
1171 | /* | ||
1172 | ** flush any and all journal lists older than you are | ||
1173 | ** can only be called from flush_journal_list | ||
1174 | */ | ||
1175 | static int flush_older_journal_lists(struct super_block *p_s_sb, | ||
1176 | struct reiserfs_journal_list *jl) | ||
1177 | { | ||
1178 | struct list_head *entry; | ||
1179 | struct reiserfs_journal_list *other_jl ; | ||
1180 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
1181 | unsigned long trans_id = jl->j_trans_id; | ||
1182 | |||
1183 | /* we know we are the only ones flushing things, no extra race | ||
1184 | * protection is required. | ||
1185 | */ | ||
1186 | restart: | ||
1187 | entry = journal->j_journal_list.next; | ||
1188 | /* Did we wrap? */ | ||
1189 | if (entry == &journal->j_journal_list) | ||
1190 | return 0; | ||
1191 | other_jl = JOURNAL_LIST_ENTRY(entry); | ||
1192 | if (other_jl->j_trans_id < trans_id) { | ||
1193 | BUG_ON (other_jl->j_refcount <= 0); | ||
1194 | /* do not flush all */ | ||
1195 | flush_journal_list(p_s_sb, other_jl, 0) ; | ||
1196 | |||
1197 | /* other_jl is now deleted from the list */ | ||
1198 | goto restart; | ||
1199 | } | ||
1200 | return 0 ; | ||
1201 | } | ||
1202 | |||
1203 | static void del_from_work_list(struct super_block *s, | ||
1204 | struct reiserfs_journal_list *jl) { | ||
1205 | struct reiserfs_journal *journal = SB_JOURNAL (s); | ||
1206 | if (!list_empty(&jl->j_working_list)) { | ||
1207 | list_del_init(&jl->j_working_list); | ||
1208 | journal->j_num_work_lists--; | ||
1209 | } | ||
1210 | } | ||
1211 | |||
1212 | /* flush a journal list, both commit and real blocks | ||
1213 | ** | ||
1214 | ** always set flushall to 1, unless you are calling from inside | ||
1215 | ** flush_journal_list | ||
1216 | ** | ||
1217 | ** IMPORTANT. This can only be called while there are no journal writers, | ||
1218 | ** and the journal is locked. That means it can only be called from | ||
1219 | ** do_journal_end, or by journal_release | ||
1220 | */ | ||
/*
 * s:        super block owning the journal
 * jl:       journal list to flush; on return it has been unlinked from the
 *           journal list, its fields have been reset and one reference has
 *           been dropped with put_journal_list()
 * flushall: non-zero takes j_flush_sem here, flushes all older journal
 *           lists and updates the journal header block; zero requires the
 *           caller to already hold j_flush_sem
 *
 * Returns journal->j_errno as sampled after the flush, or the error from
 * update_journal_header_block().
 */
1221 | static int flush_journal_list(struct super_block *s, | ||
1222 | struct reiserfs_journal_list *jl, int flushall) { | ||
1223 | struct reiserfs_journal_list *pjl ; | ||
1224 | struct reiserfs_journal_cnode *cn, *last ; | ||
1225 | int count ; | ||
1226 | int was_jwait = 0 ; | ||
1227 | int was_dirty = 0 ; | ||
1228 | struct buffer_head *saved_bh ; | ||
1229 | unsigned long j_len_saved = jl->j_len ; | ||
1230 | struct reiserfs_journal *journal = SB_JOURNAL (s); | ||
1231 | int err = 0; | ||
1232 | |||
1233 | BUG_ON (j_len_saved <= 0); | ||
1234 | |||
1235 | if (atomic_read(&journal->j_wcount) != 0) { | ||
1236 | reiserfs_warning(s, "clm-2048: flush_journal_list called with wcount %d", | ||
1237 | atomic_read(&journal->j_wcount)) ; | ||
1238 | } | ||
1239 | BUG_ON (jl->j_trans_id == 0); | ||
1240 | |||
1241 | /* if flushall == 0, the lock is already held */ | ||
/* down_trylock() returning 0 means the semaphore was free, i.e. the caller did not hold it */
1242 | if (flushall) { | ||
1243 | down(&journal->j_flush_sem); | ||
1244 | } else if (!down_trylock(&journal->j_flush_sem)) { | ||
1245 | BUG(); | ||
1246 | } | ||
1247 | |||
1248 | count = 0 ; | ||
1249 | if (j_len_saved > journal->j_trans_max) { | ||
1250 | reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, trans id %lu\n", j_len_saved, jl->j_trans_id); | ||
1251 | return 0 ; | ||
1252 | } | ||
1253 | |||
1254 | /* if all the work is already done, get out of here */ | ||
1255 | if (atomic_read(&(jl->j_nonzerolen)) <= 0 && | ||
1256 | atomic_read(&(jl->j_commit_left)) <= 0) { | ||
1257 | goto flush_older_and_return ; | ||
1258 | } | ||
1259 | |||
1260 | /* start by putting the commit list on disk. This will also flush | ||
1261 | ** the commit lists of any older transactions | ||
1262 | */ | ||
1263 | flush_commit_list(s, jl, 1) ; | ||
1264 | |||
/* LIST_DIRTY is set by dirty_one_transaction(), which flush_commit_list() only calls when the commit succeeded */
1265 | if (!(jl->j_state & LIST_DIRTY) && !reiserfs_is_journal_aborted (journal)) | ||
1266 | BUG(); | ||
1267 | |||
1268 | /* are we done now? */ | ||
1269 | if (atomic_read(&(jl->j_nonzerolen)) <= 0 && | ||
1270 | atomic_read(&(jl->j_commit_left)) <= 0) { | ||
1271 | goto flush_older_and_return ; | ||
1272 | } | ||
1273 | |||
1274 | /* loop through each cnode, see if we need to write it, | ||
1275 | ** or wait on a more recent transaction, or just ignore it | ||
1276 | */ | ||
1277 | if (atomic_read(&(journal->j_wcount)) != 0) { | ||
1278 | reiserfs_panic(s, "journal-844: panic journal list is flushing, wcount is not 0\n") ; | ||
1279 | } | ||
1280 | cn = jl->j_realblock ; | ||
1281 | while(cn) { | ||
1282 | was_jwait = 0 ; | ||
1283 | was_dirty = 0 ; | ||
1284 | saved_bh = NULL ; | ||
1285 | /* blocknr of 0 is no longer in the hash, ignore it */ | ||
1286 | if (cn->blocknr == 0) { | ||
1287 | goto free_cnode ; | ||
1288 | } | ||
1289 | |||
1290 | /* This transaction failed commit. Don't write out to the disk */ | ||
1291 | if (!(jl->j_state & LIST_DIRTY)) | ||
1292 | goto free_cnode; | ||
1293 | |||
1294 | pjl = find_newer_jl_for_cn(cn) ; | ||
1295 | /* the order is important here. We check pjl to make sure we | ||
1296 | ** don't clear BH_JDirty_wait if we aren't the one writing this | ||
1297 | ** block to disk | ||
1298 | */ | ||
1299 | if (!pjl && cn->bh) { | ||
1300 | saved_bh = cn->bh ; | ||
1301 | |||
1302 | /* we do this to make sure nobody releases the buffer while | ||
1303 | ** we are working with it | ||
1304 | */ | ||
1305 | get_bh(saved_bh) ; | ||
1306 | |||
1307 | if (buffer_journal_dirty(saved_bh)) { | ||
1308 | BUG_ON (!can_dirty (cn)); | ||
1309 | was_jwait = 1 ; | ||
1310 | was_dirty = 1 ; | ||
1311 | } else if (can_dirty(cn)) { | ||
1312 | /* everything with !pjl && jwait should be writable */ | ||
1313 | BUG(); | ||
1314 | } | ||
1315 | } | ||
1316 | |||
1317 | /* if someone has this block in a newer transaction, just make | ||
1318 | ** sure they are committed, and don't try writing it to disk | ||
1319 | */ | ||
1320 | if (pjl) { | ||
1321 | if (atomic_read(&pjl->j_commit_left)) | ||
1322 | flush_commit_list(s, pjl, 1) ; | ||
1323 | goto free_cnode ; | ||
1324 | } | ||
1325 | |||
1326 | /* bh == NULL when the block got to disk on its own, OR, | ||
1327 | ** the block got freed in a future transaction | ||
1328 | */ | ||
1329 | if (saved_bh == NULL) { | ||
1330 | goto free_cnode ; | ||
1331 | } | ||
1332 | |||
1333 | /* this should never happen. kupdate_one_transaction has this list | ||
1334 | ** locked while it works, so we should never see a buffer here that | ||
1335 | ** is not marked JDirty_wait | ||
1336 | */ | ||
1337 | if ((!was_jwait) && !buffer_locked(saved_bh)) { | ||
1338 | reiserfs_warning (s, "journal-813: BAD! buffer %llu %cdirty %cjwait, " | ||
1339 | "not in a newer tranasction", | ||
1340 | (unsigned long long)saved_bh->b_blocknr, | ||
1341 | was_dirty ? ' ' : '!', was_jwait ? ' ' : '!') ; | ||
1342 | } | ||
1343 | if (was_dirty) { | ||
1344 | /* we inc again because saved_bh gets decremented at free_cnode */ | ||
1345 | get_bh(saved_bh) ; | ||
/* BLOCK_NEEDS_FLUSH marks this cnode for the wait pass that follows the loop */
1346 | set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ; | ||
1347 | lock_buffer(saved_bh); | ||
1348 | BUG_ON (cn->blocknr != saved_bh->b_blocknr); | ||
1349 | if (buffer_dirty(saved_bh)) | ||
1350 | submit_logged_buffer(saved_bh) ; | ||
1351 | else | ||
1352 | unlock_buffer(saved_bh); | ||
1353 | count++ ; | ||
1354 | } else { | ||
1355 | reiserfs_warning (s, "clm-2082: Unable to flush buffer %llu in %s", | ||
1356 | (unsigned long long)saved_bh->b_blocknr, __FUNCTION__); | ||
1357 | } | ||
1358 | free_cnode: | ||
1359 | last = cn ; | ||
1360 | cn = cn->next ; | ||
1361 | if (saved_bh) { | ||
1362 | /* we incremented this to keep others from taking the buffer head away */ | ||
1363 | put_bh(saved_bh) ; | ||
1364 | if (atomic_read(&(saved_bh->b_count)) < 0) { | ||
1365 | reiserfs_warning (s, "journal-945: saved_bh->b_count < 0"); | ||
1366 | } | ||
1367 | } | ||
1368 | } | ||
/* second pass: wait for every buffer submitted above, note write errors and drop the extra references */
1369 | if (count > 0) { | ||
1370 | cn = jl->j_realblock ; | ||
1371 | while(cn) { | ||
1372 | if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { | ||
1373 | if (!cn->bh) { | ||
1374 | reiserfs_panic(s, "journal-1011: cn->bh is NULL\n") ; | ||
1375 | } | ||
1376 | wait_on_buffer(cn->bh) ; | ||
1377 | if (!cn->bh) { | ||
1378 | reiserfs_panic(s, "journal-1012: cn->bh is NULL\n") ; | ||
1379 | } | ||
1380 | if (unlikely (!buffer_uptodate(cn->bh))) { | ||
1381 | #ifdef CONFIG_REISERFS_CHECK | ||
1382 | reiserfs_warning(s, "journal-949: buffer write failed\n") ; | ||
1383 | #endif | ||
1384 | err = -EIO; | ||
1385 | } | ||
1386 | /* note, we must clear the JDirty_wait bit after the up to date | ||
1387 | ** check, otherwise we race against our flushpage routine | ||
1388 | */ | ||
1389 | BUG_ON (!test_clear_buffer_journal_dirty (cn->bh)); | ||
1390 | |||
1391 | /* undo the inc from journal_mark_dirty */ | ||
1392 | put_bh(cn->bh) ; | ||
1393 | brelse(cn->bh) ; | ||
1394 | } | ||
1395 | cn = cn->next ; | ||
1396 | } | ||
1397 | } | ||
1398 | |||
1399 | if (err) | ||
1400 | reiserfs_abort (s, -EIO, "Write error while pushing transaction to disk in %s", __FUNCTION__); | ||
1401 | flush_older_and_return: | ||
1402 | |||
1403 | |||
1404 | /* before we can update the journal header block, we _must_ flush all | ||
1405 | ** real blocks from all older transactions to disk. This is because | ||
1406 | ** once the header block is updated, this transaction will not be | ||
1407 | ** replayed after a crash | ||
1408 | */ | ||
1409 | if (flushall) { | ||
1410 | flush_older_journal_lists(s, jl); | ||
1411 | } | ||
1412 | |||
/* NOTE(review): this overwrites the local -EIO set in the wait pass; presumably reiserfs_abort() has recorded it in j_errno -- confirm */
1413 | err = journal->j_errno; | ||
1414 | /* before we can remove everything from the hash tables for this | ||
1415 | ** transaction, we must make sure it can never be replayed | ||
1416 | ** | ||
1417 | ** since we are only called from do_journal_end, we know for sure there | ||
1418 | ** are no allocations going on while we are flushing journal lists. So, | ||
1419 | ** we only need to update the journal header block for the last list | ||
1420 | ** being flushed | ||
1421 | */ | ||
1422 | if (!err && flushall) { | ||
1423 | err = update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id) ; | ||
1424 | if (err) | ||
1425 | reiserfs_abort (s, -EIO, "Write error while updating journal header in %s", __FUNCTION__); | ||
1426 | } | ||
1427 | remove_all_from_journal_list(s, jl, 0) ; | ||
1428 | list_del_init(&jl->j_list); | ||
1429 | journal->j_num_lists--; | ||
1430 | del_from_work_list(s, jl); | ||
1431 | |||
/* flushes are expected in consecutive trans_id order; only a warning is emitted otherwise */
1432 | if (journal->j_last_flush_id != 0 && | ||
1433 | (jl->j_trans_id - journal->j_last_flush_id) != 1) { | ||
1434 | reiserfs_warning(s, "clm-2201: last flush %lu, current %lu", | ||
1435 | journal->j_last_flush_id, | ||
1436 | jl->j_trans_id); | ||
1437 | } | ||
1438 | journal->j_last_flush_id = jl->j_trans_id; | ||
1439 | |||
1440 | /* not strictly required since we are freeing the list, but it should | ||
1441 | * help find code using dead lists later on | ||
1442 | */ | ||
1443 | jl->j_len = 0 ; | ||
1444 | atomic_set(&(jl->j_nonzerolen), 0) ; | ||
1445 | jl->j_start = 0 ; | ||
1446 | jl->j_realblock = NULL ; | ||
1447 | jl->j_commit_bh = NULL ; | ||
1448 | jl->j_trans_id = 0 ; | ||
1449 | jl->j_state = 0; | ||
1450 | put_journal_list(s, jl); | ||
1451 | if (flushall) | ||
1452 | up(&journal->j_flush_sem); | ||
1453 | return err ; | ||
1454 | } | ||
1455 | |||
1456 | static int write_one_transaction(struct super_block *s, | ||
1457 | struct reiserfs_journal_list *jl, | ||
1458 | struct buffer_chunk *chunk) | ||
1459 | { | ||
1460 | struct reiserfs_journal_cnode *cn; | ||
1461 | int ret = 0 ; | ||
1462 | |||
1463 | jl->j_state |= LIST_TOUCHED; | ||
1464 | del_from_work_list(s, jl); | ||
1465 | if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) { | ||
1466 | return 0; | ||
1467 | } | ||
1468 | |||
1469 | cn = jl->j_realblock ; | ||
1470 | while(cn) { | ||
1471 | /* if the blocknr == 0, this has been cleared from the hash, | ||
1472 | ** skip it | ||
1473 | */ | ||
1474 | if (cn->blocknr == 0) { | ||
1475 | goto next ; | ||
1476 | } | ||
1477 | if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) { | ||
1478 | struct buffer_head *tmp_bh; | ||
1479 | /* we can race against journal_mark_freed when we try | ||
1480 | * to lock_buffer(cn->bh), so we have to inc the buffer | ||
1481 | * count, and recheck things after locking | ||
1482 | */ | ||
1483 | tmp_bh = cn->bh; | ||
1484 | get_bh(tmp_bh); | ||
1485 | lock_buffer(tmp_bh); | ||
1486 | if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) { | ||
1487 | if (!buffer_journal_dirty(tmp_bh) || | ||
1488 | buffer_journal_prepared(tmp_bh)) | ||
1489 | BUG(); | ||
1490 | add_to_chunk(chunk, tmp_bh, NULL, write_chunk); | ||
1491 | ret++; | ||
1492 | } else { | ||
1493 | /* note, cn->bh might be null now */ | ||
1494 | unlock_buffer(tmp_bh); | ||
1495 | } | ||
1496 | put_bh(tmp_bh); | ||
1497 | } | ||
1498 | next: | ||
1499 | cn = cn->next ; | ||
1500 | cond_resched(); | ||
1501 | } | ||
1502 | return ret ; | ||
1503 | } | ||
1504 | |||
1505 | /* used by flush_commit_list */ | ||
1506 | static int dirty_one_transaction(struct super_block *s, | ||
1507 | struct reiserfs_journal_list *jl) | ||
1508 | { | ||
1509 | struct reiserfs_journal_cnode *cn; | ||
1510 | struct reiserfs_journal_list *pjl; | ||
1511 | int ret = 0 ; | ||
1512 | |||
1513 | jl->j_state |= LIST_DIRTY; | ||
1514 | cn = jl->j_realblock ; | ||
1515 | while(cn) { | ||
1516 | /* look for a more recent transaction that logged this | ||
1517 | ** buffer. Only the most recent transaction with a buffer in | ||
1518 | ** it is allowed to send that buffer to disk | ||
1519 | */ | ||
1520 | pjl = find_newer_jl_for_cn(cn) ; | ||
1521 | if (!pjl && cn->blocknr && cn->bh && buffer_journal_dirty(cn->bh)) | ||
1522 | { | ||
1523 | BUG_ON (!can_dirty(cn)); | ||
1524 | /* if the buffer is prepared, it will either be logged | ||
1525 | * or restored. If restored, we need to make sure | ||
1526 | * it actually gets marked dirty | ||
1527 | */ | ||
1528 | clear_buffer_journal_new (cn->bh); | ||
1529 | if (buffer_journal_prepared (cn->bh)) { | ||
1530 | set_buffer_journal_restore_dirty (cn->bh); | ||
1531 | } else { | ||
1532 | set_buffer_journal_test (cn->bh); | ||
1533 | mark_buffer_dirty(cn->bh); | ||
1534 | } | ||
1535 | } | ||
1536 | cn = cn->next ; | ||
1537 | } | ||
1538 | return ret ; | ||
1539 | } | ||
1540 | |||
1541 | static int kupdate_transactions(struct super_block *s, | ||
1542 | struct reiserfs_journal_list *jl, | ||
1543 | struct reiserfs_journal_list **next_jl, | ||
1544 | unsigned long *next_trans_id, | ||
1545 | int num_blocks, | ||
1546 | int num_trans) { | ||
1547 | int ret = 0; | ||
1548 | int written = 0 ; | ||
1549 | int transactions_flushed = 0; | ||
1550 | unsigned long orig_trans_id = jl->j_trans_id; | ||
1551 | struct buffer_chunk chunk; | ||
1552 | struct list_head *entry; | ||
1553 | struct reiserfs_journal *journal = SB_JOURNAL (s); | ||
1554 | chunk.nr = 0; | ||
1555 | |||
1556 | down(&journal->j_flush_sem); | ||
1557 | if (!journal_list_still_alive(s, orig_trans_id)) { | ||
1558 | goto done; | ||
1559 | } | ||
1560 | |||
1561 | /* we've got j_flush_sem held, nobody is going to delete any | ||
1562 | * of these lists out from underneath us | ||
1563 | */ | ||
1564 | while((num_trans && transactions_flushed < num_trans) || | ||
1565 | (!num_trans && written < num_blocks)) { | ||
1566 | |||
1567 | if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) || | ||
1568 | atomic_read(&jl->j_commit_left) || !(jl->j_state & LIST_DIRTY)) | ||
1569 | { | ||
1570 | del_from_work_list(s, jl); | ||
1571 | break; | ||
1572 | } | ||
1573 | ret = write_one_transaction(s, jl, &chunk); | ||
1574 | |||
1575 | if (ret < 0) | ||
1576 | goto done; | ||
1577 | transactions_flushed++; | ||
1578 | written += ret; | ||
1579 | entry = jl->j_list.next; | ||
1580 | |||
1581 | /* did we wrap? */ | ||
1582 | if (entry == &journal->j_journal_list) { | ||
1583 | break; | ||
1584 | } | ||
1585 | jl = JOURNAL_LIST_ENTRY(entry); | ||
1586 | |||
1587 | /* don't bother with older transactions */ | ||
1588 | if (jl->j_trans_id <= orig_trans_id) | ||
1589 | break; | ||
1590 | } | ||
1591 | if (chunk.nr) { | ||
1592 | write_chunk(&chunk); | ||
1593 | } | ||
1594 | |||
1595 | done: | ||
1596 | up(&journal->j_flush_sem); | ||
1597 | return ret; | ||
1598 | } | ||
1599 | |||
1600 | /* for o_sync and fsync heavy applications, they tend to use | ||
1601 | ** all the journa list slots with tiny transactions. These | ||
1602 | ** trigger lots and lots of calls to update the header block, which | ||
1603 | ** adds seeks and slows things down. | ||
1604 | ** | ||
1605 | ** This function tries to clear out a large chunk of the journal lists | ||
1606 | ** at once, which makes everything faster since only the newest journal | ||
1607 | ** list updates the header block | ||
1608 | */ | ||
1609 | static int flush_used_journal_lists(struct super_block *s, | ||
1610 | struct reiserfs_journal_list *jl) { | ||
1611 | unsigned long len = 0; | ||
1612 | unsigned long cur_len; | ||
1613 | int ret; | ||
1614 | int i; | ||
1615 | int limit = 256; | ||
1616 | struct reiserfs_journal_list *tjl; | ||
1617 | struct reiserfs_journal_list *flush_jl; | ||
1618 | unsigned long trans_id; | ||
1619 | struct reiserfs_journal *journal = SB_JOURNAL (s); | ||
1620 | |||
1621 | flush_jl = tjl = jl; | ||
1622 | |||
1623 | /* in data logging mode, try harder to flush a lot of blocks */ | ||
1624 | if (reiserfs_data_log(s)) | ||
1625 | limit = 1024; | ||
1626 | /* flush for 256 transactions or limit blocks, whichever comes first */ | ||
1627 | for(i = 0 ; i < 256 && len < limit ; i++) { | ||
1628 | if (atomic_read(&tjl->j_commit_left) || | ||
1629 | tjl->j_trans_id < jl->j_trans_id) { | ||
1630 | break; | ||
1631 | } | ||
1632 | cur_len = atomic_read(&tjl->j_nonzerolen); | ||
1633 | if (cur_len > 0) { | ||
1634 | tjl->j_state &= ~LIST_TOUCHED; | ||
1635 | } | ||
1636 | len += cur_len; | ||
1637 | flush_jl = tjl; | ||
1638 | if (tjl->j_list.next == &journal->j_journal_list) | ||
1639 | break; | ||
1640 | tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next); | ||
1641 | } | ||
1642 | /* try to find a group of blocks we can flush across all the | ||
1643 | ** transactions, but only bother if we've actually spanned | ||
1644 | ** across multiple lists | ||
1645 | */ | ||
1646 | if (flush_jl != jl) { | ||
1647 | ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i); | ||
1648 | } | ||
1649 | flush_journal_list(s, flush_jl, 1); | ||
1650 | return 0; | ||
1651 | } | ||
1652 | |||
1653 | /* | ||
1654 | ** removes any nodes in table with name block and dev as bh. | ||
1655 | ** only touchs the hnext and hprev pointers. | ||
1656 | */ | ||
1657 | void remove_journal_hash(struct super_block *sb, | ||
1658 | struct reiserfs_journal_cnode **table, | ||
1659 | struct reiserfs_journal_list *jl, | ||
1660 | unsigned long block, int remove_freed) | ||
1661 | { | ||
1662 | struct reiserfs_journal_cnode *cur ; | ||
1663 | struct reiserfs_journal_cnode **head ; | ||
1664 | |||
1665 | head= &(journal_hash(table, sb, block)) ; | ||
1666 | if (!head) { | ||
1667 | return ; | ||
1668 | } | ||
1669 | cur = *head ; | ||
1670 | while(cur) { | ||
1671 | if (cur->blocknr == block && cur->sb == sb && (jl == NULL || jl == cur->jlist) && | ||
1672 | (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) { | ||
1673 | if (cur->hnext) { | ||
1674 | cur->hnext->hprev = cur->hprev ; | ||
1675 | } | ||
1676 | if (cur->hprev) { | ||
1677 | cur->hprev->hnext = cur->hnext ; | ||
1678 | } else { | ||
1679 | *head = cur->hnext ; | ||
1680 | } | ||
1681 | cur->blocknr = 0 ; | ||
1682 | cur->sb = NULL ; | ||
1683 | cur->state = 0 ; | ||
1684 | if (cur->bh && cur->jlist) /* anybody who clears the cur->bh will also dec the nonzerolen */ | ||
1685 | atomic_dec(&(cur->jlist->j_nonzerolen)) ; | ||
1686 | cur->bh = NULL ; | ||
1687 | cur->jlist = NULL ; | ||
1688 | } | ||
1689 | cur = cur->hnext ; | ||
1690 | } | ||
1691 | } | ||
1692 | |||
1693 | static void free_journal_ram(struct super_block *p_s_sb) { | ||
1694 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); | ||
1695 | reiserfs_kfree(journal->j_current_jl, | ||
1696 | sizeof(struct reiserfs_journal_list), p_s_sb); | ||
1697 | journal->j_num_lists--; | ||
1698 | |||
1699 | vfree(journal->j_cnode_free_orig) ; | ||
1700 | free_list_bitmaps(p_s_sb, journal->j_list_bitmap) ; | ||
1701 | free_bitmap_nodes(p_s_sb) ; /* must be after free_list_bitmaps */ | ||
1702 | if (journal->j_header_bh) { | ||
1703 | brelse(journal->j_header_bh) ; | ||
1704 | } | ||
1705 | /* j_header_bh is on the journal dev, make sure not to release the journal | ||
1706 | * dev until we brelse j_header_bh | ||
1707 | */ | ||
1708 | release_journal_dev(p_s_sb, journal); | ||
1709 | vfree(journal) ; | ||
1710 | } | ||
1711 | |||
1712 | /* | ||
1713 | ** call on unmount. Only set error to 1 if you haven't made your way out | ||
1714 | ** of read_super() yet. Any other caller must keep error at 0. | ||
1715 | */ | ||
1716 | static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, int error) { | ||
1717 | struct reiserfs_transaction_handle myth ; | ||
1718 | int flushed = 0; | ||
1719 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); | ||
1720 | |||
1721 | /* we only want to flush out transactions if we were called with error == 0 | ||
1722 | */ | ||
1723 | if (!error && !(p_s_sb->s_flags & MS_RDONLY)) { | ||
1724 | /* end the current trans */ | ||
1725 | BUG_ON (!th->t_trans_id); | ||
1726 | do_journal_end(th, p_s_sb,10, FLUSH_ALL) ; | ||
1727 | |||
1728 | /* make sure something gets logged to force our way into the flush code */ | ||
1729 | if (!journal_join(&myth, p_s_sb, 1)) { | ||
1730 | reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; | ||
1731 | journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; | ||
1732 | do_journal_end(&myth, p_s_sb,1, FLUSH_ALL) ; | ||
1733 | flushed = 1; | ||
1734 | } | ||
1735 | } | ||
1736 | |||
1737 | /* this also catches errors during the do_journal_end above */ | ||
1738 | if (!error && reiserfs_is_journal_aborted(journal)) { | ||
1739 | memset(&myth, 0, sizeof(myth)); | ||
1740 | if (!journal_join_abort(&myth, p_s_sb, 1)) { | ||
1741 | reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; | ||
1742 | journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; | ||
1743 | do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL) ; | ||
1744 | } | ||
1745 | } | ||
1746 | |||
1747 | reiserfs_mounted_fs_count-- ; | ||
1748 | /* wait for all commits to finish */ | ||
1749 | cancel_delayed_work(&SB_JOURNAL(p_s_sb)->j_work); | ||
1750 | flush_workqueue(commit_wq); | ||
1751 | if (!reiserfs_mounted_fs_count) { | ||
1752 | destroy_workqueue(commit_wq); | ||
1753 | commit_wq = NULL; | ||
1754 | } | ||
1755 | |||
1756 | free_journal_ram(p_s_sb) ; | ||
1757 | |||
1758 | return 0 ; | ||
1759 | } | ||
1760 | |||
/*
** call on unmount.  Flushes all journal transactions and releases all
** allocated ram by running do_journal_release() with error == 0.
*/
int journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb)
{
	const int no_error = 0;

	return do_journal_release(th, p_s_sb, no_error);
}
/*
** only call from an error condition inside reiserfs_read_super!
** Runs do_journal_release() with error == 1, so nothing is flushed.
*/
int journal_release_error(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb)
{
	const int in_error = 1;

	return do_journal_release(th, p_s_sb, in_error);
}
1773 | |||
1774 | /* compares description block with commit block. returns 1 if they differ, 0 if they are the same */ | ||
1775 | static int journal_compare_desc_commit(struct super_block *p_s_sb, struct reiserfs_journal_desc *desc, | ||
1776 | struct reiserfs_journal_commit *commit) { | ||
1777 | if (get_commit_trans_id (commit) != get_desc_trans_id (desc) || | ||
1778 | get_commit_trans_len (commit) != get_desc_trans_len (desc) || | ||
1779 | get_commit_trans_len (commit) > SB_JOURNAL(p_s_sb)->j_trans_max || | ||
1780 | get_commit_trans_len (commit) <= 0 | ||
1781 | ) { | ||
1782 | return 1 ; | ||
1783 | } | ||
1784 | return 0 ; | ||
1785 | } | ||
1786 | /* returns 0 if it did not find a description block | ||
1787 | ** returns -1 if it found a corrupt commit block | ||
1788 | ** returns 1 if both desc and commit were valid | ||
1789 | */ | ||
1790 | static int journal_transaction_is_valid(struct super_block *p_s_sb, struct buffer_head *d_bh, unsigned long *oldest_invalid_trans_id, unsigned long *newest_mount_id) { | ||
1791 | struct reiserfs_journal_desc *desc ; | ||
1792 | struct reiserfs_journal_commit *commit ; | ||
1793 | struct buffer_head *c_bh ; | ||
1794 | unsigned long offset ; | ||
1795 | |||
1796 | if (!d_bh) | ||
1797 | return 0 ; | ||
1798 | |||
1799 | desc = (struct reiserfs_journal_desc *)d_bh->b_data ; | ||
1800 | if (get_desc_trans_len(desc) > 0 && !memcmp(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8)) { | ||
1801 | if (oldest_invalid_trans_id && *oldest_invalid_trans_id && get_desc_trans_id(desc) > *oldest_invalid_trans_id) { | ||
1802 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-986: transaction " | ||
1803 | "is valid returning because trans_id %d is greater than " | ||
1804 | "oldest_invalid %lu", get_desc_trans_id(desc), | ||
1805 | *oldest_invalid_trans_id); | ||
1806 | return 0 ; | ||
1807 | } | ||
1808 | if (newest_mount_id && *newest_mount_id > get_desc_mount_id (desc)) { | ||
1809 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1087: transaction " | ||
1810 | "is valid returning because mount_id %d is less than " | ||
1811 | "newest_mount_id %lu", get_desc_mount_id (desc), | ||
1812 | *newest_mount_id) ; | ||
1813 | return -1 ; | ||
1814 | } | ||
1815 | if ( get_desc_trans_len(desc) > SB_JOURNAL(p_s_sb)->j_trans_max ) { | ||
1816 | reiserfs_warning(p_s_sb, "journal-2018: Bad transaction length %d encountered, ignoring transaction", get_desc_trans_len(desc)); | ||
1817 | return -1 ; | ||
1818 | } | ||
1819 | offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ; | ||
1820 | |||
1821 | /* ok, we have a journal description block, lets see if the transaction was valid */ | ||
1822 | c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + | ||
1823 | ((offset + get_desc_trans_len(desc) + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; | ||
1824 | if (!c_bh) | ||
1825 | return 0 ; | ||
1826 | commit = (struct reiserfs_journal_commit *)c_bh->b_data ; | ||
1827 | if (journal_compare_desc_commit(p_s_sb, desc, commit)) { | ||
1828 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, | ||
1829 | "journal_transaction_is_valid, commit offset %ld had bad " | ||
1830 | "time %d or length %d", | ||
1831 | c_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), | ||
1832 | get_commit_trans_id (commit), | ||
1833 | get_commit_trans_len(commit)); | ||
1834 | brelse(c_bh) ; | ||
1835 | if (oldest_invalid_trans_id) { | ||
1836 | *oldest_invalid_trans_id = get_desc_trans_id(desc) ; | ||
1837 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1004: " | ||
1838 | "transaction_is_valid setting oldest invalid trans_id " | ||
1839 | "to %d", get_desc_trans_id(desc)) ; | ||
1840 | } | ||
1841 | return -1; | ||
1842 | } | ||
1843 | brelse(c_bh) ; | ||
1844 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1006: found valid " | ||
1845 | "transaction start offset %llu, len %d id %d", | ||
1846 | d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), | ||
1847 | get_desc_trans_len(desc), get_desc_trans_id(desc)) ; | ||
1848 | return 1 ; | ||
1849 | } else { | ||
1850 | return 0 ; | ||
1851 | } | ||
1852 | } | ||
1853 | |||
/* brelse() the first num entries of heads, in index order */
static void brelse_array(struct buffer_head **heads, int num)
{
	int idx = 0;

	while (idx < num) {
		brelse(heads[idx]);
		idx++;
	}
}
1860 | |||
1861 | /* | ||
1862 | ** given the start, and values for the oldest acceptable transactions, | ||
1863 | ** this either reads in a replays a transaction, or returns because the transaction | ||
1864 | ** is invalid, or too old. | ||
1865 | */ | ||
1866 | static int journal_read_transaction(struct super_block *p_s_sb, unsigned long cur_dblock, unsigned long oldest_start, | ||
1867 | unsigned long oldest_trans_id, unsigned long newest_mount_id) { | ||
1868 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
1869 | struct reiserfs_journal_desc *desc ; | ||
1870 | struct reiserfs_journal_commit *commit ; | ||
1871 | unsigned long trans_id = 0 ; | ||
1872 | struct buffer_head *c_bh ; | ||
1873 | struct buffer_head *d_bh ; | ||
1874 | struct buffer_head **log_blocks = NULL ; | ||
1875 | struct buffer_head **real_blocks = NULL ; | ||
1876 | unsigned long trans_offset ; | ||
1877 | int i; | ||
1878 | int trans_half; | ||
1879 | |||
1880 | d_bh = journal_bread(p_s_sb, cur_dblock) ; | ||
1881 | if (!d_bh) | ||
1882 | return 1 ; | ||
1883 | desc = (struct reiserfs_journal_desc *)d_bh->b_data ; | ||
1884 | trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ; | ||
1885 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: " | ||
1886 | "journal_read_transaction, offset %llu, len %d mount_id %d", | ||
1887 | d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), | ||
1888 | get_desc_trans_len(desc), get_desc_mount_id(desc)) ; | ||
1889 | if (get_desc_trans_id(desc) < oldest_trans_id) { | ||
1890 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1039: " | ||
1891 | "journal_read_trans skipping because %lu is too old", | ||
1892 | cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)) ; | ||
1893 | brelse(d_bh) ; | ||
1894 | return 1 ; | ||
1895 | } | ||
1896 | if (get_desc_mount_id(desc) != newest_mount_id) { | ||
1897 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1146: " | ||
1898 | "journal_read_trans skipping because %d is != " | ||
1899 | "newest_mount_id %lu", get_desc_mount_id(desc), | ||
1900 | newest_mount_id) ; | ||
1901 | brelse(d_bh) ; | ||
1902 | return 1 ; | ||
1903 | } | ||
1904 | c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + | ||
1905 | ((trans_offset + get_desc_trans_len(desc) + 1) % | ||
1906 | SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; | ||
1907 | if (!c_bh) { | ||
1908 | brelse(d_bh) ; | ||
1909 | return 1 ; | ||
1910 | } | ||
1911 | commit = (struct reiserfs_journal_commit *)c_bh->b_data ; | ||
1912 | if (journal_compare_desc_commit(p_s_sb, desc, commit)) { | ||
1913 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal_read_transaction, " | ||
1914 | "commit offset %llu had bad time %d or length %d", | ||
1915 | c_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), | ||
1916 | get_commit_trans_id(commit), get_commit_trans_len(commit)); | ||
1917 | brelse(c_bh) ; | ||
1918 | brelse(d_bh) ; | ||
1919 | return 1; | ||
1920 | } | ||
1921 | trans_id = get_desc_trans_id(desc) ; | ||
1922 | /* now we know we've got a good transaction, and it was inside the valid time ranges */ | ||
1923 | log_blocks = reiserfs_kmalloc(get_desc_trans_len(desc) * sizeof(struct buffer_head *), GFP_NOFS, p_s_sb) ; | ||
1924 | real_blocks = reiserfs_kmalloc(get_desc_trans_len(desc) * sizeof(struct buffer_head *), GFP_NOFS, p_s_sb) ; | ||
1925 | if (!log_blocks || !real_blocks) { | ||
1926 | brelse(c_bh) ; | ||
1927 | brelse(d_bh) ; | ||
1928 | reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; | ||
1929 | reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; | ||
1930 | reiserfs_warning(p_s_sb, "journal-1169: kmalloc failed, unable to mount FS") ; | ||
1931 | return -1 ; | ||
1932 | } | ||
1933 | /* get all the buffer heads */ | ||
1934 | trans_half = journal_trans_half (p_s_sb->s_blocksize) ; | ||
1935 | for(i = 0 ; i < get_desc_trans_len(desc) ; i++) { | ||
1936 | log_blocks[i] = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + (trans_offset + 1 + i) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)); | ||
1937 | if (i < trans_half) { | ||
1938 | real_blocks[i] = sb_getblk(p_s_sb, le32_to_cpu(desc->j_realblock[i])) ; | ||
1939 | } else { | ||
1940 | real_blocks[i] = sb_getblk(p_s_sb, le32_to_cpu(commit->j_realblock[i - trans_half])) ; | ||
1941 | } | ||
1942 | if ( real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(p_s_sb) ) { | ||
1943 | reiserfs_warning(p_s_sb, "journal-1207: REPLAY FAILURE fsck required! Block to replay is outside of filesystem"); | ||
1944 | goto abort_replay; | ||
1945 | } | ||
1946 | /* make sure we don't try to replay onto log or reserved area */ | ||
1947 | if (is_block_in_log_or_reserved_area(p_s_sb, real_blocks[i]->b_blocknr)) { | ||
1948 | reiserfs_warning(p_s_sb, "journal-1204: REPLAY FAILURE fsck required! Trying to replay onto a log block") ; | ||
1949 | abort_replay: | ||
1950 | brelse_array(log_blocks, i) ; | ||
1951 | brelse_array(real_blocks, i) ; | ||
1952 | brelse(c_bh) ; | ||
1953 | brelse(d_bh) ; | ||
1954 | reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; | ||
1955 | reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; | ||
1956 | return -1 ; | ||
1957 | } | ||
1958 | } | ||
1959 | /* read in the log blocks, memcpy to the corresponding real block */ | ||
1960 | ll_rw_block(READ, get_desc_trans_len(desc), log_blocks) ; | ||
1961 | for (i = 0 ; i < get_desc_trans_len(desc) ; i++) { | ||
1962 | wait_on_buffer(log_blocks[i]) ; | ||
1963 | if (!buffer_uptodate(log_blocks[i])) { | ||
1964 | reiserfs_warning(p_s_sb, "journal-1212: REPLAY FAILURE fsck required! buffer write failed") ; | ||
1965 | brelse_array(log_blocks + i, get_desc_trans_len(desc) - i) ; | ||
1966 | brelse_array(real_blocks, get_desc_trans_len(desc)) ; | ||
1967 | brelse(c_bh) ; | ||
1968 | brelse(d_bh) ; | ||
1969 | reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; | ||
1970 | reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; | ||
1971 | return -1 ; | ||
1972 | } | ||
1973 | memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, real_blocks[i]->b_size) ; | ||
1974 | set_buffer_uptodate(real_blocks[i]) ; | ||
1975 | brelse(log_blocks[i]) ; | ||
1976 | } | ||
1977 | /* flush out the real blocks */ | ||
1978 | for (i = 0 ; i < get_desc_trans_len(desc) ; i++) { | ||
1979 | set_buffer_dirty(real_blocks[i]) ; | ||
1980 | ll_rw_block(WRITE, 1, real_blocks + i) ; | ||
1981 | } | ||
1982 | for (i = 0 ; i < get_desc_trans_len(desc) ; i++) { | ||
1983 | wait_on_buffer(real_blocks[i]) ; | ||
1984 | if (!buffer_uptodate(real_blocks[i])) { | ||
1985 | reiserfs_warning(p_s_sb, "journal-1226: REPLAY FAILURE, fsck required! buffer write failed") ; | ||
1986 | brelse_array(real_blocks + i, get_desc_trans_len(desc) - i) ; | ||
1987 | brelse(c_bh) ; | ||
1988 | brelse(d_bh) ; | ||
1989 | reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; | ||
1990 | reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; | ||
1991 | return -1 ; | ||
1992 | } | ||
1993 | brelse(real_blocks[i]) ; | ||
1994 | } | ||
1995 | cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + ((trans_offset + get_desc_trans_len(desc) + 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)) ; | ||
1996 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1095: setting journal " | ||
1997 | "start to offset %ld", | ||
1998 | cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)) ; | ||
1999 | |||
2000 | /* init starting values for the first transaction, in case this is the last transaction to be replayed. */ | ||
2001 | journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ; | ||
2002 | journal->j_last_flush_trans_id = trans_id ; | ||
2003 | journal->j_trans_id = trans_id + 1; | ||
2004 | brelse(c_bh) ; | ||
2005 | brelse(d_bh) ; | ||
2006 | reiserfs_kfree(log_blocks, le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), p_s_sb) ; | ||
2007 | reiserfs_kfree(real_blocks, le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), p_s_sb) ; | ||
2008 | return 0 ; | ||
2009 | } | ||
2010 | |||
2011 | /* This function reads blocks starting from block and to max_block of bufsize | ||
2012 | size (but no more than BUFNR blocks at a time). This proved to improve | ||
2013 | mounting speed on self-rebuilding raid5 arrays at least. | ||
2014 | Right now it is only used from journal code. But later we might use it | ||
2015 | from other places. | ||
2016 | Note: Do not use journal_getblk/sb_getblk functions here! */ | ||
2017 | static struct buffer_head * reiserfs_breada (struct block_device *dev, int block, int bufsize, | ||
2018 | unsigned int max_block) | ||
2019 | { | ||
2020 | struct buffer_head * bhlist[BUFNR]; | ||
2021 | unsigned int blocks = BUFNR; | ||
2022 | struct buffer_head * bh; | ||
2023 | int i, j; | ||
2024 | |||
2025 | bh = __getblk (dev, block, bufsize ); | ||
2026 | if (buffer_uptodate (bh)) | ||
2027 | return (bh); | ||
2028 | |||
2029 | if (block + BUFNR > max_block) { | ||
2030 | blocks = max_block - block; | ||
2031 | } | ||
2032 | bhlist[0] = bh; | ||
2033 | j = 1; | ||
2034 | for (i = 1; i < blocks; i++) { | ||
2035 | bh = __getblk (dev, block + i, bufsize); | ||
2036 | if (buffer_uptodate (bh)) { | ||
2037 | brelse (bh); | ||
2038 | break; | ||
2039 | } | ||
2040 | else bhlist[j++] = bh; | ||
2041 | } | ||
2042 | ll_rw_block (READ, j, bhlist); | ||
2043 | for(i = 1; i < j; i++) | ||
2044 | brelse (bhlist[i]); | ||
2045 | bh = bhlist[0]; | ||
2046 | wait_on_buffer (bh); | ||
2047 | if (buffer_uptodate (bh)) | ||
2048 | return bh; | ||
2049 | brelse (bh); | ||
2050 | return NULL; | ||
2051 | } | ||
2052 | |||
2053 | /* | ||
2054 | ** read and replay the log | ||
2055 | ** on a clean unmount, the journal header's next unflushed pointer will be to an invalid | ||
2056 | ** transaction. This tests that before finding all the transactions in the log, which makes normal mount times fast. | ||
2057 | ** | ||
2058 | ** After a crash, this starts with the next unflushed transaction, and replays until it finds one too old, or invalid. | ||
2059 | ** | ||
2060 | ** On exit, it sets things up so the first transaction will work correctly. | ||
2061 | */ | ||
2062 | static int journal_read(struct super_block *p_s_sb) { | ||
2063 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
2064 | struct reiserfs_journal_desc *desc ; | ||
2065 | unsigned long oldest_trans_id = 0; | ||
2066 | unsigned long oldest_invalid_trans_id = 0 ; | ||
2067 | time_t start ; | ||
2068 | unsigned long oldest_start = 0; | ||
2069 | unsigned long cur_dblock = 0 ; | ||
2070 | unsigned long newest_mount_id = 9 ; | ||
2071 | struct buffer_head *d_bh ; | ||
2072 | struct reiserfs_journal_header *jh ; | ||
2073 | int valid_journal_header = 0 ; | ||
2074 | int replay_count = 0 ; | ||
2075 | int continue_replay = 1 ; | ||
2076 | int ret ; | ||
2077 | char b[BDEVNAME_SIZE]; | ||
2078 | |||
2079 | cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ; | ||
2080 | reiserfs_info (p_s_sb, "checking transaction log (%s)\n", | ||
2081 | bdevname(journal->j_dev_bd, b)); | ||
2082 | start = get_seconds(); | ||
2083 | |||
2084 | /* step 1, read in the journal header block. Check the transaction it says | ||
2085 | ** is the first unflushed, and if that transaction is not valid, | ||
2086 | ** replay is done | ||
2087 | */ | ||
2088 | journal->j_header_bh = journal_bread(p_s_sb, | ||
2089 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + | ||
2090 | SB_ONDISK_JOURNAL_SIZE(p_s_sb)); | ||
2091 | if (!journal->j_header_bh) { | ||
2092 | return 1 ; | ||
2093 | } | ||
2094 | jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data) ; | ||
2095 | if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 && | ||
2096 | le32_to_cpu(jh->j_first_unflushed_offset) < SB_ONDISK_JOURNAL_SIZE(p_s_sb) && | ||
2097 | le32_to_cpu(jh->j_last_flush_trans_id) > 0) { | ||
2098 | oldest_start = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + | ||
2099 | le32_to_cpu(jh->j_first_unflushed_offset) ; | ||
2100 | oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; | ||
2101 | newest_mount_id = le32_to_cpu(jh->j_mount_id); | ||
2102 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1153: found in " | ||
2103 | "header: first_unflushed_offset %d, last_flushed_trans_id " | ||
2104 | "%lu", le32_to_cpu(jh->j_first_unflushed_offset), | ||
2105 | le32_to_cpu(jh->j_last_flush_trans_id)) ; | ||
2106 | valid_journal_header = 1 ; | ||
2107 | |||
2108 | /* now, we try to read the first unflushed offset. If it is not valid, | ||
2109 | ** there is nothing more we can do, and it makes no sense to read | ||
2110 | ** through the whole log. | ||
2111 | */ | ||
2112 | d_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + le32_to_cpu(jh->j_first_unflushed_offset)) ; | ||
2113 | ret = journal_transaction_is_valid(p_s_sb, d_bh, NULL, NULL) ; | ||
2114 | if (!ret) { | ||
2115 | continue_replay = 0 ; | ||
2116 | } | ||
2117 | brelse(d_bh) ; | ||
2118 | goto start_log_replay; | ||
2119 | } | ||
2120 | |||
2121 | if (continue_replay && bdev_read_only(p_s_sb->s_bdev)) { | ||
2122 | reiserfs_warning (p_s_sb, | ||
2123 | "clm-2076: device is readonly, unable to replay log") ; | ||
2124 | return -1 ; | ||
2125 | } | ||
2126 | |||
2127 | /* ok, there are transactions that need to be replayed. start with the first log block, find | ||
2128 | ** all the valid transactions, and pick out the oldest. | ||
2129 | */ | ||
2130 | while(continue_replay && cur_dblock < (SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb))) { | ||
2131 | /* Note that it is required for blocksize of primary fs device and journal | ||
2132 | device to be the same */ | ||
2133 | d_bh = reiserfs_breada(journal->j_dev_bd, cur_dblock, p_s_sb->s_blocksize, | ||
2134 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb)) ; | ||
2135 | ret = journal_transaction_is_valid(p_s_sb, d_bh, &oldest_invalid_trans_id, &newest_mount_id) ; | ||
2136 | if (ret == 1) { | ||
2137 | desc = (struct reiserfs_journal_desc *)d_bh->b_data ; | ||
2138 | if (oldest_start == 0) { /* init all oldest_ values */ | ||
2139 | oldest_trans_id = get_desc_trans_id(desc) ; | ||
2140 | oldest_start = d_bh->b_blocknr ; | ||
2141 | newest_mount_id = get_desc_mount_id(desc) ; | ||
2142 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1179: Setting " | ||
2143 | "oldest_start to offset %llu, trans_id %lu", | ||
2144 | oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), | ||
2145 | oldest_trans_id) ; | ||
2146 | } else if (oldest_trans_id > get_desc_trans_id(desc)) { | ||
2147 | /* one we just read was older */ | ||
2148 | oldest_trans_id = get_desc_trans_id(desc) ; | ||
2149 | oldest_start = d_bh->b_blocknr ; | ||
2150 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1180: Resetting " | ||
2151 | "oldest_start to offset %lu, trans_id %lu", | ||
2152 | oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), | ||
2153 | oldest_trans_id) ; | ||
2154 | } | ||
2155 | if (newest_mount_id < get_desc_mount_id(desc)) { | ||
2156 | newest_mount_id = get_desc_mount_id(desc) ; | ||
2157 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " | ||
2158 | "newest_mount_id to %d", get_desc_mount_id(desc)); | ||
2159 | } | ||
2160 | cur_dblock += get_desc_trans_len(desc) + 2 ; | ||
2161 | } else { | ||
2162 | cur_dblock++ ; | ||
2163 | } | ||
2164 | brelse(d_bh) ; | ||
2165 | } | ||
2166 | |||
2167 | start_log_replay: | ||
2168 | cur_dblock = oldest_start ; | ||
2169 | if (oldest_trans_id) { | ||
2170 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1206: Starting replay " | ||
2171 | "from offset %llu, trans_id %lu", | ||
2172 | cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), | ||
2173 | oldest_trans_id) ; | ||
2174 | |||
2175 | } | ||
2176 | replay_count = 0 ; | ||
2177 | while(continue_replay && oldest_trans_id > 0) { | ||
2178 | ret = journal_read_transaction(p_s_sb, cur_dblock, oldest_start, oldest_trans_id, newest_mount_id) ; | ||
2179 | if (ret < 0) { | ||
2180 | return ret ; | ||
2181 | } else if (ret != 0) { | ||
2182 | break ; | ||
2183 | } | ||
2184 | cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start ; | ||
2185 | replay_count++ ; | ||
2186 | if (cur_dblock == oldest_start) | ||
2187 | break; | ||
2188 | } | ||
2189 | |||
2190 | if (oldest_trans_id == 0) { | ||
2191 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1225: No valid " | ||
2192 | "transactions found") ; | ||
2193 | } | ||
2194 | /* j_start does not get set correctly if we don't replay any transactions. | ||
2195 | ** if we had a valid journal_header, set j_start to the first unflushed transaction value, | ||
2196 | ** copy the trans_id from the header | ||
2197 | */ | ||
2198 | if (valid_journal_header && replay_count == 0) { | ||
2199 | journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset) ; | ||
2200 | journal->j_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; | ||
2201 | journal->j_last_flush_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) ; | ||
2202 | journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1; | ||
2203 | } else { | ||
2204 | journal->j_mount_id = newest_mount_id + 1 ; | ||
2205 | } | ||
2206 | reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " | ||
2207 | "newest_mount_id to %lu", journal->j_mount_id) ; | ||
2208 | journal->j_first_unflushed_offset = journal->j_start ; | ||
2209 | if (replay_count > 0) { | ||
2210 | reiserfs_info (p_s_sb, "replayed %d transactions in %lu seconds\n", | ||
2211 | replay_count, get_seconds() - start) ; | ||
2212 | } | ||
2213 | if (!bdev_read_only(p_s_sb->s_bdev) && | ||
2214 | _update_journal_header_block(p_s_sb, journal->j_start, | ||
2215 | journal->j_last_flush_trans_id)) | ||
2216 | { | ||
2217 | /* replay failed, caller must call free_journal_ram and abort | ||
2218 | ** the mount | ||
2219 | */ | ||
2220 | return -1 ; | ||
2221 | } | ||
2222 | return 0 ; | ||
2223 | } | ||
2224 | |||
2225 | static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) | ||
2226 | { | ||
2227 | struct reiserfs_journal_list *jl; | ||
2228 | retry: | ||
2229 | jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, s); | ||
2230 | if (!jl) { | ||
2231 | yield(); | ||
2232 | goto retry; | ||
2233 | } | ||
2234 | memset(jl, 0, sizeof(*jl)); | ||
2235 | INIT_LIST_HEAD(&jl->j_list); | ||
2236 | INIT_LIST_HEAD(&jl->j_working_list); | ||
2237 | INIT_LIST_HEAD(&jl->j_tail_bh_list); | ||
2238 | INIT_LIST_HEAD(&jl->j_bh_list); | ||
2239 | sema_init(&jl->j_commit_lock, 1); | ||
2240 | SB_JOURNAL(s)->j_num_lists++; | ||
2241 | get_journal_list(jl); | ||
2242 | return jl; | ||
2243 | } | ||
2244 | |||
2245 | static void journal_list_init(struct super_block *p_s_sb) { | ||
2246 | SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb); | ||
2247 | } | ||
2248 | |||
2249 | static int release_journal_dev( struct super_block *super, | ||
2250 | struct reiserfs_journal *journal ) | ||
2251 | { | ||
2252 | int result; | ||
2253 | |||
2254 | result = 0; | ||
2255 | |||
2256 | if( journal -> j_dev_file != NULL ) { | ||
2257 | result = filp_close( journal -> j_dev_file, NULL ); | ||
2258 | journal -> j_dev_file = NULL; | ||
2259 | journal -> j_dev_bd = NULL; | ||
2260 | } else if( journal -> j_dev_bd != NULL ) { | ||
2261 | result = blkdev_put( journal -> j_dev_bd ); | ||
2262 | journal -> j_dev_bd = NULL; | ||
2263 | } | ||
2264 | |||
2265 | if( result != 0 ) { | ||
2266 | reiserfs_warning(super, "sh-457: release_journal_dev: Cannot release journal device: %i", result ); | ||
2267 | } | ||
2268 | return result; | ||
2269 | } | ||
2270 | |||
2271 | static int journal_init_dev( struct super_block *super, | ||
2272 | struct reiserfs_journal *journal, | ||
2273 | const char *jdev_name ) | ||
2274 | { | ||
2275 | int result; | ||
2276 | dev_t jdev; | ||
2277 | int blkdev_mode = FMODE_READ | FMODE_WRITE; | ||
2278 | char b[BDEVNAME_SIZE]; | ||
2279 | |||
2280 | result = 0; | ||
2281 | |||
2282 | journal -> j_dev_bd = NULL; | ||
2283 | journal -> j_dev_file = NULL; | ||
2284 | jdev = SB_ONDISK_JOURNAL_DEVICE( super ) ? | ||
2285 | new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; | ||
2286 | |||
2287 | if (bdev_read_only(super->s_bdev)) | ||
2288 | blkdev_mode = FMODE_READ; | ||
2289 | |||
2290 | /* there is no "jdev" option and journal is on separate device */ | ||
2291 | if( ( !jdev_name || !jdev_name[ 0 ] ) ) { | ||
2292 | journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode); | ||
2293 | if (IS_ERR(journal->j_dev_bd)) { | ||
2294 | result = PTR_ERR(journal->j_dev_bd); | ||
2295 | journal->j_dev_bd = NULL; | ||
2296 | reiserfs_warning (super, "sh-458: journal_init_dev: " | ||
2297 | "cannot init journal device '%s': %i", | ||
2298 | __bdevname(jdev, b), result ); | ||
2299 | return result; | ||
2300 | } else if (jdev != super->s_dev) | ||
2301 | set_blocksize(journal->j_dev_bd, super->s_blocksize); | ||
2302 | return 0; | ||
2303 | } | ||
2304 | |||
2305 | journal -> j_dev_file = filp_open( jdev_name, 0, 0 ); | ||
2306 | if( !IS_ERR( journal -> j_dev_file ) ) { | ||
2307 | struct inode *jdev_inode = journal->j_dev_file->f_mapping->host; | ||
2308 | if( !S_ISBLK( jdev_inode -> i_mode ) ) { | ||
2309 | reiserfs_warning (super, "journal_init_dev: '%s' is " | ||
2310 | "not a block device", jdev_name ); | ||
2311 | result = -ENOTBLK; | ||
2312 | } else { | ||
2313 | /* ok */ | ||
2314 | journal->j_dev_bd = I_BDEV(jdev_inode); | ||
2315 | set_blocksize(journal->j_dev_bd, super->s_blocksize); | ||
2316 | } | ||
2317 | } else { | ||
2318 | result = PTR_ERR( journal -> j_dev_file ); | ||
2319 | journal -> j_dev_file = NULL; | ||
2320 | reiserfs_warning (super, | ||
2321 | "journal_init_dev: Cannot open '%s': %i", | ||
2322 | jdev_name, result ); | ||
2323 | } | ||
2324 | if( result != 0 ) { | ||
2325 | release_journal_dev( super, journal ); | ||
2326 | } | ||
2327 | reiserfs_info(super, "journal_init_dev: journal device: %s\n", | ||
2328 | bdevname(journal->j_dev_bd, b)); | ||
2329 | return result; | ||
2330 | } | ||
2331 | |||
2332 | /* | ||
2333 | ** must be called once on fs mount. calls journal_read for you | ||
2334 | */ | ||
2335 | int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_format, unsigned int commit_max_age) { | ||
2336 | int num_cnodes = SB_ONDISK_JOURNAL_SIZE(p_s_sb) * 2 ; | ||
2337 | struct buffer_head *bhjh; | ||
2338 | struct reiserfs_super_block * rs; | ||
2339 | struct reiserfs_journal_header *jh; | ||
2340 | struct reiserfs_journal *journal; | ||
2341 | struct reiserfs_journal_list *jl; | ||
2342 | char b[BDEVNAME_SIZE]; | ||
2343 | |||
2344 | journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ; | ||
2345 | if (!journal) { | ||
2346 | reiserfs_warning (p_s_sb, "journal-1256: unable to get memory for journal structure") ; | ||
2347 | return 1 ; | ||
2348 | } | ||
2349 | memset(journal, 0, sizeof(struct reiserfs_journal)) ; | ||
2350 | INIT_LIST_HEAD(&journal->j_bitmap_nodes) ; | ||
2351 | INIT_LIST_HEAD (&journal->j_prealloc_list); | ||
2352 | INIT_LIST_HEAD(&journal->j_working_list); | ||
2353 | INIT_LIST_HEAD(&journal->j_journal_list); | ||
2354 | journal->j_persistent_trans = 0; | ||
2355 | if (reiserfs_allocate_list_bitmaps(p_s_sb, | ||
2356 | journal->j_list_bitmap, | ||
2357 | SB_BMAP_NR(p_s_sb))) | ||
2358 | goto free_and_return ; | ||
2359 | allocate_bitmap_nodes(p_s_sb) ; | ||
2360 | |||
2361 | /* reserved for journal area support */ | ||
2362 | SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ? | ||
2363 | REISERFS_OLD_DISK_OFFSET_IN_BYTES / p_s_sb->s_blocksize + | ||
2364 | SB_BMAP_NR(p_s_sb) + 1 : | ||
2365 | REISERFS_DISK_OFFSET_IN_BYTES / p_s_sb->s_blocksize + 2); | ||
2366 | |||
2367 | /* Sanity check to see is the standard journal fitting withing first bitmap | ||
2368 | (actual for small blocksizes) */ | ||
2369 | if ( !SB_ONDISK_JOURNAL_DEVICE( p_s_sb ) && | ||
2370 | (SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb) > p_s_sb->s_blocksize * 8) ) { | ||
2371 | reiserfs_warning (p_s_sb, "journal-1393: journal does not fit for area " | ||
2372 | "addressed by first of bitmap blocks. It starts at " | ||
2373 | "%u and its size is %u. Block size %ld", | ||
2374 | SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb), | ||
2375 | SB_ONDISK_JOURNAL_SIZE(p_s_sb), p_s_sb->s_blocksize); | ||
2376 | goto free_and_return; | ||
2377 | } | ||
2378 | |||
2379 | if( journal_init_dev( p_s_sb, journal, j_dev_name ) != 0 ) { | ||
2380 | reiserfs_warning (p_s_sb, "sh-462: unable to initialize jornal device"); | ||
2381 | goto free_and_return; | ||
2382 | } | ||
2383 | |||
2384 | rs = SB_DISK_SUPER_BLOCK(p_s_sb); | ||
2385 | |||
2386 | /* read journal header */ | ||
2387 | bhjh = journal_bread(p_s_sb, | ||
2388 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb)); | ||
2389 | if (!bhjh) { | ||
2390 | reiserfs_warning (p_s_sb, "sh-459: unable to read journal header"); | ||
2391 | goto free_and_return; | ||
2392 | } | ||
2393 | jh = (struct reiserfs_journal_header *)(bhjh->b_data); | ||
2394 | |||
2395 | /* make sure that journal matches to the super block */ | ||
2396 | if (is_reiserfs_jr(rs) && (jh->jh_journal.jp_journal_magic != sb_jp_journal_magic(rs))) { | ||
2397 | reiserfs_warning (p_s_sb, "sh-460: journal header magic %x " | ||
2398 | "(device %s) does not match to magic found in super " | ||
2399 | "block %x", | ||
2400 | jh->jh_journal.jp_journal_magic, | ||
2401 | bdevname( journal->j_dev_bd, b), | ||
2402 | sb_jp_journal_magic(rs)); | ||
2403 | brelse (bhjh); | ||
2404 | goto free_and_return; | ||
2405 | } | ||
2406 | |||
2407 | journal->j_trans_max = le32_to_cpu (jh->jh_journal.jp_journal_trans_max); | ||
2408 | journal->j_max_batch = le32_to_cpu (jh->jh_journal.jp_journal_max_batch); | ||
2409 | journal->j_max_commit_age = le32_to_cpu (jh->jh_journal.jp_journal_max_commit_age); | ||
2410 | journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; | ||
2411 | |||
2412 | if (journal->j_trans_max) { | ||
2413 | /* make sure these parameters are available, assign it if they are not */ | ||
2414 | __u32 initial = journal->j_trans_max; | ||
2415 | __u32 ratio = 1; | ||
2416 | |||
2417 | if (p_s_sb->s_blocksize < 4096) | ||
2418 | ratio = 4096 / p_s_sb->s_blocksize; | ||
2419 | |||
2420 | if (SB_ONDISK_JOURNAL_SIZE(p_s_sb)/journal->j_trans_max < JOURNAL_MIN_RATIO) | ||
2421 | journal->j_trans_max = SB_ONDISK_JOURNAL_SIZE(p_s_sb) / JOURNAL_MIN_RATIO; | ||
2422 | if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio) | ||
2423 | journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT / ratio; | ||
2424 | if (journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio) | ||
2425 | journal->j_trans_max = JOURNAL_TRANS_MIN_DEFAULT / ratio; | ||
2426 | |||
2427 | if (journal->j_trans_max != initial) | ||
2428 | reiserfs_warning (p_s_sb, "sh-461: journal_init: wrong transaction max size (%u). Changed to %u", | ||
2429 | initial, journal->j_trans_max); | ||
2430 | |||
2431 | journal->j_max_batch = journal->j_trans_max* | ||
2432 | JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT; | ||
2433 | } | ||
2434 | |||
2435 | if (!journal->j_trans_max) { | ||
2436 | /*we have the file system was created by old version of mkreiserfs | ||
2437 | so this field contains zero value */ | ||
2438 | journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT ; | ||
2439 | journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT ; | ||
2440 | journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE ; | ||
2441 | |||
2442 | /* for blocksize >= 4096 - max transaction size is 1024. For block size < 4096 | ||
2443 | trans max size is decreased proportionally */ | ||
2444 | if (p_s_sb->s_blocksize < 4096) { | ||
2445 | journal->j_trans_max /= (4096 / p_s_sb->s_blocksize) ; | ||
2446 | journal->j_max_batch = (journal->j_trans_max) * 9 / 10 ; | ||
2447 | } | ||
2448 | } | ||
2449 | |||
2450 | journal->j_default_max_commit_age = journal->j_max_commit_age; | ||
2451 | |||
2452 | if (commit_max_age != 0) { | ||
2453 | journal->j_max_commit_age = commit_max_age; | ||
2454 | journal->j_max_trans_age = commit_max_age; | ||
2455 | } | ||
2456 | |||
2457 | reiserfs_info (p_s_sb, "journal params: device %s, size %u, " | ||
2458 | "journal first block %u, max trans len %u, max batch %u, " | ||
2459 | "max commit age %u, max trans age %u\n", | ||
2460 | bdevname( journal->j_dev_bd, b), | ||
2461 | SB_ONDISK_JOURNAL_SIZE(p_s_sb), | ||
2462 | SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), | ||
2463 | journal->j_trans_max, | ||
2464 | journal->j_max_batch, | ||
2465 | journal->j_max_commit_age, | ||
2466 | journal->j_max_trans_age); | ||
2467 | |||
2468 | brelse (bhjh); | ||
2469 | |||
2470 | journal->j_list_bitmap_index = 0 ; | ||
2471 | journal_list_init(p_s_sb) ; | ||
2472 | |||
2473 | memset(journal->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ; | ||
2474 | |||
2475 | INIT_LIST_HEAD(&journal->j_dirty_buffers) ; | ||
2476 | spin_lock_init(&journal->j_dirty_buffers_lock) ; | ||
2477 | |||
2478 | journal->j_start = 0 ; | ||
2479 | journal->j_len = 0 ; | ||
2480 | journal->j_len_alloc = 0 ; | ||
2481 | atomic_set(&(journal->j_wcount), 0) ; | ||
2482 | atomic_set(&(journal->j_async_throttle), 0) ; | ||
2483 | journal->j_bcount = 0 ; | ||
2484 | journal->j_trans_start_time = 0 ; | ||
2485 | journal->j_last = NULL ; | ||
2486 | journal->j_first = NULL ; | ||
2487 | init_waitqueue_head(&(journal->j_join_wait)) ; | ||
2488 | sema_init(&journal->j_lock, 1); | ||
2489 | sema_init(&journal->j_flush_sem, 1); | ||
2490 | |||
2491 | journal->j_trans_id = 10 ; | ||
2492 | journal->j_mount_id = 10 ; | ||
2493 | journal->j_state = 0 ; | ||
2494 | atomic_set(&(journal->j_jlock), 0) ; | ||
2495 | journal->j_cnode_free_list = allocate_cnodes(num_cnodes) ; | ||
2496 | journal->j_cnode_free_orig = journal->j_cnode_free_list ; | ||
2497 | journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0 ; | ||
2498 | journal->j_cnode_used = 0 ; | ||
2499 | journal->j_must_wait = 0 ; | ||
2500 | |||
2501 | init_journal_hash(p_s_sb) ; | ||
2502 | jl = journal->j_current_jl; | ||
2503 | jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl); | ||
2504 | if (!jl->j_list_bitmap) { | ||
2505 | reiserfs_warning(p_s_sb, "journal-2005, get_list_bitmap failed for journal list 0") ; | ||
2506 | goto free_and_return; | ||
2507 | } | ||
2508 | if (journal_read(p_s_sb) < 0) { | ||
2509 | reiserfs_warning(p_s_sb, "Replay Failure, unable to mount") ; | ||
2510 | goto free_and_return; | ||
2511 | } | ||
2512 | |||
2513 | reiserfs_mounted_fs_count++ ; | ||
2514 | if (reiserfs_mounted_fs_count <= 1) | ||
2515 | commit_wq = create_workqueue("reiserfs"); | ||
2516 | |||
2517 | INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb); | ||
2518 | return 0 ; | ||
2519 | free_and_return: | ||
2520 | free_journal_ram(p_s_sb); | ||
2521 | return 1; | ||
2522 | } | ||
2523 | |||
2524 | /* | ||
2525 | ** test for a polite end of the current transaction. Used by file_write, and should | ||
2526 | ** be used by delete to make sure they don't write more than can fit inside a single | ||
2527 | ** transaction | ||
2528 | */ | ||
2529 | int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) { | ||
2530 | struct reiserfs_journal *journal = SB_JOURNAL (th->t_super); | ||
2531 | time_t now = get_seconds() ; | ||
2532 | /* cannot restart while nested */ | ||
2533 | BUG_ON (!th->t_trans_id); | ||
2534 | if (th->t_refcount > 1) | ||
2535 | return 0 ; | ||
2536 | if ( journal->j_must_wait > 0 || | ||
2537 | (journal->j_len_alloc + new_alloc) >= journal->j_max_batch || | ||
2538 | atomic_read(&(journal->j_jlock)) || | ||
2539 | (now - journal->j_trans_start_time) > journal->j_max_trans_age || | ||
2540 | journal->j_cnode_free < (journal->j_trans_max * 3)) { | ||
2541 | return 1 ; | ||
2542 | } | ||
2543 | return 0 ; | ||
2544 | } | ||
2545 | |||
2546 | /* this must be called inside a transaction, and requires the | ||
2547 | ** kernel_lock to be held | ||
2548 | */ | ||
2549 | void reiserfs_block_writes(struct reiserfs_transaction_handle *th) { | ||
2550 | struct reiserfs_journal *journal = SB_JOURNAL (th->t_super); | ||
2551 | BUG_ON (!th->t_trans_id); | ||
2552 | journal->j_must_wait = 1 ; | ||
2553 | set_bit(J_WRITERS_BLOCKED, &journal->j_state) ; | ||
2554 | return ; | ||
2555 | } | ||
2556 | |||
2557 | /* this must be called without a transaction started, and does not | ||
2558 | ** require BKL | ||
2559 | */ | ||
2560 | void reiserfs_allow_writes(struct super_block *s) { | ||
2561 | struct reiserfs_journal *journal = SB_JOURNAL (s); | ||
2562 | clear_bit(J_WRITERS_BLOCKED, &journal->j_state) ; | ||
2563 | wake_up(&journal->j_join_wait) ; | ||
2564 | } | ||
2565 | |||
2566 | /* this must be called without a transaction started, and does not | ||
2567 | ** require BKL | ||
2568 | */ | ||
2569 | void reiserfs_wait_on_write_block(struct super_block *s) { | ||
2570 | struct reiserfs_journal *journal = SB_JOURNAL (s); | ||
2571 | wait_event(journal->j_join_wait, | ||
2572 | !test_bit(J_WRITERS_BLOCKED, &journal->j_state)) ; | ||
2573 | } | ||
2574 | |||
2575 | static void queue_log_writer(struct super_block *s) { | ||
2576 | wait_queue_t wait; | ||
2577 | struct reiserfs_journal *journal = SB_JOURNAL (s); | ||
2578 | set_bit(J_WRITERS_QUEUED, &journal->j_state); | ||
2579 | |||
2580 | /* | ||
2581 | * we don't want to use wait_event here because | ||
2582 | * we only want to wait once. | ||
2583 | */ | ||
2584 | init_waitqueue_entry(&wait, current); | ||
2585 | add_wait_queue(&journal->j_join_wait, &wait); | ||
2586 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
2587 | if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) | ||
2588 | schedule(); | ||
2589 | current->state = TASK_RUNNING; | ||
2590 | remove_wait_queue(&journal->j_join_wait, &wait); | ||
2591 | } | ||
2592 | |||
2593 | static void wake_queued_writers(struct super_block *s) { | ||
2594 | struct reiserfs_journal *journal = SB_JOURNAL (s); | ||
2595 | if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state)) | ||
2596 | wake_up(&journal->j_join_wait); | ||
2597 | } | ||
2598 | |||
2599 | static void let_transaction_grow(struct super_block *sb, | ||
2600 | unsigned long trans_id) | ||
2601 | { | ||
2602 | struct reiserfs_journal *journal = SB_JOURNAL (sb); | ||
2603 | unsigned long bcount = journal->j_bcount; | ||
2604 | while(1) { | ||
2605 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
2606 | schedule_timeout(1); | ||
2607 | journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; | ||
2608 | while ((atomic_read(&journal->j_wcount) > 0 || | ||
2609 | atomic_read(&journal->j_jlock)) && | ||
2610 | journal->j_trans_id == trans_id) { | ||
2611 | queue_log_writer(sb); | ||
2612 | } | ||
2613 | if (journal->j_trans_id != trans_id) | ||
2614 | break; | ||
2615 | if (bcount == journal->j_bcount) | ||
2616 | break; | ||
2617 | bcount = journal->j_bcount; | ||
2618 | } | ||
2619 | } | ||
2620 | |||
2621 | /* join == true if you must join an existing transaction. | ||
2622 | ** join == false if you can deal with waiting for others to finish | ||
2623 | ** | ||
2624 | ** this will block until the transaction is joinable. send the number of blocks you | ||
2625 | ** expect to use in nblocks. | ||
2626 | */ | ||
2627 | static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) { | ||
2628 | time_t now = get_seconds() ; | ||
2629 | int old_trans_id ; | ||
2630 | struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); | ||
2631 | struct reiserfs_transaction_handle myth; | ||
2632 | int sched_count = 0; | ||
2633 | int retval; | ||
2634 | |||
2635 | reiserfs_check_lock_depth(p_s_sb, "journal_begin") ; | ||
2636 | |||
2637 | PROC_INFO_INC( p_s_sb, journal.journal_being ); | ||
2638 | /* set here for journal_join */ | ||
2639 | th->t_refcount = 1; | ||
2640 | th->t_super = p_s_sb ; | ||
2641 | |||
2642 | relock: | ||
2643 | lock_journal(p_s_sb) ; | ||
2644 | if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted (journal)) { | ||
2645 | unlock_journal (p_s_sb); | ||
2646 | retval = journal->j_errno; | ||
2647 | goto out_fail; | ||
2648 | } | ||
2649 | journal->j_bcount++; | ||
2650 | |||
2651 | if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { | ||
2652 | unlock_journal(p_s_sb) ; | ||
2653 | reiserfs_wait_on_write_block(p_s_sb) ; | ||
2654 | PROC_INFO_INC( p_s_sb, journal.journal_relock_writers ); | ||
2655 | goto relock ; | ||
2656 | } | ||
2657 | now = get_seconds(); | ||
2658 | |||
2659 | /* if there is no room in the journal OR | ||
2660 | ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning | ||
2661 | ** we don't sleep if there aren't other writers | ||
2662 | */ | ||
2663 | |||
2664 | if ( (!join && journal->j_must_wait > 0) || | ||
2665 | ( !join && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch) || | ||
2666 | (!join && atomic_read(&journal->j_wcount) > 0 && journal->j_trans_start_time > 0 && | ||
2667 | (now - journal->j_trans_start_time) > journal->j_max_trans_age) || | ||
2668 | (!join && atomic_read(&journal->j_jlock)) || | ||
2669 | (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) { | ||
2670 | |||
2671 | old_trans_id = journal->j_trans_id; | ||
2672 | unlock_journal(p_s_sb) ; /* allow others to finish this transaction */ | ||
2673 | |||
2674 | if (!join && (journal->j_len_alloc + nblocks + 2) >= | ||
2675 | journal->j_max_batch && | ||
2676 | ((journal->j_len + nblocks + 2) * 100) < (journal->j_len_alloc * 75)) | ||
2677 | { | ||
2678 | if (atomic_read(&journal->j_wcount) > 10) { | ||
2679 | sched_count++; | ||
2680 | queue_log_writer(p_s_sb); | ||
2681 | goto relock; | ||
2682 | } | ||
2683 | } | ||
2684 | /* don't mess with joining the transaction if all we have to do is | ||
2685 | * wait for someone else to do a commit | ||
2686 | */ | ||
2687 | if (atomic_read(&journal->j_jlock)) { | ||
2688 | while (journal->j_trans_id == old_trans_id && | ||
2689 | atomic_read(&journal->j_jlock)) { | ||
2690 | queue_log_writer(p_s_sb); | ||
2691 | } | ||
2692 | goto relock; | ||
2693 | } | ||
2694 | retval = journal_join(&myth, p_s_sb, 1) ; | ||
2695 | if (retval) | ||
2696 | goto out_fail; | ||
2697 | |||
2698 | /* someone might have ended the transaction while we joined */ | ||
2699 | if (old_trans_id != journal->j_trans_id) { | ||
2700 | retval = do_journal_end(&myth, p_s_sb, 1, 0) ; | ||
2701 | } else { | ||
2702 | retval = do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW) ; | ||
2703 | } | ||
2704 | |||
2705 | if (retval) | ||
2706 | goto out_fail; | ||
2707 | |||
2708 | PROC_INFO_INC( p_s_sb, journal.journal_relock_wcount ); | ||
2709 | goto relock ; | ||
2710 | } | ||
2711 | /* we are the first writer, set trans_id */ | ||
2712 | if (journal->j_trans_start_time == 0) { | ||
2713 | journal->j_trans_start_time = get_seconds(); | ||
2714 | } | ||
2715 | atomic_inc(&(journal->j_wcount)) ; | ||
2716 | journal->j_len_alloc += nblocks ; | ||
2717 | th->t_blocks_logged = 0 ; | ||
2718 | th->t_blocks_allocated = nblocks ; | ||
2719 | th->t_trans_id = journal->j_trans_id ; | ||
2720 | unlock_journal(p_s_sb) ; | ||
2721 | INIT_LIST_HEAD (&th->t_list); | ||
2722 | return 0 ; | ||
2723 | |||
2724 | out_fail: | ||
2725 | memset (th, 0, sizeof (*th)); | ||
2726 | /* Re-set th->t_super, so we can properly keep track of how many | ||
2727 | * persistent transactions there are. We need to do this so if this | ||
2728 | * call is part of a failed restart_transaction, we can free it later */ | ||
2729 | th->t_super = p_s_sb; | ||
2730 | return retval; | ||
2731 | } | ||
2732 | |||
2733 | struct reiserfs_transaction_handle * | ||
2734 | reiserfs_persistent_transaction(struct super_block *s, int nblocks) { | ||
2735 | int ret ; | ||
2736 | struct reiserfs_transaction_handle *th ; | ||
2737 | |||
2738 | /* if we're nesting into an existing transaction. It will be | ||
2739 | ** persistent on its own | ||
2740 | */ | ||
2741 | if (reiserfs_transaction_running(s)) { | ||
2742 | th = current->journal_info ; | ||
2743 | th->t_refcount++ ; | ||
2744 | if (th->t_refcount < 2) { | ||
2745 | BUG() ; | ||
2746 | } | ||
2747 | return th ; | ||
2748 | } | ||
2749 | th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS, s) ; | ||
2750 | if (!th) | ||
2751 | return NULL; | ||
2752 | ret = journal_begin(th, s, nblocks) ; | ||
2753 | if (ret) { | ||
2754 | reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ; | ||
2755 | return NULL; | ||
2756 | } | ||
2757 | |||
2758 | SB_JOURNAL(s)->j_persistent_trans++; | ||
2759 | return th ; | ||
2760 | } | ||
2761 | |||
2762 | int | ||
2763 | reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) { | ||
2764 | struct super_block *s = th->t_super; | ||
2765 | int ret = 0; | ||
2766 | if (th->t_trans_id) | ||
2767 | ret = journal_end(th, th->t_super, th->t_blocks_allocated); | ||
2768 | else | ||
2769 | ret = -EIO; | ||
2770 | if (th->t_refcount == 0) { | ||
2771 | SB_JOURNAL(s)->j_persistent_trans--; | ||
2772 | reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ; | ||
2773 | } | ||
2774 | return ret; | ||
2775 | } | ||
2776 | |||
2777 | static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { | ||
2778 | struct reiserfs_transaction_handle *cur_th = current->journal_info; | ||
2779 | |||
2780 | /* this keeps do_journal_end from NULLing out the current->journal_info | ||
2781 | ** pointer | ||
2782 | */ | ||
2783 | th->t_handle_save = cur_th ; | ||
2784 | if (cur_th && cur_th->t_refcount > 1) { | ||
2785 | BUG() ; | ||
2786 | } | ||
2787 | return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_JOIN) ; | ||
2788 | } | ||
2789 | |||
2790 | int journal_join_abort(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { | ||
2791 | struct reiserfs_transaction_handle *cur_th = current->journal_info; | ||
2792 | |||
2793 | /* this keeps do_journal_end from NULLing out the current->journal_info | ||
2794 | ** pointer | ||
2795 | */ | ||
2796 | th->t_handle_save = cur_th ; | ||
2797 | if (cur_th && cur_th->t_refcount > 1) { | ||
2798 | BUG() ; | ||
2799 | } | ||
2800 | return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_ABORT) ; | ||
2801 | } | ||
2802 | |||
2803 | int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) { | ||
2804 | struct reiserfs_transaction_handle *cur_th = current->journal_info ; | ||
2805 | int ret ; | ||
2806 | |||
2807 | th->t_handle_save = NULL ; | ||
2808 | if (cur_th) { | ||
2809 | /* we are nesting into the current transaction */ | ||
2810 | if (cur_th->t_super == p_s_sb) { | ||
2811 | BUG_ON (!cur_th->t_refcount); | ||
2812 | cur_th->t_refcount++ ; | ||
2813 | memcpy(th, cur_th, sizeof(*th)); | ||
2814 | if (th->t_refcount <= 1) | ||
2815 | reiserfs_warning (p_s_sb, "BAD: refcount <= 1, but journal_info != 0"); | ||
2816 | return 0; | ||
2817 | } else { | ||
2818 | /* we've ended up with a handle from a different filesystem. | ||
2819 | ** save it and restore on journal_end. This should never | ||
2820 | ** really happen... | ||
2821 | */ | ||
2822 | reiserfs_warning(p_s_sb, "clm-2100: nesting info a different FS") ; | ||
2823 | th->t_handle_save = current->journal_info ; | ||
2824 | current->journal_info = th; | ||
2825 | } | ||
2826 | } else { | ||
2827 | current->journal_info = th; | ||
2828 | } | ||
2829 | ret = do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_REG) ; | ||
2830 | if (current->journal_info != th) | ||
2831 | BUG() ; | ||
2832 | |||
2833 | /* I guess this boils down to being the reciprocal of clm-2100 above. | ||
2834 | * If do_journal_begin_r fails, we need to put it back, since journal_end | ||
2835 | * won't be called to do it. */ | ||
2836 | if (ret) | ||
2837 | current->journal_info = th->t_handle_save; | ||
2838 | else | ||
2839 | BUG_ON (!th->t_refcount); | ||
2840 | |||
2841 | return ret ; | ||
2842 | } | ||
2843 | |||
2844 | /* | ||
2845 | ** puts bh into the current transaction. If it was already there, reorders removes the | ||
2846 | ** old pointers from the hash, and puts new ones in (to make sure replay happen in the right order). | ||
2847 | ** | ||
2848 | ** if it was dirty, cleans and files onto the clean list. I can't let it be dirty again until the | ||
2849 | ** transaction is committed. | ||
2850 | ** | ||
2851 | ** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. | ||
2852 | */ | ||
2853 | int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) { | ||
2854 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
2855 | struct reiserfs_journal_cnode *cn = NULL; | ||
2856 | int count_already_incd = 0 ; | ||
2857 | int prepared = 0 ; | ||
2858 | BUG_ON (!th->t_trans_id); | ||
2859 | |||
2860 | PROC_INFO_INC( p_s_sb, journal.mark_dirty ); | ||
2861 | if (th->t_trans_id != journal->j_trans_id) { | ||
2862 | reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", | ||
2863 | th->t_trans_id, journal->j_trans_id); | ||
2864 | } | ||
2865 | |||
2866 | p_s_sb->s_dirt = 1; | ||
2867 | |||
2868 | prepared = test_clear_buffer_journal_prepared (bh); | ||
2869 | clear_buffer_journal_restore_dirty (bh); | ||
2870 | /* already in this transaction, we are done */ | ||
2871 | if (buffer_journaled(bh)) { | ||
2872 | PROC_INFO_INC( p_s_sb, journal.mark_dirty_already ); | ||
2873 | return 0 ; | ||
2874 | } | ||
2875 | |||
2876 | /* this must be turned into a panic instead of a warning. We can't allow | ||
2877 | ** a dirty or journal_dirty or locked buffer to be logged, as some changes | ||
2878 | ** could get to disk too early. NOT GOOD. | ||
2879 | */ | ||
2880 | if (!prepared || buffer_dirty(bh)) { | ||
2881 | reiserfs_warning (p_s_sb, "journal-1777: buffer %llu bad state " | ||
2882 | "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT", | ||
2883 | (unsigned long long)bh->b_blocknr, prepared ? ' ' : '!', | ||
2884 | buffer_locked(bh) ? ' ' : '!', | ||
2885 | buffer_dirty(bh) ? ' ' : '!', | ||
2886 | buffer_journal_dirty(bh) ? ' ' : '!') ; | ||
2887 | } | ||
2888 | |||
2889 | if (atomic_read(&(journal->j_wcount)) <= 0) { | ||
2890 | reiserfs_warning (p_s_sb, "journal-1409: journal_mark_dirty returning because j_wcount was %d", atomic_read(&(journal->j_wcount))) ; | ||
2891 | return 1 ; | ||
2892 | } | ||
2893 | /* this error means I've screwed up, and we've overflowed the transaction. | ||
2894 | ** Nothing can be done here, except make the FS readonly or panic. | ||
2895 | */ | ||
2896 | if (journal->j_len >= journal->j_trans_max) { | ||
2897 | reiserfs_panic(th->t_super, "journal-1413: journal_mark_dirty: j_len (%lu) is too big\n", journal->j_len) ; | ||
2898 | } | ||
2899 | |||
2900 | if (buffer_journal_dirty(bh)) { | ||
2901 | count_already_incd = 1 ; | ||
2902 | PROC_INFO_INC( p_s_sb, journal.mark_dirty_notjournal ); | ||
2903 | clear_buffer_journal_dirty (bh); | ||
2904 | } | ||
2905 | |||
2906 | if (journal->j_len > journal->j_len_alloc) { | ||
2907 | journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT ; | ||
2908 | } | ||
2909 | |||
2910 | set_buffer_journaled (bh); | ||
2911 | |||
2912 | /* now put this guy on the end */ | ||
2913 | if (!cn) { | ||
2914 | cn = get_cnode(p_s_sb) ; | ||
2915 | if (!cn) { | ||
2916 | reiserfs_panic(p_s_sb, "get_cnode failed!\n"); | ||
2917 | } | ||
2918 | |||
2919 | if (th->t_blocks_logged == th->t_blocks_allocated) { | ||
2920 | th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT ; | ||
2921 | journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT ; | ||
2922 | } | ||
2923 | th->t_blocks_logged++ ; | ||
2924 | journal->j_len++ ; | ||
2925 | |||
2926 | cn->bh = bh ; | ||
2927 | cn->blocknr = bh->b_blocknr ; | ||
2928 | cn->sb = p_s_sb; | ||
2929 | cn->jlist = NULL ; | ||
2930 | insert_journal_hash(journal->j_hash_table, cn) ; | ||
2931 | if (!count_already_incd) { | ||
2932 | get_bh(bh) ; | ||
2933 | } | ||
2934 | } | ||
2935 | cn->next = NULL ; | ||
2936 | cn->prev = journal->j_last ; | ||
2937 | cn->bh = bh ; | ||
2938 | if (journal->j_last) { | ||
2939 | journal->j_last->next = cn ; | ||
2940 | journal->j_last = cn ; | ||
2941 | } else { | ||
2942 | journal->j_first = cn ; | ||
2943 | journal->j_last = cn ; | ||
2944 | } | ||
2945 | return 0 ; | ||
2946 | } | ||
2947 | |||
2948 | int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { | ||
2949 | if (!current->journal_info && th->t_refcount > 1) | ||
2950 | reiserfs_warning (p_s_sb, "REISER-NESTING: th NULL, refcount %d", | ||
2951 | th->t_refcount); | ||
2952 | |||
2953 | if (!th->t_trans_id) { | ||
2954 | WARN_ON (1); | ||
2955 | return -EIO; | ||
2956 | } | ||
2957 | |||
2958 | th->t_refcount--; | ||
2959 | if (th->t_refcount > 0) { | ||
2960 | struct reiserfs_transaction_handle *cur_th = current->journal_info ; | ||
2961 | |||
2962 | /* we aren't allowed to close a nested transaction on a different | ||
2963 | ** filesystem from the one in the task struct | ||
2964 | */ | ||
2965 | if (cur_th->t_super != th->t_super) | ||
2966 | BUG() ; | ||
2967 | |||
2968 | if (th != cur_th) { | ||
2969 | memcpy(current->journal_info, th, sizeof(*th)); | ||
2970 | th->t_trans_id = 0; | ||
2971 | } | ||
2972 | return 0; | ||
2973 | } else { | ||
2974 | return do_journal_end(th, p_s_sb, nblocks, 0) ; | ||
2975 | } | ||
2976 | } | ||
2977 | |||
2978 | /* removes from the current transaction, relsing and descrementing any counters. | ||
2979 | ** also files the removed buffer directly onto the clean list | ||
2980 | ** | ||
2981 | ** called by journal_mark_freed when a block has been deleted | ||
2982 | ** | ||
2983 | ** returns 1 if it cleaned and relsed the buffer. 0 otherwise | ||
2984 | */ | ||
2985 | static int remove_from_transaction(struct super_block *p_s_sb, b_blocknr_t blocknr, int already_cleaned) { | ||
2986 | struct buffer_head *bh ; | ||
2987 | struct reiserfs_journal_cnode *cn ; | ||
2988 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
2989 | int ret = 0; | ||
2990 | |||
2991 | cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr) ; | ||
2992 | if (!cn || !cn->bh) { | ||
2993 | return ret ; | ||
2994 | } | ||
2995 | bh = cn->bh ; | ||
2996 | if (cn->prev) { | ||
2997 | cn->prev->next = cn->next ; | ||
2998 | } | ||
2999 | if (cn->next) { | ||
3000 | cn->next->prev = cn->prev ; | ||
3001 | } | ||
3002 | if (cn == journal->j_first) { | ||
3003 | journal->j_first = cn->next ; | ||
3004 | } | ||
3005 | if (cn == journal->j_last) { | ||
3006 | journal->j_last = cn->prev ; | ||
3007 | } | ||
3008 | if (bh) | ||
3009 | remove_journal_hash(p_s_sb, journal->j_hash_table, NULL, bh->b_blocknr, 0) ; | ||
3010 | clear_buffer_journaled (bh); /* don't log this one */ | ||
3011 | |||
3012 | if (!already_cleaned) { | ||
3013 | clear_buffer_journal_dirty (bh); | ||
3014 | clear_buffer_dirty(bh); | ||
3015 | clear_buffer_journal_test (bh); | ||
3016 | put_bh(bh) ; | ||
3017 | if (atomic_read(&(bh->b_count)) < 0) { | ||
3018 | reiserfs_warning (p_s_sb, "journal-1752: remove from trans, b_count < 0"); | ||
3019 | } | ||
3020 | ret = 1 ; | ||
3021 | } | ||
3022 | journal->j_len-- ; | ||
3023 | journal->j_len_alloc-- ; | ||
3024 | free_cnode(p_s_sb, cn) ; | ||
3025 | return ret ; | ||
3026 | } | ||
3027 | |||
3028 | /* | ||
3029 | ** for any cnode in a journal list, it can only be dirtied of all the | ||
3030 | ** transactions that include it are commited to disk. | ||
3031 | ** this checks through each transaction, and returns 1 if you are allowed to dirty, | ||
3032 | ** and 0 if you aren't | ||
3033 | ** | ||
3034 | ** it is called by dirty_journal_list, which is called after flush_commit_list has gotten all the log | ||
3035 | ** blocks for a given transaction on disk | ||
3036 | ** | ||
3037 | */ | ||
3038 | static int can_dirty(struct reiserfs_journal_cnode *cn) { | ||
3039 | struct super_block *sb = cn->sb; | ||
3040 | b_blocknr_t blocknr = cn->blocknr ; | ||
3041 | struct reiserfs_journal_cnode *cur = cn->hprev ; | ||
3042 | int can_dirty = 1 ; | ||
3043 | |||
3044 | /* first test hprev. These are all newer than cn, so any node here | ||
3045 | ** with the same block number and dev means this node can't be sent | ||
3046 | ** to disk right now. | ||
3047 | */ | ||
3048 | while(cur && can_dirty) { | ||
3049 | if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb && | ||
3050 | cur->blocknr == blocknr) { | ||
3051 | can_dirty = 0 ; | ||
3052 | } | ||
3053 | cur = cur->hprev ; | ||
3054 | } | ||
3055 | /* then test hnext. These are all older than cn. As long as they | ||
3056 | ** are committed to the log, it is safe to write cn to disk | ||
3057 | */ | ||
3058 | cur = cn->hnext ; | ||
3059 | while(cur && can_dirty) { | ||
3060 | if (cur->jlist && cur->jlist->j_len > 0 && | ||
3061 | atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh && | ||
3062 | cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) { | ||
3063 | can_dirty = 0 ; | ||
3064 | } | ||
3065 | cur = cur->hnext ; | ||
3066 | } | ||
3067 | return can_dirty ; | ||
3068 | } | ||
3069 | |||
3070 | /* syncs the commit blocks, but does not force the real buffers to disk | ||
3071 | ** will wait until the current transaction is done/commited before returning | ||
3072 | */ | ||
3073 | int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { | ||
3074 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
3075 | |||
3076 | BUG_ON (!th->t_trans_id); | ||
3077 | /* you can sync while nested, very, very bad */ | ||
3078 | if (th->t_refcount > 1) { | ||
3079 | BUG() ; | ||
3080 | } | ||
3081 | if (journal->j_len == 0) { | ||
3082 | reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; | ||
3083 | journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; | ||
3084 | } | ||
3085 | return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT) ; | ||
3086 | } | ||
3087 | |||
3088 | /* | ||
3089 | ** writeback the pending async commits to disk | ||
3090 | */ | ||
3091 | static void flush_async_commits(void *p) { | ||
3092 | struct super_block *p_s_sb = p; | ||
3093 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
3094 | struct reiserfs_journal_list *jl; | ||
3095 | struct list_head *entry; | ||
3096 | |||
3097 | lock_kernel(); | ||
3098 | if (!list_empty(&journal->j_journal_list)) { | ||
3099 | /* last entry is the youngest, commit it and you get everything */ | ||
3100 | entry = journal->j_journal_list.prev; | ||
3101 | jl = JOURNAL_LIST_ENTRY(entry); | ||
3102 | flush_commit_list(p_s_sb, jl, 1); | ||
3103 | } | ||
3104 | unlock_kernel(); | ||
3105 | /* | ||
3106 | * this is a little racey, but there's no harm in missing | ||
3107 | * the filemap_fdata_write | ||
3108 | */ | ||
3109 | if (!atomic_read(&journal->j_async_throttle) && !reiserfs_is_journal_aborted (journal)) { | ||
3110 | atomic_inc(&journal->j_async_throttle); | ||
3111 | filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping); | ||
3112 | atomic_dec(&journal->j_async_throttle); | ||
3113 | } | ||
3114 | } | ||
3115 | |||
3116 | /* | ||
3117 | ** flushes any old transactions to disk | ||
3118 | ** ends the current transaction if it is too old | ||
3119 | */ | ||
3120 | int reiserfs_flush_old_commits(struct super_block *p_s_sb) { | ||
3121 | time_t now ; | ||
3122 | struct reiserfs_transaction_handle th ; | ||
3123 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
3124 | |||
3125 | now = get_seconds(); | ||
3126 | /* safety check so we don't flush while we are replaying the log during | ||
3127 | * mount | ||
3128 | */ | ||
3129 | if (list_empty(&journal->j_journal_list)) { | ||
3130 | return 0 ; | ||
3131 | } | ||
3132 | |||
3133 | /* check the current transaction. If there are no writers, and it is | ||
3134 | * too old, finish it, and force the commit blocks to disk | ||
3135 | */ | ||
3136 | if (atomic_read(&journal->j_wcount) <= 0 && | ||
3137 | journal->j_trans_start_time > 0 && | ||
3138 | journal->j_len > 0 && | ||
3139 | (now - journal->j_trans_start_time) > journal->j_max_trans_age) | ||
3140 | { | ||
3141 | if (!journal_join(&th, p_s_sb, 1)) { | ||
3142 | reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; | ||
3143 | journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; | ||
3144 | |||
3145 | /* we're only being called from kreiserfsd, it makes no sense to do | ||
3146 | ** an async commit so that kreiserfsd can do it later | ||
3147 | */ | ||
3148 | do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ; | ||
3149 | } | ||
3150 | } | ||
3151 | return p_s_sb->s_dirt; | ||
3152 | } | ||
3153 | |||
3154 | /* | ||
3155 | ** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit | ||
3156 | ** | ||
3157 | ** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all | ||
3158 | ** the writers are done. By the time it wakes up, the transaction it was called has already ended, so it just | ||
3159 | ** flushes the commit list and returns 0. | ||
3160 | ** | ||
3161 | ** Won't batch when flush or commit_now is set. Also won't batch when others are waiting on j_join_wait. | ||
3162 | ** | ||
3163 | ** Note, we can't allow the journal_end to proceed while there are still writers in the log. | ||
3164 | */ | ||
3165 | static int check_journal_end(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, | ||
3166 | unsigned long nblocks, int flags) { | ||
3167 | |||
3168 | time_t now ; | ||
3169 | int flush = flags & FLUSH_ALL ; | ||
3170 | int commit_now = flags & COMMIT_NOW ; | ||
3171 | int wait_on_commit = flags & WAIT ; | ||
3172 | struct reiserfs_journal_list *jl; | ||
3173 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
3174 | |||
3175 | BUG_ON (!th->t_trans_id); | ||
3176 | |||
3177 | if (th->t_trans_id != journal->j_trans_id) { | ||
3178 | reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", | ||
3179 | th->t_trans_id, journal->j_trans_id); | ||
3180 | } | ||
3181 | |||
3182 | journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged) ; | ||
3183 | if (atomic_read(&(journal->j_wcount)) > 0) { /* <= 0 is allowed. unmounting might not call begin */ | ||
3184 | atomic_dec(&(journal->j_wcount)) ; | ||
3185 | } | ||
3186 | |||
3187 | /* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released | ||
3188 | ** will be dealt with by next transaction that actually writes something, but should be taken | ||
3189 | ** care of in this trans | ||
3190 | */ | ||
3191 | if (journal->j_len == 0) { | ||
3192 | BUG(); | ||
3193 | } | ||
3194 | /* if wcount > 0, and we are called to with flush or commit_now, | ||
3195 | ** we wait on j_join_wait. We will wake up when the last writer has | ||
3196 | ** finished the transaction, and started it on its way to the disk. | ||
3197 | ** Then, we flush the commit or journal list, and just return 0 | ||
3198 | ** because the rest of journal end was already done for this transaction. | ||
3199 | */ | ||
3200 | if (atomic_read(&(journal->j_wcount)) > 0) { | ||
3201 | if (flush || commit_now) { | ||
3202 | unsigned trans_id ; | ||
3203 | |||
3204 | jl = journal->j_current_jl; | ||
3205 | trans_id = jl->j_trans_id; | ||
3206 | if (wait_on_commit) | ||
3207 | jl->j_state |= LIST_COMMIT_PENDING; | ||
3208 | atomic_set(&(journal->j_jlock), 1) ; | ||
3209 | if (flush) { | ||
3210 | journal->j_next_full_flush = 1 ; | ||
3211 | } | ||
3212 | unlock_journal(p_s_sb) ; | ||
3213 | |||
3214 | /* sleep while the current transaction is still j_jlocked */ | ||
3215 | while(journal->j_trans_id == trans_id) { | ||
3216 | if (atomic_read(&journal->j_jlock)) { | ||
3217 | queue_log_writer(p_s_sb); | ||
3218 | } else { | ||
3219 | lock_journal(p_s_sb); | ||
3220 | if (journal->j_trans_id == trans_id) { | ||
3221 | atomic_set(&(journal->j_jlock), 1) ; | ||
3222 | } | ||
3223 | unlock_journal(p_s_sb); | ||
3224 | } | ||
3225 | } | ||
3226 | if (journal->j_trans_id == trans_id) { | ||
3227 | BUG(); | ||
3228 | } | ||
3229 | if (commit_now && journal_list_still_alive(p_s_sb, trans_id) && | ||
3230 | wait_on_commit) | ||
3231 | { | ||
3232 | flush_commit_list(p_s_sb, jl, 1) ; | ||
3233 | } | ||
3234 | return 0 ; | ||
3235 | } | ||
3236 | unlock_journal(p_s_sb) ; | ||
3237 | return 0 ; | ||
3238 | } | ||
3239 | |||
3240 | /* deal with old transactions where we are the last writers */ | ||
3241 | now = get_seconds(); | ||
3242 | if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) { | ||
3243 | commit_now = 1 ; | ||
3244 | journal->j_next_async_flush = 1 ; | ||
3245 | } | ||
3246 | /* don't batch when someone is waiting on j_join_wait */ | ||
3247 | /* don't batch when syncing the commit or flushing the whole trans */ | ||
3248 | if (!(journal->j_must_wait > 0) && !(atomic_read(&(journal->j_jlock))) && !flush && !commit_now && | ||
3249 | (journal->j_len < journal->j_max_batch) && | ||
3250 | journal->j_len_alloc < journal->j_max_batch && journal->j_cnode_free > (journal->j_trans_max * 3)) { | ||
3251 | journal->j_bcount++ ; | ||
3252 | unlock_journal(p_s_sb) ; | ||
3253 | return 0 ; | ||
3254 | } | ||
3255 | |||
3256 | if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { | ||
3257 | reiserfs_panic(p_s_sb, "journal-003: journal_end: j_start (%ld) is too high\n", journal->j_start) ; | ||
3258 | } | ||
3259 | return 1 ; | ||
3260 | } | ||
3261 | |||
3262 | /* | ||
3263 | ** Does all the work that makes deleting blocks safe. | ||
3264 | ** when deleting a block mark BH_JNew, just remove it from the current transaction, clean it's buffer_head and move on. | ||
3265 | ** | ||
3266 | ** otherwise: | ||
3267 | ** set a bit for the block in the journal bitmap. That will prevent it from being allocated for unformatted nodes | ||
3268 | ** before this transaction has finished. | ||
3269 | ** | ||
3270 | ** mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. That will prevent any old transactions with | ||
3271 | ** this block from trying to flush to the real location. Since we aren't removing the cnode from the journal_list_hash, | ||
3272 | ** the block can't be reallocated yet. | ||
3273 | ** | ||
3274 | ** Then remove it from the current transaction, decrementing any counters and filing it on the clean list. | ||
3275 | */ | ||
3276 | int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, b_blocknr_t blocknr) { | ||
3277 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
3278 | struct reiserfs_journal_cnode *cn = NULL ; | ||
3279 | struct buffer_head *bh = NULL ; | ||
3280 | struct reiserfs_list_bitmap *jb = NULL ; | ||
3281 | int cleaned = 0 ; | ||
3282 | BUG_ON (!th->t_trans_id); | ||
3283 | |||
3284 | cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr); | ||
3285 | if (cn && cn->bh) { | ||
3286 | bh = cn->bh ; | ||
3287 | get_bh(bh) ; | ||
3288 | } | ||
3289 | /* if it is journal new, we just remove it from this transaction */ | ||
3290 | if (bh && buffer_journal_new(bh)) { | ||
3291 | clear_buffer_journal_new (bh); | ||
3292 | clear_prepared_bits(bh) ; | ||
3293 | reiserfs_clean_and_file_buffer(bh) ; | ||
3294 | cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ; | ||
3295 | } else { | ||
3296 | /* set the bit for this block in the journal bitmap for this transaction */ | ||
3297 | jb = journal->j_current_jl->j_list_bitmap; | ||
3298 | if (!jb) { | ||
3299 | reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ; | ||
3300 | } | ||
3301 | set_bit_in_list_bitmap(p_s_sb, blocknr, jb) ; | ||
3302 | |||
3303 | /* Note, the entire while loop is not allowed to schedule. */ | ||
3304 | |||
3305 | if (bh) { | ||
3306 | clear_prepared_bits(bh) ; | ||
3307 | reiserfs_clean_and_file_buffer(bh) ; | ||
3308 | } | ||
3309 | cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ; | ||
3310 | |||
3311 | /* find all older transactions with this block, make sure they don't try to write it out */ | ||
3312 | cn = get_journal_hash_dev(p_s_sb,journal->j_list_hash_table, blocknr) ; | ||
3313 | while (cn) { | ||
3314 | if (p_s_sb == cn->sb && blocknr == cn->blocknr) { | ||
3315 | set_bit(BLOCK_FREED, &cn->state) ; | ||
3316 | if (cn->bh) { | ||
3317 | if (!cleaned) { | ||
3318 | /* remove_from_transaction will brelse the buffer if it was | ||
3319 | ** in the current trans | ||
3320 | */ | ||
3321 | clear_buffer_journal_dirty (cn->bh); | ||
3322 | clear_buffer_dirty(cn->bh); | ||
3323 | clear_buffer_journal_test(cn->bh); | ||
3324 | cleaned = 1 ; | ||
3325 | put_bh(cn->bh) ; | ||
3326 | if (atomic_read(&(cn->bh->b_count)) < 0) { | ||
3327 | reiserfs_warning (p_s_sb, "journal-2138: cn->bh->b_count < 0"); | ||
3328 | } | ||
3329 | } | ||
3330 | if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */ | ||
3331 | atomic_dec(&(cn->jlist->j_nonzerolen)) ; | ||
3332 | } | ||
3333 | cn->bh = NULL ; | ||
3334 | } | ||
3335 | } | ||
3336 | cn = cn->hnext ; | ||
3337 | } | ||
3338 | } | ||
3339 | |||
3340 | if (bh) { | ||
3341 | put_bh(bh) ; /* get_hash grabs the buffer */ | ||
3342 | if (atomic_read(&(bh->b_count)) < 0) { | ||
3343 | reiserfs_warning (p_s_sb, "journal-2165: bh->b_count < 0"); | ||
3344 | } | ||
3345 | } | ||
3346 | return 0 ; | ||
3347 | } | ||
3348 | |||
3349 | void reiserfs_update_inode_transaction(struct inode *inode) { | ||
3350 | struct reiserfs_journal *journal = SB_JOURNAL (inode->i_sb); | ||
3351 | REISERFS_I(inode)->i_jl = journal->j_current_jl; | ||
3352 | REISERFS_I(inode)->i_trans_id = journal->j_trans_id ; | ||
3353 | } | ||
3354 | |||
3355 | /* | ||
3356 | * returns -1 on error, 0 if no commits/barriers were done and 1 | ||
3357 | * if a transaction was actually committed and the barrier was done | ||
3358 | */ | ||
3359 | static int __commit_trans_jl(struct inode *inode, unsigned long id, | ||
3360 | struct reiserfs_journal_list *jl) | ||
3361 | { | ||
3362 | struct reiserfs_transaction_handle th ; | ||
3363 | struct super_block *sb = inode->i_sb ; | ||
3364 | struct reiserfs_journal *journal = SB_JOURNAL (sb); | ||
3365 | int ret = 0; | ||
3366 | |||
3367 | /* is it from the current transaction, or from an unknown transaction? */ | ||
3368 | if (id == journal->j_trans_id) { | ||
3369 | jl = journal->j_current_jl; | ||
3370 | /* try to let other writers come in and grow this transaction */ | ||
3371 | let_transaction_grow(sb, id); | ||
3372 | if (journal->j_trans_id != id) { | ||
3373 | goto flush_commit_only; | ||
3374 | } | ||
3375 | |||
3376 | ret = journal_begin(&th, sb, 1) ; | ||
3377 | if (ret) | ||
3378 | return ret; | ||
3379 | |||
3380 | /* someone might have ended this transaction while we joined */ | ||
3381 | if (journal->j_trans_id != id) { | ||
3382 | reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1) ; | ||
3383 | journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)) ; | ||
3384 | ret = journal_end(&th, sb, 1) ; | ||
3385 | goto flush_commit_only; | ||
3386 | } | ||
3387 | |||
3388 | ret = journal_end_sync(&th, sb, 1) ; | ||
3389 | if (!ret) | ||
3390 | ret = 1; | ||
3391 | |||
3392 | } else { | ||
3393 | /* this gets tricky, we have to make sure the journal list in | ||
3394 | * the inode still exists. We know the list is still around | ||
3395 | * if we've got a larger transaction id than the oldest list | ||
3396 | */ | ||
3397 | flush_commit_only: | ||
3398 | if (journal_list_still_alive(inode->i_sb, id)) { | ||
3399 | /* | ||
3400 | * we only set ret to 1 when we know for sure | ||
3401 | * the barrier hasn't been started yet on the commit | ||
3402 | * block. | ||
3403 | */ | ||
3404 | if (atomic_read(&jl->j_commit_left) > 1) | ||
3405 | ret = 1; | ||
3406 | flush_commit_list(sb, jl, 1) ; | ||
3407 | if (journal->j_errno) | ||
3408 | ret = journal->j_errno; | ||
3409 | } | ||
3410 | } | ||
3411 | /* otherwise the list is gone, and long since committed */ | ||
3412 | return ret; | ||
3413 | } | ||
3414 | |||
3415 | int reiserfs_commit_for_inode(struct inode *inode) { | ||
3416 | unsigned long id = REISERFS_I(inode)->i_trans_id; | ||
3417 | struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; | ||
3418 | |||
3419 | /* for the whole inode, assume unset id means it was | ||
3420 | * changed in the current transaction. More conservative | ||
3421 | */ | ||
3422 | if (!id || !jl) { | ||
3423 | reiserfs_update_inode_transaction(inode) ; | ||
3424 | id = REISERFS_I(inode)->i_trans_id; | ||
3425 | /* jl will be updated in __commit_trans_jl */ | ||
3426 | } | ||
3427 | |||
3428 | return __commit_trans_jl(inode, id, jl); | ||
3429 | } | ||
3430 | |||
3431 | void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, | ||
3432 | struct buffer_head *bh) { | ||
3433 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
3434 | PROC_INFO_INC( p_s_sb, journal.restore_prepared ); | ||
3435 | if (!bh) { | ||
3436 | return ; | ||
3437 | } | ||
3438 | if (test_clear_buffer_journal_restore_dirty (bh) && | ||
3439 | buffer_journal_dirty(bh)) { | ||
3440 | struct reiserfs_journal_cnode *cn; | ||
3441 | cn = get_journal_hash_dev(p_s_sb, | ||
3442 | journal->j_list_hash_table, | ||
3443 | bh->b_blocknr); | ||
3444 | if (cn && can_dirty(cn)) { | ||
3445 | set_buffer_journal_test (bh); | ||
3446 | mark_buffer_dirty(bh); | ||
3447 | } | ||
3448 | } | ||
3449 | clear_buffer_journal_prepared (bh); | ||
3450 | } | ||
3451 | |||
3452 | extern struct tree_balance *cur_tb ; | ||
3453 | /* | ||
3454 | ** before we can change a metadata block, we have to make sure it won't | ||
3455 | ** be written to disk while we are altering it. So, we must: | ||
3456 | ** clean it | ||
3457 | ** wait on it. | ||
3458 | ** | ||
3459 | */ | ||
3460 | int reiserfs_prepare_for_journal(struct super_block *p_s_sb, | ||
3461 | struct buffer_head *bh, int wait) { | ||
3462 | PROC_INFO_INC( p_s_sb, journal.prepare ); | ||
3463 | |||
3464 | if (test_set_buffer_locked(bh)) { | ||
3465 | if (!wait) | ||
3466 | return 0; | ||
3467 | lock_buffer(bh); | ||
3468 | } | ||
3469 | set_buffer_journal_prepared (bh); | ||
3470 | if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) { | ||
3471 | clear_buffer_journal_test (bh); | ||
3472 | set_buffer_journal_restore_dirty (bh); | ||
3473 | } | ||
3474 | unlock_buffer(bh); | ||
3475 | return 1; | ||
3476 | } | ||
3477 | |||
3478 | static void flush_old_journal_lists(struct super_block *s) { | ||
3479 | struct reiserfs_journal *journal = SB_JOURNAL (s); | ||
3480 | struct reiserfs_journal_list *jl; | ||
3481 | struct list_head *entry; | ||
3482 | time_t now = get_seconds(); | ||
3483 | |||
3484 | while(!list_empty(&journal->j_journal_list)) { | ||
3485 | entry = journal->j_journal_list.next; | ||
3486 | jl = JOURNAL_LIST_ENTRY(entry); | ||
3487 | /* this check should always be run, to send old lists to disk */ | ||
3488 | if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) { | ||
3489 | flush_used_journal_lists(s, jl); | ||
3490 | } else { | ||
3491 | break; | ||
3492 | } | ||
3493 | } | ||
3494 | } | ||
3495 | |||
3496 | /* | ||
3497 | ** Ends the transaction handle th. long and ugly. If flush, will not return until all commit | ||
3498 | ** blocks and all real buffers in the trans are on disk. | ||
3499 | ** If no_async (presumably the WAIT flag, kept as wait_on_commit below), won't return until all commit blocks are on disk. | ||
3500 | ** | ||
3501 | ** keep reading, there are comments as you go along | ||
3502 | ** | ||
3503 | ** If the journal is aborted, we just clean up. Things like flushing | ||
3504 | ** journal lists, etc just won't happen. | ||
3505 | ** | ||
3505 | ** nblocks is only handed on to check_journal_end. th is zeroed before returning (t_super is | ||
3505 | ** set again). Returns journal->j_errno. | ||
3505 | */ | ||
3506 | static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks, | ||
3507 | int flags) { | ||
3508 | struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); | ||
3509 | struct reiserfs_journal_cnode *cn, *next, *jl_cn; | ||
3510 | struct reiserfs_journal_cnode *last_cn = NULL; | ||
3511 | struct reiserfs_journal_desc *desc ; | ||
3512 | struct reiserfs_journal_commit *commit ; | ||
3513 | struct buffer_head *c_bh ; /* commit bh */ | ||
3514 | struct buffer_head *d_bh ; /* desc bh */ | ||
3515 | int cur_write_start = 0 ; /* start index of current log write */ | ||
3516 | int old_start ; /* assigned below but never read in this function */ | ||
3517 | int i ; | ||
3518 | int flush = flags & FLUSH_ALL ; | ||
3519 | int wait_on_commit = flags & WAIT ; | ||
3520 | struct reiserfs_journal_list *jl, *temp_jl; | ||
3521 | struct list_head *entry, *safe; | ||
3522 | unsigned long jindex; | ||
3523 | unsigned long commit_trans_id; | ||
3524 | int trans_half; | ||
3525 | |||
3526 | BUG_ON (th->t_refcount > 1); /* must not be called on a nested handle */ | ||
3527 | BUG_ON (!th->t_trans_id); | ||
3528 | |||
3529 | current->journal_info = th->t_handle_save; /* put back the journal_info value stashed in the handle */ | ||
3530 | reiserfs_check_lock_depth(p_s_sb, "journal end"); | ||
3531 | if (journal->j_len == 0) { /* prepare and mark the super block buffer dirty in this trans; check_journal_end BUG()s when j_len is 0 */ | ||
3532 | reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; | ||
3533 | journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; | ||
3534 | } | ||
3535 | |||
3536 | lock_journal(p_s_sb) ; /* released by unlock_journal near the end, or inside check_journal_end when it returns 0 */ | ||
3537 | if (journal->j_next_full_flush) { | ||
3538 | flags |= FLUSH_ALL ; | ||
3539 | flush = 1 ; | ||
3540 | } | ||
3541 | if (journal->j_next_async_flush) { | ||
3542 | flags |= COMMIT_NOW | WAIT; | ||
3543 | wait_on_commit = 1; | ||
3544 | } | ||
3545 | |||
3546 | /* the journal was locked just above; check_journal_end unlocks it if it does not return 1 | ||
3547 | ** it tells us if we should continue with the journal_end, or just return | ||
3548 | */ | ||
3549 | if (!check_journal_end(th, p_s_sb, nblocks, flags)) { | ||
3550 | p_s_sb->s_dirt = 1; | ||
3551 | wake_queued_writers(p_s_sb); | ||
3552 | reiserfs_async_progress_wait(p_s_sb); | ||
3553 | goto out ; | ||
3554 | } | ||
3555 | |||
3556 | /* check_journal_end might set these, check again */ | ||
3557 | if (journal->j_next_full_flush) { | ||
3558 | flush = 1 ; | ||
3559 | } | ||
3560 | |||
3561 | /* | ||
3562 | ** j must wait means we have to flush the log blocks, and the real blocks for | ||
3563 | ** this transaction | ||
3564 | */ | ||
3565 | if (journal->j_must_wait > 0) { | ||
3566 | flush = 1 ; | ||
3567 | } | ||
3568 | |||
3569 | #ifdef REISERFS_PREALLOCATE | ||
3570 | /* quota ops might need to nest, setup the journal_info pointer for them */ | ||
3571 | current->journal_info = th ; | ||
3572 | reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into | ||
3573 | * the transaction */ | ||
3574 | current->journal_info = th->t_handle_save ; | ||
3575 | #endif | ||
3576 | |||
3577 | /* setup description block: it is the log block at j_start, zeroed and stamped with the desc magic and trans id */ | ||
3578 | d_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start) ; | ||
3579 | set_buffer_uptodate(d_bh); | ||
3580 | desc = (struct reiserfs_journal_desc *)(d_bh)->b_data ; | ||
3581 | memset(d_bh->b_data, 0, d_bh->b_size) ; | ||
3582 | memcpy(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8) ; | ||
3583 | set_desc_trans_id(desc, journal->j_trans_id) ; | ||
3584 | |||
3585 | /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */ | ||
3586 | c_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + | ||
3587 | ((journal->j_start + journal->j_len + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; | ||
3588 | commit = (struct reiserfs_journal_commit *)c_bh->b_data ; | ||
3589 | memset(c_bh->b_data, 0, c_bh->b_size) ; | ||
3590 | set_commit_trans_id(commit, journal->j_trans_id) ; | ||
3591 | set_buffer_uptodate(c_bh) ; | ||
3592 | |||
3593 | /* init this journal list */ | ||
3594 | jl = journal->j_current_jl; | ||
3595 | |||
3596 | /* we lock the commit before doing anything because | ||
3597 | * we want to make sure nobody tries to run flush_commit_list until | ||
3598 | * the new transaction is fully setup, and we've already flushed the | ||
3599 | * ordered bh list | ||
3600 | */ | ||
3601 | down(&jl->j_commit_lock); | ||
3602 | |||
3603 | /* save the transaction id in case we need to commit it later */ | ||
3604 | commit_trans_id = jl->j_trans_id; | ||
3605 | |||
3606 | atomic_set(&jl->j_older_commits_done, 0) ; | ||
3607 | jl->j_trans_id = journal->j_trans_id ; | ||
3608 | jl->j_timestamp = journal->j_trans_start_time ; | ||
3609 | jl->j_commit_bh = c_bh ; | ||
3610 | jl->j_start = journal->j_start ; | ||
3611 | jl->j_len = journal->j_len ; | ||
3612 | atomic_set(&jl->j_nonzerolen, journal->j_len) ; | ||
3613 | atomic_set(&jl->j_commit_left, journal->j_len + 2); /* presumably the j_len log blocks plus the desc and commit blocks */ | ||
3614 | jl->j_realblock = NULL ; | ||
3615 | |||
3616 | /* The ENTIRE FOR LOOP MUST not cause schedule to occur. | ||
3617 | ** for each real block, add it to the journal list hash, | ||
3618 | ** copy into real block index array in the commit or desc block | ||
3619 | ** i only counts journaled buffers: the else branch undoes the loop's i++ | ||
3619 | */ | ||
3620 | trans_half = journal_trans_half(p_s_sb->s_blocksize); | ||
3621 | for (i = 0, cn = journal->j_first ; cn ; cn = cn->next, i++) { | ||
3622 | if (buffer_journaled (cn->bh)) { | ||
3623 | jl_cn = get_cnode(p_s_sb) ; | ||
3624 | if (!jl_cn) { | ||
3625 | reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ; | ||
3626 | } | ||
3627 | if (i == 0) { /* first journaled buffer becomes the head of the list's real block chain */ | ||
3628 | jl->j_realblock = jl_cn ; | ||
3629 | } | ||
3630 | jl_cn->prev = last_cn ; | ||
3631 | jl_cn->next = NULL ; | ||
3632 | if (last_cn) { | ||
3633 | last_cn->next = jl_cn ; | ||
3634 | } | ||
3635 | last_cn = jl_cn ; | ||
3636 | /* make sure the block we are trying to log is not a block | ||
3637 | of journal or reserved area */ | ||
3638 | |||
3639 | if (is_block_in_log_or_reserved_area(p_s_sb, cn->bh->b_blocknr)) { | ||
3640 | reiserfs_panic(p_s_sb, "journal-2332: Trying to log block %lu, which is a log block\n", cn->bh->b_blocknr) ; | ||
3641 | } | ||
3642 | jl_cn->blocknr = cn->bh->b_blocknr ; | ||
3643 | jl_cn->state = 0 ; | ||
3644 | jl_cn->sb = p_s_sb; | ||
3645 | jl_cn->bh = cn->bh ; | ||
3646 | jl_cn->jlist = jl; | ||
3647 | insert_journal_hash(journal->j_list_hash_table, jl_cn) ; | ||
3648 | if (i < trans_half) { /* the first trans_half block numbers go in the desc block, the rest in the commit block */ | ||
3649 | desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ; | ||
3650 | } else { | ||
3651 | commit->j_realblock[i - trans_half] = cpu_to_le32(cn->bh->b_blocknr) ; | ||
3652 | } | ||
3653 | } else { | ||
3654 | i-- ; /* cancel the loop's i++ so this buffer takes no slot in j_realblock[] */ | ||
3655 | } | ||
3656 | } | ||
3657 | set_desc_trans_len(desc, journal->j_len) ; | ||
3658 | set_desc_mount_id(desc, journal->j_mount_id) ; | ||
3659 | set_desc_trans_id(desc, journal->j_trans_id) ; | ||
3660 | set_commit_trans_len(commit, journal->j_len); | ||
3661 | |||
3662 | /* special check in case all buffers in the journal were marked for not logging */ | ||
3663 | if (journal->j_len == 0) { | ||
3664 | BUG(); | ||
3665 | } | ||
3666 | |||
3667 | /* we're about to dirty all the log blocks, mark the description block | ||
3668 | * dirty now too. Don't mark the commit block dirty until all the | ||
3669 | * others are on disk | ||
3670 | */ | ||
3671 | mark_buffer_dirty(d_bh); | ||
3672 | |||
3673 | /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */ | ||
3674 | cur_write_start = journal->j_start ; | ||
3675 | cn = journal->j_first ; | ||
3676 | jindex = 1 ; /* start at one so we don't get the desc again */ | ||
3677 | while(cn) { | ||
3678 | clear_buffer_journal_new (cn->bh); | ||
3679 | /* copy all the real blocks into log area. dirty log blocks */ | ||
3680 | if (buffer_journaled (cn->bh)) { | ||
3681 | struct buffer_head *tmp_bh ; | ||
3682 | char *addr; | ||
3683 | struct page *page; | ||
3684 | tmp_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + | ||
3685 | ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; | ||
3686 | set_buffer_uptodate(tmp_bh); | ||
3687 | page = cn->bh->b_page; | ||
3688 | addr = kmap(page); | ||
3689 | memcpy(tmp_bh->b_data, addr + offset_in_page(cn->bh->b_data), | ||
3690 | cn->bh->b_size); | ||
3691 | kunmap(page); | ||
3692 | mark_buffer_dirty(tmp_bh); | ||
3693 | jindex++ ; | ||
3694 | set_buffer_journal_dirty (cn->bh); | ||
3695 | clear_buffer_journaled (cn->bh); | ||
3696 | } else { | ||
3697 | /* JDirty cleared sometime during transaction. don't log this one */ | ||
3698 | reiserfs_warning(p_s_sb, "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!") ; | ||
3699 | brelse(cn->bh) ; | ||
3700 | } | ||
3701 | next = cn->next ; /* fetch the successor before the cnode is freed */ | ||
3702 | free_cnode(p_s_sb, cn) ; | ||
3703 | cn = next ; | ||
3704 | cond_resched(); | ||
3705 | } | ||
3706 | |||
3707 | /* we are done with both the c_bh and d_bh, but | ||
3708 | ** c_bh must be written after all other commit blocks, | ||
3709 | ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. | ||
3710 | */ | ||
3711 | |||
3712 | journal->j_current_jl = alloc_journal_list(p_s_sb); | ||
3713 | |||
3714 | /* now it is safe to insert this transaction on the main list */ | ||
3715 | list_add_tail(&jl->j_list, &journal->j_journal_list); | ||
3716 | list_add_tail(&jl->j_working_list, &journal->j_working_list); | ||
3717 | journal->j_num_work_lists++; | ||
3718 | |||
3719 | /* reset journal values for the next transaction */ | ||
3720 | old_start = journal->j_start ; | ||
3721 | journal->j_start = (journal->j_start + journal->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb); /* presumably + 2 steps over the desc and commit blocks */ | ||
3722 | atomic_set(&(journal->j_wcount), 0) ; | ||
3723 | journal->j_bcount = 0 ; | ||
3724 | journal->j_last = NULL ; | ||
3725 | journal->j_first = NULL ; | ||
3726 | journal->j_len = 0 ; | ||
3727 | journal->j_trans_start_time = 0 ; | ||
3728 | journal->j_trans_id++ ; | ||
3729 | journal->j_current_jl->j_trans_id = journal->j_trans_id; | ||
3730 | journal->j_must_wait = 0 ; | ||
3731 | journal->j_len_alloc = 0 ; | ||
3732 | journal->j_next_full_flush = 0 ; | ||
3733 | journal->j_next_async_flush = 0 ; | ||
3734 | init_journal_hash(p_s_sb) ; | ||
3735 | |||
3736 | // make sure reiserfs_add_jh sees the new current_jl before we | ||
3737 | // write out the tails | ||
3738 | smp_mb(); | ||
3739 | |||
3740 | /* tail conversion targets have to hit the disk before we end the | ||
3741 | * transaction. Otherwise a later transaction might repack the tail | ||
3742 | * before this transaction commits, leaving the data block unflushed and | ||
3743 | * clean, if we crash before the later transaction commits, the data block | ||
3744 | * is lost. | ||
3745 | */ | ||
3746 | if (!list_empty(&jl->j_tail_bh_list)) { | ||
3747 | unlock_kernel(); /* the ordered write below runs without the big kernel lock */ | ||
3748 | write_ordered_buffers(&journal->j_dirty_buffers_lock, | ||
3749 | journal, jl, &jl->j_tail_bh_list); | ||
3750 | lock_kernel(); | ||
3751 | } | ||
3752 | if (!list_empty(&jl->j_tail_bh_list)) | ||
3753 | BUG(); | ||
3754 | up(&jl->j_commit_lock); | ||
3755 | |||
3756 | /* honor the flush wishes from the caller, simple commits can | ||
3757 | ** be done outside the journal lock, they are done below | ||
3758 | ** | ||
3759 | ** if we don't flush the commit list right now, we put it into | ||
3760 | ** the work queue so the people waiting on the async progress work | ||
3761 | ** queue don't wait for this proc to flush journal lists and such. | ||
3762 | */ | ||
3763 | if (flush) { | ||
3764 | flush_commit_list(p_s_sb, jl, 1) ; | ||
3765 | flush_journal_list(p_s_sb, jl, 1) ; | ||
3766 | } else if (!(jl->j_state & LIST_COMMIT_PENDING)) /* LIST_COMMIT_PENDING is set in check_journal_end by a caller that will flush this list itself */ | ||
3767 | queue_delayed_work(commit_wq, &journal->j_work, HZ/10); | ||
3768 | |||
3769 | |||
3770 | /* if the next transaction has any chance of wrapping, flush | ||
3771 | ** transactions that might get overwritten. If any journal lists are very | ||
3772 | ** old flush them as well. | ||
3773 | ** the scan restarts from the head (goto first_jl) after every flush_used_journal_lists call | ||
3773 | */ | ||
3774 | first_jl: | ||
3775 | list_for_each_safe(entry, safe, &journal->j_journal_list) { | ||
3776 | temp_jl = JOURNAL_LIST_ENTRY(entry); | ||
3777 | if (journal->j_start <= temp_jl->j_start) { | ||
3778 | if ((journal->j_start + journal->j_trans_max + 1) >= | ||
3779 | temp_jl->j_start) | ||
3780 | { | ||
3781 | flush_used_journal_lists(p_s_sb, temp_jl); | ||
3782 | goto first_jl; | ||
3783 | } else if ((journal->j_start + | ||
3784 | journal->j_trans_max + 1) < | ||
3785 | SB_ONDISK_JOURNAL_SIZE(p_s_sb)) | ||
3786 | { | ||
3787 | /* if we don't cross into the next transaction and we don't | ||
3788 | * wrap, there is no way we can overlap any later transactions | ||
3789 | * break now | ||
3790 | */ | ||
3791 | break; | ||
3792 | } | ||
3793 | } else if ((journal->j_start + | ||
3794 | journal->j_trans_max + 1) > | ||
3795 | SB_ONDISK_JOURNAL_SIZE(p_s_sb)) | ||
3796 | { | ||
3797 | if (((journal->j_start + journal->j_trans_max + 1) % | ||
3798 | SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= temp_jl->j_start) | ||
3799 | { | ||
3800 | flush_used_journal_lists(p_s_sb, temp_jl); | ||
3801 | goto first_jl; | ||
3802 | } else { | ||
3803 | /* we don't overlap anything from out start to the end of the | ||
3804 | * log, and our wrapped portion doesn't overlap anything at | ||
3805 | * the start of the log. We can break | ||
3806 | */ | ||
3807 | break; | ||
3808 | } | ||
3809 | } | ||
3810 | } | ||
3811 | flush_old_journal_lists(p_s_sb); | ||
3812 | |||
3813 | journal->j_current_jl->j_list_bitmap = get_list_bitmap(p_s_sb, journal->j_current_jl) ; | ||
3814 | |||
3815 | if (!(journal->j_current_jl->j_list_bitmap)) { | ||
3816 | reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ; | ||
3817 | } | ||
3818 | |||
3819 | atomic_set(&(journal->j_jlock), 0) ; | ||
3820 | unlock_journal(p_s_sb) ; | ||
3821 | /* wake up any body waiting to join. */ | ||
3822 | clear_bit(J_WRITERS_QUEUED, &journal->j_state); | ||
3823 | wake_up(&(journal->j_join_wait)) ; | ||
3824 | |||
3825 | if (!flush && wait_on_commit && /* the simple commit, done outside the journal lock */ | ||
3826 | journal_list_still_alive(p_s_sb, commit_trans_id)) { | ||
3827 | flush_commit_list(p_s_sb, jl, 1) ; | ||
3828 | } | ||
3829 | out: | ||
3830 | reiserfs_check_lock_depth(p_s_sb, "journal end2"); | ||
3831 | |||
3832 | memset (th, 0, sizeof (*th)); | ||
3833 | /* Re-set th->t_super, so we can properly keep track of how many | ||
3834 | * persistent transactions there are. We need to do this so if this | ||
3835 | * call is part of a failed restart_transaction, we can free it later */ | ||
3836 | th->t_super = p_s_sb; | ||
3837 | |||
3838 | return journal->j_errno; | ||
3839 | } | ||
3840 | |||
3841 | static void | ||
3842 | __reiserfs_journal_abort_hard (struct super_block *sb) | ||
3843 | { | ||
3844 | struct reiserfs_journal *journal = SB_JOURNAL (sb); | ||
3845 | if (test_bit (J_ABORTED, &journal->j_state)) | ||
3846 | return; | ||
3847 | |||
3848 | printk (KERN_CRIT "REISERFS: Aborting journal for filesystem on %s\n", | ||
3849 | reiserfs_bdevname (sb)); | ||
3850 | |||
3851 | sb->s_flags |= MS_RDONLY; | ||
3852 | set_bit (J_ABORTED, &journal->j_state); | ||
3853 | |||
3854 | #ifdef CONFIG_REISERFS_CHECK | ||
3855 | dump_stack(); | ||
3856 | #endif | ||
3857 | } | ||
3858 | |||
3859 | static void | ||
3860 | __reiserfs_journal_abort_soft (struct super_block *sb, int errno) | ||
3861 | { | ||
3862 | struct reiserfs_journal *journal = SB_JOURNAL (sb); | ||
3863 | if (test_bit (J_ABORTED, &journal->j_state)) | ||
3864 | return; | ||
3865 | |||
3866 | if (!journal->j_errno) | ||
3867 | journal->j_errno = errno; | ||
3868 | |||
3869 | __reiserfs_journal_abort_hard (sb); | ||
3870 | } | ||
3871 | |||
/*
 * Public entry point for aborting the journal of sb with the given error
 * code; forwards to __reiserfs_journal_abort_soft().
 */
void
reiserfs_journal_abort(struct super_block *sb, int errno)
{
	/* a void function must not return an expression, so just call */
	__reiserfs_journal_abort_soft(sb, errno);
}
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c new file mode 100644 index 000000000000..2406608fc5cd --- /dev/null +++ b/fs/reiserfs/lbalance.c | |||
@@ -0,0 +1,1222 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | */ | ||
4 | |||
5 | #include <linux/config.h> | ||
6 | #include <asm/uaccess.h> | ||
7 | #include <linux/string.h> | ||
8 | #include <linux/time.h> | ||
9 | #include <linux/reiserfs_fs.h> | ||
10 | #include <linux/buffer_head.h> | ||
11 | |||
12 | /* these are used in do_balan.c */ | ||
13 | |||
14 | /* leaf_move_items | ||
15 | leaf_shift_left | ||
16 | leaf_shift_right | ||
17 | leaf_delete_items | ||
18 | leaf_insert_into_buf | ||
19 | leaf_paste_in_buffer | ||
20 | leaf_cut_from_buffer | ||
21 | leaf_paste_entries | ||
22 | */ | ||
23 | |||
24 | |||
25 | /* copy copy_count entries from source directory item to dest buffer (creating new item if needed) */ | ||
26 | static void leaf_copy_dir_entries (struct buffer_info * dest_bi, struct buffer_head * source, | ||
27 | int last_first, int item_num, int from, int copy_count) | ||
28 | { | ||
29 | struct buffer_head * dest = dest_bi->bi_bh; | ||
30 | int item_num_in_dest; /* either the number of target item, | ||
31 | or if we must create a new item, | ||
32 | the number of the item we will | ||
33 | create it next to */ | ||
34 | struct item_head * ih; | ||
35 | struct reiserfs_de_head * deh; | ||
36 | int copy_records_len; /* length of all records in item to be copied */ | ||
37 | char * records; | ||
38 | |||
39 | ih = B_N_PITEM_HEAD (source, item_num); | ||
40 | |||
41 | RFALSE( !is_direntry_le_ih (ih), "vs-10000: item must be directory item"); | ||
42 | |||
43 | /* length of all record to be copied and first byte of the last of them */ | ||
44 | deh = B_I_DEH (source, ih); | ||
45 | if (copy_count) { | ||
46 | copy_records_len = (from ? deh_location( &(deh[from - 1]) ) : | ||
47 | ih_item_len(ih)) - deh_location( &(deh[from + copy_count - 1])); | ||
48 | records = source->b_data + ih_location(ih) + | ||
49 | deh_location( &(deh[from + copy_count - 1])); | ||
50 | } else { | ||
51 | copy_records_len = 0; | ||
52 | records = NULL; | ||
53 | } | ||
54 | |||
55 | /* when copy last to first, dest buffer can contain 0 items */ | ||
56 | item_num_in_dest = (last_first == LAST_TO_FIRST) ? (( B_NR_ITEMS(dest) ) ? 0 : -1) : (B_NR_ITEMS(dest) - 1); | ||
57 | |||
58 | /* if there are no items in dest or the first/last item in dest is not item of the same directory */ | ||
59 | if ( (item_num_in_dest == - 1) || | ||
60 | (last_first == FIRST_TO_LAST && le_ih_k_offset (ih) == DOT_OFFSET) || | ||
61 | (last_first == LAST_TO_FIRST && comp_short_le_keys/*COMP_SHORT_KEYS*/ (&ih->ih_key, B_N_PKEY (dest, item_num_in_dest)))) { | ||
62 | /* create new item in dest */ | ||
63 | struct item_head new_ih; | ||
64 | |||
65 | /* form item header */ | ||
66 | memcpy (&new_ih.ih_key, &ih->ih_key, KEY_SIZE); | ||
67 | put_ih_version( &new_ih, KEY_FORMAT_3_5 ); | ||
68 | /* calculate item len */ | ||
69 | put_ih_item_len( &new_ih, DEH_SIZE * copy_count + copy_records_len ); | ||
70 | put_ih_entry_count( &new_ih, 0 ); | ||
71 | |||
72 | if (last_first == LAST_TO_FIRST) { | ||
73 | /* form key by the following way */ | ||
74 | if (from < I_ENTRY_COUNT(ih)) { | ||
75 | set_le_ih_k_offset( &new_ih, deh_offset( &(deh[from]) ) ); | ||
76 | /*memcpy (&new_ih.ih_key.k_offset, &deh[from].deh_offset, SHORT_KEY_SIZE);*/ | ||
77 | } else { | ||
78 | /* no entries will be copied to this item in this function */ | ||
79 | set_le_ih_k_offset (&new_ih, U32_MAX); | ||
80 | /* this item is not yet valid, but we want I_IS_DIRECTORY_ITEM to return 1 for it, so we -1 */ | ||
81 | } | ||
82 | set_le_key_k_type (KEY_FORMAT_3_5, &(new_ih.ih_key), TYPE_DIRENTRY); | ||
83 | } | ||
84 | |||
85 | /* insert item into dest buffer */ | ||
86 | leaf_insert_into_buf (dest_bi, (last_first == LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest), &new_ih, NULL, 0); | ||
87 | } else { | ||
88 | /* prepare space for entries */ | ||
89 | leaf_paste_in_buffer (dest_bi, (last_first==FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0, MAX_US_INT, | ||
90 | DEH_SIZE * copy_count + copy_records_len, records, 0 | ||
91 | ); | ||
92 | } | ||
93 | |||
94 | item_num_in_dest = (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest)-1) : 0; | ||
95 | |||
96 | leaf_paste_entries (dest_bi->bi_bh, item_num_in_dest, | ||
97 | (last_first == FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD (dest, item_num_in_dest)) : 0, | ||
98 | copy_count, deh + from, records, | ||
99 | DEH_SIZE * copy_count + copy_records_len | ||
100 | ); | ||
101 | } | ||
102 | |||
103 | |||
104 | /* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or | ||
105 | part of it or nothing (see the return 0 below) from SOURCE to the end | ||
106 | (if last_first) or beginning (!last_first) of the DEST */ | ||
107 | /* returns 1 if anything was copied, else 0 */ | ||
108 | static int leaf_copy_boundary_item (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, | ||
109 | int bytes_or_entries) | ||
110 | { | ||
111 | struct buffer_head * dest = dest_bi->bi_bh; | ||
112 | int dest_nr_item, src_nr_item; /* number of items in the source and destination buffers */ | ||
113 | struct item_head * ih; | ||
114 | struct item_head * dih; | ||
115 | |||
116 | dest_nr_item = B_NR_ITEMS(dest); | ||
117 | |||
118 | if ( last_first == FIRST_TO_LAST ) { | ||
119 | /* if ( DEST is empty or first item of SOURCE and last item of DEST are the items of different objects | ||
120 | or of different types ) then there is no need to treat this item differently from the other items | ||
121 | that we copy, so we return */ | ||
122 | ih = B_N_PITEM_HEAD (src, 0); | ||
123 | dih = B_N_PITEM_HEAD (dest, dest_nr_item - 1); | ||
124 | if (!dest_nr_item || (!op_is_left_mergeable (&(ih->ih_key), src->b_size))) | ||
125 | /* there is nothing to merge */ | ||
126 | return 0; | ||
127 | |||
128 | RFALSE( ! ih_item_len(ih), "vs-10010: item can not have empty length"); | ||
129 | |||
130 | if ( is_direntry_le_ih (ih) ) { | ||
131 | if ( bytes_or_entries == -1 ) | ||
132 | /* copy all entries to dest */ | ||
133 | bytes_or_entries = ih_entry_count(ih); | ||
134 | leaf_copy_dir_entries (dest_bi, src, FIRST_TO_LAST, 0, 0, bytes_or_entries); | ||
135 | return 1; | ||
136 | } | ||
137 | |||
138 | /* copy part of the body of the first item of SOURCE to the end of the body of the last item of the DEST | ||
139 | part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; don't create new item header | ||
140 | */ | ||
141 | if ( bytes_or_entries == -1 ) | ||
142 | bytes_or_entries = ih_item_len(ih); | ||
143 | |||
144 | #ifdef CONFIG_REISERFS_CHECK | ||
145 | else { | ||
146 | if (bytes_or_entries == ih_item_len(ih) && is_indirect_le_ih(ih)) | ||
147 | if (get_ih_free_space (ih)) | ||
148 | reiserfs_panic (NULL, "vs-10020: leaf_copy_boundary_item: " | ||
149 | "last unformatted node must be filled entirely (%h)", | ||
150 | ih); | ||
151 | } | ||
152 | #endif | ||
153 | |||
154 | /* merge first item (or its part) of src buffer with the last | ||
155 | item of dest buffer. Both are of the same file */ | ||
156 | leaf_paste_in_buffer (dest_bi, | ||
157 | dest_nr_item - 1, ih_item_len(dih), bytes_or_entries, B_I_PITEM(src,ih), 0 | ||
158 | ); | ||
159 | |||
160 | if (is_indirect_le_ih (dih)) { | ||
161 | RFALSE( get_ih_free_space (dih), | ||
162 | "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space", | ||
163 | ih); | ||
164 | if (bytes_or_entries == ih_item_len(ih)) | ||
165 | set_ih_free_space (dih, get_ih_free_space (ih)); | ||
166 | } | ||
167 | |||
168 | return 1; | ||
169 | } | ||
170 | |||
171 | |||
172 | /* copy boundary item to right (last_first == LAST_TO_FIRST) */ | ||
173 | |||
174 | /* ( DEST is empty or last item of SOURCE and first item of DEST | ||
175 | are the items of different object or of different types ) | ||
176 | */ | ||
177 | src_nr_item = B_NR_ITEMS (src); | ||
178 | ih = B_N_PITEM_HEAD (src, src_nr_item - 1); | ||
179 | dih = B_N_PITEM_HEAD (dest, 0); | ||
180 | |||
181 | if (!dest_nr_item || !op_is_left_mergeable (&(dih->ih_key), src->b_size)) | ||
182 | return 0; | ||
183 | |||
184 | if ( is_direntry_le_ih (ih)) { | ||
185 | if ( bytes_or_entries == -1 ) | ||
186 | /* bytes_or_entries = entries number in last item body of SOURCE */ | ||
187 | bytes_or_entries = ih_entry_count(ih); | ||
188 | |||
189 | leaf_copy_dir_entries (dest_bi, src, LAST_TO_FIRST, src_nr_item - 1, ih_entry_count(ih) - bytes_or_entries, bytes_or_entries); | ||
190 | return 1; | ||
191 | } | ||
192 | |||
193 | /* copy part of the body of the last item of SOURCE to the begin of the body of the first item of the DEST; | ||
194 | part defined by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; change first item key of the DEST; | ||
195 | don't create new item header | ||
196 | */ | ||
197 | |||
198 | RFALSE( is_indirect_le_ih(ih) && get_ih_free_space (ih), | ||
199 | "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)", | ||
200 | ih); | ||
201 | |||
202 | if ( bytes_or_entries == -1 ) { | ||
203 | /* bytes_or_entries = length of last item body of SOURCE */ | ||
204 | bytes_or_entries = ih_item_len(ih); | ||
205 | |||
206 | RFALSE( le_ih_k_offset (dih) != | ||
207 | le_ih_k_offset (ih) + op_bytes_number (ih, src->b_size), | ||
208 | "vs-10050: items %h and %h do not match", ih, dih); | ||
209 | |||
210 | /* change first item key of the DEST */ | ||
211 | set_le_ih_k_offset (dih, le_ih_k_offset (ih)); | ||
212 | |||
213 | /* item becomes non-mergeable */ | ||
214 | /* or mergeable if left item was */ | ||
215 | set_le_ih_k_type (dih, le_ih_k_type (ih)); | ||
216 | } else { | ||
217 | /* merge to right only part of item */ | ||
218 | RFALSE( ih_item_len(ih) <= bytes_or_entries, | ||
219 | "vs-10060: no so much bytes %lu (needed %lu)", | ||
220 | ( unsigned long )ih_item_len(ih), ( unsigned long )bytes_or_entries); | ||
221 | |||
222 | /* change first item key of the DEST */ | ||
223 | if ( is_direct_le_ih (dih) ) { | ||
224 | RFALSE( le_ih_k_offset (dih) <= (unsigned long)bytes_or_entries, | ||
225 | "vs-10070: dih %h, bytes_or_entries(%d)", dih, bytes_or_entries); | ||
226 | set_le_ih_k_offset (dih, le_ih_k_offset (dih) - bytes_or_entries); | ||
227 | } else { | ||
228 | RFALSE( le_ih_k_offset (dih) <= | ||
229 | (bytes_or_entries / UNFM_P_SIZE) * dest->b_size, | ||
230 | "vs-10080: dih %h, bytes_or_entries(%d)", | ||
231 | dih, (bytes_or_entries/UNFM_P_SIZE)*dest->b_size); | ||
232 | set_le_ih_k_offset (dih, le_ih_k_offset (dih) - ((bytes_or_entries / UNFM_P_SIZE) * dest->b_size)); | ||
233 | } | ||
234 | } | ||
235 | |||
236 | leaf_paste_in_buffer (dest_bi, 0, 0, bytes_or_entries, B_I_PITEM(src,ih) + ih_item_len(ih) - bytes_or_entries, 0); | ||
237 | return 1; | ||
238 | } | ||
239 | |||
240 | |||
241 | /* copy cpy_mun items from buffer src to buffer dest | ||
242 | * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning from first-th item in src to tail of dest | ||
243 | * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning from first-th item in src to head of dest | ||
244 | */ | ||
245 | static void leaf_copy_items_entirely (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, | ||
246 | int first, int cpy_num) | ||
247 | { | ||
248 | struct buffer_head * dest; | ||
249 | int nr, free_space; | ||
250 | int dest_before; | ||
251 | int last_loc, last_inserted_loc, location; | ||
252 | int i, j; | ||
253 | struct block_head * blkh; | ||
254 | struct item_head * ih; | ||
255 | |||
256 | RFALSE( last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST, | ||
257 | "vs-10090: bad last_first parameter %d", last_first); | ||
258 | RFALSE( B_NR_ITEMS (src) - first < cpy_num, | ||
259 | "vs-10100: too few items in source %d, required %d from %d", | ||
260 | B_NR_ITEMS(src), cpy_num, first); | ||
261 | RFALSE( cpy_num < 0, "vs-10110: can not copy negative amount of items"); | ||
262 | RFALSE( ! dest_bi, "vs-10120: can not copy negative amount of items"); | ||
263 | |||
264 | dest = dest_bi->bi_bh; | ||
265 | |||
266 | RFALSE( ! dest, "vs-10130: can not copy negative amount of items"); | ||
267 | |||
268 | if (cpy_num == 0) | ||
269 | return; | ||
270 | |||
271 | blkh = B_BLK_HEAD(dest); | ||
272 | nr = blkh_nr_item( blkh ); | ||
273 | free_space = blkh_free_space(blkh); | ||
274 | |||
275 | /* we will insert items before 0-th or nr-th item in dest buffer. It depends of last_first parameter */ | ||
276 | dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr; | ||
277 | |||
278 | /* location of head of first new item */ | ||
279 | ih = B_N_PITEM_HEAD (dest, dest_before); | ||
280 | |||
281 | RFALSE( blkh_free_space(blkh) < cpy_num * IH_SIZE, | ||
282 | "vs-10140: not enough free space for headers %d (needed %d)", | ||
283 | B_FREE_SPACE (dest), cpy_num * IH_SIZE); | ||
284 | |||
285 | /* prepare space for headers */ | ||
286 | memmove (ih + cpy_num, ih, (nr-dest_before) * IH_SIZE); | ||
287 | |||
288 | /* copy item headers */ | ||
289 | memcpy (ih, B_N_PITEM_HEAD (src, first), cpy_num * IH_SIZE); | ||
290 | |||
291 | free_space -= (IH_SIZE * cpy_num); | ||
292 | set_blkh_free_space( blkh, free_space ); | ||
293 | |||
294 | /* location of unmovable item */ | ||
295 | j = location = (dest_before == 0) ? dest->b_size : ih_location(ih-1); | ||
296 | for (i = dest_before; i < nr + cpy_num; i ++) { | ||
297 | location -= ih_item_len( ih + i - dest_before ); | ||
298 | put_ih_location( ih + i - dest_before, location ); | ||
299 | } | ||
300 | |||
301 | /* prepare space for items */ | ||
302 | last_loc = ih_location( &(ih[nr+cpy_num-1-dest_before]) ); | ||
303 | last_inserted_loc = ih_location( &(ih[cpy_num-1]) ); | ||
304 | |||
305 | /* check free space */ | ||
306 | RFALSE( free_space < j - last_inserted_loc, | ||
307 | "vs-10150: not enough free space for items %d (needed %d)", | ||
308 | free_space, j - last_inserted_loc); | ||
309 | |||
310 | memmove (dest->b_data + last_loc, | ||
311 | dest->b_data + last_loc + j - last_inserted_loc, | ||
312 | last_inserted_loc - last_loc); | ||
313 | |||
314 | /* copy items */ | ||
315 | memcpy (dest->b_data + last_inserted_loc, B_N_PITEM(src,(first + cpy_num - 1)), | ||
316 | j - last_inserted_loc); | ||
317 | |||
318 | /* sizes, item number */ | ||
319 | set_blkh_nr_item( blkh, nr + cpy_num ); | ||
320 | set_blkh_free_space( blkh, free_space - (j - last_inserted_loc) ); | ||
321 | |||
322 | do_balance_mark_leaf_dirty (dest_bi->tb, dest, 0); | ||
323 | |||
324 | if (dest_bi->bi_parent) { | ||
325 | struct disk_child *t_dc; | ||
326 | t_dc = B_N_CHILD (dest_bi->bi_parent, dest_bi->bi_position); | ||
327 | RFALSE( dc_block_number(t_dc) != dest->b_blocknr, | ||
328 | "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu", | ||
329 | ( long unsigned ) dest->b_blocknr, | ||
330 | ( long unsigned ) dc_block_number(t_dc)); | ||
331 | put_dc_size( t_dc, dc_size(t_dc) + (j - last_inserted_loc + IH_SIZE * cpy_num ) ); | ||
332 | |||
333 | do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent, 0); | ||
334 | } | ||
335 | } | ||
336 | |||
337 | |||
338 | /* This function splits the (liquid) item into two items (useful when | ||
339 | shifting part of an item into another node.) */ | ||
340 | static void leaf_item_bottle (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, | ||
341 | int item_num, int cpy_bytes) | ||
342 | { | ||
343 | struct buffer_head * dest = dest_bi->bi_bh; | ||
344 | struct item_head * ih; | ||
345 | |||
346 | RFALSE( cpy_bytes == -1, "vs-10170: bytes == - 1 means: do not split item"); | ||
347 | |||
348 | if ( last_first == FIRST_TO_LAST ) { | ||
349 | /* if ( if item in position item_num in buffer SOURCE is directory item ) */ | ||
350 | if (is_direntry_le_ih (ih = B_N_PITEM_HEAD(src,item_num))) | ||
351 | leaf_copy_dir_entries (dest_bi, src, FIRST_TO_LAST, item_num, 0, cpy_bytes); | ||
352 | else { | ||
353 | struct item_head n_ih; | ||
354 | |||
355 | /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST | ||
356 | part defined by 'cpy_bytes'; create new item header; change old item_header (????); | ||
357 | n_ih = new item_header; | ||
358 | */ | ||
359 | memcpy (&n_ih, ih, IH_SIZE); | ||
360 | put_ih_item_len( &n_ih, cpy_bytes ); | ||
361 | if (is_indirect_le_ih (ih)) { | ||
362 | RFALSE( cpy_bytes == ih_item_len(ih) && get_ih_free_space(ih), | ||
363 | "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)", | ||
364 | ( long unsigned ) get_ih_free_space (ih)); | ||
365 | set_ih_free_space (&n_ih, 0); | ||
366 | } | ||
367 | |||
368 | RFALSE( op_is_left_mergeable (&(ih->ih_key), src->b_size), | ||
369 | "vs-10190: bad mergeability of item %h", ih); | ||
370 | n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ | ||
371 | leaf_insert_into_buf (dest_bi, B_NR_ITEMS(dest), &n_ih, B_N_PITEM (src, item_num), 0); | ||
372 | } | ||
373 | } else { | ||
374 | /* if ( if item in position item_num in buffer SOURCE is directory item ) */ | ||
375 | if (is_direntry_le_ih(ih = B_N_PITEM_HEAD (src, item_num))) | ||
376 | leaf_copy_dir_entries (dest_bi, src, LAST_TO_FIRST, item_num, I_ENTRY_COUNT(ih) - cpy_bytes, cpy_bytes); | ||
377 | else { | ||
378 | struct item_head n_ih; | ||
379 | |||
380 | /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST | ||
381 | part defined by 'cpy_bytes'; create new item header; | ||
382 | n_ih = new item_header; | ||
383 | */ | ||
384 | memcpy (&n_ih, ih, SHORT_KEY_SIZE); | ||
385 | |||
386 | n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ | ||
387 | |||
388 | if (is_direct_le_ih (ih)) { | ||
389 | set_le_ih_k_offset (&n_ih, le_ih_k_offset (ih) + ih_item_len(ih) - cpy_bytes); | ||
390 | set_le_ih_k_type (&n_ih, TYPE_DIRECT); | ||
391 | set_ih_free_space (&n_ih, MAX_US_INT); | ||
392 | } else { | ||
393 | /* indirect item */ | ||
394 | RFALSE( !cpy_bytes && get_ih_free_space (ih), | ||
395 | "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended"); | ||
396 | set_le_ih_k_offset (&n_ih, le_ih_k_offset (ih) + (ih_item_len(ih) - cpy_bytes) / UNFM_P_SIZE * dest->b_size); | ||
397 | set_le_ih_k_type (&n_ih, TYPE_INDIRECT); | ||
398 | set_ih_free_space (&n_ih, get_ih_free_space (ih)); | ||
399 | } | ||
400 | |||
401 | /* set item length */ | ||
402 | put_ih_item_len( &n_ih, cpy_bytes ); | ||
403 | |||
404 | n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ | ||
405 | |||
406 | leaf_insert_into_buf (dest_bi, 0, &n_ih, B_N_PITEM(src,item_num) + ih_item_len(ih) - cpy_bytes, 0); | ||
407 | } | ||
408 | } | ||
409 | } | ||
410 | |||
411 | |||
412 | /* If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE to DEST. | ||
413 | If cpy_bytes not equal to minus one than copy cpy_num-1 whole items from SOURCE to DEST. | ||
414 | From last item copy cpy_num bytes for regular item and cpy_num directory entries for | ||
415 | directory item. */ | ||
416 | static int leaf_copy_items (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, int cpy_num, | ||
417 | int cpy_bytes) | ||
418 | { | ||
419 | struct buffer_head * dest; | ||
420 | int pos, i, src_nr_item, bytes; | ||
421 | |||
422 | dest = dest_bi->bi_bh; | ||
423 | RFALSE( !dest || !src, "vs-10210: !dest || !src"); | ||
424 | RFALSE( last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, | ||
425 | "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST"); | ||
426 | RFALSE( B_NR_ITEMS(src) < cpy_num, | ||
427 | "vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src), cpy_num); | ||
428 | RFALSE( cpy_num < 0,"vs-10240: cpy_num < 0 (%d)", cpy_num); | ||
429 | |||
430 | if ( cpy_num == 0 ) | ||
431 | return 0; | ||
432 | |||
433 | if ( last_first == FIRST_TO_LAST ) { | ||
434 | /* copy items to left */ | ||
435 | pos = 0; | ||
436 | if ( cpy_num == 1 ) | ||
437 | bytes = cpy_bytes; | ||
438 | else | ||
439 | bytes = -1; | ||
440 | |||
441 | /* copy the first item or it part or nothing to the end of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) */ | ||
442 | i = leaf_copy_boundary_item (dest_bi, src, FIRST_TO_LAST, bytes); | ||
443 | cpy_num -= i; | ||
444 | if ( cpy_num == 0 ) | ||
445 | return i; | ||
446 | pos += i; | ||
447 | if ( cpy_bytes == -1 ) | ||
448 | /* copy first cpy_num items starting from position 'pos' of SOURCE to end of DEST */ | ||
449 | leaf_copy_items_entirely (dest_bi, src, FIRST_TO_LAST, pos, cpy_num); | ||
450 | else { | ||
451 | /* copy first cpy_num-1 items starting from position 'pos-1' of the SOURCE to the end of the DEST */ | ||
452 | leaf_copy_items_entirely (dest_bi, src, FIRST_TO_LAST, pos, cpy_num-1); | ||
453 | |||
454 | /* copy part of the item which number is cpy_num+pos-1 to the end of the DEST */ | ||
455 | leaf_item_bottle (dest_bi, src, FIRST_TO_LAST, cpy_num+pos-1, cpy_bytes); | ||
456 | } | ||
457 | } else { | ||
458 | /* copy items to right */ | ||
459 | src_nr_item = B_NR_ITEMS (src); | ||
460 | if ( cpy_num == 1 ) | ||
461 | bytes = cpy_bytes; | ||
462 | else | ||
463 | bytes = -1; | ||
464 | |||
465 | /* copy the last item or it part or nothing to the begin of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); */ | ||
466 | i = leaf_copy_boundary_item (dest_bi, src, LAST_TO_FIRST, bytes); | ||
467 | |||
468 | cpy_num -= i; | ||
469 | if ( cpy_num == 0 ) | ||
470 | return i; | ||
471 | |||
472 | pos = src_nr_item - cpy_num - i; | ||
473 | if ( cpy_bytes == -1 ) { | ||
474 | /* starting from position 'pos' copy last cpy_num items of SOURCE to begin of DEST */ | ||
475 | leaf_copy_items_entirely (dest_bi, src, LAST_TO_FIRST, pos, cpy_num); | ||
476 | } else { | ||
477 | /* copy last cpy_num-1 items starting from position 'pos+1' of the SOURCE to the begin of the DEST; */ | ||
478 | leaf_copy_items_entirely (dest_bi, src, LAST_TO_FIRST, pos+1, cpy_num-1); | ||
479 | |||
480 | /* copy part of the item which number is pos to the begin of the DEST */ | ||
481 | leaf_item_bottle (dest_bi, src, LAST_TO_FIRST, pos, cpy_bytes); | ||
482 | } | ||
483 | } | ||
484 | return i; | ||
485 | } | ||
486 | |||
487 | |||
488 | /* there are types of coping: from S[0] to L[0], from S[0] to R[0], | ||
489 | from R[0] to L[0]. for each of these we have to define parent and | ||
490 | positions of destination and source buffers */ | ||
491 | static void leaf_define_dest_src_infos (int shift_mode, struct tree_balance * tb, struct buffer_info * dest_bi, | ||
492 | struct buffer_info * src_bi, int * first_last, | ||
493 | struct buffer_head * Snew) | ||
494 | { | ||
495 | memset (dest_bi, 0, sizeof (struct buffer_info)); | ||
496 | memset (src_bi, 0, sizeof (struct buffer_info)); | ||
497 | |||
498 | /* define dest, src, dest parent, dest position */ | ||
499 | switch (shift_mode) { | ||
500 | case LEAF_FROM_S_TO_L: /* it is used in leaf_shift_left */ | ||
501 | src_bi->tb = tb; | ||
502 | src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path); | ||
503 | src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0); | ||
504 | src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0); /* src->b_item_order */ | ||
505 | dest_bi->tb = tb; | ||
506 | dest_bi->bi_bh = tb->L[0]; | ||
507 | dest_bi->bi_parent = tb->FL[0]; | ||
508 | dest_bi->bi_position = get_left_neighbor_position (tb, 0); | ||
509 | *first_last = FIRST_TO_LAST; | ||
510 | break; | ||
511 | |||
512 | case LEAF_FROM_S_TO_R: /* it is used in leaf_shift_right */ | ||
513 | src_bi->tb = tb; | ||
514 | src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path); | ||
515 | src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0); | ||
516 | src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0); | ||
517 | dest_bi->tb = tb; | ||
518 | dest_bi->bi_bh = tb->R[0]; | ||
519 | dest_bi->bi_parent = tb->FR[0]; | ||
520 | dest_bi->bi_position = get_right_neighbor_position (tb, 0); | ||
521 | *first_last = LAST_TO_FIRST; | ||
522 | break; | ||
523 | |||
524 | case LEAF_FROM_R_TO_L: /* it is used in balance_leaf_when_delete */ | ||
525 | src_bi->tb = tb; | ||
526 | src_bi->bi_bh = tb->R[0]; | ||
527 | src_bi->bi_parent = tb->FR[0]; | ||
528 | src_bi->bi_position = get_right_neighbor_position (tb, 0); | ||
529 | dest_bi->tb = tb; | ||
530 | dest_bi->bi_bh = tb->L[0]; | ||
531 | dest_bi->bi_parent = tb->FL[0]; | ||
532 | dest_bi->bi_position = get_left_neighbor_position (tb, 0); | ||
533 | *first_last = FIRST_TO_LAST; | ||
534 | break; | ||
535 | |||
536 | case LEAF_FROM_L_TO_R: /* it is used in balance_leaf_when_delete */ | ||
537 | src_bi->tb = tb; | ||
538 | src_bi->bi_bh = tb->L[0]; | ||
539 | src_bi->bi_parent = tb->FL[0]; | ||
540 | src_bi->bi_position = get_left_neighbor_position (tb, 0); | ||
541 | dest_bi->tb = tb; | ||
542 | dest_bi->bi_bh = tb->R[0]; | ||
543 | dest_bi->bi_parent = tb->FR[0]; | ||
544 | dest_bi->bi_position = get_right_neighbor_position (tb, 0); | ||
545 | *first_last = LAST_TO_FIRST; | ||
546 | break; | ||
547 | |||
548 | case LEAF_FROM_S_TO_SNEW: | ||
549 | src_bi->tb = tb; | ||
550 | src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path); | ||
551 | src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0); | ||
552 | src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0); | ||
553 | dest_bi->tb = tb; | ||
554 | dest_bi->bi_bh = Snew; | ||
555 | dest_bi->bi_parent = NULL; | ||
556 | dest_bi->bi_position = 0; | ||
557 | *first_last = LAST_TO_FIRST; | ||
558 | break; | ||
559 | |||
560 | default: | ||
561 | reiserfs_panic (NULL, "vs-10250: leaf_define_dest_src_infos: shift type is unknown (%d)", shift_mode); | ||
562 | } | ||
563 | RFALSE( src_bi->bi_bh == 0 || dest_bi->bi_bh == 0, | ||
564 | "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly", | ||
565 | shift_mode, src_bi->bi_bh, dest_bi->bi_bh); | ||
566 | } | ||
567 | |||
568 | |||
569 | |||
570 | |||
571 | /* copy mov_num items and mov_bytes of the (mov_num-1)th item to | ||
572 | neighbor. Delete them from source */ | ||
573 | int leaf_move_items (int shift_mode, struct tree_balance * tb, int mov_num, int mov_bytes, struct buffer_head * Snew) | ||
574 | { | ||
575 | int ret_value; | ||
576 | struct buffer_info dest_bi, src_bi; | ||
577 | int first_last; | ||
578 | |||
579 | leaf_define_dest_src_infos (shift_mode, tb, &dest_bi, &src_bi, &first_last, Snew); | ||
580 | |||
581 | ret_value = leaf_copy_items (&dest_bi, src_bi.bi_bh, first_last, mov_num, mov_bytes); | ||
582 | |||
583 | leaf_delete_items (&src_bi, first_last, (first_last == FIRST_TO_LAST) ? 0 : (B_NR_ITEMS(src_bi.bi_bh) - mov_num), mov_num, mov_bytes); | ||
584 | |||
585 | |||
586 | return ret_value; | ||
587 | } | ||
588 | |||
589 | |||
590 | /* Shift shift_num items (and shift_bytes of last shifted item if shift_bytes != -1) | ||
591 | from S[0] to L[0] and replace the delimiting key */ | ||
592 | int leaf_shift_left (struct tree_balance * tb, int shift_num, int shift_bytes) | ||
593 | { | ||
594 | struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path); | ||
595 | int i; | ||
596 | |||
597 | /* move shift_num (and shift_bytes bytes) items from S[0] to left neighbor L[0] */ | ||
598 | i = leaf_move_items (LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL); | ||
599 | |||
600 | if ( shift_num ) { | ||
601 | if (B_NR_ITEMS (S0) == 0) { /* number of items in S[0] == 0 */ | ||
602 | |||
603 | RFALSE( shift_bytes != -1, | ||
604 | "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)", | ||
605 | shift_bytes); | ||
606 | #ifdef CONFIG_REISERFS_CHECK | ||
607 | if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) { | ||
608 | print_cur_tb ("vs-10275"); | ||
609 | reiserfs_panic (tb->tb_sb, "vs-10275: leaf_shift_left: balance condition corrupted (%c)", tb->tb_mode); | ||
610 | } | ||
611 | #endif | ||
612 | |||
613 | if (PATH_H_POSITION (tb->tb_path, 1) == 0) | ||
614 | replace_key (tb, tb->CFL[0], tb->lkey[0], PATH_H_PPARENT (tb->tb_path, 0), 0); | ||
615 | |||
616 | } else { | ||
617 | /* replace lkey in CFL[0] by 0-th key from S[0]; */ | ||
618 | replace_key (tb, tb->CFL[0], tb->lkey[0], S0, 0); | ||
619 | |||
620 | RFALSE( (shift_bytes != -1 && | ||
621 | !(is_direntry_le_ih (B_N_PITEM_HEAD (S0, 0)) | ||
622 | && !I_ENTRY_COUNT (B_N_PITEM_HEAD (S0, 0)))) && | ||
623 | (!op_is_left_mergeable (B_N_PKEY (S0, 0), S0->b_size)), | ||
624 | "vs-10280: item must be mergeable"); | ||
625 | } | ||
626 | } | ||
627 | |||
628 | return i; | ||
629 | } | ||
630 | |||
631 | |||
632 | |||
633 | |||
634 | |||
635 | /* CLEANING STOPPED HERE */ | ||
636 | |||
637 | |||
638 | |||
639 | |||
640 | /* Shift shift_num (shift_bytes) items from S[0] to the right neighbor, and replace the delimiting key */ | ||
641 | int leaf_shift_right( | ||
642 | struct tree_balance * tb, | ||
643 | int shift_num, | ||
644 | int shift_bytes | ||
645 | ) | ||
646 | { | ||
647 | // struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path); | ||
648 | int ret_value; | ||
649 | |||
650 | /* move shift_num (and shift_bytes) items from S[0] to right neighbor R[0] */ | ||
651 | ret_value = leaf_move_items (LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL); | ||
652 | |||
653 | /* replace rkey in CFR[0] by the 0-th key from R[0] */ | ||
654 | if (shift_num) { | ||
655 | replace_key (tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); | ||
656 | |||
657 | } | ||
658 | |||
659 | return ret_value; | ||
660 | } | ||
661 | |||
662 | |||
663 | |||
664 | static void leaf_delete_items_entirely (struct buffer_info * bi, | ||
665 | int first, int del_num); | ||
666 | /* If del_bytes == -1, starting from position 'first' delete del_num items in whole in buffer CUR. | ||
667 | If not. | ||
668 | If last_first == 0. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of | ||
669 | the first item. Part defined by del_bytes. Don't delete first item header | ||
670 | If last_first == 1. Starting from position 'first+1' delete del_num-1 items in whole. Delete part of body of | ||
671 | the last item . Part defined by del_bytes. Don't delete last item header. | ||
672 | */ | ||
673 | void leaf_delete_items (struct buffer_info * cur_bi, int last_first, | ||
674 | int first, int del_num, int del_bytes) | ||
675 | { | ||
676 | struct buffer_head * bh; | ||
677 | int item_amount = B_NR_ITEMS (bh = cur_bi->bi_bh); | ||
678 | |||
679 | RFALSE( !bh, "10155: bh is not defined"); | ||
680 | RFALSE( del_num < 0, "10160: del_num can not be < 0. del_num==%d", del_num); | ||
681 | RFALSE( first < 0 || first + del_num > item_amount, | ||
682 | "10165: invalid number of first item to be deleted (%d) or " | ||
683 | "no so much items (%d) to delete (only %d)", | ||
684 | first, first + del_num, item_amount); | ||
685 | |||
686 | if ( del_num == 0 ) | ||
687 | return; | ||
688 | |||
689 | if ( first == 0 && del_num == item_amount && del_bytes == -1 ) { | ||
690 | make_empty_node (cur_bi); | ||
691 | do_balance_mark_leaf_dirty (cur_bi->tb, bh, 0); | ||
692 | return; | ||
693 | } | ||
694 | |||
695 | if ( del_bytes == -1 ) | ||
696 | /* delete del_num items beginning from item in position first */ | ||
697 | leaf_delete_items_entirely (cur_bi, first, del_num); | ||
698 | else { | ||
699 | if ( last_first == FIRST_TO_LAST ) { | ||
700 | /* delete del_num-1 items beginning from item in position first */ | ||
701 | leaf_delete_items_entirely (cur_bi, first, del_num-1); | ||
702 | |||
703 | /* delete the part of the first item of the bh | ||
704 | do not delete item header | ||
705 | */ | ||
706 | leaf_cut_from_buffer (cur_bi, 0, 0, del_bytes); | ||
707 | } else { | ||
708 | struct item_head * ih; | ||
709 | int len; | ||
710 | |||
711 | /* delete del_num-1 items beginning from item in position first+1 */ | ||
712 | leaf_delete_items_entirely (cur_bi, first+1, del_num-1); | ||
713 | |||
714 | if (is_direntry_le_ih (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh)-1))) /* the last item is directory */ | ||
715 | /* len = numbers of directory entries in this item */ | ||
716 | len = ih_entry_count(ih); | ||
717 | else | ||
718 | /* len = body len of item */ | ||
719 | len = ih_item_len(ih); | ||
720 | |||
721 | /* delete the part of the last item of the bh | ||
722 | do not delete item header | ||
723 | */ | ||
724 | leaf_cut_from_buffer (cur_bi, B_NR_ITEMS(bh)-1, len - del_bytes, del_bytes); | ||
725 | } | ||
726 | } | ||
727 | } | ||
728 | |||
729 | |||
730 | /* insert item into the leaf node in position before */ | ||
731 | void leaf_insert_into_buf (struct buffer_info * bi, int before, | ||
732 | struct item_head * inserted_item_ih, | ||
733 | const char * inserted_item_body, | ||
734 | int zeros_number) | ||
735 | { | ||
736 | struct buffer_head * bh = bi->bi_bh; | ||
737 | int nr, free_space; | ||
738 | struct block_head * blkh; | ||
739 | struct item_head * ih; | ||
740 | int i; | ||
741 | int last_loc, unmoved_loc; | ||
742 | char * to; | ||
743 | |||
744 | |||
745 | blkh = B_BLK_HEAD(bh); | ||
746 | nr = blkh_nr_item(blkh); | ||
747 | free_space = blkh_free_space( blkh ); | ||
748 | |||
749 | /* check free space */ | ||
750 | RFALSE( free_space < ih_item_len(inserted_item_ih) + IH_SIZE, | ||
751 | "vs-10170: not enough free space in block %z, new item %h", | ||
752 | bh, inserted_item_ih); | ||
753 | RFALSE( zeros_number > ih_item_len(inserted_item_ih), | ||
754 | "vs-10172: zero number == %d, item length == %d", | ||
755 | zeros_number, ih_item_len(inserted_item_ih)); | ||
756 | |||
757 | |||
758 | /* get item new item must be inserted before */ | ||
759 | ih = B_N_PITEM_HEAD (bh, before); | ||
760 | |||
761 | /* prepare space for the body of new item */ | ||
762 | last_loc = nr ? ih_location( &(ih[nr - before - 1]) ) : bh->b_size; | ||
763 | unmoved_loc = before ? ih_location( ih-1 ) : bh->b_size; | ||
764 | |||
765 | |||
766 | memmove (bh->b_data + last_loc - ih_item_len(inserted_item_ih), | ||
767 | bh->b_data + last_loc, unmoved_loc - last_loc); | ||
768 | |||
769 | to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih); | ||
770 | memset (to, 0, zeros_number); | ||
771 | to += zeros_number; | ||
772 | |||
773 | /* copy body to prepared space */ | ||
774 | if (inserted_item_body) | ||
775 | memmove (to, inserted_item_body, ih_item_len(inserted_item_ih) - zeros_number); | ||
776 | else | ||
777 | memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number); | ||
778 | |||
779 | /* insert item header */ | ||
780 | memmove (ih + 1, ih, IH_SIZE * (nr - before)); | ||
781 | memmove (ih, inserted_item_ih, IH_SIZE); | ||
782 | |||
783 | /* change locations */ | ||
784 | for (i = before; i < nr + 1; i ++) | ||
785 | { | ||
786 | unmoved_loc -= ih_item_len( &(ih[i-before])); | ||
787 | put_ih_location( &(ih[i-before]), unmoved_loc ); | ||
788 | } | ||
789 | |||
790 | /* sizes, free space, item number */ | ||
791 | set_blkh_nr_item( blkh, blkh_nr_item(blkh) + 1 ); | ||
792 | set_blkh_free_space( blkh, | ||
793 | free_space - (IH_SIZE + ih_item_len(inserted_item_ih ) ) ); | ||
794 | do_balance_mark_leaf_dirty (bi->tb, bh, 1); | ||
795 | |||
796 | if (bi->bi_parent) { | ||
797 | struct disk_child *t_dc; | ||
798 | t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position); | ||
799 | put_dc_size( t_dc, dc_size(t_dc) + (IH_SIZE + ih_item_len(inserted_item_ih))); | ||
800 | do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); | ||
801 | } | ||
802 | } | ||
803 | |||
804 | |||
805 | /* paste paste_size bytes to affected_item_num-th item. | ||
806 | When item is a directory, this only prepare space for new entries */ | ||
807 | void leaf_paste_in_buffer (struct buffer_info * bi, int affected_item_num, | ||
808 | int pos_in_item, int paste_size, | ||
809 | const char * body, | ||
810 | int zeros_number) | ||
811 | { | ||
812 | struct buffer_head * bh = bi->bi_bh; | ||
813 | int nr, free_space; | ||
814 | struct block_head * blkh; | ||
815 | struct item_head * ih; | ||
816 | int i; | ||
817 | int last_loc, unmoved_loc; | ||
818 | |||
819 | blkh = B_BLK_HEAD(bh); | ||
820 | nr = blkh_nr_item(blkh); | ||
821 | free_space = blkh_free_space(blkh); | ||
822 | |||
823 | |||
824 | /* check free space */ | ||
825 | RFALSE( free_space < paste_size, | ||
826 | "vs-10175: not enough free space: needed %d, available %d", | ||
827 | paste_size, free_space); | ||
828 | |||
829 | #ifdef CONFIG_REISERFS_CHECK | ||
830 | if (zeros_number > paste_size) { | ||
831 | print_cur_tb ("10177"); | ||
832 | reiserfs_panic ( NULL, "vs-10177: leaf_paste_in_buffer: ero number == %d, paste_size == %d", | ||
833 | zeros_number, paste_size); | ||
834 | } | ||
835 | #endif /* CONFIG_REISERFS_CHECK */ | ||
836 | |||
837 | |||
838 | /* item to be appended */ | ||
839 | ih = B_N_PITEM_HEAD(bh, affected_item_num); | ||
840 | |||
841 | last_loc = ih_location( &(ih[nr - affected_item_num - 1]) ); | ||
842 | unmoved_loc = affected_item_num ? ih_location( ih-1 ) : bh->b_size; | ||
843 | |||
844 | /* prepare space */ | ||
845 | memmove (bh->b_data + last_loc - paste_size, bh->b_data + last_loc, | ||
846 | unmoved_loc - last_loc); | ||
847 | |||
848 | |||
849 | /* change locations */ | ||
850 | for (i = affected_item_num; i < nr; i ++) | ||
851 | put_ih_location( &(ih[i-affected_item_num]), | ||
852 | ih_location( &(ih[i-affected_item_num])) - paste_size ); | ||
853 | |||
854 | if ( body ) { | ||
855 | if (!is_direntry_le_ih (ih)) { | ||
856 | if (!pos_in_item) { | ||
857 | /* shift data to right */ | ||
858 | memmove (bh->b_data + ih_location(ih) + paste_size, | ||
859 | bh->b_data + ih_location(ih), ih_item_len(ih)); | ||
860 | /* paste data in the head of item */ | ||
861 | memset (bh->b_data + ih_location(ih), 0, zeros_number); | ||
862 | memcpy (bh->b_data + ih_location(ih) + zeros_number, body, paste_size - zeros_number); | ||
863 | } else { | ||
864 | memset (bh->b_data + unmoved_loc - paste_size, 0, zeros_number); | ||
865 | memcpy (bh->b_data + unmoved_loc - paste_size + zeros_number, body, paste_size - zeros_number); | ||
866 | } | ||
867 | } | ||
868 | } | ||
869 | else | ||
870 | memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size); | ||
871 | |||
872 | put_ih_item_len( ih, ih_item_len(ih) + paste_size ); | ||
873 | |||
874 | /* change free space */ | ||
875 | set_blkh_free_space( blkh, free_space - paste_size ); | ||
876 | |||
877 | do_balance_mark_leaf_dirty (bi->tb, bh, 0); | ||
878 | |||
879 | if (bi->bi_parent) { | ||
880 | struct disk_child *t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position); | ||
881 | put_dc_size( t_dc, dc_size(t_dc) + paste_size ); | ||
882 | do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); | ||
883 | } | ||
884 | } | ||
885 | |||
886 | |||
887 | /* cuts DEL_COUNT entries beginning from FROM-th entry. Directory item | ||
888 | does not have free space, so it moves DEHs and remaining records as | ||
889 | necessary. Return value is size of removed part of directory item | ||
890 | in bytes. */ | ||
891 | static int leaf_cut_entries ( | ||
892 | struct buffer_head * bh, | ||
893 | struct item_head * ih, | ||
894 | int from, | ||
895 | int del_count | ||
896 | ) | ||
897 | { | ||
898 | char * item; | ||
899 | struct reiserfs_de_head * deh; | ||
900 | int prev_record_offset; /* offset of record, that is (from-1)th */ | ||
901 | char * prev_record; /* */ | ||
902 | int cut_records_len; /* length of all removed records */ | ||
903 | int i; | ||
904 | |||
905 | |||
906 | /* make sure, that item is directory and there are enough entries to | ||
907 | remove */ | ||
908 | RFALSE( !is_direntry_le_ih (ih), "10180: item is not directory item"); | ||
909 | RFALSE( I_ENTRY_COUNT(ih) < from + del_count, | ||
910 | "10185: item contains not enough entries: entry_cout = %d, from = %d, to delete = %d", | ||
911 | I_ENTRY_COUNT(ih), from, del_count); | ||
912 | |||
913 | if (del_count == 0) | ||
914 | return 0; | ||
915 | |||
916 | /* first byte of item */ | ||
917 | item = bh->b_data + ih_location(ih); | ||
918 | |||
919 | /* entry head array */ | ||
920 | deh = B_I_DEH (bh, ih); | ||
921 | |||
922 | /* first byte of remaining entries, those are BEFORE cut entries | ||
923 | (prev_record) and length of all removed records (cut_records_len) */ | ||
924 | prev_record_offset = (from ? deh_location( &(deh[from - 1])) : ih_item_len(ih)); | ||
925 | cut_records_len = prev_record_offset/*from_record*/ - | ||
926 | deh_location( &(deh[from + del_count - 1])); | ||
927 | prev_record = item + prev_record_offset; | ||
928 | |||
929 | |||
930 | /* adjust locations of remaining entries */ | ||
931 | for (i = I_ENTRY_COUNT(ih) - 1; i > from + del_count - 1; i --) | ||
932 | put_deh_location( &(deh[i]), | ||
933 | deh_location( &deh[i] ) - (DEH_SIZE * del_count ) ); | ||
934 | |||
935 | for (i = 0; i < from; i ++) | ||
936 | put_deh_location( &(deh[i]), | ||
937 | deh_location( &deh[i] ) - (DEH_SIZE * del_count + cut_records_len) ); | ||
938 | |||
939 | put_ih_entry_count( ih, ih_entry_count(ih) - del_count ); | ||
940 | |||
941 | /* shift entry head array and entries those are AFTER removed entries */ | ||
942 | memmove ((char *)(deh + from), | ||
943 | deh + from + del_count, | ||
944 | prev_record - cut_records_len - (char *)(deh + from + del_count)); | ||
945 | |||
946 | /* shift records, those are BEFORE removed entries */ | ||
947 | memmove (prev_record - cut_records_len - DEH_SIZE * del_count, | ||
948 | prev_record, item + ih_item_len(ih) - prev_record); | ||
949 | |||
950 | return DEH_SIZE * del_count + cut_records_len; | ||
951 | } | ||
952 | |||
953 | |||
954 | /* when cut item is part of regular file | ||
955 | pos_in_item - first byte that must be cut | ||
956 | cut_size - number of bytes to be cut beginning from pos_in_item | ||
957 | |||
958 | when cut item is part of directory | ||
959 | pos_in_item - number of first deleted entry | ||
960 | cut_size - count of deleted entries | ||
961 | */ | ||
962 | void leaf_cut_from_buffer (struct buffer_info * bi, int cut_item_num, | ||
963 | int pos_in_item, int cut_size) | ||
964 | { | ||
965 | int nr; | ||
966 | struct buffer_head * bh = bi->bi_bh; | ||
967 | struct block_head * blkh; | ||
968 | struct item_head * ih; | ||
969 | int last_loc, unmoved_loc; | ||
970 | int i; | ||
971 | |||
972 | blkh = B_BLK_HEAD(bh); | ||
973 | nr = blkh_nr_item(blkh); | ||
974 | |||
975 | /* item head of truncated item */ | ||
976 | ih = B_N_PITEM_HEAD (bh, cut_item_num); | ||
977 | |||
978 | if (is_direntry_le_ih (ih)) { | ||
979 | /* first cut entry ()*/ | ||
980 | cut_size = leaf_cut_entries (bh, ih, pos_in_item, cut_size); | ||
981 | if (pos_in_item == 0) { | ||
982 | /* change key */ | ||
983 | RFALSE( cut_item_num, | ||
984 | "when 0-th enrty of item is cut, that item must be first in the node, not %d-th", cut_item_num); | ||
985 | /* change item key by key of first entry in the item */ | ||
986 | set_le_ih_k_offset (ih, deh_offset(B_I_DEH (bh, ih))); | ||
987 | /*memcpy (&ih->ih_key.k_offset, &(B_I_DEH (bh, ih)->deh_offset), SHORT_KEY_SIZE);*/ | ||
988 | } | ||
989 | } else { | ||
990 | /* item is direct or indirect */ | ||
991 | RFALSE( is_statdata_le_ih (ih), "10195: item is stat data"); | ||
992 | RFALSE( pos_in_item && pos_in_item + cut_size != ih_item_len(ih), | ||
993 | "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)", | ||
994 | ( long unsigned ) pos_in_item, ( long unsigned ) cut_size, | ||
995 | ( long unsigned ) ih_item_len (ih)); | ||
996 | |||
997 | /* shift item body to left if cut is from the head of item */ | ||
998 | if (pos_in_item == 0) { | ||
999 | memmove( bh->b_data + ih_location(ih), | ||
1000 | bh->b_data + ih_location(ih) + cut_size, | ||
1001 | ih_item_len(ih) - cut_size); | ||
1002 | |||
1003 | /* change key of item */ | ||
1004 | if (is_direct_le_ih (ih)) | ||
1005 | set_le_ih_k_offset (ih, le_ih_k_offset (ih) + cut_size); | ||
1006 | else { | ||
1007 | set_le_ih_k_offset (ih, le_ih_k_offset (ih) + (cut_size / UNFM_P_SIZE) * bh->b_size); | ||
1008 | RFALSE( ih_item_len(ih) == cut_size && get_ih_free_space (ih), | ||
1009 | "10205: invalid ih_free_space (%h)", ih); | ||
1010 | } | ||
1011 | } | ||
1012 | } | ||
1013 | |||
1014 | |||
1015 | /* location of the last item */ | ||
1016 | last_loc = ih_location( &(ih[nr - cut_item_num - 1]) ); | ||
1017 | |||
1018 | /* location of the item, which is remaining at the same place */ | ||
1019 | unmoved_loc = cut_item_num ? ih_location(ih-1) : bh->b_size; | ||
1020 | |||
1021 | |||
1022 | /* shift */ | ||
1023 | memmove (bh->b_data + last_loc + cut_size, bh->b_data + last_loc, | ||
1024 | unmoved_loc - last_loc - cut_size); | ||
1025 | |||
1026 | /* change item length */ | ||
1027 | put_ih_item_len( ih, ih_item_len(ih) - cut_size ); | ||
1028 | |||
1029 | if (is_indirect_le_ih (ih)) { | ||
1030 | if (pos_in_item) | ||
1031 | set_ih_free_space (ih, 0); | ||
1032 | } | ||
1033 | |||
1034 | /* change locations */ | ||
1035 | for (i = cut_item_num; i < nr; i ++) | ||
1036 | put_ih_location( &(ih[i-cut_item_num]), ih_location( &ih[i-cut_item_num]) + cut_size ); | ||
1037 | |||
1038 | /* size, free space */ | ||
1039 | set_blkh_free_space( blkh, blkh_free_space(blkh) + cut_size ); | ||
1040 | |||
1041 | do_balance_mark_leaf_dirty (bi->tb, bh, 0); | ||
1042 | |||
1043 | if (bi->bi_parent) { | ||
1044 | struct disk_child *t_dc; | ||
1045 | t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position); | ||
1046 | put_dc_size( t_dc, dc_size(t_dc) - cut_size ); | ||
1047 | do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); | ||
1048 | } | ||
1049 | } | ||
1050 | |||
1051 | |||
1052 | /* delete del_num items from buffer starting from the first'th item */ | ||
1053 | static void leaf_delete_items_entirely (struct buffer_info * bi, | ||
1054 | int first, int del_num) | ||
1055 | { | ||
1056 | struct buffer_head * bh = bi->bi_bh; | ||
1057 | int nr; | ||
1058 | int i, j; | ||
1059 | int last_loc, last_removed_loc; | ||
1060 | struct block_head * blkh; | ||
1061 | struct item_head * ih; | ||
1062 | |||
1063 | RFALSE( bh == NULL, "10210: buffer is 0"); | ||
1064 | RFALSE( del_num < 0, "10215: del_num less than 0 (%d)", del_num); | ||
1065 | |||
1066 | if (del_num == 0) | ||
1067 | return; | ||
1068 | |||
1069 | blkh = B_BLK_HEAD(bh); | ||
1070 | nr = blkh_nr_item(blkh); | ||
1071 | |||
1072 | RFALSE( first < 0 || first + del_num > nr, | ||
1073 | "10220: first=%d, number=%d, there is %d items", first, del_num, nr); | ||
1074 | |||
1075 | if (first == 0 && del_num == nr) { | ||
1076 | /* this does not work */ | ||
1077 | make_empty_node (bi); | ||
1078 | |||
1079 | do_balance_mark_leaf_dirty (bi->tb, bh, 0); | ||
1080 | return; | ||
1081 | } | ||
1082 | |||
1083 | ih = B_N_PITEM_HEAD (bh, first); | ||
1084 | |||
1085 | /* location of unmovable item */ | ||
1086 | j = (first == 0) ? bh->b_size : ih_location(ih-1); | ||
1087 | |||
1088 | /* delete items */ | ||
1089 | last_loc = ih_location( &(ih[nr-1-first]) ); | ||
1090 | last_removed_loc = ih_location( &(ih[del_num-1]) ); | ||
1091 | |||
1092 | memmove (bh->b_data + last_loc + j - last_removed_loc, | ||
1093 | bh->b_data + last_loc, last_removed_loc - last_loc); | ||
1094 | |||
1095 | /* delete item headers */ | ||
1096 | memmove (ih, ih + del_num, (nr - first - del_num) * IH_SIZE); | ||
1097 | |||
1098 | /* change item location */ | ||
1099 | for (i = first; i < nr - del_num; i ++) | ||
1100 | put_ih_location( &(ih[i-first]), ih_location( &(ih[i-first]) ) + (j - last_removed_loc) ); | ||
1101 | |||
1102 | /* sizes, item number */ | ||
1103 | set_blkh_nr_item( blkh, blkh_nr_item(blkh) - del_num ); | ||
1104 | set_blkh_free_space( blkh, blkh_free_space(blkh) + (j - last_removed_loc + IH_SIZE * del_num) ); | ||
1105 | |||
1106 | do_balance_mark_leaf_dirty (bi->tb, bh, 0); | ||
1107 | |||
1108 | if (bi->bi_parent) { | ||
1109 | struct disk_child *t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position); | ||
1110 | put_dc_size( t_dc, dc_size(t_dc) - | ||
1111 | (j - last_removed_loc + IH_SIZE * del_num)); | ||
1112 | do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); | ||
1113 | } | ||
1114 | } | ||
1115 | |||
1116 | |||
1117 | |||
1118 | |||
1119 | |||
1120 | /* paste new_entry_count entries (new_dehs, records) into position before to item_num-th item */ | ||
1121 | void leaf_paste_entries ( | ||
1122 | struct buffer_head * bh, | ||
1123 | int item_num, | ||
1124 | int before, | ||
1125 | int new_entry_count, | ||
1126 | struct reiserfs_de_head * new_dehs, | ||
1127 | const char * records, | ||
1128 | int paste_size | ||
1129 | ) | ||
1130 | { | ||
1131 | struct item_head * ih; | ||
1132 | char * item; | ||
1133 | struct reiserfs_de_head * deh; | ||
1134 | char * insert_point; | ||
1135 | int i, old_entry_num; | ||
1136 | |||
1137 | if (new_entry_count == 0) | ||
1138 | return; | ||
1139 | |||
1140 | ih = B_N_PITEM_HEAD(bh, item_num); | ||
1141 | |||
1142 | /* make sure, that item is directory, and there are enough records in it */ | ||
1143 | RFALSE( !is_direntry_le_ih (ih), "10225: item is not directory item"); | ||
1144 | RFALSE( I_ENTRY_COUNT (ih) < before, | ||
1145 | "10230: there are no entry we paste entries before. entry_count = %d, before = %d", | ||
1146 | I_ENTRY_COUNT (ih), before); | ||
1147 | |||
1148 | |||
1149 | /* first byte of dest item */ | ||
1150 | item = bh->b_data + ih_location(ih); | ||
1151 | |||
1152 | /* entry head array */ | ||
1153 | deh = B_I_DEH (bh, ih); | ||
1154 | |||
1155 | /* new records will be pasted at this point */ | ||
1156 | insert_point = item + (before ? deh_location( &(deh[before - 1])) : (ih_item_len(ih) - paste_size)); | ||
1157 | |||
1158 | /* adjust locations of records that will be AFTER new records */ | ||
1159 | for (i = I_ENTRY_COUNT(ih) - 1; i >= before; i --) | ||
1160 | put_deh_location( &(deh[i]), | ||
1161 | deh_location(&(deh[i])) + (DEH_SIZE * new_entry_count )); | ||
1162 | |||
1163 | /* adjust locations of records that will be BEFORE new records */ | ||
1164 | for (i = 0; i < before; i ++) | ||
1165 | put_deh_location( &(deh[i]), deh_location(&(deh[i])) + paste_size ); | ||
1166 | |||
1167 | old_entry_num = I_ENTRY_COUNT(ih); | ||
1168 | put_ih_entry_count( ih, ih_entry_count(ih) + new_entry_count ); | ||
1169 | |||
1170 | /* prepare space for pasted records */ | ||
1171 | memmove (insert_point + paste_size, insert_point, item + (ih_item_len(ih) - paste_size) - insert_point); | ||
1172 | |||
1173 | /* copy new records */ | ||
1174 | memcpy (insert_point + DEH_SIZE * new_entry_count, records, | ||
1175 | paste_size - DEH_SIZE * new_entry_count); | ||
1176 | |||
1177 | /* prepare space for new entry heads */ | ||
1178 | deh += before; | ||
1179 | memmove ((char *)(deh + new_entry_count), deh, insert_point - (char *)deh); | ||
1180 | |||
1181 | /* copy new entry heads */ | ||
1182 | deh = (struct reiserfs_de_head *)((char *)deh); | ||
1183 | memcpy (deh, new_dehs, DEH_SIZE * new_entry_count); | ||
1184 | |||
1185 | /* set locations of new records */ | ||
1186 | for (i = 0; i < new_entry_count; i ++) | ||
1187 | { | ||
1188 | put_deh_location( &(deh[i]), | ||
1189 | deh_location( &(deh[i] )) + | ||
1190 | (- deh_location( &(new_dehs[new_entry_count - 1])) + | ||
1191 | insert_point + DEH_SIZE * new_entry_count - item)); | ||
1192 | } | ||
1193 | |||
1194 | |||
1195 | /* change item key if necessary (when we paste before 0-th entry */ | ||
1196 | if (!before) | ||
1197 | { | ||
1198 | set_le_ih_k_offset (ih, deh_offset(new_dehs)); | ||
1199 | /* memcpy (&ih->ih_key.k_offset, | ||
1200 | &new_dehs->deh_offset, SHORT_KEY_SIZE);*/ | ||
1201 | } | ||
1202 | |||
1203 | #ifdef CONFIG_REISERFS_CHECK | ||
1204 | { | ||
1205 | int prev, next; | ||
1206 | /* check record locations */ | ||
1207 | deh = B_I_DEH (bh, ih); | ||
1208 | for (i = 0; i < I_ENTRY_COUNT(ih); i ++) { | ||
1209 | next = (i < I_ENTRY_COUNT(ih) - 1) ? deh_location( &(deh[i + 1])) : 0; | ||
1210 | prev = (i != 0) ? deh_location( &(deh[i - 1]) ) : 0; | ||
1211 | |||
1212 | if (prev && prev <= deh_location( &(deh[i]))) | ||
1213 | reiserfs_warning (NULL, "vs-10240: leaf_paste_entries: directory item (%h) corrupted (prev %a, cur(%d) %a)", | ||
1214 | ih, deh + i - 1, i, deh + i); | ||
1215 | if (next && next >= deh_location( &(deh[i]))) | ||
1216 | reiserfs_warning (NULL, "vs-10250: leaf_paste_entries: directory item (%h) corrupted (cur(%d) %a, next %a)", | ||
1217 | ih, i, deh + i, deh + i + 1); | ||
1218 | } | ||
1219 | } | ||
1220 | #endif | ||
1221 | |||
1222 | } | ||
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c new file mode 100644 index 000000000000..80e92d9b81cb --- /dev/null +++ b/fs/reiserfs/namei.c | |||
@@ -0,0 +1,1491 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | * | ||
4 | * Trivial changes by Alan Cox to remove EHASHCOLLISION for compatibility | ||
5 | * | ||
6 | * Trivial Changes: | ||
7 | * Rights granted to Hans Reiser to redistribute under other terms providing | ||
8 | * he accepts all liability including but not limited to patent, fitness | ||
9 | * for purpose, and direct or indirect claims arising from failure to perform. | ||
10 | * | ||
11 | * NO WARRANTY | ||
12 | */ | ||
13 | |||
14 | #include <linux/config.h> | ||
15 | #include <linux/time.h> | ||
16 | #include <linux/bitops.h> | ||
17 | #include <linux/reiserfs_fs.h> | ||
18 | #include <linux/reiserfs_acl.h> | ||
19 | #include <linux/reiserfs_xattr.h> | ||
20 | #include <linux/smp_lock.h> | ||
21 | #include <linux/quotaops.h> | ||
22 | |||
/* Directory link-count helpers.  An i_nlink of 1 is left untouched by
   both macros; INC also resets the count to 1 once it reaches
   REISERFS_LINK_MAX.
   NOTE(review): these expand to a bare `if` statement (DEC even carries
   its own trailing ';'), so they are unsafe directly in front of an
   `else`.  The form is kept because call sites may omit the trailing
   semicolon - confirm before converting to do { } while (0). */
#define INC_DIR_INODE_NLINK(i) if ((i)->i_nlink != 1) { (i)->i_nlink++; if ((i)->i_nlink >= REISERFS_LINK_MAX) (i)->i_nlink=1; }
#define DEC_DIR_INODE_NLINK(i) if ((i)->i_nlink != 1) (i)->i_nlink--;
25 | |||
26 | // directory item contains array of entry headers. This performs | ||
27 | // binary search through that array | ||
28 | static int bin_search_in_dir_item (struct reiserfs_dir_entry * de, loff_t off) | ||
29 | { | ||
30 | struct item_head * ih = de->de_ih; | ||
31 | struct reiserfs_de_head * deh = de->de_deh; | ||
32 | int rbound, lbound, j; | ||
33 | |||
34 | lbound = 0; | ||
35 | rbound = I_ENTRY_COUNT (ih) - 1; | ||
36 | |||
37 | for (j = (rbound + lbound) / 2; lbound <= rbound; j = (rbound + lbound) / 2) { | ||
38 | if (off < deh_offset (deh + j)) { | ||
39 | rbound = j - 1; | ||
40 | continue; | ||
41 | } | ||
42 | if (off > deh_offset (deh + j)) { | ||
43 | lbound = j + 1; | ||
44 | continue; | ||
45 | } | ||
46 | // this is not name found, but matched third key component | ||
47 | de->de_entry_num = j; | ||
48 | return NAME_FOUND; | ||
49 | } | ||
50 | |||
51 | de->de_entry_num = lbound; | ||
52 | return NAME_NOT_FOUND; | ||
53 | } | ||
54 | |||
55 | |||
56 | // comment? maybe something like set de to point to what the path points to? | ||
57 | static inline void set_de_item_location (struct reiserfs_dir_entry * de, struct path * path) | ||
58 | { | ||
59 | de->de_bh = get_last_bh (path); | ||
60 | de->de_ih = get_ih (path); | ||
61 | de->de_deh = B_I_DEH (de->de_bh, de->de_ih); | ||
62 | de->de_item_num = PATH_LAST_POSITION (path); | ||
63 | } | ||
64 | |||
65 | |||
66 | // de_bh, de_ih, de_deh (points to first element of array), de_item_num is set | ||
67 | inline void set_de_name_and_namelen (struct reiserfs_dir_entry * de) | ||
68 | { | ||
69 | struct reiserfs_de_head * deh = de->de_deh + de->de_entry_num; | ||
70 | |||
71 | if (de->de_entry_num >= ih_entry_count (de->de_ih)) | ||
72 | BUG (); | ||
73 | |||
74 | de->de_entrylen = entry_length (de->de_bh, de->de_ih, de->de_entry_num); | ||
75 | de->de_namelen = de->de_entrylen - (de_with_sd (deh) ? SD_SIZE : 0); | ||
76 | de->de_name = B_I_PITEM (de->de_bh, de->de_ih) + deh_location(deh); | ||
77 | if (de->de_name[de->de_namelen - 1] == 0) | ||
78 | de->de_namelen = strlen (de->de_name); | ||
79 | } | ||
80 | |||
81 | |||
82 | // what entry points to | ||
83 | static inline void set_de_object_key (struct reiserfs_dir_entry * de) | ||
84 | { | ||
85 | if (de->de_entry_num >= ih_entry_count (de->de_ih)) | ||
86 | BUG (); | ||
87 | de->de_dir_id = deh_dir_id( &(de->de_deh[de->de_entry_num])); | ||
88 | de->de_objectid = deh_objectid( &(de->de_deh[de->de_entry_num])); | ||
89 | } | ||
90 | |||
91 | |||
92 | static inline void store_de_entry_key (struct reiserfs_dir_entry * de) | ||
93 | { | ||
94 | struct reiserfs_de_head * deh = de->de_deh + de->de_entry_num; | ||
95 | |||
96 | if (de->de_entry_num >= ih_entry_count (de->de_ih)) | ||
97 | BUG (); | ||
98 | |||
99 | /* store key of the found entry */ | ||
100 | de->de_entry_key.version = KEY_FORMAT_3_5; | ||
101 | de->de_entry_key.on_disk_key.k_dir_id = le32_to_cpu (de->de_ih->ih_key.k_dir_id); | ||
102 | de->de_entry_key.on_disk_key.k_objectid = le32_to_cpu (de->de_ih->ih_key.k_objectid); | ||
103 | set_cpu_key_k_offset (&(de->de_entry_key), deh_offset (deh)); | ||
104 | set_cpu_key_k_type (&(de->de_entry_key), TYPE_DIRENTRY); | ||
105 | } | ||
106 | |||
107 | |||
108 | /* We assign a key to each directory item, and place multiple entries | ||
109 | in a single directory item. A directory item has a key equal to the | ||
110 | key of the first directory entry in it. | ||
111 | |||
112 | This function first calls search_by_key, then, if item whose first | ||
113 | entry matches is not found it looks for the entry inside directory | ||
114 | item found by search_by_key. Fills the path to the entry, and to the | ||
115 | entry position in the item | ||
116 | |||
117 | */ | ||
118 | |||
119 | /* The function is NOT SCHEDULE-SAFE! */ | ||
120 | int search_by_entry_key (struct super_block * sb, const struct cpu_key * key, | ||
121 | struct path * path, struct reiserfs_dir_entry * de) | ||
122 | { | ||
123 | int retval; | ||
124 | |||
125 | retval = search_item (sb, key, path); | ||
126 | switch (retval) { | ||
127 | case ITEM_NOT_FOUND: | ||
128 | if (!PATH_LAST_POSITION (path)) { | ||
129 | reiserfs_warning (sb, "vs-7000: search_by_entry_key: search_by_key returned item position == 0"); | ||
130 | pathrelse(path) ; | ||
131 | return IO_ERROR ; | ||
132 | } | ||
133 | PATH_LAST_POSITION (path) --; | ||
134 | |||
135 | case ITEM_FOUND: | ||
136 | break; | ||
137 | |||
138 | case IO_ERROR: | ||
139 | return retval; | ||
140 | |||
141 | default: | ||
142 | pathrelse (path); | ||
143 | reiserfs_warning (sb, "vs-7002: search_by_entry_key: no path to here"); | ||
144 | return IO_ERROR; | ||
145 | } | ||
146 | |||
147 | set_de_item_location (de, path); | ||
148 | |||
149 | #ifdef CONFIG_REISERFS_CHECK | ||
150 | if (!is_direntry_le_ih (de->de_ih) || | ||
151 | COMP_SHORT_KEYS (&(de->de_ih->ih_key), key)) { | ||
152 | print_block (de->de_bh, 0, -1, -1); | ||
153 | reiserfs_panic (sb, "vs-7005: search_by_entry_key: found item %h is not directory item or " | ||
154 | "does not belong to the same directory as key %K", de->de_ih, key); | ||
155 | } | ||
156 | #endif /* CONFIG_REISERFS_CHECK */ | ||
157 | |||
158 | /* binary search in directory item by third componen t of the | ||
159 | key. sets de->de_entry_num of de */ | ||
160 | retval = bin_search_in_dir_item (de, cpu_key_k_offset (key)); | ||
161 | path->pos_in_item = de->de_entry_num; | ||
162 | if (retval != NAME_NOT_FOUND) { | ||
163 | // ugly, but rename needs de_bh, de_deh, de_name, de_namelen, de_objectid set | ||
164 | set_de_name_and_namelen (de); | ||
165 | set_de_object_key (de); | ||
166 | } | ||
167 | return retval; | ||
168 | } | ||
169 | |||
170 | |||
171 | |||
172 | /* Keyed 32-bit hash function using TEA in a Davis-Meyer function */ | ||
173 | |||
174 | /* The third component is hashed, and you can choose from more than | ||
175 | one hash function. Per directory hashes are not yet implemented | ||
176 | but are thought about. This function should be moved to hashes.c | ||
177 | Jedi, please do so. -Hans */ | ||
178 | |||
179 | static __u32 get_third_component (struct super_block * s, | ||
180 | const char * name, int len) | ||
181 | { | ||
182 | __u32 res; | ||
183 | |||
184 | if (!len || (len == 1 && name[0] == '.')) | ||
185 | return DOT_OFFSET; | ||
186 | if (len == 2 && name[0] == '.' && name[1] == '.') | ||
187 | return DOT_DOT_OFFSET; | ||
188 | |||
189 | res = REISERFS_SB(s)->s_hash_function (name, len); | ||
190 | |||
191 | // take bits from 7-th to 30-th including both bounds | ||
192 | res = GET_HASH_VALUE(res); | ||
193 | if (res == 0) | ||
194 | // needed to have no names before "." and ".." those have hash | ||
195 | // value == 0 and generation conters 1 and 2 accordingly | ||
196 | res = 128; | ||
197 | return res + MAX_GENERATION_NUMBER; | ||
198 | } | ||
199 | |||
200 | |||
201 | static int reiserfs_match (struct reiserfs_dir_entry * de, | ||
202 | const char * name, int namelen) | ||
203 | { | ||
204 | int retval = NAME_NOT_FOUND; | ||
205 | |||
206 | if ((namelen == de->de_namelen) && | ||
207 | !memcmp(de->de_name, name, de->de_namelen)) | ||
208 | retval = (de_visible (de->de_deh + de->de_entry_num) ? NAME_FOUND : NAME_FOUND_INVISIBLE); | ||
209 | |||
210 | return retval; | ||
211 | } | ||
212 | |||
213 | |||
214 | /* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */ | ||
215 | |||
216 | /* used when hash collisions exist */ | ||
217 | |||
218 | |||
219 | static int linear_search_in_dir_item (struct cpu_key * key, struct reiserfs_dir_entry * de, | ||
220 | const char * name, int namelen) | ||
221 | { | ||
222 | struct reiserfs_de_head * deh = de->de_deh; | ||
223 | int retval; | ||
224 | int i; | ||
225 | |||
226 | i = de->de_entry_num; | ||
227 | |||
228 | if (i == I_ENTRY_COUNT (de->de_ih) || | ||
229 | GET_HASH_VALUE (deh_offset (deh + i)) != GET_HASH_VALUE (cpu_key_k_offset (key))) { | ||
230 | i --; | ||
231 | } | ||
232 | |||
233 | RFALSE( de->de_deh != B_I_DEH (de->de_bh, de->de_ih), | ||
234 | "vs-7010: array of entry headers not found"); | ||
235 | |||
236 | deh += i; | ||
237 | |||
238 | for (; i >= 0; i --, deh --) { | ||
239 | if (GET_HASH_VALUE (deh_offset (deh)) != | ||
240 | GET_HASH_VALUE (cpu_key_k_offset (key))) { | ||
241 | // hash value does not match, no need to check whole name | ||
242 | return NAME_NOT_FOUND; | ||
243 | } | ||
244 | |||
245 | /* mark, that this generation number is used */ | ||
246 | if (de->de_gen_number_bit_string) | ||
247 | set_bit (GET_GENERATION_NUMBER (deh_offset (deh)), (unsigned long *)de->de_gen_number_bit_string); | ||
248 | |||
249 | // calculate pointer to name and namelen | ||
250 | de->de_entry_num = i; | ||
251 | set_de_name_and_namelen (de); | ||
252 | |||
253 | if ((retval = reiserfs_match (de, name, namelen)) != NAME_NOT_FOUND) { | ||
254 | // de's de_name, de_namelen, de_recordlen are set. Fill the rest: | ||
255 | |||
256 | // key of pointed object | ||
257 | set_de_object_key (de); | ||
258 | |||
259 | store_de_entry_key (de); | ||
260 | |||
261 | // retval can be NAME_FOUND or NAME_FOUND_INVISIBLE | ||
262 | return retval; | ||
263 | } | ||
264 | } | ||
265 | |||
266 | if (GET_GENERATION_NUMBER (le_ih_k_offset (de->de_ih)) == 0) | ||
267 | /* we have reached left most entry in the node. In common we | ||
268 | have to go to the left neighbor, but if generation counter | ||
269 | is 0 already, we know for sure, that there is no name with | ||
270 | the same hash value */ | ||
271 | // FIXME: this work correctly only because hash value can not | ||
272 | // be 0. Btw, in case of Yura's hash it is probably possible, | ||
273 | // so, this is a bug | ||
274 | return NAME_NOT_FOUND; | ||
275 | |||
276 | RFALSE( de->de_item_num, | ||
277 | "vs-7015: two diritems of the same directory in one node?"); | ||
278 | |||
279 | return GOTO_PREVIOUS_ITEM; | ||
280 | } | ||
281 | |||
282 | |||
283 | // may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND | ||
284 | // FIXME: should add something like IOERROR | ||
285 | static int reiserfs_find_entry (struct inode * dir, const char * name, int namelen, | ||
286 | struct path * path_to_entry, struct reiserfs_dir_entry * de) | ||
287 | { | ||
288 | struct cpu_key key_to_search; | ||
289 | int retval; | ||
290 | |||
291 | |||
292 | if (namelen > REISERFS_MAX_NAME (dir->i_sb->s_blocksize)) | ||
293 | return NAME_NOT_FOUND; | ||
294 | |||
295 | /* we will search for this key in the tree */ | ||
296 | make_cpu_key (&key_to_search, dir, | ||
297 | get_third_component (dir->i_sb, name, namelen), TYPE_DIRENTRY, 3); | ||
298 | |||
299 | while (1) { | ||
300 | retval = search_by_entry_key (dir->i_sb, &key_to_search, path_to_entry, de); | ||
301 | if (retval == IO_ERROR) { | ||
302 | reiserfs_warning (dir->i_sb, "zam-7001: io error in %s", | ||
303 | __FUNCTION__); | ||
304 | return IO_ERROR; | ||
305 | } | ||
306 | |||
307 | /* compare names for all entries having given hash value */ | ||
308 | retval = linear_search_in_dir_item (&key_to_search, de, name, namelen); | ||
309 | if (retval != GOTO_PREVIOUS_ITEM) { | ||
310 | /* there is no need to scan directory anymore. Given entry found or does not exist */ | ||
311 | path_to_entry->pos_in_item = de->de_entry_num; | ||
312 | return retval; | ||
313 | } | ||
314 | |||
315 | /* there is left neighboring item of this directory and given entry can be there */ | ||
316 | set_cpu_key_k_offset (&key_to_search, le_ih_k_offset (de->de_ih) - 1); | ||
317 | pathrelse (path_to_entry); | ||
318 | |||
319 | } /* while (1) */ | ||
320 | } | ||
321 | |||
322 | |||
323 | static struct dentry * reiserfs_lookup (struct inode * dir, struct dentry * dentry, struct nameidata *nd) | ||
324 | { | ||
325 | int retval; | ||
326 | struct inode * inode = NULL; | ||
327 | struct reiserfs_dir_entry de; | ||
328 | INITIALIZE_PATH (path_to_entry); | ||
329 | |||
330 | if (REISERFS_MAX_NAME (dir->i_sb->s_blocksize) < dentry->d_name.len) | ||
331 | return ERR_PTR(-ENAMETOOLONG); | ||
332 | |||
333 | reiserfs_write_lock(dir->i_sb); | ||
334 | de.de_gen_number_bit_string = NULL; | ||
335 | retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path_to_entry, &de); | ||
336 | pathrelse (&path_to_entry); | ||
337 | if (retval == NAME_FOUND) { | ||
338 | /* Hide the .reiserfs_priv directory */ | ||
339 | if (reiserfs_xattrs (dir->i_sb) && | ||
340 | !old_format_only(dir->i_sb) && | ||
341 | REISERFS_SB(dir->i_sb)->priv_root && | ||
342 | REISERFS_SB(dir->i_sb)->priv_root->d_inode && | ||
343 | de.de_objectid == le32_to_cpu (INODE_PKEY(REISERFS_SB(dir->i_sb)->priv_root->d_inode)->k_objectid)) { | ||
344 | reiserfs_write_unlock (dir->i_sb); | ||
345 | return ERR_PTR (-EACCES); | ||
346 | } | ||
347 | |||
348 | inode = reiserfs_iget (dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); | ||
349 | if (!inode || IS_ERR(inode)) { | ||
350 | reiserfs_write_unlock(dir->i_sb); | ||
351 | return ERR_PTR(-EACCES); | ||
352 | } | ||
353 | |||
354 | /* Propogate the priv_object flag so we know we're in the priv tree */ | ||
355 | if (is_reiserfs_priv_object (dir)) | ||
356 | reiserfs_mark_inode_private (inode); | ||
357 | } | ||
358 | reiserfs_write_unlock(dir->i_sb); | ||
359 | if ( retval == IO_ERROR ) { | ||
360 | return ERR_PTR(-EIO); | ||
361 | } | ||
362 | |||
363 | if (inode) | ||
364 | return d_splice_alias(inode, dentry); | ||
365 | |||
366 | d_add(dentry, inode); | ||
367 | return NULL; | ||
368 | } | ||
369 | |||
370 | |||
371 | /* | ||
372 | ** looks up the dentry of the parent directory for child. | ||
373 | ** taken from ext2_get_parent | ||
374 | */ | ||
375 | struct dentry *reiserfs_get_parent(struct dentry *child) | ||
376 | { | ||
377 | int retval; | ||
378 | struct inode * inode = NULL; | ||
379 | struct reiserfs_dir_entry de; | ||
380 | INITIALIZE_PATH (path_to_entry); | ||
381 | struct dentry *parent; | ||
382 | struct inode *dir = child->d_inode ; | ||
383 | |||
384 | |||
385 | if (dir->i_nlink == 0) { | ||
386 | return ERR_PTR(-ENOENT); | ||
387 | } | ||
388 | de.de_gen_number_bit_string = NULL; | ||
389 | |||
390 | reiserfs_write_lock(dir->i_sb); | ||
391 | retval = reiserfs_find_entry (dir, "..", 2, &path_to_entry, &de); | ||
392 | pathrelse (&path_to_entry); | ||
393 | if (retval != NAME_FOUND) { | ||
394 | reiserfs_write_unlock(dir->i_sb); | ||
395 | return ERR_PTR(-ENOENT); | ||
396 | } | ||
397 | inode = reiserfs_iget (dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); | ||
398 | reiserfs_write_unlock(dir->i_sb); | ||
399 | |||
400 | if (!inode || IS_ERR(inode)) { | ||
401 | return ERR_PTR(-EACCES); | ||
402 | } | ||
403 | parent = d_alloc_anon(inode); | ||
404 | if (!parent) { | ||
405 | iput(inode); | ||
406 | parent = ERR_PTR(-ENOMEM); | ||
407 | } | ||
408 | return parent; | ||
409 | } | ||
410 | |||
411 | |||
412 | /* add entry to the directory (entry can be hidden). | ||
413 | |||
414 | insert definition of when hidden directories are used here -Hans | ||
415 | |||
416 | Does not mark dir inode dirty, do it after successesfull call to it */ | ||
417 | |||
418 | static int reiserfs_add_entry (struct reiserfs_transaction_handle *th, struct inode * dir, | ||
419 | const char * name, int namelen, struct inode * inode, | ||
420 | int visible) | ||
421 | { | ||
422 | struct cpu_key entry_key; | ||
423 | struct reiserfs_de_head * deh; | ||
424 | INITIALIZE_PATH (path); | ||
425 | struct reiserfs_dir_entry de; | ||
426 | int bit_string [MAX_GENERATION_NUMBER / (sizeof(int) * 8) + 1]; | ||
427 | int gen_number; | ||
428 | char small_buf[32+DEH_SIZE] ; /* 48 bytes now and we avoid kmalloc | ||
429 | if we create file with short name */ | ||
430 | char * buffer; | ||
431 | int buflen, paste_size; | ||
432 | int retval; | ||
433 | |||
434 | BUG_ON (!th->t_trans_id); | ||
435 | |||
436 | /* cannot allow items to be added into a busy deleted directory */ | ||
437 | if (!namelen) | ||
438 | return -EINVAL; | ||
439 | |||
440 | if (namelen > REISERFS_MAX_NAME (dir->i_sb->s_blocksize)) | ||
441 | return -ENAMETOOLONG; | ||
442 | |||
443 | /* each entry has unique key. compose it */ | ||
444 | make_cpu_key (&entry_key, dir, | ||
445 | get_third_component (dir->i_sb, name, namelen), TYPE_DIRENTRY, 3); | ||
446 | |||
447 | /* get memory for composing the entry */ | ||
448 | buflen = DEH_SIZE + ROUND_UP (namelen); | ||
449 | if (buflen > sizeof (small_buf)) { | ||
450 | buffer = reiserfs_kmalloc (buflen, GFP_NOFS, dir->i_sb); | ||
451 | if (buffer == 0) | ||
452 | return -ENOMEM; | ||
453 | } else | ||
454 | buffer = small_buf; | ||
455 | |||
456 | paste_size = (get_inode_sd_version (dir) == STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen; | ||
457 | |||
458 | /* fill buffer : directory entry head, name[, dir objectid | , stat data | ,stat data, dir objectid ] */ | ||
459 | deh = (struct reiserfs_de_head *)buffer; | ||
460 | deh->deh_location = 0; /* JDM Endian safe if 0 */ | ||
461 | put_deh_offset( deh, cpu_key_k_offset( &entry_key ) ); | ||
462 | deh->deh_state = 0; /* JDM Endian safe if 0 */ | ||
463 | /* put key (ino analog) to de */ | ||
464 | deh->deh_dir_id = INODE_PKEY (inode)->k_dir_id; /* safe: k_dir_id is le */ | ||
465 | deh->deh_objectid = INODE_PKEY (inode)->k_objectid; /* safe: k_objectid is le */ | ||
466 | |||
467 | /* copy name */ | ||
468 | memcpy ((char *)(deh + 1), name, namelen); | ||
469 | /* padd by 0s to the 4 byte boundary */ | ||
470 | padd_item ((char *)(deh + 1), ROUND_UP (namelen), namelen); | ||
471 | |||
472 | /* entry is ready to be pasted into tree, set 'visibility' and 'stat data in entry' attributes */ | ||
473 | mark_de_without_sd (deh); | ||
474 | visible ? mark_de_visible (deh) : mark_de_hidden (deh); | ||
475 | |||
476 | /* find the proper place for the new entry */ | ||
477 | memset (bit_string, 0, sizeof (bit_string)); | ||
478 | de.de_gen_number_bit_string = (char *)bit_string; | ||
479 | retval = reiserfs_find_entry (dir, name, namelen, &path, &de); | ||
480 | if( retval != NAME_NOT_FOUND ) { | ||
481 | if (buffer != small_buf) | ||
482 | reiserfs_kfree (buffer, buflen, dir->i_sb); | ||
483 | pathrelse (&path); | ||
484 | |||
485 | if ( retval == IO_ERROR ) { | ||
486 | return -EIO; | ||
487 | } | ||
488 | |||
489 | if (retval != NAME_FOUND) { | ||
490 | reiserfs_warning (dir->i_sb, "zam-7002:%s: \"reiserfs_find_entry\" " | ||
491 | "has returned unexpected value (%d)", | ||
492 | __FUNCTION__, retval); | ||
493 | } | ||
494 | |||
495 | return -EEXIST; | ||
496 | } | ||
497 | |||
498 | gen_number = find_first_zero_bit ((unsigned long *)bit_string, MAX_GENERATION_NUMBER + 1); | ||
499 | if (gen_number > MAX_GENERATION_NUMBER) { | ||
500 | /* there is no free generation number */ | ||
501 | reiserfs_warning (dir->i_sb, "reiserfs_add_entry: Congratulations! we have got hash function screwed up"); | ||
502 | if (buffer != small_buf) | ||
503 | reiserfs_kfree (buffer, buflen, dir->i_sb); | ||
504 | pathrelse (&path); | ||
505 | return -EBUSY; | ||
506 | } | ||
507 | /* adjust offset of directory enrty */ | ||
508 | put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number)); | ||
509 | set_cpu_key_k_offset (&entry_key, deh_offset(deh)); | ||
510 | |||
511 | /* update max-hash-collisions counter in reiserfs_sb_info */ | ||
512 | PROC_INFO_MAX( th -> t_super, max_hash_collisions, gen_number ); | ||
513 | |||
514 | if (gen_number != 0) { /* we need to re-search for the insertion point */ | ||
515 | if (search_by_entry_key (dir->i_sb, &entry_key, &path, &de) != NAME_NOT_FOUND) { | ||
516 | reiserfs_warning (dir->i_sb, "vs-7032: reiserfs_add_entry: " | ||
517 | "entry with this key (%K) already exists", | ||
518 | &entry_key); | ||
519 | |||
520 | if (buffer != small_buf) | ||
521 | reiserfs_kfree (buffer, buflen, dir->i_sb); | ||
522 | pathrelse (&path); | ||
523 | return -EBUSY; | ||
524 | } | ||
525 | } | ||
526 | |||
527 | /* perform the insertion of the entry that we have prepared */ | ||
528 | retval = reiserfs_paste_into_item (th, &path, &entry_key, dir, buffer, paste_size); | ||
529 | if (buffer != small_buf) | ||
530 | reiserfs_kfree (buffer, buflen, dir->i_sb); | ||
531 | if (retval) { | ||
532 | reiserfs_check_path(&path) ; | ||
533 | return retval; | ||
534 | } | ||
535 | |||
536 | dir->i_size += paste_size; | ||
537 | dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; | ||
538 | if (!S_ISDIR (inode->i_mode) && visible) | ||
539 | // reiserfs_mkdir or reiserfs_rename will do that by itself | ||
540 | reiserfs_update_sd (th, dir); | ||
541 | |||
542 | reiserfs_check_path(&path) ; | ||
543 | return 0; | ||
544 | } | ||
545 | |||
546 | /* quota utility function, call if you've had to abort after calling | ||
547 | ** new_inode_init, and have not called reiserfs_new_inode yet. | ||
548 | ** This should only be called on inodes that do not have stat data | ||
549 | ** inserted into the tree yet. | ||
550 | */ | ||
551 | static int drop_new_inode(struct inode *inode) { | ||
552 | DQUOT_DROP(inode); | ||
553 | make_bad_inode(inode) ; | ||
554 | inode->i_flags |= S_NOQUOTA; | ||
555 | iput(inode) ; | ||
556 | return 0 ; | ||
557 | } | ||
558 | |||
559 | /* utility function that does setup for reiserfs_new_inode. | ||
560 | ** DQUOT_INIT needs lots of credits so it's better to have it | ||
561 | ** outside of a transaction, so we had to pull some bits of | ||
562 | ** reiserfs_new_inode out into this func. | ||
563 | */ | ||
564 | static int new_inode_init(struct inode *inode, struct inode *dir, int mode) { | ||
565 | |||
566 | /* the quota init calls have to know who to charge the quota to, so | ||
567 | ** we have to set uid and gid here | ||
568 | */ | ||
569 | inode->i_uid = current->fsuid; | ||
570 | inode->i_mode = mode; | ||
571 | |||
572 | if (dir->i_mode & S_ISGID) { | ||
573 | inode->i_gid = dir->i_gid; | ||
574 | if (S_ISDIR(mode)) | ||
575 | inode->i_mode |= S_ISGID; | ||
576 | } else { | ||
577 | inode->i_gid = current->fsgid; | ||
578 | } | ||
579 | DQUOT_INIT(inode); | ||
580 | return 0 ; | ||
581 | } | ||
582 | |||
583 | static int reiserfs_create (struct inode * dir, struct dentry *dentry, int mode, | ||
584 | struct nameidata *nd) | ||
585 | { | ||
586 | int retval; | ||
587 | struct inode * inode; | ||
588 | /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ | ||
589 | int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); | ||
590 | struct reiserfs_transaction_handle th ; | ||
591 | int locked; | ||
592 | |||
593 | if (!(inode = new_inode(dir->i_sb))) { | ||
594 | return -ENOMEM ; | ||
595 | } | ||
596 | new_inode_init(inode, dir, mode); | ||
597 | |||
598 | locked = reiserfs_cache_default_acl (dir); | ||
599 | |||
600 | reiserfs_write_lock(dir->i_sb); | ||
601 | |||
602 | if (locked) | ||
603 | reiserfs_write_lock_xattrs (dir->i_sb); | ||
604 | |||
605 | retval = journal_begin(&th, dir->i_sb, jbegin_count); | ||
606 | if (retval) { | ||
607 | drop_new_inode (inode); | ||
608 | goto out_failed; | ||
609 | } | ||
610 | |||
611 | retval = reiserfs_new_inode (&th, dir, mode, 0, 0/*i_size*/, dentry, inode); | ||
612 | if (retval) | ||
613 | goto out_failed; | ||
614 | |||
615 | if (locked) { | ||
616 | reiserfs_write_unlock_xattrs (dir->i_sb); | ||
617 | locked = 0; | ||
618 | } | ||
619 | |||
620 | inode->i_op = &reiserfs_file_inode_operations; | ||
621 | inode->i_fop = &reiserfs_file_operations; | ||
622 | inode->i_mapping->a_ops = &reiserfs_address_space_operations ; | ||
623 | |||
624 | retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, | ||
625 | inode, 1/*visible*/); | ||
626 | if (retval) { | ||
627 | int err; | ||
628 | inode->i_nlink--; | ||
629 | reiserfs_update_sd (&th, inode); | ||
630 | err = journal_end(&th, dir->i_sb, jbegin_count) ; | ||
631 | if (err) | ||
632 | retval = err; | ||
633 | iput (inode); | ||
634 | goto out_failed; | ||
635 | } | ||
636 | reiserfs_update_inode_transaction(inode) ; | ||
637 | reiserfs_update_inode_transaction(dir) ; | ||
638 | |||
639 | d_instantiate(dentry, inode); | ||
640 | retval = journal_end(&th, dir->i_sb, jbegin_count) ; | ||
641 | |||
642 | out_failed: | ||
643 | if (locked) | ||
644 | reiserfs_write_unlock_xattrs (dir->i_sb); | ||
645 | reiserfs_write_unlock(dir->i_sb); | ||
646 | return retval; | ||
647 | } | ||
648 | |||
649 | |||
650 | static int reiserfs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) | ||
651 | { | ||
652 | int retval; | ||
653 | struct inode * inode; | ||
654 | struct reiserfs_transaction_handle th ; | ||
655 | /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ | ||
656 | int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); | ||
657 | int locked; | ||
658 | |||
659 | if (!new_valid_dev(rdev)) | ||
660 | return -EINVAL; | ||
661 | |||
662 | if (!(inode = new_inode(dir->i_sb))) { | ||
663 | return -ENOMEM ; | ||
664 | } | ||
665 | new_inode_init(inode, dir, mode); | ||
666 | |||
667 | locked = reiserfs_cache_default_acl (dir); | ||
668 | |||
669 | reiserfs_write_lock(dir->i_sb); | ||
670 | |||
671 | if (locked) | ||
672 | reiserfs_write_lock_xattrs (dir->i_sb); | ||
673 | |||
674 | retval = journal_begin(&th, dir->i_sb, jbegin_count) ; | ||
675 | if (retval) { | ||
676 | drop_new_inode (inode); | ||
677 | goto out_failed; | ||
678 | } | ||
679 | |||
680 | retval = reiserfs_new_inode (&th, dir, mode, NULL, 0/*i_size*/, dentry, inode); | ||
681 | if (retval) { | ||
682 | goto out_failed; | ||
683 | } | ||
684 | |||
685 | if (locked) { | ||
686 | reiserfs_write_unlock_xattrs (dir->i_sb); | ||
687 | locked = 0; | ||
688 | } | ||
689 | |||
690 | |||
691 | inode->i_op = &reiserfs_special_inode_operations; | ||
692 | init_special_inode(inode, inode->i_mode, rdev) ; | ||
693 | |||
694 | //FIXME: needed for block and char devices only | ||
695 | reiserfs_update_sd (&th, inode); | ||
696 | |||
697 | reiserfs_update_inode_transaction(inode) ; | ||
698 | reiserfs_update_inode_transaction(dir) ; | ||
699 | |||
700 | retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, | ||
701 | inode, 1/*visible*/); | ||
702 | if (retval) { | ||
703 | int err; | ||
704 | inode->i_nlink--; | ||
705 | reiserfs_update_sd (&th, inode); | ||
706 | err = journal_end(&th, dir->i_sb, jbegin_count) ; | ||
707 | if (err) | ||
708 | retval = err; | ||
709 | iput (inode); | ||
710 | goto out_failed; | ||
711 | } | ||
712 | |||
713 | d_instantiate(dentry, inode); | ||
714 | retval = journal_end(&th, dir->i_sb, jbegin_count) ; | ||
715 | |||
716 | out_failed: | ||
717 | if (locked) | ||
718 | reiserfs_write_unlock_xattrs (dir->i_sb); | ||
719 | reiserfs_write_unlock(dir->i_sb); | ||
720 | return retval; | ||
721 | } | ||
722 | |||
723 | |||
724 | static int reiserfs_mkdir (struct inode * dir, struct dentry *dentry, int mode) | ||
725 | { | ||
726 | int retval; | ||
727 | struct inode * inode; | ||
728 | struct reiserfs_transaction_handle th ; | ||
729 | /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ | ||
730 | int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); | ||
731 | int locked; | ||
732 | |||
733 | #ifdef DISPLACE_NEW_PACKING_LOCALITIES | ||
734 | /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ | ||
735 | REISERFS_I(dir)->new_packing_locality = 1; | ||
736 | #endif | ||
737 | mode = S_IFDIR | mode; | ||
738 | if (!(inode = new_inode(dir->i_sb))) { | ||
739 | return -ENOMEM ; | ||
740 | } | ||
741 | new_inode_init(inode, dir, mode); | ||
742 | |||
743 | locked = reiserfs_cache_default_acl (dir); | ||
744 | |||
745 | reiserfs_write_lock(dir->i_sb); | ||
746 | if (locked) | ||
747 | reiserfs_write_lock_xattrs (dir->i_sb); | ||
748 | |||
749 | retval = journal_begin(&th, dir->i_sb, jbegin_count) ; | ||
750 | if (retval) { | ||
751 | drop_new_inode (inode); | ||
752 | goto out_failed; | ||
753 | } | ||
754 | |||
755 | |||
756 | /* inc the link count now, so another writer doesn't overflow it while | ||
757 | ** we sleep later on. | ||
758 | */ | ||
759 | INC_DIR_INODE_NLINK(dir) | ||
760 | |||
761 | retval = reiserfs_new_inode (&th, dir, mode, NULL/*symlink*/, | ||
762 | old_format_only (dir->i_sb) ? | ||
763 | EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, | ||
764 | dentry, inode); | ||
765 | if (retval) { | ||
766 | dir->i_nlink-- ; | ||
767 | goto out_failed; | ||
768 | } | ||
769 | |||
770 | if (locked) { | ||
771 | reiserfs_write_unlock_xattrs (dir->i_sb); | ||
772 | locked = 0; | ||
773 | } | ||
774 | |||
775 | reiserfs_update_inode_transaction(inode) ; | ||
776 | reiserfs_update_inode_transaction(dir) ; | ||
777 | |||
778 | inode->i_op = &reiserfs_dir_inode_operations; | ||
779 | inode->i_fop = &reiserfs_dir_operations; | ||
780 | |||
781 | // note, _this_ add_entry will not update dir's stat data | ||
782 | retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, | ||
783 | inode, 1/*visible*/); | ||
784 | if (retval) { | ||
785 | int err; | ||
786 | inode->i_nlink = 0; | ||
787 | DEC_DIR_INODE_NLINK(dir); | ||
788 | reiserfs_update_sd (&th, inode); | ||
789 | err = journal_end(&th, dir->i_sb, jbegin_count) ; | ||
790 | if (err) | ||
791 | retval = err; | ||
792 | iput (inode); | ||
793 | goto out_failed; | ||
794 | } | ||
795 | |||
796 | // the above add_entry did not update dir's stat data | ||
797 | reiserfs_update_sd (&th, dir); | ||
798 | |||
799 | d_instantiate(dentry, inode); | ||
800 | retval = journal_end(&th, dir->i_sb, jbegin_count) ; | ||
801 | out_failed: | ||
802 | if (locked) | ||
803 | reiserfs_write_unlock_xattrs (dir->i_sb); | ||
804 | reiserfs_write_unlock(dir->i_sb); | ||
805 | return retval; | ||
806 | } | ||
807 | |||
808 | static inline int reiserfs_empty_dir(struct inode *inode) { | ||
809 | /* we can cheat because an old format dir cannot have | ||
810 | ** EMPTY_DIR_SIZE, and a new format dir cannot have | ||
811 | ** EMPTY_DIR_SIZE_V1. So, if the inode is either size, | ||
812 | ** regardless of disk format version, the directory is empty. | ||
813 | */ | ||
814 | if (inode->i_size != EMPTY_DIR_SIZE && | ||
815 | inode->i_size != EMPTY_DIR_SIZE_V1) { | ||
816 | return 0 ; | ||
817 | } | ||
818 | return 1 ; | ||
819 | } | ||
820 | |||
821 | static int reiserfs_rmdir (struct inode * dir, struct dentry *dentry) | ||
822 | { | ||
823 | int retval, err; | ||
824 | struct inode * inode; | ||
825 | struct reiserfs_transaction_handle th ; | ||
826 | int jbegin_count; | ||
827 | INITIALIZE_PATH (path); | ||
828 | struct reiserfs_dir_entry de; | ||
829 | |||
830 | |||
831 | /* we will be doing 2 balancings and update 2 stat data, we change quotas | ||
832 | * of the owner of the directory and of the owner of the parent directory */ | ||
833 | jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); | ||
834 | |||
835 | reiserfs_write_lock(dir->i_sb); | ||
836 | retval = journal_begin(&th, dir->i_sb, jbegin_count) ; | ||
837 | if (retval) | ||
838 | goto out_rmdir; | ||
839 | |||
840 | de.de_gen_number_bit_string = NULL; | ||
841 | if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) { | ||
842 | retval = -ENOENT; | ||
843 | goto end_rmdir; | ||
844 | } else if ( retval == IO_ERROR) { | ||
845 | retval = -EIO; | ||
846 | goto end_rmdir; | ||
847 | } | ||
848 | |||
849 | inode = dentry->d_inode; | ||
850 | |||
851 | reiserfs_update_inode_transaction(inode) ; | ||
852 | reiserfs_update_inode_transaction(dir) ; | ||
853 | |||
854 | if (de.de_objectid != inode->i_ino) { | ||
855 | // FIXME: compare key of an object and a key found in the | ||
856 | // entry | ||
857 | retval = -EIO; | ||
858 | goto end_rmdir; | ||
859 | } | ||
860 | if (!reiserfs_empty_dir(inode)) { | ||
861 | retval = -ENOTEMPTY; | ||
862 | goto end_rmdir; | ||
863 | } | ||
864 | |||
865 | /* cut entry from dir directory */ | ||
866 | retval = reiserfs_cut_from_item (&th, &path, &(de.de_entry_key), dir, | ||
867 | NULL, /* page */ | ||
868 | 0/*new file size - not used here*/); | ||
869 | if (retval < 0) | ||
870 | goto end_rmdir; | ||
871 | |||
872 | if ( inode->i_nlink != 2 && inode->i_nlink != 1 ) | ||
873 | reiserfs_warning (inode->i_sb, "%s: empty directory has nlink " | ||
874 | "!= 2 (%d)", __FUNCTION__, inode->i_nlink); | ||
875 | |||
876 | inode->i_nlink = 0; | ||
877 | inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; | ||
878 | reiserfs_update_sd (&th, inode); | ||
879 | |||
880 | DEC_DIR_INODE_NLINK(dir) | ||
881 | dir->i_size -= (DEH_SIZE + de.de_entrylen); | ||
882 | reiserfs_update_sd (&th, dir); | ||
883 | |||
884 | /* prevent empty directory from getting lost */ | ||
885 | add_save_link (&th, inode, 0/* not truncate */); | ||
886 | |||
887 | retval = journal_end(&th, dir->i_sb, jbegin_count) ; | ||
888 | reiserfs_check_path(&path) ; | ||
889 | out_rmdir: | ||
890 | reiserfs_write_unlock(dir->i_sb); | ||
891 | return retval; | ||
892 | |||
893 | end_rmdir: | ||
894 | /* we must release path, because we did not call | ||
895 | reiserfs_cut_from_item, or reiserfs_cut_from_item does not | ||
896 | release path if operation was not complete */ | ||
897 | pathrelse (&path); | ||
898 | err = journal_end(&th, dir->i_sb, jbegin_count) ; | ||
899 | reiserfs_write_unlock(dir->i_sb); | ||
900 | return err ? err : retval; | ||
901 | } | ||
902 | |||
903 | static int reiserfs_unlink (struct inode * dir, struct dentry *dentry) | ||
904 | { | ||
905 | int retval, err; | ||
906 | struct inode * inode; | ||
907 | struct reiserfs_dir_entry de; | ||
908 | INITIALIZE_PATH (path); | ||
909 | struct reiserfs_transaction_handle th ; | ||
910 | int jbegin_count; | ||
911 | unsigned long savelink; | ||
912 | |||
913 | inode = dentry->d_inode; | ||
914 | |||
915 | /* in this transaction we can be doing at max two balancings and update | ||
916 | two stat datas, we change quotas of the owner of the directory and of | ||
917 | the owner of the parent directory */ | ||
918 | jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); | ||
919 | |||
920 | reiserfs_write_lock(dir->i_sb); | ||
921 | retval = journal_begin(&th, dir->i_sb, jbegin_count) ; | ||
922 | if (retval) | ||
923 | goto out_unlink; | ||
924 | |||
925 | de.de_gen_number_bit_string = NULL; | ||
926 | if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) { | ||
927 | retval = -ENOENT; | ||
928 | goto end_unlink; | ||
929 | } else if (retval == IO_ERROR) { | ||
930 | retval = -EIO; | ||
931 | goto end_unlink; | ||
932 | } | ||
933 | |||
934 | reiserfs_update_inode_transaction(inode) ; | ||
935 | reiserfs_update_inode_transaction(dir) ; | ||
936 | |||
937 | if (de.de_objectid != inode->i_ino) { | ||
938 | // FIXME: compare key of an object and a key found in the | ||
939 | // entry | ||
940 | retval = -EIO; | ||
941 | goto end_unlink; | ||
942 | } | ||
943 | |||
944 | if (!inode->i_nlink) { | ||
945 | reiserfs_warning (inode->i_sb, "%s: deleting nonexistent file " | ||
946 | "(%s:%lu), %d", __FUNCTION__, | ||
947 | reiserfs_bdevname (inode->i_sb), inode->i_ino, | ||
948 | inode->i_nlink); | ||
949 | inode->i_nlink = 1; | ||
950 | } | ||
951 | |||
952 | inode->i_nlink--; | ||
953 | |||
954 | /* | ||
955 | * we schedule before doing the add_save_link call, save the link | ||
956 | * count so we don't race | ||
957 | */ | ||
958 | savelink = inode->i_nlink; | ||
959 | |||
960 | |||
961 | retval = reiserfs_cut_from_item (&th, &path, &(de.de_entry_key), dir, NULL, 0); | ||
962 | if (retval < 0) { | ||
963 | inode->i_nlink++; | ||
964 | goto end_unlink; | ||
965 | } | ||
966 | inode->i_ctime = CURRENT_TIME_SEC; | ||
967 | reiserfs_update_sd (&th, inode); | ||
968 | |||
969 | dir->i_size -= (de.de_entrylen + DEH_SIZE); | ||
970 | dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; | ||
971 | reiserfs_update_sd (&th, dir); | ||
972 | |||
973 | if (!savelink) | ||
974 | /* prevent file from getting lost */ | ||
975 | add_save_link (&th, inode, 0/* not truncate */); | ||
976 | |||
977 | retval = journal_end(&th, dir->i_sb, jbegin_count) ; | ||
978 | reiserfs_check_path(&path) ; | ||
979 | reiserfs_write_unlock(dir->i_sb); | ||
980 | return retval; | ||
981 | |||
982 | end_unlink: | ||
983 | pathrelse (&path); | ||
984 | err = journal_end(&th, dir->i_sb, jbegin_count) ; | ||
985 | reiserfs_check_path(&path) ; | ||
986 | if (err) | ||
987 | retval = err; | ||
988 | out_unlink: | ||
989 | reiserfs_write_unlock(dir->i_sb); | ||
990 | return retval; | ||
991 | } | ||
992 | |||
993 | static int reiserfs_symlink (struct inode * parent_dir, | ||
994 | struct dentry * dentry, const char * symname) | ||
995 | { | ||
996 | int retval; | ||
997 | struct inode * inode; | ||
998 | char * name; | ||
999 | int item_len; | ||
1000 | struct reiserfs_transaction_handle th ; | ||
1001 | int mode = S_IFLNK | S_IRWXUGO; | ||
1002 | /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ | ||
1003 | int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); | ||
1004 | |||
1005 | if (!(inode = new_inode(parent_dir->i_sb))) { | ||
1006 | return -ENOMEM ; | ||
1007 | } | ||
1008 | new_inode_init(inode, parent_dir, mode); | ||
1009 | |||
1010 | reiserfs_write_lock(parent_dir->i_sb); | ||
1011 | item_len = ROUND_UP (strlen (symname)); | ||
1012 | if (item_len > MAX_DIRECT_ITEM_LEN (parent_dir->i_sb->s_blocksize)) { | ||
1013 | retval = -ENAMETOOLONG; | ||
1014 | drop_new_inode(inode); | ||
1015 | goto out_failed; | ||
1016 | } | ||
1017 | |||
1018 | name = reiserfs_kmalloc (item_len, GFP_NOFS, parent_dir->i_sb); | ||
1019 | if (!name) { | ||
1020 | drop_new_inode(inode); | ||
1021 | retval = -ENOMEM; | ||
1022 | goto out_failed; | ||
1023 | } | ||
1024 | memcpy (name, symname, strlen (symname)); | ||
1025 | padd_item (name, item_len, strlen (symname)); | ||
1026 | |||
1027 | /* We would inherit the default ACL here, but symlinks don't get ACLs */ | ||
1028 | |||
1029 | retval = journal_begin(&th, parent_dir->i_sb, jbegin_count) ; | ||
1030 | if (retval) { | ||
1031 | drop_new_inode (inode); | ||
1032 | reiserfs_kfree (name, item_len, parent_dir->i_sb); | ||
1033 | goto out_failed; | ||
1034 | } | ||
1035 | |||
1036 | retval = reiserfs_new_inode (&th, parent_dir, mode, name, strlen (symname), | ||
1037 | dentry, inode); | ||
1038 | reiserfs_kfree (name, item_len, parent_dir->i_sb); | ||
1039 | if (retval) { /* reiserfs_new_inode iputs for us */ | ||
1040 | goto out_failed; | ||
1041 | } | ||
1042 | |||
1043 | reiserfs_update_inode_transaction(inode) ; | ||
1044 | reiserfs_update_inode_transaction(parent_dir) ; | ||
1045 | |||
1046 | inode->i_op = &reiserfs_symlink_inode_operations; | ||
1047 | inode->i_mapping->a_ops = &reiserfs_address_space_operations; | ||
1048 | |||
1049 | // must be sure this inode is written with this transaction | ||
1050 | // | ||
1051 | //reiserfs_update_sd (&th, inode, READ_BLOCKS); | ||
1052 | |||
1053 | retval = reiserfs_add_entry (&th, parent_dir, dentry->d_name.name, | ||
1054 | dentry->d_name.len, inode, 1/*visible*/); | ||
1055 | if (retval) { | ||
1056 | int err; | ||
1057 | inode->i_nlink--; | ||
1058 | reiserfs_update_sd (&th, inode); | ||
1059 | err = journal_end(&th, parent_dir->i_sb, jbegin_count) ; | ||
1060 | if (err) | ||
1061 | retval = err; | ||
1062 | iput (inode); | ||
1063 | goto out_failed; | ||
1064 | } | ||
1065 | |||
1066 | d_instantiate(dentry, inode); | ||
1067 | retval = journal_end(&th, parent_dir->i_sb, jbegin_count) ; | ||
1068 | out_failed: | ||
1069 | reiserfs_write_unlock(parent_dir->i_sb); | ||
1070 | return retval; | ||
1071 | } | ||
1072 | |||
1073 | static int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct dentry * dentry) | ||
1074 | { | ||
1075 | int retval; | ||
1076 | struct inode *inode = old_dentry->d_inode; | ||
1077 | struct reiserfs_transaction_handle th ; | ||
1078 | /* We need blocks for transaction + update of quotas for the owners of the directory */ | ||
1079 | int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS; | ||
1080 | |||
1081 | reiserfs_write_lock(dir->i_sb); | ||
1082 | if (inode->i_nlink >= REISERFS_LINK_MAX) { | ||
1083 | //FIXME: sd_nlink is 32 bit for new files | ||
1084 | reiserfs_write_unlock(dir->i_sb); | ||
1085 | return -EMLINK; | ||
1086 | } | ||
1087 | if (inode->i_nlink == 0) { | ||
1088 | reiserfs_write_unlock(dir->i_sb); | ||
1089 | return -ENOENT; | ||
1090 | } | ||
1091 | |||
1092 | /* inc before scheduling so reiserfs_unlink knows we are here */ | ||
1093 | inode->i_nlink++; | ||
1094 | |||
1095 | retval = journal_begin(&th, dir->i_sb, jbegin_count) ; | ||
1096 | if (retval) { | ||
1097 | inode->i_nlink--; | ||
1098 | reiserfs_write_unlock (dir->i_sb); | ||
1099 | return retval; | ||
1100 | } | ||
1101 | |||
1102 | /* create new entry */ | ||
1103 | retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, | ||
1104 | inode, 1/*visible*/); | ||
1105 | |||
1106 | reiserfs_update_inode_transaction(inode) ; | ||
1107 | reiserfs_update_inode_transaction(dir) ; | ||
1108 | |||
1109 | if (retval) { | ||
1110 | int err; | ||
1111 | inode->i_nlink--; | ||
1112 | err = journal_end(&th, dir->i_sb, jbegin_count) ; | ||
1113 | reiserfs_write_unlock(dir->i_sb); | ||
1114 | return err ? err : retval; | ||
1115 | } | ||
1116 | |||
1117 | inode->i_ctime = CURRENT_TIME_SEC; | ||
1118 | reiserfs_update_sd (&th, inode); | ||
1119 | |||
1120 | atomic_inc(&inode->i_count) ; | ||
1121 | d_instantiate(dentry, inode); | ||
1122 | retval = journal_end(&th, dir->i_sb, jbegin_count) ; | ||
1123 | reiserfs_write_unlock(dir->i_sb); | ||
1124 | return retval; | ||
1125 | } | ||
1126 | |||
1127 | |||
1128 | // de contains information pointing to an entry which | ||
1129 | static int de_still_valid (const char * name, int len, struct reiserfs_dir_entry * de) | ||
1130 | { | ||
1131 | struct reiserfs_dir_entry tmp = *de; | ||
1132 | |||
1133 | // recalculate pointer to name and name length | ||
1134 | set_de_name_and_namelen (&tmp); | ||
1135 | // FIXME: could check more | ||
1136 | if (tmp.de_namelen != len || memcmp (name, de->de_name, len)) | ||
1137 | return 0; | ||
1138 | return 1; | ||
1139 | } | ||
1140 | |||
1141 | |||
1142 | static int entry_points_to_object (const char * name, int len, struct reiserfs_dir_entry * de, struct inode * inode) | ||
1143 | { | ||
1144 | if (!de_still_valid (name, len, de)) | ||
1145 | return 0; | ||
1146 | |||
1147 | if (inode) { | ||
1148 | if (!de_visible (de->de_deh + de->de_entry_num)) | ||
1149 | reiserfs_panic (NULL, "vs-7042: entry_points_to_object: entry must be visible"); | ||
1150 | return (de->de_objectid == inode->i_ino) ? 1 : 0; | ||
1151 | } | ||
1152 | |||
1153 | /* this must be added hidden entry */ | ||
1154 | if (de_visible (de->de_deh + de->de_entry_num)) | ||
1155 | reiserfs_panic (NULL, "vs-7043: entry_points_to_object: entry must be visible"); | ||
1156 | |||
1157 | return 1; | ||
1158 | } | ||
1159 | |||
1160 | |||
1161 | /* sets key of objectid the entry has to point to */ | ||
1162 | static void set_ino_in_dir_entry (struct reiserfs_dir_entry * de, struct reiserfs_key * key) | ||
1163 | { | ||
1164 | /* JDM These operations are endian safe - both are le */ | ||
1165 | de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id; | ||
1166 | de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid; | ||
1167 | } | ||
1168 | |||
1169 | |||
1170 | /* | ||
1171 | * process, that is going to call fix_nodes/do_balance must hold only | ||
1172 | * one path. If it holds 2 or more, it can get into endless waiting in | ||
1173 | * get_empty_nodes or its clones | ||
1174 | */ | ||
/*
 * reiserfs_rename - move the entry old_dentry (in old_dir) to new_dentry
 * (in new_dir).
 *
 * Outline, as visible in the code below:
 *   1. Under the reiserfs write lock, look the old name up and verify it
 *      still refers to old_dentry->d_inode.  When a directory is renamed,
 *      also require an existing target to be empty and the directory's
 *      ".." entry to name old_dir.
 *   2. Begin a transaction and add the new name; -EEXIST is accepted when
 *      new_dentry already has an inode (and is a panic when it has not).
 *   3. Loop: re-find the old entry, the new entry and (for directories)
 *      "..", prepare their buffers for the journal, and start over if any
 *      item moved or an entry no longer matches.
 *   4. Make the new entry visible and point it at old_inode, hide the old
 *      entry, update times and link counts, repoint "..", and cut the old
 *      entry out of old_dir.
 *   5. Update stat data, add a save link when the replaced object's link
 *      count reached 0, and end the transaction.
 *
 * Returns the journal_end() result on the success path; otherwise -EIO,
 * -ENOENT, -ENOTEMPTY, or the error reported by journal_begin() /
 * reiserfs_add_entry().
 */
1175 | static int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry, | ||
1176 | struct inode * new_dir, struct dentry *new_dentry) | ||
1177 | { | ||
1178 | int retval; | ||
1179 | INITIALIZE_PATH (old_entry_path); | ||
1180 | INITIALIZE_PATH (new_entry_path); | ||
1181 | INITIALIZE_PATH (dot_dot_entry_path); | ||
1182 | struct item_head new_entry_ih, old_entry_ih, dot_dot_ih ; | ||
1183 | struct reiserfs_dir_entry old_de, new_de, dot_dot_de; | ||
1184 | struct inode * old_inode, * new_dentry_inode; | ||
1185 | struct reiserfs_transaction_handle th ; | ||
1186 | int jbegin_count ; | ||
1187 | umode_t old_inode_mode; | ||
/* savelink stays non-zero unless a replaced target exists and its link
   count drops to 0; only then is add_save_link() called near the end */
1188 | unsigned long savelink = 1; | ||
1189 | struct timespec ctime; | ||
1190 | |||
1191 | /* three balancings: (1) old name removal, (2) new name insertion | ||
1192 | and (3) maybe "save" link insertion | ||
1193 | stat data updates: (1) old directory, | ||
1194 | (2) new directory and (3) maybe old object stat data (when it is | ||
1195 | directory) and (4) maybe stat data of object to which new entry | ||
1196 | pointed initially and (5) maybe block containing ".." of | ||
1197 | renamed directory | ||
1198 | quota updates: two parent directories */ | ||
1199 | jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS; | ||
1200 | |||
1201 | old_inode = old_dentry->d_inode; | ||
1202 | new_dentry_inode = new_dentry->d_inode; | ||
1203 | |||
1204 | // make sure, that oldname still exists and points to an object we | ||
1205 | // are going to rename | ||
1206 | old_de.de_gen_number_bit_string = NULL; | ||
1207 | reiserfs_write_lock(old_dir->i_sb); | ||
1208 | retval = reiserfs_find_entry (old_dir, old_dentry->d_name.name, old_dentry->d_name.len, | ||
1209 | &old_entry_path, &old_de); | ||
/* the path is dropped right away; only old_de (including its entry key)
   is kept and used to re-find the entry inside the loop below */
1210 | pathrelse (&old_entry_path); | ||
1211 | if (retval == IO_ERROR) { | ||
1212 | reiserfs_write_unlock(old_dir->i_sb); | ||
1213 | return -EIO; | ||
1214 | } | ||
1215 | |||
1216 | if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) { | ||
1217 | reiserfs_write_unlock(old_dir->i_sb); | ||
1218 | return -ENOENT; | ||
1219 | } | ||
1220 | |||
1221 | old_inode_mode = old_inode->i_mode; | ||
1222 | if (S_ISDIR(old_inode_mode)) { | ||
1223 | // make sure, that directory being renamed has correct ".." | ||
1224 | // and that its new parent directory has not too many links | ||
1225 | // already | ||
/* NOTE(review): no link-count limit check on new_dir is visible in this
   function, despite the comment above - confirm it is handled elsewhere */
1226 | |||
1227 | if (new_dentry_inode) { | ||
1228 | if (!reiserfs_empty_dir(new_dentry_inode)) { | ||
1229 | reiserfs_write_unlock(old_dir->i_sb); | ||
1230 | return -ENOTEMPTY; | ||
1231 | } | ||
1232 | } | ||
1233 | |||
1234 | /* directory is renamed, its parent directory will be changed, | ||
1235 | ** so find ".." entry | ||
1236 | */ | ||
1237 | dot_dot_de.de_gen_number_bit_string = NULL; | ||
1238 | retval = reiserfs_find_entry (old_inode, "..", 2, &dot_dot_entry_path, &dot_dot_de); | ||
1239 | pathrelse (&dot_dot_entry_path); | ||
1240 | if (retval != NAME_FOUND) { | ||
1241 | reiserfs_write_unlock(old_dir->i_sb); | ||
1242 | return -EIO; | ||
1243 | } | ||
1244 | |||
1245 | /* inode number of .. must equal old_dir->i_ino */ | ||
1246 | if (dot_dot_de.de_objectid != old_dir->i_ino) { | ||
1247 | reiserfs_write_unlock(old_dir->i_sb); | ||
1248 | return -EIO; | ||
1249 | } | ||
1250 | } | ||
1251 | |||
1252 | retval = journal_begin(&th, old_dir->i_sb, jbegin_count) ; | ||
1253 | if (retval) { | ||
1254 | reiserfs_write_unlock (old_dir->i_sb); | ||
1255 | return retval; | ||
1256 | } | ||
1257 | |||
1258 | /* add new entry (or find the existing one) */ | ||
/* last argument 0: presumably "not visible" (reiserfs_link passes 1 with a
   "visible" remark); the entry is made visible by mark_de_visible() below */
1259 | retval = reiserfs_add_entry (&th, new_dir, new_dentry->d_name.name, new_dentry->d_name.len, | ||
1260 | old_inode, 0); | ||
1261 | if (retval == -EEXIST) { | ||
1262 | if (!new_dentry_inode) { | ||
1263 | reiserfs_panic (old_dir->i_sb, | ||
1264 | "vs-7050: new entry is found, new inode == 0\n"); | ||
1265 | } | ||
1266 | } else if (retval) { | ||
1267 | int err = journal_end(&th, old_dir->i_sb, jbegin_count) ; | ||
1268 | reiserfs_write_unlock(old_dir->i_sb); | ||
1269 | return err ? err : retval; | ||
1270 | } | ||
1271 | |||
1272 | reiserfs_update_inode_transaction(old_dir) ; | ||
1273 | reiserfs_update_inode_transaction(new_dir) ; | ||
1274 | |||
1275 | /* this makes it so an fsync on an open fd for the old name will | ||
1276 | ** commit the rename operation | ||
1277 | */ | ||
1278 | reiserfs_update_inode_transaction(old_inode) ; | ||
1279 | |||
1280 | if (new_dentry_inode) | ||
1281 | reiserfs_update_inode_transaction(new_dentry_inode) ; | ||
1282 | |||
/* retry loop: leaves via break once all entries are located, their buffers
   prepared and nothing has moved; every failure exit inside the loop ends
   the transaction (its result is ignored) and returns -EIO */
1283 | while (1) { | ||
1284 | // look for old name using corresponding entry key (found by reiserfs_find_entry) | ||
1285 | if ((retval = search_by_entry_key (new_dir->i_sb, &old_de.de_entry_key, | ||
1286 | &old_entry_path, &old_de)) != NAME_FOUND) { | ||
1287 | pathrelse(&old_entry_path); | ||
1288 | journal_end(&th, old_dir->i_sb, jbegin_count); | ||
1289 | reiserfs_write_unlock(old_dir->i_sb); | ||
1290 | return -EIO; | ||
1291 | } | ||
1292 | |||
/* remember the item head so item_moved() can detect a shift later */
1293 | copy_item_head(&old_entry_ih, get_ih(&old_entry_path)) ; | ||
1294 | |||
1295 | reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1) ; | ||
1296 | |||
1297 | // look for new name by reiserfs_find_entry | ||
1298 | new_de.de_gen_number_bit_string = NULL; | ||
1299 | retval = reiserfs_find_entry (new_dir, new_dentry->d_name.name, new_dentry->d_name.len, | ||
1300 | &new_entry_path, &new_de); | ||
1301 | // reiserfs_add_entry should not return IO_ERROR, because it is called with essentially same parameters from | ||
1302 | // reiserfs_add_entry above, and we'll catch any i/o errors before we get here. | ||
1303 | if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) { | ||
1304 | pathrelse(&new_entry_path); | ||
1305 | pathrelse(&old_entry_path); | ||
1306 | journal_end(&th, old_dir->i_sb, jbegin_count); | ||
1307 | reiserfs_write_unlock(old_dir->i_sb); | ||
1308 | return -EIO; | ||
1309 | } | ||
1310 | |||
1311 | copy_item_head(&new_entry_ih, get_ih(&new_entry_path)) ; | ||
1312 | |||
1313 | reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1) ; | ||
1314 | |||
/* NOTE(review): this test re-reads old_inode->i_mode while every other
   directory test in the function uses the cached old_inode_mode */
1315 | if (S_ISDIR(old_inode->i_mode)) { | ||
1316 | if ((retval = search_by_entry_key (new_dir->i_sb, &dot_dot_de.de_entry_key, | ||
1317 | &dot_dot_entry_path, &dot_dot_de)) != NAME_FOUND) { | ||
1318 | pathrelse(&dot_dot_entry_path); | ||
1319 | pathrelse(&new_entry_path); | ||
1320 | pathrelse(&old_entry_path); | ||
1321 | journal_end(&th, old_dir->i_sb, jbegin_count); | ||
1322 | reiserfs_write_unlock(old_dir->i_sb); | ||
1323 | return -EIO; | ||
1324 | } | ||
1325 | copy_item_head(&dot_dot_ih, get_ih(&dot_dot_entry_path)) ; | ||
1326 | // node containing ".." gets into transaction | ||
1327 | reiserfs_prepare_for_journal(old_inode->i_sb, dot_dot_de.de_bh, 1) ; | ||
1328 | } | ||
1329 | /* we should check seals here, not do | ||
1330 | this stuff, yes? Then, having | ||
1331 | gathered everything into RAM we | ||
1332 | should lock the buffers, yes? -Hans */ | ||
1333 | /* probably. our rename needs to hold more | ||
1334 | ** than one path at once. The seals would | ||
1335 | ** have to be written to deal with multi-path | ||
1336 | ** issues -chris | ||
1337 | */ | ||
1338 | /* sanity checking before doing the rename - avoid races many | ||
1339 | ** of the above checks could have scheduled. We have to be | ||
1340 | ** sure our items haven't been shifted by another process. | ||
1341 | */ | ||
1342 | if (item_moved(&new_entry_ih, &new_entry_path) || | ||
1343 | !entry_points_to_object(new_dentry->d_name.name, | ||
1344 | new_dentry->d_name.len, | ||
1345 | &new_de, new_dentry_inode) || | ||
1346 | item_moved(&old_entry_ih, &old_entry_path) || | ||
1347 | !entry_points_to_object (old_dentry->d_name.name, | ||
1348 | old_dentry->d_name.len, | ||
1349 | &old_de, old_inode)) { | ||
/* something changed underneath us: undo the journal preparation of the
   buffers and search again from the top of the loop */
1350 | reiserfs_restore_prepared_buffer (old_inode->i_sb, new_de.de_bh); | ||
1351 | reiserfs_restore_prepared_buffer (old_inode->i_sb, old_de.de_bh); | ||
1352 | if (S_ISDIR(old_inode_mode)) | ||
1353 | reiserfs_restore_prepared_buffer (old_inode->i_sb, dot_dot_de.de_bh); | ||
1354 | continue; | ||
1355 | } | ||
1356 | if (S_ISDIR(old_inode_mode)) { | ||
1357 | if ( item_moved(&dot_dot_ih, &dot_dot_entry_path) || | ||
1358 | !entry_points_to_object ( "..", 2, &dot_dot_de, old_dir) ) { | ||
1359 | reiserfs_restore_prepared_buffer (old_inode->i_sb, old_de.de_bh); | ||
1360 | reiserfs_restore_prepared_buffer (old_inode->i_sb, new_de.de_bh); | ||
1361 | reiserfs_restore_prepared_buffer (old_inode->i_sb, dot_dot_de.de_bh); | ||
1362 | continue; | ||
1363 | } | ||
1364 | } | ||
1365 | |||
1366 | RFALSE( S_ISDIR(old_inode_mode) && | ||
1367 | !buffer_journal_prepared(dot_dot_de.de_bh), "" ); | ||
1368 | |||
1369 | break; | ||
1370 | } | ||
1371 | |||
1372 | /* ok, all the changes can be done in one fell swoop when we | ||
1373 | have claimed all the buffers needed.*/ | ||
1374 | |||
1375 | mark_de_visible (new_de.de_deh + new_de.de_entry_num); | ||
1376 | set_ino_in_dir_entry (&new_de, INODE_PKEY (old_inode)); | ||
1377 | journal_mark_dirty (&th, old_dir->i_sb, new_de.de_bh); | ||
1378 | |||
1379 | mark_de_hidden (old_de.de_deh + old_de.de_entry_num); | ||
1380 | journal_mark_dirty (&th, old_dir->i_sb, old_de.de_bh); | ||
/* one timestamp is taken and applied to both directories, the renamed
   object and (if present) the replaced object */
1381 | ctime = CURRENT_TIME_SEC; | ||
1382 | old_dir->i_ctime = old_dir->i_mtime = ctime; | ||
1383 | new_dir->i_ctime = new_dir->i_mtime = ctime; | ||
1384 | /* thanks to Alex Adriaanse <alex_a@caltech.edu> for patch which adds ctime update of | ||
1385 | renamed object */ | ||
1386 | old_inode->i_ctime = ctime; | ||
1387 | |||
1388 | if (new_dentry_inode) { | ||
1389 | // adjust link number of the victim | ||
1390 | if (S_ISDIR(new_dentry_inode->i_mode)) { | ||
1391 | new_dentry_inode->i_nlink = 0; | ||
1392 | } else { | ||
1393 | new_dentry_inode->i_nlink--; | ||
1394 | } | ||
1395 | new_dentry_inode->i_ctime = ctime; | ||
1396 | savelink = new_dentry_inode->i_nlink; | ||
1397 | } | ||
1398 | |||
1399 | if (S_ISDIR(old_inode_mode)) { | ||
1400 | // adjust ".." of renamed directory | ||
1401 | set_ino_in_dir_entry (&dot_dot_de, INODE_PKEY (new_dir)); | ||
1402 | journal_mark_dirty (&th, new_dir->i_sb, dot_dot_de.de_bh); | ||
1403 | |||
1404 | if (!new_dentry_inode) | ||
1405 | /* there (in new_dir) was no directory, so it got new link | ||
1406 | (".." of renamed directory) */ | ||
1407 | INC_DIR_INODE_NLINK(new_dir); | ||
1408 | |||
1409 | /* old directory lost one link - ".. " of renamed directory */ | ||
1410 | DEC_DIR_INODE_NLINK(old_dir); | ||
1411 | } | ||
1412 | |||
1413 | // looks like in 2.3.99pre3 brelse is atomic. so we can use pathrelse | ||
1414 | pathrelse (&new_entry_path); | ||
1415 | pathrelse (&dot_dot_entry_path); | ||
1416 | |||
1417 | // FIXME: this reiserfs_cut_from_item's return value may screw up | ||
1418 | // anybody, but it will panic if will not be able to find the | ||
1419 | // entry. This needs one more clean up | ||
/* a failed cut is only logged; the rename still proceeds and old_dir's
   size is reduced regardless */
1420 | if (reiserfs_cut_from_item (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL, 0) < 0) | ||
1421 | reiserfs_warning (old_dir->i_sb, "vs-7060: reiserfs_rename: couldn't not cut old name. Fsck later?"); | ||
1422 | |||
1423 | old_dir->i_size -= DEH_SIZE + old_de.de_entrylen; | ||
1424 | |||
1425 | reiserfs_update_sd (&th, old_dir); | ||
1426 | reiserfs_update_sd (&th, new_dir); | ||
1427 | reiserfs_update_sd (&th, old_inode); | ||
1428 | |||
1429 | if (new_dentry_inode) { | ||
1430 | if (savelink == 0) | ||
1431 | add_save_link (&th, new_dentry_inode, 0/* not truncate */); | ||
1432 | reiserfs_update_sd (&th, new_dentry_inode); | ||
1433 | } | ||
1434 | |||
1435 | retval = journal_end(&th, old_dir->i_sb, jbegin_count) ; | ||
1436 | reiserfs_write_unlock(old_dir->i_sb); | ||
1437 | return retval; | ||
1438 | } | ||
1439 | |||
1440 | /* | ||
1441 | * directories can handle most operations... | ||
1442 | */ | ||
1443 | struct inode_operations reiserfs_dir_inode_operations = { | ||
1444 | //&reiserfs_dir_operations, /* default_file_ops */ | ||
1445 | .create = reiserfs_create, | ||
1446 | .lookup = reiserfs_lookup, | ||
1447 | .link = reiserfs_link, | ||
1448 | .unlink = reiserfs_unlink, | ||
1449 | .symlink = reiserfs_symlink, | ||
1450 | .mkdir = reiserfs_mkdir, | ||
1451 | .rmdir = reiserfs_rmdir, | ||
1452 | .mknod = reiserfs_mknod, | ||
1453 | .rename = reiserfs_rename, | ||
1454 | .setattr = reiserfs_setattr, | ||
1455 | .setxattr = reiserfs_setxattr, | ||
1456 | .getxattr = reiserfs_getxattr, | ||
1457 | .listxattr = reiserfs_listxattr, | ||
1458 | .removexattr = reiserfs_removexattr, | ||
1459 | .permission = reiserfs_permission, | ||
1460 | }; | ||
1461 | |||
1462 | /* | ||
1463 | * symlink operations.. same as page_symlink_inode_operations, with xattr | ||
1464 | * stuff added | ||
1465 | */ | ||
1466 | struct inode_operations reiserfs_symlink_inode_operations = { | ||
1467 | .readlink = generic_readlink, | ||
1468 | .follow_link = page_follow_link_light, | ||
1469 | .put_link = page_put_link, | ||
1470 | .setattr = reiserfs_setattr, | ||
1471 | .setxattr = reiserfs_setxattr, | ||
1472 | .getxattr = reiserfs_getxattr, | ||
1473 | .listxattr = reiserfs_listxattr, | ||
1474 | .removexattr = reiserfs_removexattr, | ||
1475 | .permission = reiserfs_permission, | ||
1476 | |||
1477 | }; | ||
1478 | |||
1479 | |||
1480 | /* | ||
1481 | * special file operations.. just xattr/acl stuff | ||
1482 | */ | ||
1483 | struct inode_operations reiserfs_special_inode_operations = { | ||
1484 | .setattr = reiserfs_setattr, | ||
1485 | .setxattr = reiserfs_setxattr, | ||
1486 | .getxattr = reiserfs_getxattr, | ||
1487 | .listxattr = reiserfs_listxattr, | ||
1488 | .removexattr = reiserfs_removexattr, | ||
1489 | .permission = reiserfs_permission, | ||
1490 | |||
1491 | }; | ||
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c new file mode 100644 index 000000000000..0785c43a7486 --- /dev/null +++ b/fs/reiserfs/objectid.c | |||
@@ -0,0 +1,206 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | */ | ||
4 | |||
5 | #include <linux/config.h> | ||
6 | #include <linux/string.h> | ||
7 | #include <linux/random.h> | ||
8 | #include <linux/time.h> | ||
9 | #include <linux/reiserfs_fs.h> | ||
10 | #include <linux/reiserfs_fs_sb.h> | ||
11 | |||
/*
 * Locate the start of the objectid map: it begins right after the on-disk
 * super block, whose size depends on whether the filesystem uses the old
 * (v1) super block layout.
 */
#define objectid_map(s,rs) \
    (old_format_only (s) \
        ? (__u32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) \
        : (__u32 *)((rs) + 1))
16 | |||
17 | |||
18 | #ifdef CONFIG_REISERFS_CHECK | ||
19 | |||
20 | static void check_objectid_map (struct super_block * s, __u32 * map) | ||
21 | { | ||
22 | if (le32_to_cpu (map[0]) != 1) | ||
23 | reiserfs_panic (s, "vs-15010: check_objectid_map: map corrupted: %lx", | ||
24 | ( long unsigned int ) le32_to_cpu (map[0])); | ||
25 | |||
26 | // FIXME: add something else here | ||
27 | } | ||
28 | |||
29 | #else | ||
30 | static void check_objectid_map (struct super_block * s, __u32 * map) | ||
31 | {;} | ||
32 | #endif | ||
33 | |||
34 | |||
35 | /* When we allocate objectids we allocate the first unused objectid. | ||
36 | Each sequence of objectids in use (the odd sequences) is followed | ||
37 | by a sequence of objectids not in use (the even sequences). We | ||
38 | only need to record the last objectid in each of these sequences | ||
39 | (both the odd and even sequences) in order to fully define the | ||
40 | boundaries of the sequences. A consequence of allocating the first | ||
41 | objectid not in use is that under most conditions this scheme is | ||
42 | extremely compact. The exception is immediately after a sequence | ||
43 | of operations which deletes a large number of objects of | ||
44 | non-sequential objectids, and even then it will become compact | ||
45 | again as soon as more objects are created. Note that many | ||
46 | interesting optimizations of layout could result from complicating | ||
47 | objectid assignment, but we have deferred making them for now. */ | ||
48 | |||
49 | |||
50 | /* get unique object identifier */ | ||
51 | __u32 reiserfs_get_unused_objectid (struct reiserfs_transaction_handle *th) | ||
52 | { | ||
53 | struct super_block * s = th->t_super; | ||
54 | struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); | ||
55 | __u32 * map = objectid_map (s, rs); | ||
56 | __u32 unused_objectid; | ||
57 | |||
58 | BUG_ON (!th->t_trans_id); | ||
59 | |||
60 | check_objectid_map (s, map); | ||
61 | |||
62 | reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; | ||
63 | /* comment needed -Hans */ | ||
64 | unused_objectid = le32_to_cpu (map[1]); | ||
65 | if (unused_objectid == U32_MAX) { | ||
66 | reiserfs_warning (s, "%s: no more object ids", __FUNCTION__); | ||
67 | reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s)) ; | ||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | /* This incrementation allocates the first unused objectid. That | ||
72 | is to say, the first entry on the objectid map is the first | ||
73 | unused objectid, and by incrementing it we use it. See below | ||
74 | where we check to see if we eliminated a sequence of unused | ||
75 | objectids.... */ | ||
76 | map[1] = cpu_to_le32 (unused_objectid + 1); | ||
77 | |||
78 | /* Now we check to see if we eliminated the last remaining member of | ||
79 | the first even sequence (and can eliminate the sequence by | ||
80 | eliminating its last objectid from oids), and can collapse the | ||
81 | first two odd sequences into one sequence. If so, then the net | ||
82 | result is to eliminate a pair of objectids from oids. We do this | ||
83 | by shifting the entire map to the left. */ | ||
84 | if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) { | ||
85 | memmove (map + 1, map + 3, (sb_oid_cursize(rs) - 3) * sizeof(__u32)); | ||
86 | set_sb_oid_cursize( rs, sb_oid_cursize(rs) - 2 ); | ||
87 | } | ||
88 | |||
89 | journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); | ||
90 | return unused_objectid; | ||
91 | } | ||
92 | |||
93 | |||
94 | /* makes object identifier unused */ | ||
95 | void reiserfs_release_objectid (struct reiserfs_transaction_handle *th, | ||
96 | __u32 objectid_to_release) | ||
97 | { | ||
98 | struct super_block * s = th->t_super; | ||
99 | struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); | ||
100 | __u32 * map = objectid_map (s, rs); | ||
101 | int i = 0; | ||
102 | |||
103 | BUG_ON (!th->t_trans_id); | ||
104 | //return; | ||
105 | check_objectid_map (s, map); | ||
106 | |||
107 | reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; | ||
108 | journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); | ||
109 | |||
110 | /* start at the beginning of the objectid map (i = 0) and go to | ||
111 | the end of it (i = disk_sb->s_oid_cursize). Linear search is | ||
112 | what we use, though it is possible that binary search would be | ||
113 | more efficient after performing lots of deletions (which is | ||
114 | when oids is large.) We only check even i's. */ | ||
115 | while (i < sb_oid_cursize(rs)) { | ||
116 | if (objectid_to_release == le32_to_cpu (map[i])) { | ||
117 | /* This incrementation unallocates the objectid. */ | ||
118 | //map[i]++; | ||
119 | map[i] = cpu_to_le32 (le32_to_cpu (map[i]) + 1); | ||
120 | |||
121 | /* Did we unallocate the last member of an odd sequence, and can shrink oids? */ | ||
122 | if (map[i] == map[i+1]) { | ||
123 | /* shrink objectid map */ | ||
124 | memmove (map + i, map + i + 2, | ||
125 | (sb_oid_cursize(rs) - i - 2) * sizeof (__u32)); | ||
126 | //disk_sb->s_oid_cursize -= 2; | ||
127 | set_sb_oid_cursize( rs, sb_oid_cursize(rs) - 2 ); | ||
128 | |||
129 | RFALSE( sb_oid_cursize(rs) < 2 || | ||
130 | sb_oid_cursize(rs) > sb_oid_maxsize(rs), | ||
131 | "vs-15005: objectid map corrupted cur_size == %d (max == %d)", | ||
132 | sb_oid_cursize(rs), sb_oid_maxsize(rs)); | ||
133 | } | ||
134 | return; | ||
135 | } | ||
136 | |||
137 | if (objectid_to_release > le32_to_cpu (map[i]) && | ||
138 | objectid_to_release < le32_to_cpu (map[i + 1])) { | ||
139 | /* size of objectid map is not changed */ | ||
140 | if (objectid_to_release + 1 == le32_to_cpu (map[i + 1])) { | ||
141 | //objectid_map[i+1]--; | ||
142 | map[i + 1] = cpu_to_le32 (le32_to_cpu (map[i + 1]) - 1); | ||
143 | return; | ||
144 | } | ||
145 | |||
146 | /* JDM comparing two little-endian values for equality -- safe */ | ||
147 | if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) { | ||
148 | /* objectid map must be expanded, but there is no space */ | ||
149 | PROC_INFO_INC( s, leaked_oid ); | ||
150 | return; | ||
151 | } | ||
152 | |||
153 | /* expand the objectid map*/ | ||
154 | memmove (map + i + 3, map + i + 1, | ||
155 | (sb_oid_cursize(rs) - i - 1) * sizeof(__u32)); | ||
156 | map[i + 1] = cpu_to_le32 (objectid_to_release); | ||
157 | map[i + 2] = cpu_to_le32 (objectid_to_release + 1); | ||
158 | set_sb_oid_cursize( rs, sb_oid_cursize(rs) + 2 ); | ||
159 | return; | ||
160 | } | ||
161 | i += 2; | ||
162 | } | ||
163 | |||
164 | reiserfs_warning (s, "vs-15011: reiserfs_release_objectid: tried to free free object id (%lu)", | ||
165 | ( long unsigned ) objectid_to_release); | ||
166 | } | ||
167 | |||
168 | |||
169 | int reiserfs_convert_objectid_map_v1(struct super_block *s) { | ||
170 | struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK (s); | ||
171 | int cur_size = sb_oid_cursize(disk_sb); | ||
172 | int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2 ; | ||
173 | int old_max = sb_oid_maxsize(disk_sb); | ||
174 | struct reiserfs_super_block_v1 *disk_sb_v1 ; | ||
175 | __u32 *objectid_map, *new_objectid_map ; | ||
176 | int i ; | ||
177 | |||
178 | disk_sb_v1=(struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data); | ||
179 | objectid_map = (__u32 *)(disk_sb_v1 + 1) ; | ||
180 | new_objectid_map = (__u32 *)(disk_sb + 1) ; | ||
181 | |||
182 | if (cur_size > new_size) { | ||
183 | /* mark everyone used that was listed as free at the end of the objectid | ||
184 | ** map | ||
185 | */ | ||
186 | objectid_map[new_size - 1] = objectid_map[cur_size - 1] ; | ||
187 | set_sb_oid_cursize(disk_sb,new_size) ; | ||
188 | } | ||
189 | /* move the smaller objectid map past the end of the new super */ | ||
190 | for (i = new_size - 1 ; i >= 0 ; i--) { | ||
191 | objectid_map[i + (old_max - new_size)] = objectid_map[i] ; | ||
192 | } | ||
193 | |||
194 | |||
195 | /* set the max size so we don't overflow later */ | ||
196 | set_sb_oid_maxsize(disk_sb,new_size) ; | ||
197 | |||
198 | /* Zero out label and generate random UUID */ | ||
199 | memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label)) ; | ||
200 | generate_random_uuid(disk_sb->s_uuid); | ||
201 | |||
202 | /* finally, zero out the unused chunk of the new super */ | ||
203 | memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused)) ; | ||
204 | return 0 ; | ||
205 | } | ||
206 | |||
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c new file mode 100644 index 000000000000..16fdca1d4bd7 --- /dev/null +++ b/fs/reiserfs/prints.c | |||
@@ -0,0 +1,727 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | */ | ||
4 | |||
5 | #include <linux/config.h> | ||
6 | #include <linux/time.h> | ||
7 | #include <linux/fs.h> | ||
8 | #include <linux/reiserfs_fs.h> | ||
9 | #include <linux/string.h> | ||
10 | #include <linux/buffer_head.h> | ||
11 | |||
12 | #include <stdarg.h> | ||
13 | |||
/*
 * File-scope scratch buffers for the formatting helpers below:
 *   error_buf - receives the fully expanded message (prepare_error_buf),
 *   fmt_buf   - writable copy of the caller's format string
 *               (prepare_error_buf),
 *   off_buf   - result buffer returned by reiserfs_cpu_offset() and
 *               le_offset(); overwritten by every call.
 * NOTE(review): no locking around these buffers is visible in this part of
 * the file - confirm how concurrent callers are serialised.
 */
14 | static char error_buf[1024]; | ||
15 | static char fmt_buf[1024]; | ||
16 | static char off_buf[80]; | ||
17 | |||
18 | |||
19 | static char * reiserfs_cpu_offset (struct cpu_key * key) | ||
20 | { | ||
21 | if (cpu_key_k_type(key) == TYPE_DIRENTRY) | ||
22 | sprintf (off_buf, "%Lu(%Lu)", | ||
23 | (unsigned long long)GET_HASH_VALUE (cpu_key_k_offset (key)), | ||
24 | (unsigned long long)GET_GENERATION_NUMBER (cpu_key_k_offset (key))); | ||
25 | else | ||
26 | sprintf (off_buf, "0x%Lx", (unsigned long long)cpu_key_k_offset (key)); | ||
27 | return off_buf; | ||
28 | } | ||
29 | |||
30 | |||
31 | static char * le_offset (struct reiserfs_key * key) | ||
32 | { | ||
33 | int version; | ||
34 | |||
35 | version = le_key_version (key); | ||
36 | if (le_key_k_type (version, key) == TYPE_DIRENTRY) | ||
37 | sprintf (off_buf, "%Lu(%Lu)", | ||
38 | (unsigned long long)GET_HASH_VALUE (le_key_k_offset (version, key)), | ||
39 | (unsigned long long)GET_GENERATION_NUMBER (le_key_k_offset (version, key))); | ||
40 | else | ||
41 | sprintf (off_buf, "0x%Lx", (unsigned long long)le_key_k_offset (version, key)); | ||
42 | return off_buf; | ||
43 | } | ||
44 | |||
45 | |||
46 | static char * cpu_type (struct cpu_key * key) | ||
47 | { | ||
48 | if (cpu_key_k_type (key) == TYPE_STAT_DATA) | ||
49 | return "SD"; | ||
50 | if (cpu_key_k_type (key) == TYPE_DIRENTRY) | ||
51 | return "DIR"; | ||
52 | if (cpu_key_k_type (key) == TYPE_DIRECT) | ||
53 | return "DIRECT"; | ||
54 | if (cpu_key_k_type (key) == TYPE_INDIRECT) | ||
55 | return "IND"; | ||
56 | return "UNKNOWN"; | ||
57 | } | ||
58 | |||
59 | |||
60 | static char * le_type (struct reiserfs_key * key) | ||
61 | { | ||
62 | int version; | ||
63 | |||
64 | version = le_key_version (key); | ||
65 | |||
66 | if (le_key_k_type (version, key) == TYPE_STAT_DATA) | ||
67 | return "SD"; | ||
68 | if (le_key_k_type (version, key) == TYPE_DIRENTRY) | ||
69 | return "DIR"; | ||
70 | if (le_key_k_type (version, key) == TYPE_DIRECT) | ||
71 | return "DIRECT"; | ||
72 | if (le_key_k_type (version, key) == TYPE_INDIRECT) | ||
73 | return "IND"; | ||
74 | return "UNKNOWN"; | ||
75 | } | ||
76 | |||
77 | |||
78 | /* %k */ | ||
79 | static void sprintf_le_key (char * buf, struct reiserfs_key * key) | ||
80 | { | ||
81 | if (key) | ||
82 | sprintf (buf, "[%d %d %s %s]", le32_to_cpu (key->k_dir_id), | ||
83 | le32_to_cpu (key->k_objectid), le_offset (key), le_type (key)); | ||
84 | else | ||
85 | sprintf (buf, "[NULL]"); | ||
86 | } | ||
87 | |||
88 | |||
89 | /* %K */ | ||
90 | static void sprintf_cpu_key (char * buf, struct cpu_key * key) | ||
91 | { | ||
92 | if (key) | ||
93 | sprintf (buf, "[%d %d %s %s]", key->on_disk_key.k_dir_id, | ||
94 | key->on_disk_key.k_objectid, reiserfs_cpu_offset (key), | ||
95 | cpu_type (key)); | ||
96 | else | ||
97 | sprintf (buf, "[NULL]"); | ||
98 | } | ||
99 | |||
/*
 * %a: print the fields of a directory entry head (offset, dir_id,
 * objectid, location, state) into buf, or "[NULL]" for a NULL pointer.
 */
static void sprintf_de_head( char *buf, struct reiserfs_de_head *deh )
{
    if (!deh) {
        sprintf (buf, "[NULL]");
        return;
    }

    sprintf (buf, "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]",
             deh_offset (deh),
             deh_dir_id (deh),
             deh_objectid (deh),
             deh_location (deh),
             deh_state (deh));
}
109 | |||
110 | static void sprintf_item_head (char * buf, struct item_head * ih) | ||
111 | { | ||
112 | if (ih) { | ||
113 | strcpy (buf, (ih_version (ih) == KEY_FORMAT_3_6) ? "*3.6* " : "*3.5*"); | ||
114 | sprintf_le_key (buf + strlen (buf), &(ih->ih_key)); | ||
115 | sprintf (buf + strlen (buf), ", item_len %d, item_location %d, " | ||
116 | "free_space(entry_count) %d", | ||
117 | ih_item_len(ih), ih_location(ih), ih_free_space (ih)); | ||
118 | } else | ||
119 | sprintf (buf, "[NULL]"); | ||
120 | } | ||
121 | |||
122 | |||
123 | static void sprintf_direntry (char * buf, struct reiserfs_dir_entry * de) | ||
124 | { | ||
125 | char name[20]; | ||
126 | |||
127 | memcpy (name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen); | ||
128 | name [de->de_namelen > 19 ? 19 : de->de_namelen] = 0; | ||
129 | sprintf (buf, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid); | ||
130 | } | ||
131 | |||
132 | |||
/*
 * %z: print the block head of the tree node held in bh (level, number of
 * items, free space) into buf.
 */
static void sprintf_block_head (char * buf, struct buffer_head * bh)
{
    sprintf (buf,
             "level=%d, nr_items=%d, free_space=%d rdkey ",
             B_LEVEL (bh),
             B_NR_ITEMS (bh),
             B_FREE_SPACE (bh));
}
138 | |||
139 | |||
140 | static void sprintf_buffer_head (char * buf, struct buffer_head * bh) | ||
141 | { | ||
142 | char b[BDEVNAME_SIZE]; | ||
143 | |||
144 | sprintf (buf, "dev %s, size %d, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", | ||
145 | bdevname (bh->b_bdev, b), bh->b_size, | ||
146 | (unsigned long long)bh->b_blocknr, | ||
147 | atomic_read (&(bh->b_count)), | ||
148 | bh->b_state, bh->b_page, | ||
149 | buffer_uptodate (bh) ? "UPTODATE" : "!UPTODATE", | ||
150 | buffer_dirty (bh) ? "DIRTY" : "CLEAN", | ||
151 | buffer_locked (bh) ? "LOCKED" : "UNLOCKED"); | ||
152 | } | ||
153 | |||
154 | |||
/*
 * %y: print a disk_child (block number and size) into buf.
 */
static void sprintf_disk_child (char * buf, struct disk_child * dc)
{
    sprintf (buf,
             "[dc_number=%d, dc_size=%u]",
             dc_block_number (dc),
             dc_size (dc));
}
159 | |||
160 | |||
/*
 * Find the first reiserfs-specific conversion (%k %K %h %t %z %b %y %a)
 * in fmt.
 *
 * @fmt:  format string to scan.
 * @what: set to the conversion character when one is found; untouched
 *        otherwise.
 * @skip: set to the number of other '%' characters passed before the
 *        match (or before the end of the string).
 *
 * Returns a pointer to the '%' of the reiserfs conversion, or NULL when
 * fmt contains none.
 */
static char * is_there_reiserfs_struct (char * fmt, int * what, int * skip)
{
    char * pos;

    *skip = 0;

    for (pos = strchr (fmt, '%'); pos != NULL; pos = strchr (pos + 1, '%')) {
        switch (pos[1]) {
        case 'k':
        case 'K':
        case 'h':
        case 't':
        case 'z':
        case 'b':
        case 'y':
        case 'a':
            *what = pos[1];
            return pos;
        default:
            /* an ordinary conversion: count it and keep looking */
            (*skip) ++;
            break;
        }
    }
    return NULL;
}
179 | |||
180 | |||
181 | /* debugging reiserfs we used to print out a lot of different | ||
182 | variables, like keys, item headers, buffer heads etc. Values of | ||
183 | most fields matter. So it took a long time just to write an | ||
184 | appropriate printk. With this reiserfs_warning you can use format | ||
185 | specification for complex structures like you used to do with | ||
186 | printfs for integers, doubles and pointers. For instance, to print | ||
187 | out key structure you have to write just: | ||
188 | reiserfs_warning (sb, "bad key %k", key); | ||
189 | instead of | ||
190 | printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, | ||
191 | key->k_offset, key->k_uniqueness); | ||
192 | */ | ||
193 | |||
194 | |||
/*
 * Expand a format string that may contain reiserfs-specific conversions
 * into the file-scope error_buf.
 *
 * The format is copied into fmt_buf and processed piecewise: the text in
 * front of each reiserfs conversion is formatted with vsprintf(), the
 * arguments of the ordinary conversions in that piece are stepped over,
 * and the reiserfs conversion itself is rendered by the matching
 * sprintf_*() helper.  Whatever follows the last reiserfs conversion is
 * formatted by the final vsprintf().
 *
 * NOTE(review): strcpy() into fmt_buf and the sprintf/vsprintf calls into
 * error_buf are unbounded; both buffers are 1024 bytes.
 */
195 | static void | ||
196 | prepare_error_buf( const char *fmt, va_list args ) | ||
197 | { | ||
198 | char * fmt1 = fmt_buf; | ||
199 | char * k; | ||
200 | char * p = error_buf; | ||
201 | int i, j, what, skip; | ||
202 | |||
203 | strcpy (fmt1, fmt); | ||
204 | |||
205 | while( (k = is_there_reiserfs_struct( fmt1, &what, &skip )) != NULL ) | ||
206 | { | ||
/* cut the format right before the reiserfs conversion so vsprintf only
   sees the ordinary part */
207 | *k = 0; | ||
208 | |||
209 | p += vsprintf (p, fmt1, args); | ||
210 | |||
/* step over the arguments of the `skip` ordinary conversions; j is only a
   sink for the fetched values.
   NOTE(review): every such argument is fetched as an int, which presumably
   assumes vsprintf() did not advance `args` and that each argument is
   int-sized - confirm for 64-bit and pointer arguments */
211 | for (i = 0; i < skip; i ++) | ||
212 | j = va_arg (args, int); | ||
213 | |||
214 | switch (what) { | ||
215 | case 'k': | ||
216 | sprintf_le_key (p, va_arg(args, struct reiserfs_key *)); | ||
217 | break; | ||
218 | case 'K': | ||
219 | sprintf_cpu_key (p, va_arg(args, struct cpu_key *)); | ||
220 | break; | ||
221 | case 'h': | ||
222 | sprintf_item_head (p, va_arg(args, struct item_head *)); | ||
223 | break; | ||
224 | case 't': | ||
225 | sprintf_direntry (p, va_arg(args, struct reiserfs_dir_entry *)); | ||
226 | break; | ||
227 | case 'y': | ||
228 | sprintf_disk_child (p, va_arg(args, struct disk_child *)); | ||
229 | break; | ||
230 | case 'z': | ||
231 | sprintf_block_head (p, va_arg(args, struct buffer_head *)); | ||
232 | break; | ||
233 | case 'b': | ||
234 | sprintf_buffer_head (p, va_arg(args, struct buffer_head *)); | ||
235 | break; | ||
236 | case 'a': | ||
237 | sprintf_de_head (p, va_arg(args, struct reiserfs_de_head *)); | ||
238 | break; | ||
239 | } | ||
240 | |||
/* advance past what the helper wrote and past the two-character
   conversion in the format copy */
241 | p += strlen (p); | ||
242 | fmt1 = k + 2; | ||
243 | } | ||
244 | vsprintf (p, fmt1, args); | ||
245 | |||
246 | } | ||
247 | |||
248 | |||
/* in addition to usual conversion specifiers this accepts reiserfs
   specific conversion specifiers:
   %k to print little endian key,
   %K to print cpu key,
   %h to print item_head,
   %t to print directory entry,
   %y to print disk_child,
   %z to print block head (arg must be struct buffer_head *),
   %b to print buffer_head,
   %a to print directory entry head (struct reiserfs_de_head)
*/
258 | |||
/*
 * Format the variable arguments of the enclosing function into error_buf
 * via prepare_error_buf().  fmt must be the last named parameter of that
 * function.
 *
 * Wrapped in do { } while (0) so that "do_reiserfs_warning(fmt);" expands
 * to exactly one statement and stays correct inside an unbraced if/else.
 */
#define do_reiserfs_warning(fmt)\
do {\
	va_list args;\
	va_start( args, fmt );\
	prepare_error_buf( fmt, args );\
	va_end( args );\
} while (0)
266 | |||
267 | void reiserfs_warning (struct super_block *sb, const char * fmt, ...) | ||
268 | { | ||
269 | do_reiserfs_warning(fmt); | ||
270 | if (sb) | ||
271 | printk (KERN_WARNING "ReiserFS: %s: warning: %s\n", | ||
272 | reiserfs_bdevname (sb), error_buf); | ||
273 | else | ||
274 | printk (KERN_WARNING "ReiserFS: warning: %s\n", error_buf); | ||
275 | } | ||
276 | |||
277 | /* No newline.. reiserfs_info calls can be followed by printk's */ | ||
278 | void reiserfs_info (struct super_block *sb, const char * fmt, ...) | ||
279 | { | ||
280 | do_reiserfs_warning(fmt); | ||
281 | if (sb) | ||
282 | printk (KERN_NOTICE "ReiserFS: %s: %s", | ||
283 | reiserfs_bdevname (sb), error_buf); | ||
284 | else | ||
285 | printk (KERN_NOTICE "ReiserFS: %s", error_buf); | ||
286 | } | ||
287 | |||
288 | /* No newline.. reiserfs_printk calls can be followed by printk's */ | ||
289 | static void reiserfs_printk (const char * fmt, ...) | ||
290 | { | ||
291 | do_reiserfs_warning(fmt); | ||
292 | printk (error_buf); | ||
293 | } | ||
294 | |||
/*
 * Print a KERN_DEBUG message built from fmt (reiserfs conversions
 * allowed), prefixed with the device name when s is given.  Compiles to
 * an empty function unless CONFIG_REISERFS_CHECK is defined.  The level
 * argument is not used by this implementation.
 */
void reiserfs_debug (struct super_block *s, int level, const char * fmt, ...)
{
#ifdef CONFIG_REISERFS_CHECK
	do_reiserfs_warning(fmt);

	if (!s) {
		printk (KERN_DEBUG "ReiserFS: %s\n", error_buf);
		return;
	}

	printk (KERN_DEBUG "ReiserFS: %s: %s\n",
		reiserfs_bdevname (s), error_buf);
#endif
}
306 | |||
307 | /* The format: | ||
308 | |||
309 | maintainer-errorid: [function-name:] message | ||
310 | |||
311 | where errorid is unique to the maintainer and function-name is | ||
312 | optional, is recommended, so that anyone can easily find the bug | ||
313 | with a simple grep for the short to type string | ||
314 | maintainer-errorid. Don't bother with reusing errorids, there are | ||
315 | lots of numbers out there. | ||
316 | |||
317 | Example: | ||
318 | |||
319 | reiserfs_panic( | ||
320 | p_sb, "reiser-29: reiserfs_new_blocknrs: " | ||
321 | "one of search_start or rn(%d) is equal to MAX_B_NUM," | ||
322 | "which means that we are optimizing location based on the bogus location of a temp buffer (%p).", | ||
323 | rn, bh | ||
324 | ); | ||
325 | |||
326 | Regular panic()s sometimes clear the screen before the message can | ||
327 | be read, thus the need for the while loop. | ||
328 | |||
329 | Numbering scheme for panic used by Vladimir and Anatoly( Hans completely ignores this scheme, and considers it | ||
330 | pointless complexity): | ||
331 | |||
332 | panics in reiserfs_fs.h have numbers from 1000 to 1999 | ||
333 | super.c 2000 to 2999 | ||
334 | preserve.c (unused) 3000 to 3999 | ||
335 | bitmap.c 4000 to 4999 | ||
336 | stree.c 5000 to 5999 | ||
337 | prints.c 6000 to 6999 | ||
338 | namei.c 7000 to 7999 | ||
339 | fix_nodes.c 8000 to 8999 | ||
340 | dir.c 9000 to 9999 | ||
341 | lbalance.c 10000 to 10999 | ||
342 | ibalance.c 11000 to 11999 not ready | ||
343 | do_balan.c 12000 to 12999 | ||
344 | inode.c 13000 to 13999 | ||
345 | file.c 14000 to 14999 | ||
346 | objectid.c 15000 - 15999 | ||
347 | buffer.c 16000 - 16999 | ||
348 | symlink.c 17000 - 17999 | ||
349 | |||
350 | . */ | ||
351 | |||
352 | |||
353 | #ifdef CONFIG_REISERFS_CHECK | ||
354 | extern struct tree_balance * cur_tb; | ||
355 | #endif | ||
356 | |||
357 | void reiserfs_panic (struct super_block * sb, const char * fmt, ...) | ||
358 | { | ||
359 | do_reiserfs_warning(fmt); | ||
360 | printk (KERN_EMERG "REISERFS: panic (device %s): %s\n", | ||
361 | reiserfs_bdevname (sb), error_buf); | ||
362 | BUG (); | ||
363 | |||
364 | /* this is not actually called, but makes reiserfs_panic() "noreturn" */ | ||
365 | panic ("REISERFS: panic (device %s): %s\n", | ||
366 | reiserfs_bdevname (sb), error_buf); | ||
367 | } | ||
368 | |||
369 | void | ||
370 | reiserfs_abort (struct super_block *sb, int errno, const char *fmt, ...) | ||
371 | { | ||
372 | do_reiserfs_warning (fmt); | ||
373 | |||
374 | if (reiserfs_error_panic (sb)) { | ||
375 | panic (KERN_CRIT "REISERFS: panic (device %s): %s\n", | ||
376 | reiserfs_bdevname (sb), error_buf); | ||
377 | } | ||
378 | |||
379 | if (sb->s_flags & MS_RDONLY) | ||
380 | return; | ||
381 | |||
382 | printk (KERN_CRIT "REISERFS: abort (device %s): %s\n", | ||
383 | reiserfs_bdevname (sb), error_buf); | ||
384 | |||
385 | sb->s_flags |= MS_RDONLY; | ||
386 | reiserfs_journal_abort (sb, errno); | ||
387 | } | ||
388 | |||
389 | /* this prints internal nodes (4 keys/items in line) (dc_number, | ||
390 | dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number, | ||
391 | dc_size)...*/ | ||
392 | static int print_internal (struct buffer_head * bh, int first, int last) | ||
393 | { | ||
394 | struct reiserfs_key * key; | ||
395 | struct disk_child * dc; | ||
396 | int i; | ||
397 | int from, to; | ||
398 | |||
399 | if (!B_IS_KEYS_LEVEL (bh)) | ||
400 | return 1; | ||
401 | |||
402 | check_internal (bh); | ||
403 | |||
404 | if (first == -1) { | ||
405 | from = 0; | ||
406 | to = B_NR_ITEMS (bh); | ||
407 | } else { | ||
408 | from = first; | ||
409 | to = last < B_NR_ITEMS (bh) ? last : B_NR_ITEMS (bh); | ||
410 | } | ||
411 | |||
412 | reiserfs_printk ("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh); | ||
413 | |||
414 | dc = B_N_CHILD (bh, from); | ||
415 | reiserfs_printk ("PTR %d: %y ", from, dc); | ||
416 | |||
417 | for (i = from, key = B_N_PDELIM_KEY (bh, from), dc ++; i < to; i ++, key ++, dc ++) { | ||
418 | reiserfs_printk ("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc); | ||
419 | if (i && i % 4 == 0) | ||
420 | printk ("\n"); | ||
421 | } | ||
422 | printk ("\n"); | ||
423 | return 0; | ||
424 | } | ||
425 | |||
426 | |||
427 | |||
428 | |||
429 | |||
430 | static int print_leaf (struct buffer_head * bh, int print_mode, int first, int last) | ||
431 | { | ||
432 | struct block_head * blkh; | ||
433 | struct item_head * ih; | ||
434 | int i, nr; | ||
435 | int from, to; | ||
436 | |||
437 | if (!B_IS_ITEMS_LEVEL (bh)) | ||
438 | return 1; | ||
439 | |||
440 | check_leaf (bh); | ||
441 | |||
442 | blkh = B_BLK_HEAD (bh); | ||
443 | ih = B_N_PITEM_HEAD (bh,0); | ||
444 | nr = blkh_nr_item(blkh); | ||
445 | |||
446 | printk ("\n===================================================================\n"); | ||
447 | reiserfs_printk ("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh); | ||
448 | |||
449 | if (!(print_mode & PRINT_LEAF_ITEMS)) { | ||
450 | reiserfs_printk ("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n", | ||
451 | &(ih->ih_key), &((ih + nr - 1)->ih_key)); | ||
452 | return 0; | ||
453 | } | ||
454 | |||
455 | if (first < 0 || first > nr - 1) | ||
456 | from = 0; | ||
457 | else | ||
458 | from = first; | ||
459 | |||
460 | if (last < 0 || last > nr ) | ||
461 | to = nr; | ||
462 | else | ||
463 | to = last; | ||
464 | |||
465 | ih += from; | ||
466 | printk ("-------------------------------------------------------------------------------\n"); | ||
467 | printk ("|##| type | key | ilen | free_space | version | loc |\n"); | ||
468 | for (i = from; i < to; i++, ih ++) { | ||
469 | printk ("-------------------------------------------------------------------------------\n"); | ||
470 | reiserfs_printk ("|%2d| %h |\n", i, ih); | ||
471 | if (print_mode & PRINT_LEAF_ITEMS) | ||
472 | op_print_item (ih, B_I_PITEM (bh, ih)); | ||
473 | } | ||
474 | |||
475 | printk ("===================================================================\n"); | ||
476 | |||
477 | return 0; | ||
478 | } | ||
479 | |||
480 | char * reiserfs_hashname(int code) | ||
481 | { | ||
482 | if ( code == YURA_HASH) | ||
483 | return "rupasov"; | ||
484 | if ( code == TEA_HASH) | ||
485 | return "tea"; | ||
486 | if ( code == R5_HASH) | ||
487 | return "r5"; | ||
488 | |||
489 | return "unknown"; | ||
490 | } | ||
491 | |||
492 | /* return 1 if this is not super block */ | ||
493 | static int print_super_block (struct buffer_head * bh) | ||
494 | { | ||
495 | struct reiserfs_super_block * rs = (struct reiserfs_super_block *)(bh->b_data); | ||
496 | int skipped, data_blocks; | ||
497 | char *version; | ||
498 | char b[BDEVNAME_SIZE]; | ||
499 | |||
500 | if (is_reiserfs_3_5(rs)) { | ||
501 | version = "3.5"; | ||
502 | } else if (is_reiserfs_3_6(rs)) { | ||
503 | version = "3.6"; | ||
504 | } else if (is_reiserfs_jr(rs)) { | ||
505 | version = ((sb_version(rs) == REISERFS_VERSION_2) ? | ||
506 | "3.6" : "3.5"); | ||
507 | } else { | ||
508 | return 1; | ||
509 | } | ||
510 | |||
511 | printk ("%s\'s super block is in block %llu\n", bdevname (bh->b_bdev, b), | ||
512 | (unsigned long long)bh->b_blocknr); | ||
513 | printk ("Reiserfs version %s\n", version ); | ||
514 | printk ("Block count %u\n", sb_block_count(rs)); | ||
515 | printk ("Blocksize %d\n", sb_blocksize(rs)); | ||
516 | printk ("Free blocks %u\n", sb_free_blocks(rs)); | ||
517 | // FIXME: this would be confusing if | ||
518 | // someone stores reiserfs super block in some data block ;) | ||
519 | // skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs); | ||
520 | skipped = bh->b_blocknr; | ||
521 | data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) - | ||
522 | (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) + 1 : sb_reserved_for_journal(rs)) - | ||
523 | sb_free_blocks(rs); | ||
524 | printk ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n" | ||
525 | "1 super block, %d data blocks\n", | ||
526 | skipped, sb_bmap_nr(rs), (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) : | ||
527 | sb_reserved_for_journal(rs)) , data_blocks); | ||
528 | printk ("Root block %u\n", sb_root_block(rs)); | ||
529 | printk ("Journal block (first) %d\n", sb_jp_journal_1st_block(rs)); | ||
530 | printk ("Journal dev %d\n", sb_jp_journal_dev(rs)); | ||
531 | printk ("Journal orig size %d\n", sb_jp_journal_size(rs)); | ||
532 | printk ("FS state %d\n", sb_fs_state(rs)); | ||
533 | printk ("Hash function \"%s\"\n", | ||
534 | reiserfs_hashname(sb_hash_function_code(rs))); | ||
535 | |||
536 | printk ("Tree height %d\n", sb_tree_height(rs)); | ||
537 | return 0; | ||
538 | } | ||
539 | |||
540 | static int print_desc_block (struct buffer_head * bh) | ||
541 | { | ||
542 | struct reiserfs_journal_desc * desc; | ||
543 | |||
544 | if (memcmp(get_journal_desc_magic (bh), JOURNAL_DESC_MAGIC, 8)) | ||
545 | return 1; | ||
546 | |||
547 | desc = (struct reiserfs_journal_desc *)(bh->b_data); | ||
548 | printk ("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)", | ||
549 | (unsigned long long)bh->b_blocknr, get_desc_trans_id (desc), get_desc_mount_id (desc), | ||
550 | get_desc_trans_len (desc)); | ||
551 | |||
552 | return 0; | ||
553 | } | ||
554 | |||
555 | |||
556 | void print_block (struct buffer_head * bh, ...)//int print_mode, int first, int last) | ||
557 | { | ||
558 | va_list args; | ||
559 | int mode, first, last; | ||
560 | |||
561 | va_start (args, bh); | ||
562 | |||
563 | if ( ! bh ) { | ||
564 | printk("print_block: buffer is NULL\n"); | ||
565 | return; | ||
566 | } | ||
567 | |||
568 | mode = va_arg (args, int); | ||
569 | first = va_arg (args, int); | ||
570 | last = va_arg (args, int); | ||
571 | if (print_leaf (bh, mode, first, last)) | ||
572 | if (print_internal (bh, first, last)) | ||
573 | if (print_super_block (bh)) | ||
574 | if (print_desc_block (bh)) | ||
575 | printk ("Block %llu contains unformatted data\n", (unsigned long long)bh->b_blocknr); | ||
576 | } | ||
577 | |||
578 | |||
579 | |||
580 | static char print_tb_buf[2048]; | ||
581 | |||
582 | /* this stores initial state of tree balance in the print_tb_buf */ | ||
583 | void store_print_tb (struct tree_balance * tb) | ||
584 | { | ||
585 | int h = 0; | ||
586 | int i; | ||
587 | struct buffer_head * tbSh, * tbFh; | ||
588 | |||
589 | if (!tb) | ||
590 | return; | ||
591 | |||
592 | sprintf (print_tb_buf, "\n" | ||
593 | "BALANCING %d\n" | ||
594 | "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n" | ||
595 | "=====================================================================\n" | ||
596 | "* h * S * L * R * F * FL * FR * CFL * CFR *\n", | ||
597 | REISERFS_SB(tb->tb_sb)->s_do_balance, | ||
598 | tb->tb_mode, PATH_LAST_POSITION (tb->tb_path), tb->tb_path->pos_in_item); | ||
599 | |||
600 | for (h = 0; h < sizeof(tb->insert_size) / sizeof (tb->insert_size[0]); h ++) { | ||
601 | if (PATH_H_PATH_OFFSET (tb->tb_path, h) <= tb->tb_path->path_length && | ||
602 | PATH_H_PATH_OFFSET (tb->tb_path, h) > ILLEGAL_PATH_ELEMENT_OFFSET) { | ||
603 | tbSh = PATH_H_PBUFFER (tb->tb_path, h); | ||
604 | tbFh = PATH_H_PPARENT (tb->tb_path, h); | ||
605 | } else { | ||
606 | tbSh = NULL; | ||
607 | tbFh = NULL; | ||
608 | } | ||
609 | sprintf (print_tb_buf + strlen (print_tb_buf), | ||
610 | "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n", | ||
611 | h, | ||
612 | (tbSh) ? (long long)(tbSh->b_blocknr):(-1LL), | ||
613 | (tbSh) ? atomic_read (&(tbSh->b_count)) : -1, | ||
614 | (tb->L[h]) ? (long long)(tb->L[h]->b_blocknr):(-1LL), | ||
615 | (tb->L[h]) ? atomic_read (&(tb->L[h]->b_count)) : -1, | ||
616 | (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr):(-1LL), | ||
617 | (tb->R[h]) ? atomic_read (&(tb->R[h]->b_count)) : -1, | ||
618 | (tbFh) ? (long long)(tbFh->b_blocknr):(-1LL), | ||
619 | (tb->FL[h]) ? (long long)(tb->FL[h]->b_blocknr):(-1LL), | ||
620 | (tb->FR[h]) ? (long long)(tb->FR[h]->b_blocknr):(-1LL), | ||
621 | (tb->CFL[h]) ? (long long)(tb->CFL[h]->b_blocknr):(-1LL), | ||
622 | (tb->CFR[h]) ? (long long)(tb->CFR[h]->b_blocknr):(-1LL)); | ||
623 | } | ||
624 | |||
625 | sprintf (print_tb_buf + strlen (print_tb_buf), | ||
626 | "=====================================================================\n" | ||
627 | "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n" | ||
628 | "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n", | ||
629 | tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0],tb->rbytes, tb->blknum[0], | ||
630 | tb->s0num, tb->s1num,tb->s1bytes, tb->s2num, tb->s2bytes, tb->cur_blknum, tb->lkey[0], tb->rkey[0]); | ||
631 | |||
632 | /* this prints balance parameters for non-leaf levels */ | ||
633 | h = 0; | ||
634 | do { | ||
635 | h++; | ||
636 | sprintf (print_tb_buf + strlen (print_tb_buf), | ||
637 | "* %d * %4d * %2d * * %2d * * %2d *\n", | ||
638 | h, tb->insert_size[h], tb->lnum[h], tb->rnum[h], tb->blknum[h]); | ||
639 | } while (tb->insert_size[h]); | ||
640 | |||
641 | sprintf (print_tb_buf + strlen (print_tb_buf), | ||
642 | "=====================================================================\n" | ||
643 | "FEB list: "); | ||
644 | |||
645 | /* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */ | ||
646 | h = 0; | ||
647 | for (i = 0; i < sizeof (tb->FEB) / sizeof (tb->FEB[0]); i ++) | ||
648 | sprintf (print_tb_buf + strlen (print_tb_buf), | ||
649 | "%p (%llu %d)%s", tb->FEB[i], tb->FEB[i] ? (unsigned long long)tb->FEB[i]->b_blocknr : 0ULL, | ||
650 | tb->FEB[i] ? atomic_read (&(tb->FEB[i]->b_count)) : 0, | ||
651 | (i == sizeof (tb->FEB) / sizeof (tb->FEB[0]) - 1) ? "\n" : ", "); | ||
652 | |||
653 | sprintf (print_tb_buf + strlen (print_tb_buf), | ||
654 | "======================== the end ====================================\n"); | ||
655 | } | ||
656 | |||
/*
 * Print mes on a line of its own, followed by the current contents of
 * print_tb_buf (the snapshot written by store_print_tb()).
 */
void print_cur_tb (char * mes)
{
    printk ("%s\n%s", mes, print_tb_buf);
}
661 | |||
662 | static void check_leaf_block_head (struct buffer_head * bh) | ||
663 | { | ||
664 | struct block_head * blkh; | ||
665 | int nr; | ||
666 | |||
667 | blkh = B_BLK_HEAD (bh); | ||
668 | nr = blkh_nr_item(blkh); | ||
669 | if ( nr > (bh->b_size - BLKH_SIZE) / IH_SIZE) | ||
670 | reiserfs_panic (NULL, "vs-6010: check_leaf_block_head: invalid item number %z", bh); | ||
671 | if ( blkh_free_space(blkh) > | ||
672 | bh->b_size - BLKH_SIZE - IH_SIZE * nr ) | ||
673 | reiserfs_panic (NULL, "vs-6020: check_leaf_block_head: invalid free space %z", bh); | ||
674 | |||
675 | } | ||
676 | |||
677 | static void check_internal_block_head (struct buffer_head * bh) | ||
678 | { | ||
679 | struct block_head * blkh; | ||
680 | |||
681 | blkh = B_BLK_HEAD (bh); | ||
682 | if (!(B_LEVEL (bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL (bh) <= MAX_HEIGHT)) | ||
683 | reiserfs_panic (NULL, "vs-6025: check_internal_block_head: invalid level %z", bh); | ||
684 | |||
685 | if (B_NR_ITEMS (bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE) | ||
686 | reiserfs_panic (NULL, "vs-6030: check_internal_block_head: invalid item number %z", bh); | ||
687 | |||
688 | if (B_FREE_SPACE (bh) != | ||
689 | bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS (bh) - DC_SIZE * (B_NR_ITEMS (bh) + 1)) | ||
690 | reiserfs_panic (NULL, "vs-6040: check_internal_block_head: invalid free space %z", bh); | ||
691 | |||
692 | } | ||
693 | |||
694 | |||
695 | void check_leaf (struct buffer_head * bh) | ||
696 | { | ||
697 | int i; | ||
698 | struct item_head * ih; | ||
699 | |||
700 | if (!bh) | ||
701 | return; | ||
702 | check_leaf_block_head (bh); | ||
703 | for (i = 0, ih = B_N_PITEM_HEAD (bh, 0); i < B_NR_ITEMS (bh); i ++, ih ++) | ||
704 | op_check_item (ih, B_I_PITEM (bh, ih)); | ||
705 | } | ||
706 | |||
707 | |||
/*
 * Check an internal node; only its block head is examined.
 * A NULL bh is silently accepted.
 */
void check_internal (struct buffer_head * bh)
{
	if (bh)
		check_internal_block_head (bh);
}
714 | |||
715 | |||
/*
 * Currently a no-op: the only statement, a printk of per-session counters
 * taken from REISERFS_SB(s), is commented out below.
 */
void print_statistics (struct super_block * s)
{

    /*
    printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \
    bmap with search %d, without %d, dir2ind %d, ind2dir %d\n",
    REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes,
    REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search,
    REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct);
    */

}
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c new file mode 100644 index 000000000000..f4ea81ae0e0f --- /dev/null +++ b/fs/reiserfs/procfs.c | |||
@@ -0,0 +1,664 @@ | |||
1 | /* -*- linux-c -*- */ | ||
2 | |||
3 | /* fs/reiserfs/procfs.c */ | ||
4 | |||
5 | /* | ||
6 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
7 | */ | ||
8 | |||
9 | /* proc info support a la one created by Sizif@Botik.RU for PGC */ | ||
10 | |||
11 | /* $Id: procfs.c,v 1.1.8.2 2001/07/15 17:08:42 god Exp $ */ | ||
12 | |||
13 | #include <linux/config.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/time.h> | ||
16 | #include <linux/seq_file.h> | ||
17 | #include <asm/uaccess.h> | ||
18 | #include <linux/reiserfs_fs.h> | ||
19 | #include <linux/reiserfs_fs_sb.h> | ||
20 | #include <linux/smp_lock.h> | ||
21 | #include <linux/init.h> | ||
22 | #include <linux/proc_fs.h> | ||
23 | |||
24 | #if defined( REISERFS_PROC_INFO ) | ||
25 | |||
26 | /* | ||
27 | * LOCKING: | ||
28 | * | ||
29 | * We rely on new Alexander Viro's super-block locking. | ||
30 | * | ||
31 | */ | ||
32 | |||
33 | static int show_version(struct seq_file *m, struct super_block *sb) | ||
34 | { | ||
35 | char *format; | ||
36 | |||
37 | if ( REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6) ) { | ||
38 | format = "3.6"; | ||
39 | } else if ( REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5) ) { | ||
40 | format = "3.5"; | ||
41 | } else { | ||
42 | format = "unknown"; | ||
43 | } | ||
44 | |||
45 | seq_printf(m, "%s format\twith checks %s\n", | ||
46 | format, | ||
47 | #if defined( CONFIG_REISERFS_CHECK ) | ||
48 | "on" | ||
49 | #else | ||
50 | "off" | ||
51 | #endif | ||
52 | ); | ||
53 | return 0; | ||
54 | } | ||
55 | |||
/*
 * read_proc-style handler that produces no data: it flags end of file,
 * points *start at buffer and returns a length of 0.  offset, count and
 * data are not used.
 */
int reiserfs_global_version_in_proc( char *buffer, char **start, off_t offset,
				     int count, int *eof, void *data )
{
	*eof = 1;
	*start = buffer;

	return 0;
}
63 | |||
/*
 * Shorthand accessors for the show_*() routines below.  Most of them refer
 * to local variables of the calling function BY NAME, so they only work
 * where those locals exist:
 *   r     - struct reiserfs_sb_info *   (SF, SFP, SFPL, SFPF, SFPJ, JF)
 *   level - index into per-level arrays (SFPL)
 *   rs    - struct reiserfs_super_block * (DF, DFL, DJF, MAP)
 *   sb    - struct super_block *        (MAP)
 *   jp    - struct journal_params *     (DJP)
 *   s_v1  - used by DJV; NOTE(review): no function in the visible part of
 *           this file declares such a variable - DJV may be unused.
 */

/* fields of the in-core super block info and of its proc statistics */
#define SF( x ) ( r -> x )
#define SFP( x ) SF( s_proc_info_data.x )
#define SFPL( x ) SFP( x[ level ] )
#define SFPF( x ) SFP( scan_bitmap.x )
#define SFPJ( x ) SFP( journal.x )

/* little-endian on-disk fields converted to cpu byte order */
#define D2C( x ) le16_to_cpu( x )
#define D4C( x ) le32_to_cpu( x )
#define DF( x ) D2C( rs -> s_v1.x )
#define DFL( x ) D4C( rs -> s_v1.x )

/*
 * Start of the objectid map, which follows the on-disk super block
 * directly; the super block size depends on the format.  The arguments
 * are parenthesized so that any pointer expression can be passed for rs.
 */
#define objectid_map( s, rs ) (old_format_only (s) ? \
			 (__u32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) : \
			 (__u32 *)((rs) + 1))
#define MAP( i ) D4C( objectid_map( sb, rs )[ i ] )

/* 32-bit little-endian fields of the super block / journal parameters */
#define DJF( x ) le32_to_cpu( rs -> x )
#define DJV( x ) le32_to_cpu( s_v1 -> x )
#define DJP( x ) le32_to_cpu( jp -> x )
/* field of the in-core journal */
#define JF( x ) ( r -> s_journal -> x )
84 | |||
/*
 * Write the in-core super block state of sb to m: mount state, the active
 * mount options, the generation counter, the s_* counters of the
 * reiserfs_sb_info and the global counters kept in its s_proc_info_data.
 * Always returns 0.
 *
 * The local must be called "r": the SF()/SFP() macros refer to it by name.
 */
static int show_super(struct seq_file *m, struct super_block *sb)
{
    struct reiserfs_sb_info *r = REISERFS_SB(sb);

    seq_printf(m, "state: \t%s\n"
        "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
        "gen. counter: \t%i\n"
        "s_kmallocs: \t%i\n"
        "s_disk_reads: \t%i\n"
        "s_disk_writes: \t%i\n"
        "s_fix_nodes: \t%i\n"
        "s_do_balance: \t%i\n"
        "s_unneeded_left_neighbor: \t%i\n"
        "s_good_search_by_key_reada: \t%i\n"
        "s_bmaps: \t%i\n"
        "s_bmaps_without_search: \t%i\n"
        "s_direct2indirect: \t%i\n"
        "s_indirect2direct: \t%i\n"
        "\n"
        "max_hash_collisions: \t%i\n"

        "breads: \t%lu\n"
        "bread_misses: \t%lu\n"

        "search_by_key: \t%lu\n"
        "search_by_key_fs_changed: \t%lu\n"
        "search_by_key_restarted: \t%lu\n"

        "insert_item_restarted: \t%lu\n"
        "paste_into_item_restarted: \t%lu\n"
        "cut_from_item_restarted: \t%lu\n"
        "delete_solid_item_restarted: \t%lu\n"
        "delete_item_restarted: \t%lu\n"

        "leaked_oid: \t%lu\n"
        "leaves_removable: \t%lu\n",

        /* anything other than REISERFS_VALID_FS is reported as an error state */
        SF( s_mount_state ) == REISERFS_VALID_FS ?
        "REISERFS_VALID_FS" : "REISERFS_ERROR_FS",
        /* one string per mount option, in the order of the %s run above */
        reiserfs_r5_hash( sb ) ? "FORCE_R5 " : "",
        reiserfs_rupasov_hash( sb ) ? "FORCE_RUPASOV " : "",
        reiserfs_tea_hash( sb ) ? "FORCE_TEA " : "",
        reiserfs_hash_detect( sb ) ? "DETECT_HASH " : "",
        reiserfs_no_border( sb ) ? "NO_BORDER " : "BORDER ",
        reiserfs_no_unhashed_relocation( sb ) ? "NO_UNHASHED_RELOCATION " : "",
        /* NOTE(review): the hashed_relocation test prints "UNHASHED_RELOCATION" - confirm intent */
        reiserfs_hashed_relocation( sb ) ? "UNHASHED_RELOCATION " : "",
        reiserfs_test4( sb ) ? "TEST4 " : "",
        have_large_tails( sb ) ? "TAILS " : have_small_tails(sb)?"SMALL_TAILS ":"NO_TAILS ",
        replay_only( sb ) ? "REPLAY_ONLY " : "",
        convert_reiserfs( sb ) ? "CONV " : "",

        /* counters kept directly in the reiserfs_sb_info */
        atomic_read( &r -> s_generation_counter ),
        SF( s_kmallocs ),
        SF( s_disk_reads ),
        SF( s_disk_writes ),
        SF( s_fix_nodes ),
        SF( s_do_balance ),
        SF( s_unneeded_left_neighbor ),
        SF( s_good_search_by_key_reada ),
        SF( s_bmaps ),
        SF( s_bmaps_without_search ),
        SF( s_direct2indirect ),
        SF( s_indirect2direct ),
        /* counters kept in s_proc_info_data */
        SFP( max_hash_collisions ),
        SFP( breads ),
        SFP( bread_miss ),
        SFP( search_by_key ),
        SFP( search_by_key_fs_changed ),
        SFP( search_by_key_restarted ),

        SFP( insert_item_restarted ),
        SFP( paste_into_item_restarted ),
        SFP( cut_from_item_restarted ),
        SFP( delete_solid_item_restarted ),
        SFP( delete_item_restarted ),

        SFP( leaked_oid ),
        SFP( leaves_removable ) );

    return 0;
}
166 | |||
/*
 * Write the per-level statistics of sb to m: one header line, then one
 * row for every level from 0 to MAX_HEIGHT - 1 with the counters that
 * s_proc_info_data keeps per level.  Always returns 0.
 *
 * The locals must be called "r" and "level": the SFPL() macro refers to
 * both by name.
 */
static int show_per_level(struct seq_file *m, struct super_block *sb)
{
    struct reiserfs_sb_info *r = REISERFS_SB(sb);
    int level;

    /* column headers, in the same order as the values printed per row */
    seq_printf(m, "level\t"
        " balances"
        " [sbk: reads"
        " fs_changed"
        " restarted]"
        " free space"
        " items"
        " can_remove"
        " lnum"
        " rnum"
        " lbytes"
        " rbytes"
        " get_neig"
        " get_neig_res"
        " need_l_neig"
        " need_r_neig"
        "\n"

        );

    for( level = 0 ; level < MAX_HEIGHT ; ++ level ) {
        /* lnum, rnum, lbytes and rbytes are the only columns printed as signed (%li) */
        seq_printf(m, "%i\t"
            " %12lu"
            " %12lu"
            " %12lu"
            " %12lu"
            " %12lu"
            " %12lu"
            " %12lu"
            " %12li"
            " %12li"
            " %12li"
            " %12li"
            " %12lu"
            " %12lu"
            " %12lu"
            " %12lu"
            "\n",
            level,
            SFPL( balance_at ),
            SFPL( sbk_read_at ),
            SFPL( sbk_fs_changed ),
            SFPL( sbk_restarted ),
            SFPL( free_at ),
            SFPL( items_at ),
            SFPL( can_node_be_removed ),
            SFPL( lnum ),
            SFPL( rnum ),
            SFPL( lbytes ),
            SFPL( rbytes ),
            SFPL( get_neighbors ),
            SFPL( get_neighbors_restart ),
            SFPL( need_l_neighbor ),
            SFPL( need_r_neighbor )
            );
    }
    return 0;
}
230 | |||
/*
 * Write the block allocator statistics of sb to m: the free_block counter
 * and the scan_bitmap counters kept in s_proc_info_data.
 * Always returns 0.
 *
 * The local must be called "r": the SFP()/SFPF() macros refer to it by name.
 *
 * NOTE(review): the header line names six columns after " scan_bitmap:"
 * while seven scan_bitmap values are printed (call, wait, bmap, retry,
 * stolen, in_journal_hint, in_journal_nohint); presumably " scan_bitmap:"
 * itself labels the "call" column - confirm.
 */
static int show_bitmap(struct seq_file *m, struct super_block *sb)
{
    struct reiserfs_sb_info *r = REISERFS_SB(sb);

    seq_printf(m, "free_block: %lu\n"
        " scan_bitmap:"
        " wait"
        " bmap"
        " retry"
        " stolen"
        " journal_hint"
        "journal_nohint"
        "\n"
        " %14lu"
        " %14lu"
        " %14lu"
        " %14lu"
        " %14lu"
        " %14lu"
        " %14lu"
        "\n",
        SFP( free_block ),
        SFPF( call ),
        SFPF( wait ),
        SFPF( bmap ),
        SFPF( retry ),
        SFPF( stolen ),
        SFPF( in_journal_hint ),
        SFPF( in_journal_nohint ) );

    return 0;
}
263 | |||
264 | static int show_on_disk_super(struct seq_file *m, struct super_block *sb) | ||
265 | { | ||
266 | struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); | ||
267 | struct reiserfs_super_block *rs = sb_info -> s_rs; | ||
268 | int hash_code = DFL( s_hash_function_code ); | ||
269 | __u32 flags = DJF( s_flags ); | ||
270 | |||
271 | seq_printf(m, "block_count: \t%i\n" | ||
272 | "free_blocks: \t%i\n" | ||
273 | "root_block: \t%i\n" | ||
274 | "blocksize: \t%i\n" | ||
275 | "oid_maxsize: \t%i\n" | ||
276 | "oid_cursize: \t%i\n" | ||
277 | "umount_state: \t%i\n" | ||
278 | "magic: \t%10.10s\n" | ||
279 | "fs_state: \t%i\n" | ||
280 | "hash: \t%s\n" | ||
281 | "tree_height: \t%i\n" | ||
282 | "bmap_nr: \t%i\n" | ||
283 | "version: \t%i\n" | ||
284 | "flags: \t%x[%s]\n" | ||
285 | "reserved_for_journal: \t%i\n", | ||
286 | |||
287 | DFL( s_block_count ), | ||
288 | DFL( s_free_blocks ), | ||
289 | DFL( s_root_block ), | ||
290 | DF( s_blocksize ), | ||
291 | DF( s_oid_maxsize ), | ||
292 | DF( s_oid_cursize ), | ||
293 | DF( s_umount_state ), | ||
294 | rs -> s_v1.s_magic, | ||
295 | DF( s_fs_state ), | ||
296 | hash_code == TEA_HASH ? "tea" : | ||
297 | ( hash_code == YURA_HASH ) ? "rupasov" : | ||
298 | ( hash_code == R5_HASH ) ? "r5" : | ||
299 | ( hash_code == UNSET_HASH ) ? "unset" : "unknown", | ||
300 | DF( s_tree_height ), | ||
301 | DF( s_bmap_nr ), | ||
302 | DF( s_version ), | ||
303 | flags, | ||
304 | ( flags & reiserfs_attrs_cleared ) | ||
305 | ? "attrs_cleared" : "", | ||
306 | DF (s_reserved_for_journal)); | ||
307 | |||
308 | return 0; | ||
309 | } | ||
310 | |||
311 | static int show_oidmap(struct seq_file *m, struct super_block *sb) | ||
312 | { | ||
313 | struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); | ||
314 | struct reiserfs_super_block *rs = sb_info -> s_rs; | ||
315 | unsigned int mapsize = le16_to_cpu( rs -> s_v1.s_oid_cursize ); | ||
316 | unsigned long total_used = 0; | ||
317 | int i; | ||
318 | |||
319 | for( i = 0 ; i < mapsize ; ++i ) { | ||
320 | __u32 right; | ||
321 | |||
322 | right = ( i == mapsize - 1 ) ? MAX_KEY_OBJECTID : MAP( i + 1 ); | ||
323 | seq_printf(m, "%s: [ %x .. %x )\n", | ||
324 | ( i & 1 ) ? "free" : "used", MAP( i ), right ); | ||
325 | if( ! ( i & 1 ) ) { | ||
326 | total_used += right - MAP( i ); | ||
327 | } | ||
328 | } | ||
329 | #if defined( REISERFS_USE_OIDMAPF ) | ||
330 | if( sb_info -> oidmap.use_file && ( sb_info -> oidmap.mapf != NULL ) ) { | ||
331 | loff_t size = sb_info->oidmap.mapf->f_dentry->d_inode->i_size; | ||
332 | total_used += size / sizeof( reiserfs_oidinterval_d_t ); | ||
333 | } | ||
334 | #endif | ||
335 | seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n", | ||
336 | mapsize, | ||
337 | mapsize, le16_to_cpu( rs -> s_v1.s_oid_maxsize ), | ||
338 | total_used); | ||
339 | return 0; | ||
340 | } | ||
341 | |||
342 | static int show_journal(struct seq_file *m, struct super_block *sb) | ||
343 | { | ||
344 | struct reiserfs_sb_info *r = REISERFS_SB(sb); | ||
345 | struct reiserfs_super_block *rs = r -> s_rs; | ||
346 | struct journal_params *jp = &rs->s_v1.s_journal; | ||
347 | char b[BDEVNAME_SIZE]; | ||
348 | |||
349 | |||
350 | seq_printf(m, /* on-disk fields */ | ||
351 | "jp_journal_1st_block: \t%i\n" | ||
352 | "jp_journal_dev: \t%s[%x]\n" | ||
353 | "jp_journal_size: \t%i\n" | ||
354 | "jp_journal_trans_max: \t%i\n" | ||
355 | "jp_journal_magic: \t%i\n" | ||
356 | "jp_journal_max_batch: \t%i\n" | ||
357 | "jp_journal_max_commit_age: \t%i\n" | ||
358 | "jp_journal_max_trans_age: \t%i\n" | ||
359 | /* incore fields */ | ||
360 | "j_1st_reserved_block: \t%i\n" | ||
361 | "j_state: \t%li\n" | ||
362 | "j_trans_id: \t%lu\n" | ||
363 | "j_mount_id: \t%lu\n" | ||
364 | "j_start: \t%lu\n" | ||
365 | "j_len: \t%lu\n" | ||
366 | "j_len_alloc: \t%lu\n" | ||
367 | "j_wcount: \t%i\n" | ||
368 | "j_bcount: \t%lu\n" | ||
369 | "j_first_unflushed_offset: \t%lu\n" | ||
370 | "j_last_flush_trans_id: \t%lu\n" | ||
371 | "j_trans_start_time: \t%li\n" | ||
372 | "j_list_bitmap_index: \t%i\n" | ||
373 | "j_must_wait: \t%i\n" | ||
374 | "j_next_full_flush: \t%i\n" | ||
375 | "j_next_async_flush: \t%i\n" | ||
376 | "j_cnode_used: \t%i\n" | ||
377 | "j_cnode_free: \t%i\n" | ||
378 | "\n" | ||
379 | /* reiserfs_proc_info_data_t.journal fields */ | ||
380 | "in_journal: \t%12lu\n" | ||
381 | "in_journal_bitmap: \t%12lu\n" | ||
382 | "in_journal_reusable: \t%12lu\n" | ||
383 | "lock_journal: \t%12lu\n" | ||
384 | "lock_journal_wait: \t%12lu\n" | ||
385 | "journal_begin: \t%12lu\n" | ||
386 | "journal_relock_writers: \t%12lu\n" | ||
387 | "journal_relock_wcount: \t%12lu\n" | ||
388 | "mark_dirty: \t%12lu\n" | ||
389 | "mark_dirty_already: \t%12lu\n" | ||
390 | "mark_dirty_notjournal: \t%12lu\n" | ||
391 | "restore_prepared: \t%12lu\n" | ||
392 | "prepare: \t%12lu\n" | ||
393 | "prepare_retry: \t%12lu\n", | ||
394 | |||
395 | DJP( jp_journal_1st_block ), | ||
396 | bdevname(SB_JOURNAL(sb)->j_dev_bd, b), | ||
397 | DJP( jp_journal_dev ), | ||
398 | DJP( jp_journal_size ), | ||
399 | DJP( jp_journal_trans_max ), | ||
400 | DJP( jp_journal_magic ), | ||
401 | DJP( jp_journal_max_batch ), | ||
402 | SB_JOURNAL(sb)->j_max_commit_age, | ||
403 | DJP( jp_journal_max_trans_age ), | ||
404 | |||
405 | JF( j_1st_reserved_block ), | ||
406 | JF( j_state ), | ||
407 | JF( j_trans_id ), | ||
408 | JF( j_mount_id ), | ||
409 | JF( j_start ), | ||
410 | JF( j_len ), | ||
411 | JF( j_len_alloc ), | ||
412 | atomic_read( & r -> s_journal -> j_wcount ), | ||
413 | JF( j_bcount ), | ||
414 | JF( j_first_unflushed_offset ), | ||
415 | JF( j_last_flush_trans_id ), | ||
416 | JF( j_trans_start_time ), | ||
417 | JF( j_list_bitmap_index ), | ||
418 | JF( j_must_wait ), | ||
419 | JF( j_next_full_flush ), | ||
420 | JF( j_next_async_flush ), | ||
421 | JF( j_cnode_used ), | ||
422 | JF( j_cnode_free ), | ||
423 | |||
424 | SFPJ( in_journal ), | ||
425 | SFPJ( in_journal_bitmap ), | ||
426 | SFPJ( in_journal_reusable ), | ||
427 | SFPJ( lock_journal ), | ||
428 | SFPJ( lock_journal_wait ), | ||
429 | SFPJ( journal_being ), | ||
430 | SFPJ( journal_relock_writers ), | ||
431 | SFPJ( journal_relock_wcount ), | ||
432 | SFPJ( mark_dirty ), | ||
433 | SFPJ( mark_dirty_already ), | ||
434 | SFPJ( mark_dirty_notjournal ), | ||
435 | SFPJ( restore_prepared ), | ||
436 | SFPJ( prepare ), | ||
437 | SFPJ( prepare_retry ) | ||
438 | ); | ||
439 | return 0; | ||
440 | } | ||
441 | |||
442 | /* iterator */ | ||
/*
 * Match callback handed to sget() by r_start(): a super block matches
 * only when it is the very object passed in as data.
 * Returns non-zero on a match, 0 otherwise.
 */
static int test_sb(struct super_block *sb, void *data)
{
	return sb == data;
}
447 | |||
/*
 * Set callback handed to sget() by r_start().  It unconditionally fails
 * with -ENOENT and touches neither argument; presumably this keeps
 * sget() from setting up a new super block when test_sb() finds no match.
 */
static int set_sb(struct super_block *sb, void *data)
{
	return -ENOENT;
}
452 | |||
453 | static void *r_start(struct seq_file *m, loff_t *pos) | ||
454 | { | ||
455 | struct proc_dir_entry *de = m->private; | ||
456 | struct super_block *s = de->parent->data; | ||
457 | loff_t l = *pos; | ||
458 | |||
459 | if (l) | ||
460 | return NULL; | ||
461 | |||
462 | if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, s))) | ||
463 | return NULL; | ||
464 | |||
465 | up_write(&s->s_umount); | ||
466 | |||
467 | if (de->deleted) { | ||
468 | deactivate_super(s); | ||
469 | return NULL; | ||
470 | } | ||
471 | |||
472 | return s; | ||
473 | } | ||
474 | |||
475 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | ||
476 | { | ||
477 | ++*pos; | ||
478 | if (v) | ||
479 | deactivate_super(v); | ||
480 | return NULL; | ||
481 | } | ||
482 | |||
483 | static void r_stop(struct seq_file *m, void *v) | ||
484 | { | ||
485 | if (v) | ||
486 | deactivate_super(v); | ||
487 | } | ||
488 | |||
489 | static int r_show(struct seq_file *m, void *v) | ||
490 | { | ||
491 | struct proc_dir_entry *de = m->private; | ||
492 | int (*show)(struct seq_file *, struct super_block *) = de->data; | ||
493 | return show(m, v); | ||
494 | } | ||
495 | |||
496 | static struct seq_operations r_ops = { | ||
497 | .start = r_start, | ||
498 | .next = r_next, | ||
499 | .stop = r_stop, | ||
500 | .show = r_show, | ||
501 | }; | ||
502 | |||
503 | static int r_open(struct inode *inode, struct file *file) | ||
504 | { | ||
505 | int ret = seq_open(file, &r_ops); | ||
506 | |||
507 | if (!ret) { | ||
508 | struct seq_file *m = file->private_data; | ||
509 | m->private = PDE(inode); | ||
510 | } | ||
511 | return ret; | ||
512 | } | ||
513 | |||
514 | static struct file_operations r_file_operations = { | ||
515 | .open = r_open, | ||
516 | .read = seq_read, | ||
517 | .llseek = seq_lseek, | ||
518 | .release = seq_release, | ||
519 | }; | ||
520 | |||
/*
 * Directory holding all reiserfs proc entries.  NULL until
 * reiserfs_proc_info_global_init() has created it and again after
 * reiserfs_proc_info_global_done().
 */
static struct proc_dir_entry *proc_info_root;
/* Name of that directory, as handed to proc_mkdir() with a NULL parent. */
static const char proc_info_root_name[] = "fs/reiserfs";
523 | |||
524 | static void add_file(struct super_block *sb, char *name, | ||
525 | int (*func)(struct seq_file *, struct super_block *)) | ||
526 | { | ||
527 | struct proc_dir_entry *de; | ||
528 | de = create_proc_entry(name, 0, REISERFS_SB(sb)->procdir); | ||
529 | if (de) { | ||
530 | de->data = func; | ||
531 | de->proc_fops = &r_file_operations; | ||
532 | } | ||
533 | } | ||
534 | |||
535 | int reiserfs_proc_info_init( struct super_block *sb ) | ||
536 | { | ||
537 | spin_lock_init( & __PINFO( sb ).lock ); | ||
538 | REISERFS_SB(sb)->procdir = proc_mkdir(reiserfs_bdevname (sb), proc_info_root); | ||
539 | if( REISERFS_SB(sb)->procdir ) { | ||
540 | REISERFS_SB(sb)->procdir->owner = THIS_MODULE; | ||
541 | REISERFS_SB(sb)->procdir->data = sb; | ||
542 | add_file(sb, "version", show_version); | ||
543 | add_file(sb, "super", show_super); | ||
544 | add_file(sb, "per-level", show_per_level); | ||
545 | add_file(sb, "bitmap", show_bitmap); | ||
546 | add_file(sb, "on-disk-super", show_on_disk_super); | ||
547 | add_file(sb, "oidmap", show_oidmap); | ||
548 | add_file(sb, "journal", show_journal); | ||
549 | return 0; | ||
550 | } | ||
551 | reiserfs_warning(sb, "reiserfs: cannot create /proc/%s/%s", | ||
552 | proc_info_root_name, reiserfs_bdevname (sb) ); | ||
553 | return 1; | ||
554 | } | ||
555 | |||
556 | int reiserfs_proc_info_done( struct super_block *sb ) | ||
557 | { | ||
558 | struct proc_dir_entry *de = REISERFS_SB(sb)->procdir; | ||
559 | if (de) { | ||
560 | remove_proc_entry("journal", de); | ||
561 | remove_proc_entry("oidmap", de); | ||
562 | remove_proc_entry("on-disk-super", de); | ||
563 | remove_proc_entry("bitmap", de); | ||
564 | remove_proc_entry("per-level", de); | ||
565 | remove_proc_entry("super", de); | ||
566 | remove_proc_entry("version", de); | ||
567 | } | ||
568 | spin_lock( & __PINFO( sb ).lock ); | ||
569 | __PINFO( sb ).exiting = 1; | ||
570 | spin_unlock( & __PINFO( sb ).lock ); | ||
571 | if ( proc_info_root ) { | ||
572 | remove_proc_entry( reiserfs_bdevname (sb), proc_info_root ); | ||
573 | REISERFS_SB(sb)->procdir = NULL; | ||
574 | } | ||
575 | return 0; | ||
576 | } | ||
577 | |||
578 | struct proc_dir_entry *reiserfs_proc_register_global( char *name, | ||
579 | read_proc_t *func ) | ||
580 | { | ||
581 | return ( proc_info_root ) ? create_proc_read_entry( name, 0, | ||
582 | proc_info_root, | ||
583 | func, NULL ) : NULL; | ||
584 | } | ||
585 | |||
586 | void reiserfs_proc_unregister_global( const char *name ) | ||
587 | { | ||
588 | remove_proc_entry( name, proc_info_root ); | ||
589 | } | ||
590 | |||
591 | int reiserfs_proc_info_global_init( void ) | ||
592 | { | ||
593 | if( proc_info_root == NULL ) { | ||
594 | proc_info_root = proc_mkdir(proc_info_root_name, NULL); | ||
595 | if( proc_info_root ) { | ||
596 | proc_info_root -> owner = THIS_MODULE; | ||
597 | } else { | ||
598 | reiserfs_warning (NULL, | ||
599 | "reiserfs: cannot create /proc/%s", | ||
600 | proc_info_root_name ); | ||
601 | return 1; | ||
602 | } | ||
603 | } | ||
604 | return 0; | ||
605 | } | ||
606 | |||
607 | int reiserfs_proc_info_global_done( void ) | ||
608 | { | ||
609 | if ( proc_info_root != NULL ) { | ||
610 | proc_info_root = NULL; | ||
611 | remove_proc_entry(proc_info_root_name, NULL); | ||
612 | } | ||
613 | return 0; | ||
614 | } | ||
615 | |||
616 | /* REISERFS_PROC_INFO */ | ||
617 | #else | ||
618 | |||
/* Stub from the #else branch of the proc-info code: does nothing, reports success. */
int reiserfs_proc_info_init(struct super_block *sb)
{
	return 0;
}
/* Stub from the #else branch of the proc-info code: does nothing, reports success. */
int reiserfs_proc_info_done(struct super_block *sb)
{
	return 0;
}
621 | |||
622 | struct proc_dir_entry *reiserfs_proc_register_global( char *name, | ||
623 | read_proc_t *func ) | ||
624 | { return NULL; } | ||
625 | |||
/* Stub from the #else branch of the proc-info code: nothing to unregister. */
void reiserfs_proc_unregister_global(const char *name)
{
}
627 | |||
/* Stub from the #else branch of the proc-info code: does nothing, reports success. */
int reiserfs_proc_info_global_init(void)
{
	return 0;
}
/* Stub from the #else branch of the proc-info code: does nothing, reports success. */
int reiserfs_proc_info_global_done(void)
{
	return 0;
}
630 | |||
/*
 * Stub from the #else branch of the proc-info code: a read handler that
 * produces no output.  None of the arguments is touched; returns 0.
 */
int reiserfs_global_version_in_proc(char *buffer, char **start,
				    off_t offset,
				    int count, int *eof, void *data)
{
	return 0;
}
635 | |||
636 | /* REISERFS_PROC_INFO */ | ||
637 | #endif | ||
638 | |||
639 | /* | ||
640 | * $Log: procfs.c,v $ | ||
641 | * Revision 1.1.8.2 2001/07/15 17:08:42 god | ||
642 | * . use get_super() in procfs.c | ||
643 | * . remove remove_save_link() from reiserfs_do_truncate() | ||
644 | * | ||
645 | * I accept terms and conditions stated in the Legal Agreement | ||
646 | * (available at http://www.namesys.com/legalese.html) | ||
647 | * | ||
648 | * Revision 1.1.8.1 2001/07/11 16:48:50 god | ||
649 | * proc info support | ||
650 | * | ||
651 | * I accept terms and conditions stated in the Legal Agreement | ||
652 | * (available at http://www.namesys.com/legalese.html) | ||
653 | * | ||
654 | */ | ||
655 | |||
656 | /* | ||
657 | * Make Linus happy. | ||
658 | * Local variables: | ||
659 | * c-indentation-style: "K&R" | ||
660 | * mode-name: "LC" | ||
661 | * c-basic-offset: 8 | ||
662 | * tab-width: 8 | ||
663 | * End: | ||
664 | */ | ||
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c new file mode 100644 index 000000000000..170012078b76 --- /dev/null +++ b/fs/reiserfs/resize.c | |||
@@ -0,0 +1,182 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | */ | ||
4 | |||
5 | /* | ||
6 | * Written by Alexander Zarochentcev. | ||
7 | * | ||
8 | * The kernel part of the (on-line) reiserfs resizer. | ||
9 | */ | ||
10 | |||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/vmalloc.h> | ||
14 | #include <linux/string.h> | ||
15 | #include <linux/errno.h> | ||
16 | #include <linux/reiserfs_fs.h> | ||
17 | #include <linux/reiserfs_fs_sb.h> | ||
18 | #include <linux/buffer_head.h> | ||
19 | |||
20 | int reiserfs_resize (struct super_block * s, unsigned long block_count_new) | ||
21 | { | ||
22 | int err = 0; | ||
23 | struct reiserfs_super_block * sb; | ||
24 | struct reiserfs_bitmap_info *bitmap; | ||
25 | struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s); | ||
26 | struct buffer_head * bh; | ||
27 | struct reiserfs_transaction_handle th; | ||
28 | unsigned int bmap_nr_new, bmap_nr; | ||
29 | unsigned int block_r_new, block_r; | ||
30 | |||
31 | struct reiserfs_list_bitmap * jb; | ||
32 | struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS]; | ||
33 | |||
34 | unsigned long int block_count, free_blocks; | ||
35 | int i; | ||
36 | int copy_size ; | ||
37 | |||
38 | sb = SB_DISK_SUPER_BLOCK(s); | ||
39 | |||
40 | if (SB_BLOCK_COUNT(s) >= block_count_new) { | ||
41 | printk("can\'t shrink filesystem on-line\n"); | ||
42 | return -EINVAL; | ||
43 | } | ||
44 | |||
45 | /* check the device size */ | ||
46 | bh = sb_bread(s, block_count_new - 1); | ||
47 | if (!bh) { | ||
48 | printk("reiserfs_resize: can\'t read last block\n"); | ||
49 | return -EINVAL; | ||
50 | } | ||
51 | bforget(bh); | ||
52 | |||
53 | /* old disk layout detection; those partitions can be mounted, but | ||
54 | * cannot be resized */ | ||
55 | if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size | ||
56 | != REISERFS_DISK_OFFSET_IN_BYTES ) { | ||
57 | printk("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n"); | ||
58 | return -ENOTSUPP; | ||
59 | } | ||
60 | |||
61 | /* count used bits in last bitmap block */ | ||
62 | block_r = SB_BLOCK_COUNT(s) - | ||
63 | (SB_BMAP_NR(s) - 1) * s->s_blocksize * 8; | ||
64 | |||
65 | /* count bitmap blocks in new fs */ | ||
66 | bmap_nr_new = block_count_new / ( s->s_blocksize * 8 ); | ||
67 | block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8; | ||
68 | if (block_r_new) | ||
69 | bmap_nr_new++; | ||
70 | else | ||
71 | block_r_new = s->s_blocksize * 8; | ||
72 | |||
73 | /* save old values */ | ||
74 | block_count = SB_BLOCK_COUNT(s); | ||
75 | bmap_nr = SB_BMAP_NR(s); | ||
76 | |||
77 | /* resizing of reiserfs bitmaps (journal and real), if needed */ | ||
78 | if (bmap_nr_new > bmap_nr) { | ||
79 | /* reallocate journal bitmaps */ | ||
80 | if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) { | ||
81 | printk("reiserfs_resize: unable to allocate memory for journal bitmaps\n"); | ||
82 | unlock_super(s) ; | ||
83 | return -ENOMEM ; | ||
84 | } | ||
85 | /* the new journal bitmaps are zero filled, now we copy in the bitmap | ||
86 | ** node pointers from the old journal bitmap structs, and then | ||
87 | ** transfer the new data structures into the journal struct. | ||
88 | ** | ||
89 | ** using the copy_size var below allows this code to work for | ||
90 | ** both shrinking and expanding the FS. | ||
91 | */ | ||
92 | copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr ; | ||
93 | copy_size = copy_size * sizeof(struct reiserfs_list_bitmap_node *) ; | ||
94 | for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) { | ||
95 | struct reiserfs_bitmap_node **node_tmp ; | ||
96 | jb = SB_JOURNAL(s)->j_list_bitmap + i ; | ||
97 | memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size) ; | ||
98 | |||
99 | /* just in case vfree schedules on us, copy the new | ||
100 | ** pointer into the journal struct before freeing the | ||
101 | ** old one | ||
102 | */ | ||
103 | node_tmp = jb->bitmaps ; | ||
104 | jb->bitmaps = jbitmap[i].bitmaps ; | ||
105 | vfree(node_tmp) ; | ||
106 | } | ||
107 | |||
108 | /* allocate additional bitmap blocks, reallocate array of bitmap | ||
109 | * block pointers */ | ||
110 | bitmap = vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); | ||
111 | if (!bitmap) { | ||
112 | /* Journal bitmaps are still supersized, but the memory isn't | ||
113 | * leaked, so I guess it's ok */ | ||
114 | printk("reiserfs_resize: unable to allocate memory.\n"); | ||
115 | return -ENOMEM; | ||
116 | } | ||
117 | memset (bitmap, 0, sizeof (struct reiserfs_bitmap_info) * SB_BMAP_NR(s)); | ||
118 | for (i = 0; i < bmap_nr; i++) | ||
119 | bitmap[i] = old_bitmap[i]; | ||
120 | |||
121 | /* This doesn't go through the journal, but it doesn't have to. | ||
122 | * The changes are still atomic: We're synced up when the journal | ||
123 | * transaction begins, and the new bitmaps don't matter if the | ||
124 | * transaction fails. */ | ||
125 | for (i = bmap_nr; i < bmap_nr_new; i++) { | ||
126 | bitmap[i].bh = sb_getblk(s, i * s->s_blocksize * 8); | ||
127 | memset(bitmap[i].bh->b_data, 0, sb_blocksize(sb)); | ||
128 | reiserfs_test_and_set_le_bit(0, bitmap[i].bh->b_data); | ||
129 | |||
130 | set_buffer_uptodate(bitmap[i].bh); | ||
131 | mark_buffer_dirty(bitmap[i].bh) ; | ||
132 | sync_dirty_buffer(bitmap[i].bh); | ||
133 | // update bitmap_info stuff | ||
134 | bitmap[i].first_zero_hint=1; | ||
135 | bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; | ||
136 | } | ||
137 | /* free old bitmap blocks array */ | ||
138 | SB_AP_BITMAP(s) = bitmap; | ||
139 | vfree (old_bitmap); | ||
140 | } | ||
141 | |||
142 | /* begin transaction, if there was an error, it's fine. Yes, we have | ||
143 | * incorrect bitmaps now, but none of it is ever going to touch the | ||
144 | * disk anyway. */ | ||
145 | err = journal_begin(&th, s, 10); | ||
146 | if (err) | ||
147 | return err; | ||
148 | |||
149 | /* correct last bitmap blocks in old and new disk layout */ | ||
150 | reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[bmap_nr - 1].bh, 1); | ||
151 | for (i = block_r; i < s->s_blocksize * 8; i++) | ||
152 | reiserfs_test_and_clear_le_bit(i, | ||
153 | SB_AP_BITMAP(s)[bmap_nr - 1].bh->b_data); | ||
154 | SB_AP_BITMAP(s)[bmap_nr - 1].free_count += s->s_blocksize * 8 - block_r; | ||
155 | if ( !SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint) | ||
156 | SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint = block_r; | ||
157 | |||
158 | journal_mark_dirty(&th, s, SB_AP_BITMAP(s)[bmap_nr - 1].bh); | ||
159 | |||
160 | reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[bmap_nr_new - 1].bh, 1); | ||
161 | for (i = block_r_new; i < s->s_blocksize * 8; i++) | ||
162 | reiserfs_test_and_set_le_bit(i, | ||
163 | SB_AP_BITMAP(s)[bmap_nr_new - 1].bh->b_data); | ||
164 | journal_mark_dirty(&th, s, SB_AP_BITMAP(s)[bmap_nr_new - 1].bh); | ||
165 | |||
166 | SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count -= s->s_blocksize * 8 - block_r_new; | ||
167 | /* Extreme case where last bitmap is the only valid block in itself. */ | ||
168 | if ( !SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count ) | ||
169 | SB_AP_BITMAP(s)[bmap_nr_new - 1].first_zero_hint = 0; | ||
170 | /* update super */ | ||
171 | reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; | ||
172 | free_blocks = SB_FREE_BLOCKS(s); | ||
173 | PUT_SB_FREE_BLOCKS(s, free_blocks + (block_count_new - block_count - (bmap_nr_new - bmap_nr))); | ||
174 | PUT_SB_BLOCK_COUNT(s, block_count_new); | ||
175 | PUT_SB_BMAP_NR(s, bmap_nr_new); | ||
176 | s->s_dirt = 1; | ||
177 | |||
178 | journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); | ||
179 | |||
180 | SB_JOURNAL(s)->j_must_wait = 1; | ||
181 | return journal_end(&th, s, 10); | ||
182 | } | ||
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c new file mode 100644 index 000000000000..73ec5212178b --- /dev/null +++ b/fs/reiserfs/stree.c | |||
@@ -0,0 +1,2073 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | */ | ||
4 | |||
5 | /* | ||
6 | * Written by Anatoly P. Pinchuk pap@namesys.botik.ru | ||
7 | * Programm System Institute | ||
8 | * Pereslavl-Zalessky Russia | ||
9 | */ | ||
10 | |||
11 | /* | ||
12 | * This file contains functions dealing with S+tree | ||
13 | * | ||
14 | * B_IS_IN_TREE | ||
15 | * copy_item_head | ||
16 | * comp_short_keys | ||
17 | * comp_keys | ||
18 | * comp_short_le_keys | ||
19 | * le_key2cpu_key | ||
20 | * comp_le_keys | ||
21 | * bin_search | ||
22 | * get_lkey | ||
23 | * get_rkey | ||
24 | * key_in_buffer | ||
25 | * decrement_bcount | ||
26 | * decrement_counters_in_path | ||
27 | * reiserfs_check_path | ||
28 | * pathrelse_and_restore | ||
29 | * pathrelse | ||
30 | * search_by_key_reada | ||
31 | * search_by_key | ||
32 | * search_for_position_by_key | ||
33 | * comp_items | ||
34 | * prepare_for_direct_item | ||
35 | * prepare_for_direntry_item | ||
36 | * prepare_for_delete_or_cut | ||
37 | * calc_deleted_bytes_number | ||
38 | * init_tb_struct | ||
39 | * padd_item | ||
40 | * reiserfs_delete_item | ||
41 | * reiserfs_delete_solid_item | ||
42 | * reiserfs_delete_object | ||
43 | * maybe_indirect_to_direct | ||
44 | * indirect_to_direct_roll_back | ||
45 | * reiserfs_cut_from_item | ||
46 | * truncate_directory | ||
47 | * reiserfs_do_truncate | ||
48 | * reiserfs_paste_into_item | ||
49 | * reiserfs_insert_item | ||
50 | */ | ||
51 | |||
52 | #include <linux/config.h> | ||
53 | #include <linux/time.h> | ||
54 | #include <linux/string.h> | ||
55 | #include <linux/pagemap.h> | ||
56 | #include <linux/reiserfs_fs.h> | ||
57 | #include <linux/smp_lock.h> | ||
58 | #include <linux/buffer_head.h> | ||
59 | #include <linux/quotaops.h> | ||
60 | |||
61 | /* Does the buffer contain a disk block which is in the tree. */ | ||
62 | inline int B_IS_IN_TREE (const struct buffer_head * p_s_bh) | ||
63 | { | ||
64 | |||
65 | RFALSE( B_LEVEL (p_s_bh) > MAX_HEIGHT, | ||
66 | "PAP-1010: block (%b) has too big level (%z)", p_s_bh, p_s_bh); | ||
67 | |||
68 | return ( B_LEVEL (p_s_bh) != FREE_LEVEL ); | ||
69 | } | ||
70 | |||
71 | // | ||
72 | // to gets item head in le form | ||
73 | // | ||
74 | inline void copy_item_head(struct item_head * p_v_to, | ||
75 | const struct item_head * p_v_from) | ||
76 | { | ||
77 | memcpy (p_v_to, p_v_from, IH_SIZE); | ||
78 | } | ||
79 | |||
80 | |||
81 | /* k1 is pointer to on-disk structure which is stored in little-endian | ||
82 | form. k2 is pointer to cpu variable. For key of items of the same | ||
83 | object this returns 0. | ||
84 | Returns: -1 if key1 < key2 | ||
85 | 0 if key1 == key2 | ||
86 | 1 if key1 > key2 */ | ||
87 | inline int comp_short_keys (const struct reiserfs_key * le_key, | ||
88 | const struct cpu_key * cpu_key) | ||
89 | { | ||
90 | __u32 * p_s_le_u32, * p_s_cpu_u32; | ||
91 | int n_key_length = REISERFS_SHORT_KEY_LEN; | ||
92 | |||
93 | p_s_le_u32 = (__u32 *)le_key; | ||
94 | p_s_cpu_u32 = (__u32 *)&cpu_key->on_disk_key; | ||
95 | for( ; n_key_length--; ++p_s_le_u32, ++p_s_cpu_u32 ) { | ||
96 | if ( le32_to_cpu (*p_s_le_u32) < *p_s_cpu_u32 ) | ||
97 | return -1; | ||
98 | if ( le32_to_cpu (*p_s_le_u32) > *p_s_cpu_u32 ) | ||
99 | return 1; | ||
100 | } | ||
101 | |||
102 | return 0; | ||
103 | } | ||
104 | |||
105 | |||
106 | /* k1 is pointer to on-disk structure which is stored in little-endian | ||
107 | form. k2 is pointer to cpu variable. | ||
108 | Compare keys using all 4 key fields. | ||
109 | Returns: -1 if key1 < key2 0 | ||
110 | if key1 = key2 1 if key1 > key2 */ | ||
111 | static inline int comp_keys (const struct reiserfs_key * le_key, const struct cpu_key * cpu_key) | ||
112 | { | ||
113 | int retval; | ||
114 | |||
115 | retval = comp_short_keys (le_key, cpu_key); | ||
116 | if (retval) | ||
117 | return retval; | ||
118 | if (le_key_k_offset (le_key_version(le_key), le_key) < cpu_key_k_offset (cpu_key)) | ||
119 | return -1; | ||
120 | if (le_key_k_offset (le_key_version(le_key), le_key) > cpu_key_k_offset (cpu_key)) | ||
121 | return 1; | ||
122 | |||
123 | if (cpu_key->key_length == 3) | ||
124 | return 0; | ||
125 | |||
126 | /* this part is needed only when tail conversion is in progress */ | ||
127 | if (le_key_k_type (le_key_version(le_key), le_key) < cpu_key_k_type (cpu_key)) | ||
128 | return -1; | ||
129 | |||
130 | if (le_key_k_type (le_key_version(le_key), le_key) > cpu_key_k_type (cpu_key)) | ||
131 | return 1; | ||
132 | |||
133 | return 0; | ||
134 | } | ||
135 | |||
136 | |||
137 | inline int comp_short_le_keys (const struct reiserfs_key * key1, const struct reiserfs_key * key2) | ||
138 | { | ||
139 | __u32 * p_s_1_u32, * p_s_2_u32; | ||
140 | int n_key_length = REISERFS_SHORT_KEY_LEN; | ||
141 | |||
142 | p_s_1_u32 = (__u32 *)key1; | ||
143 | p_s_2_u32 = (__u32 *)key2; | ||
144 | for( ; n_key_length--; ++p_s_1_u32, ++p_s_2_u32 ) { | ||
145 | if ( le32_to_cpu (*p_s_1_u32) < le32_to_cpu (*p_s_2_u32) ) | ||
146 | return -1; | ||
147 | if ( le32_to_cpu (*p_s_1_u32) > le32_to_cpu (*p_s_2_u32) ) | ||
148 | return 1; | ||
149 | } | ||
150 | return 0; | ||
151 | } | ||
152 | |||
153 | inline void le_key2cpu_key (struct cpu_key * to, const struct reiserfs_key * from) | ||
154 | { | ||
155 | to->on_disk_key.k_dir_id = le32_to_cpu (from->k_dir_id); | ||
156 | to->on_disk_key.k_objectid = le32_to_cpu (from->k_objectid); | ||
157 | |||
158 | // find out version of the key | ||
159 | to->version = le_key_version (from); | ||
160 | if (to->version == KEY_FORMAT_3_5) { | ||
161 | to->on_disk_key.u.k_offset_v1.k_offset = le32_to_cpu (from->u.k_offset_v1.k_offset); | ||
162 | to->on_disk_key.u.k_offset_v1.k_uniqueness = le32_to_cpu (from->u.k_offset_v1.k_uniqueness); | ||
163 | } else { | ||
164 | to->on_disk_key.u.k_offset_v2.k_offset = offset_v2_k_offset(&from->u.k_offset_v2); | ||
165 | to->on_disk_key.u.k_offset_v2.k_type = offset_v2_k_type(&from->u.k_offset_v2); | ||
166 | } | ||
167 | } | ||
168 | |||
169 | |||
170 | |||
171 | // this does not say which one is bigger, it only returns 1 if keys | ||
172 | // are not equal, 0 otherwise | ||
173 | inline int comp_le_keys (const struct reiserfs_key * k1, const struct reiserfs_key * k2) | ||
174 | { | ||
175 | return memcmp (k1, k2, sizeof (struct reiserfs_key)); | ||
176 | } | ||
177 | |||
178 | /************************************************************************** | ||
179 | * Binary search toolkit function * | ||
180 | * Search for an item in the array by the item key * | ||
181 | * Returns: 1 if found, 0 if not found; * | ||
182 | * *p_n_pos = number of the searched element if found, else the * | ||
183 | * number of the first element that is larger than p_v_key. * | ||
184 | **************************************************************************/ | ||
185 | /* For those not familiar with binary search: n_lbound is the leftmost item that it | ||
186 | could be, n_rbound the rightmost item that it could be. We examine the item | ||
187 | halfway between n_lbound and n_rbound, and that tells us either that we can increase | ||
188 | n_lbound, or decrease n_rbound, or that we have found it, or if n_lbound <= n_rbound that | ||
189 | there are no possible items, and we have not found it. With each examination we | ||
190 | cut the number of possible items it could be by one more than half rounded down, | ||
191 | or we find it. */ | ||
192 | static inline int bin_search ( | ||
193 | const void * p_v_key, /* Key to search for. */ | ||
194 | const void * p_v_base,/* First item in the array. */ | ||
195 | int p_n_num, /* Number of items in the array. */ | ||
196 | int p_n_width, /* Item size in the array. | ||
197 | searched. Lest the reader be | ||
198 | confused, note that this is crafted | ||
199 | as a general function, and when it | ||
200 | is applied specifically to the array | ||
201 | of item headers in a node, p_n_width | ||
202 | is actually the item header size not | ||
203 | the item size. */ | ||
204 | int * p_n_pos /* Number of the searched for element. */ | ||
205 | ) { | ||
206 | int n_rbound, n_lbound, n_j; | ||
207 | |||
208 | for ( n_j = ((n_rbound = p_n_num - 1) + (n_lbound = 0))/2; n_lbound <= n_rbound; n_j = (n_rbound + n_lbound)/2 ) | ||
209 | switch( comp_keys((struct reiserfs_key *)((char * )p_v_base + n_j * p_n_width), (struct cpu_key *)p_v_key) ) { | ||
210 | case -1: n_lbound = n_j + 1; continue; | ||
211 | case 1: n_rbound = n_j - 1; continue; | ||
212 | case 0: *p_n_pos = n_j; return ITEM_FOUND; /* Key found in the array. */ | ||
213 | } | ||
214 | |||
215 | /* bin_search did not find given key, it returns position of key, | ||
216 | that is minimal and greater than the given one. */ | ||
217 | *p_n_pos = n_lbound; | ||
218 | return ITEM_NOT_FOUND; | ||
219 | } | ||
220 | |||
221 | #ifdef CONFIG_REISERFS_CHECK | ||
222 | extern struct tree_balance * cur_tb; | ||
223 | #endif | ||
224 | |||
225 | |||
226 | |||
227 | /* Minimal possible key. It is never in the tree. */ | ||
228 | const struct reiserfs_key MIN_KEY = {0, 0, {{0, 0},}}; | ||
229 | |||
230 | /* Maximal possible key. It is never in the tree. */ | ||
231 | const struct reiserfs_key MAX_KEY = {0xffffffff, 0xffffffff, {{0xffffffff, 0xffffffff},}}; | ||
232 | |||
233 | |||
234 | /* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom | ||
235 | of the path, and going upwards. We must check the path's validity at each step. If the key is not in | ||
236 | the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this | ||
237 | case we return a special key, either MIN_KEY or MAX_KEY. */ | ||
238 | static inline const struct reiserfs_key * get_lkey ( | ||
239 | const struct path * p_s_chk_path, | ||
240 | const struct super_block * p_s_sb | ||
241 | ) { | ||
242 | int n_position, n_path_offset = p_s_chk_path->path_length; | ||
243 | struct buffer_head * p_s_parent; | ||
244 | |||
245 | RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET, | ||
246 | "PAP-5010: invalid offset in the path"); | ||
247 | |||
248 | /* While not higher in path than first element. */ | ||
249 | while ( n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) { | ||
250 | |||
251 | RFALSE( ! buffer_uptodate(PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)), | ||
252 | "PAP-5020: parent is not uptodate"); | ||
253 | |||
254 | /* Parent at the path is not in the tree now. */ | ||
255 | if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) ) | ||
256 | return &MAX_KEY; | ||
257 | /* Check whether position in the parent is correct. */ | ||
258 | if ( (n_position = PATH_OFFSET_POSITION(p_s_chk_path, n_path_offset)) > B_NR_ITEMS(p_s_parent) ) | ||
259 | return &MAX_KEY; | ||
260 | /* Check whether parent at the path really points to the child. */ | ||
261 | if ( B_N_CHILD_NUM(p_s_parent, n_position) != | ||
262 | PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset + 1)->b_blocknr ) | ||
263 | return &MAX_KEY; | ||
264 | /* Return delimiting key if position in the parent is not equal to zero. */ | ||
265 | if ( n_position ) | ||
266 | return B_N_PDELIM_KEY(p_s_parent, n_position - 1); | ||
267 | } | ||
268 | /* Return MIN_KEY if we are in the root of the buffer tree. */ | ||
269 | if ( PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == | ||
270 | SB_ROOT_BLOCK (p_s_sb) ) | ||
271 | return &MIN_KEY; | ||
272 | return &MAX_KEY; | ||
273 | } | ||
274 | |||
275 | |||
276 | /* Get delimiting key of the buffer at the path and its right neighbor. */ | ||
277 | inline const struct reiserfs_key * get_rkey ( | ||
278 | const struct path * p_s_chk_path, | ||
279 | const struct super_block * p_s_sb | ||
280 | ) { | ||
281 | int n_position, | ||
282 | n_path_offset = p_s_chk_path->path_length; | ||
283 | struct buffer_head * p_s_parent; | ||
284 | |||
285 | RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET, | ||
286 | "PAP-5030: invalid offset in the path"); | ||
287 | |||
288 | while ( n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) { | ||
289 | |||
290 | RFALSE( ! buffer_uptodate(PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)), | ||
291 | "PAP-5040: parent is not uptodate"); | ||
292 | |||
293 | /* Parent at the path is not in the tree now. */ | ||
294 | if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) ) | ||
295 | return &MIN_KEY; | ||
296 | /* Check whether position in the parent is correct. */ | ||
297 | if ( (n_position = PATH_OFFSET_POSITION(p_s_chk_path, n_path_offset)) > B_NR_ITEMS(p_s_parent) ) | ||
298 | return &MIN_KEY; | ||
299 | /* Check whether parent at the path really points to the child. */ | ||
300 | if ( B_N_CHILD_NUM(p_s_parent, n_position) != | ||
301 | PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset + 1)->b_blocknr ) | ||
302 | return &MIN_KEY; | ||
303 | /* Return delimiting key if position in the parent is not the last one. */ | ||
304 | if ( n_position != B_NR_ITEMS(p_s_parent) ) | ||
305 | return B_N_PDELIM_KEY(p_s_parent, n_position); | ||
306 | } | ||
307 | /* Return MAX_KEY if we are in the root of the buffer tree. */ | ||
308 | if ( PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == | ||
309 | SB_ROOT_BLOCK (p_s_sb) ) | ||
310 | return &MAX_KEY; | ||
311 | return &MIN_KEY; | ||
312 | } | ||
313 | |||
314 | |||
315 | /* Check whether a key is contained in the tree rooted from a buffer at a path. */ | ||
316 | /* This works by looking at the left and right delimiting keys for the buffer in the last path_element in | ||
317 | the path. These delimiting keys are stored at least one level above that buffer in the tree. If the | ||
318 | buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in | ||
319 | this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */ | ||
/* Returns 1 when p_s_key lies between the two delimiting keys (the left key is not
   greater than p_s_key and the right key is strictly greater than p_s_key), and 0
   otherwise. This reading assumes comp_keys() yields 1 when its first argument is
   the greater one, as the comments in the body indicate. */
320 | static inline int key_in_buffer ( | ||
321 | struct path * p_s_chk_path, /* Path which should be checked. */ | ||
322 | const struct cpu_key * p_s_key, /* Key which should be checked. */ | ||
323 | struct super_block * p_s_sb /* Super block pointer. */ | ||
324 | ) { | ||
325 | |||
326 | RFALSE( ! p_s_key || p_s_chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET || | ||
327 | p_s_chk_path->path_length > MAX_HEIGHT, | ||
328 | "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)", | ||
329 | p_s_key, p_s_chk_path->path_length); | ||
330 | RFALSE( !PATH_PLAST_BUFFER(p_s_chk_path)->b_bdev, | ||
331 | "PAP-5060: device must not be NODEV"); | ||
332 | |||
333 | if ( comp_keys(get_lkey(p_s_chk_path, p_s_sb), p_s_key) == 1 ) | ||
334 | /* left delimiting key is bigger than the key we look for */ | ||
335 | return 0; | ||
336 | // if ( comp_keys(p_s_key, get_rkey(p_s_chk_path, p_s_sb)) != -1 ) | ||
337 | if ( comp_keys(get_rkey(p_s_chk_path, p_s_sb), p_s_key) != 1 ) | ||
338 | /* p_s_key must be less than the right delimiting key */ | ||
339 | return 0; | ||
340 | return 1; | ||
341 | } | ||
342 | |||
343 | |||
/* Drop one reference on p_s_bh. A NULL pointer is ignored. When b_count is
   non-zero the reference is released with put_bh(); when b_count is already
   zero the function calls reiserfs_panic() instead of releasing. */
344 | inline void decrement_bcount( | ||
345 | struct buffer_head * p_s_bh | ||
346 | ) { | ||
347 | if ( p_s_bh ) { | ||
348 | if ( atomic_read (&(p_s_bh->b_count)) ) { | ||
349 | put_bh(p_s_bh) ; | ||
350 | return; | ||
351 | } | ||
352 | reiserfs_panic(NULL, "PAP-5070: decrement_bcount: trying to free free buffer %b", p_s_bh); | ||
353 | } | ||
354 | } | ||
355 | |||
356 | |||
357 | /* Decrement the b_count field of all the buffers in the path. */ | ||
/* Walks from path_length down to (but not including) ILLEGAL_PATH_ELEMENT_OFFSET,
   passing each path buffer to decrement_bcount(), and then marks the path as
   empty by setting path_length to ILLEGAL_PATH_ELEMENT_OFFSET. */
358 | void decrement_counters_in_path ( | ||
359 | struct path * p_s_search_path | ||
360 | ) { | ||
361 | int n_path_offset = p_s_search_path->path_length; | ||
362 | |||
363 | RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET || | ||
364 | n_path_offset > EXTENDED_MAX_HEIGHT - 1, | ||
365 | "PAP-5080: invalid path offset of %d", n_path_offset); | ||
366 | |||
367 | while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) { | ||
368 | struct buffer_head * bh; | ||
369 | |||
370 | bh = PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--); | ||
371 | decrement_bcount (bh); | ||
372 | } | ||
373 | p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; | ||
374 | } | ||
375 | |||
376 | |||
/* Sanity check that a path has been released: the RFALSE check fires when
   path_length is not ILLEGAL_PATH_ELEMENT_OFFSET. Always returns 0. */
377 | int reiserfs_check_path(struct path *p) { | ||
378 | RFALSE( p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET, | ||
379 | "path not properly relsed") ; | ||
380 | return 0 ; | ||
381 | } | ||
382 | |||
383 | |||
384 | /* Release all buffers in the path. Before each buffer is released, | ||
385 | ** reiserfs_restore_prepared_buffer() is called on it (presumably to restore the dirty bits cleared when the buffer was prepared for the log - confirm). | ||
386 | ** | ||
387 | ** only called from fix_nodes() | ||
388 | */ | ||
/* On return path_length is ILLEGAL_PATH_ELEMENT_OFFSET, i.e. the path is empty. */
389 | void pathrelse_and_restore ( | ||
390 | struct super_block *s, | ||
391 | struct path * p_s_search_path | ||
392 | ) { | ||
393 | int n_path_offset = p_s_search_path->path_length; | ||
394 | |||
395 | RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, | ||
396 | "clm-4000: invalid path offset"); | ||
397 | |||
/* Walk from the last path element back towards the first one. */
398 | while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) { | ||
399 | reiserfs_restore_prepared_buffer(s, PATH_OFFSET_PBUFFER(p_s_search_path, | ||
400 | n_path_offset)); | ||
401 | brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--)); | ||
402 | } | ||
403 | p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; | ||
404 | } | ||
405 | |||
406 | /* Release all buffers in the path. */ | ||
/* Every buffer from path_length down to (but not including)
   ILLEGAL_PATH_ELEMENT_OFFSET is passed to brelse(); afterwards path_length is
   reset to ILLEGAL_PATH_ELEMENT_OFFSET so the path reads as empty. */
407 | void pathrelse ( | ||
408 | struct path * p_s_search_path | ||
409 | ) { | ||
410 | int n_path_offset = p_s_search_path->path_length; | ||
411 | |||
412 | RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, | ||
413 | "PAP-5090: invalid path offset"); | ||
414 | |||
415 | while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) | ||
416 | brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--)); | ||
417 | |||
418 | p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; | ||
419 | } | ||
420 | |||
421 | |||
422 | |||
/* returns 1 if buf looks like a leaf node, 0 otherwise.
   buf is the raw node contents, blocksize its size in bytes; bh is used only in
   the warning messages. The checks performed are: the block head level, the
   item count range, agreement between the free space field and the space
   implied by the last item head, and, for every item head, its type, location
   and length, plus the requirement that each item ends exactly where the
   previous one begins (the first item ends at blocksize). */
423 | static int is_leaf (char * buf, int blocksize, struct buffer_head * bh) | ||
424 | { | ||
425 | struct block_head * blkh; | ||
426 | struct item_head * ih; | ||
427 | int used_space; | ||
428 | int prev_location; | ||
429 | int i; | ||
430 | int nr; | ||
431 | |||
432 | blkh = (struct block_head *)buf; | ||
433 | if ( blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) { | ||
434 | reiserfs_warning (NULL, "is_leaf: this should be caught earlier"); | ||
435 | return 0; | ||
436 | } | ||
437 | |||
438 | nr = blkh_nr_item(blkh); | ||
439 | if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) { | ||
440 | /* item number is too big or too small */ | ||
441 | reiserfs_warning (NULL, "is_leaf: nr_item seems wrong: %z", bh); | ||
442 | return 0; | ||
443 | } | ||
/* ih points at the last item head; its location is where the item bodies start. */
444 | ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1; | ||
445 | used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location (ih)); | ||
446 | if (used_space != blocksize - blkh_free_space(blkh)) { | ||
447 | /* free space does not match the calculated amount of used space */ | ||
448 | reiserfs_warning (NULL, "is_leaf: free space seems wrong: %z", bh); | ||
449 | return 0; | ||
450 | } | ||
451 | |||
452 | // FIXME: if the rest of is_leaf hurts performance too much - we may have to | ||
453 | // return 1 here | ||
454 | |||
455 | /* check tables of item heads */ | ||
456 | ih = (struct item_head *)(buf + BLKH_SIZE); | ||
457 | prev_location = blocksize; | ||
458 | for (i = 0; i < nr; i ++, ih ++) { | ||
459 | if ( le_ih_k_type(ih) == TYPE_ANY) { | ||
460 | reiserfs_warning (NULL, "is_leaf: wrong item type for item %h",ih); | ||
461 | return 0; | ||
462 | } | ||
463 | if (ih_location (ih) >= blocksize || ih_location (ih) < IH_SIZE * nr) { | ||
464 | reiserfs_warning (NULL, "is_leaf: item location seems wrong: %h", ih); | ||
465 | return 0; | ||
466 | } | ||
467 | if (ih_item_len (ih) < 1 || ih_item_len (ih) > MAX_ITEM_LEN (blocksize)) { | ||
468 | reiserfs_warning (NULL, "is_leaf: item length seems wrong: %h", ih); | ||
469 | return 0; | ||
470 | } | ||
/* the item must end exactly where the previous item starts */
471 | if (prev_location - ih_location (ih) != ih_item_len (ih)) { | ||
472 | reiserfs_warning (NULL, "is_leaf: item location seems wrong (second one): %h", ih); | ||
473 | return 0; | ||
474 | } | ||
475 | prev_location = ih_location (ih); | ||
476 | } | ||
477 | |||
478 | // one may imagine many more checks | ||
479 | return 1; | ||
480 | } | ||
481 | |||
482 | |||
483 | /* returns 1 if buf looks like an internal node, 0 otherwise */ | ||
/* buf is the raw node contents, blocksize its size in bytes; bh is used only in
   the warning messages. Checks the block head level, an upper bound on the
   number of keys, and that the free space field matches the space taken by
   nr keys and nr + 1 child pointers. */
484 | static int is_internal (char * buf, int blocksize, struct buffer_head * bh) | ||
485 | { | ||
486 | struct block_head * blkh; | ||
487 | int nr; | ||
488 | int used_space; | ||
489 | |||
490 | blkh = (struct block_head *)buf; | ||
/* nr temporarily holds the node level here; it is reused for the key count below */
491 | nr = blkh_level(blkh); | ||
492 | if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) { | ||
493 | /* this level is not possible for internal nodes */ | ||
494 | reiserfs_warning (NULL, "is_internal: this should be caught earlier"); | ||
495 | return 0; | ||
496 | } | ||
497 | |||
498 | nr = blkh_nr_item(blkh); | ||
499 | if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) { | ||
500 | /* for internal which is not root we might check min number of keys */ | ||
501 | reiserfs_warning (NULL, "is_internal: number of key seems wrong: %z", bh); | ||
502 | return 0; | ||
503 | } | ||
504 | |||
505 | used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1); | ||
506 | if (used_space != blocksize - blkh_free_space(blkh)) { | ||
507 | reiserfs_warning (NULL, "is_internal: free space seems wrong: %z", bh); | ||
508 | return 0; | ||
509 | } | ||
510 | |||
511 | // one may imagine many more checks | ||
512 | return 1; | ||
513 | } | ||
514 | |||
515 | |||
516 | // make sure that bh contains a formatted node of the reiserfs tree of | ||
517 | // 'level'-th level | ||
/* Returns 0 when B_LEVEL(bh) differs from level; otherwise returns the result of
   is_leaf() for DISK_LEAF_NODE_LEVEL and of is_internal() for any other level. */
518 | static int is_tree_node (struct buffer_head * bh, int level) | ||
519 | { | ||
520 | if (B_LEVEL (bh) != level) { | ||
521 | reiserfs_warning (NULL, "is_tree_node: node level %d does not match to the expected one %d", | ||
522 | B_LEVEL (bh), level); | ||
523 | return 0; | ||
524 | } | ||
525 | if (level == DISK_LEAF_NODE_LEVEL) | ||
526 | return is_leaf (bh->b_data, bh->b_size, bh); | ||
527 | |||
528 | return is_internal (bh->b_data, bh->b_size, bh); | ||
529 | } | ||
530 | |||
531 | |||
532 | |||
/* Capacity of the read-ahead arrays (reada_bh, reada_blocks) used by search_by_key(). */
533 | #define SEARCH_BY_KEY_READA 16 | ||
534 | |||
/* Issue read-ahead for the num block numbers in b. Each block is looked up with
   sb_getblk() and stored in bh[]; a READA request is then submitted for every
   buffer that is not uptodate, and each buffer is released with brelse()
   without waiting for the read. The result of sb_getblk() is not checked here. */
535 | /* The function is NOT SCHEDULE-SAFE! */ | ||
536 | static void search_by_key_reada (struct super_block * s, | ||
537 | struct buffer_head **bh, | ||
538 | unsigned long *b, int num) | ||
539 | { | ||
540 | int i,j; | ||
541 | |||
542 | for (i = 0 ; i < num ; i++) { | ||
543 | bh[i] = sb_getblk (s, b[i]); | ||
544 | } | ||
545 | for (j = 0 ; j < i ; j++) { | ||
546 | /* | ||
547 | * note, this needs attention if we are getting rid of the BKL | ||
548 | * you have to make sure the prepared bit isn't set on this buffer | ||
549 | */ | ||
550 | if (!buffer_uptodate(bh[j])) | ||
551 | ll_rw_block(READA, 1, bh + j); | ||
552 | brelse(bh[j]); | ||
553 | } | ||
554 | } | ||
555 | |||
556 | /************************************************************************** | ||
557 | * Algorithm SearchByKey * | ||
558 | * look for item in the Disk S+Tree by its key * | ||
559 | * Input: p_s_sb - super block * | ||
560 | * p_s_key - pointer to the key to search * | ||
561 | * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR * | ||
562 | * p_s_search_path - path from the root to the needed leaf * | ||
563 | **************************************************************************/ | ||
564 | |||
565 | /* This function fills up the path from the root to the leaf as it | ||
566 | descends the tree looking for the key. It uses reiserfs_bread to | ||
567 | try to find buffers in the cache given their block number. If it | ||
568 | does not find them in the cache it reads them from disk. For each | ||
569 | node search_by_key finds using reiserfs_bread it then uses | ||
570 | bin_search to look through that node. bin_search will find the | ||
571 | position of the block_number of the next node if it is looking | ||
572 | through an internal node. If it is looking through a leaf node | ||
573 | bin_search will find the position of the item which has key either | ||
574 | equal to given key, or which is the maximal key less than the given | ||
575 | key. search_by_key returns a path that must be checked for the | ||
576 | correctness of the top of the path but need not be checked for the | ||
577 | correctness of the bottom of the path */ | ||
/* NOTE(review): the paragraph above mentions reiserfs_bread, but the body below
   obtains nodes with sb_getblk() followed by ll_rw_block(READ) and
   wait_on_buffer().
   On IO_ERROR the path has been released. On any other return the path still
   holds the buffers from the root down to the node at n_stop_level, and the
   value returned is the bin_search() result for that node. */
578 | /* The function is NOT SCHEDULE-SAFE! */ | ||
579 | int search_by_key (struct super_block * p_s_sb, | ||
580 | const struct cpu_key * p_s_key, /* Key to search. */ | ||
581 | struct path * p_s_search_path, /* This structure was | ||
582 | allocated and initialized | ||
583 | by the calling | ||
584 | function. It is filled up | ||
585 | by this function. */ | ||
586 | int n_stop_level /* How far down the tree to search. To | ||
587 | stop at leaf level - set to | ||
588 | DISK_LEAF_NODE_LEVEL */ | ||
589 | ) { | ||
590 | int n_block_number; | ||
591 | int expected_level; | ||
592 | struct buffer_head * p_s_bh; | ||
593 | struct path_element * p_s_last_element; | ||
594 | int n_node_level, n_retval; | ||
595 | int right_neighbor_of_leaf_node; | ||
596 | int fs_gen; | ||
597 | struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; | ||
598 | unsigned long reada_blocks[SEARCH_BY_KEY_READA]; | ||
599 | int reada_count = 0; | ||
600 | |||
601 | #ifdef CONFIG_REISERFS_CHECK | ||
602 | int n_repeat_counter = 0; | ||
603 | #endif | ||
604 | |||
605 | PROC_INFO_INC( p_s_sb, search_by_key ); | ||
606 | |||
607 | /* As we add each node to a path we increase its count. This means that | ||
608 | we must be careful to release all nodes in a path before we either | ||
609 | discard the path struct or re-use the path struct, as we do here. */ | ||
610 | |||
611 | decrement_counters_in_path(p_s_search_path); | ||
612 | |||
613 | right_neighbor_of_leaf_node = 0; | ||
614 | |||
615 | /* With each iteration of this loop we search through the items in the | ||
616 | current node, and calculate the next current node(next path element) | ||
617 | for the next iteration of this loop.. */ | ||
618 | n_block_number = SB_ROOT_BLOCK (p_s_sb); | ||
/* -1 means the expected level has not been taken from the super block yet */
619 | expected_level = -1; | ||
620 | while ( 1 ) { | ||
621 | |||
622 | #ifdef CONFIG_REISERFS_CHECK | ||
623 | if ( !(++n_repeat_counter % 50000) ) | ||
624 | reiserfs_warning (p_s_sb, "PAP-5100: search_by_key: %s:" | ||
625 | "there were %d iterations of while loop " | ||
626 | "looking for key %K", | ||
627 | current->comm, n_repeat_counter, p_s_key); | ||
628 | #endif | ||
629 | |||
630 | /* prep path to have another element added to it. */ | ||
631 | p_s_last_element = PATH_OFFSET_PELEMENT(p_s_search_path, ++p_s_search_path->path_length); | ||
/* remember the generation so that a change during the read below can be detected */
632 | fs_gen = get_generation (p_s_sb); | ||
633 | |||
634 | /* Read the next tree node, and set the last element in the path to | ||
635 | have a pointer to it. */ | ||
636 | if ((p_s_bh = p_s_last_element->pe_buffer = | ||
637 | sb_getblk(p_s_sb, n_block_number)) ) { | ||
/* start the collected read-ahead only when the wanted node itself needs a read */
638 | if (!buffer_uptodate(p_s_bh) && reada_count > 1) { | ||
639 | search_by_key_reada (p_s_sb, reada_bh, | ||
640 | reada_blocks, reada_count); | ||
641 | } | ||
642 | ll_rw_block(READ, 1, &p_s_bh); | ||
643 | wait_on_buffer(p_s_bh); | ||
644 | if (!buffer_uptodate(p_s_bh)) | ||
645 | goto io_error; | ||
646 | } else { | ||
/* NOTE(review): when this label is reached through the goto above, p_s_bh is
   stored in the path element that the path_length decrement drops, so
   pathrelse() does not brelse() it - confirm whether that reference leaks. */
647 | io_error: | ||
648 | p_s_search_path->path_length --; | ||
649 | pathrelse(p_s_search_path); | ||
650 | return IO_ERROR; | ||
651 | } | ||
/* the read-ahead list applies to this step only */
652 | reada_count = 0; | ||
/* first node of a (re)started search: take the height from the super block;
   every step lowers the expected level by one */
653 | if (expected_level == -1) | ||
654 | expected_level = SB_TREE_HEIGHT (p_s_sb); | ||
655 | expected_level --; | ||
656 | |||
657 | /* It is possible that schedule occurred. We must check whether the key | ||
658 | to search is still in the tree rooted from the current buffer. If | ||
659 | not then repeat search from the root. */ | ||
660 | if ( fs_changed (fs_gen, p_s_sb) && | ||
661 | (!B_IS_IN_TREE (p_s_bh) || | ||
662 | B_LEVEL(p_s_bh) != expected_level || | ||
663 | !key_in_buffer(p_s_search_path, p_s_key, p_s_sb))) { | ||
664 | PROC_INFO_INC( p_s_sb, search_by_key_fs_changed ); | ||
665 | PROC_INFO_INC( p_s_sb, search_by_key_restarted ); | ||
666 | PROC_INFO_INC( p_s_sb, sbk_restarted[ expected_level - 1 ] ); | ||
667 | decrement_counters_in_path(p_s_search_path); | ||
668 | |||
669 | /* Get the root block number so that we can repeat the search | ||
670 | starting from the root. */ | ||
671 | n_block_number = SB_ROOT_BLOCK (p_s_sb); | ||
672 | expected_level = -1; | ||
673 | right_neighbor_of_leaf_node = 0; | ||
674 | |||
675 | /* repeat search from the root */ | ||
676 | continue; | ||
677 | } | ||
678 | |||
679 | /* only check that the key is in the buffer if p_s_key is not | ||
680 | equal to the MAX_KEY. Latter case is only possible in | ||
681 | "finish_unfinished()" processing during mount. */ | ||
682 | RFALSE( comp_keys( &MAX_KEY, p_s_key ) && | ||
683 | ! key_in_buffer(p_s_search_path, p_s_key, p_s_sb), | ||
684 | "PAP-5130: key is not in the buffer"); | ||
685 | #ifdef CONFIG_REISERFS_CHECK | ||
686 | if ( cur_tb ) { | ||
687 | print_cur_tb ("5140"); | ||
688 | reiserfs_panic(p_s_sb, "PAP-5140: search_by_key: schedule occurred in do_balance!"); | ||
689 | } | ||
690 | #endif | ||
691 | |||
692 | // make sure, that the node contents look like a node of | ||
693 | // certain level | ||
694 | if (!is_tree_node (p_s_bh, expected_level)) { | ||
695 | reiserfs_warning (p_s_sb, "vs-5150: search_by_key: " | ||
696 | "invalid format found in block %ld. Fsck?", | ||
697 | p_s_bh->b_blocknr); | ||
698 | pathrelse (p_s_search_path); | ||
699 | return IO_ERROR; | ||
700 | } | ||
701 | |||
702 | /* ok, we have acquired next formatted node in the tree */ | ||
703 | n_node_level = B_LEVEL (p_s_bh); | ||
704 | |||
705 | PROC_INFO_BH_STAT( p_s_sb, p_s_bh, n_node_level - 1 ); | ||
706 | |||
707 | RFALSE( n_node_level < n_stop_level, | ||
708 | "vs-5152: tree level (%d) is less than stop level (%d)", | ||
709 | n_node_level, n_stop_level); | ||
710 | |||
/* element width is IH_SIZE in a leaf (item heads) and KEY_SIZE otherwise */
711 | n_retval = bin_search( p_s_key, B_N_PITEM_HEAD(p_s_bh, 0), | ||
712 | B_NR_ITEMS(p_s_bh), | ||
713 | ( n_node_level == DISK_LEAF_NODE_LEVEL ) ? IH_SIZE : KEY_SIZE, | ||
714 | &(p_s_last_element->pe_position)); | ||
/* stop level reached: the path stays filled and bin_search's result is returned */
715 | if (n_node_level == n_stop_level) { | ||
716 | return n_retval; | ||
717 | } | ||
718 | |||
719 | /* we are not in the stop level */ | ||
720 | if (n_retval == ITEM_FOUND) | ||
721 | /* item has been found, so we choose the pointer which is to the right of the found one */ | ||
722 | p_s_last_element->pe_position++; | ||
723 | |||
724 | /* if item was not found we choose the position which is to | ||
725 | the left of the found item. This requires no code, | ||
726 | bin_search did it already.*/ | ||
727 | |||
728 | /* So we have chosen a position in the current node which is | ||
729 | an internal node. Now we calculate child block number by | ||
730 | position in the node. */ | ||
731 | n_block_number = B_N_CHILD_NUM(p_s_bh, p_s_last_element->pe_position); | ||
732 | |||
733 | /* if we are going to read leaf nodes, try for read ahead as well */ | ||
734 | if ((p_s_search_path->reada & PATH_READA) && | ||
735 | n_node_level == DISK_LEAF_NODE_LEVEL + 1) | ||
736 | { | ||
737 | int pos = p_s_last_element->pe_position; | ||
738 | int limit = B_NR_ITEMS(p_s_bh); | ||
739 | struct reiserfs_key *le_key; | ||
740 | |||
/* backward read-ahead walks towards position 0 instead of the item count */
741 | if (p_s_search_path->reada & PATH_READA_BACK) | ||
742 | limit = 0; | ||
743 | while(reada_count < SEARCH_BY_KEY_READA) { | ||
744 | if (pos == limit) | ||
745 | break; | ||
746 | reada_blocks[reada_count++] = B_N_CHILD_NUM(p_s_bh, pos); | ||
747 | if (p_s_search_path->reada & PATH_READA_BACK) | ||
748 | pos--; | ||
749 | else | ||
750 | pos++; | ||
751 | |||
752 | /* | ||
753 | * check to make sure we're in the same object | ||
754 | */ | ||
/* NOTE(review): pos has already been advanced and is compared with limit only
   at the top of the next iteration, so the key at pos == limit can be read
   here - confirm this is harmless. */
755 | le_key = B_N_PDELIM_KEY(p_s_bh, pos); | ||
756 | if (le32_to_cpu(le_key->k_objectid) != | ||
757 | p_s_key->on_disk_key.k_objectid) | ||
758 | { | ||
759 | break; | ||
760 | } | ||
761 | } | ||
762 | } | ||
763 | } | ||
764 | } | ||
765 | |||
766 | |||
767 | /* Form the path to an item and position in this item which contains | ||
768 | file byte defined by p_s_key. If there is no such item | ||
769 | corresponding to the key, we point the path to the item with | ||
770 | maximal key less than p_s_key, and *p_n_pos_in_item is set to one | ||
771 | past the last entry/byte in the item. If searching for entry in a | ||
772 | directory item, and it is not found, *p_n_pos_in_item is set to one | ||
773 | entry more than the entry with maximal key which is less than the | ||
774 | sought key. | ||
775 | |||
776 | Note that if there is no entry in this same node which is one more, | ||
777 | then we point to an imaginary entry. for direct items, the | ||
778 | position is in units of bytes, for indirect items the position is | ||
779 | in units of blocknr entries, for directory items the position is in | ||
780 | units of directory entries. */ | ||
781 | |||
/* In the signature below the key is named p_cpu_key and the position is stored
   through pos_in_item(p_s_search_path).
   Returns: the result of search_by_entry_key() for directory entry keys;
   otherwise IO_ERROR when search_item() fails, POSITION_FOUND when the wanted
   byte lies inside the item the path points to, FILE_NOT_FOUND when the
   preceding item does not compare equal to the key under comp_short_keys(),
   and POSITION_NOT_FOUND when the preceding item does not cover the wanted
   offset. */
782 | /* The function is NOT SCHEDULE-SAFE! */ | ||
783 | int search_for_position_by_key (struct super_block * p_s_sb, /* Pointer to the super block. */ | ||
784 | const struct cpu_key * p_cpu_key, /* Key to search (cpu variable) */ | ||
785 | struct path * p_s_search_path /* Filled up by this function. */ | ||
786 | ) { | ||
787 | struct item_head * p_le_ih; /* pointer to on-disk structure */ | ||
788 | int n_blk_size; | ||
789 | loff_t item_offset, offset; | ||
790 | struct reiserfs_dir_entry de; | ||
791 | int retval; | ||
792 | |||
793 | /* If searching for directory entry. */ | ||
794 | if ( is_direntry_cpu_key (p_cpu_key) ) | ||
795 | return search_by_entry_key (p_s_sb, p_cpu_key, p_s_search_path, &de); | ||
796 | |||
797 | /* If not searching for directory entry. */ | ||
798 | |||
799 | /* Search for the item; an exact match means the position is its first unit. */ | ||
800 | retval = search_item (p_s_sb, p_cpu_key, p_s_search_path); | ||
801 | if (retval == IO_ERROR) | ||
802 | return retval; | ||
803 | if ( retval == ITEM_FOUND ) { | ||
804 | |||
805 | RFALSE( ! ih_item_len( | ||
806 | B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path), | ||
807 | PATH_LAST_POSITION(p_s_search_path))), | ||
808 | "PAP-5165: item length equals zero"); | ||
809 | |||
810 | pos_in_item(p_s_search_path) = 0; | ||
811 | return POSITION_FOUND; | ||
812 | } | ||
813 | |||
814 | RFALSE( ! PATH_LAST_POSITION(p_s_search_path), | ||
815 | "PAP-5170: position equals zero"); | ||
816 | |||
817 | /* Item is not found. Set path to the previous item. */ | ||
818 | p_le_ih = B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path), --PATH_LAST_POSITION(p_s_search_path)); | ||
819 | n_blk_size = p_s_sb->s_blocksize; | ||
820 | |||
/* a non-zero comp_short_keys() result means the previous item does not match the key */
821 | if (comp_short_keys (&(p_le_ih->ih_key), p_cpu_key)) { | ||
822 | return FILE_NOT_FOUND; | ||
823 | } | ||
824 | |||
825 | // FIXME: quite ugly this far | ||
826 | |||
827 | item_offset = le_ih_k_offset (p_le_ih); | ||
828 | offset = cpu_key_k_offset (p_cpu_key); | ||
829 | |||
830 | /* Needed byte is contained in the item pointed to by the path.*/ | ||
831 | if (item_offset <= offset && | ||
832 | item_offset + op_bytes_number (p_le_ih, n_blk_size) > offset) { | ||
833 | pos_in_item (p_s_search_path) = offset - item_offset; | ||
/* for an indirect item the byte distance is converted to a block pointer index */
834 | if ( is_indirect_le_ih(p_le_ih) ) { | ||
835 | pos_in_item (p_s_search_path) /= n_blk_size; | ||
836 | } | ||
837 | return POSITION_FOUND; | ||
838 | } | ||
839 | |||
840 | /* Needed byte is not contained in the item pointed to by the | ||
841 | path. Set pos_in_item out of the item. */ | ||
842 | if ( is_indirect_le_ih (p_le_ih) ) | ||
843 | pos_in_item (p_s_search_path) = ih_item_len(p_le_ih) / UNFM_P_SIZE; | ||
844 | else | ||
845 | pos_in_item (p_s_search_path) = ih_item_len( p_le_ih ); | ||
846 | |||
847 | return POSITION_NOT_FOUND; | ||
848 | } | ||
849 | |||
850 | |||
851 | /* Compare given item and item pointed to by the path. */ | ||
/* Returns 1 when the last buffer of the path is no longer in the tree or the
   last path position is not below B_NR_ITEMS; otherwise returns the memcmp()
   result over IH_SIZE bytes of the two item heads, so 0 means the stored head
   and the head at the path are identical. */
852 | int comp_items (const struct item_head * stored_ih, const struct path * p_s_path) | ||
853 | { | ||
854 | struct buffer_head * p_s_bh; | ||
855 | struct item_head * ih; | ||
856 | |||
857 | /* Last buffer at the path is not in the tree. */ | ||
858 | if ( ! B_IS_IN_TREE(p_s_bh = PATH_PLAST_BUFFER(p_s_path)) ) | ||
859 | return 1; | ||
860 | |||
861 | /* Last path position is invalid. */ | ||
862 | if ( PATH_LAST_POSITION(p_s_path) >= B_NR_ITEMS(p_s_bh) ) | ||
863 | return 1; | ||
864 | |||
865 | /* we need only to know, whether it is the same item */ | ||
866 | ih = get_ih (p_s_path); | ||
867 | return memcmp (stored_ih, ih, IH_SIZE); | ||
868 | } | ||
869 | |||
870 | |||
871 | /* unformatted nodes are not logged anymore, ever. This is safe | ||
872 | ** now | ||
873 | */ | ||
/* true when b_count of bh is greater than 1 (presumably: someone besides the
   caller holds a reference - confirm against the callers) */
874 | #define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1) | ||
875 | |||
876 | // block can not be forgotten as it is in I/O (buffer locked) or held by someone | ||
877 | #define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh))) | ||
878 | |||
879 | |||
880 | |||
881 | // prepare for delete or cut of direct item | ||
/* Decides whether the direct item le_ih is deleted or cut for the given
   new_file_length. Returns M_DELETE or M_CUT and stores the (negative) size
   change in *cut_size: the whole item plus its head for M_DELETE, only the
   bytes past the cut position for M_CUT. In the M_CUT case pos_in_item(path)
   is set to the first position to be cut. */
882 | static inline int prepare_for_direct_item (struct path * path, | ||
883 | struct item_head * le_ih, | ||
884 | struct inode * inode, | ||
885 | loff_t new_file_length, | ||
886 | int * cut_size) | ||
887 | { | ||
888 | loff_t round_len; | ||
889 | |||
890 | |||
891 | if ( new_file_length == max_reiserfs_offset (inode) ) { | ||
892 | /* item has to be deleted */ | ||
893 | *cut_size = -(IH_SIZE + ih_item_len(le_ih)); | ||
894 | return M_DELETE; | ||
895 | } | ||
896 | |||
897 | // new file gets truncated | ||
898 | if (get_inode_item_key_version (inode) == KEY_FORMAT_3_6) { | ||
899 | // the length is rounded with ROUND_UP before it is compared with the item offset | ||
900 | round_len = ROUND_UP (new_file_length); | ||
901 | /* this was n_new_file_length < le_ih ... */ | ||
902 | if ( round_len < le_ih_k_offset (le_ih) ) { | ||
903 | *cut_size = -(IH_SIZE + ih_item_len(le_ih)); | ||
904 | return M_DELETE; /* Delete this item. */ | ||
905 | } | ||
906 | /* Calculate first position and size for cutting from item. */ | ||
907 | pos_in_item (path) = round_len - (le_ih_k_offset (le_ih) - 1); | ||
908 | *cut_size = -(ih_item_len(le_ih) - pos_in_item(path)); | ||
909 | |||
910 | return M_CUT; /* Cut from this item. */ | ||
911 | } | ||
912 | |||
913 | |||
914 | // old file: items may have any length | ||
915 | |||
916 | if ( new_file_length < le_ih_k_offset (le_ih) ) { | ||
917 | *cut_size = -(IH_SIZE + ih_item_len(le_ih)); | ||
918 | return M_DELETE; /* Delete this item. */ | ||
919 | } | ||
920 | /* Calculate first position and size for cutting from item. */ | ||
921 | *cut_size = -(ih_item_len(le_ih) - | ||
922 | (pos_in_item (path) = new_file_length + 1 - le_ih_k_offset (le_ih))); | ||
923 | return M_CUT; /* Cut from this item. */ | ||
924 | } | ||
925 | |||
926 | |||
/* Decides whether the directory item le_ih is deleted or has one entry cut.
   Returns M_DELETE when the item starts at DOT_OFFSET and new_file_length
   equals max_reiserfs_offset(inode), or when the item holds a single entry;
   otherwise returns M_CUT for the entry at pos_in_item(path). The (negative)
   size change is stored in *cut_size. */
927 | static inline int prepare_for_direntry_item (struct path * path, | ||
928 | struct item_head * le_ih, | ||
929 | struct inode * inode, | ||
930 | loff_t new_file_length, | ||
931 | int * cut_size) | ||
932 | { | ||
933 | if (le_ih_k_offset (le_ih) == DOT_OFFSET && | ||
934 | new_file_length == max_reiserfs_offset (inode)) { | ||
935 | RFALSE( ih_entry_count (le_ih) != 2, | ||
936 | "PAP-5220: incorrect empty directory item (%h)", le_ih); | ||
937 | *cut_size = -(IH_SIZE + ih_item_len(le_ih)); | ||
938 | return M_DELETE; /* Delete the directory item containing "." and ".." entry. */ | ||
939 | } | ||
940 | |||
941 | if ( ih_entry_count (le_ih) == 1 ) { | ||
942 | /* Delete the whole directory item, since there is only one record | ||
943 | in this item */ | ||
944 | *cut_size = -(IH_SIZE + ih_item_len(le_ih)); | ||
945 | return M_DELETE; | ||
946 | } | ||
947 | |||
948 | /* Cut one record from the directory item. */ | ||
949 | *cut_size = -(DEH_SIZE + entry_length (get_last_bh (path), le_ih, pos_in_item (path))); | ||
950 | return M_CUT; | ||
951 | } | ||
952 | |||
953 | |||
954 | /* If the path points to a directory or direct item, calculate mode and the size cut, for balance. | ||
955 | If the path points to an indirect item, remove some number of its unformatted nodes. | ||
956 | In case of file truncate calculate whether this item must be deleted/truncated or last | ||
957 | unformatted node of this item will be converted to a direct item. | ||
958 | This function returns a determination of what balance mode the calling function should employ. */ | ||
/* The values returned below are M_DELETE, M_CUT and M_CONVERT. *p_n_cut_size
   receives the size change (negative, or 0 for M_CONVERT). *p_n_removed is read
   as the number of pointers already processed and is incremented for every
   further pointer handled, holes included. For M_CUT on an indirect item
   pos_in_item(p_s_path) is converted from a pointer index to a byte offset
   just before returning. */
959 | static char prepare_for_delete_or_cut( | ||
960 | struct reiserfs_transaction_handle *th, | ||
961 | struct inode * inode, | ||
962 | struct path * p_s_path, | ||
963 | const struct cpu_key * p_s_item_key, | ||
964 | int * p_n_removed, /* Number of unformatted nodes which were removed | ||
965 | from end of the file. */ | ||
966 | int * p_n_cut_size, | ||
967 | unsigned long long n_new_file_length /* MAX_KEY_OFFSET in case of delete. */ | ||
968 | ) { | ||
969 | struct super_block * p_s_sb = inode->i_sb; | ||
970 | struct item_head * p_le_ih = PATH_PITEM_HEAD(p_s_path); | ||
971 | struct buffer_head * p_s_bh = PATH_PLAST_BUFFER(p_s_path); | ||
972 | |||
973 | BUG_ON (!th->t_trans_id); | ||
974 | |||
975 | /* Stat_data item. */ | ||
976 | if ( is_statdata_le_ih (p_le_ih) ) { | ||
977 | |||
978 | RFALSE( n_new_file_length != max_reiserfs_offset (inode), | ||
979 | "PAP-5210: mode must be M_DELETE"); | ||
980 | |||
981 | *p_n_cut_size = -(IH_SIZE + ih_item_len(p_le_ih)); | ||
982 | return M_DELETE; | ||
983 | } | ||
984 | |||
985 | |||
986 | /* Directory item. */ | ||
987 | if ( is_direntry_le_ih (p_le_ih) ) | ||
988 | return prepare_for_direntry_item (p_s_path, p_le_ih, inode, n_new_file_length, p_n_cut_size); | ||
989 | |||
990 | /* Direct item. */ | ||
991 | if ( is_direct_le_ih (p_le_ih) ) | ||
992 | return prepare_for_direct_item (p_s_path, p_le_ih, inode, n_new_file_length, p_n_cut_size); | ||
993 | |||
994 | |||
995 | /* Case of an indirect item. */ | ||
996 | { | ||
997 | int n_unfm_number, /* Number of the item unformatted nodes. */ | ||
998 | n_counter, | ||
999 | n_blk_size; | ||
1000 | __u32 * p_n_unfm_pointer; /* Pointer to the unformatted node number. */ | ||
1001 | __u32 tmp; | ||
1002 | struct item_head s_ih; /* Item header. */ | ||
1003 | char c_mode; /* Returned mode of the balance. */ | ||
1004 | int need_research; | ||
1005 | |||
1006 | |||
1007 | n_blk_size = p_s_sb->s_blocksize; | ||
1008 | |||
1009 | /* Search for the needed object indirect item until there are no unformatted nodes to be removed. */ | ||
1010 | do { | ||
1011 | need_research = 0; | ||
1012 | p_s_bh = PATH_PLAST_BUFFER(p_s_path); | ||
1013 | /* Copy indirect item header to a temp variable. */ | ||
1014 | copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); | ||
1015 | /* Calculate number of unformatted nodes in this item. */ | ||
1016 | n_unfm_number = I_UNFM_NUM(&s_ih); | ||
1017 | |||
1018 | RFALSE( ! is_indirect_le_ih(&s_ih) || ! n_unfm_number || | ||
1019 | pos_in_item (p_s_path) + 1 != n_unfm_number, | ||
1020 | "PAP-5240: invalid item %h " | ||
1021 | "n_unfm_number = %d *p_n_pos_in_item = %d", | ||
1022 | &s_ih, n_unfm_number, pos_in_item (p_s_path)); | ||
1023 | |||
1024 | /* Calculate balance mode and position in the item to remove unformatted nodes. */ | ||
1025 | if ( n_new_file_length == max_reiserfs_offset (inode) ) {/* Case of delete. */ | ||
1026 | pos_in_item (p_s_path) = 0; | ||
1027 | *p_n_cut_size = -(IH_SIZE + ih_item_len(&s_ih)); | ||
1028 | c_mode = M_DELETE; | ||
1029 | } | ||
1030 | else { /* Case of truncate. */ | ||
1031 | if ( n_new_file_length < le_ih_k_offset (&s_ih) ) { | ||
1032 | pos_in_item (p_s_path) = 0; | ||
1033 | *p_n_cut_size = -(IH_SIZE + ih_item_len(&s_ih)); | ||
1034 | c_mode = M_DELETE; /* Delete this item. */ | ||
1035 | } | ||
1036 | else { | ||
1037 | /* indirect item must be truncated starting from *p_n_pos_in_item-th position */ | ||
1038 | pos_in_item (p_s_path) = (n_new_file_length + n_blk_size - le_ih_k_offset (&s_ih) ) >> p_s_sb->s_blocksize_bits; | ||
1039 | |||
1040 | RFALSE( pos_in_item (p_s_path) > n_unfm_number, | ||
1041 | "PAP-5250: invalid position in the item"); | ||
1042 | |||
1043 | /* Either convert last unformatted node of indirect item to direct item or increase | ||
1044 | its free space. */ | ||
1045 | if ( pos_in_item (p_s_path) == n_unfm_number ) { | ||
1046 | *p_n_cut_size = 0; /* Nothing to cut. */ | ||
1047 | return M_CONVERT; /* Maybe convert last unformatted node to the direct item. */ | ||
1048 | } | ||
1049 | /* Calculate size to cut. */ | ||
1050 | *p_n_cut_size = -(ih_item_len(&s_ih) - pos_in_item(p_s_path) * UNFM_P_SIZE); | ||
1051 | |||
1052 | c_mode = M_CUT; /* Cut from this indirect item. */ | ||
1053 | } | ||
1054 | } | ||
1055 | |||
1056 | RFALSE( n_unfm_number <= pos_in_item (p_s_path), | ||
1057 | "PAP-5260: invalid position in the indirect item"); | ||
1058 | |||
1059 | /* pointers to be cut */ | ||
1060 | n_unfm_number -= pos_in_item (p_s_path); | ||
1061 | /* Set pointer to the last unformatted node pointer that is to be cut. */ | ||
/* pointers already counted in *p_n_removed are skipped */
1062 | p_n_unfm_pointer = (__u32 *)B_I_PITEM(p_s_bh, &s_ih) + I_UNFM_NUM(&s_ih) - 1 - *p_n_removed; | ||
1063 | |||
1064 | |||
1065 | /* We go through the unformatted nodes pointers of the indirect | ||
1066 | item and look for the unformatted nodes in the cache. If we | ||
1067 | found some of them we free it, zero corresponding indirect item | ||
1068 | entry and log buffer containing that indirect item. For this we | ||
1069 | need to prepare last path element for logging. If some | ||
1070 | unformatted node has b_count > 1 we must not free this | ||
1071 | unformatted node since it is in use. */ | ||
1072 | reiserfs_prepare_for_journal(p_s_sb, p_s_bh, 1); | ||
1073 | // note: path could be changed, first line in for loop takes care | ||
1074 | // of it | ||
1075 | |||
/* walk backwards from the last pointer, resuming after the *p_n_removed
   pointers handled on earlier passes */
1076 | for (n_counter = *p_n_removed; | ||
1077 | n_counter < n_unfm_number; n_counter++, p_n_unfm_pointer-- ) { | ||
1078 | |||
1079 | cond_resched(); | ||
/* the item may have moved after cond_resched(); if so, search for it again */
1080 | if (item_moved (&s_ih, p_s_path)) { | ||
1081 | need_research = 1 ; | ||
1082 | break; | ||
1083 | } | ||
1084 | RFALSE( p_n_unfm_pointer < (__u32 *)B_I_PITEM(p_s_bh, &s_ih) || | ||
1085 | p_n_unfm_pointer > (__u32 *)B_I_PITEM(p_s_bh, &s_ih) + I_UNFM_NUM(&s_ih) - 1, | ||
1086 | "vs-5265: pointer out of range"); | ||
1087 | |||
1088 | /* Hole, nothing to remove. */ | ||
1089 | if ( ! get_block_num(p_n_unfm_pointer,0) ) { | ||
1090 | (*p_n_removed)++; | ||
1091 | continue; | ||
1092 | } | ||
1093 | |||
1094 | (*p_n_removed)++; | ||
1095 | |||
/* zero the pointer in the item, log the buffer, then free the block it named */
1096 | tmp = get_block_num(p_n_unfm_pointer,0); | ||
1097 | put_block_num(p_n_unfm_pointer, 0, 0); | ||
1098 | journal_mark_dirty (th, p_s_sb, p_s_bh); | ||
1099 | reiserfs_free_block(th, inode, tmp, 1); | ||
1100 | if ( item_moved (&s_ih, p_s_path) ) { | ||
1101 | need_research = 1; | ||
1102 | break ; | ||
1103 | } | ||
1104 | } | ||
1105 | |||
1106 | /* a trick. If the buffer has been logged, this | ||
1107 | ** will do nothing. If we've broken the loop without | ||
1108 | ** logging it, it will restore the buffer | ||
1109 | ** | ||
1110 | */ | ||
1111 | reiserfs_restore_prepared_buffer(p_s_sb, p_s_bh); | ||
1112 | |||
1113 | /* This loop can be optimized. */ | ||
1114 | } while ( (*p_n_removed < n_unfm_number || need_research) && | ||
1115 | search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_FOUND ); | ||
1116 | |||
1117 | RFALSE( *p_n_removed < n_unfm_number, | ||
1118 | "PAP-5310: indirect item is not found"); | ||
1119 | RFALSE( item_moved (&s_ih, p_s_path), | ||
1120 | "after while, comp failed, retry") ; | ||
1121 | |||
/* for M_CUT the position is converted from a pointer index to a byte offset */
1122 | if (c_mode == M_CUT) | ||
1123 | pos_in_item (p_s_path) *= UNFM_P_SIZE; | ||
1124 | return c_mode; | ||
1125 | } | ||
1126 | } | ||
1127 | |||
1128 | /* Calculate number of bytes which will be deleted or cut during balance */ | ||
1129 | static int calc_deleted_bytes_number( | ||
1130 | struct tree_balance * p_s_tb, | ||
1131 | char c_mode | ||
1132 | ) { | ||
1133 | int n_del_size; | ||
1134 | struct item_head * p_le_ih = PATH_PITEM_HEAD(p_s_tb->tb_path); | ||
1135 | |||
1136 | if ( is_statdata_le_ih (p_le_ih) ) | ||
1137 | return 0; | ||
1138 | |||
1139 | n_del_size = ( c_mode == M_DELETE ) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0]; | ||
1140 | if ( is_direntry_le_ih (p_le_ih) ) { | ||
1141 | // return EMPTY_DIR_SIZE; /* We delete emty directoris only. */ | ||
1142 | // we can't use EMPTY_DIR_SIZE, as old format dirs have a different | ||
1143 | // empty size. ick. FIXME, is this right? | ||
1144 | // | ||
1145 | return n_del_size ; | ||
1146 | } | ||
1147 | |||
1148 | if ( is_indirect_le_ih (p_le_ih) ) | ||
1149 | n_del_size = (n_del_size/UNFM_P_SIZE)* | ||
1150 | (PATH_PLAST_BUFFER(p_s_tb->tb_path)->b_size);// - get_ih_free_space (p_le_ih); | ||
1151 | return n_del_size; | ||
1152 | } | ||
1153 | |||
1154 | static void init_tb_struct( | ||
1155 | struct reiserfs_transaction_handle *th, | ||
1156 | struct tree_balance * p_s_tb, | ||
1157 | struct super_block * p_s_sb, | ||
1158 | struct path * p_s_path, | ||
1159 | int n_size | ||
1160 | ) { | ||
1161 | |||
1162 | BUG_ON (!th->t_trans_id); | ||
1163 | |||
1164 | memset (p_s_tb,'\0',sizeof(struct tree_balance)); | ||
1165 | p_s_tb->transaction_handle = th ; | ||
1166 | p_s_tb->tb_sb = p_s_sb; | ||
1167 | p_s_tb->tb_path = p_s_path; | ||
1168 | PATH_OFFSET_PBUFFER(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL; | ||
1169 | PATH_OFFSET_POSITION(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0; | ||
1170 | p_s_tb->insert_size[0] = n_size; | ||
1171 | } | ||
1172 | |||
1173 | |||
1174 | |||
/*
 * Zero the tail of an item buffer.
 *
 * item         - start of the buffer
 * total_length - size of the buffer in bytes
 * length       - number of leading bytes to keep
 *
 * Bytes item[length] .. item[total_length - 1] are set to 0.  Nothing is
 * written when total_length <= length.
 */
void padd_item (char * item, int total_length, int length)
{
	if (total_length > length)
		memset(item + length, 0, total_length - length);
}
1182 | |||
1183 | #ifdef REISERQUOTA_DEBUG | ||
/*
 * Map a little-endian key to a one-character type tag for debug output:
 * 'd' directory entry, 'D' direct, 'i' indirect, 's' stat data,
 * 'u' when none of the checks match.
 */
char key2type(struct reiserfs_key *ih)
{
	char tag = 'u';

	if (is_direntry_le_key(2, ih))
		tag = 'd';
	else if (is_direct_le_key(2, ih))
		tag = 'D';
	else if (is_indirect_le_key(2, ih))
		tag = 'i';
	else if (is_statdata_le_key(2, ih))
		tag = 's';

	return tag;
}
1196 | |||
/*
 * Map an item head to a one-character type tag for debug output:
 * 'd' directory entry, 'D' direct, 'i' indirect, 's' stat data,
 * 'u' when none of the checks match.
 */
char head2type(struct item_head *ih)
{
	char tag = 'u';

	if (is_direntry_le_ih(ih))
		tag = 'd';
	else if (is_direct_le_ih(ih))
		tag = 'D';
	else if (is_indirect_le_ih(ih))
		tag = 'i';
	else if (is_statdata_le_ih(ih))
		tag = 's';

	return tag;
}
1209 | #endif | ||
1210 | |||
/*
 * Delete object item.
 *
 * Deletes the whole item that p_s_path points to and releases the
 * corresponding quota.  prepare_for_delete_or_cut() and fix_nodes() are
 * repeated, with a fresh search in between, for as long as fix_nodes()
 * answers REPEAT_SEARCH.
 *
 * When p_s_un_bh is non-NULL the item body is copied into that buffer's
 * page before balancing (direct2indirect conversion, see below).
 *
 * Returns the result of calc_deleted_bytes_number(..., M_DELETE) when the
 * item was deleted, or 0 when the retry loop ended with anything other
 * than CARRY_ON (in which case unfix_nodes() is called and nothing is
 * deleted).
 */
int reiserfs_delete_item (struct reiserfs_transaction_handle *th,
			  struct path * p_s_path, /* Path to the deleted item. */
			  const struct cpu_key * p_s_item_key, /* Key to search for the deleted item. */
			  struct inode * p_s_inode,/* inode is here just to update i_blocks and quotas */
			  struct buffer_head * p_s_un_bh) /* NULL or unformatted node pointer. */
{
	struct super_block * p_s_sb = p_s_inode->i_sb;
	struct tree_balance s_del_balance;
	struct item_head s_ih;	/* private copy of the item head, taken on every loop pass */
	struct item_head *q_ih;	/* item head in the path, used for quota accounting */
	int quota_cut_bytes;
	int n_ret_value,
	    n_del_size,
	    n_removed;

#ifdef CONFIG_REISERFS_CHECK
	char c_mode;
	int n_iter = 0;
#endif

	BUG_ON (!th->t_trans_id);

	init_tb_struct(th, &s_del_balance, p_s_sb, p_s_path, 0/*size is unknown*/);

	while ( 1 ) {
		n_removed = 0;

		/* c_mode only exists in CONFIG_REISERFS_CHECK builds; otherwise the
		   return value of prepare_for_delete_or_cut() is simply dropped. */
#ifdef CONFIG_REISERFS_CHECK
		n_iter++;
		c_mode =
#endif
		prepare_for_delete_or_cut(th, p_s_inode, p_s_path, p_s_item_key, &n_removed, &n_del_size, max_reiserfs_offset (p_s_inode));

		RFALSE( c_mode != M_DELETE, "PAP-5320: mode must be M_DELETE");

		/* Snapshot the item head; it is used after the loop for the tail copy. */
		copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
		s_del_balance.insert_size[0] = n_del_size;

		n_ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
		if ( n_ret_value != REPEAT_SEARCH )
			break;

		PROC_INFO_INC( p_s_sb, delete_item_restarted );

		// file system changed, repeat search
		n_ret_value = search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path);
		if (n_ret_value == IO_ERROR)
			break;
		if (n_ret_value == FILE_NOT_FOUND) {
			reiserfs_warning (p_s_sb, "vs-5340: reiserfs_delete_item: "
					  "no items of the file %K found", p_s_item_key);
			break;
		}
	} /* while (1) */

	/* Any result other than CARRY_ON means the item is not deleted. */
	if ( n_ret_value != CARRY_ON ) {
		unfix_nodes(&s_del_balance);
		return 0;
	}

	// reiserfs_delete_item returns item length when success
	n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
	q_ih = get_ih(p_s_path) ;
	quota_cut_bytes = ih_item_len(q_ih) ;

	/* hack so the quota code doesn't have to guess if the file
	** has a tail.  On tail insert, we allocate quota for 1 unformatted node.
	** We test the offset because the tail might have been
	** split into multiple items, and we only want to decrement for
	** the unfm node once
	*/
	if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(q_ih)) {
		if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) {
			quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE;
		} else {
			quota_cut_bytes = 0 ;
		}
	}

	if ( p_s_un_bh ) {
		int off;
		char *data ;

		/* We are in direct2indirect conversion, so move tail contents
		   to the unformatted node */
		/* note, we do the copy before preparing the buffer because we
		** don't care about the contents of the unformatted node yet.
		** the only thing we really care about is the direct item's data
		** is in the unformatted node.
		**
		** Otherwise, we would have to call reiserfs_prepare_for_journal on
		** the unformatted node, which might schedule, meaning we'd have to
		** loop all the way back up to the start of the while loop.
		**
		** The unformatted node must be dirtied later on.  We can't be
		** sure here if the entire tail has been deleted yet.
		**
		** p_s_un_bh is from the page cache (all unformatted nodes are
		** from the page cache) and might be a highmem page.  So, we
		** can't use p_s_un_bh->b_data.
		** -clm
		*/

		data = kmap_atomic(p_s_un_bh->b_page, KM_USER0);
		/* Byte offset of the item inside the page, derived from its key offset. */
		off = ((le_ih_k_offset (&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));
		memcpy(data + off,
		       B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih), n_ret_value);
		kunmap_atomic(data, KM_USER0);
	}
	/* Perform balancing after all resources have been collected at once. */
	do_balance(&s_del_balance, NULL, NULL, M_DELETE);

#ifdef REISERQUOTA_DEBUG
	reiserfs_debug (p_s_sb, REISERFS_DEBUG_CODE, "reiserquota delete_item(): freeing %u, id=%u type=%c", quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih));
#endif
	DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);

	/* Return deleted body length */
	return n_ret_value;
}
1332 | |||
1333 | |||
1334 | /* Summary Of Mechanisms For Handling Collisions Between Processes: | ||
1335 | |||
1336 | deletion of the body of the object is performed by iput(), with the | ||
1337 | result that if multiple processes are operating on a file, the | ||
1338 | deletion of the body of the file is deferred until the last process | ||
1339 | that has an open inode performs its iput(). | ||
1340 | |||
1341 | writes and truncates are protected from collisions by use of | ||
1342 | semaphores. | ||
1343 | |||
1344 | creates, linking, and mknod are protected from collisions with other | ||
1345 | processes by making the reiserfs_add_entry() the last step in the | ||
1346 | creation, and then rolling back all changes if there was a collision. | ||
1347 | - Hans | ||
1348 | */ | ||
1349 | |||
1350 | |||
/*
 * Delete an item which never gets split.
 *
 * th    - running transaction handle; th->t_super is the file system used
 * inode - when non-NULL, the item length is released from this inode's
 *         quota after a successful delete; NULL skips quota accounting
 *         (used for save-links, per the comment below)
 * key   - on-disk (little-endian) key of the item to delete
 *
 * The item is looked up with search_item() and removed with
 * fix_nodes()/do_balance() in M_DELETE mode; the lookup is repeated while
 * fix_nodes() answers REPEAT_SEARCH.  Failures are reported through
 * reiserfs_warning() only - the function has no return value.
 */
void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th,
				 struct inode *inode,
				 struct reiserfs_key * key)
{
	struct tree_balance tb;
	INITIALIZE_PATH (path);
	int item_len = 0;
	int tb_init = 0 ;	/* set once tb has been initialised */
	struct cpu_key cpu_key;
	int retval;
	int quota_cut_bytes = 0;

	BUG_ON (!th->t_trans_id);

	le_key2cpu_key (&cpu_key, key);

	while (1) {
		retval = search_item (th->t_super, &cpu_key, &path);
		if (retval == IO_ERROR) {
			reiserfs_warning (th->t_super,
					  "vs-5350: reiserfs_delete_solid_item: "
					  "i/o failure occurred trying to delete %K",
					  &cpu_key);
			break;
		}
		if (retval != ITEM_FOUND) {
			pathrelse (&path);
			// No need for a warning, if there is just no free space to insert '..' item into the newly-created subdir
			if ( !( (unsigned long long) GET_HASH_VALUE (le_key_k_offset (le_key_version (key), key)) == 0 && \
				(unsigned long long) GET_GENERATION_NUMBER (le_key_k_offset (le_key_version (key), key)) == 1 ) )
				reiserfs_warning (th->t_super, "vs-5355: reiserfs_delete_solid_item: %k not found", key);
			break;
		}
		/* The balance is set up only on the first successful lookup; the
		   size recorded there is the item length seen at that time. */
		if (!tb_init) {
			tb_init = 1 ;
			item_len = ih_item_len( PATH_PITEM_HEAD(&path) );
			init_tb_struct (th, &tb, th->t_super, &path, - (IH_SIZE + item_len));
		}
		/* Quota amount is re-read from the item head on every pass. */
		quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)) ;

		retval = fix_nodes (M_DELETE, &tb, NULL, NULL);
		if (retval == REPEAT_SEARCH) {
			PROC_INFO_INC( th -> t_super, delete_solid_item_restarted );
			continue;
		}

		if (retval == CARRY_ON) {
			do_balance (&tb, NULL, NULL, M_DELETE);
			if (inode) {	/* Should we count quota for item? (we don't count quotas for save-links) */
#ifdef REISERQUOTA_DEBUG
				reiserfs_debug (th->t_super, REISERFS_DEBUG_CODE, "reiserquota delete_solid_item(): freeing %u id=%u type=%c", quota_cut_bytes, inode->i_uid, key2type(key));
#endif
				DQUOT_FREE_SPACE_NODIRTY(inode, quota_cut_bytes);
			}
			break;
		}

		// IO_ERROR, NO_DISK_SPACE, etc
		reiserfs_warning (th->t_super, "vs-5360: reiserfs_delete_solid_item: "
				  "could not delete %K due to fix_nodes failure", &cpu_key);
		unfix_nodes (&tb);
		break;
	}

	reiserfs_check_path(&path) ;
}
1418 | |||
1419 | |||
1420 | int reiserfs_delete_object (struct reiserfs_transaction_handle *th, struct inode * inode) | ||
1421 | { | ||
1422 | int err; | ||
1423 | inode->i_size = 0; | ||
1424 | BUG_ON (!th->t_trans_id); | ||
1425 | |||
1426 | /* for directory this deletes item containing "." and ".." */ | ||
1427 | err = reiserfs_do_truncate (th, inode, NULL, 0/*no timestamp updates*/); | ||
1428 | if (err) | ||
1429 | return err; | ||
1430 | |||
1431 | #if defined( USE_INODE_GENERATION_COUNTER ) | ||
1432 | if( !old_format_only ( th -> t_super ) ) | ||
1433 | { | ||
1434 | __u32 *inode_generation; | ||
1435 | |||
1436 | inode_generation = | ||
1437 | &REISERFS_SB(th -> t_super) -> s_rs -> s_inode_generation; | ||
1438 | *inode_generation = cpu_to_le32( le32_to_cpu( *inode_generation ) + 1 ); | ||
1439 | } | ||
1440 | /* USE_INODE_GENERATION_COUNTER */ | ||
1441 | #endif | ||
1442 | reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode)); | ||
1443 | |||
1444 | return err; | ||
1445 | } | ||
1446 | |||
1447 | static void | ||
1448 | unmap_buffers(struct page *page, loff_t pos) { | ||
1449 | struct buffer_head *bh ; | ||
1450 | struct buffer_head *head ; | ||
1451 | struct buffer_head *next ; | ||
1452 | unsigned long tail_index ; | ||
1453 | unsigned long cur_index ; | ||
1454 | |||
1455 | if (page) { | ||
1456 | if (page_has_buffers(page)) { | ||
1457 | tail_index = pos & (PAGE_CACHE_SIZE - 1) ; | ||
1458 | cur_index = 0 ; | ||
1459 | head = page_buffers(page) ; | ||
1460 | bh = head ; | ||
1461 | do { | ||
1462 | next = bh->b_this_page ; | ||
1463 | |||
1464 | /* we want to unmap the buffers that contain the tail, and | ||
1465 | ** all the buffers after it (since the tail must be at the | ||
1466 | ** end of the file). We don't want to unmap file data | ||
1467 | ** before the tail, since it might be dirty and waiting to | ||
1468 | ** reach disk | ||
1469 | */ | ||
1470 | cur_index += bh->b_size ; | ||
1471 | if (cur_index > tail_index) { | ||
1472 | reiserfs_unmap_buffer(bh) ; | ||
1473 | } | ||
1474 | bh = next ; | ||
1475 | } while (bh != head) ; | ||
1476 | if ( PAGE_SIZE == bh->b_size ) { | ||
1477 | clear_page_dirty(page); | ||
1478 | } | ||
1479 | } | ||
1480 | } | ||
1481 | } | ||
1482 | |||
1483 | static int maybe_indirect_to_direct (struct reiserfs_transaction_handle *th, | ||
1484 | struct inode * p_s_inode, | ||
1485 | struct page *page, | ||
1486 | struct path * p_s_path, | ||
1487 | const struct cpu_key * p_s_item_key, | ||
1488 | loff_t n_new_file_size, | ||
1489 | char * p_c_mode | ||
1490 | ) { | ||
1491 | struct super_block * p_s_sb = p_s_inode->i_sb; | ||
1492 | int n_block_size = p_s_sb->s_blocksize; | ||
1493 | int cut_bytes; | ||
1494 | BUG_ON (!th->t_trans_id); | ||
1495 | |||
1496 | if (n_new_file_size != p_s_inode->i_size) | ||
1497 | BUG (); | ||
1498 | |||
1499 | /* the page being sent in could be NULL if there was an i/o error | ||
1500 | ** reading in the last block. The user will hit problems trying to | ||
1501 | ** read the file, but for now we just skip the indirect2direct | ||
1502 | */ | ||
1503 | if (atomic_read(&p_s_inode->i_count) > 1 || | ||
1504 | !tail_has_to_be_packed (p_s_inode) || | ||
1505 | !page || (REISERFS_I(p_s_inode)->i_flags & i_nopack_mask)) { | ||
1506 | // leave tail in an unformatted node | ||
1507 | *p_c_mode = M_SKIP_BALANCING; | ||
1508 | cut_bytes = n_block_size - (n_new_file_size & (n_block_size - 1)); | ||
1509 | pathrelse(p_s_path); | ||
1510 | return cut_bytes; | ||
1511 | } | ||
1512 | /* Permorm the conversion to a direct_item. */ | ||
1513 | /*return indirect_to_direct (p_s_inode, p_s_path, p_s_item_key, n_new_file_size, p_c_mode);*/ | ||
1514 | return indirect2direct (th, p_s_inode, page, p_s_path, p_s_item_key, n_new_file_size, p_c_mode); | ||
1515 | } | ||
1516 | |||
1517 | |||
1518 | /* we did indirect_to_direct conversion. And we have inserted direct | ||
1519 | item successesfully, but there were no disk space to cut unfm | ||
1520 | pointer being converted. Therefore we have to delete inserted | ||
1521 | direct item(s) */ | ||
1522 | static void indirect_to_direct_roll_back (struct reiserfs_transaction_handle *th, struct inode * inode, struct path * path) | ||
1523 | { | ||
1524 | struct cpu_key tail_key; | ||
1525 | int tail_len; | ||
1526 | int removed; | ||
1527 | BUG_ON (!th->t_trans_id); | ||
1528 | |||
1529 | make_cpu_key (&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);// !!!! | ||
1530 | tail_key.key_length = 4; | ||
1531 | |||
1532 | tail_len = (cpu_key_k_offset (&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1; | ||
1533 | while (tail_len) { | ||
1534 | /* look for the last byte of the tail */ | ||
1535 | if (search_for_position_by_key (inode->i_sb, &tail_key, path) == POSITION_NOT_FOUND) | ||
1536 | reiserfs_panic (inode->i_sb, "vs-5615: indirect_to_direct_roll_back: found invalid item"); | ||
1537 | RFALSE( path->pos_in_item != ih_item_len(PATH_PITEM_HEAD (path)) - 1, | ||
1538 | "vs-5616: appended bytes found"); | ||
1539 | PATH_LAST_POSITION (path) --; | ||
1540 | |||
1541 | removed = reiserfs_delete_item (th, path, &tail_key, inode, NULL/*unbh not needed*/); | ||
1542 | RFALSE( removed <= 0 || removed > tail_len, | ||
1543 | "vs-5617: there was tail %d bytes, removed item length %d bytes", | ||
1544 | tail_len, removed); | ||
1545 | tail_len -= removed; | ||
1546 | set_cpu_key_k_offset (&tail_key, cpu_key_k_offset (&tail_key) - removed); | ||
1547 | } | ||
1548 | reiserfs_warning (inode->i_sb, "indirect_to_direct_roll_back: indirect_to_direct conversion has been rolled back due to lack of disk space"); | ||
1549 | //mark_file_without_tail (inode); | ||
1550 | mark_inode_dirty (inode); | ||
1551 | } | ||
1552 | |||
1553 | |||
/*
 * (Truncate or cut entry) or delete object item.
 *
 * th              - running transaction handle
 * p_s_path        - path to the item being cut
 * p_s_item_key    - key of the position to cut from; rewritten in place when
 *                   an indirect->direct conversion is performed
 * p_s_inode       - owner of the item, used for quota and tail state
 * page            - page passed on to the tail conversion and unmapped
 *                   after a completed conversion
 * n_new_file_size - file size the object is being cut down to
 *
 * Return value:
 *   < 0  on failure: -EIO or -ENOENT when the item cannot be found again
 *        after fix_nodes() asked for a repeated search, -EIO when
 *        fix_nodes() ends with anything other than CARRY_ON;
 *   0    when prepare_for_delete_or_cut() reports nothing to cut;
 *   the value of maybe_indirect_to_direct() when the tail was left in
 *        its unformatted node (M_SKIP_BALANCING), or - saved in retval2 -
 *        when a conversion was carried out;
 *   otherwise the result of calc_deleted_bytes_number().
 */
int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th,
			    struct path * p_s_path,
			    struct cpu_key * p_s_item_key,
			    struct inode * p_s_inode,
			    struct page *page,
			    loff_t n_new_file_size)
{
	struct super_block * p_s_sb = p_s_inode->i_sb;
	/* Every function which is going to call do_balance must first
	   create a tree_balance structure.  Then it must fill up this
	   structure by using the init_tb_struct and fix_nodes functions.
	   After that we can make tree balancing. */
	struct tree_balance s_cut_balance;
	struct item_head *p_le_ih;
	int n_cut_size = 0,        /* Amount to be cut. */
	    n_ret_value = CARRY_ON,
	    n_removed = 0,         /* Number of the removed unformatted nodes. */
	    n_is_inode_locked = 0; /* set once an indirect->direct conversion has started */
	char c_mode;               /* Mode of the balance. */
	int retval2 = -1;          /* return value saved across a conversion; -1 = none */
	int quota_cut_bytes;
	loff_t tail_pos = 0;       /* block-aligned new size, used when unmapping buffers */

	BUG_ON (!th->t_trans_id);

	init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path, n_cut_size);


	/* Repeat this loop until we either cut the item without needing
	   to balance, or we fix_nodes without schedule occurring */
	while ( 1 ) {
		/* Determine the balance mode, position of the first byte to
		   be cut, and size to be cut.  In case of the indirect item
		   free unformatted nodes which are pointed to by the cut
		   pointers. */

		c_mode = prepare_for_delete_or_cut(th, p_s_inode, p_s_path, p_s_item_key, &n_removed,
						   &n_cut_size, n_new_file_size);
		if ( c_mode == M_CONVERT ) {
			/* convert last unformatted node to direct item or leave
			   tail in the unformatted node */
			RFALSE( n_ret_value != CARRY_ON, "PAP-5570: can not convert twice");

			n_ret_value = maybe_indirect_to_direct (th, p_s_inode, page, p_s_path, p_s_item_key,
								n_new_file_size, &c_mode);
			if ( c_mode == M_SKIP_BALANCING )
				/* tail has been left in the unformatted node */
				return n_ret_value;

			n_is_inode_locked = 1;

			/* removing of last unformatted node will change value we
			   have to return to truncate.  Save it */
			retval2 = n_ret_value;
			/*retval2 = p_s_sb->s_blocksize - (n_new_file_size & (p_s_sb->s_blocksize - 1));*/

			/* So, we have performed the first part of the conversion:
			   inserting the new direct item.  Now we are removing the
			   last unformatted node pointer.  Set key to search for
			   it. */
			set_cpu_key_k_type (p_s_item_key, TYPE_INDIRECT);
			p_s_item_key->key_length = 4;
			/* Round the new size down to a block boundary. */
			n_new_file_size -= (n_new_file_size & (p_s_sb->s_blocksize - 1));
			tail_pos = n_new_file_size;
			set_cpu_key_k_offset (p_s_item_key, n_new_file_size + 1);
			if ( search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND ){
				print_block (PATH_PLAST_BUFFER (p_s_path), 3, PATH_LAST_POSITION (p_s_path) - 1, PATH_LAST_POSITION (p_s_path) + 1);
				reiserfs_panic(p_s_sb, "PAP-5580: reiserfs_cut_from_item: item to convert does not exist (%K)", p_s_item_key);
			}
			continue;
		}
		/* Nothing to cut: release the path and report 0 bytes. */
		if (n_cut_size == 0) {
			pathrelse (p_s_path);
			return 0;
		}

		s_cut_balance.insert_size[0] = n_cut_size;

		n_ret_value = fix_nodes(c_mode, &s_cut_balance, NULL, NULL);
		if ( n_ret_value != REPEAT_SEARCH )
			break;

		PROC_INFO_INC( p_s_sb, cut_from_item_restarted );

		/* fix_nodes asked for a repeated search: look the item up again. */
		n_ret_value = search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path);
		if (n_ret_value == POSITION_FOUND)
			continue;

		reiserfs_warning (p_s_sb, "PAP-5610: reiserfs_cut_from_item: item %K not found", p_s_item_key);
		unfix_nodes (&s_cut_balance);
		return (n_ret_value == IO_ERROR) ? -EIO : -ENOENT;
	} /* while */

	// check fix_nodes results (IO_ERROR or NO_DISK_SPACE)
	if ( n_ret_value != CARRY_ON ) {
		if ( n_is_inode_locked ) {
			// FIXME: this seems to be not needed: we are always able
			// to cut item
			indirect_to_direct_roll_back (th, p_s_inode, p_s_path);
		}
		if (n_ret_value == NO_DISK_SPACE)
			reiserfs_warning (p_s_sb, "NO_DISK_SPACE");
		unfix_nodes (&s_cut_balance);
		return -EIO;
	}

	/* go ahead and perform balancing */

	RFALSE( c_mode == M_PASTE || c_mode == M_INSERT, "invalid mode");

	/* Calculate number of bytes that need to be cut from the item. */
	quota_cut_bytes = ( c_mode == M_DELETE ) ? ih_item_len(get_ih(p_s_path)) : -s_cut_balance.insert_size[0];
	/* After a conversion the value saved in retval2 is what gets returned. */
	if (retval2 == -1)
		n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode);
	else
		n_ret_value = retval2;


	/* For direct items, we only change the quota when deleting the last
	** item.
	*/
	p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
	if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) {
		if (c_mode == M_DELETE &&
		    (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) {
			// FIXME: this is to keep 3.5 happy
			REISERFS_I(p_s_inode)->i_first_direct_byte = U32_MAX;
			quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE ;
		} else {
			quota_cut_bytes = 0 ;
		}
	}
#ifdef CONFIG_REISERFS_CHECK
	if (n_is_inode_locked) {
		struct item_head * le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
		/* we are going to complete indirect2direct conversion.  Make
		   sure, that we exactly remove last unformatted node pointer
		   of the item */
		if (!is_indirect_le_ih (le_ih))
			reiserfs_panic (p_s_sb, "vs-5652: reiserfs_cut_from_item: "
					"item must be indirect %h", le_ih);

		if (c_mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
			reiserfs_panic (p_s_sb, "vs-5653: reiserfs_cut_from_item: "
					"completing indirect2direct conversion indirect item %h "
					"being deleted must be of 4 byte long", le_ih);

		if (c_mode == M_CUT && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) {
			reiserfs_panic (p_s_sb, "vs-5654: reiserfs_cut_from_item: "
					"can not complete indirect2direct conversion of %h (CUT, insert_size==%d)",
					le_ih, s_cut_balance.insert_size[0]);
		}
		/* it would be useful to make sure, that right neighboring
		   item is direct item of this file */
	}
#endif

	do_balance(&s_cut_balance, NULL, NULL, c_mode);
	if ( n_is_inode_locked ) {
		/* we've done an indirect->direct conversion.  when the data block
		** was freed, it was removed from the list of blocks that must
		** be flushed before the transaction commits, make sure to
		** unmap and invalidate it
		*/
		unmap_buffers(page, tail_pos);
		REISERFS_I(p_s_inode)->i_flags &= ~i_pack_on_close_mask ;
	}
#ifdef REISERQUOTA_DEBUG
	reiserfs_debug (p_s_inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota cut_from_item(): freeing %u id=%u type=%c", quota_cut_bytes, p_s_inode->i_uid, '?');
#endif
	DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
	return n_ret_value;
}
1728 | |||
1729 | static void truncate_directory (struct reiserfs_transaction_handle *th, struct inode * inode) | ||
1730 | { | ||
1731 | BUG_ON (!th->t_trans_id); | ||
1732 | if (inode->i_nlink) | ||
1733 | reiserfs_warning (inode->i_sb, | ||
1734 | "vs-5655: truncate_directory: link count != 0"); | ||
1735 | |||
1736 | set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), DOT_OFFSET); | ||
1737 | set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_DIRENTRY); | ||
1738 | reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode)); | ||
1739 | reiserfs_update_sd(th, inode) ; | ||
1740 | set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), SD_OFFSET); | ||
1741 | set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_STAT_DATA); | ||
1742 | } | ||
1743 | |||
1744 | |||
1745 | |||
1746 | |||
1747 | /* Truncate file to the new size. Note, this must be called with a transaction | ||
1748 | already started */ | ||
1749 | int reiserfs_do_truncate (struct reiserfs_transaction_handle *th, | ||
1750 | struct inode * p_s_inode, /* ->i_size contains new | ||
1751 | size */ | ||
1752 | struct page *page, /* up to date for last block */ | ||
1753 | int update_timestamps /* when it is called by | ||
1754 | file_release to convert | ||
1755 | the tail - no timestamps | ||
1756 | should be updated */ | ||
1757 | ) { | ||
1758 | INITIALIZE_PATH (s_search_path); /* Path to the current object item. */ | ||
1759 | struct item_head * p_le_ih; /* Pointer to an item header. */ | ||
1760 | struct cpu_key s_item_key; /* Key to search for a previous file item. */ | ||
1761 | loff_t n_file_size, /* Old file size. */ | ||
1762 | n_new_file_size;/* New file size. */ | ||
1763 | int n_deleted; /* Number of deleted or truncated bytes. */ | ||
1764 | int retval; | ||
1765 | int err = 0; | ||
1766 | |||
1767 | BUG_ON (!th->t_trans_id); | ||
1768 | if ( ! (S_ISREG(p_s_inode->i_mode) || S_ISDIR(p_s_inode->i_mode) || S_ISLNK(p_s_inode->i_mode)) ) | ||
1769 | return 0; | ||
1770 | |||
1771 | if (S_ISDIR(p_s_inode->i_mode)) { | ||
1772 | // deletion of directory - no need to update timestamps | ||
1773 | truncate_directory (th, p_s_inode); | ||
1774 | return 0; | ||
1775 | } | ||
1776 | |||
1777 | /* Get new file size. */ | ||
1778 | n_new_file_size = p_s_inode->i_size; | ||
1779 | |||
1780 | // FIXME: note, that key type is unimportant here | ||
1781 | make_cpu_key (&s_item_key, p_s_inode, max_reiserfs_offset (p_s_inode), TYPE_DIRECT, 3); | ||
1782 | |||
1783 | retval = search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path); | ||
1784 | if (retval == IO_ERROR) { | ||
1785 | reiserfs_warning (p_s_inode->i_sb, "vs-5657: reiserfs_do_truncate: " | ||
1786 | "i/o failure occurred trying to truncate %K", &s_item_key); | ||
1787 | err = -EIO; | ||
1788 | goto out; | ||
1789 | } | ||
1790 | if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) { | ||
1791 | reiserfs_warning (p_s_inode->i_sb, "PAP-5660: reiserfs_do_truncate: " | ||
1792 | "wrong result %d of search for %K", retval, &s_item_key); | ||
1793 | |||
1794 | err = -EIO; | ||
1795 | goto out; | ||
1796 | } | ||
1797 | |||
1798 | s_search_path.pos_in_item --; | ||
1799 | |||
1800 | /* Get real file size (total length of all file items) */ | ||
1801 | p_le_ih = PATH_PITEM_HEAD(&s_search_path); | ||
1802 | if ( is_statdata_le_ih (p_le_ih) ) | ||
1803 | n_file_size = 0; | ||
1804 | else { | ||
1805 | loff_t offset = le_ih_k_offset (p_le_ih); | ||
1806 | int bytes = op_bytes_number (p_le_ih,p_s_inode->i_sb->s_blocksize); | ||
1807 | |||
1808 | /* this may mismatch with real file size: if last direct item | ||
1809 | had no padding zeros and last unformatted node had no free | ||
1810 | space, this file would have this file size */ | ||
1811 | n_file_size = offset + bytes - 1; | ||
1812 | } | ||
1813 | /* | ||
1814 | * are we doing a full truncate or delete, if so | ||
1815 | * kick in the reada code | ||
1816 | */ | ||
1817 | if (n_new_file_size == 0) | ||
1818 | s_search_path.reada = PATH_READA | PATH_READA_BACK; | ||
1819 | |||
1820 | if ( n_file_size == 0 || n_file_size < n_new_file_size ) { | ||
1821 | goto update_and_out ; | ||
1822 | } | ||
1823 | |||
1824 | /* Update key to search for the last file item. */ | ||
1825 | set_cpu_key_k_offset (&s_item_key, n_file_size); | ||
1826 | |||
1827 | do { | ||
1828 | /* Cut or delete file item. */ | ||
1829 | n_deleted = reiserfs_cut_from_item(th, &s_search_path, &s_item_key, p_s_inode, page, n_new_file_size); | ||
1830 | if (n_deleted < 0) { | ||
1831 | reiserfs_warning (p_s_inode->i_sb, "vs-5665: reiserfs_do_truncate: reiserfs_cut_from_item failed"); | ||
1832 | reiserfs_check_path(&s_search_path) ; | ||
1833 | return 0; | ||
1834 | } | ||
1835 | |||
1836 | RFALSE( n_deleted > n_file_size, | ||
1837 | "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K", | ||
1838 | n_deleted, n_file_size, &s_item_key); | ||
1839 | |||
1840 | /* Change key to search the last file item. */ | ||
1841 | n_file_size -= n_deleted; | ||
1842 | |||
1843 | set_cpu_key_k_offset (&s_item_key, n_file_size); | ||
1844 | |||
1845 | /* While there are bytes to truncate and previous file item is presented in the tree. */ | ||
1846 | |||
1847 | /* | ||
1848 | ** This loop could take a really long time, and could log | ||
1849 | ** many more blocks than a transaction can hold. So, we do a polite | ||
1850 | ** journal end here, and if the transaction needs ending, we make | ||
1851 | ** sure the file is consistent before ending the current trans | ||
1852 | ** and starting a new one | ||
1853 | */ | ||
1854 | if (journal_transaction_should_end(th, th->t_blocks_allocated)) { | ||
1855 | int orig_len_alloc = th->t_blocks_allocated ; | ||
1856 | decrement_counters_in_path(&s_search_path) ; | ||
1857 | |||
1858 | if (update_timestamps) { | ||
1859 | p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC; | ||
1860 | } | ||
1861 | reiserfs_update_sd(th, p_s_inode) ; | ||
1862 | |||
1863 | err = journal_end(th, p_s_inode->i_sb, orig_len_alloc) ; | ||
1864 | if (err) | ||
1865 | goto out; | ||
1866 | err = journal_begin (th, p_s_inode->i_sb, | ||
1867 | JOURNAL_PER_BALANCE_CNT * 6); | ||
1868 | if (err) | ||
1869 | goto out; | ||
1870 | reiserfs_update_inode_transaction(p_s_inode) ; | ||
1871 | } | ||
1872 | } while ( n_file_size > ROUND_UP (n_new_file_size) && | ||
1873 | search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path) == POSITION_FOUND ) ; | ||
1874 | |||
1875 | RFALSE( n_file_size > ROUND_UP (n_new_file_size), | ||
1876 | "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d", | ||
1877 | n_new_file_size, n_file_size, s_item_key.on_disk_key.k_objectid); | ||
1878 | |||
1879 | update_and_out: | ||
1880 | if (update_timestamps) { | ||
1881 | // this is truncate, not file closing | ||
1882 | p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC; | ||
1883 | } | ||
1884 | reiserfs_update_sd (th, p_s_inode); | ||
1885 | |||
1886 | out: | ||
1887 | pathrelse(&s_search_path) ; | ||
1888 | return err; | ||
1889 | } | ||
1890 | |||
1891 | |||
1892 | #ifdef CONFIG_REISERFS_CHECK | ||
1893 | // sanity check after a repeated search: makes sure that we __append__, not overwrite or add holes | ||
1894 | static void check_research_for_paste (struct path * path, | ||
1895 | const struct cpu_key * p_s_key) | ||
1896 | { | ||
1897 | struct item_head * ih = get_ih (path); | ||
1898 | |||
1899 | /* direct item: key offset must equal item offset + byte count, and the position must equal the byte count */ | ||
1900 | if (is_direct_le_ih (ih) && | ||
1901 | (le_ih_k_offset (ih) + op_bytes_number (ih, get_last_bh (path)->b_size) != cpu_key_k_offset (p_s_key) || | ||
1902 | op_bytes_number (ih, get_last_bh (path)->b_size) != pos_in_item (path))) | ||
1903 | reiserfs_panic (NULL, "PAP-5720: check_research_for_paste: " | ||
1904 | "found direct item %h or position (%d) does not match to key %K", | ||
1905 | ih, pos_in_item (path), p_s_key); | ||
1906 | |||
1907 | /* indirect item: same offset rule; position must equal I_UNFM_NUM and the item must have no free space */ | ||
1908 | if (is_indirect_le_ih (ih) && | ||
1909 | (le_ih_k_offset (ih) + op_bytes_number (ih, get_last_bh (path)->b_size) != cpu_key_k_offset (p_s_key) || | ||
1910 | I_UNFM_NUM (ih) != pos_in_item (path) || get_ih_free_space (ih) != 0)) | ||
1911 | reiserfs_panic (NULL, "PAP-5730: check_research_for_paste: " | ||
1912 | "found indirect item (%h) or position (%d) does not match to key (%K)", | ||
1913 | ih, pos_in_item (path), p_s_key); | ||
1914 | |||
1915 | } | ||
1916 | #endif /* config reiserfs check */ | ||
1917 | |||
1918 | |||
1919 | /* Paste n_pasted_size bytes from p_c_body into the existing item addressed by p_s_key. Returns 0 on success; on failure returns -EDQUOT, -EIO, -EEXIST or -ENOSPC. */ | ||
1920 | int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th, | ||
1921 | struct path * p_s_search_path, /* Path to the pasted item. */ | ||
1922 | const struct cpu_key * p_s_key, /* Key to search for the needed item.*/ | ||
1923 | struct inode * inode, /* Inode item belongs to */ | ||
1924 | const char * p_c_body, /* Pointer to the bytes to paste. */ | ||
1925 | int n_pasted_size) /* Size of pasted bytes. */ | ||
1926 | { | ||
1927 | struct tree_balance s_paste_balance; | ||
1928 | int retval; | ||
1929 | int fs_gen; | ||
1930 | |||
1931 | BUG_ON (!th->t_trans_id); /* the handle must carry a non-zero transaction id */ | ||
1932 | |||
1933 | fs_gen = get_generation(inode->i_sb) ; /* snapshot taken before the quota call so fs_changed() below can compare against it */ | ||
1934 | |||
1935 | #ifdef REISERQUOTA_DEBUG | ||
1936 | reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): allocating %u id=%u type=%c", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key))); | ||
1937 | #endif | ||
1938 | |||
1939 | if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) { /* charge quota first; non-zero means the charge was refused */ | ||
1940 | pathrelse(p_s_search_path); /* the path is released on this early exit */ | ||
1941 | return -EDQUOT; | ||
1942 | } | ||
1943 | init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, n_pasted_size); | ||
1944 | #ifdef DISPLACE_NEW_PACKING_LOCALITIES | ||
1945 | s_paste_balance.key = p_s_key->on_disk_key; | ||
1946 | #endif | ||
1947 | |||
1948 | /* DQUOT_* can schedule, must check before the fix_nodes */ | ||
1949 | if (fs_changed(fs_gen, inode->i_sb)) { | ||
1950 | goto search_again; /* jumps into the loop body below: redo the search before fix_nodes is called */ | ||
1951 | } | ||
1952 | |||
1953 | while ((retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) == | ||
1954 | REPEAT_SEARCH ) { | ||
1955 | search_again: | ||
1956 | /* file system changed while we were in the fix_nodes */ | ||
1957 | PROC_INFO_INC( th -> t_super, paste_into_item_restarted ); | ||
1958 | retval = search_for_position_by_key (th->t_super, p_s_key, p_s_search_path); | ||
1959 | if (retval == IO_ERROR) { | ||
1960 | retval = -EIO ; | ||
1961 | goto error_out ; | ||
1962 | } | ||
1963 | if (retval == POSITION_FOUND) { /* the position is already occupied, so this would not be an append */ | ||
1964 | reiserfs_warning (inode->i_sb, "PAP-5710: reiserfs_paste_into_item: entry or pasted byte (%K) exists", p_s_key); | ||
1965 | retval = -EEXIST ; | ||
1966 | goto error_out ; | ||
1967 | } | ||
1968 | |||
1969 | #ifdef CONFIG_REISERFS_CHECK | ||
1970 | check_research_for_paste (p_s_search_path, p_s_key); | ||
1971 | #endif | ||
1972 | } | ||
1973 | |||
1974 | /* Perform balancing after all resources are collected by fix_nodes, and | ||
1975 | accessing them will not risk triggering schedule. */ | ||
1976 | if ( retval == CARRY_ON ) { | ||
1977 | do_balance(&s_paste_balance, NULL/*ih*/, p_c_body, M_PASTE); | ||
1978 | return 0; /* success: the quota charge taken above is kept */ | ||
1979 | } | ||
1980 | retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; /* any other fix_nodes result is reported as -EIO */ | ||
1981 | error_out: | ||
1982 | /* this also releases the path */ | ||
1983 | unfix_nodes(&s_paste_balance); | ||
1984 | #ifdef REISERQUOTA_DEBUG | ||
1985 | reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): freeing %u id=%u type=%c", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key))); | ||
1986 | #endif | ||
1987 | DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size); /* give back the quota charged at the top */ | ||
1988 | return retval ; | ||
1989 | } | ||
1990 | |||
1991 | |||
1992 | /* Insert the new item described by p_s_ih (body at p_c_body) at the place p_s_path points to. key is used to repeat the search when the tree changed. inode may be NULL: then no quota is charged or freed. Returns 0 on success, or -EDQUOT, -EIO, -EEXIST, -ENOSPC. */ | ||
1993 | int reiserfs_insert_item(struct reiserfs_transaction_handle *th, | ||
1994 | struct path * p_s_path, /* Path to the inserteded item. */ | ||
1995 | const struct cpu_key * key, | ||
1996 | struct item_head * p_s_ih, /* Pointer to the item header to insert.*/ | ||
1997 | struct inode * inode, | ||
1998 | const char * p_c_body) /* Pointer to the bytes to insert. */ | ||
1999 | { | ||
2000 | struct tree_balance s_ins_balance; | ||
2001 | int retval; | ||
2002 | int fs_gen = 0 ; | ||
2003 | int quota_bytes = 0 ; | ||
2004 | |||
2005 | BUG_ON (!th->t_trans_id); /* the handle must carry a non-zero transaction id */ | ||
2006 | |||
2007 | if (inode) { /* Do we count quotas for item? */ | ||
2008 | fs_gen = get_generation(inode->i_sb); /* snapshot for the fs_changed() check after the quota call */ | ||
2009 | quota_bytes = ih_item_len(p_s_ih); | ||
2010 | |||
2011 | /* hack so the quota code doesn't have to guess if the file has | ||
2012 | ** a tail, links are always tails, so there's no guessing needed | ||
2013 | */ | ||
2014 | if (!S_ISLNK (inode->i_mode) && is_direct_le_ih(p_s_ih)) { | ||
2015 | quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE ; | ||
2016 | } | ||
2017 | #ifdef REISERQUOTA_DEBUG | ||
2018 | reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota insert_item(): allocating %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(p_s_ih)); | ||
2019 | #endif | ||
2020 | /* We can't dirty inode here. It would be immediately written but | ||
2021 | * appropriate stat item isn't inserted yet... */ | ||
2022 | if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) { | ||
2023 | pathrelse(p_s_path); /* the path is released on this early exit */ | ||
2024 | return -EDQUOT; | ||
2025 | } | ||
2026 | } | ||
2027 | init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, IH_SIZE + ih_item_len(p_s_ih)); | ||
2028 | #ifdef DISPLACE_NEW_PACKING_LOCALITIES | ||
2029 | s_ins_balance.key = key->on_disk_key; | ||
2030 | #endif | ||
2031 | /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */ | ||
2032 | if (inode && fs_changed(fs_gen, inode->i_sb)) { | ||
2033 | goto search_again; /* jumps into the loop body below: redo the search before fix_nodes is called */ | ||
2034 | } | ||
2035 | |||
2036 | while ( (retval = fix_nodes(M_INSERT, &s_ins_balance, p_s_ih, p_c_body)) == REPEAT_SEARCH) { | ||
2037 | search_again: | ||
2038 | /* file system changed while we were in the fix_nodes */ | ||
2039 | PROC_INFO_INC( th -> t_super, insert_item_restarted ); | ||
2040 | retval = search_item (th->t_super, key, p_s_path); | ||
2041 | if (retval == IO_ERROR) { | ||
2042 | retval = -EIO; | ||
2043 | goto error_out ; | ||
2044 | } | ||
2045 | if (retval == ITEM_FOUND) { | ||
2046 | reiserfs_warning (th->t_super, "PAP-5760: reiserfs_insert_item: " | ||
2047 | "key %K already exists in the tree", key); | ||
2048 | retval = -EEXIST ; | ||
2049 | goto error_out; | ||
2050 | } | ||
2051 | } | ||
2052 | |||
2053 | /* make balancing after all resources will be collected at a time */ | ||
2054 | if ( retval == CARRY_ON ) { | ||
2055 | do_balance (&s_ins_balance, p_s_ih, p_c_body, M_INSERT); | ||
2056 | return 0; /* success: any quota charged above is kept */ | ||
2057 | } | ||
2058 | |||
2059 | retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; /* any other fix_nodes result is reported as -EIO */ | ||
2060 | error_out: | ||
2061 | unfix_nodes(&s_ins_balance); /* also releases the path */ | ||
2062 | if (inode) { /* inode may be NULL: the debug message below reads inode->i_uid, so it must sit inside this guard */ | ||
2063 | #ifdef REISERQUOTA_DEBUG | ||
2064 | reiserfs_debug (th->t_super, REISERFS_DEBUG_CODE, "reiserquota insert_item(): freeing %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(p_s_ih)); | ||
2065 | #endif | ||
2066 | DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes) ; /* give back the quota charged at the top */ | ||
2067 | } | ||
2068 | return retval; | ||
2069 | } | ||
2070 | |||
2071 | |||
2072 | |||
2073 | |||
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c new file mode 100644 index 000000000000..bcdf2438d152 --- /dev/null +++ b/fs/reiserfs/super.c | |||
@@ -0,0 +1,2148 @@ | |||
1 | /* | ||
2 | * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README | ||
3 | * | ||
4 | * Trivial changes by Alan Cox to add the LFS fixes | ||
5 | * | ||
6 | * Trivial Changes: | ||
7 | * Rights granted to Hans Reiser to redistribute under other terms providing | ||
8 | * he accepts all liability including but not limited to patent, fitness | ||
9 | * for purpose, and direct or indirect claims arising from failure to perform. | ||
10 | * | ||
11 | * NO WARRANTY | ||
12 | */ | ||
13 | |||
14 | #include <linux/config.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/vmalloc.h> | ||
17 | #include <linux/time.h> | ||
18 | #include <asm/uaccess.h> | ||
19 | #include <linux/reiserfs_fs.h> | ||
20 | #include <linux/reiserfs_acl.h> | ||
21 | #include <linux/reiserfs_xattr.h> | ||
22 | #include <linux/smp_lock.h> | ||
23 | #include <linux/init.h> | ||
24 | #include <linux/blkdev.h> | ||
25 | #include <linux/buffer_head.h> | ||
26 | #include <linux/vfs.h> | ||
27 | #include <linux/namespace.h> | ||
28 | #include <linux/mount.h> | ||
29 | #include <linux/namei.h> | ||
30 | #include <linux/quotaops.h> | ||
31 | |||
32 | struct file_system_type reiserfs_fs_type; /* file-scope object declared here without an initializer */ | ||
33 | |||
34 | static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING; /* compared against s_magic by is_reiserfs_3_5() */ | ||
35 | static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING; /* compared against s_magic by is_reiserfs_3_6() */ | ||
36 | static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING; /* compared against s_magic by is_reiserfs_jr() */ | ||
37 | |||
38 | int is_reiserfs_3_5 (struct reiserfs_super_block * rs) | ||
39 | { | ||
40 | /* non-zero when s_magic begins with the 3.5 magic string */ | ||
41 | return strncmp (rs->s_v1.s_magic, reiserfs_3_5_magic_string, strlen (reiserfs_3_5_magic_string)) == 0; | ||
42 | } | ||
43 | |||
44 | |||
45 | int is_reiserfs_3_6 (struct reiserfs_super_block * rs) | ||
46 | { | ||
47 | /* non-zero when s_magic begins with the 3.6 magic string */ | ||
48 | return strncmp (rs->s_v1.s_magic, reiserfs_3_6_magic_string, strlen (reiserfs_3_6_magic_string)) == 0; | ||
49 | } | ||
50 | |||
51 | |||
52 | int is_reiserfs_jr (struct reiserfs_super_block * rs) | ||
53 | { | ||
54 | /* non-zero when s_magic begins with the "jr" magic string */ | ||
55 | return strncmp (rs->s_v1.s_magic, reiserfs_jr_magic_string, strlen (reiserfs_jr_magic_string)) == 0; | ||
56 | } | ||
57 | |||
58 | |||
59 | static int is_any_reiserfs_magic_string (struct reiserfs_super_block * rs) | ||
60 | { | ||
61 | if (is_reiserfs_3_5 (rs) || is_reiserfs_3_6 (rs)) return 1; /* checked in the order 3.5, 3.6, jr */ | ||
62 | return is_reiserfs_jr (rs) != 0; | ||
63 | } | ||
64 | |||
65 | static int reiserfs_remount (struct super_block * s, int * flags, char * data); | ||
66 | static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf); | ||
67 | |||
68 | static int reiserfs_sync_fs (struct super_block * s, int wait) | ||
69 | { | ||
70 | struct reiserfs_transaction_handle th; | ||
71 | |||
72 | /* read-only mount: just clear the dirty flag (the wait argument is not consulted anywhere) */ | ||
73 | if (s->s_flags & MS_RDONLY) { | ||
74 | s->s_dirt = 0; | ||
75 | return 0; | ||
76 | } | ||
77 | reiserfs_write_lock(s); | ||
78 | if (journal_begin(&th, s, 1) == 0 && journal_end_sync(&th, s, 1) == 0) | ||
79 | reiserfs_flush_old_commits(s); | ||
80 | s->s_dirt = 0; /* Even if it's not true. We'll loop forever in sync_supers otherwise */ | ||
81 | reiserfs_write_unlock(s); | ||
82 | return 0; | ||
83 | } | ||
84 | |||
85 | static void reiserfs_write_super(struct super_block *s) /* installed as .write_super in reiserfs_sops */ | ||
86 | { | ||
87 | reiserfs_sync_fs(s, 1); /* the return value (always 0 in reiserfs_sync_fs) is not used */ | ||
88 | } | ||
89 | |||
90 | static void reiserfs_write_super_lockfs (struct super_block * s) | ||
91 | { | ||
92 | struct reiserfs_transaction_handle th ; | ||
93 | |||
94 | reiserfs_write_lock(s); | ||
95 | if (s->s_flags & MS_RDONLY) | ||
96 | goto unlock; /* read-only mount: no transaction is started */ | ||
97 | if (journal_begin(&th, s, 1) == 0) { | ||
98 | reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); | ||
99 | journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); | ||
100 | reiserfs_block_writes(&th) ; | ||
101 | journal_end_sync(&th, s, 1) ; | ||
102 | } else | ||
103 | reiserfs_block_writes(&th) ; /* still called when journal_begin failed; NOTE(review): th may not be fully set up here - confirm */ | ||
104 | unlock: | ||
105 | s->s_dirt = 0; | ||
106 | reiserfs_write_unlock(s); | ||
107 | } | ||
108 | |||
109 | static void reiserfs_unlockfs(struct super_block *s) { /* installed as .unlockfs in reiserfs_sops */ | ||
110 | reiserfs_allow_writes(s) ; /* presumably the counterpart of reiserfs_block_writes() used in reiserfs_write_super_lockfs - confirm in the journal code */ | ||
111 | } | ||
112 | |||
113 | extern const struct reiserfs_key MAX_KEY; | ||
114 | |||
115 | |||
116 | /* Delete a "save link" whose target file no longer has any items. | ||
117 | This happens either when an unlink completed but the "save link" | ||
118 | itself was not removed, or when a file had both an unlink and a | ||
119 | truncate pending: the unlink completes first (the key of the | ||
120 | "save link" protecting unlink is bigger than the key of the one | ||
121 | protecting truncate), leaving no items to finish the truncate on. | ||
122 | If oid_free is non-zero the object id stored in the key is released too. */ | ||
123 | static int remove_save_link_only (struct super_block * s, struct reiserfs_key * key, int oid_free) | ||
124 | { | ||
125 | struct reiserfs_transaction_handle handle; | ||
126 | int rc = journal_begin (&handle, s, JOURNAL_PER_BALANCE_CNT); /* we are going to do one balancing */ | ||
127 | |||
128 | if (rc != 0) | ||
129 | return rc; | ||
130 | |||
131 | /* remove the link item itself */ | ||
132 | reiserfs_delete_solid_item (&handle, NULL, key); | ||
133 | |||
134 | /* removals are protected by direct items */ | ||
135 | if (oid_free) { | ||
136 | reiserfs_release_objectid (&handle, le32_to_cpu (key->k_objectid)); | ||
137 | } | ||
138 | return journal_end (&handle, s, JOURNAL_PER_BALANCE_CNT); | ||
139 | } | ||
140 | |||
141 | #ifdef CONFIG_QUOTA | ||
142 | static int reiserfs_quota_on_mount(struct super_block *, int); | ||
143 | #endif | ||
144 | |||
145 | /* look for uncompleted unlinks and truncates (recorded as "save" links) and complete them; returns retval as the loop below left it */ | ||
146 | static int finish_unfinished (struct super_block * s) | ||
147 | { | ||
148 | INITIALIZE_PATH (path); | ||
149 | struct cpu_key max_cpu_key, obj_key; | ||
150 | struct reiserfs_key save_link_key; | ||
151 | int retval = 0; | ||
152 | struct item_head * ih; | ||
153 | struct buffer_head * bh; | ||
154 | int item_pos; | ||
155 | char * item; | ||
156 | int done; /* number of save links fully processed, reported at the end */ | ||
157 | struct inode * inode; | ||
158 | int truncate; /* 1 when the current save link records a truncate, 0 for an unlink */ | ||
159 | #ifdef CONFIG_QUOTA | ||
160 | int i; | ||
161 | int ms_active_set; /* 1 when this function set MS_ACTIVE itself and must clear it again */ | ||
162 | #endif | ||
163 | |||
164 | |||
165 | /* compose key to look for "save" links */ | ||
166 | max_cpu_key.version = KEY_FORMAT_3_5; | ||
167 | max_cpu_key.on_disk_key = MAX_KEY; | ||
168 | max_cpu_key.key_length = 3; | ||
169 | |||
170 | #ifdef CONFIG_QUOTA | ||
171 | /* Needed for iput() to work correctly and not trash data */ | ||
172 | if (s->s_flags & MS_ACTIVE) { | ||
173 | ms_active_set = 0; | ||
174 | } else { | ||
175 | ms_active_set = 1; | ||
176 | s->s_flags |= MS_ACTIVE; | ||
177 | } | ||
178 | /* Turn on quotas so that they are updated correctly */ | ||
179 | for (i = 0; i < MAXQUOTAS; i++) { | ||
180 | if (REISERFS_SB(s)->s_qf_names[i]) { | ||
181 | int ret = reiserfs_quota_on_mount(s, i); | ||
182 | if (ret < 0) /* a failure is only reported; processing continues */ | ||
183 | reiserfs_warning(s, "reiserfs: cannot turn on journalled quota: error %d", ret); | ||
184 | } | ||
185 | } | ||
186 | #endif | ||
187 | |||
188 | done = 0; | ||
189 | REISERFS_SB(s)->s_is_unlinked_ok = 1; /* reset to 0 again right after the loop */ | ||
190 | while (!retval) { /* runs until a step leaves a non-zero retval or a break is taken */ | ||
191 | retval = search_item (s, &max_cpu_key, &path); | ||
192 | if (retval != ITEM_NOT_FOUND) { /* any other search result stops the scan with a warning */ | ||
193 | reiserfs_warning (s, "vs-2140: finish_unfinished: search_by_key returned %d", | ||
194 | retval); | ||
195 | break; | ||
196 | } | ||
197 | |||
198 | bh = get_last_bh (&path); | ||
199 | item_pos = get_item_pos (&path); | ||
200 | if (item_pos != B_NR_ITEMS (bh)) { /* the found position is expected to be one past the last item of the buffer */ | ||
201 | reiserfs_warning (s, "vs-2060: finish_unfinished: wrong position found"); | ||
202 | break; | ||
203 | } | ||
204 | item_pos --; /* step back onto the last item of the buffer */ | ||
205 | ih = B_N_PITEM_HEAD (bh, item_pos); | ||
206 | |||
207 | if (le32_to_cpu (ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID) | ||
208 | /* there are no "save" links anymore */ | ||
209 | break; | ||
210 | |||
211 | save_link_key = ih->ih_key; /* copied because the path is released below */ | ||
212 | if (is_indirect_le_ih (ih)) /* an indirect-type item marks a truncate link, anything else an unlink link */ | ||
213 | truncate = 1; | ||
214 | else | ||
215 | truncate = 0; | ||
216 | |||
217 | /* reiserfs_iget needs k_dirid and k_objectid only */ | ||
218 | item = B_I_PITEM (bh, ih); | ||
219 | obj_key.on_disk_key.k_dir_id = le32_to_cpu (*(__u32 *)item); /* the link body holds the dir id of the target object */ | ||
220 | obj_key.on_disk_key.k_objectid = le32_to_cpu (ih->ih_key.k_objectid); | ||
221 | obj_key.on_disk_key.u.k_offset_v1.k_offset = 0; | ||
222 | obj_key.on_disk_key.u.k_offset_v1.k_uniqueness = 0; | ||
223 | |||
224 | pathrelse (&path); | ||
225 | |||
226 | inode = reiserfs_iget (s, &obj_key); | ||
227 | if (!inode) { | ||
228 | /* the unlink almost completed, it just did not manage to remove | ||
229 | "save" link and release objectid */ | ||
230 | reiserfs_warning (s, "vs-2180: finish_unfinished: iget failed for %K", | ||
231 | &obj_key); | ||
232 | retval = remove_save_link_only (s, &save_link_key, 1); /* 1: also release the object id */ | ||
233 | continue; /* the loop condition then decides from retval whether to go on */ | ||
234 | } | ||
235 | |||
236 | if (!truncate && inode->i_nlink) { | ||
237 | /* file is not unlinked */ | ||
238 | reiserfs_warning (s, "vs-2185: finish_unfinished: file %K is not unlinked", | ||
239 | &obj_key); | ||
240 | retval = remove_save_link_only (s, &save_link_key, 0); /* 0: keep the object id */ | ||
241 | continue; /* NOTE(review): no iput(inode) on this path, unlike the directory case below - confirm whether intended */ | ||
242 | } | ||
243 | DQUOT_INIT(inode); | ||
244 | |||
245 | if (truncate && S_ISDIR (inode->i_mode) ) { | ||
246 | /* We got a truncate request for a dir which is impossible. | ||
247 | The only imaginable way is to execute unfinished truncate request | ||
248 | then boot into old kernel, remove the file and create dir with | ||
249 | the same key. */ | ||
250 | reiserfs_warning(s, "green-2101: impossible truncate on a directory %k. Please report", INODE_PKEY (inode)); | ||
251 | retval = remove_save_link_only (s, &save_link_key, 0); | ||
252 | truncate = 0; /* not read again before the next iteration reassigns it */ | ||
253 | iput (inode); | ||
254 | continue; | ||
255 | } | ||
256 | |||
257 | if (truncate) { | ||
258 | REISERFS_I(inode) -> i_flags |= i_link_saved_truncate_mask; /* remove_save_link() below deletes the link only when this flag is set */ | ||
259 | /* not completed truncate found. New size was committed together | ||
260 | with "save" link */ | ||
261 | reiserfs_info (s, "Truncating %k to %Ld ..", | ||
262 | INODE_PKEY (inode), inode->i_size); | ||
263 | reiserfs_truncate_file (inode, 0/*don't update modification time*/); | ||
264 | retval = remove_save_link (inode, truncate); | ||
265 | } else { | ||
266 | REISERFS_I(inode) -> i_flags |= i_link_saved_unlink_mask; | ||
267 | /* not completed unlink (rmdir) found */ | ||
268 | reiserfs_info (s, "Removing %k..", INODE_PKEY (inode)); | ||
269 | /* removal gets completed in iput */ | ||
270 | retval = 0; | ||
271 | } | ||
272 | |||
273 | iput (inode); | ||
274 | printk ("done\n"); /* ends the progress line started by reiserfs_info above */ | ||
275 | done ++; | ||
276 | } | ||
277 | REISERFS_SB(s)->s_is_unlinked_ok = 0; | ||
278 | |||
279 | #ifdef CONFIG_QUOTA | ||
280 | /* Turn quotas off */ | ||
281 | for (i = 0; i < MAXQUOTAS; i++) { | ||
282 | if (sb_dqopt(s)->files[i]) | ||
283 | vfs_quota_off_mount(s, i); | ||
284 | } | ||
285 | if (ms_active_set) | ||
286 | /* Restore the flag back */ | ||
287 | s->s_flags &= ~MS_ACTIVE; | ||
288 | #endif | ||
289 | pathrelse (&path); /* covers the break paths above that leave the loop with the path still held */ | ||
290 | if (done) | ||
291 | reiserfs_info (s, "There were %d uncompleted unlinks/truncates. " | ||
292 | "Completed\n", done); | ||
293 | return retval; | ||
294 | } | ||
295 | |||
296 | /* To protect a file that is being unlinked or truncated from getting lost | ||
297 | we add a "save" link for it. The link is deleted in the same transaction | ||
298 | as the last item of the file. When mounting the filesystem all these | ||
299 | links are scanned and the files which almost got lost are dealt with. */ | ||
300 | void add_save_link (struct reiserfs_transaction_handle * th, | ||
301 | struct inode * inode, int truncate) | ||
302 | { | ||
303 | INITIALIZE_PATH (path); | ||
304 | struct item_head ih; | ||
305 | struct cpu_key key; | ||
306 | __u32 link; | ||
307 | int retval; | ||
308 | |||
309 | BUG_ON (!th->t_trans_id); | ||
310 | |||
311 | /* file can only get one "save link" of each kind */ | ||
312 | RFALSE( truncate && | ||
313 | ( REISERFS_I(inode) -> i_flags & i_link_saved_truncate_mask ), | ||
314 | "saved link already exists for truncated inode %lx", | ||
315 | ( long ) inode -> i_ino ); | ||
316 | RFALSE( !truncate && | ||
317 | ( REISERFS_I(inode) -> i_flags & i_link_saved_unlink_mask ), | ||
318 | "saved link already exists for unlinked inode %lx", | ||
319 | ( long ) inode -> i_ino ); | ||
320 | |||
321 | /* setup key of "save" link */ | ||
322 | key.version = KEY_FORMAT_3_5; | ||
323 | key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID; | ||
324 | key.on_disk_key.k_objectid = inode->i_ino; | ||
325 | if (truncate) { | ||
326 | /* truncate: key and item head of the "save" link are indirect, offset 1 */ | ||
327 | if (S_ISDIR (inode->i_mode)) | ||
328 | reiserfs_warning(inode->i_sb, "green-2102: Adding a truncate savelink for a directory %k! Please report", INODE_PKEY(inode)); | ||
329 | set_cpu_key_k_offset (&key, 1); | ||
330 | set_cpu_key_k_type (&key, TYPE_INDIRECT); | ||
331 | |||
332 | make_le_item_head (&ih, &key, key.version, 1, TYPE_INDIRECT, | ||
333 | 4/*length*/, 0/*free space*/); | ||
334 | } else { | ||
335 | /* unlink, rmdir, rename: direct, offset 1 + blocksize */ | ||
336 | set_cpu_key_k_offset (&key, 1 + inode->i_sb->s_blocksize); | ||
337 | set_cpu_key_k_type (&key, TYPE_DIRECT); | ||
338 | |||
339 | make_le_item_head (&ih, &key, key.version, 1 + inode->i_sb->s_blocksize, TYPE_DIRECT, | ||
340 | 4/*length*/, 0xffff/*free space*/); | ||
341 | } | ||
342 | key.key_length = 3; | ||
343 | |||
344 | /* look for its place in the tree */ | ||
345 | retval = search_item (inode->i_sb, &key, &path); | ||
346 | if (retval != ITEM_NOT_FOUND) { | ||
347 | if ( retval != -ENOSPC ) | ||
348 | reiserfs_warning (inode->i_sb, "vs-2100: add_save_link:" | ||
349 | "search_by_key (%K) returned %d", &key, retval); | ||
350 | pathrelse (&path); | ||
351 | return; | ||
352 | } | ||
353 | |||
354 | /* body of "save" link */ | ||
355 | link = INODE_PKEY (inode)->k_dir_id; | ||
356 | |||
357 | /* put "save" link into the tree, don't charge quota to anyone */ | ||
358 | retval = reiserfs_insert_item (th, &path, &key, &ih, NULL, (char *)&link); | ||
359 | if (retval == 0) { | ||
360 | /* remember in the inode which kind of link now exists */ | ||
361 | if (truncate) | ||
362 | REISERFS_I(inode) -> i_flags |= i_link_saved_truncate_mask; | ||
363 | else | ||
364 | REISERFS_I(inode) -> i_flags |= i_link_saved_unlink_mask; | ||
365 | return; | ||
366 | } | ||
367 | |||
368 | if (retval != -ENOSPC) | ||
369 | reiserfs_warning (inode->i_sb, "vs-2120: add_save_link: insert_item returned %d", | ||
370 | retval); | ||
371 | } | ||
372 | |||
373 | |||
374 | /* Delete inode's "save" link of the given kind; this opens its own transaction, unlike add_save_link */ | ||
375 | int remove_save_link (struct inode * inode, int truncate) | ||
376 | { | ||
377 | struct reiserfs_transaction_handle th; | ||
378 | struct reiserfs_key link_key; | ||
379 | int rc; | ||
380 | |||
381 | /* we are going to do one balancing only */ | ||
382 | rc = journal_begin (&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); | ||
383 | if (rc != 0) | ||
384 | return rc; | ||
385 | |||
386 | /* setup key of "save" link */ | ||
387 | link_key.k_dir_id = cpu_to_le32 (MAX_KEY_OBJECTID); | ||
388 | link_key.k_objectid = INODE_PKEY (inode)->k_objectid; | ||
389 | |||
390 | if (truncate) { | ||
391 | set_le_key_k_offset (KEY_FORMAT_3_5, &link_key, 1); | ||
392 | set_le_key_k_type (KEY_FORMAT_3_5, &link_key, TYPE_INDIRECT); | ||
393 | |||
394 | /* delete the item only when the inode is flagged as having it; don't take quota bytes from anywhere */ | ||
395 | if (REISERFS_I(inode) -> i_flags & i_link_saved_truncate_mask) | ||
396 | reiserfs_delete_solid_item (&th, NULL, &link_key); | ||
397 | REISERFS_I(inode) -> i_flags &= ~i_link_saved_truncate_mask; | ||
398 | } else { | ||
399 | /* unlink, rmdir, rename */ | ||
400 | set_le_key_k_offset (KEY_FORMAT_3_5, &link_key, | ||
401 | 1 + inode->i_sb->s_blocksize); | ||
402 | set_le_key_k_type (KEY_FORMAT_3_5, &link_key, TYPE_DIRECT); | ||
403 | |||
404 | if (REISERFS_I(inode) -> i_flags & i_link_saved_unlink_mask) | ||
405 | reiserfs_delete_solid_item (&th, NULL, &link_key); | ||
406 | /* the object id is released whether or not a link item was deleted */ | ||
407 | reiserfs_release_objectid (&th, inode->i_ino); | ||
408 | REISERFS_I(inode) -> i_flags &= ~i_link_saved_unlink_mask; | ||
409 | } | ||
410 | |||
411 | /* the result of closing the transaction is what the caller gets */ | ||
412 | return journal_end (&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); | ||
413 | } | ||
414 | |||
415 | |||
416 | static void reiserfs_put_super (struct super_block * s) | ||
417 | { | ||
418 | struct reiserfs_transaction_handle th ; | ||
419 | int bmap; | ||
420 | |||
421 | /* start with a zero transaction id; NOTE(review): presumably lets journal_release() see that no transaction was begun - confirm */ | ||
422 | th.t_trans_id = 0; | ||
423 | |||
424 | /* drop the xattr root and private root dentries if they were set up */ | ||
425 | if (REISERFS_SB(s)->xattr_root) { | ||
426 | d_invalidate (REISERFS_SB(s)->xattr_root); | ||
427 | dput (REISERFS_SB(s)->xattr_root); | ||
428 | } | ||
429 | |||
430 | if (REISERFS_SB(s)->priv_root) { | ||
431 | d_invalidate (REISERFS_SB(s)->priv_root); | ||
432 | dput (REISERFS_SB(s)->priv_root); | ||
433 | } | ||
434 | |||
435 | /* change file system state to current state if it was mounted with read-write permissions */ | ||
436 | if (!(s->s_flags & MS_RDONLY) && journal_begin(&th, s, 10) == 0) { | ||
437 | reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; | ||
438 | set_sb_umount_state( SB_DISK_SUPER_BLOCK(s), REISERFS_SB(s)->s_mount_state ); | ||
439 | journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); | ||
440 | } | ||
441 | |||
442 | /* note, journal_release checks for readonly mount, and can decide not | ||
443 | ** to do a journal_end | ||
444 | */ | ||
445 | journal_release(&th, s) ; | ||
446 | |||
447 | /* release the bitmap buffers, then the bitmap array, then the superblock buffer */ | ||
448 | for (bmap = 0; bmap < SB_BMAP_NR (s); bmap ++) | ||
449 | brelse (SB_AP_BITMAP (s)[bmap].bh); | ||
450 | vfree (SB_AP_BITMAP (s)); | ||
451 | brelse (SB_BUFFER_WITH_SB (s)); | ||
452 | |||
453 | print_statistics (s); | ||
454 | |||
455 | /* complain about anything still accounted as allocated or reserved */ | ||
456 | if (REISERFS_SB(s)->s_kmallocs != 0) { | ||
457 | reiserfs_warning (s, "vs-2004: reiserfs_put_super: allocated memory left %d", | ||
458 | REISERFS_SB(s)->s_kmallocs); | ||
459 | } | ||
460 | |||
461 | if (REISERFS_SB(s)->reserved_blocks != 0) { | ||
462 | reiserfs_warning (s, "green-2005: reiserfs_put_super: reserved blocks left %d", | ||
463 | REISERFS_SB(s)->reserved_blocks); | ||
464 | } | ||
465 | |||
466 | reiserfs_proc_info_done( s ); | ||
467 | |||
468 | /* finally free the private superblock info and clear the pointer */ | ||
469 | kfree(s->s_fs_info); | ||
470 | s->s_fs_info = NULL; | ||
471 | } | ||
472 | |||
473 | static kmem_cache_t * reiserfs_inode_cachep; /* cache for struct reiserfs_inode_info: created in init_inodecache(), destroyed in destroy_inodecache() */ | ||
474 | |||
475 | static struct inode *reiserfs_alloc_inode(struct super_block *sb) | ||
476 | { | ||
477 | struct reiserfs_inode_info *info = kmem_cache_alloc(reiserfs_inode_cachep, SLAB_KERNEL); | ||
478 | |||
479 | if (info != NULL) | ||
480 | return &info->vfs_inode; /* hand back the VFS inode embedded in the reiserfs inode info */ | ||
481 | return NULL; /* allocation failed */ | ||
482 | } | ||
483 | |||
484 | static void reiserfs_destroy_inode(struct inode *inode) /* installed as .destroy_inode in reiserfs_sops */ | ||
485 | { | ||
486 | kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); /* returns the containing reiserfs inode info to the cache reiserfs_alloc_inode() took it from */ | ||
487 | } | ||
488 | |||
489 | static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) | ||
490 | { | ||
491 | struct reiserfs_inode_info *info = foo; | ||
492 | |||
493 | /* act only when SLAB_CTOR_CONSTRUCTOR is set and SLAB_CTOR_VERIFY is clear */ | ||
494 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) != SLAB_CTOR_CONSTRUCTOR) | ||
495 | return; | ||
496 | INIT_LIST_HEAD(&info->i_prealloc_list) ; | ||
497 | inode_init_once(&info->vfs_inode); | ||
498 | info->i_acl_access = NULL; | ||
499 | info->i_acl_default = NULL; | ||
500 | } | ||
501 | |||
502 | static int init_inodecache(void) | ||
503 | { | ||
504 | kmem_cache_t * cache = kmem_cache_create("reiser_inode_cache", | ||
505 | sizeof(struct reiserfs_inode_info), | ||
506 | 0, SLAB_RECLAIM_ACCOUNT, | ||
507 | init_once, NULL); | ||
508 | |||
509 | reiserfs_inode_cachep = cache; /* stored even when NULL, exactly as a direct assignment would */ | ||
510 | return (cache != NULL) ? 0 : -ENOMEM; | ||
511 | } | ||
512 | |||
513 | static void destroy_inodecache(void) | ||
514 | { | ||
515 | int failed = kmem_cache_destroy(reiserfs_inode_cachep); /* non-zero result only triggers the warning below */ | ||
516 | if (failed) reiserfs_warning (NULL, "reiserfs_inode_cache: not all structures were freed"); | ||
517 | } | ||
518 | |||
519 | /* we don't mark inodes dirty, we just log them: the stat data is updated inside its own transaction */ | ||
520 | static void reiserfs_dirty_inode (struct inode * inode) { | ||
521 | struct reiserfs_transaction_handle th ; | ||
522 | struct super_block * sb = inode->i_sb; | ||
523 | |||
524 | /* nothing is logged on a read-only mount, only a warning is printed */ | ||
525 | if (sb->s_flags & MS_RDONLY) { | ||
526 | reiserfs_warning(sb, "clm-6006: writing inode %lu on readonly FS", | ||
527 | inode->i_ino) ; | ||
528 | return ; | ||
529 | } | ||
530 | |||
531 | /* this is really only used for atime updates, so they don't have | ||
532 | ** to be included in O_SYNC or fsync | ||
533 | */ | ||
534 | reiserfs_write_lock(sb); | ||
535 | if (journal_begin(&th, sb, 1) == 0) { | ||
536 | reiserfs_update_sd (&th, inode); | ||
537 | journal_end(&th, sb, 1) ; | ||
538 | } | ||
539 | |||
540 | /* a journal_begin failure is dropped silently; the lock is released either way */ | ||
541 | reiserfs_write_unlock(sb); | ||
542 | } | ||
543 | |||
544 | static void reiserfs_clear_inode (struct inode *inode) | ||
545 | { | ||
546 | struct reiserfs_inode_info *info = REISERFS_I(inode); | ||
547 | |||
548 | /* release each cached ACL unless the slot is NULL or IS_ERR() flags it, then clear the slot */ | ||
549 | if (info->i_acl_access && !IS_ERR (info->i_acl_access)) | ||
550 | posix_acl_release (info->i_acl_access); | ||
551 | info->i_acl_access = NULL; | ||
552 | |||
553 | if (info->i_acl_default && !IS_ERR (info->i_acl_default)) | ||
554 | posix_acl_release (info->i_acl_default); | ||
555 | info->i_acl_default = NULL; | ||
556 | |||
557 | } | ||
558 | |||
559 | #ifdef CONFIG_QUOTA | ||
560 | static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, size_t, loff_t); | ||
561 | static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t, loff_t); | ||
562 | #endif | ||
563 | |||
564 | static struct super_operations reiserfs_sops = | ||
565 | { | ||
566 | .alloc_inode = reiserfs_alloc_inode, | ||
567 | .destroy_inode = reiserfs_destroy_inode, | ||
568 | .write_inode = reiserfs_write_inode, | ||
569 | .dirty_inode = reiserfs_dirty_inode, | ||
570 | .delete_inode = reiserfs_delete_inode, | ||
571 | .clear_inode = reiserfs_clear_inode, | ||
572 | .put_super = reiserfs_put_super, | ||
573 | .write_super = reiserfs_write_super, | ||
574 | .sync_fs = reiserfs_sync_fs, | ||
575 | .write_super_lockfs = reiserfs_write_super_lockfs, | ||
576 | .unlockfs = reiserfs_unlockfs, | ||
577 | .statfs = reiserfs_statfs, | ||
578 | .remount_fs = reiserfs_remount, | ||
579 | #ifdef CONFIG_QUOTA | ||
580 | .quota_read = reiserfs_quota_read, | ||
581 | .quota_write = reiserfs_quota_write, | ||
582 | #endif | ||
583 | }; | ||
584 | |||
#ifdef CONFIG_QUOTA
/* Human-readable quota type name used in warnings. */
#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")

static int reiserfs_dquot_initialize(struct inode *, int);
static int reiserfs_dquot_drop(struct inode *);
static int reiserfs_write_dquot(struct dquot *);
static int reiserfs_acquire_dquot(struct dquot *);
static int reiserfs_release_dquot(struct dquot *);
static int reiserfs_mark_dquot_dirty(struct dquot *);
static int reiserfs_write_info(struct super_block *, int);
static int reiserfs_quota_on(struct super_block *, int, int, char *);

/* Per-dquot callbacks; read_super_block() installs this table as s->dq_op. */
static struct dquot_operations reiserfs_quota_operations = {
	.initialize	= reiserfs_dquot_initialize,
	.drop		= reiserfs_dquot_drop,
	.alloc_space	= dquot_alloc_space,
	.alloc_inode	= dquot_alloc_inode,
	.free_space	= dquot_free_space,
	.free_inode	= dquot_free_inode,
	.transfer	= dquot_transfer,
	.write_dquot	= reiserfs_write_dquot,
	.acquire_dquot	= reiserfs_acquire_dquot,
	.release_dquot	= reiserfs_release_dquot,
	.mark_dirty	= reiserfs_mark_dquot_dirty,
	.write_info	= reiserfs_write_info,
};

/* quotactl entry points; read_super_block() installs this table as s->s_qcop. */
static struct quotactl_ops reiserfs_qctl_operations = {
	.quota_on	= reiserfs_quota_on,
	.quota_off	= vfs_quota_off,
	.quota_sync	= vfs_quota_sync,
	.get_info	= vfs_get_dqinfo,
	.set_info	= vfs_set_dqinfo,
	.get_dqblk	= vfs_get_dqblk,
	.set_dqblk	= vfs_set_dqblk,
};
#endif
624 | |||
625 | static struct export_operations reiserfs_export_ops = { | ||
626 | .encode_fh = reiserfs_encode_fh, | ||
627 | .decode_fh = reiserfs_decode_fh, | ||
628 | .get_parent = reiserfs_get_parent, | ||
629 | .get_dentry = reiserfs_get_dentry, | ||
630 | } ; | ||
631 | |||
/* Used by reiserfs_getopt() to describe one accepted value of a mount
   option that takes a value rather than being a toggle. */
typedef struct {
	/* text of the value as it appears after "option=" */
	char *value;

	/* bits to set in the mount_options bitmask when this value is found;
	   0 if no bits are to be changed */
	int setmask;

	/* bits to clear in the mount_options bitmask when this value is
	   found; 0 if no bits are to be changed.  This is applied BEFORE
	   setmask. */
	int clrmask;
} arg_desc_t;
642 | |||
643 | /* Set this bit in arg_required to allow empty arguments */ | ||
644 | #define REISERFS_OPT_ALLOWEMPTY 31 | ||
645 | |||
646 | /* this struct is used in reiserfs_getopt() for describing the set of reiserfs | ||
647 | mount options */ | ||
648 | typedef struct { | ||
649 | char * option_name; | ||
650 | int arg_required; /* 0 if argument is not required, not 0 otherwise */ | ||
651 | const arg_desc_t * values; /* list of values accepted by an option */ | ||
652 | int setmask; /* bitmask which is to set on mount_options bitmask when this | ||
653 | value is found, 0 is no bits are to be changed. */ | ||
654 | int clrmask; /* bitmask which is to clear on mount_options bitmask when this | ||
655 | value is found, 0 is no bits are to be changed. This is | ||
656 | applied BEFORE setmask */ | ||
657 | } opt_desc_t; | ||
658 | |||
659 | /* possible values for -o data= */ | ||
660 | static const arg_desc_t logging_mode[] = { | ||
661 | {"ordered", 1<<REISERFS_DATA_ORDERED, (1<<REISERFS_DATA_LOG|1<<REISERFS_DATA_WRITEBACK)}, | ||
662 | {"journal", 1<<REISERFS_DATA_LOG, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_WRITEBACK)}, | ||
663 | {"writeback", 1<<REISERFS_DATA_WRITEBACK, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_LOG)}, | ||
664 | {NULL, 0} | ||
665 | }; | ||
666 | |||
667 | /* possible values for -o barrier= */ | ||
668 | static const arg_desc_t barrier_mode[] = { | ||
669 | {"none", 1<<REISERFS_BARRIER_NONE, 1<<REISERFS_BARRIER_FLUSH}, | ||
670 | {"flush", 1<<REISERFS_BARRIER_FLUSH, 1<<REISERFS_BARRIER_NONE}, | ||
671 | {NULL, 0} | ||
672 | }; | ||
673 | |||
674 | /* possible values for "-o block-allocator=" and bits which are to be set in | ||
675 | s_mount_opt of reiserfs specific part of in-core super block */ | ||
676 | static const arg_desc_t balloc[] = { | ||
677 | {"noborder", 1<<REISERFS_NO_BORDER, 0}, | ||
678 | {"border", 0, 1<<REISERFS_NO_BORDER}, | ||
679 | {"no_unhashed_relocation", 1<<REISERFS_NO_UNHASHED_RELOCATION, 0}, | ||
680 | {"hashed_relocation", 1<<REISERFS_HASHED_RELOCATION, 0}, | ||
681 | {"test4", 1<<REISERFS_TEST4, 0}, | ||
682 | {"notest4", 0, 1<<REISERFS_TEST4}, | ||
683 | {NULL, 0, 0} | ||
684 | }; | ||
685 | |||
686 | static const arg_desc_t tails[] = { | ||
687 | {"on", 1<<REISERFS_LARGETAIL, 1<<REISERFS_SMALLTAIL}, | ||
688 | {"off", 0, (1<<REISERFS_LARGETAIL)|(1<<REISERFS_SMALLTAIL)}, | ||
689 | {"small", 1<<REISERFS_SMALLTAIL, 1<<REISERFS_LARGETAIL}, | ||
690 | {NULL, 0, 0} | ||
691 | }; | ||
692 | |||
693 | static const arg_desc_t error_actions[] = { | ||
694 | {"panic", 1 << REISERFS_ERROR_PANIC, | ||
695 | (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)}, | ||
696 | {"ro-remount", 1 << REISERFS_ERROR_RO, | ||
697 | (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)}, | ||
698 | #ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG | ||
699 | {"continue", 1 << REISERFS_ERROR_CONTINUE, | ||
700 | (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)}, | ||
701 | #endif | ||
702 | {NULL, 0, 0}, | ||
703 | }; | ||
704 | |||
/* Default recommended I/O size is 128k.  There might be broken applications
   that are confused by this.  Use the nolargeio mount option to get the
   usual i/o size = PAGE_SIZE. */
int reiserfs_default_io_size = 128 * 1024;
710 | |||
711 | /* proceed only one option from a list *cur - string containing of mount options | ||
712 | opts - array of options which are accepted | ||
713 | opt_arg - if option is found and requires an argument and if it is specifed | ||
714 | in the input - pointer to the argument is stored here | ||
715 | bit_flags - if option requires to set a certain bit - it is set here | ||
716 | return -1 if unknown option is found, opt->arg_required otherwise */ | ||
717 | static int reiserfs_getopt ( struct super_block * s, char ** cur, opt_desc_t * opts, char ** opt_arg, | ||
718 | unsigned long * bit_flags) | ||
719 | { | ||
720 | char * p; | ||
721 | /* foo=bar, | ||
722 | ^ ^ ^ | ||
723 | | | +-- option_end | ||
724 | | +-- arg_start | ||
725 | +-- option_start | ||
726 | */ | ||
727 | const opt_desc_t * opt; | ||
728 | const arg_desc_t * arg; | ||
729 | |||
730 | |||
731 | p = *cur; | ||
732 | |||
733 | /* assume argument cannot contain commas */ | ||
734 | *cur = strchr (p, ','); | ||
735 | if (*cur) { | ||
736 | *(*cur) = '\0'; | ||
737 | (*cur) ++; | ||
738 | } | ||
739 | |||
740 | if ( !strncmp (p, "alloc=", 6) ) { | ||
741 | /* Ugly special case, probably we should redo options parser so that | ||
742 | it can understand several arguments for some options, also so that | ||
743 | it can fill several bitfields with option values. */ | ||
744 | if ( reiserfs_parse_alloc_options( s, p + 6) ) { | ||
745 | return -1; | ||
746 | } else { | ||
747 | return 0; | ||
748 | } | ||
749 | } | ||
750 | |||
751 | |||
752 | /* for every option in the list */ | ||
753 | for (opt = opts; opt->option_name; opt ++) { | ||
754 | if (!strncmp (p, opt->option_name, strlen (opt->option_name))) { | ||
755 | if (bit_flags) { | ||
756 | if (opt->clrmask == (1 << REISERFS_UNSUPPORTED_OPT)) | ||
757 | reiserfs_warning (s, "%s not supported.", p); | ||
758 | else | ||
759 | *bit_flags &= ~opt->clrmask; | ||
760 | if (opt->setmask == (1 << REISERFS_UNSUPPORTED_OPT)) | ||
761 | reiserfs_warning (s, "%s not supported.", p); | ||
762 | else | ||
763 | *bit_flags |= opt->setmask; | ||
764 | } | ||
765 | break; | ||
766 | } | ||
767 | } | ||
768 | if (!opt->option_name) { | ||
769 | reiserfs_warning (s, "unknown mount option \"%s\"", p); | ||
770 | return -1; | ||
771 | } | ||
772 | |||
773 | p += strlen (opt->option_name); | ||
774 | switch (*p) { | ||
775 | case '=': | ||
776 | if (!opt->arg_required) { | ||
777 | reiserfs_warning (s, "the option \"%s\" does not require an argument", | ||
778 | opt->option_name); | ||
779 | return -1; | ||
780 | } | ||
781 | break; | ||
782 | |||
783 | case 0: | ||
784 | if (opt->arg_required) { | ||
785 | reiserfs_warning (s, "the option \"%s\" requires an argument", opt->option_name); | ||
786 | return -1; | ||
787 | } | ||
788 | break; | ||
789 | default: | ||
790 | reiserfs_warning (s, "head of option \"%s\" is only correct", opt->option_name); | ||
791 | return -1; | ||
792 | } | ||
793 | |||
794 | /* move to the argument, or to next option if argument is not required */ | ||
795 | p ++; | ||
796 | |||
797 | if ( opt->arg_required && !(opt->arg_required & (1<<REISERFS_OPT_ALLOWEMPTY)) && !strlen (p) ) { | ||
798 | /* this catches "option=," if not allowed */ | ||
799 | reiserfs_warning (s, "empty argument for \"%s\"", opt->option_name); | ||
800 | return -1; | ||
801 | } | ||
802 | |||
803 | if (!opt->values) { | ||
804 | /* *=NULLopt_arg contains pointer to argument */ | ||
805 | *opt_arg = p; | ||
806 | return opt->arg_required & ~(1<<REISERFS_OPT_ALLOWEMPTY); | ||
807 | } | ||
808 | |||
809 | /* values possible for this option are listed in opt->values */ | ||
810 | for (arg = opt->values; arg->value; arg ++) { | ||
811 | if (!strcmp (p, arg->value)) { | ||
812 | if (bit_flags) { | ||
813 | *bit_flags &= ~arg->clrmask; | ||
814 | *bit_flags |= arg->setmask; | ||
815 | } | ||
816 | return opt->arg_required; | ||
817 | } | ||
818 | } | ||
819 | |||
820 | reiserfs_warning (s, "bad value \"%s\" for option \"%s\"", p, opt->option_name); | ||
821 | return -1; | ||
822 | } | ||
823 | |||
824 | /* returns 0 if something is wrong in option string, 1 - otherwise */ | ||
825 | static int reiserfs_parse_options (struct super_block * s, char * options, /* string given via mount's -o */ | ||
826 | unsigned long * mount_options, | ||
827 | /* after the parsing phase, contains the | ||
828 | collection of bitflags defining what | ||
829 | mount options were selected. */ | ||
830 | unsigned long * blocks, /* strtol-ed from NNN of resize=NNN */ | ||
831 | char ** jdev_name, | ||
832 | unsigned int * commit_max_age) | ||
833 | { | ||
834 | int c; | ||
835 | char * arg = NULL; | ||
836 | char * pos; | ||
837 | opt_desc_t opts[] = { | ||
838 | /* Compatibility stuff, so that -o notail for old setups still work */ | ||
839 | {"tails", .arg_required = 't', .values = tails}, | ||
840 | {"notail", .clrmask = (1<<REISERFS_LARGETAIL)|(1<<REISERFS_SMALLTAIL)}, | ||
841 | {"conv", .setmask = 1<<REISERFS_CONVERT}, | ||
842 | {"attrs", .setmask = 1<<REISERFS_ATTRS}, | ||
843 | {"noattrs", .clrmask = 1<<REISERFS_ATTRS}, | ||
844 | #ifdef CONFIG_REISERFS_FS_XATTR | ||
845 | {"user_xattr", .setmask = 1<<REISERFS_XATTRS_USER}, | ||
846 | {"nouser_xattr",.clrmask = 1<<REISERFS_XATTRS_USER}, | ||
847 | #else | ||
848 | {"user_xattr", .setmask = 1<<REISERFS_UNSUPPORTED_OPT}, | ||
849 | {"nouser_xattr",.clrmask = 1<<REISERFS_UNSUPPORTED_OPT}, | ||
850 | #endif | ||
851 | #ifdef CONFIG_REISERFS_FS_POSIX_ACL | ||
852 | {"acl", .setmask = 1<<REISERFS_POSIXACL}, | ||
853 | {"noacl", .clrmask = 1<<REISERFS_POSIXACL}, | ||
854 | #else | ||
855 | {"acl", .setmask = 1<<REISERFS_UNSUPPORTED_OPT}, | ||
856 | {"noacl", .clrmask = 1<<REISERFS_UNSUPPORTED_OPT}, | ||
857 | #endif | ||
858 | {"nolog",}, /* This is unsupported */ | ||
859 | {"replayonly", .setmask = 1<<REPLAYONLY}, | ||
860 | {"block-allocator", .arg_required = 'a', .values = balloc}, | ||
861 | {"data", .arg_required = 'd', .values = logging_mode}, | ||
862 | {"barrier", .arg_required = 'b', .values = barrier_mode}, | ||
863 | {"resize", .arg_required = 'r', .values = NULL}, | ||
864 | {"jdev", .arg_required = 'j', .values = NULL}, | ||
865 | {"nolargeio", .arg_required = 'w', .values = NULL}, | ||
866 | {"commit", .arg_required = 'c', .values = NULL}, | ||
867 | {"usrquota",}, | ||
868 | {"grpquota",}, | ||
869 | {"errors", .arg_required = 'e', .values = error_actions}, | ||
870 | {"usrjquota", .arg_required = 'u'|(1<<REISERFS_OPT_ALLOWEMPTY), .values = NULL}, | ||
871 | {"grpjquota", .arg_required = 'g'|(1<<REISERFS_OPT_ALLOWEMPTY), .values = NULL}, | ||
872 | {"jqfmt", .arg_required = 'f', .values = NULL}, | ||
873 | {NULL,} | ||
874 | }; | ||
875 | |||
876 | *blocks = 0; | ||
877 | if (!options || !*options) | ||
878 | /* use default configuration: create tails, journaling on, no | ||
879 | conversion to newest format */ | ||
880 | return 1; | ||
881 | |||
882 | for (pos = options; pos; ) { | ||
883 | c = reiserfs_getopt (s, &pos, opts, &arg, mount_options); | ||
884 | if (c == -1) | ||
885 | /* wrong option is given */ | ||
886 | return 0; | ||
887 | |||
888 | if (c == 'r') { | ||
889 | char * p; | ||
890 | |||
891 | p = NULL; | ||
892 | /* "resize=NNN" */ | ||
893 | *blocks = simple_strtoul (arg, &p, 0); | ||
894 | if (*p != '\0') { | ||
895 | /* NNN does not look like a number */ | ||
896 | reiserfs_warning (s, "reiserfs_parse_options: bad value %s", arg); | ||
897 | return 0; | ||
898 | } | ||
899 | } | ||
900 | |||
901 | if ( c == 'c' ) { | ||
902 | char *p = NULL; | ||
903 | unsigned long val = simple_strtoul (arg, &p, 0); | ||
904 | /* commit=NNN (time in seconds) */ | ||
905 | if ( *p != '\0' || val >= (unsigned int)-1) { | ||
906 | reiserfs_warning (s, "reiserfs_parse_options: bad value %s", arg); return 0; | ||
907 | } | ||
908 | *commit_max_age = (unsigned int)val; | ||
909 | } | ||
910 | |||
911 | if ( c == 'w' ) { | ||
912 | char *p=NULL; | ||
913 | int val = simple_strtoul (arg, &p, 0); | ||
914 | |||
915 | if ( *p != '\0') { | ||
916 | reiserfs_warning (s, "reiserfs_parse_options: non-numeric value %s for nolargeio option", arg); | ||
917 | return 0; | ||
918 | } | ||
919 | if ( val ) | ||
920 | reiserfs_default_io_size = PAGE_SIZE; | ||
921 | else | ||
922 | reiserfs_default_io_size = 128 * 1024; | ||
923 | } | ||
924 | |||
925 | if (c == 'j') { | ||
926 | if (arg && *arg && jdev_name) { | ||
927 | if ( *jdev_name ) { //Hm, already assigned? | ||
928 | reiserfs_warning (s, "reiserfs_parse_options: journal device was already specified to be %s", *jdev_name); | ||
929 | return 0; | ||
930 | } | ||
931 | *jdev_name = arg; | ||
932 | } | ||
933 | } | ||
934 | |||
935 | #ifdef CONFIG_QUOTA | ||
936 | if (c == 'u' || c == 'g') { | ||
937 | int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; | ||
938 | |||
939 | if (sb_any_quota_enabled(s)) { | ||
940 | reiserfs_warning(s, "reiserfs_parse_options: cannot change journalled quota options when quota turned on."); | ||
941 | return 0; | ||
942 | } | ||
943 | if (*arg) { /* Some filename specified? */ | ||
944 | if (REISERFS_SB(s)->s_qf_names[qtype] && strcmp(REISERFS_SB(s)->s_qf_names[qtype], arg)) { | ||
945 | reiserfs_warning(s, "reiserfs_parse_options: %s quota file already specified.", QTYPE2NAME(qtype)); | ||
946 | return 0; | ||
947 | } | ||
948 | if (strchr(arg, '/')) { | ||
949 | reiserfs_warning(s, "reiserfs_parse_options: quotafile must be on filesystem root."); | ||
950 | return 0; | ||
951 | } | ||
952 | REISERFS_SB(s)->s_qf_names[qtype] = kmalloc(strlen(arg)+1, GFP_KERNEL); | ||
953 | if (!REISERFS_SB(s)->s_qf_names[qtype]) { | ||
954 | reiserfs_warning(s, "reiserfs_parse_options: not enough memory for storing quotafile name."); | ||
955 | return 0; | ||
956 | } | ||
957 | strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg); | ||
958 | } | ||
959 | else { | ||
960 | if (REISERFS_SB(s)->s_qf_names[qtype]) { | ||
961 | kfree(REISERFS_SB(s)->s_qf_names[qtype]); | ||
962 | REISERFS_SB(s)->s_qf_names[qtype] = NULL; | ||
963 | } | ||
964 | } | ||
965 | } | ||
966 | if (c == 'f') { | ||
967 | if (!strcmp(arg, "vfsold")) | ||
968 | REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_OLD; | ||
969 | else if (!strcmp(arg, "vfsv0")) | ||
970 | REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_V0; | ||
971 | else { | ||
972 | reiserfs_warning(s, "reiserfs_parse_options: unknown quota format specified."); | ||
973 | return 0; | ||
974 | } | ||
975 | } | ||
976 | #else | ||
977 | if (c == 'u' || c == 'g' || c == 'f') { | ||
978 | reiserfs_warning(s, "reiserfs_parse_options: journalled quota options not supported."); | ||
979 | return 0; | ||
980 | } | ||
981 | #endif | ||
982 | } | ||
983 | |||
984 | #ifdef CONFIG_QUOTA | ||
985 | if (!REISERFS_SB(s)->s_jquota_fmt && (REISERFS_SB(s)->s_qf_names[USRQUOTA] || REISERFS_SB(s)->s_qf_names[GRPQUOTA])) { | ||
986 | reiserfs_warning(s, "reiserfs_parse_options: journalled quota format not specified."); | ||
987 | return 0; | ||
988 | } | ||
989 | #endif | ||
990 | return 1; | ||
991 | } | ||
992 | |||
993 | static void switch_data_mode(struct super_block *s, unsigned long mode) { | ||
994 | REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) | | ||
995 | (1 << REISERFS_DATA_ORDERED) | | ||
996 | (1 << REISERFS_DATA_WRITEBACK)); | ||
997 | REISERFS_SB(s)->s_mount_opt |= (1 << mode); | ||
998 | } | ||
999 | |||
1000 | static void handle_data_mode(struct super_block *s, unsigned long mount_options) | ||
1001 | { | ||
1002 | if (mount_options & (1 << REISERFS_DATA_LOG)) { | ||
1003 | if (!reiserfs_data_log(s)) { | ||
1004 | switch_data_mode(s, REISERFS_DATA_LOG); | ||
1005 | reiserfs_info (s, "switching to journaled data mode\n"); | ||
1006 | } | ||
1007 | } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) { | ||
1008 | if (!reiserfs_data_ordered(s)) { | ||
1009 | switch_data_mode(s, REISERFS_DATA_ORDERED); | ||
1010 | reiserfs_info (s, "switching to ordered data mode\n"); | ||
1011 | } | ||
1012 | } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) { | ||
1013 | if (!reiserfs_data_writeback(s)) { | ||
1014 | switch_data_mode(s, REISERFS_DATA_WRITEBACK); | ||
1015 | reiserfs_info (s, "switching to writeback data mode\n"); | ||
1016 | } | ||
1017 | } | ||
1018 | } | ||
1019 | |||
1020 | static void handle_barrier_mode(struct super_block *s, unsigned long bits) { | ||
1021 | int flush = (1 << REISERFS_BARRIER_FLUSH); | ||
1022 | int none = (1 << REISERFS_BARRIER_NONE); | ||
1023 | int all_barrier = flush | none; | ||
1024 | |||
1025 | if (bits & all_barrier) { | ||
1026 | REISERFS_SB(s)->s_mount_opt &= ~all_barrier; | ||
1027 | if (bits & flush) { | ||
1028 | REISERFS_SB(s)->s_mount_opt |= flush; | ||
1029 | printk("reiserfs: enabling write barrier flush mode\n"); | ||
1030 | } else if (bits & none) { | ||
1031 | REISERFS_SB(s)->s_mount_opt |= none; | ||
1032 | printk("reiserfs: write barriers turned off\n"); | ||
1033 | } | ||
1034 | } | ||
1035 | } | ||
1036 | |||
1037 | static void handle_attrs( struct super_block *s ) | ||
1038 | { | ||
1039 | struct reiserfs_super_block * rs; | ||
1040 | |||
1041 | if( reiserfs_attrs( s ) ) { | ||
1042 | rs = SB_DISK_SUPER_BLOCK (s); | ||
1043 | if( old_format_only(s) ) { | ||
1044 | reiserfs_warning(s, "reiserfs: cannot support attributes on 3.5.x disk format" ); | ||
1045 | REISERFS_SB(s) -> s_mount_opt &= ~ ( 1 << REISERFS_ATTRS ); | ||
1046 | return; | ||
1047 | } | ||
1048 | if( !( le32_to_cpu( rs -> s_flags ) & reiserfs_attrs_cleared ) ) { | ||
1049 | reiserfs_warning(s, "reiserfs: cannot support attributes until flag is set in super-block" ); | ||
1050 | REISERFS_SB(s) -> s_mount_opt &= ~ ( 1 << REISERFS_ATTRS ); | ||
1051 | } | ||
1052 | } | ||
1053 | } | ||
1054 | |||
1055 | static int reiserfs_remount (struct super_block * s, int * mount_flags, char * arg) | ||
1056 | { | ||
1057 | struct reiserfs_super_block * rs; | ||
1058 | struct reiserfs_transaction_handle th ; | ||
1059 | unsigned long blocks; | ||
1060 | unsigned long mount_options = REISERFS_SB(s)->s_mount_opt; | ||
1061 | unsigned long safe_mask = 0; | ||
1062 | unsigned int commit_max_age = (unsigned int)-1; | ||
1063 | struct reiserfs_journal *journal = SB_JOURNAL(s); | ||
1064 | int err; | ||
1065 | #ifdef CONFIG_QUOTA | ||
1066 | int i; | ||
1067 | #endif | ||
1068 | |||
1069 | rs = SB_DISK_SUPER_BLOCK (s); | ||
1070 | |||
1071 | if (!reiserfs_parse_options(s, arg, &mount_options, &blocks, NULL, &commit_max_age)) { | ||
1072 | #ifdef CONFIG_QUOTA | ||
1073 | for (i = 0; i < MAXQUOTAS; i++) | ||
1074 | if (REISERFS_SB(s)->s_qf_names[i]) { | ||
1075 | kfree(REISERFS_SB(s)->s_qf_names[i]); | ||
1076 | REISERFS_SB(s)->s_qf_names[i] = NULL; | ||
1077 | } | ||
1078 | #endif | ||
1079 | return -EINVAL; | ||
1080 | } | ||
1081 | |||
1082 | handle_attrs(s); | ||
1083 | |||
1084 | /* Add options that are safe here */ | ||
1085 | safe_mask |= 1 << REISERFS_SMALLTAIL; | ||
1086 | safe_mask |= 1 << REISERFS_LARGETAIL; | ||
1087 | safe_mask |= 1 << REISERFS_NO_BORDER; | ||
1088 | safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION; | ||
1089 | safe_mask |= 1 << REISERFS_HASHED_RELOCATION; | ||
1090 | safe_mask |= 1 << REISERFS_TEST4; | ||
1091 | safe_mask |= 1 << REISERFS_ATTRS; | ||
1092 | safe_mask |= 1 << REISERFS_XATTRS_USER; | ||
1093 | safe_mask |= 1 << REISERFS_POSIXACL; | ||
1094 | safe_mask |= 1 << REISERFS_BARRIER_FLUSH; | ||
1095 | safe_mask |= 1 << REISERFS_BARRIER_NONE; | ||
1096 | safe_mask |= 1 << REISERFS_ERROR_RO; | ||
1097 | safe_mask |= 1 << REISERFS_ERROR_CONTINUE; | ||
1098 | safe_mask |= 1 << REISERFS_ERROR_PANIC; | ||
1099 | |||
1100 | /* Update the bitmask, taking care to keep | ||
1101 | * the bits we're not allowed to change here */ | ||
1102 | REISERFS_SB(s)->s_mount_opt = (REISERFS_SB(s)->s_mount_opt & ~safe_mask) | (mount_options & safe_mask); | ||
1103 | |||
1104 | if(commit_max_age != 0 && commit_max_age != (unsigned int)-1) { | ||
1105 | journal->j_max_commit_age = commit_max_age; | ||
1106 | journal->j_max_trans_age = commit_max_age; | ||
1107 | } | ||
1108 | else if(commit_max_age == 0) | ||
1109 | { | ||
1110 | /* 0 means restore defaults. */ | ||
1111 | journal->j_max_commit_age = journal->j_default_max_commit_age; | ||
1112 | journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; | ||
1113 | } | ||
1114 | |||
1115 | if(blocks) { | ||
1116 | int rc = reiserfs_resize(s, blocks); | ||
1117 | if (rc != 0) | ||
1118 | return rc; | ||
1119 | } | ||
1120 | |||
1121 | if (*mount_flags & MS_RDONLY) { | ||
1122 | reiserfs_xattr_init (s, *mount_flags); | ||
1123 | /* remount read-only */ | ||
1124 | if (s->s_flags & MS_RDONLY) | ||
1125 | /* it is read-only already */ | ||
1126 | return 0; | ||
1127 | /* try to remount file system with read-only permissions */ | ||
1128 | if (sb_umount_state(rs) == REISERFS_VALID_FS || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { | ||
1129 | return 0; | ||
1130 | } | ||
1131 | |||
1132 | err = journal_begin(&th, s, 10) ; | ||
1133 | if (err) | ||
1134 | return err; | ||
1135 | |||
1136 | /* Mounting a rw partition read-only. */ | ||
1137 | reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; | ||
1138 | set_sb_umount_state( rs, REISERFS_SB(s)->s_mount_state ); | ||
1139 | journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); | ||
1140 | } else { | ||
1141 | /* remount read-write */ | ||
1142 | if (!(s->s_flags & MS_RDONLY)) { | ||
1143 | reiserfs_xattr_init (s, *mount_flags); | ||
1144 | return 0; /* We are read-write already */ | ||
1145 | } | ||
1146 | |||
1147 | if (reiserfs_is_journal_aborted (journal)) | ||
1148 | return journal->j_errno; | ||
1149 | |||
1150 | handle_data_mode(s, mount_options); | ||
1151 | handle_barrier_mode(s, mount_options); | ||
1152 | REISERFS_SB(s)->s_mount_state = sb_umount_state(rs) ; | ||
1153 | s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */ | ||
1154 | err = journal_begin(&th, s, 10) ; | ||
1155 | if (err) | ||
1156 | return err; | ||
1157 | |||
1158 | /* Mount a partition which is read-only, read-write */ | ||
1159 | reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; | ||
1160 | REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); | ||
1161 | s->s_flags &= ~MS_RDONLY; | ||
1162 | set_sb_umount_state( rs, REISERFS_ERROR_FS ); | ||
1163 | /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ | ||
1164 | journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); | ||
1165 | REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS ; | ||
1166 | } | ||
1167 | /* this will force a full flush of all journal lists */ | ||
1168 | SB_JOURNAL(s)->j_must_wait = 1 ; | ||
1169 | err = journal_end(&th, s, 10) ; | ||
1170 | if (err) | ||
1171 | return err; | ||
1172 | s->s_dirt = 0; | ||
1173 | |||
1174 | if (!( *mount_flags & MS_RDONLY ) ) { | ||
1175 | finish_unfinished( s ); | ||
1176 | reiserfs_xattr_init (s, *mount_flags); | ||
1177 | } | ||
1178 | |||
1179 | return 0; | ||
1180 | } | ||
1181 | |||
1182 | /* load_bitmap_info_data - Sets up the reiserfs_bitmap_info structure from disk. | ||
1183 | * @sb - superblock for this filesystem | ||
1184 | * @bi - the bitmap info to be loaded. Requires that bi->bh is valid. | ||
1185 | * | ||
1186 | * This routine counts how many free bits there are, finding the first zero | ||
1187 | * as a side effect. Could also be implemented as a loop of test_bit() calls, or | ||
1188 | * a loop of find_first_zero_bit() calls. This implementation is similar to | ||
1189 | * find_first_zero_bit(), but doesn't return after it finds the first bit. | ||
1190 | * Should only be called on fs mount, but should be fairly efficient anyways. | ||
1191 | * | ||
1192 | * bi->first_zero_hint is considered unset if it == 0, since the bitmap itself | ||
1193 | * will * invariably occupt block 0 represented in the bitmap. The only | ||
1194 | * exception to this is when free_count also == 0, since there will be no | ||
1195 | * free blocks at all. | ||
1196 | */ | ||
1197 | |||
1198 | static void load_bitmap_info_data (struct super_block *sb, | ||
1199 | struct reiserfs_bitmap_info *bi) | ||
1200 | { | ||
1201 | unsigned long *cur = (unsigned long *)bi->bh->b_data; | ||
1202 | |||
1203 | while ((char *)cur < (bi->bh->b_data + sb->s_blocksize)) { | ||
1204 | |||
1205 | /* No need to scan if all 0's or all 1's. | ||
1206 | * Since we're only counting 0's, we can simply ignore all 1's */ | ||
1207 | if (*cur == 0) { | ||
1208 | if (bi->first_zero_hint == 0) { | ||
1209 | bi->first_zero_hint = ((char *)cur - bi->bh->b_data) << 3; | ||
1210 | } | ||
1211 | bi->free_count += sizeof(unsigned long)*8; | ||
1212 | } else if (*cur != ~0L) { | ||
1213 | int b; | ||
1214 | for (b = 0; b < sizeof(unsigned long)*8; b++) { | ||
1215 | if (!reiserfs_test_le_bit (b, cur)) { | ||
1216 | bi->free_count ++; | ||
1217 | if (bi->first_zero_hint == 0) | ||
1218 | bi->first_zero_hint = | ||
1219 | (((char *)cur - bi->bh->b_data) << 3) + b; | ||
1220 | } | ||
1221 | } | ||
1222 | } | ||
1223 | cur ++; | ||
1224 | } | ||
1225 | |||
1226 | #ifdef CONFIG_REISERFS_CHECK | ||
1227 | // This outputs a lot of unneded info on big FSes | ||
1228 | // reiserfs_warning ("bitmap loaded from block %d: %d free blocks", | ||
1229 | // bi->bh->b_blocknr, bi->free_count); | ||
1230 | #endif | ||
1231 | } | ||
1232 | |||
1233 | static int read_bitmaps (struct super_block * s) | ||
1234 | { | ||
1235 | int i, bmap_nr; | ||
1236 | |||
1237 | SB_AP_BITMAP (s) = vmalloc (sizeof (struct reiserfs_bitmap_info) * SB_BMAP_NR(s)); | ||
1238 | if (SB_AP_BITMAP (s) == 0) | ||
1239 | return 1; | ||
1240 | memset (SB_AP_BITMAP (s), 0, sizeof (struct reiserfs_bitmap_info) * SB_BMAP_NR(s)); | ||
1241 | for (i = 0, bmap_nr = REISERFS_DISK_OFFSET_IN_BYTES / s->s_blocksize + 1; | ||
1242 | i < SB_BMAP_NR(s); i++, bmap_nr = s->s_blocksize * 8 * i) { | ||
1243 | SB_AP_BITMAP (s)[i].bh = sb_getblk(s, bmap_nr); | ||
1244 | if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) | ||
1245 | ll_rw_block(READ, 1, &SB_AP_BITMAP(s)[i].bh); | ||
1246 | } | ||
1247 | for (i = 0; i < SB_BMAP_NR(s); i++) { | ||
1248 | wait_on_buffer(SB_AP_BITMAP (s)[i].bh); | ||
1249 | if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) { | ||
1250 | reiserfs_warning(s,"sh-2029: reiserfs read_bitmaps: " | ||
1251 | "bitmap block (#%lu) reading failed", | ||
1252 | SB_AP_BITMAP(s)[i].bh->b_blocknr); | ||
1253 | for (i = 0; i < SB_BMAP_NR(s); i++) | ||
1254 | brelse(SB_AP_BITMAP(s)[i].bh); | ||
1255 | vfree(SB_AP_BITMAP(s)); | ||
1256 | SB_AP_BITMAP(s) = NULL; | ||
1257 | return 1; | ||
1258 | } | ||
1259 | load_bitmap_info_data (s, SB_AP_BITMAP (s) + i); | ||
1260 | } | ||
1261 | return 0; | ||
1262 | } | ||
1263 | |||
1264 | static int read_old_bitmaps (struct super_block * s) | ||
1265 | { | ||
1266 | int i ; | ||
1267 | struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK(s); | ||
1268 | int bmp1 = (REISERFS_OLD_DISK_OFFSET_IN_BYTES / s->s_blocksize) + 1; /* first of bitmap blocks */ | ||
1269 | |||
1270 | /* read true bitmap */ | ||
1271 | SB_AP_BITMAP (s) = vmalloc (sizeof (struct reiserfs_buffer_info *) * sb_bmap_nr(rs)); | ||
1272 | if (SB_AP_BITMAP (s) == 0) | ||
1273 | return 1; | ||
1274 | |||
1275 | memset (SB_AP_BITMAP (s), 0, sizeof (struct reiserfs_buffer_info *) * sb_bmap_nr(rs)); | ||
1276 | |||
1277 | for (i = 0; i < sb_bmap_nr(rs); i ++) { | ||
1278 | SB_AP_BITMAP (s)[i].bh = sb_bread (s, bmp1 + i); | ||
1279 | if (!SB_AP_BITMAP (s)[i].bh) | ||
1280 | return 1; | ||
1281 | load_bitmap_info_data (s, SB_AP_BITMAP (s) + i); | ||
1282 | } | ||
1283 | |||
1284 | return 0; | ||
1285 | } | ||
1286 | |||
1287 | static int read_super_block (struct super_block * s, int offset) | ||
1288 | { | ||
1289 | struct buffer_head * bh; | ||
1290 | struct reiserfs_super_block * rs; | ||
1291 | int fs_blocksize; | ||
1292 | |||
1293 | |||
1294 | bh = sb_bread (s, offset / s->s_blocksize); | ||
1295 | if (!bh) { | ||
1296 | reiserfs_warning (s, "sh-2006: read_super_block: " | ||
1297 | "bread failed (dev %s, block %lu, size %lu)", | ||
1298 | reiserfs_bdevname (s), offset / s->s_blocksize, s->s_blocksize); | ||
1299 | return 1; | ||
1300 | } | ||
1301 | |||
1302 | rs = (struct reiserfs_super_block *)bh->b_data; | ||
1303 | if (!is_any_reiserfs_magic_string (rs)) { | ||
1304 | brelse (bh); | ||
1305 | return 1; | ||
1306 | } | ||
1307 | |||
1308 | // | ||
1309 | // ok, reiserfs signature (old or new) found in at the given offset | ||
1310 | // | ||
1311 | fs_blocksize = sb_blocksize(rs); | ||
1312 | brelse (bh); | ||
1313 | sb_set_blocksize (s, fs_blocksize); | ||
1314 | |||
1315 | bh = sb_bread (s, offset / s->s_blocksize); | ||
1316 | if (!bh) { | ||
1317 | reiserfs_warning (s, "sh-2007: read_super_block: " | ||
1318 | "bread failed (dev %s, block %lu, size %lu)\n", | ||
1319 | reiserfs_bdevname (s), offset / s->s_blocksize, s->s_blocksize); | ||
1320 | return 1; | ||
1321 | } | ||
1322 | |||
1323 | rs = (struct reiserfs_super_block *)bh->b_data; | ||
1324 | if (sb_blocksize(rs) != s->s_blocksize) { | ||
1325 | reiserfs_warning (s, "sh-2011: read_super_block: " | ||
1326 | "can't find a reiserfs filesystem on (dev %s, block %Lu, size %lu)\n", | ||
1327 | reiserfs_bdevname (s), (unsigned long long)bh->b_blocknr, s->s_blocksize); | ||
1328 | brelse (bh); | ||
1329 | return 1; | ||
1330 | } | ||
1331 | |||
1332 | if ( rs->s_v1.s_root_block == -1 ) { | ||
1333 | brelse(bh) ; | ||
1334 | reiserfs_warning (s, "Unfinished reiserfsck --rebuild-tree run detected. Please run\n" | ||
1335 | "reiserfsck --rebuild-tree and wait for a completion. If that fails\n" | ||
1336 | "get newer reiserfsprogs package"); | ||
1337 | return 1; | ||
1338 | } | ||
1339 | |||
1340 | SB_BUFFER_WITH_SB (s) = bh; | ||
1341 | SB_DISK_SUPER_BLOCK (s) = rs; | ||
1342 | |||
1343 | if (is_reiserfs_jr (rs)) { | ||
1344 | /* magic is of non-standard journal filesystem, look at s_version to | ||
1345 | find which format is in use */ | ||
1346 | if (sb_version(rs) == REISERFS_VERSION_2) | ||
1347 | reiserfs_warning (s, "read_super_block: found reiserfs format \"3.6\"" | ||
1348 | " with non-standard journal"); | ||
1349 | else if (sb_version(rs) == REISERFS_VERSION_1) | ||
1350 | reiserfs_warning (s, "read_super_block: found reiserfs format \"3.5\"" | ||
1351 | " with non-standard journal"); | ||
1352 | else { | ||
1353 | reiserfs_warning (s, "sh-2012: read_super_block: found unknown " | ||
1354 | "format \"%u\" of reiserfs with non-standard magic", | ||
1355 | sb_version(rs)); | ||
1356 | return 1; | ||
1357 | } | ||
1358 | } | ||
1359 | else | ||
1360 | /* s_version of standard format may contain incorrect information, | ||
1361 | so we just look at the magic string */ | ||
1362 | reiserfs_info (s, "found reiserfs format \"%s\" with standard journal\n", | ||
1363 | is_reiserfs_3_5 (rs) ? "3.5" : "3.6"); | ||
1364 | |||
1365 | s->s_op = &reiserfs_sops; | ||
1366 | s->s_export_op = &reiserfs_export_ops; | ||
1367 | #ifdef CONFIG_QUOTA | ||
1368 | s->s_qcop = &reiserfs_qctl_operations; | ||
1369 | s->dq_op = &reiserfs_quota_operations; | ||
1370 | #endif | ||
1371 | |||
1372 | /* new format is limited by the 32 bit wide i_blocks field, want to | ||
1373 | ** be one full block below that. | ||
1374 | */ | ||
1375 | s->s_maxbytes = (512LL << 32) - s->s_blocksize ; | ||
1376 | return 0; | ||
1377 | } | ||
1378 | |||
1379 | |||
1380 | |||
1381 | /* after journal replay, reread all bitmap and super blocks */ | ||
1382 | static int reread_meta_blocks(struct super_block *s) { | ||
1383 | int i ; | ||
1384 | ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))) ; | ||
1385 | wait_on_buffer(SB_BUFFER_WITH_SB(s)) ; | ||
1386 | if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { | ||
1387 | reiserfs_warning (s, "reread_meta_blocks, error reading the super") ; | ||
1388 | return 1 ; | ||
1389 | } | ||
1390 | |||
1391 | for (i = 0; i < SB_BMAP_NR(s) ; i++) { | ||
1392 | ll_rw_block(READ, 1, &(SB_AP_BITMAP(s)[i].bh)) ; | ||
1393 | wait_on_buffer(SB_AP_BITMAP(s)[i].bh) ; | ||
1394 | if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) { | ||
1395 | reiserfs_warning (s, "reread_meta_blocks, error reading bitmap block number %d at %llu", | ||
1396 | i, (unsigned long long)SB_AP_BITMAP(s)[i].bh->b_blocknr) ; | ||
1397 | return 1 ; | ||
1398 | } | ||
1399 | } | ||
1400 | return 0 ; | ||
1401 | |||
1402 | } | ||
1403 | |||
1404 | |||
1405 | ///////////////////////////////////////////////////// | ||
1406 | // hash detection stuff | ||
1407 | |||
1408 | |||
1409 | // if root directory is empty - we set default - Yura's - hash and | ||
1410 | // warn about it | ||
1411 | // FIXME: we look for only one name in a directory. If tea and yura | ||
1412 | // bith have the same value - we ask user to send report to the | ||
1413 | // mailing list | ||
1414 | static __u32 find_hash_out (struct super_block * s) | ||
1415 | { | ||
1416 | int retval; | ||
1417 | struct inode * inode; | ||
1418 | struct cpu_key key; | ||
1419 | INITIALIZE_PATH (path); | ||
1420 | struct reiserfs_dir_entry de; | ||
1421 | __u32 hash = DEFAULT_HASH; | ||
1422 | |||
1423 | inode = s->s_root->d_inode; | ||
1424 | |||
1425 | do { // Some serious "goto"-hater was there ;) | ||
1426 | u32 teahash, r5hash, yurahash; | ||
1427 | |||
1428 | make_cpu_key (&key, inode, ~0, TYPE_DIRENTRY, 3); | ||
1429 | retval = search_by_entry_key (s, &key, &path, &de); | ||
1430 | if (retval == IO_ERROR) { | ||
1431 | pathrelse (&path); | ||
1432 | return UNSET_HASH ; | ||
1433 | } | ||
1434 | if (retval == NAME_NOT_FOUND) | ||
1435 | de.de_entry_num --; | ||
1436 | set_de_name_and_namelen (&de); | ||
1437 | if (deh_offset( &(de.de_deh[de.de_entry_num]) ) == DOT_DOT_OFFSET) { | ||
1438 | /* allow override in this case */ | ||
1439 | if (reiserfs_rupasov_hash(s)) { | ||
1440 | hash = YURA_HASH ; | ||
1441 | } | ||
1442 | reiserfs_warning(s,"FS seems to be empty, autodetect " | ||
1443 | "is using the default hash"); | ||
1444 | break; | ||
1445 | } | ||
1446 | r5hash=GET_HASH_VALUE (r5_hash (de.de_name, de.de_namelen)); | ||
1447 | teahash=GET_HASH_VALUE (keyed_hash (de.de_name, de.de_namelen)); | ||
1448 | yurahash=GET_HASH_VALUE (yura_hash (de.de_name, de.de_namelen)); | ||
1449 | if ( ( (teahash == r5hash) && (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash) ) || | ||
1450 | ( (teahash == yurahash) && (yurahash == GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])))) ) || | ||
1451 | ( (r5hash == yurahash) && (yurahash == GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])))) ) ) { | ||
1452 | reiserfs_warning(s,"Unable to automatically detect hash function. " | ||
1453 | "Please mount with -o hash={tea,rupasov,r5}", | ||
1454 | reiserfs_bdevname (s)); | ||
1455 | hash = UNSET_HASH; | ||
1456 | break; | ||
1457 | } | ||
1458 | if (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])) ) == yurahash) | ||
1459 | hash = YURA_HASH; | ||
1460 | else if (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])) ) == teahash) | ||
1461 | hash = TEA_HASH; | ||
1462 | else if (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])) ) == r5hash) | ||
1463 | hash = R5_HASH; | ||
1464 | else { | ||
1465 | reiserfs_warning (s,"Unrecognised hash function"); | ||
1466 | hash = UNSET_HASH; | ||
1467 | } | ||
1468 | } while (0); | ||
1469 | |||
1470 | pathrelse (&path); | ||
1471 | return hash; | ||
1472 | } | ||
1473 | |||
1474 | // finds out which hash names are sorted with | ||
1475 | static int what_hash (struct super_block * s) | ||
1476 | { | ||
1477 | __u32 code; | ||
1478 | |||
1479 | code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s)); | ||
1480 | |||
1481 | /* reiserfs_hash_detect() == true if any of the hash mount options | ||
1482 | ** were used. We must check them to make sure the user isn't | ||
1483 | ** using a bad hash value | ||
1484 | */ | ||
1485 | if (code == UNSET_HASH || reiserfs_hash_detect(s)) | ||
1486 | code = find_hash_out (s); | ||
1487 | |||
1488 | if (code != UNSET_HASH && reiserfs_hash_detect(s)) { | ||
1489 | /* detection has found the hash, and we must check against the | ||
1490 | ** mount options | ||
1491 | */ | ||
1492 | if (reiserfs_rupasov_hash(s) && code != YURA_HASH) { | ||
1493 | reiserfs_warning (s, "Error, %s hash detected, " | ||
1494 | "unable to force rupasov hash", reiserfs_hashname(code)) ; | ||
1495 | code = UNSET_HASH ; | ||
1496 | } else if (reiserfs_tea_hash(s) && code != TEA_HASH) { | ||
1497 | reiserfs_warning (s, "Error, %s hash detected, " | ||
1498 | "unable to force tea hash", reiserfs_hashname(code)) ; | ||
1499 | code = UNSET_HASH ; | ||
1500 | } else if (reiserfs_r5_hash(s) && code != R5_HASH) { | ||
1501 | reiserfs_warning (s, "Error, %s hash detected, " | ||
1502 | "unable to force r5 hash", reiserfs_hashname(code)) ; | ||
1503 | code = UNSET_HASH ; | ||
1504 | } | ||
1505 | } else { | ||
1506 | /* find_hash_out was not called or could not determine the hash */ | ||
1507 | if (reiserfs_rupasov_hash(s)) { | ||
1508 | code = YURA_HASH ; | ||
1509 | } else if (reiserfs_tea_hash(s)) { | ||
1510 | code = TEA_HASH ; | ||
1511 | } else if (reiserfs_r5_hash(s)) { | ||
1512 | code = R5_HASH ; | ||
1513 | } | ||
1514 | } | ||
1515 | |||
1516 | /* if we are mounted RW, and we have a new valid hash code, update | ||
1517 | ** the super | ||
1518 | */ | ||
1519 | if (code != UNSET_HASH && | ||
1520 | !(s->s_flags & MS_RDONLY) && | ||
1521 | code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) { | ||
1522 | set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code); | ||
1523 | } | ||
1524 | return code; | ||
1525 | } | ||
1526 | |||
1527 | // return pointer to appropriate function | ||
1528 | static hashf_t hash_function (struct super_block * s) | ||
1529 | { | ||
1530 | switch (what_hash (s)) { | ||
1531 | case TEA_HASH: | ||
1532 | reiserfs_info (s, "Using tea hash to sort names\n"); | ||
1533 | return keyed_hash; | ||
1534 | case YURA_HASH: | ||
1535 | reiserfs_info (s, "Using rupasov hash to sort names\n"); | ||
1536 | return yura_hash; | ||
1537 | case R5_HASH: | ||
1538 | reiserfs_info (s, "Using r5 hash to sort names\n"); | ||
1539 | return r5_hash; | ||
1540 | } | ||
1541 | return NULL; | ||
1542 | } | ||
1543 | |||
1544 | // this is used to set up correct value for old partitions | ||
1545 | static int function2code (hashf_t func) | ||
1546 | { | ||
1547 | if (func == keyed_hash) | ||
1548 | return TEA_HASH; | ||
1549 | if (func == yura_hash) | ||
1550 | return YURA_HASH; | ||
1551 | if (func == r5_hash) | ||
1552 | return R5_HASH; | ||
1553 | |||
1554 | BUG() ; // should never happen | ||
1555 | |||
1556 | return 0; | ||
1557 | } | ||
1558 | |||
/*
 * Emit a reiserfs warning unless `silent` is non-zero.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement:
 * a bare `if` would capture an `else` that follows the macro call.
 */
#define SWARN(silent, s, ...)				\
	do {						\
		if (!(silent))				\
			reiserfs_warning (s, __VA_ARGS__);	\
	} while (0)
1562 | |||
1563 | static int reiserfs_fill_super (struct super_block * s, void * data, int silent) | ||
1564 | { | ||
1565 | struct inode *root_inode; | ||
1566 | int j; | ||
1567 | struct reiserfs_transaction_handle th ; | ||
1568 | int old_format = 0; | ||
1569 | unsigned long blocks; | ||
1570 | unsigned int commit_max_age = 0; | ||
1571 | int jinit_done = 0 ; | ||
1572 | struct reiserfs_iget_args args ; | ||
1573 | struct reiserfs_super_block * rs; | ||
1574 | char *jdev_name; | ||
1575 | struct reiserfs_sb_info *sbi; | ||
1576 | int errval = -EINVAL; | ||
1577 | |||
1578 | sbi = kmalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); | ||
1579 | if (!sbi) { | ||
1580 | errval = -ENOMEM; | ||
1581 | goto error; | ||
1582 | } | ||
1583 | s->s_fs_info = sbi; | ||
1584 | memset (sbi, 0, sizeof (struct reiserfs_sb_info)); | ||
1585 | /* Set default values for options: non-aggressive tails, RO on errors */ | ||
1586 | REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL); | ||
1587 | REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO); | ||
1588 | /* no preallocation minimum, be smart in | ||
1589 | reiserfs_file_write instead */ | ||
1590 | REISERFS_SB(s)->s_alloc_options.preallocmin = 0; | ||
1591 | /* Preallocate by 16 blocks (17-1) at once */ | ||
1592 | REISERFS_SB(s)->s_alloc_options.preallocsize = 17; | ||
1593 | /* Initialize the rwsem for xattr dir */ | ||
1594 | init_rwsem(&REISERFS_SB(s)->xattr_dir_sem); | ||
1595 | |||
1596 | /* setup default block allocator options */ | ||
1597 | reiserfs_init_alloc_options(s); | ||
1598 | |||
1599 | jdev_name = NULL; | ||
1600 | if (reiserfs_parse_options (s, (char *) data, &(sbi->s_mount_opt), &blocks, &jdev_name, &commit_max_age) == 0) { | ||
1601 | goto error; | ||
1602 | } | ||
1603 | |||
1604 | if (blocks) { | ||
1605 | SWARN (silent, s, "jmacd-7: reiserfs_fill_super: resize option " | ||
1606 | "for remount only"); | ||
1607 | goto error; | ||
1608 | } | ||
1609 | |||
1610 | /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */ | ||
1611 | if (!read_super_block (s, REISERFS_OLD_DISK_OFFSET_IN_BYTES)) | ||
1612 | old_format = 1; | ||
1613 | /* try new format (64-th 1k block), which can contain reiserfs super block */ | ||
1614 | else if (read_super_block (s, REISERFS_DISK_OFFSET_IN_BYTES)) { | ||
1615 | SWARN(silent, s, "sh-2021: reiserfs_fill_super: can not find reiserfs on %s", reiserfs_bdevname (s)); | ||
1616 | goto error; | ||
1617 | } | ||
1618 | |||
1619 | rs = SB_DISK_SUPER_BLOCK (s); | ||
1620 | /* Let's do basic sanity check to verify that underlying device is not | ||
1621 | smaller than the filesystem. If the check fails then abort and scream, | ||
1622 | because bad stuff will happen otherwise. */ | ||
1623 | if ( s->s_bdev && s->s_bdev->bd_inode && i_size_read(s->s_bdev->bd_inode) < sb_block_count(rs)*sb_blocksize(rs)) { | ||
1624 | SWARN (silent, s, "Filesystem on %s cannot be mounted because it is bigger than the device", reiserfs_bdevname(s)); | ||
1625 | SWARN(silent, s, "You may need to run fsck or increase size of your LVM partition"); | ||
1626 | SWARN(silent, s, "Or may be you forgot to reboot after fdisk when it told you to"); | ||
1627 | goto error; | ||
1628 | } | ||
1629 | |||
1630 | sbi->s_mount_state = SB_REISERFS_STATE(s); | ||
1631 | sbi->s_mount_state = REISERFS_VALID_FS ; | ||
1632 | |||
1633 | if (old_format ? read_old_bitmaps(s) : read_bitmaps(s)) { | ||
1634 | SWARN(silent, s, "jmacd-8: reiserfs_fill_super: unable to read bitmap"); | ||
1635 | goto error; | ||
1636 | } | ||
1637 | #ifdef CONFIG_REISERFS_CHECK | ||
1638 | SWARN (silent, s, "CONFIG_REISERFS_CHECK is set ON"); | ||
1639 | SWARN (silent, s, "- it is slow mode for debugging."); | ||
1640 | #endif | ||
1641 | |||
1642 | /* make data=ordered the default */ | ||
1643 | if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) && | ||
1644 | !reiserfs_data_writeback(s)) | ||
1645 | { | ||
1646 | REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED); | ||
1647 | } | ||
1648 | |||
1649 | if (reiserfs_data_log(s)) { | ||
1650 | reiserfs_info (s, "using journaled data mode\n"); | ||
1651 | } else if (reiserfs_data_ordered(s)) { | ||
1652 | reiserfs_info (s, "using ordered data mode\n"); | ||
1653 | } else { | ||
1654 | reiserfs_info (s, "using writeback data mode\n"); | ||
1655 | } | ||
1656 | if (reiserfs_barrier_flush(s)) { | ||
1657 | printk("reiserfs: using flush barriers\n"); | ||
1658 | } | ||
1659 | |||
1660 | // set_device_ro(s->s_dev, 1) ; | ||
1661 | if( journal_init(s, jdev_name, old_format, commit_max_age) ) { | ||
1662 | SWARN(silent, s, "sh-2022: reiserfs_fill_super: unable to initialize journal space") ; | ||
1663 | goto error ; | ||
1664 | } else { | ||
1665 | jinit_done = 1 ; /* once this is set, journal_release must be called | ||
1666 | ** if we error out of the mount | ||
1667 | */ | ||
1668 | } | ||
1669 | if (reread_meta_blocks(s)) { | ||
1670 | SWARN(silent, s, "jmacd-9: reiserfs_fill_super: unable to reread meta blocks after journal init") ; | ||
1671 | goto error ; | ||
1672 | } | ||
1673 | |||
1674 | if (replay_only (s)) | ||
1675 | goto error; | ||
1676 | |||
1677 | if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { | ||
1678 | SWARN(silent, s, "clm-7000: Detected readonly device, marking FS readonly") ; | ||
1679 | s->s_flags |= MS_RDONLY ; | ||
1680 | } | ||
1681 | args.objectid = REISERFS_ROOT_OBJECTID ; | ||
1682 | args.dirid = REISERFS_ROOT_PARENT_OBJECTID ; | ||
1683 | root_inode = iget5_locked (s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); | ||
1684 | if (!root_inode) { | ||
1685 | SWARN(silent, s, "jmacd-10: reiserfs_fill_super: get root inode failed"); | ||
1686 | goto error; | ||
1687 | } | ||
1688 | |||
1689 | if (root_inode->i_state & I_NEW) { | ||
1690 | reiserfs_read_locked_inode(root_inode, &args); | ||
1691 | unlock_new_inode(root_inode); | ||
1692 | } | ||
1693 | |||
1694 | s->s_root = d_alloc_root(root_inode); | ||
1695 | if (!s->s_root) { | ||
1696 | iput(root_inode); | ||
1697 | goto error; | ||
1698 | } | ||
1699 | |||
1700 | // define and initialize hash function | ||
1701 | sbi->s_hash_function = hash_function (s); | ||
1702 | if (sbi->s_hash_function == NULL) { | ||
1703 | dput(s->s_root) ; | ||
1704 | s->s_root = NULL ; | ||
1705 | goto error ; | ||
1706 | } | ||
1707 | |||
1708 | if (is_reiserfs_3_5 (rs) || (is_reiserfs_jr (rs) && SB_VERSION (s) == REISERFS_VERSION_1)) | ||
1709 | set_bit(REISERFS_3_5, &(sbi->s_properties)); | ||
1710 | else | ||
1711 | set_bit(REISERFS_3_6, &(sbi->s_properties)); | ||
1712 | |||
1713 | if (!(s->s_flags & MS_RDONLY)) { | ||
1714 | |||
1715 | errval = journal_begin(&th, s, 1) ; | ||
1716 | if (errval) { | ||
1717 | dput (s->s_root); | ||
1718 | s->s_root = NULL; | ||
1719 | goto error; | ||
1720 | } | ||
1721 | reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; | ||
1722 | |||
1723 | set_sb_umount_state( rs, REISERFS_ERROR_FS ); | ||
1724 | set_sb_fs_state (rs, 0); | ||
1725 | |||
1726 | if (old_format_only(s)) { | ||
1727 | /* filesystem of format 3.5 either with standard or non-standard | ||
1728 | journal */ | ||
1729 | if (convert_reiserfs (s)) { | ||
1730 | /* and -o conv is given */ | ||
1731 | if(!silent) | ||
1732 | reiserfs_info (s,"converting 3.5 filesystem to the 3.6 format") ; | ||
1733 | |||
1734 | if (is_reiserfs_3_5 (rs)) | ||
1735 | /* put magic string of 3.6 format. 2.2 will not be able to | ||
1736 | mount this filesystem anymore */ | ||
1737 | memcpy (rs->s_v1.s_magic, reiserfs_3_6_magic_string, | ||
1738 | sizeof (reiserfs_3_6_magic_string)); | ||
1739 | |||
1740 | set_sb_version(rs,REISERFS_VERSION_2); | ||
1741 | reiserfs_convert_objectid_map_v1(s) ; | ||
1742 | set_bit(REISERFS_3_6, &(sbi->s_properties)); | ||
1743 | clear_bit(REISERFS_3_5, &(sbi->s_properties)); | ||
1744 | } else if (!silent){ | ||
1745 | reiserfs_info (s, "using 3.5.x disk format\n") ; | ||
1746 | } | ||
1747 | } | ||
1748 | |||
1749 | journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); | ||
1750 | errval = journal_end(&th, s, 1) ; | ||
1751 | if (errval) { | ||
1752 | dput (s->s_root); | ||
1753 | s->s_root = NULL; | ||
1754 | goto error; | ||
1755 | } | ||
1756 | |||
1757 | if ((errval = reiserfs_xattr_init (s, s->s_flags))) { | ||
1758 | dput (s->s_root); | ||
1759 | s->s_root = NULL; | ||
1760 | goto error; | ||
1761 | } | ||
1762 | |||
1763 | /* look for files which were to be removed in previous session */ | ||
1764 | finish_unfinished (s); | ||
1765 | } else { | ||
1766 | if ( old_format_only(s) && !silent) { | ||
1767 | reiserfs_info (s, "using 3.5.x disk format\n") ; | ||
1768 | } | ||
1769 | |||
1770 | if ((errval = reiserfs_xattr_init (s, s->s_flags))) { | ||
1771 | dput (s->s_root); | ||
1772 | s->s_root = NULL; | ||
1773 | goto error; | ||
1774 | } | ||
1775 | } | ||
1776 | // mark hash in super block: it could be unset. overwrite should be ok | ||
1777 | set_sb_hash_function_code( rs, function2code(sbi->s_hash_function ) ); | ||
1778 | |||
1779 | handle_attrs( s ); | ||
1780 | |||
1781 | reiserfs_proc_info_init( s ); | ||
1782 | |||
1783 | init_waitqueue_head (&(sbi->s_wait)); | ||
1784 | spin_lock_init(&sbi->bitmap_lock); | ||
1785 | |||
1786 | return (0); | ||
1787 | |||
1788 | error: | ||
1789 | if (jinit_done) { /* kill the commit thread, free journal ram */ | ||
1790 | journal_release_error(NULL, s) ; | ||
1791 | } | ||
1792 | if (SB_DISK_SUPER_BLOCK (s)) { | ||
1793 | for (j = 0; j < SB_BMAP_NR (s); j ++) { | ||
1794 | if (SB_AP_BITMAP (s)) | ||
1795 | brelse (SB_AP_BITMAP (s)[j].bh); | ||
1796 | } | ||
1797 | if (SB_AP_BITMAP (s)) | ||
1798 | vfree (SB_AP_BITMAP (s)); | ||
1799 | } | ||
1800 | if (SB_BUFFER_WITH_SB (s)) | ||
1801 | brelse(SB_BUFFER_WITH_SB (s)); | ||
1802 | #ifdef CONFIG_QUOTA | ||
1803 | for (j = 0; j < MAXQUOTAS; j++) { | ||
1804 | if (sbi->s_qf_names[j]) | ||
1805 | kfree(sbi->s_qf_names[j]); | ||
1806 | } | ||
1807 | #endif | ||
1808 | if (sbi != NULL) { | ||
1809 | kfree(sbi); | ||
1810 | } | ||
1811 | |||
1812 | s->s_fs_info = NULL; | ||
1813 | return errval; | ||
1814 | } | ||
1815 | |||
1816 | |||
1817 | static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf) | ||
1818 | { | ||
1819 | struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); | ||
1820 | |||
1821 | buf->f_namelen = (REISERFS_MAX_NAME (s->s_blocksize)); | ||
1822 | buf->f_bfree = sb_free_blocks(rs); | ||
1823 | buf->f_bavail = buf->f_bfree; | ||
1824 | buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1; | ||
1825 | buf->f_bsize = s->s_blocksize; | ||
1826 | /* changed to accommodate gcc folks.*/ | ||
1827 | buf->f_type = REISERFS_SUPER_MAGIC; | ||
1828 | return 0; | ||
1829 | } | ||
1830 | |||
1831 | #ifdef CONFIG_QUOTA | ||
1832 | static int reiserfs_dquot_initialize(struct inode *inode, int type) | ||
1833 | { | ||
1834 | struct reiserfs_transaction_handle th; | ||
1835 | int ret; | ||
1836 | |||
1837 | /* We may create quota structure so we need to reserve enough blocks */ | ||
1838 | reiserfs_write_lock(inode->i_sb); | ||
1839 | journal_begin(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS); | ||
1840 | ret = dquot_initialize(inode, type); | ||
1841 | journal_end(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS); | ||
1842 | reiserfs_write_unlock(inode->i_sb); | ||
1843 | return ret; | ||
1844 | } | ||
1845 | |||
1846 | static int reiserfs_dquot_drop(struct inode *inode) | ||
1847 | { | ||
1848 | struct reiserfs_transaction_handle th; | ||
1849 | int ret; | ||
1850 | |||
1851 | /* We may delete quota structure so we need to reserve enough blocks */ | ||
1852 | reiserfs_write_lock(inode->i_sb); | ||
1853 | journal_begin(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS); | ||
1854 | ret = dquot_drop(inode); | ||
1855 | journal_end(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS); | ||
1856 | reiserfs_write_unlock(inode->i_sb); | ||
1857 | return ret; | ||
1858 | } | ||
1859 | |||
1860 | static int reiserfs_write_dquot(struct dquot *dquot) | ||
1861 | { | ||
1862 | struct reiserfs_transaction_handle th; | ||
1863 | int ret; | ||
1864 | |||
1865 | reiserfs_write_lock(dquot->dq_sb); | ||
1866 | journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS); | ||
1867 | ret = dquot_commit(dquot); | ||
1868 | journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS); | ||
1869 | reiserfs_write_unlock(dquot->dq_sb); | ||
1870 | return ret; | ||
1871 | } | ||
1872 | |||
1873 | static int reiserfs_acquire_dquot(struct dquot *dquot) | ||
1874 | { | ||
1875 | struct reiserfs_transaction_handle th; | ||
1876 | int ret; | ||
1877 | |||
1878 | reiserfs_write_lock(dquot->dq_sb); | ||
1879 | journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS); | ||
1880 | ret = dquot_acquire(dquot); | ||
1881 | journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS); | ||
1882 | reiserfs_write_unlock(dquot->dq_sb); | ||
1883 | return ret; | ||
1884 | } | ||
1885 | |||
1886 | static int reiserfs_release_dquot(struct dquot *dquot) | ||
1887 | { | ||
1888 | struct reiserfs_transaction_handle th; | ||
1889 | int ret; | ||
1890 | |||
1891 | reiserfs_write_lock(dquot->dq_sb); | ||
1892 | journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS); | ||
1893 | ret = dquot_release(dquot); | ||
1894 | journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS); | ||
1895 | reiserfs_write_unlock(dquot->dq_sb); | ||
1896 | return ret; | ||
1897 | } | ||
1898 | |||
1899 | static int reiserfs_mark_dquot_dirty(struct dquot *dquot) | ||
1900 | { | ||
1901 | /* Are we journalling quotas? */ | ||
1902 | if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || | ||
1903 | REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { | ||
1904 | dquot_mark_dquot_dirty(dquot); | ||
1905 | return reiserfs_write_dquot(dquot); | ||
1906 | } | ||
1907 | else | ||
1908 | return dquot_mark_dquot_dirty(dquot); | ||
1909 | } | ||
1910 | |||
1911 | static int reiserfs_write_info(struct super_block *sb, int type) | ||
1912 | { | ||
1913 | struct reiserfs_transaction_handle th; | ||
1914 | int ret; | ||
1915 | |||
1916 | /* Data block + inode block */ | ||
1917 | reiserfs_write_lock(sb); | ||
1918 | journal_begin(&th, sb, 2); | ||
1919 | ret = dquot_commit_info(sb, type); | ||
1920 | journal_end(&th, sb, 2); | ||
1921 | reiserfs_write_unlock(sb); | ||
1922 | return ret; | ||
1923 | } | ||
1924 | |||
1925 | /* | ||
1926 | * Turn on quotas during mount time - we need to find | ||
1927 | * the quota file and such... | ||
1928 | */ | ||
1929 | static int reiserfs_quota_on_mount(struct super_block *sb, int type) | ||
1930 | { | ||
1931 | int err; | ||
1932 | struct dentry *dentry; | ||
1933 | struct qstr name = { .name = REISERFS_SB(sb)->s_qf_names[type], | ||
1934 | .hash = 0, | ||
1935 | .len = strlen(REISERFS_SB(sb)->s_qf_names[type])}; | ||
1936 | |||
1937 | dentry = lookup_hash(&name, sb->s_root); | ||
1938 | if (IS_ERR(dentry)) | ||
1939 | return PTR_ERR(dentry); | ||
1940 | err = vfs_quota_on_mount(type, REISERFS_SB(sb)->s_jquota_fmt, dentry); | ||
1941 | /* Now invalidate and put the dentry - quota got its own reference | ||
1942 | * to inode and dentry has at least wrong hash so we had better | ||
1943 | * throw it away */ | ||
1944 | d_invalidate(dentry); | ||
1945 | dput(dentry); | ||
1946 | return err; | ||
1947 | } | ||
1948 | |||
1949 | /* | ||
1950 | * Standard function to be called on quota_on | ||
1951 | */ | ||
1952 | static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, char *path) | ||
1953 | { | ||
1954 | int err; | ||
1955 | struct nameidata nd; | ||
1956 | |||
1957 | err = path_lookup(path, LOOKUP_FOLLOW, &nd); | ||
1958 | if (err) | ||
1959 | return err; | ||
1960 | /* Quotafile not on the same filesystem? */ | ||
1961 | if (nd.mnt->mnt_sb != sb) { | ||
1962 | path_release(&nd); | ||
1963 | return -EXDEV; | ||
1964 | } | ||
1965 | /* We must not pack tails for quota files on reiserfs for quota IO to work */ | ||
1966 | if (!REISERFS_I(nd.dentry->d_inode)->i_flags & i_nopack_mask) { | ||
1967 | reiserfs_warning(sb, "reiserfs: Quota file must have tail packing disabled."); | ||
1968 | path_release(&nd); | ||
1969 | return -EINVAL; | ||
1970 | } | ||
1971 | /* Not journalling quota? No more tests needed... */ | ||
1972 | if (!REISERFS_SB(sb)->s_qf_names[USRQUOTA] && | ||
1973 | !REISERFS_SB(sb)->s_qf_names[GRPQUOTA]) { | ||
1974 | path_release(&nd); | ||
1975 | return vfs_quota_on(sb, type, format_id, path); | ||
1976 | } | ||
1977 | /* Quotafile not of fs root? */ | ||
1978 | if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode) | ||
1979 | reiserfs_warning(sb, "reiserfs: Quota file not on filesystem root. " | ||
1980 | "Journalled quota will not work."); | ||
1981 | path_release(&nd); | ||
1982 | return vfs_quota_on(sb, type, format_id, path); | ||
1983 | } | ||
1984 | |||
1985 | /* Read data from quotafile - avoid pagecache and such because we cannot afford | ||
1986 | * acquiring the locks... As quota files are never truncated and quota code | ||
1987 | * itself serializes the operations (and noone else should touch the files) | ||
1988 | * we don't have to be afraid of races */ | ||
1989 | static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, | ||
1990 | size_t len, loff_t off) | ||
1991 | { | ||
1992 | struct inode *inode = sb_dqopt(sb)->files[type]; | ||
1993 | unsigned long blk = off >> sb->s_blocksize_bits; | ||
1994 | int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; | ||
1995 | size_t toread; | ||
1996 | struct buffer_head tmp_bh, *bh; | ||
1997 | loff_t i_size = i_size_read(inode); | ||
1998 | |||
1999 | if (off > i_size) | ||
2000 | return 0; | ||
2001 | if (off+len > i_size) | ||
2002 | len = i_size-off; | ||
2003 | toread = len; | ||
2004 | while (toread > 0) { | ||
2005 | tocopy = sb->s_blocksize - offset < toread ? sb->s_blocksize - offset : toread; | ||
2006 | tmp_bh.b_state = 0; | ||
2007 | /* Quota files are without tails so we can safely use this function */ | ||
2008 | reiserfs_write_lock(sb); | ||
2009 | err = reiserfs_get_block(inode, blk, &tmp_bh, 0); | ||
2010 | reiserfs_write_unlock(sb); | ||
2011 | if (err) | ||
2012 | return err; | ||
2013 | if (!buffer_mapped(&tmp_bh)) /* A hole? */ | ||
2014 | memset(data, 0, tocopy); | ||
2015 | else { | ||
2016 | bh = sb_bread(sb, tmp_bh.b_blocknr); | ||
2017 | if (!bh) | ||
2018 | return -EIO; | ||
2019 | memcpy(data, bh->b_data+offset, tocopy); | ||
2020 | brelse(bh); | ||
2021 | } | ||
2022 | offset = 0; | ||
2023 | toread -= tocopy; | ||
2024 | data += tocopy; | ||
2025 | blk++; | ||
2026 | } | ||
2027 | return len; | ||
2028 | } | ||
2029 | |||
2030 | /* Write to quotafile (we know the transaction is already started and has | ||
2031 | * enough credits) */ | ||
2032 | static ssize_t reiserfs_quota_write(struct super_block *sb, int type, | ||
2033 | const char *data, size_t len, loff_t off) | ||
2034 | { | ||
2035 | struct inode *inode = sb_dqopt(sb)->files[type]; | ||
2036 | unsigned long blk = off >> sb->s_blocksize_bits; | ||
2037 | int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; | ||
2038 | int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL; | ||
2039 | size_t towrite = len; | ||
2040 | struct buffer_head tmp_bh, *bh; | ||
2041 | |||
2042 | down(&inode->i_sem); | ||
2043 | while (towrite > 0) { | ||
2044 | tocopy = sb->s_blocksize - offset < towrite ? | ||
2045 | sb->s_blocksize - offset : towrite; | ||
2046 | tmp_bh.b_state = 0; | ||
2047 | err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); | ||
2048 | if (err) | ||
2049 | goto out; | ||
2050 | if (offset || tocopy != sb->s_blocksize) | ||
2051 | bh = sb_bread(sb, tmp_bh.b_blocknr); | ||
2052 | else | ||
2053 | bh = sb_getblk(sb, tmp_bh.b_blocknr); | ||
2054 | if (!bh) { | ||
2055 | err = -EIO; | ||
2056 | goto out; | ||
2057 | } | ||
2058 | lock_buffer(bh); | ||
2059 | memcpy(bh->b_data+offset, data, tocopy); | ||
2060 | flush_dcache_page(bh->b_page); | ||
2061 | set_buffer_uptodate(bh); | ||
2062 | unlock_buffer(bh); | ||
2063 | reiserfs_prepare_for_journal(sb, bh, 1); | ||
2064 | journal_mark_dirty(current->journal_info, sb, bh); | ||
2065 | if (!journal_quota) | ||
2066 | reiserfs_add_ordered_list(inode, bh); | ||
2067 | brelse(bh); | ||
2068 | offset = 0; | ||
2069 | towrite -= tocopy; | ||
2070 | data += tocopy; | ||
2071 | blk++; | ||
2072 | } | ||
2073 | out: | ||
2074 | if (len == towrite) | ||
2075 | return err; | ||
2076 | if (inode->i_size < off+len-towrite) | ||
2077 | i_size_write(inode, off+len-towrite); | ||
2078 | inode->i_version++; | ||
2079 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
2080 | mark_inode_dirty(inode); | ||
2081 | up(&inode->i_sem); | ||
2082 | return len - towrite; | ||
2083 | } | ||
2084 | |||
2085 | #endif | ||
2086 | |||
2087 | static struct super_block* | ||
2088 | get_super_block (struct file_system_type *fs_type, int flags, | ||
2089 | const char *dev_name, void *data) | ||
2090 | { | ||
2091 | return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super); | ||
2092 | } | ||
2093 | |||
2094 | static int __init | ||
2095 | init_reiserfs_fs ( void ) | ||
2096 | { | ||
2097 | int ret; | ||
2098 | |||
2099 | if ((ret = init_inodecache ())) { | ||
2100 | return ret; | ||
2101 | } | ||
2102 | |||
2103 | if ((ret = reiserfs_xattr_register_handlers ())) | ||
2104 | goto failed_reiserfs_xattr_register_handlers; | ||
2105 | |||
2106 | reiserfs_proc_info_global_init (); | ||
2107 | reiserfs_proc_register_global ("version", reiserfs_global_version_in_proc); | ||
2108 | |||
2109 | ret = register_filesystem (& reiserfs_fs_type); | ||
2110 | |||
2111 | if (ret == 0) { | ||
2112 | return 0; | ||
2113 | } | ||
2114 | |||
2115 | reiserfs_xattr_unregister_handlers (); | ||
2116 | |||
2117 | failed_reiserfs_xattr_register_handlers: | ||
2118 | reiserfs_proc_unregister_global ("version"); | ||
2119 | reiserfs_proc_info_global_done (); | ||
2120 | destroy_inodecache (); | ||
2121 | |||
2122 | return ret; | ||
2123 | } | ||
2124 | |||
2125 | static void __exit | ||
2126 | exit_reiserfs_fs ( void ) | ||
2127 | { | ||
2128 | reiserfs_xattr_unregister_handlers (); | ||
2129 | reiserfs_proc_unregister_global ("version"); | ||
2130 | reiserfs_proc_info_global_done (); | ||
2131 | unregister_filesystem (& reiserfs_fs_type); | ||
2132 | destroy_inodecache (); | ||
2133 | } | ||
2134 | |||
2135 | struct file_system_type reiserfs_fs_type = { | ||
2136 | .owner = THIS_MODULE, | ||
2137 | .name = "reiserfs", | ||
2138 | .get_sb = get_super_block, | ||
2139 | .kill_sb = kill_block_super, | ||
2140 | .fs_flags = FS_REQUIRES_DEV, | ||
2141 | }; | ||
2142 | |||
2143 | MODULE_DESCRIPTION ("ReiserFS journaled filesystem"); | ||
2144 | MODULE_AUTHOR ("Hans Reiser <reiser@namesys.com>"); | ||
2145 | MODULE_LICENSE ("GPL"); | ||
2146 | |||
2147 | module_init (init_reiserfs_fs); | ||
2148 | module_exit (exit_reiserfs_fs); | ||
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c new file mode 100644 index 000000000000..6191909d5165 --- /dev/null +++ b/fs/reiserfs/tail_conversion.c | |||
@@ -0,0 +1,276 @@ | |||
1 | /* | ||
2 | * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright details | ||
3 | */ | ||
4 | |||
5 | #include <linux/config.h> | ||
6 | #include <linux/time.h> | ||
7 | #include <linux/pagemap.h> | ||
8 | #include <linux/buffer_head.h> | ||
9 | #include <linux/reiserfs_fs.h> | ||
10 | |||
11 | /* access to tail : when one is going to read a tail, it must make sure that no tail conversion is running. | ||
12 | direct2indirect and indirect2direct can not run concurrently */ | ||
13 | |||
14 | |||
15 | /* Converts direct items to an unformatted node. Panics if file has no | ||
16 | tail. -ENOSPC if no disk space for conversion */ | ||
17 | /* path points to first direct item of the file regarless of how many of | ||
18 | them are there */ | ||
19 | int direct2indirect (struct reiserfs_transaction_handle *th, struct inode * inode, | ||
20 | struct path * path, struct buffer_head * unbh, | ||
21 | loff_t tail_offset) | ||
22 | { | ||
23 | struct super_block * sb = inode->i_sb; | ||
24 | struct buffer_head *up_to_date_bh ; | ||
25 | struct item_head * p_le_ih = PATH_PITEM_HEAD (path); | ||
26 | unsigned long total_tail = 0 ; | ||
27 | struct cpu_key end_key; /* Key to search for the last byte of the | ||
28 | converted item. */ | ||
29 | struct item_head ind_ih; /* new indirect item to be inserted or | ||
30 | key of unfm pointer to be pasted */ | ||
31 | int n_blk_size, | ||
32 | n_retval; /* returned value for reiserfs_insert_item and clones */ | ||
33 | unp_t unfm_ptr; /* Handle on an unformatted node | ||
34 | that will be inserted in the | ||
35 | tree. */ | ||
36 | |||
37 | BUG_ON (!th->t_trans_id); | ||
38 | |||
39 | REISERFS_SB(sb)->s_direct2indirect ++; | ||
40 | |||
41 | n_blk_size = sb->s_blocksize; | ||
42 | |||
43 | /* and key to search for append or insert pointer to the new | ||
44 | unformatted node. */ | ||
45 | copy_item_head (&ind_ih, p_le_ih); | ||
46 | set_le_ih_k_offset (&ind_ih, tail_offset); | ||
47 | set_le_ih_k_type (&ind_ih, TYPE_INDIRECT); | ||
48 | |||
49 | /* Set the key to search for the place for new unfm pointer */ | ||
50 | make_cpu_key (&end_key, inode, tail_offset, TYPE_INDIRECT, 4); | ||
51 | |||
52 | // FIXME: we could avoid this | ||
53 | if ( search_for_position_by_key (sb, &end_key, path) == POSITION_FOUND ) { | ||
54 | reiserfs_warning (sb, "PAP-14030: direct2indirect: " | ||
55 | "pasted or inserted byte exists in the tree %K. " | ||
56 | "Use fsck to repair.", &end_key); | ||
57 | pathrelse(path); | ||
58 | return -EIO; | ||
59 | } | ||
60 | |||
61 | p_le_ih = PATH_PITEM_HEAD (path); | ||
62 | |||
63 | unfm_ptr = cpu_to_le32 (unbh->b_blocknr); | ||
64 | |||
65 | if ( is_statdata_le_ih (p_le_ih) ) { | ||
66 | /* Insert new indirect item. */ | ||
67 | set_ih_free_space (&ind_ih, 0); /* delete at nearest future */ | ||
68 | put_ih_item_len( &ind_ih, UNFM_P_SIZE ); | ||
69 | PATH_LAST_POSITION (path)++; | ||
70 | n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih, inode, | ||
71 | (char *)&unfm_ptr); | ||
72 | } else { | ||
73 | /* Paste into last indirect item of an object. */ | ||
74 | n_retval = reiserfs_paste_into_item(th, path, &end_key, inode, | ||
75 | (char *)&unfm_ptr, UNFM_P_SIZE); | ||
76 | } | ||
77 | if ( n_retval ) { | ||
78 | return n_retval; | ||
79 | } | ||
80 | |||
81 | // note: from here there are two keys which have matching first | ||
82 | // three key components. They only differ by the fourth one. | ||
83 | |||
84 | |||
85 | /* Set the key to search for the direct items of the file */ | ||
86 | make_cpu_key (&end_key, inode, max_reiserfs_offset (inode), TYPE_DIRECT, 4); | ||
87 | |||
88 | /* Move bytes from the direct items to the new unformatted node | ||
89 | and delete them. */ | ||
90 | while (1) { | ||
91 | int tail_size; | ||
92 | |||
93 | /* end_key.k_offset is set so, that we will always have found | ||
94 | last item of the file */ | ||
95 | if ( search_for_position_by_key (sb, &end_key, path) == POSITION_FOUND ) | ||
96 | reiserfs_panic (sb, "PAP-14050: direct2indirect: " | ||
97 | "direct item (%K) not found", &end_key); | ||
98 | p_le_ih = PATH_PITEM_HEAD (path); | ||
99 | RFALSE( !is_direct_le_ih (p_le_ih), | ||
100 | "vs-14055: direct item expected(%K), found %h", | ||
101 | &end_key, p_le_ih); | ||
102 | tail_size = (le_ih_k_offset (p_le_ih) & (n_blk_size - 1)) | ||
103 | + ih_item_len(p_le_ih) - 1; | ||
104 | |||
105 | /* we only send the unbh pointer if the buffer is not up to date. | ||
106 | ** this avoids overwriting good data from writepage() with old data | ||
107 | ** from the disk or buffer cache | ||
108 | ** Special case: unbh->b_page will be NULL if we are coming through | ||
109 | ** DIRECT_IO handler here. | ||
110 | */ | ||
111 | if (!unbh->b_page || buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) { | ||
112 | up_to_date_bh = NULL ; | ||
113 | } else { | ||
114 | up_to_date_bh = unbh ; | ||
115 | } | ||
116 | n_retval = reiserfs_delete_item (th, path, &end_key, inode, | ||
117 | up_to_date_bh) ; | ||
118 | |||
119 | total_tail += n_retval ; | ||
120 | if (tail_size == n_retval) | ||
121 | // done: file does not have direct items anymore | ||
122 | break; | ||
123 | |||
124 | } | ||
125 | /* if we've copied bytes from disk into the page, we need to zero | ||
126 | ** out the unused part of the block (it was not up to date before) | ||
127 | */ | ||
128 | if (up_to_date_bh) { | ||
129 | unsigned pgoff = (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1); | ||
130 | char *kaddr=kmap_atomic(up_to_date_bh->b_page, KM_USER0); | ||
131 | memset(kaddr + pgoff, 0, n_blk_size - total_tail) ; | ||
132 | kunmap_atomic(kaddr, KM_USER0); | ||
133 | } | ||
134 | |||
135 | REISERFS_I(inode)->i_first_direct_byte = U32_MAX; | ||
136 | |||
137 | return 0; | ||
138 | } | ||
139 | |||
140 | |||
141 | /* stolen from fs/buffer.c */ | ||
142 | void reiserfs_unmap_buffer(struct buffer_head *bh) { | ||
143 | lock_buffer(bh) ; | ||
144 | if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { | ||
145 | BUG() ; | ||
146 | } | ||
147 | clear_buffer_dirty(bh) ; | ||
148 | /* Remove the buffer from whatever list it belongs to. We are mostly | ||
149 | interested in removing it from per-sb j_dirty_buffers list, to avoid | ||
150 | BUG() on attempt to write not mapped buffer */ | ||
151 | if ( (!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { | ||
152 | struct inode *inode = bh->b_page->mapping->host; | ||
153 | struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); | ||
154 | spin_lock(&j->j_dirty_buffers_lock); | ||
155 | list_del_init(&bh->b_assoc_buffers); | ||
156 | reiserfs_free_jh(bh); | ||
157 | spin_unlock(&j->j_dirty_buffers_lock); | ||
158 | } | ||
159 | clear_buffer_mapped(bh) ; | ||
160 | clear_buffer_req(bh) ; | ||
161 | clear_buffer_new(bh); | ||
162 | bh->b_bdev = NULL; | ||
163 | unlock_buffer(bh) ; | ||
164 | } | ||
165 | |||
166 | /* this first locks inode (neither reads nor sync are permitted), | ||
167 | reads tail through page cache, insert direct item. When direct item | ||
168 | inserted successfully inode is left locked. Return value is always | ||
169 | what we expect from it (number of cut bytes). But when tail remains | ||
170 | in the unformatted node, we set mode to SKIP_BALANCING and unlock | ||
171 | inode */ | ||
172 | int indirect2direct (struct reiserfs_transaction_handle *th, | ||
173 | struct inode * p_s_inode, | ||
174 | struct page *page, | ||
175 | struct path * p_s_path, /* path to the indirect item. */ | ||
176 | const struct cpu_key * p_s_item_key, /* Key to look for unformatted node pointer to be cut. */ | ||
177 | loff_t n_new_file_size, /* New file size. */ | ||
178 | char * p_c_mode) | ||
179 | { | ||
180 | struct super_block * p_s_sb = p_s_inode->i_sb; | ||
181 | struct item_head s_ih; | ||
182 | unsigned long n_block_size = p_s_sb->s_blocksize; | ||
183 | char * tail; | ||
184 | int tail_len, round_tail_len; | ||
185 | loff_t pos, pos1; /* position of first byte of the tail */ | ||
186 | struct cpu_key key; | ||
187 | |||
188 | BUG_ON (!th->t_trans_id); | ||
189 | |||
190 | REISERFS_SB(p_s_sb)->s_indirect2direct ++; | ||
191 | |||
192 | *p_c_mode = M_SKIP_BALANCING; | ||
193 | |||
194 | /* store item head path points to. */ | ||
195 | copy_item_head (&s_ih, PATH_PITEM_HEAD(p_s_path)); | ||
196 | |||
197 | tail_len = (n_new_file_size & (n_block_size - 1)); | ||
198 | if (get_inode_sd_version (p_s_inode) == STAT_DATA_V2) | ||
199 | round_tail_len = ROUND_UP (tail_len); | ||
200 | else | ||
201 | round_tail_len = tail_len; | ||
202 | |||
203 | pos = le_ih_k_offset (&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - 1) * p_s_sb->s_blocksize; | ||
204 | pos1 = pos; | ||
205 | |||
206 | // we are protected by i_sem. The tail can not disapper, not | ||
207 | // append can be done either | ||
208 | // we are in truncate or packing tail in file_release | ||
209 | |||
210 | tail = (char *)kmap(page) ; /* this can schedule */ | ||
211 | |||
212 | if (path_changed (&s_ih, p_s_path)) { | ||
213 | /* re-search indirect item */ | ||
214 | if ( search_for_position_by_key (p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND ) | ||
215 | reiserfs_panic(p_s_sb, "PAP-5520: indirect2direct: " | ||
216 | "item to be converted %K does not exist", p_s_item_key); | ||
217 | copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); | ||
218 | #ifdef CONFIG_REISERFS_CHECK | ||
219 | pos = le_ih_k_offset (&s_ih) - 1 + | ||
220 | (ih_item_len(&s_ih) / UNFM_P_SIZE - 1) * p_s_sb->s_blocksize; | ||
221 | if (pos != pos1) | ||
222 | reiserfs_panic (p_s_sb, "vs-5530: indirect2direct: " | ||
223 | "tail position changed while we were reading it"); | ||
224 | #endif | ||
225 | } | ||
226 | |||
227 | |||
228 | /* Set direct item header to insert. */ | ||
229 | make_le_item_head (&s_ih, NULL, get_inode_item_key_version (p_s_inode), pos1 + 1, | ||
230 | TYPE_DIRECT, round_tail_len, 0xffff/*ih_free_space*/); | ||
231 | |||
232 | /* we want a pointer to the first byte of the tail in the page. | ||
233 | ** the page was locked and this part of the page was up to date when | ||
234 | ** indirect2direct was called, so we know the bytes are still valid | ||
235 | */ | ||
236 | tail = tail + (pos & (PAGE_CACHE_SIZE - 1)) ; | ||
237 | |||
238 | PATH_LAST_POSITION(p_s_path)++; | ||
239 | |||
240 | key = *p_s_item_key; | ||
241 | set_cpu_key_k_type (&key, TYPE_DIRECT); | ||
242 | key.key_length = 4; | ||
243 | /* Insert tail as new direct item in the tree */ | ||
244 | if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih, p_s_inode, | ||
245 | tail ? tail : NULL) < 0 ) { | ||
246 | /* No disk memory. So we can not convert last unformatted node | ||
247 | to the direct item. In this case we used to adjust | ||
248 | indirect items's ih_free_space. Now ih_free_space is not | ||
249 | used, it would be ideal to write zeros to corresponding | ||
250 | unformatted node. For now i_size is considered as guard for | ||
251 | going out of file size */ | ||
252 | kunmap(page) ; | ||
253 | return n_block_size - round_tail_len; | ||
254 | } | ||
255 | kunmap(page) ; | ||
256 | |||
257 | /* make sure to get the i_blocks changes from reiserfs_insert_item */ | ||
258 | reiserfs_update_sd(th, p_s_inode); | ||
259 | |||
260 | // note: we have now the same as in above direct2indirect | ||
261 | // conversion: there are two keys which have matching first three | ||
262 | // key components. They only differ by the fouhth one. | ||
263 | |||
264 | /* We have inserted new direct item and must remove last | ||
265 | unformatted node. */ | ||
266 | *p_c_mode = M_CUT; | ||
267 | |||
268 | /* we store position of first direct item in the in-core inode */ | ||
269 | //mark_file_with_tail (p_s_inode, pos1 + 1); | ||
270 | REISERFS_I(p_s_inode)->i_first_direct_byte = pos1 + 1; | ||
271 | |||
272 | return n_block_size - round_tail_len; | ||
273 | } | ||
274 | |||
275 | |||
276 | |||
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c new file mode 100644 index 000000000000..45582fe8b466 --- /dev/null +++ b/fs/reiserfs/xattr.c | |||
@@ -0,0 +1,1450 @@ | |||
1 | /* | ||
2 | * linux/fs/reiserfs/xattr.c | ||
3 | * | ||
4 | * Copyright (c) 2002 by Jeff Mahoney, <jeffm@suse.com> | ||
5 | * | ||
6 | */ | ||
7 | |||
8 | /* | ||
9 | * In order to implement EA/ACLs in a clean, backwards compatible manner, | ||
10 | * they are implemented as files in a "private" directory. | ||
11 | * Each EA is in its own file, with the directory layout like so (/ is assumed | ||
12 | * to be relative to fs root). Inside the /.reiserfs_priv/xattrs directory, | ||
13 | * directories named using the capital-hex form of the objectid and | ||
14 | * generation number are used. Inside each directory are individual files | ||
15 | * named with the name of the extended attribute. | ||
16 | * | ||
17 | * So, for objectid 12648430, we could have: | ||
18 | * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access | ||
19 | * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default | ||
20 | * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type | ||
21 | * .. or similar. | ||
22 | * | ||
23 | * The file contents are the text of the EA. The size is known based on the | ||
24 | * stat data describing the file. | ||
25 | * | ||
26 | * In the case of system.posix_acl_access and system.posix_acl_default, since | ||
27 | * these are special cases for filesystem ACLs, they are interpreted by the | ||
28 | * kernel, in addition, they are negatively and positively cached and attached | ||
29 | * to the inode so that unnecessary lookups are avoided. | ||
30 | */ | ||
31 | |||
32 | #include <linux/reiserfs_fs.h> | ||
33 | #include <linux/dcache.h> | ||
34 | #include <linux/namei.h> | ||
35 | #include <linux/errno.h> | ||
36 | #include <linux/fs.h> | ||
37 | #include <linux/file.h> | ||
38 | #include <linux/pagemap.h> | ||
39 | #include <linux/xattr.h> | ||
40 | #include <linux/reiserfs_xattr.h> | ||
41 | #include <linux/reiserfs_acl.h> | ||
42 | #include <linux/mbcache.h> | ||
43 | #include <asm/uaccess.h> | ||
44 | #include <asm/checksum.h> | ||
45 | #include <linux/smp_lock.h> | ||
46 | #include <linux/stat.h> | ||
47 | #include <asm/semaphore.h> | ||
48 | |||
49 | #define FL_READONLY 128 | ||
50 | #define FL_DIR_SEM_HELD 256 | ||
51 | #define PRIVROOT_NAME ".reiserfs_priv" | ||
52 | #define XAROOT_NAME "xattrs" | ||
53 | |||
54 | static struct reiserfs_xattr_handler *find_xattr_handler_prefix (const char *prefix); | ||
55 | |||
56 | static struct dentry * | ||
57 | create_xa_root (struct super_block *sb) | ||
58 | { | ||
59 | struct dentry *privroot = dget (REISERFS_SB(sb)->priv_root); | ||
60 | struct dentry *xaroot; | ||
61 | |||
62 | /* This needs to be created at mount-time */ | ||
63 | if (!privroot) | ||
64 | return ERR_PTR(-EOPNOTSUPP); | ||
65 | |||
66 | xaroot = lookup_one_len (XAROOT_NAME, privroot, strlen (XAROOT_NAME)); | ||
67 | if (IS_ERR (xaroot)) { | ||
68 | goto out; | ||
69 | } else if (!xaroot->d_inode) { | ||
70 | int err; | ||
71 | down (&privroot->d_inode->i_sem); | ||
72 | err = privroot->d_inode->i_op->mkdir (privroot->d_inode, xaroot, 0700); | ||
73 | up (&privroot->d_inode->i_sem); | ||
74 | |||
75 | if (err) { | ||
76 | dput (xaroot); | ||
77 | dput (privroot); | ||
78 | return ERR_PTR (err); | ||
79 | } | ||
80 | REISERFS_SB(sb)->xattr_root = dget (xaroot); | ||
81 | } | ||
82 | |||
83 | out: | ||
84 | dput (privroot); | ||
85 | return xaroot; | ||
86 | } | ||
87 | |||
88 | /* This will return a dentry, or error, refering to the xa root directory. | ||
89 | * If the xa root doesn't exist yet, the dentry will be returned without | ||
90 | * an associated inode. This dentry can be used with ->mkdir to create | ||
91 | * the xa directory. */ | ||
92 | static struct dentry * | ||
93 | __get_xa_root (struct super_block *s) | ||
94 | { | ||
95 | struct dentry *privroot = dget (REISERFS_SB(s)->priv_root); | ||
96 | struct dentry *xaroot = NULL; | ||
97 | |||
98 | if (IS_ERR (privroot) || !privroot) | ||
99 | return privroot; | ||
100 | |||
101 | xaroot = lookup_one_len (XAROOT_NAME, privroot, strlen (XAROOT_NAME)); | ||
102 | if (IS_ERR (xaroot)) { | ||
103 | goto out; | ||
104 | } else if (!xaroot->d_inode) { | ||
105 | dput (xaroot); | ||
106 | xaroot = NULL; | ||
107 | goto out; | ||
108 | } | ||
109 | |||
110 | REISERFS_SB(s)->xattr_root = dget (xaroot); | ||
111 | |||
112 | out: | ||
113 | dput (privroot); | ||
114 | return xaroot; | ||
115 | } | ||
116 | |||
117 | /* Returns the dentry (or NULL) referring to the root of the extended | ||
118 | * attribute directory tree. If it has already been retreived, it is used. | ||
119 | * Otherwise, we attempt to retreive it from disk. It may also return | ||
120 | * a pointer-encoded error. | ||
121 | */ | ||
122 | static inline struct dentry * | ||
123 | get_xa_root (struct super_block *s) | ||
124 | { | ||
125 | struct dentry *dentry = dget (REISERFS_SB(s)->xattr_root); | ||
126 | |||
127 | if (!dentry) | ||
128 | dentry = __get_xa_root (s); | ||
129 | |||
130 | return dentry; | ||
131 | } | ||
132 | |||
133 | /* Opens the directory corresponding to the inode's extended attribute store. | ||
134 | * If flags allow, the tree to the directory may be created. If creation is | ||
135 | * prohibited, -ENODATA is returned. */ | ||
136 | static struct dentry * | ||
137 | open_xa_dir (const struct inode *inode, int flags) | ||
138 | { | ||
139 | struct dentry *xaroot, *xadir; | ||
140 | char namebuf[17]; | ||
141 | |||
142 | xaroot = get_xa_root (inode->i_sb); | ||
143 | if (IS_ERR (xaroot)) { | ||
144 | return xaroot; | ||
145 | } else if (!xaroot) { | ||
146 | if (flags == 0 || flags & XATTR_CREATE) { | ||
147 | xaroot = create_xa_root (inode->i_sb); | ||
148 | if (IS_ERR (xaroot)) | ||
149 | return xaroot; | ||
150 | } | ||
151 | if (!xaroot) | ||
152 | return ERR_PTR (-ENODATA); | ||
153 | } | ||
154 | |||
155 | /* ok, we have xaroot open */ | ||
156 | |||
157 | snprintf (namebuf, sizeof (namebuf), "%X.%X", | ||
158 | le32_to_cpu (INODE_PKEY (inode)->k_objectid), | ||
159 | inode->i_generation); | ||
160 | xadir = lookup_one_len (namebuf, xaroot, strlen (namebuf)); | ||
161 | if (IS_ERR (xadir)) { | ||
162 | dput (xaroot); | ||
163 | return xadir; | ||
164 | } | ||
165 | |||
166 | if (!xadir->d_inode) { | ||
167 | int err; | ||
168 | if (flags == 0 || flags & XATTR_CREATE) { | ||
169 | /* Although there is nothing else trying to create this directory, | ||
170 | * another directory with the same hash may be created, so we need | ||
171 | * to protect against that */ | ||
172 | err = xaroot->d_inode->i_op->mkdir (xaroot->d_inode, xadir, 0700); | ||
173 | if (err) { | ||
174 | dput (xaroot); | ||
175 | dput (xadir); | ||
176 | return ERR_PTR (err); | ||
177 | } | ||
178 | } | ||
179 | if (!xadir->d_inode) { | ||
180 | dput (xaroot); | ||
181 | dput (xadir); | ||
182 | return ERR_PTR (-ENODATA); | ||
183 | } | ||
184 | } | ||
185 | |||
186 | dput (xaroot); | ||
187 | return xadir; | ||
188 | } | ||
189 | |||
190 | /* Returns a dentry corresponding to a specific extended attribute file | ||
191 | * for the inode. If flags allow, the file is created. Otherwise, a | ||
192 | * valid or negative dentry, or an error is returned. */ | ||
193 | static struct dentry * | ||
194 | get_xa_file_dentry (const struct inode *inode, const char *name, int flags) | ||
195 | { | ||
196 | struct dentry *xadir, *xafile; | ||
197 | int err = 0; | ||
198 | |||
199 | xadir = open_xa_dir (inode, flags); | ||
200 | if (IS_ERR (xadir)) { | ||
201 | return ERR_PTR (PTR_ERR (xadir)); | ||
202 | } else if (xadir && !xadir->d_inode) { | ||
203 | dput (xadir); | ||
204 | return ERR_PTR (-ENODATA); | ||
205 | } | ||
206 | |||
207 | xafile = lookup_one_len (name, xadir, strlen (name)); | ||
208 | if (IS_ERR (xafile)) { | ||
209 | dput (xadir); | ||
210 | return ERR_PTR (PTR_ERR (xafile)); | ||
211 | } | ||
212 | |||
213 | if (xafile->d_inode) { /* file exists */ | ||
214 | if (flags & XATTR_CREATE) { | ||
215 | err = -EEXIST; | ||
216 | dput (xafile); | ||
217 | goto out; | ||
218 | } | ||
219 | } else if (flags & XATTR_REPLACE || flags & FL_READONLY) { | ||
220 | goto out; | ||
221 | } else { | ||
222 | /* inode->i_sem is down, so nothing else can try to create | ||
223 | * the same xattr */ | ||
224 | err = xadir->d_inode->i_op->create (xadir->d_inode, xafile, | ||
225 | 0700|S_IFREG, NULL); | ||
226 | |||
227 | if (err) { | ||
228 | dput (xafile); | ||
229 | goto out; | ||
230 | } | ||
231 | } | ||
232 | |||
233 | out: | ||
234 | dput (xadir); | ||
235 | if (err) | ||
236 | xafile = ERR_PTR (err); | ||
237 | return xafile; | ||
238 | } | ||
239 | |||
240 | |||
241 | /* Opens a file pointer to the attribute associated with inode */ | ||
242 | static struct file * | ||
243 | open_xa_file (const struct inode *inode, const char *name, int flags) | ||
244 | { | ||
245 | struct dentry *xafile; | ||
246 | struct file *fp; | ||
247 | |||
248 | xafile = get_xa_file_dentry (inode, name, flags); | ||
249 | if (IS_ERR (xafile)) | ||
250 | return ERR_PTR (PTR_ERR (xafile)); | ||
251 | else if (!xafile->d_inode) { | ||
252 | dput (xafile); | ||
253 | return ERR_PTR (-ENODATA); | ||
254 | } | ||
255 | |||
256 | fp = dentry_open (xafile, NULL, O_RDWR); | ||
257 | /* dentry_open dputs the dentry if it fails */ | ||
258 | |||
259 | return fp; | ||
260 | } | ||
261 | |||
262 | |||
263 | /* | ||
264 | * this is very similar to fs/reiserfs/dir.c:reiserfs_readdir, but | ||
265 | * we need to drop the path before calling the filldir struct. That | ||
266 | * would be a big performance hit to the non-xattr case, so I've copied | ||
267 | * the whole thing for now. --clm | ||
268 | * | ||
269 | * the big difference is that I go backwards through the directory, | ||
270 | * and don't mess with f->f_pos, but the idea is the same. Do some | ||
271 | * action on each and every entry in the directory. | ||
272 | * | ||
273 | * we're called with i_sem held, so there are no worries about the directory | ||
274 | * changing underneath us. | ||
275 | */ | ||
276 | static int __xattr_readdir(struct file * filp, void * dirent, filldir_t filldir) | ||
277 | { | ||
278 | struct inode *inode = filp->f_dentry->d_inode; | ||
279 | struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ | ||
280 | INITIALIZE_PATH (path_to_entry); | ||
281 | struct buffer_head * bh; | ||
282 | int entry_num; | ||
283 | struct item_head * ih, tmp_ih; | ||
284 | int search_res; | ||
285 | char * local_buf; | ||
286 | loff_t next_pos; | ||
287 | char small_buf[32] ; /* avoid kmalloc if we can */ | ||
288 | struct reiserfs_de_head *deh; | ||
289 | int d_reclen; | ||
290 | char * d_name; | ||
291 | off_t d_off; | ||
292 | ino_t d_ino; | ||
293 | struct reiserfs_dir_entry de; | ||
294 | |||
295 | |||
296 | /* form key for search the next directory entry using f_pos field of | ||
297 | file structure */ | ||
298 | next_pos = max_reiserfs_offset(inode); | ||
299 | |||
300 | while (1) { | ||
301 | research: | ||
302 | if (next_pos <= DOT_DOT_OFFSET) | ||
303 | break; | ||
304 | make_cpu_key (&pos_key, inode, next_pos, TYPE_DIRENTRY, 3); | ||
305 | |||
306 | search_res = search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, &de); | ||
307 | if (search_res == IO_ERROR) { | ||
308 | // FIXME: we could just skip part of directory which could | ||
309 | // not be read | ||
310 | pathrelse(&path_to_entry); | ||
311 | return -EIO; | ||
312 | } | ||
313 | |||
314 | if (search_res == NAME_NOT_FOUND) | ||
315 | de.de_entry_num--; | ||
316 | |||
317 | set_de_name_and_namelen(&de); | ||
318 | entry_num = de.de_entry_num; | ||
319 | deh = &(de.de_deh[entry_num]); | ||
320 | |||
321 | bh = de.de_bh; | ||
322 | ih = de.de_ih; | ||
323 | |||
324 | if (!is_direntry_le_ih(ih)) { | ||
325 | reiserfs_warning(inode->i_sb, "not direntry %h", ih); | ||
326 | break; | ||
327 | } | ||
328 | copy_item_head(&tmp_ih, ih); | ||
329 | |||
330 | /* we must have found item, that is item of this directory, */ | ||
331 | RFALSE( COMP_SHORT_KEYS (&(ih->ih_key), &pos_key), | ||
332 | "vs-9000: found item %h does not match to dir we readdir %K", | ||
333 | ih, &pos_key); | ||
334 | |||
335 | if (deh_offset(deh) <= DOT_DOT_OFFSET) { | ||
336 | break; | ||
337 | } | ||
338 | |||
339 | /* look for the previous entry in the directory */ | ||
340 | next_pos = deh_offset (deh) - 1; | ||
341 | |||
342 | if (!de_visible (deh)) | ||
343 | /* it is hidden entry */ | ||
344 | continue; | ||
345 | |||
346 | d_reclen = entry_length(bh, ih, entry_num); | ||
347 | d_name = B_I_DEH_ENTRY_FILE_NAME (bh, ih, deh); | ||
348 | d_off = deh_offset (deh); | ||
349 | d_ino = deh_objectid (deh); | ||
350 | |||
351 | if (!d_name[d_reclen - 1]) | ||
352 | d_reclen = strlen (d_name); | ||
353 | |||
354 | if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)){ | ||
355 | /* too big to send back to VFS */ | ||
356 | continue ; | ||
357 | } | ||
358 | |||
359 | /* Ignore the .reiserfs_priv entry */ | ||
360 | if (reiserfs_xattrs (inode->i_sb) && | ||
361 | !old_format_only(inode->i_sb) && | ||
362 | deh_objectid (deh) == le32_to_cpu (INODE_PKEY(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->k_objectid)) | ||
363 | continue; | ||
364 | |||
365 | if (d_reclen <= 32) { | ||
366 | local_buf = small_buf ; | ||
367 | } else { | ||
368 | local_buf = reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb) ; | ||
369 | if (!local_buf) { | ||
370 | pathrelse (&path_to_entry); | ||
371 | return -ENOMEM ; | ||
372 | } | ||
373 | if (item_moved (&tmp_ih, &path_to_entry)) { | ||
374 | reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; | ||
375 | |||
376 | /* sigh, must retry. Do this same offset again */ | ||
377 | next_pos = d_off; | ||
378 | goto research; | ||
379 | } | ||
380 | } | ||
381 | |||
382 | // Note, that we copy name to user space via temporary | ||
383 | // buffer (local_buf) because filldir will block if | ||
384 | // user space buffer is swapped out. At that time | ||
385 | // entry can move to somewhere else | ||
386 | memcpy (local_buf, d_name, d_reclen); | ||
387 | |||
388 | /* the filldir function might need to start transactions, | ||
389 | * or do who knows what. Release the path now that we've | ||
390 | * copied all the important stuff out of the deh | ||
391 | */ | ||
392 | pathrelse (&path_to_entry); | ||
393 | |||
394 | if (filldir (dirent, local_buf, d_reclen, d_off, d_ino, | ||
395 | DT_UNKNOWN) < 0) { | ||
396 | if (local_buf != small_buf) { | ||
397 | reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; | ||
398 | } | ||
399 | goto end; | ||
400 | } | ||
401 | if (local_buf != small_buf) { | ||
402 | reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; | ||
403 | } | ||
404 | } /* while */ | ||
405 | |||
406 | end: | ||
407 | pathrelse (&path_to_entry); | ||
408 | return 0; | ||
409 | } | ||
410 | |||
411 | /* | ||
412 | * this could be done with dedicated readdir ops for the xattr files, | ||
413 | * but I want to get something working asap | ||
414 | * this is stolen from vfs_readdir | ||
415 | * | ||
416 | */ | ||
417 | static | ||
418 | int xattr_readdir(struct file *file, filldir_t filler, void *buf) | ||
419 | { | ||
420 | struct inode *inode = file->f_dentry->d_inode; | ||
421 | int res = -ENOTDIR; | ||
422 | if (!file->f_op || !file->f_op->readdir) | ||
423 | goto out; | ||
424 | down(&inode->i_sem); | ||
425 | // down(&inode->i_zombie); | ||
426 | res = -ENOENT; | ||
427 | if (!IS_DEADDIR(inode)) { | ||
428 | lock_kernel(); | ||
429 | res = __xattr_readdir(file, buf, filler); | ||
430 | unlock_kernel(); | ||
431 | } | ||
432 | // up(&inode->i_zombie); | ||
433 | up(&inode->i_sem); | ||
434 | out: | ||
435 | return res; | ||
436 | } | ||
437 | |||
438 | |||
439 | /* Internal operations on file data */ | ||
/* Counterpart of reiserfs_get_page(): unmaps the page and drops the
 * page cache reference. */
static inline void reiserfs_put_page(struct page *page)
{
	kunmap(page);
	page_cache_release(page);
}
446 | |||
447 | static struct page * | ||
448 | reiserfs_get_page(struct inode *dir, unsigned long n) | ||
449 | { | ||
450 | struct address_space *mapping = dir->i_mapping; | ||
451 | struct page *page; | ||
452 | /* We can deadlock if we try to free dentries, | ||
453 | and an unlink/rmdir has just occured - GFP_NOFS avoids this */ | ||
454 | mapping->flags = (mapping->flags & ~__GFP_BITS_MASK) | GFP_NOFS; | ||
455 | page = read_cache_page (mapping, n, | ||
456 | (filler_t*)mapping->a_ops->readpage, NULL); | ||
457 | if (!IS_ERR(page)) { | ||
458 | wait_on_page_locked(page); | ||
459 | kmap(page); | ||
460 | if (!PageUptodate(page)) | ||
461 | goto fail; | ||
462 | |||
463 | if (PageError(page)) | ||
464 | goto fail; | ||
465 | } | ||
466 | return page; | ||
467 | |||
468 | fail: | ||
469 | reiserfs_put_page(page); | ||
470 | return ERR_PTR(-EIO); | ||
471 | } | ||
472 | |||
473 | static inline __u32 | ||
474 | xattr_hash (const char *msg, int len) | ||
475 | { | ||
476 | return csum_partial (msg, len, 0); | ||
477 | } | ||
478 | |||
479 | /* Generic extended attribute operations that can be used by xa plugins */ | ||
480 | |||
481 | /* | ||
482 | * inode->i_sem: down | ||
483 | */ | ||
484 | int | ||
485 | reiserfs_xattr_set (struct inode *inode, const char *name, const void *buffer, | ||
486 | size_t buffer_size, int flags) | ||
487 | { | ||
488 | int err = 0; | ||
489 | struct file *fp; | ||
490 | struct page *page; | ||
491 | char *data; | ||
492 | struct address_space *mapping; | ||
493 | size_t file_pos = 0; | ||
494 | size_t buffer_pos = 0; | ||
495 | struct inode *xinode; | ||
496 | struct iattr newattrs; | ||
497 | __u32 xahash = 0; | ||
498 | |||
499 | if (IS_RDONLY (inode)) | ||
500 | return -EROFS; | ||
501 | |||
502 | if (IS_IMMUTABLE (inode) || IS_APPEND (inode)) | ||
503 | return -EPERM; | ||
504 | |||
505 | if (get_inode_sd_version (inode) == STAT_DATA_V1) | ||
506 | return -EOPNOTSUPP; | ||
507 | |||
508 | /* Empty xattrs are ok, they're just empty files, no hash */ | ||
509 | if (buffer && buffer_size) | ||
510 | xahash = xattr_hash (buffer, buffer_size); | ||
511 | |||
512 | open_file: | ||
513 | fp = open_xa_file (inode, name, flags); | ||
514 | if (IS_ERR (fp)) { | ||
515 | err = PTR_ERR (fp); | ||
516 | goto out; | ||
517 | } | ||
518 | |||
519 | xinode = fp->f_dentry->d_inode; | ||
520 | REISERFS_I(inode)->i_flags |= i_has_xattr_dir; | ||
521 | |||
522 | /* we need to copy it off.. */ | ||
523 | if (xinode->i_nlink > 1) { | ||
524 | fput(fp); | ||
525 | err = reiserfs_xattr_del (inode, name); | ||
526 | if (err < 0) | ||
527 | goto out; | ||
528 | /* We just killed the old one, we're not replacing anymore */ | ||
529 | if (flags & XATTR_REPLACE) | ||
530 | flags &= ~XATTR_REPLACE; | ||
531 | goto open_file; | ||
532 | } | ||
533 | |||
534 | /* Resize it so we're ok to write there */ | ||
535 | newattrs.ia_size = buffer_size; | ||
536 | newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; | ||
537 | down (&xinode->i_sem); | ||
538 | err = notify_change(fp->f_dentry, &newattrs); | ||
539 | if (err) | ||
540 | goto out_filp; | ||
541 | |||
542 | mapping = xinode->i_mapping; | ||
543 | while (buffer_pos < buffer_size || buffer_pos == 0) { | ||
544 | size_t chunk; | ||
545 | size_t skip = 0; | ||
546 | size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); | ||
547 | if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) | ||
548 | chunk = PAGE_CACHE_SIZE; | ||
549 | else | ||
550 | chunk = buffer_size - buffer_pos; | ||
551 | |||
552 | page = reiserfs_get_page (xinode, file_pos >> PAGE_CACHE_SHIFT); | ||
553 | if (IS_ERR (page)) { | ||
554 | err = PTR_ERR (page); | ||
555 | goto out_filp; | ||
556 | } | ||
557 | |||
558 | lock_page (page); | ||
559 | data = page_address (page); | ||
560 | |||
561 | if (file_pos == 0) { | ||
562 | struct reiserfs_xattr_header *rxh; | ||
563 | skip = file_pos = sizeof (struct reiserfs_xattr_header); | ||
564 | if (chunk + skip > PAGE_CACHE_SIZE) | ||
565 | chunk = PAGE_CACHE_SIZE - skip; | ||
566 | rxh = (struct reiserfs_xattr_header *)data; | ||
567 | rxh->h_magic = cpu_to_le32 (REISERFS_XATTR_MAGIC); | ||
568 | rxh->h_hash = cpu_to_le32 (xahash); | ||
569 | } | ||
570 | |||
571 | err = mapping->a_ops->prepare_write (fp, page, page_offset, | ||
572 | page_offset + chunk + skip); | ||
573 | if (!err) { | ||
574 | if (buffer) | ||
575 | memcpy (data + skip, buffer + buffer_pos, chunk); | ||
576 | err = mapping->a_ops->commit_write (fp, page, page_offset, | ||
577 | page_offset + chunk + skip); | ||
578 | } | ||
579 | unlock_page (page); | ||
580 | reiserfs_put_page (page); | ||
581 | buffer_pos += chunk; | ||
582 | file_pos += chunk; | ||
583 | skip = 0; | ||
584 | if (err || buffer_size == 0 || !buffer) | ||
585 | break; | ||
586 | } | ||
587 | |||
588 | /* We can't mark the inode dirty if it's not hashed. This is the case | ||
589 | * when we're inheriting the default ACL. If we dirty it, the inode | ||
590 | * gets marked dirty, but won't (ever) make it onto the dirty list until | ||
591 | * it's synced explicitly to clear I_DIRTY. This is bad. */ | ||
592 | if (!hlist_unhashed(&inode->i_hash)) { | ||
593 | inode->i_ctime = CURRENT_TIME_SEC; | ||
594 | mark_inode_dirty (inode); | ||
595 | } | ||
596 | |||
597 | out_filp: | ||
598 | up (&xinode->i_sem); | ||
599 | fput(fp); | ||
600 | |||
601 | out: | ||
602 | return err; | ||
603 | } | ||
604 | |||
605 | /* | ||
606 | * inode->i_sem: down | ||
607 | */ | ||
608 | int | ||
609 | reiserfs_xattr_get (const struct inode *inode, const char *name, void *buffer, | ||
610 | size_t buffer_size) | ||
611 | { | ||
612 | ssize_t err = 0; | ||
613 | struct file *fp; | ||
614 | size_t isize; | ||
615 | size_t file_pos = 0; | ||
616 | size_t buffer_pos = 0; | ||
617 | struct page *page; | ||
618 | struct inode *xinode; | ||
619 | __u32 hash = 0; | ||
620 | |||
621 | if (name == NULL) | ||
622 | return -EINVAL; | ||
623 | |||
624 | /* We can't have xattrs attached to v1 items since they don't have | ||
625 | * generation numbers */ | ||
626 | if (get_inode_sd_version (inode) == STAT_DATA_V1) | ||
627 | return -EOPNOTSUPP; | ||
628 | |||
629 | fp = open_xa_file (inode, name, FL_READONLY); | ||
630 | if (IS_ERR (fp)) { | ||
631 | err = PTR_ERR (fp); | ||
632 | goto out; | ||
633 | } | ||
634 | |||
635 | xinode = fp->f_dentry->d_inode; | ||
636 | isize = xinode->i_size; | ||
637 | REISERFS_I(inode)->i_flags |= i_has_xattr_dir; | ||
638 | |||
639 | /* Just return the size needed */ | ||
640 | if (buffer == NULL) { | ||
641 | err = isize - sizeof (struct reiserfs_xattr_header); | ||
642 | goto out_dput; | ||
643 | } | ||
644 | |||
645 | if (buffer_size < isize - sizeof (struct reiserfs_xattr_header)) { | ||
646 | err = -ERANGE; | ||
647 | goto out_dput; | ||
648 | } | ||
649 | |||
650 | while (file_pos < isize) { | ||
651 | size_t chunk; | ||
652 | char *data; | ||
653 | size_t skip = 0; | ||
654 | if (isize - file_pos > PAGE_CACHE_SIZE) | ||
655 | chunk = PAGE_CACHE_SIZE; | ||
656 | else | ||
657 | chunk = isize - file_pos; | ||
658 | |||
659 | page = reiserfs_get_page (xinode, file_pos >> PAGE_CACHE_SHIFT); | ||
660 | if (IS_ERR (page)) { | ||
661 | err = PTR_ERR (page); | ||
662 | goto out_dput; | ||
663 | } | ||
664 | |||
665 | lock_page (page); | ||
666 | data = page_address (page); | ||
667 | if (file_pos == 0) { | ||
668 | struct reiserfs_xattr_header *rxh = | ||
669 | (struct reiserfs_xattr_header *)data; | ||
670 | skip = file_pos = sizeof (struct reiserfs_xattr_header); | ||
671 | chunk -= skip; | ||
672 | /* Magic doesn't match up.. */ | ||
673 | if (rxh->h_magic != cpu_to_le32 (REISERFS_XATTR_MAGIC)) { | ||
674 | unlock_page (page); | ||
675 | reiserfs_put_page (page); | ||
676 | reiserfs_warning (inode->i_sb, "Invalid magic for xattr (%s) " | ||
677 | "associated with %k", name, | ||
678 | INODE_PKEY (inode)); | ||
679 | err = -EIO; | ||
680 | goto out_dput; | ||
681 | } | ||
682 | hash = le32_to_cpu (rxh->h_hash); | ||
683 | } | ||
684 | memcpy (buffer + buffer_pos, data + skip, chunk); | ||
685 | unlock_page (page); | ||
686 | reiserfs_put_page (page); | ||
687 | file_pos += chunk; | ||
688 | buffer_pos += chunk; | ||
689 | skip = 0; | ||
690 | } | ||
691 | err = isize - sizeof (struct reiserfs_xattr_header); | ||
692 | |||
693 | if (xattr_hash (buffer, isize - sizeof (struct reiserfs_xattr_header)) != hash) { | ||
694 | reiserfs_warning (inode->i_sb, "Invalid hash for xattr (%s) associated " | ||
695 | "with %k", name, INODE_PKEY (inode)); | ||
696 | err = -EIO; | ||
697 | } | ||
698 | |||
699 | out_dput: | ||
700 | fput(fp); | ||
701 | |||
702 | out: | ||
703 | return err; | ||
704 | } | ||
705 | |||
706 | static int | ||
707 | __reiserfs_xattr_del (struct dentry *xadir, const char *name, int namelen) | ||
708 | { | ||
709 | struct dentry *dentry; | ||
710 | struct inode *dir = xadir->d_inode; | ||
711 | int err = 0; | ||
712 | |||
713 | dentry = lookup_one_len (name, xadir, namelen); | ||
714 | if (IS_ERR (dentry)) { | ||
715 | err = PTR_ERR (dentry); | ||
716 | goto out; | ||
717 | } else if (!dentry->d_inode) { | ||
718 | err = -ENODATA; | ||
719 | goto out_file; | ||
720 | } | ||
721 | |||
722 | /* Skip directories.. */ | ||
723 | if (S_ISDIR (dentry->d_inode->i_mode)) | ||
724 | goto out_file; | ||
725 | |||
726 | if (!is_reiserfs_priv_object (dentry->d_inode)) { | ||
727 | reiserfs_warning (dir->i_sb, "OID %08x [%.*s/%.*s] doesn't have " | ||
728 | "priv flag set [parent is %sset].", | ||
729 | le32_to_cpu (INODE_PKEY (dentry->d_inode)->k_objectid), | ||
730 | xadir->d_name.len, xadir->d_name.name, namelen, name, | ||
731 | is_reiserfs_priv_object (xadir->d_inode) ? "" : "not "); | ||
732 | dput (dentry); | ||
733 | return -EIO; | ||
734 | } | ||
735 | |||
736 | err = dir->i_op->unlink (dir, dentry); | ||
737 | if (!err) | ||
738 | d_delete (dentry); | ||
739 | |||
740 | out_file: | ||
741 | dput (dentry); | ||
742 | |||
743 | out: | ||
744 | return err; | ||
745 | } | ||
746 | |||
747 | |||
748 | int | ||
749 | reiserfs_xattr_del (struct inode *inode, const char *name) | ||
750 | { | ||
751 | struct dentry *dir; | ||
752 | int err; | ||
753 | |||
754 | if (IS_RDONLY (inode)) | ||
755 | return -EROFS; | ||
756 | |||
757 | dir = open_xa_dir (inode, FL_READONLY); | ||
758 | if (IS_ERR (dir)) { | ||
759 | err = PTR_ERR (dir); | ||
760 | goto out; | ||
761 | } | ||
762 | |||
763 | err = __reiserfs_xattr_del (dir, name, strlen (name)); | ||
764 | dput (dir); | ||
765 | |||
766 | if (!err) { | ||
767 | inode->i_ctime = CURRENT_TIME_SEC; | ||
768 | mark_inode_dirty (inode); | ||
769 | } | ||
770 | |||
771 | out: | ||
772 | return err; | ||
773 | } | ||
774 | |||
775 | /* The following are side effects of other operations that aren't explicitly | ||
776 | * modifying extended attributes. This includes operations such as permissions | ||
777 | * or ownership changes, object deletions, etc. */ | ||
778 | |||
779 | static int | ||
780 | reiserfs_delete_xattrs_filler (void *buf, const char *name, int namelen, | ||
781 | loff_t offset, ino_t ino, unsigned int d_type) | ||
782 | { | ||
783 | struct dentry *xadir = (struct dentry *)buf; | ||
784 | |||
785 | return __reiserfs_xattr_del (xadir, name, namelen); | ||
786 | |||
787 | } | ||
788 | |||
789 | /* This is called w/ inode->i_sem downed */ | ||
790 | int | ||
791 | reiserfs_delete_xattrs (struct inode *inode) | ||
792 | { | ||
793 | struct file *fp; | ||
794 | struct dentry *dir, *root; | ||
795 | int err = 0; | ||
796 | |||
797 | /* Skip out, an xattr has no xattrs associated with it */ | ||
798 | if (is_reiserfs_priv_object (inode) || | ||
799 | get_inode_sd_version (inode) == STAT_DATA_V1 || | ||
800 | !reiserfs_xattrs(inode->i_sb)) | ||
801 | { | ||
802 | return 0; | ||
803 | } | ||
804 | reiserfs_read_lock_xattrs (inode->i_sb); | ||
805 | dir = open_xa_dir (inode, FL_READONLY); | ||
806 | reiserfs_read_unlock_xattrs (inode->i_sb); | ||
807 | if (IS_ERR (dir)) { | ||
808 | err = PTR_ERR (dir); | ||
809 | goto out; | ||
810 | } else if (!dir->d_inode) { | ||
811 | dput (dir); | ||
812 | return 0; | ||
813 | } | ||
814 | |||
815 | fp = dentry_open (dir, NULL, O_RDWR); | ||
816 | if (IS_ERR (fp)) { | ||
817 | err = PTR_ERR (fp); | ||
818 | /* dentry_open dputs the dentry if it fails */ | ||
819 | goto out; | ||
820 | } | ||
821 | |||
822 | lock_kernel (); | ||
823 | err = xattr_readdir (fp, reiserfs_delete_xattrs_filler, dir); | ||
824 | if (err) { | ||
825 | unlock_kernel (); | ||
826 | goto out_dir; | ||
827 | } | ||
828 | |||
829 | /* Leftovers besides . and .. -- that's not good. */ | ||
830 | if (dir->d_inode->i_nlink <= 2) { | ||
831 | root = get_xa_root (inode->i_sb); | ||
832 | reiserfs_write_lock_xattrs (inode->i_sb); | ||
833 | err = vfs_rmdir (root->d_inode, dir); | ||
834 | reiserfs_write_unlock_xattrs (inode->i_sb); | ||
835 | dput (root); | ||
836 | } else { | ||
837 | reiserfs_warning (inode->i_sb, | ||
838 | "Couldn't remove all entries in directory"); | ||
839 | } | ||
840 | unlock_kernel (); | ||
841 | |||
842 | out_dir: | ||
843 | fput(fp); | ||
844 | |||
845 | out: | ||
846 | if (!err) | ||
847 | REISERFS_I(inode)->i_flags = REISERFS_I(inode)->i_flags & ~i_has_xattr_dir; | ||
848 | return err; | ||
849 | } | ||
850 | |||
851 | struct reiserfs_chown_buf { | ||
852 | struct inode *inode; | ||
853 | struct dentry *xadir; | ||
854 | struct iattr *attrs; | ||
855 | }; | ||
856 | |||
857 | /* XXX: If there is a better way to do this, I'd love to hear about it */ | ||
858 | static int | ||
859 | reiserfs_chown_xattrs_filler (void *buf, const char *name, int namelen, | ||
860 | loff_t offset, ino_t ino, unsigned int d_type) | ||
861 | { | ||
862 | struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf; | ||
863 | struct dentry *xafile, *xadir = chown_buf->xadir; | ||
864 | struct iattr *attrs = chown_buf->attrs; | ||
865 | int err = 0; | ||
866 | |||
867 | xafile = lookup_one_len (name, xadir, namelen); | ||
868 | if (IS_ERR (xafile)) | ||
869 | return PTR_ERR (xafile); | ||
870 | else if (!xafile->d_inode) { | ||
871 | dput (xafile); | ||
872 | return -ENODATA; | ||
873 | } | ||
874 | |||
875 | if (!S_ISDIR (xafile->d_inode->i_mode)) | ||
876 | err = notify_change (xafile, attrs); | ||
877 | dput (xafile); | ||
878 | |||
879 | return err; | ||
880 | } | ||
881 | |||
882 | int | ||
883 | reiserfs_chown_xattrs (struct inode *inode, struct iattr *attrs) | ||
884 | { | ||
885 | struct file *fp; | ||
886 | struct dentry *dir; | ||
887 | int err = 0; | ||
888 | struct reiserfs_chown_buf buf; | ||
889 | unsigned int ia_valid = attrs->ia_valid; | ||
890 | |||
891 | /* Skip out, an xattr has no xattrs associated with it */ | ||
892 | if (is_reiserfs_priv_object (inode) || | ||
893 | get_inode_sd_version (inode) == STAT_DATA_V1 || | ||
894 | !reiserfs_xattrs(inode->i_sb)) | ||
895 | { | ||
896 | return 0; | ||
897 | } | ||
898 | reiserfs_read_lock_xattrs (inode->i_sb); | ||
899 | dir = open_xa_dir (inode, FL_READONLY); | ||
900 | reiserfs_read_unlock_xattrs (inode->i_sb); | ||
901 | if (IS_ERR (dir)) { | ||
902 | if (PTR_ERR (dir) != -ENODATA) | ||
903 | err = PTR_ERR (dir); | ||
904 | goto out; | ||
905 | } else if (!dir->d_inode) { | ||
906 | dput (dir); | ||
907 | goto out; | ||
908 | } | ||
909 | |||
910 | fp = dentry_open (dir, NULL, O_RDWR); | ||
911 | if (IS_ERR (fp)) { | ||
912 | err = PTR_ERR (fp); | ||
913 | /* dentry_open dputs the dentry if it fails */ | ||
914 | goto out; | ||
915 | } | ||
916 | |||
917 | lock_kernel (); | ||
918 | |||
919 | attrs->ia_valid &= (ATTR_UID | ATTR_GID | ATTR_CTIME); | ||
920 | buf.xadir = dir; | ||
921 | buf.attrs = attrs; | ||
922 | buf.inode = inode; | ||
923 | |||
924 | err = xattr_readdir (fp, reiserfs_chown_xattrs_filler, &buf); | ||
925 | if (err) { | ||
926 | unlock_kernel (); | ||
927 | goto out_dir; | ||
928 | } | ||
929 | |||
930 | err = notify_change (dir, attrs); | ||
931 | unlock_kernel (); | ||
932 | |||
933 | out_dir: | ||
934 | fput(fp); | ||
935 | |||
936 | out: | ||
937 | attrs->ia_valid = ia_valid; | ||
938 | return err; | ||
939 | } | ||
940 | |||
941 | |||
942 | /* Actual operations that are exported to VFS-land */ | ||
943 | |||
944 | /* | ||
945 | * Inode operation getxattr() | ||
946 | * Preliminary locking: we down dentry->d_inode->i_sem | ||
947 | */ | ||
948 | ssize_t | ||
949 | reiserfs_getxattr (struct dentry *dentry, const char *name, void *buffer, | ||
950 | size_t size) | ||
951 | { | ||
952 | struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); | ||
953 | int err; | ||
954 | |||
955 | if (!xah || !reiserfs_xattrs(dentry->d_sb) || | ||
956 | get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) | ||
957 | return -EOPNOTSUPP; | ||
958 | |||
959 | reiserfs_read_lock_xattr_i (dentry->d_inode); | ||
960 | reiserfs_read_lock_xattrs (dentry->d_sb); | ||
961 | err = xah->get (dentry->d_inode, name, buffer, size); | ||
962 | reiserfs_read_unlock_xattrs (dentry->d_sb); | ||
963 | reiserfs_read_unlock_xattr_i (dentry->d_inode); | ||
964 | return err; | ||
965 | } | ||
966 | |||
967 | |||
968 | /* | ||
969 | * Inode operation setxattr() | ||
970 | * | ||
971 | * dentry->d_inode->i_sem down | ||
972 | */ | ||
973 | int | ||
974 | reiserfs_setxattr (struct dentry *dentry, const char *name, const void *value, | ||
975 | size_t size, int flags) | ||
976 | { | ||
977 | struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); | ||
978 | int err; | ||
979 | int lock; | ||
980 | |||
981 | if (!xah || !reiserfs_xattrs(dentry->d_sb) || | ||
982 | get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) | ||
983 | return -EOPNOTSUPP; | ||
984 | |||
985 | if (IS_RDONLY (dentry->d_inode)) | ||
986 | return -EROFS; | ||
987 | |||
988 | if (IS_IMMUTABLE (dentry->d_inode) || IS_APPEND (dentry->d_inode)) | ||
989 | return -EROFS; | ||
990 | |||
991 | reiserfs_write_lock_xattr_i (dentry->d_inode); | ||
992 | lock = !has_xattr_dir (dentry->d_inode); | ||
993 | if (lock) | ||
994 | reiserfs_write_lock_xattrs (dentry->d_sb); | ||
995 | else | ||
996 | reiserfs_read_lock_xattrs (dentry->d_sb); | ||
997 | err = xah->set (dentry->d_inode, name, value, size, flags); | ||
998 | if (lock) | ||
999 | reiserfs_write_unlock_xattrs (dentry->d_sb); | ||
1000 | else | ||
1001 | reiserfs_read_unlock_xattrs (dentry->d_sb); | ||
1002 | reiserfs_write_unlock_xattr_i (dentry->d_inode); | ||
1003 | return err; | ||
1004 | } | ||
1005 | |||
1006 | /* | ||
1007 | * Inode operation removexattr() | ||
1008 | * | ||
1009 | * dentry->d_inode->i_sem down | ||
1010 | */ | ||
1011 | int | ||
1012 | reiserfs_removexattr (struct dentry *dentry, const char *name) | ||
1013 | { | ||
1014 | int err; | ||
1015 | struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); | ||
1016 | |||
1017 | if (!xah || !reiserfs_xattrs(dentry->d_sb) || | ||
1018 | get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) | ||
1019 | return -EOPNOTSUPP; | ||
1020 | |||
1021 | if (IS_RDONLY (dentry->d_inode)) | ||
1022 | return -EROFS; | ||
1023 | |||
1024 | if (IS_IMMUTABLE (dentry->d_inode) || IS_APPEND (dentry->d_inode)) | ||
1025 | return -EPERM; | ||
1026 | |||
1027 | reiserfs_write_lock_xattr_i (dentry->d_inode); | ||
1028 | reiserfs_read_lock_xattrs (dentry->d_sb); | ||
1029 | |||
1030 | /* Deletion pre-operation */ | ||
1031 | if (xah->del) { | ||
1032 | err = xah->del (dentry->d_inode, name); | ||
1033 | if (err) | ||
1034 | goto out; | ||
1035 | } | ||
1036 | |||
1037 | err = reiserfs_xattr_del (dentry->d_inode, name); | ||
1038 | |||
1039 | dentry->d_inode->i_ctime = CURRENT_TIME_SEC; | ||
1040 | mark_inode_dirty (dentry->d_inode); | ||
1041 | |||
1042 | out: | ||
1043 | reiserfs_read_unlock_xattrs (dentry->d_sb); | ||
1044 | reiserfs_write_unlock_xattr_i (dentry->d_inode); | ||
1045 | return err; | ||
1046 | } | ||
1047 | |||
1048 | |||
1049 | /* This is what filldir will use: | ||
1050 | * r_pos will always contain the amount of space required for the entire | ||
1051 | * list. If r_pos becomes larger than r_size, we need more space and we | ||
1052 | * return an error indicating this. If r_pos is less than r_size, then we've | ||
1053 | * filled the buffer successfully and we return success */ | ||
1054 | struct reiserfs_listxattr_buf { | ||
1055 | int r_pos; | ||
1056 | int r_size; | ||
1057 | char *r_buf; | ||
1058 | struct inode *r_inode; | ||
1059 | }; | ||
1060 | |||
1061 | static int | ||
1062 | reiserfs_listxattr_filler (void *buf, const char *name, int namelen, | ||
1063 | loff_t offset, ino_t ino, unsigned int d_type) | ||
1064 | { | ||
1065 | struct reiserfs_listxattr_buf *b = (struct reiserfs_listxattr_buf *)buf; | ||
1066 | int len = 0; | ||
1067 | if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) { | ||
1068 | struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); | ||
1069 | if (!xah) return 0; /* Unsupported xattr name, skip it */ | ||
1070 | |||
1071 | /* We call ->list() twice because the operation isn't required to just | ||
1072 | * return the name back - we want to make sure we have enough space */ | ||
1073 | len += xah->list (b->r_inode, name, namelen, NULL); | ||
1074 | |||
1075 | if (len) { | ||
1076 | if (b->r_pos + len + 1 <= b->r_size) { | ||
1077 | char *p = b->r_buf + b->r_pos; | ||
1078 | p += xah->list (b->r_inode, name, namelen, p); | ||
1079 | *p++ = '\0'; | ||
1080 | } | ||
1081 | b->r_pos += len + 1; | ||
1082 | } | ||
1083 | } | ||
1084 | |||
1085 | return 0; | ||
1086 | } | ||
1087 | /* | ||
1088 | * Inode operation listxattr() | ||
1089 | * | ||
1090 | * Preliminary locking: we down dentry->d_inode->i_sem | ||
1091 | */ | ||
1092 | ssize_t | ||
1093 | reiserfs_listxattr (struct dentry *dentry, char *buffer, size_t size) | ||
1094 | { | ||
1095 | struct file *fp; | ||
1096 | struct dentry *dir; | ||
1097 | int err = 0; | ||
1098 | struct reiserfs_listxattr_buf buf; | ||
1099 | |||
1100 | if (!dentry->d_inode) | ||
1101 | return -EINVAL; | ||
1102 | |||
1103 | if (!reiserfs_xattrs(dentry->d_sb) || | ||
1104 | get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) | ||
1105 | return -EOPNOTSUPP; | ||
1106 | |||
1107 | reiserfs_read_lock_xattr_i (dentry->d_inode); | ||
1108 | reiserfs_read_lock_xattrs (dentry->d_sb); | ||
1109 | dir = open_xa_dir (dentry->d_inode, FL_READONLY); | ||
1110 | reiserfs_read_unlock_xattrs (dentry->d_sb); | ||
1111 | if (IS_ERR (dir)) { | ||
1112 | err = PTR_ERR (dir); | ||
1113 | if (err == -ENODATA) | ||
1114 | err = 0; /* Not an error if there aren't any xattrs */ | ||
1115 | goto out; | ||
1116 | } | ||
1117 | |||
1118 | fp = dentry_open (dir, NULL, O_RDWR); | ||
1119 | if (IS_ERR (fp)) { | ||
1120 | err = PTR_ERR (fp); | ||
1121 | /* dentry_open dputs the dentry if it fails */ | ||
1122 | goto out; | ||
1123 | } | ||
1124 | |||
1125 | buf.r_buf = buffer; | ||
1126 | buf.r_size = buffer ? size : 0; | ||
1127 | buf.r_pos = 0; | ||
1128 | buf.r_inode = dentry->d_inode; | ||
1129 | |||
1130 | REISERFS_I(dentry->d_inode)->i_flags |= i_has_xattr_dir; | ||
1131 | |||
1132 | err = xattr_readdir (fp, reiserfs_listxattr_filler, &buf); | ||
1133 | if (err) | ||
1134 | goto out_dir; | ||
1135 | |||
1136 | if (buf.r_pos > buf.r_size && buffer != NULL) | ||
1137 | err = -ERANGE; | ||
1138 | else | ||
1139 | err = buf.r_pos; | ||
1140 | |||
1141 | out_dir: | ||
1142 | fput(fp); | ||
1143 | |||
1144 | out: | ||
1145 | reiserfs_read_unlock_xattr_i (dentry->d_inode); | ||
1146 | return err; | ||
1147 | } | ||
1148 | |||
1149 | /* This is the implementation for the xattr plugin infrastructure */ | ||
1150 | static struct list_head xattr_handlers = LIST_HEAD_INIT (xattr_handlers); | ||
1151 | static DEFINE_RWLOCK(handler_lock); | ||
1152 | |||
1153 | static struct reiserfs_xattr_handler * | ||
1154 | find_xattr_handler_prefix (const char *prefix) | ||
1155 | { | ||
1156 | struct reiserfs_xattr_handler *xah = NULL; | ||
1157 | struct list_head *p; | ||
1158 | |||
1159 | read_lock (&handler_lock); | ||
1160 | list_for_each (p, &xattr_handlers) { | ||
1161 | xah = list_entry (p, struct reiserfs_xattr_handler, handlers); | ||
1162 | if (strncmp (xah->prefix, prefix, strlen (xah->prefix)) == 0) | ||
1163 | break; | ||
1164 | xah = NULL; | ||
1165 | } | ||
1166 | |||
1167 | read_unlock (&handler_lock); | ||
1168 | return xah; | ||
1169 | } | ||
1170 | |||
1171 | static void | ||
1172 | __unregister_handlers (void) | ||
1173 | { | ||
1174 | struct reiserfs_xattr_handler *xah; | ||
1175 | struct list_head *p, *tmp; | ||
1176 | |||
1177 | list_for_each_safe (p, tmp, &xattr_handlers) { | ||
1178 | xah = list_entry (p, struct reiserfs_xattr_handler, handlers); | ||
1179 | if (xah->exit) | ||
1180 | xah->exit(); | ||
1181 | |||
1182 | list_del_init (p); | ||
1183 | } | ||
1184 | INIT_LIST_HEAD (&xattr_handlers); | ||
1185 | } | ||
1186 | |||
1187 | int __init | ||
1188 | reiserfs_xattr_register_handlers (void) | ||
1189 | { | ||
1190 | int err = 0; | ||
1191 | struct reiserfs_xattr_handler *xah; | ||
1192 | struct list_head *p; | ||
1193 | |||
1194 | write_lock (&handler_lock); | ||
1195 | |||
1196 | /* If we're already initialized, nothing to do */ | ||
1197 | if (!list_empty (&xattr_handlers)) { | ||
1198 | write_unlock (&handler_lock); | ||
1199 | return 0; | ||
1200 | } | ||
1201 | |||
1202 | /* Add the handlers */ | ||
1203 | list_add_tail (&user_handler.handlers, &xattr_handlers); | ||
1204 | list_add_tail (&trusted_handler.handlers, &xattr_handlers); | ||
1205 | #ifdef CONFIG_REISERFS_FS_SECURITY | ||
1206 | list_add_tail (&security_handler.handlers, &xattr_handlers); | ||
1207 | #endif | ||
1208 | #ifdef CONFIG_REISERFS_FS_POSIX_ACL | ||
1209 | list_add_tail (&posix_acl_access_handler.handlers, &xattr_handlers); | ||
1210 | list_add_tail (&posix_acl_default_handler.handlers, &xattr_handlers); | ||
1211 | #endif | ||
1212 | |||
1213 | /* Run initializers, if available */ | ||
1214 | list_for_each (p, &xattr_handlers) { | ||
1215 | xah = list_entry (p, struct reiserfs_xattr_handler, handlers); | ||
1216 | if (xah->init) { | ||
1217 | err = xah->init (); | ||
1218 | if (err) { | ||
1219 | list_del_init (p); | ||
1220 | break; | ||
1221 | } | ||
1222 | } | ||
1223 | } | ||
1224 | |||
1225 | /* Clean up other handlers, if any failed */ | ||
1226 | if (err) | ||
1227 | __unregister_handlers (); | ||
1228 | |||
1229 | write_unlock (&handler_lock); | ||
1230 | return err; | ||
1231 | } | ||
1232 | |||
1233 | void | ||
1234 | reiserfs_xattr_unregister_handlers (void) | ||
1235 | { | ||
1236 | write_lock (&handler_lock); | ||
1237 | __unregister_handlers (); | ||
1238 | write_unlock (&handler_lock); | ||
1239 | } | ||
1240 | |||
1241 | /* This will catch lookups from the fs root to .reiserfs_priv */ | ||
1242 | static int | ||
1243 | xattr_lookup_poison (struct dentry *dentry, struct qstr *q1, struct qstr *name) | ||
1244 | { | ||
1245 | struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root; | ||
1246 | if (name->len == priv_root->d_name.len && | ||
1247 | name->hash == priv_root->d_name.hash && | ||
1248 | !memcmp (name->name, priv_root->d_name.name, name->len)) { | ||
1249 | return -ENOENT; | ||
1250 | } else if (q1->len == name->len && | ||
1251 | !memcmp(q1->name, name->name, name->len)) | ||
1252 | return 0; | ||
1253 | return 1; | ||
1254 | } | ||
1255 | |||
1256 | static struct dentry_operations xattr_lookup_poison_ops = { | ||
1257 | .d_compare = xattr_lookup_poison, | ||
1258 | }; | ||
1259 | |||
1260 | |||
1261 | /* We need to take a copy of the mount flags since things like | ||
1262 | * MS_RDONLY don't get set until *after* we're called. | ||
1263 | * mount_flags != mount_options */ | ||
1264 | int | ||
1265 | reiserfs_xattr_init (struct super_block *s, int mount_flags) | ||
1266 | { | ||
1267 | int err = 0; | ||
1268 | |||
1269 | /* We need generation numbers to ensure that the oid mapping is correct | ||
1270 | * v3.5 filesystems don't have them. */ | ||
1271 | if (!old_format_only (s)) { | ||
1272 | set_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); | ||
1273 | } else if (reiserfs_xattrs_optional (s)) { | ||
1274 | /* Old format filesystem, but optional xattrs have been enabled | ||
1275 | * at mount time. Error out. */ | ||
1276 | reiserfs_warning (s, "xattrs/ACLs not supported on pre v3.6 " | ||
1277 | "format filesystem. Failing mount."); | ||
1278 | err = -EOPNOTSUPP; | ||
1279 | goto error; | ||
1280 | } else { | ||
1281 | /* Old format filesystem, but no optional xattrs have been enabled. This | ||
1282 | * means we silently disable xattrs on the filesystem. */ | ||
1283 | clear_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); | ||
1284 | } | ||
1285 | |||
1286 | /* If we don't have the privroot located yet - go find it */ | ||
1287 | if (reiserfs_xattrs (s) && !REISERFS_SB(s)->priv_root) { | ||
1288 | struct dentry *dentry; | ||
1289 | dentry = lookup_one_len (PRIVROOT_NAME, s->s_root, | ||
1290 | strlen (PRIVROOT_NAME)); | ||
1291 | if (!IS_ERR (dentry)) { | ||
1292 | if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) { | ||
1293 | struct inode *inode = dentry->d_parent->d_inode; | ||
1294 | down (&inode->i_sem); | ||
1295 | err = inode->i_op->mkdir (inode, dentry, 0700); | ||
1296 | up (&inode->i_sem); | ||
1297 | if (err) { | ||
1298 | dput (dentry); | ||
1299 | dentry = NULL; | ||
1300 | } | ||
1301 | |||
1302 | if (dentry && dentry->d_inode) | ||
1303 | reiserfs_warning (s, "Created %s on %s - reserved for " | ||
1304 | "xattr storage.", PRIVROOT_NAME, | ||
1305 | reiserfs_bdevname (inode->i_sb)); | ||
1306 | } else if (!dentry->d_inode) { | ||
1307 | dput (dentry); | ||
1308 | dentry = NULL; | ||
1309 | } | ||
1310 | } else | ||
1311 | err = PTR_ERR (dentry); | ||
1312 | |||
1313 | if (!err && dentry) { | ||
1314 | s->s_root->d_op = &xattr_lookup_poison_ops; | ||
1315 | reiserfs_mark_inode_private (dentry->d_inode); | ||
1316 | REISERFS_SB(s)->priv_root = dentry; | ||
1317 | } else if (!(mount_flags & MS_RDONLY)) { /* xattrs are unavailable */ | ||
1318 | /* If we're read-only it just means that the dir hasn't been | ||
1319 | * created. Not an error -- just no xattrs on the fs. We'll | ||
1320 | * check again if we go read-write */ | ||
1321 | reiserfs_warning (s, "xattrs/ACLs enabled and couldn't " | ||
1322 | "find/create .reiserfs_priv. Failing mount."); | ||
1323 | err = -EOPNOTSUPP; | ||
1324 | } | ||
1325 | } | ||
1326 | |||
1327 | error: | ||
1328 | /* This is only nonzero if there was an error initializing the xattr | ||
1329 | * directory or if there is a condition where we don't support them. */ | ||
1330 | if (err) { | ||
1331 | clear_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); | ||
1332 | clear_bit (REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); | ||
1333 | clear_bit (REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); | ||
1334 | } | ||
1335 | |||
1336 | /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ | ||
1337 | s->s_flags = s->s_flags & ~MS_POSIXACL; | ||
1338 | if (reiserfs_posixacl (s)) | ||
1339 | s->s_flags |= MS_POSIXACL; | ||
1340 | |||
1341 | return err; | ||
1342 | } | ||
1343 | |||
1344 | static int | ||
1345 | __reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd, | ||
1346 | int need_lock) | ||
1347 | { | ||
1348 | umode_t mode = inode->i_mode; | ||
1349 | |||
1350 | if (mask & MAY_WRITE) { | ||
1351 | /* | ||
1352 | * Nobody gets write access to a read-only fs. | ||
1353 | */ | ||
1354 | if (IS_RDONLY(inode) && | ||
1355 | (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) | ||
1356 | return -EROFS; | ||
1357 | |||
1358 | /* | ||
1359 | * Nobody gets write access to an immutable file. | ||
1360 | */ | ||
1361 | if (IS_IMMUTABLE(inode)) | ||
1362 | return -EACCES; | ||
1363 | } | ||
1364 | |||
1365 | /* We don't do permission checks on the internal objects. | ||
1366 | * Permissions are determined by the "owning" object. */ | ||
1367 | if (is_reiserfs_priv_object (inode)) | ||
1368 | return 0; | ||
1369 | |||
1370 | if (current->fsuid == inode->i_uid) { | ||
1371 | mode >>= 6; | ||
1372 | #ifdef CONFIG_REISERFS_FS_POSIX_ACL | ||
1373 | } else if (reiserfs_posixacl(inode->i_sb) && | ||
1374 | get_inode_sd_version (inode) != STAT_DATA_V1) { | ||
1375 | struct posix_acl *acl; | ||
1376 | |||
1377 | /* ACL can't contain additional permissions if | ||
1378 | the ACL_MASK entry is 0 */ | ||
1379 | if (!(mode & S_IRWXG)) | ||
1380 | goto check_groups; | ||
1381 | |||
1382 | if (need_lock) { | ||
1383 | reiserfs_read_lock_xattr_i (inode); | ||
1384 | reiserfs_read_lock_xattrs (inode->i_sb); | ||
1385 | } | ||
1386 | acl = reiserfs_get_acl (inode, ACL_TYPE_ACCESS); | ||
1387 | if (need_lock) { | ||
1388 | reiserfs_read_unlock_xattrs (inode->i_sb); | ||
1389 | reiserfs_read_unlock_xattr_i (inode); | ||
1390 | } | ||
1391 | if (IS_ERR (acl)) { | ||
1392 | if (PTR_ERR (acl) == -ENODATA) | ||
1393 | goto check_groups; | ||
1394 | return PTR_ERR (acl); | ||
1395 | } | ||
1396 | |||
1397 | if (acl) { | ||
1398 | int err = posix_acl_permission (inode, acl, mask); | ||
1399 | posix_acl_release (acl); | ||
1400 | if (err == -EACCES) { | ||
1401 | goto check_capabilities; | ||
1402 | } | ||
1403 | return err; | ||
1404 | } else { | ||
1405 | goto check_groups; | ||
1406 | } | ||
1407 | #endif | ||
1408 | } else { | ||
1409 | check_groups: | ||
1410 | if (in_group_p(inode->i_gid)) | ||
1411 | mode >>= 3; | ||
1412 | } | ||
1413 | |||
1414 | /* | ||
1415 | * If the DACs are ok we don't need any capability check. | ||
1416 | */ | ||
1417 | if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask)) | ||
1418 | return 0; | ||
1419 | |||
1420 | check_capabilities: | ||
1421 | /* | ||
1422 | * Read/write DACs are always overridable. | ||
1423 | * Executable DACs are overridable if at least one exec bit is set. | ||
1424 | */ | ||
1425 | if (!(mask & MAY_EXEC) || | ||
1426 | (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode)) | ||
1427 | if (capable(CAP_DAC_OVERRIDE)) | ||
1428 | return 0; | ||
1429 | |||
1430 | /* | ||
1431 | * Searching includes executable on directories, else just read. | ||
1432 | */ | ||
1433 | if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) | ||
1434 | if (capable(CAP_DAC_READ_SEARCH)) | ||
1435 | return 0; | ||
1436 | |||
1437 | return -EACCES; | ||
1438 | } | ||
1439 | |||
/*
 * Permission check that takes the xattr locks itself around the ACL
 * lookup (need_lock = 1).
 */
int
reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd)
{
	const int need_lock = 1;

	return __reiserfs_permission (inode, mask, nd, need_lock);
}
1445 | |||
/*
 * Permission check that does not take the xattr locks around the ACL
 * lookup (need_lock = 0); presumably for callers that already hold them.
 */
int
reiserfs_permission_locked (struct inode *inode, int mask, struct nameidata *nd)
{
	const int need_lock = 0;

	return __reiserfs_permission (inode, mask, nd, need_lock);
}
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c new file mode 100644 index 000000000000..e302071903a1 --- /dev/null +++ b/fs/reiserfs/xattr_acl.c | |||
@@ -0,0 +1,571 @@ | |||
1 | #include <linux/fs.h> | ||
2 | #include <linux/posix_acl.h> | ||
3 | #include <linux/reiserfs_fs.h> | ||
4 | #include <linux/errno.h> | ||
5 | #include <linux/pagemap.h> | ||
6 | #include <linux/xattr.h> | ||
7 | #include <linux/xattr_acl.h> | ||
8 | #include <linux/reiserfs_xattr.h> | ||
9 | #include <linux/reiserfs_acl.h> | ||
10 | #include <asm/uaccess.h> | ||
11 | |||
12 | static int reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl); | ||
13 | |||
14 | static int | ||
15 | xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) | ||
16 | { | ||
17 | struct posix_acl *acl; | ||
18 | int error; | ||
19 | |||
20 | if (!reiserfs_posixacl(inode->i_sb)) | ||
21 | return -EOPNOTSUPP; | ||
22 | if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) | ||
23 | return -EPERM; | ||
24 | |||
25 | if (value) { | ||
26 | acl = posix_acl_from_xattr(value, size); | ||
27 | if (IS_ERR(acl)) { | ||
28 | return PTR_ERR(acl); | ||
29 | } else if (acl) { | ||
30 | error = posix_acl_valid(acl); | ||
31 | if (error) | ||
32 | goto release_and_out; | ||
33 | } | ||
34 | } else | ||
35 | acl = NULL; | ||
36 | |||
37 | error = reiserfs_set_acl (inode, type, acl); | ||
38 | |||
39 | release_and_out: | ||
40 | posix_acl_release(acl); | ||
41 | return error; | ||
42 | } | ||
43 | |||
44 | |||
45 | static int | ||
46 | xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) | ||
47 | { | ||
48 | struct posix_acl *acl; | ||
49 | int error; | ||
50 | |||
51 | if (!reiserfs_posixacl(inode->i_sb)) | ||
52 | return -EOPNOTSUPP; | ||
53 | |||
54 | acl = reiserfs_get_acl (inode, type); | ||
55 | if (IS_ERR(acl)) | ||
56 | return PTR_ERR(acl); | ||
57 | if (acl == NULL) | ||
58 | return -ENODATA; | ||
59 | error = posix_acl_to_xattr(acl, buffer, size); | ||
60 | posix_acl_release(acl); | ||
61 | |||
62 | return error; | ||
63 | } | ||
64 | |||
65 | |||
66 | /* | ||
67 | * Convert from filesystem to in-memory representation. | ||
68 | */ | ||
69 | static struct posix_acl * | ||
70 | posix_acl_from_disk(const void *value, size_t size) | ||
71 | { | ||
72 | const char *end = (char *)value + size; | ||
73 | int n, count; | ||
74 | struct posix_acl *acl; | ||
75 | |||
76 | if (!value) | ||
77 | return NULL; | ||
78 | if (size < sizeof(reiserfs_acl_header)) | ||
79 | return ERR_PTR(-EINVAL); | ||
80 | if (((reiserfs_acl_header *)value)->a_version != | ||
81 | cpu_to_le32(REISERFS_ACL_VERSION)) | ||
82 | return ERR_PTR(-EINVAL); | ||
83 | value = (char *)value + sizeof(reiserfs_acl_header); | ||
84 | count = reiserfs_acl_count(size); | ||
85 | if (count < 0) | ||
86 | return ERR_PTR(-EINVAL); | ||
87 | if (count == 0) | ||
88 | return NULL; | ||
89 | acl = posix_acl_alloc(count, GFP_NOFS); | ||
90 | if (!acl) | ||
91 | return ERR_PTR(-ENOMEM); | ||
92 | for (n=0; n < count; n++) { | ||
93 | reiserfs_acl_entry *entry = | ||
94 | (reiserfs_acl_entry *)value; | ||
95 | if ((char *)value + sizeof(reiserfs_acl_entry_short) > end) | ||
96 | goto fail; | ||
97 | acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); | ||
98 | acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); | ||
99 | switch(acl->a_entries[n].e_tag) { | ||
100 | case ACL_USER_OBJ: | ||
101 | case ACL_GROUP_OBJ: | ||
102 | case ACL_MASK: | ||
103 | case ACL_OTHER: | ||
104 | value = (char *)value + | ||
105 | sizeof(reiserfs_acl_entry_short); | ||
106 | acl->a_entries[n].e_id = ACL_UNDEFINED_ID; | ||
107 | break; | ||
108 | |||
109 | case ACL_USER: | ||
110 | case ACL_GROUP: | ||
111 | value = (char *)value + sizeof(reiserfs_acl_entry); | ||
112 | if ((char *)value > end) | ||
113 | goto fail; | ||
114 | acl->a_entries[n].e_id = | ||
115 | le32_to_cpu(entry->e_id); | ||
116 | break; | ||
117 | |||
118 | default: | ||
119 | goto fail; | ||
120 | } | ||
121 | } | ||
122 | if (value != end) | ||
123 | goto fail; | ||
124 | return acl; | ||
125 | |||
126 | fail: | ||
127 | posix_acl_release(acl); | ||
128 | return ERR_PTR(-EINVAL); | ||
129 | } | ||
130 | |||
131 | /* | ||
132 | * Convert from in-memory to filesystem representation. | ||
133 | */ | ||
134 | static void * | ||
135 | posix_acl_to_disk(const struct posix_acl *acl, size_t *size) | ||
136 | { | ||
137 | reiserfs_acl_header *ext_acl; | ||
138 | char *e; | ||
139 | int n; | ||
140 | |||
141 | *size = reiserfs_acl_size(acl->a_count); | ||
142 | ext_acl = (reiserfs_acl_header *)kmalloc(sizeof(reiserfs_acl_header) + | ||
143 | acl->a_count * sizeof(reiserfs_acl_entry), GFP_NOFS); | ||
144 | if (!ext_acl) | ||
145 | return ERR_PTR(-ENOMEM); | ||
146 | ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION); | ||
147 | e = (char *)ext_acl + sizeof(reiserfs_acl_header); | ||
148 | for (n=0; n < acl->a_count; n++) { | ||
149 | reiserfs_acl_entry *entry = (reiserfs_acl_entry *)e; | ||
150 | entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); | ||
151 | entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); | ||
152 | switch(acl->a_entries[n].e_tag) { | ||
153 | case ACL_USER: | ||
154 | case ACL_GROUP: | ||
155 | entry->e_id = | ||
156 | cpu_to_le32(acl->a_entries[n].e_id); | ||
157 | e += sizeof(reiserfs_acl_entry); | ||
158 | break; | ||
159 | |||
160 | case ACL_USER_OBJ: | ||
161 | case ACL_GROUP_OBJ: | ||
162 | case ACL_MASK: | ||
163 | case ACL_OTHER: | ||
164 | e += sizeof(reiserfs_acl_entry_short); | ||
165 | break; | ||
166 | |||
167 | default: | ||
168 | goto fail; | ||
169 | } | ||
170 | } | ||
171 | return (char *)ext_acl; | ||
172 | |||
173 | fail: | ||
174 | kfree(ext_acl); | ||
175 | return ERR_PTR(-EINVAL); | ||
176 | } | ||
177 | |||
178 | /* | ||
179 | * Inode operation get_posix_acl(). | ||
180 | * | ||
181 | * inode->i_sem: down | ||
182 | * BKL held [before 2.5.x] | ||
183 | */ | ||
184 | struct posix_acl * | ||
185 | reiserfs_get_acl(struct inode *inode, int type) | ||
186 | { | ||
187 | char *name, *value; | ||
188 | struct posix_acl *acl, **p_acl; | ||
189 | size_t size; | ||
190 | int retval; | ||
191 | struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); | ||
192 | |||
193 | switch (type) { | ||
194 | case ACL_TYPE_ACCESS: | ||
195 | name = XATTR_NAME_ACL_ACCESS; | ||
196 | p_acl = &reiserfs_i->i_acl_access; | ||
197 | break; | ||
198 | case ACL_TYPE_DEFAULT: | ||
199 | name = XATTR_NAME_ACL_DEFAULT; | ||
200 | p_acl = &reiserfs_i->i_acl_default; | ||
201 | break; | ||
202 | default: | ||
203 | return ERR_PTR (-EINVAL); | ||
204 | } | ||
205 | |||
206 | if (IS_ERR (*p_acl)) { | ||
207 | if (PTR_ERR (*p_acl) == -ENODATA) | ||
208 | return NULL; | ||
209 | } else if (*p_acl != NULL) | ||
210 | return posix_acl_dup (*p_acl); | ||
211 | |||
212 | size = reiserfs_xattr_get (inode, name, NULL, 0); | ||
213 | if ((int)size < 0) { | ||
214 | if (size == -ENODATA || size == -ENOSYS) { | ||
215 | *p_acl = ERR_PTR (-ENODATA); | ||
216 | return NULL; | ||
217 | } | ||
218 | return ERR_PTR (size); | ||
219 | } | ||
220 | |||
221 | value = kmalloc (size, GFP_NOFS); | ||
222 | if (!value) | ||
223 | return ERR_PTR (-ENOMEM); | ||
224 | |||
225 | retval = reiserfs_xattr_get(inode, name, value, size); | ||
226 | if (retval == -ENODATA || retval == -ENOSYS) { | ||
227 | /* This shouldn't actually happen as it should have | ||
228 | been caught above.. but just in case */ | ||
229 | acl = NULL; | ||
230 | *p_acl = ERR_PTR (-ENODATA); | ||
231 | } else if (retval < 0) { | ||
232 | acl = ERR_PTR(retval); | ||
233 | } else { | ||
234 | acl = posix_acl_from_disk(value, retval); | ||
235 | *p_acl = posix_acl_dup (acl); | ||
236 | } | ||
237 | |||
238 | kfree(value); | ||
239 | return acl; | ||
240 | } | ||
241 | |||
242 | /* | ||
243 | * Inode operation set_posix_acl(). | ||
244 | * | ||
245 | * inode->i_sem: down | ||
246 | * BKL held [before 2.5.x] | ||
247 | */ | ||
248 | static int | ||
249 | reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) | ||
250 | { | ||
251 | char *name; | ||
252 | void *value = NULL; | ||
253 | struct posix_acl **p_acl; | ||
254 | size_t size; | ||
255 | int error; | ||
256 | struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); | ||
257 | |||
258 | if (S_ISLNK(inode->i_mode)) | ||
259 | return -EOPNOTSUPP; | ||
260 | |||
261 | switch (type) { | ||
262 | case ACL_TYPE_ACCESS: | ||
263 | name = XATTR_NAME_ACL_ACCESS; | ||
264 | p_acl = &reiserfs_i->i_acl_access; | ||
265 | if (acl) { | ||
266 | mode_t mode = inode->i_mode; | ||
267 | error = posix_acl_equiv_mode (acl, &mode); | ||
268 | if (error < 0) | ||
269 | return error; | ||
270 | else { | ||
271 | inode->i_mode = mode; | ||
272 | if (error == 0) | ||
273 | acl = NULL; | ||
274 | } | ||
275 | } | ||
276 | break; | ||
277 | case ACL_TYPE_DEFAULT: | ||
278 | name = XATTR_NAME_ACL_DEFAULT; | ||
279 | p_acl = &reiserfs_i->i_acl_default; | ||
280 | if (!S_ISDIR (inode->i_mode)) | ||
281 | return acl ? -EACCES : 0; | ||
282 | break; | ||
283 | default: | ||
284 | return -EINVAL; | ||
285 | } | ||
286 | |||
287 | if (acl) { | ||
288 | value = posix_acl_to_disk(acl, &size); | ||
289 | if (IS_ERR(value)) | ||
290 | return (int)PTR_ERR(value); | ||
291 | error = reiserfs_xattr_set(inode, name, value, size, 0); | ||
292 | } else { | ||
293 | error = reiserfs_xattr_del (inode, name); | ||
294 | if (error == -ENODATA) { | ||
295 | /* This may seem odd here, but it means that the ACL was set | ||
296 | * with a value representable with mode bits. If there was | ||
297 | * an ACL before, reiserfs_xattr_del already dirtied the inode. | ||
298 | */ | ||
299 | mark_inode_dirty (inode); | ||
300 | error = 0; | ||
301 | } | ||
302 | } | ||
303 | |||
304 | if (value) | ||
305 | kfree(value); | ||
306 | |||
307 | if (!error) { | ||
308 | /* Release the old one */ | ||
309 | if (!IS_ERR (*p_acl) && *p_acl) | ||
310 | posix_acl_release (*p_acl); | ||
311 | |||
312 | if (acl == NULL) | ||
313 | *p_acl = ERR_PTR (-ENODATA); | ||
314 | else | ||
315 | *p_acl = posix_acl_dup (acl); | ||
316 | } | ||
317 | |||
318 | return error; | ||
319 | } | ||
320 | |||
321 | /* dir->i_sem: down, | ||
322 | * inode is new and not released into the wild yet */ | ||
323 | int | ||
324 | reiserfs_inherit_default_acl (struct inode *dir, struct dentry *dentry, struct inode *inode) | ||
325 | { | ||
326 | struct posix_acl *acl; | ||
327 | int err = 0; | ||
328 | |||
329 | /* ACLs only get applied to files and directories */ | ||
330 | if (S_ISLNK (inode->i_mode)) | ||
331 | return 0; | ||
332 | |||
333 | /* ACLs can only be used on "new" objects, so if it's an old object | ||
334 | * there is nothing to inherit from */ | ||
335 | if (get_inode_sd_version (dir) == STAT_DATA_V1) | ||
336 | goto apply_umask; | ||
337 | |||
338 | /* Don't apply ACLs to objects in the .reiserfs_priv tree.. This | ||
339 | * would be useless since permissions are ignored, and a pain because | ||
340 | * it introduces locking cycles */ | ||
341 | if (is_reiserfs_priv_object (dir)) { | ||
342 | reiserfs_mark_inode_private (inode); | ||
343 | goto apply_umask; | ||
344 | } | ||
345 | |||
346 | acl = reiserfs_get_acl (dir, ACL_TYPE_DEFAULT); | ||
347 | if (IS_ERR (acl)) { | ||
348 | if (PTR_ERR (acl) == -ENODATA) | ||
349 | goto apply_umask; | ||
350 | return PTR_ERR (acl); | ||
351 | } | ||
352 | |||
353 | if (acl) { | ||
354 | struct posix_acl *acl_copy; | ||
355 | mode_t mode = inode->i_mode; | ||
356 | int need_acl; | ||
357 | |||
358 | /* Copy the default ACL to the default ACL of a new directory */ | ||
359 | if (S_ISDIR (inode->i_mode)) { | ||
360 | err = reiserfs_set_acl (inode, ACL_TYPE_DEFAULT, acl); | ||
361 | if (err) | ||
362 | goto cleanup; | ||
363 | } | ||
364 | |||
365 | /* Now we reconcile the new ACL and the mode, | ||
366 | potentially modifying both */ | ||
367 | acl_copy = posix_acl_clone (acl, GFP_NOFS); | ||
368 | if (!acl_copy) { | ||
369 | err = -ENOMEM; | ||
370 | goto cleanup; | ||
371 | } | ||
372 | |||
373 | |||
374 | need_acl = posix_acl_create_masq (acl_copy, &mode); | ||
375 | if (need_acl >= 0) { | ||
376 | if (mode != inode->i_mode) { | ||
377 | inode->i_mode = mode; | ||
378 | } | ||
379 | |||
380 | /* If we need an ACL.. */ | ||
381 | if (need_acl > 0) { | ||
382 | err = reiserfs_set_acl (inode, ACL_TYPE_ACCESS, acl_copy); | ||
383 | if (err) | ||
384 | goto cleanup_copy; | ||
385 | } | ||
386 | } | ||
387 | cleanup_copy: | ||
388 | posix_acl_release (acl_copy); | ||
389 | cleanup: | ||
390 | posix_acl_release (acl); | ||
391 | } else { | ||
392 | apply_umask: | ||
393 | /* no ACL, apply umask */ | ||
394 | inode->i_mode &= ~current->fs->umask; | ||
395 | } | ||
396 | |||
397 | return err; | ||
398 | } | ||
399 | |||
400 | /* Looks up and caches the result of the default ACL. | ||
401 | * We do this so that we don't need to carry the xattr_sem into | ||
402 | * reiserfs_new_inode if we don't need to */ | ||
403 | int | ||
404 | reiserfs_cache_default_acl (struct inode *inode) | ||
405 | { | ||
406 | int ret = 0; | ||
407 | if (reiserfs_posixacl (inode->i_sb) && | ||
408 | !is_reiserfs_priv_object (inode)) { | ||
409 | struct posix_acl *acl; | ||
410 | reiserfs_read_lock_xattr_i (inode); | ||
411 | reiserfs_read_lock_xattrs (inode->i_sb); | ||
412 | acl = reiserfs_get_acl (inode, ACL_TYPE_DEFAULT); | ||
413 | reiserfs_read_unlock_xattrs (inode->i_sb); | ||
414 | reiserfs_read_unlock_xattr_i (inode); | ||
415 | ret = acl ? 1 : 0; | ||
416 | posix_acl_release (acl); | ||
417 | } | ||
418 | |||
419 | return ret; | ||
420 | } | ||
421 | |||
422 | int | ||
423 | reiserfs_acl_chmod (struct inode *inode) | ||
424 | { | ||
425 | struct posix_acl *acl, *clone; | ||
426 | int error; | ||
427 | |||
428 | if (S_ISLNK(inode->i_mode)) | ||
429 | return -EOPNOTSUPP; | ||
430 | |||
431 | if (get_inode_sd_version (inode) == STAT_DATA_V1 || | ||
432 | !reiserfs_posixacl(inode->i_sb)) | ||
433 | { | ||
434 | return 0; | ||
435 | } | ||
436 | |||
437 | reiserfs_read_lock_xattrs (inode->i_sb); | ||
438 | acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); | ||
439 | reiserfs_read_unlock_xattrs (inode->i_sb); | ||
440 | if (!acl) | ||
441 | return 0; | ||
442 | if (IS_ERR(acl)) | ||
443 | return PTR_ERR(acl); | ||
444 | clone = posix_acl_clone(acl, GFP_NOFS); | ||
445 | posix_acl_release(acl); | ||
446 | if (!clone) | ||
447 | return -ENOMEM; | ||
448 | error = posix_acl_chmod_masq(clone, inode->i_mode); | ||
449 | if (!error) { | ||
450 | int lock = !has_xattr_dir (inode); | ||
451 | reiserfs_write_lock_xattr_i (inode); | ||
452 | if (lock) | ||
453 | reiserfs_write_lock_xattrs (inode->i_sb); | ||
454 | else | ||
455 | reiserfs_read_lock_xattrs (inode->i_sb); | ||
456 | error = reiserfs_set_acl(inode, ACL_TYPE_ACCESS, clone); | ||
457 | if (lock) | ||
458 | reiserfs_write_unlock_xattrs (inode->i_sb); | ||
459 | else | ||
460 | reiserfs_read_unlock_xattrs (inode->i_sb); | ||
461 | reiserfs_write_unlock_xattr_i (inode); | ||
462 | } | ||
463 | posix_acl_release(clone); | ||
464 | return error; | ||
465 | } | ||
466 | |||
467 | static int | ||
468 | posix_acl_access_get(struct inode *inode, const char *name, | ||
469 | void *buffer, size_t size) | ||
470 | { | ||
471 | if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) | ||
472 | return -EINVAL; | ||
473 | return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); | ||
474 | } | ||
475 | |||
476 | static int | ||
477 | posix_acl_access_set(struct inode *inode, const char *name, | ||
478 | const void *value, size_t size, int flags) | ||
479 | { | ||
480 | if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) | ||
481 | return -EINVAL; | ||
482 | return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); | ||
483 | } | ||
484 | |||
485 | static int | ||
486 | posix_acl_access_del (struct inode *inode, const char *name) | ||
487 | { | ||
488 | struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); | ||
489 | struct posix_acl **acl = &reiserfs_i->i_acl_access; | ||
490 | if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) | ||
491 | return -EINVAL; | ||
492 | if (!IS_ERR (*acl) && *acl) { | ||
493 | posix_acl_release (*acl); | ||
494 | *acl = ERR_PTR (-ENODATA); | ||
495 | } | ||
496 | |||
497 | return 0; | ||
498 | } | ||
499 | |||
500 | static int | ||
501 | posix_acl_access_list (struct inode *inode, const char *name, int namelen, char *out) | ||
502 | { | ||
503 | int len = namelen; | ||
504 | if (!reiserfs_posixacl (inode->i_sb)) | ||
505 | return 0; | ||
506 | if (out) | ||
507 | memcpy (out, name, len); | ||
508 | |||
509 | return len; | ||
510 | } | ||
511 | |||
512 | struct reiserfs_xattr_handler posix_acl_access_handler = { | ||
513 | .prefix = XATTR_NAME_ACL_ACCESS, | ||
514 | .get = posix_acl_access_get, | ||
515 | .set = posix_acl_access_set, | ||
516 | .del = posix_acl_access_del, | ||
517 | .list = posix_acl_access_list, | ||
518 | }; | ||
519 | |||
520 | static int | ||
521 | posix_acl_default_get (struct inode *inode, const char *name, | ||
522 | void *buffer, size_t size) | ||
523 | { | ||
524 | if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) | ||
525 | return -EINVAL; | ||
526 | return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); | ||
527 | } | ||
528 | |||
529 | static int | ||
530 | posix_acl_default_set(struct inode *inode, const char *name, | ||
531 | const void *value, size_t size, int flags) | ||
532 | { | ||
533 | if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) | ||
534 | return -EINVAL; | ||
535 | return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); | ||
536 | } | ||
537 | |||
538 | static int | ||
539 | posix_acl_default_del (struct inode *inode, const char *name) | ||
540 | { | ||
541 | struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); | ||
542 | struct posix_acl **acl = &reiserfs_i->i_acl_default; | ||
543 | if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) | ||
544 | return -EINVAL; | ||
545 | if (!IS_ERR (*acl) && *acl) { | ||
546 | posix_acl_release (*acl); | ||
547 | *acl = ERR_PTR (-ENODATA); | ||
548 | } | ||
549 | |||
550 | return 0; | ||
551 | } | ||
552 | |||
553 | static int | ||
554 | posix_acl_default_list (struct inode *inode, const char *name, int namelen, char *out) | ||
555 | { | ||
556 | int len = namelen; | ||
557 | if (!reiserfs_posixacl (inode->i_sb)) | ||
558 | return 0; | ||
559 | if (out) | ||
560 | memcpy (out, name, len); | ||
561 | |||
562 | return len; | ||
563 | } | ||
564 | |||
565 | struct reiserfs_xattr_handler posix_acl_default_handler = { | ||
566 | .prefix = XATTR_NAME_ACL_DEFAULT, | ||
567 | .get = posix_acl_default_get, | ||
568 | .set = posix_acl_default_set, | ||
569 | .del = posix_acl_default_del, | ||
570 | .list = posix_acl_default_list, | ||
571 | }; | ||
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c new file mode 100644 index 000000000000..e044d5117117 --- /dev/null +++ b/fs/reiserfs/xattr_security.c | |||
@@ -0,0 +1,69 @@ | |||
1 | #include <linux/reiserfs_fs.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/fs.h> | ||
4 | #include <linux/pagemap.h> | ||
5 | #include <linux/xattr.h> | ||
6 | #include <linux/reiserfs_xattr.h> | ||
7 | #include <asm/uaccess.h> | ||
8 | |||
9 | #define XATTR_SECURITY_PREFIX "security." | ||
10 | |||
11 | static int | ||
12 | security_get (struct inode *inode, const char *name, void *buffer, size_t size) | ||
13 | { | ||
14 | if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) | ||
15 | return -EINVAL; | ||
16 | |||
17 | if (is_reiserfs_priv_object(inode)) | ||
18 | return -EPERM; | ||
19 | |||
20 | return reiserfs_xattr_get (inode, name, buffer, size); | ||
21 | } | ||
22 | |||
23 | static int | ||
24 | security_set (struct inode *inode, const char *name, const void *buffer, | ||
25 | size_t size, int flags) | ||
26 | { | ||
27 | if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) | ||
28 | return -EINVAL; | ||
29 | |||
30 | if (is_reiserfs_priv_object(inode)) | ||
31 | return -EPERM; | ||
32 | |||
33 | return reiserfs_xattr_set (inode, name, buffer, size, flags); | ||
34 | } | ||
35 | |||
36 | static int | ||
37 | security_del (struct inode *inode, const char *name) | ||
38 | { | ||
39 | if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) | ||
40 | return -EINVAL; | ||
41 | |||
42 | if (is_reiserfs_priv_object(inode)) | ||
43 | return -EPERM; | ||
44 | |||
45 | return 0; | ||
46 | } | ||
47 | |||
48 | static int | ||
49 | security_list (struct inode *inode, const char *name, int namelen, char *out) | ||
50 | { | ||
51 | int len = namelen; | ||
52 | |||
53 | if (is_reiserfs_priv_object(inode)) | ||
54 | return 0; | ||
55 | |||
56 | if (out) | ||
57 | memcpy (out, name, len); | ||
58 | |||
59 | return len; | ||
60 | } | ||
61 | |||
62 | |||
63 | struct reiserfs_xattr_handler security_handler = { | ||
64 | .prefix = XATTR_SECURITY_PREFIX, | ||
65 | .get = security_get, | ||
66 | .set = security_set, | ||
67 | .del = security_del, | ||
68 | .list = security_list, | ||
69 | }; | ||
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c new file mode 100644 index 000000000000..43762197fb0a --- /dev/null +++ b/fs/reiserfs/xattr_trusted.c | |||
@@ -0,0 +1,81 @@ | |||
1 | #include <linux/reiserfs_fs.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/fs.h> | ||
4 | #include <linux/pagemap.h> | ||
5 | #include <linux/xattr.h> | ||
6 | #include <linux/reiserfs_xattr.h> | ||
7 | #include <asm/uaccess.h> | ||
8 | |||
9 | #define XATTR_TRUSTED_PREFIX "trusted." | ||
10 | |||
11 | static int | ||
12 | trusted_get (struct inode *inode, const char *name, void *buffer, size_t size) | ||
13 | { | ||
14 | if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) | ||
15 | return -EINVAL; | ||
16 | |||
17 | if (!reiserfs_xattrs (inode->i_sb)) | ||
18 | return -EOPNOTSUPP; | ||
19 | |||
20 | if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) | ||
21 | return -EPERM; | ||
22 | |||
23 | return reiserfs_xattr_get (inode, name, buffer, size); | ||
24 | } | ||
25 | |||
26 | static int | ||
27 | trusted_set (struct inode *inode, const char *name, const void *buffer, | ||
28 | size_t size, int flags) | ||
29 | { | ||
30 | if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) | ||
31 | return -EINVAL; | ||
32 | |||
33 | if (!reiserfs_xattrs (inode->i_sb)) | ||
34 | return -EOPNOTSUPP; | ||
35 | |||
36 | if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) | ||
37 | return -EPERM; | ||
38 | |||
39 | return reiserfs_xattr_set (inode, name, buffer, size, flags); | ||
40 | } | ||
41 | |||
42 | static int | ||
43 | trusted_del (struct inode *inode, const char *name) | ||
44 | { | ||
45 | if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) | ||
46 | return -EINVAL; | ||
47 | |||
48 | if (!reiserfs_xattrs (inode->i_sb)) | ||
49 | return -EOPNOTSUPP; | ||
50 | |||
51 | if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) | ||
52 | return -EPERM; | ||
53 | |||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static int | ||
58 | trusted_list (struct inode *inode, const char *name, int namelen, char *out) | ||
59 | { | ||
60 | int len = namelen; | ||
61 | |||
62 | if (!reiserfs_xattrs (inode->i_sb)) | ||
63 | return 0; | ||
64 | |||
65 | if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) | ||
66 | return 0; | ||
67 | |||
68 | if (out) | ||
69 | memcpy (out, name, len); | ||
70 | |||
71 | return len; | ||
72 | } | ||
73 | |||
74 | |||
75 | struct reiserfs_xattr_handler trusted_handler = { | ||
76 | .prefix = XATTR_TRUSTED_PREFIX, | ||
77 | .get = trusted_get, | ||
78 | .set = trusted_set, | ||
79 | .del = trusted_del, | ||
80 | .list = trusted_list, | ||
81 | }; | ||
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c new file mode 100644 index 000000000000..0772806466a8 --- /dev/null +++ b/fs/reiserfs/xattr_user.c | |||
@@ -0,0 +1,99 @@ | |||
1 | #include <linux/reiserfs_fs.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/fs.h> | ||
4 | #include <linux/pagemap.h> | ||
5 | #include <linux/xattr.h> | ||
6 | #include <linux/reiserfs_xattr.h> | ||
7 | #include <asm/uaccess.h> | ||
8 | |||
9 | #ifdef CONFIG_REISERFS_FS_POSIX_ACL | ||
10 | # include <linux/reiserfs_acl.h> | ||
11 | #endif | ||
12 | |||
13 | #define XATTR_USER_PREFIX "user." | ||
14 | |||
15 | static int | ||
16 | user_get (struct inode *inode, const char *name, void *buffer, size_t size) | ||
17 | { | ||
18 | |||
19 | int error; | ||
20 | |||
21 | if (strlen(name) < sizeof(XATTR_USER_PREFIX)) | ||
22 | return -EINVAL; | ||
23 | |||
24 | if (!reiserfs_xattrs_user (inode->i_sb)) | ||
25 | return -EOPNOTSUPP; | ||
26 | |||
27 | error = reiserfs_permission_locked (inode, MAY_READ, NULL); | ||
28 | if (error) | ||
29 | return error; | ||
30 | |||
31 | return reiserfs_xattr_get (inode, name, buffer, size); | ||
32 | } | ||
33 | |||
34 | static int | ||
35 | user_set (struct inode *inode, const char *name, const void *buffer, | ||
36 | size_t size, int flags) | ||
37 | { | ||
38 | |||
39 | int error; | ||
40 | |||
41 | if (strlen(name) < sizeof(XATTR_USER_PREFIX)) | ||
42 | return -EINVAL; | ||
43 | |||
44 | if (!reiserfs_xattrs_user (inode->i_sb)) | ||
45 | return -EOPNOTSUPP; | ||
46 | |||
47 | if (!S_ISREG (inode->i_mode) && | ||
48 | (!S_ISDIR (inode->i_mode) || inode->i_mode & S_ISVTX)) | ||
49 | return -EPERM; | ||
50 | |||
51 | error = reiserfs_permission_locked (inode, MAY_WRITE, NULL); | ||
52 | if (error) | ||
53 | return error; | ||
54 | |||
55 | return reiserfs_xattr_set (inode, name, buffer, size, flags); | ||
56 | } | ||
57 | |||
58 | static int | ||
59 | user_del (struct inode *inode, const char *name) | ||
60 | { | ||
61 | int error; | ||
62 | |||
63 | if (strlen(name) < sizeof(XATTR_USER_PREFIX)) | ||
64 | return -EINVAL; | ||
65 | |||
66 | if (!reiserfs_xattrs_user (inode->i_sb)) | ||
67 | return -EOPNOTSUPP; | ||
68 | |||
69 | if (!S_ISREG (inode->i_mode) && | ||
70 | (!S_ISDIR (inode->i_mode) || inode->i_mode & S_ISVTX)) | ||
71 | return -EPERM; | ||
72 | |||
73 | error = reiserfs_permission_locked (inode, MAY_WRITE, NULL); | ||
74 | if (error) | ||
75 | return error; | ||
76 | |||
77 | return 0; | ||
78 | } | ||
79 | |||
80 | static int | ||
81 | user_list (struct inode *inode, const char *name, int namelen, char *out) | ||
82 | { | ||
83 | int len = namelen; | ||
84 | if (!reiserfs_xattrs_user (inode->i_sb)) | ||
85 | return 0; | ||
86 | |||
87 | if (out) | ||
88 | memcpy (out, name, len); | ||
89 | |||
90 | return len; | ||
91 | } | ||
92 | |||
93 | struct reiserfs_xattr_handler user_handler = { | ||
94 | .prefix = XATTR_USER_PREFIX, | ||
95 | .get = user_get, | ||
96 | .set = user_set, | ||
97 | .del = user_del, | ||
98 | .list = user_list, | ||
99 | }; | ||