 119 files changed, 46657 insertions(+), 56 deletions(-)
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 7e17712f3229..74052d22d868 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -12,10 +12,14 @@ cifs.txt
   - description of the CIFS filesystem
 coda.txt
   - description of the CODA filesystem.
+configfs/
+  - directory containing configfs documentation and example code.
 cramfs.txt
   - info on the cram filesystem for small storage (ROMs etc)
 devfs/
   - directory containing devfs documentation.
+dlmfs.txt
+  - info on the userspace interface to the OCFS2 DLM.
 ext2.txt
   - info, mount options and specifications for the Ext2 filesystem.
 hpfs.txt
@@ -30,6 +34,8 @@ ntfs.txt
   - info and mount options for the NTFS filesystem (Windows NT).
 proc.txt
   - info on Linux's /proc filesystem.
+ocfs2.txt
+  - info and mount options for the OCFS2 clustered filesystem.
 romfs.txt
   - Description of the ROMFS filesystem.
 smbfs.txt
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
new file mode 100644
index 000000000000..c4ff96b7c4e0
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -0,0 +1,434 @@
+
+configfs - Userspace-driven kernel object configuration.
+
+Joel Becker <joel.becker@oracle.com>
+
+Updated: 31 March 2005
+
+Copyright (c) 2005 Oracle Corporation,
+	Joel Becker <joel.becker@oracle.com>
+
+
+[What is configfs?]
+
+configfs is a ram-based filesystem that provides the converse of
+sysfs's functionality.  Where sysfs is a filesystem-based view of
+kernel objects, configfs is a filesystem-based manager of kernel
+objects, or config_items.
+
+With sysfs, an object is created in kernel (for example, when a device
+is discovered) and it is registered with sysfs.  Its attributes then
+appear in sysfs, allowing userspace to read the attributes via
+readdir(3)/read(2).  It may allow some attributes to be modified via
+write(2).  The important point is that the object is created and
+destroyed in kernel, the kernel controls the lifecycle of the sysfs
+representation, and sysfs is merely a window on all this.
+
+A configfs config_item is created via an explicit userspace operation:
+mkdir(2).  It is destroyed via rmdir(2).  The attributes appear at
+mkdir(2) time, and can be read or modified via read(2) and write(2).
+As with sysfs, readdir(3) queries the list of items and/or attributes.
+symlink(2) can be used to group items together.  Unlike sysfs, the
+lifetime of the representation is completely driven by userspace.  The
+kernel modules backing the items must respond to this.
+
+Both sysfs and configfs can and should exist together on the same
+system.  One is not a replacement for the other.
+
+[Using configfs]
+
+configfs can be compiled as a module or into the kernel.  You can access
+it by doing
+
+	mount -t configfs none /config
+
+The configfs tree will be empty unless client modules are also loaded.
+These are modules that register their item types with configfs as
+subsystems.  Once a client subsystem is loaded, it will appear as a
+subdirectory (or more than one) under /config.  Like sysfs, the
+configfs tree is always there, whether mounted on /config or not.
+
+An item is created via mkdir(2).  The item's attributes will also
+appear at this time.  readdir(3) can determine what the attributes are,
+read(2) can query their default values, and write(2) can store new
+values.  Like sysfs, attributes should be ASCII text files, preferably
+with only one value per file.  The same efficiency caveats from sysfs
+apply.  Don't mix more than one attribute in one attribute file.
+
+Like sysfs, configfs expects write(2) to store the entire buffer at
+once.  When writing to configfs attributes, userspace processes should
+first read the entire file, modify the portions they wish to change, and
+then write the entire buffer back.  Attribute files have a maximum size
+of one page (PAGE_SIZE, 4096 on i386).
+
+When an item needs to be destroyed, remove it with rmdir(2).  An
+item cannot be destroyed if any other item has a link to it (via
+symlink(2)).  Links can be removed via unlink(2).
+
+[Configuring FakeNBD: an Example]
+
+Imagine there's a Network Block Device (NBD) driver that allows you to
+access remote block devices.  Call it FakeNBD.  FakeNBD uses configfs
+for its configuration.  Obviously, there will be a nice program that
+sysadmins use to configure FakeNBD, but somehow that program has to tell
+the driver about it.  Here's where configfs comes in.
+
+When the FakeNBD driver is loaded, it registers itself with configfs.
+readdir(3) sees this just fine:
+
+	# ls /config
+	fakenbd
+
+A fakenbd connection can be created with mkdir(2).  The name is
+arbitrary, but likely the tool will make some use of the name.  Perhaps
+it is a uuid or a disk name:
+
+	# mkdir /config/fakenbd/disk1
+	# ls /config/fakenbd/disk1
+	target device rw
+
+The target attribute contains the IP address of the server FakeNBD will
+connect to.  The device attribute is the device on the server.
+Predictably, the rw attribute determines whether the connection is
+read-only or read-write.
+
+	# echo 10.0.0.1 > /config/fakenbd/disk1/target
+	# echo /dev/sda1 > /config/fakenbd/disk1/device
+	# echo 1 > /config/fakenbd/disk1/rw
+
+That's it.  That's all there is.  Now the device is configured, via the
+shell no less.
+
+[Coding With configfs]
+
+Every object in configfs is a config_item.  A config_item reflects an
+object in the subsystem.  It has attributes that match values on that
+object.  configfs handles the filesystem representation of that object
+and its attributes, allowing the subsystem to ignore all but the
+basic show/store interaction.
+
+Items are created and destroyed inside a config_group.  A group is a
+collection of items that share the same attributes and operations.
+Items are created by mkdir(2) and removed by rmdir(2), but configfs
+handles that.  The group has a set of operations to perform these
+tasks.
+
+A subsystem is the top level of a client module.  During initialization,
+the client module registers the subsystem with configfs, and the
+subsystem appears as a directory at the top of the configfs filesystem.
+A subsystem is also a config_group, and can do everything a
+config_group can.
+
+[struct config_item]
+
+	struct config_item {
+		char			*ci_name;
+		char			ci_namebuf[UOBJ_NAME_LEN];
+		struct kref		ci_kref;
+		struct list_head	ci_entry;
+		struct config_item	*ci_parent;
+		struct config_group	*ci_group;
+		struct config_item_type	*ci_type;
+		struct dentry		*ci_dentry;
+	};
+
+	void config_item_init(struct config_item *);
+	void config_item_init_type_name(struct config_item *,
+					const char *name,
+					struct config_item_type *type);
+	struct config_item *config_item_get(struct config_item *);
+	void config_item_put(struct config_item *);
+
+Generally, struct config_item is embedded in a container structure, a
+structure that actually represents what the subsystem is doing.  The
+config_item portion of that structure is how the object interacts with
+configfs.
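+
+As a sketch (struct my_device and to_my_device() are hypothetical
+names, but the pattern matches configfs_example.c below):
+
+	struct my_device {
+		struct config_item item;
+		int speed;
+	};
+
+	static inline struct my_device *to_my_device(struct config_item *item)
+	{
+		return item ?
+			container_of(item, struct my_device, item) : NULL;
+	}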
+
+Whether statically defined in a source file or created by a parent
+config_group, a config_item must have one of the _init() functions
+called on it.  This initializes the reference count and sets up the
+appropriate fields.
+
+All users of a config_item should have a reference on it via
+config_item_get(), and drop the reference when they are done via
+config_item_put().
+
+By itself, a config_item cannot do much more than appear in configfs.
+Usually a subsystem wants the item to display and/or store attributes,
+among other things.  For that, it needs a type.
+
+[struct config_item_type]
+
+	struct configfs_item_operations {
+		void (*release)(struct config_item *);
+		ssize_t (*show_attribute)(struct config_item *,
+					  struct configfs_attribute *,
+					  char *);
+		ssize_t (*store_attribute)(struct config_item *,
+					   struct configfs_attribute *,
+					   const char *, size_t);
+		int (*allow_link)(struct config_item *src,
+				  struct config_item *target);
+		int (*drop_link)(struct config_item *src,
+				 struct config_item *target);
+	};
+
+	struct config_item_type {
+		struct module				*ct_owner;
+		struct configfs_item_operations		*ct_item_ops;
+		struct configfs_group_operations	*ct_group_ops;
+		struct configfs_attribute		**ct_attrs;
+	};
+
+The most basic function of a config_item_type is to define what
+operations can be performed on a config_item.  All items that have been
+allocated dynamically will need to provide the ct_item_ops->release()
+method.  This method is called when the config_item's reference count
+reaches zero.  Items that wish to display an attribute need to provide
+the ct_item_ops->show_attribute() method.  Similarly, storing a new
+attribute value uses the store_attribute() method.
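+
+A minimal release() sketch for the hypothetical my_device above
+(mirroring simple_child_release() in configfs_example.c):
+
+	static void my_device_release(struct config_item *item)
+	{
+		kfree(to_my_device(item));
+	}
+
+	static struct configfs_item_operations my_device_item_ops = {
+		.release	= my_device_release,
+	};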
+
+[struct configfs_attribute]
+
+	struct configfs_attribute {
+		char			*ca_name;
+		struct module		*ca_owner;
+		mode_t			ca_mode;
+	};
+
+When a config_item wants an attribute to appear as a file in the item's
+configfs directory, it must define a configfs_attribute describing it.
+It then adds the attribute to the NULL-terminated array
+config_item_type->ct_attrs.  When the item appears in configfs, the
+attribute file will appear with the configfs_attribute->ca_name
+filename.  configfs_attribute->ca_mode specifies the file permissions.
+
+If an attribute is readable and the config_item provides a
+ct_item_ops->show_attribute() method, that method will be called
+whenever userspace asks for a read(2) on the attribute.  The converse
+will happen for write(2).
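+
+Continuing the hypothetical my_device sketch, a "speed" attribute would
+follow the same pattern as simple_child_attr_storeme in
+configfs_example.c:
+
+	static struct configfs_attribute my_device_attr_speed = {
+		.ca_owner = THIS_MODULE,
+		.ca_name = "speed",
+		.ca_mode = S_IRUGO | S_IWUSR,
+	};
+
+	static struct configfs_attribute *my_device_attrs[] = {
+		&my_device_attr_speed,
+		NULL,
+	};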
+
+[struct config_group]
+
+A config_item cannot live in a vacuum.  The only way one can be created
+is via mkdir(2) on a config_group.  This will trigger creation of a
+child item.
+
+	struct config_group {
+		struct config_item		cg_item;
+		struct list_head		cg_children;
+		struct configfs_subsystem	*cg_subsys;
+		struct config_group		**default_groups;
+	};
+
+	void config_group_init(struct config_group *group);
+	void config_group_init_type_name(struct config_group *group,
+					 const char *name,
+					 struct config_item_type *type);
+
+
+The config_group structure contains a config_item.  Properly configuring
+that item means that a group can behave as an item in its own right.
+However, it can do more: it can create child items or groups.  This is
+accomplished via the group operations specified on the group's
+config_item_type.
+
+	struct configfs_group_operations {
+		struct config_item *(*make_item)(struct config_group *group,
+						 const char *name);
+		struct config_group *(*make_group)(struct config_group *group,
+						   const char *name);
+		int (*commit_item)(struct config_item *item);
+		void (*drop_item)(struct config_group *group,
+				  struct config_item *item);
+	};
+
+A group creates child items by providing the
+ct_group_ops->make_item() method.  If provided, this method is called
+from mkdir(2) in the group's directory.  The subsystem allocates a new
+config_item (or more likely, its container structure), initializes it,
+and returns it to configfs.  Configfs will then populate the filesystem
+tree to reflect the new item.
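+
+A hedged make_item() sketch for the hypothetical my_device
+(my_device_type is the config_item_type tying the pieces above
+together; the NULL return on allocation failure follows
+configfs_example.c):
+
+	static struct config_item *my_make_item(struct config_group *group,
+						const char *name)
+	{
+		struct my_device *dev;
+
+		dev = kmalloc(sizeof(*dev), GFP_KERNEL);
+		if (!dev)
+			return NULL;
+		memset(dev, 0, sizeof(*dev));
+
+		config_item_init_type_name(&dev->item, name,
+					   &my_device_type);
+
+		return &dev->item;
+	}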
+
+If the subsystem wants the child to be a group itself, the subsystem
+provides ct_group_ops->make_group().  Everything else behaves the same,
+using the group _init() functions on the group.
+
+Finally, when userspace calls rmdir(2) on the item or group,
+ct_group_ops->drop_item() is called.  As a config_group is also a
+config_item, there is no need for a separate drop_group() method.
+The subsystem must config_item_put() the reference that was initialized
+upon item allocation.  If a subsystem has no work to do, it may omit
+the ct_group_ops->drop_item() method, and configfs will call
+config_item_put() on the item on behalf of the subsystem.
+
+IMPORTANT: drop_item() is void, and as such cannot fail.  When rmdir(2)
+is called, configfs WILL remove the item from the filesystem tree
+(assuming that it has no children to keep it busy).  The subsystem is
+responsible for responding to this.  If the subsystem has references to
+the item in other threads, the memory is safe.  It may take some time
+for the item to actually disappear from the subsystem's usage.  But it
+is gone from configfs.
+
+A config_group cannot be removed while it still has child items.  This
+is implemented in the configfs rmdir(2) code.  ->drop_item() will not be
+called, as the item has not been dropped.  rmdir(2) will fail, as the
+directory is not empty.
+
+[struct configfs_subsystem]
+
+A subsystem must register itself, usually at module_init time.  This
+tells configfs to make the subsystem appear in the file tree.
+
+	struct configfs_subsystem {
+		struct config_group	su_group;
+		struct semaphore	su_sem;
+	};
+
+	int configfs_register_subsystem(struct configfs_subsystem *subsys);
+	void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
+
+A subsystem consists of a toplevel config_group and a semaphore.
+The group is where child config_items are created.  For a subsystem,
+this group is usually defined statically.  Before calling
+configfs_register_subsystem(), the subsystem must have initialized the
+group via the usual group _init() functions, and it must also have
+initialized the semaphore.
+
+When the register call returns, the subsystem is live, and it
+will be visible via configfs.  At that point, mkdir(2) can be called and
+the subsystem must be ready for it.
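+
+A hedged registration sketch (my_subsys and my_subsys_type are
+hypothetical; this mirrors configfs_example_init() in
+configfs_example.c below):
+
+	static struct configfs_subsystem my_subsys = {
+		.su_group = {
+			.cg_item = {
+				.ci_namebuf = "my_subsys",
+				.ci_type = &my_subsys_type,
+			},
+		},
+	};
+
+	/* in the module's init function: */
+	config_group_init(&my_subsys.su_group);
+	init_MUTEX(&my_subsys.su_sem);
+	ret = configfs_register_subsystem(&my_subsys);
+	if (ret)
+		return ret;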
+
+[An Example]
+
+The best example of these basic concepts is the simple_children
+subsystem/group and the simple_child item in configfs_example.c.  It
+shows a trivial object displaying and storing an attribute, and a simple
+group creating and destroying these children.
+
+[Hierarchy Navigation and the Subsystem Semaphore]
+
+There is an extra bonus that configfs provides.  The config_groups and
+config_items are arranged in a hierarchy due to the fact that they
+appear in a filesystem.  A subsystem is NEVER to touch the filesystem
+parts, but the subsystem might be interested in this hierarchy.  For
+this reason, the hierarchy is mirrored via the config_group->cg_children
+and config_item->ci_parent structure members.
+
+A subsystem can navigate the cg_children list and the ci_parent pointer
+to see the tree created by the subsystem.  This can race with configfs'
+management of the hierarchy, so configfs uses the subsystem semaphore to
+protect modifications.  Whenever a subsystem wants to navigate the
+hierarchy, it must do so under the protection of the subsystem
+semaphore.
+
+A subsystem will be prevented from acquiring the semaphore while a newly
+allocated item has not been linked into this hierarchy.  Similarly, it
+will not be able to acquire the semaphore while a dropping item has not
+yet been unlinked.  This means that an item's ci_parent pointer will
+never be NULL while the item is in configfs, and that an item will only
+be in its parent's cg_children list for the same duration.  This allows
+a subsystem to trust ci_parent and cg_children while they hold the
+semaphore.
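+
+A hedged navigation sketch, assuming (as suggested by struct
+config_item above) that ci_entry is the list member linking an item
+into its parent's cg_children:
+
+	struct config_item *child;
+
+	down(&my_subsys.su_sem);
+	list_for_each_entry(child, &group->cg_children, ci_entry)
+		printk(KERN_INFO "child: %s\n", child->ci_name);
+	up(&my_subsys.su_sem);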
+
+[Item Aggregation Via symlink(2)]
+
+configfs provides a simple group via the group->item parent/child
+relationship.  Often, however, a larger environment requires aggregation
+outside of the parent/child connection.  This is implemented via
+symlink(2).
+
+A config_item may provide the ct_item_ops->allow_link() and
+ct_item_ops->drop_link() methods.  If the ->allow_link() method exists,
+symlink(2) may be called with the config_item as the source of the link.
+These links are only allowed between configfs config_items.  Any
+symlink(2) attempt outside the configfs filesystem will be denied.
+
+When symlink(2) is called, the source config_item's ->allow_link()
+method is called with itself and a target item.  If the source item
+allows linking to the target item, it returns 0.  A source item may
+wish to reject a link if it only wants links to a certain type of
+object (say, in its own subsystem).
+
+When unlink(2) is called on the symbolic link, the source item is
+notified via the ->drop_link() method.  Like the ->drop_item() method,
+this is a void function and cannot return failure.  The subsystem is
+responsible for responding to the change.
+
+A config_item cannot be removed while it links to any other item, nor
+can it be removed while an item links to it.  Dangling symlinks are not
+allowed in configfs.
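+
+From userspace, aggregation is plain symlink(2).  A hedged sketch using
+the FakeNBD paths from above (fakegroup is an imaginary second
+subsystem whose items accept links to fakenbd items via
+->allow_link()):
+
+	# ln -s /config/fakenbd/disk1 /config/fakegroup/mirror1/disk1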
+
+[Automatically Created Subgroups]
+
+A new config_group may want to have two types of child config_items.
+While this could be codified by magic names in ->make_item(), it is much
+more explicit to have a method whereby userspace sees this divergence.
+
+Rather than have a group where some items behave differently than
+others, configfs provides a method whereby one or many subgroups are
+automatically created inside the parent at its creation.  Thus,
+mkdir("parent") results in "parent", "parent/subgroup1", up through
+"parent/subgroupN".  Items of type 1 can now be created in
+"parent/subgroup1", and items of type N can be created in
+"parent/subgroupN".
+
+These automatic subgroups, or default groups, do not preclude other
+children of the parent group.  If ct_group_ops->make_group() exists,
+other child groups can be created on the parent group directly.
+
+A configfs subsystem specifies default groups by filling in the
+NULL-terminated array default_groups on the config_group structure.
+Each group in that array is populated in the configfs tree at the same
+time as the parent group.  Similarly, they are removed at the same time
+as the parent.  No extra notification is provided.  When a ->drop_item()
+method call notifies the subsystem the parent group is going away, it
+also means that every default group child of that parent is going away.
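+
+A hedged sketch of wiring up default groups (the subgroup objects are
+hypothetical and are assumed to have been initialized with
+config_group_init_type_name() before the parent is made visible):
+
+	static struct config_group *my_default_groups[] = {
+		&my_subgroup1,
+		&my_subgroup2,
+		NULL,
+	};
+
+	parent_group->default_groups = my_default_groups;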
+
+As a consequence of this, default_groups cannot be removed directly via
+rmdir(2).  They also are not considered when rmdir(2) on the parent
+group is checking for children.
+
+[Committable Items]
+
+NOTE: Committable items are currently unimplemented.
+
+Some config_items cannot have a valid initial state.  That is, no
+default values can be specified for the item's attributes such that the
+item can do its work.  Userspace must configure one or more attributes,
+after which the subsystem can start whatever entity this item
+represents.
+
+Consider the FakeNBD device from above.  Without a target address *and*
+a target device, the subsystem has no idea what block device to import.
+The simple example assumes that the subsystem merely waits until all the
+appropriate attributes are configured, and then connects.  This will,
+indeed, work, but now every attribute store must check if the attributes
+are initialized.  Every attribute store must fire off the connection if
+that condition is met.
+
+Far better would be an explicit action notifying the subsystem that the
+config_item is ready to go.  More importantly, an explicit action allows
+the subsystem to provide feedback as to whether the attributes are
+initialized in a way that makes sense.  configfs provides this as
+committable items.
+
+configfs still uses only normal filesystem operations.  An item is
+committed via rename(2).  The item is moved from a directory where it
+can be modified to a directory where it cannot.
+
+Any group that provides the ct_group_ops->commit_item() method has
+committable items.  When this group appears in configfs, mkdir(2) will
+not work directly in the group.  Instead, the group will have two
+subdirectories: "live" and "pending".  The "live" directory does not
+support mkdir(2) or rmdir(2) either.  It only allows rename(2).  The
+"pending" directory does allow mkdir(2) and rmdir(2).  An item is
+created in the "pending" directory.  Its attributes can be modified at
+will.  Userspace commits the item by renaming it into the "live"
+directory.  At this point, the subsystem receives the ->commit_item()
+callback.  If all required attributes are filled to satisfaction, the
+method returns zero and the item is moved to the "live" directory.
+
+As rmdir(2) does not work in the "live" directory, an item must be
+shutdown, or "uncommitted".  Again, this is done via rename(2), this
+time from the "live" directory back to the "pending" one.  The subsystem
+is notified by the ct_group_ops->uncommit_object() method.
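+
+Were committable items implemented, the FakeNBD flow would look
+something like this hedged sketch:
+
+	# mkdir /config/fakenbd/pending/disk1
+	# echo 10.0.0.1 > /config/fakenbd/pending/disk1/target
+	# mv /config/fakenbd/pending/disk1 /config/fakenbd/live/disk1
+
+and later, to uncommit it:
+
+	# mv /config/fakenbd/live/disk1 /config/fakenbd/pending/disk1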
+
+
diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c
new file mode 100644
index 000000000000..f3c6e4946f98
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs_example.c
@@ -0,0 +1,474 @@
+/*
+ * vim: noexpandtab ts=8 sts=0 sw=8:
+ *
+ * configfs_example.c - This file is a demonstration module containing
+ *      a number of configfs subsystems.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+
+/*
+ * 01-childless
+ *
+ * This first example is a childless subsystem.  It cannot create
+ * any config_items.  It just has attributes.
+ *
+ * Note that we are enclosing the configfs_subsystem inside a container.
+ * This is not necessary if a subsystem has no attributes directly
+ * on the subsystem.  See the next example, 02-simple-children, for
+ * such a subsystem.
+ */
+
+struct childless {
+	struct configfs_subsystem subsys;
+	int showme;
+	int storeme;
+};
+
+struct childless_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct childless *, char *);
+	ssize_t (*store)(struct childless *, const char *, size_t);
+};
+
+static inline struct childless *to_childless(struct config_item *item)
+{
+	return item ?
+		container_of(to_configfs_subsystem(to_config_group(item)),
+			     struct childless, subsys) : NULL;
+}
+
+static ssize_t childless_showme_read(struct childless *childless,
+				     char *page)
+{
+	ssize_t pos;
+
+	pos = sprintf(page, "%d\n", childless->showme);
+	childless->showme++;
+
+	return pos;
+}
+
+static ssize_t childless_storeme_read(struct childless *childless,
+				      char *page)
+{
+	return sprintf(page, "%d\n", childless->storeme);
+}
+
+static ssize_t childless_storeme_write(struct childless *childless,
+				       const char *page,
+				       size_t count)
+{
+	unsigned long tmp;
+	char *p = (char *) page;
+
+	tmp = simple_strtoul(p, &p, 10);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp > INT_MAX)
+		return -ERANGE;
+
+	childless->storeme = tmp;
+
+	return count;
+}
+
+static ssize_t childless_description_read(struct childless *childless,
+					  char *page)
+{
+	return sprintf(page,
+"[01-childless]\n"
+"\n"
+"The childless subsystem is the simplest possible subsystem in\n"
+"configfs.  It does not support the creation of child config_items.\n"
+"It only has a few attributes.  In fact, it isn't much different\n"
+"than a directory in /proc.\n");
+}
+
+static struct childless_attribute childless_attr_showme = {
+	.attr	= { .ca_owner = THIS_MODULE, .ca_name = "showme", .ca_mode = S_IRUGO },
+	.show	= childless_showme_read,
+};
+static struct childless_attribute childless_attr_storeme = {
+	.attr	= { .ca_owner = THIS_MODULE, .ca_name = "storeme", .ca_mode = S_IRUGO | S_IWUSR },
+	.show	= childless_storeme_read,
+	.store	= childless_storeme_write,
+};
+static struct childless_attribute childless_attr_description = {
+	.attr	= { .ca_owner = THIS_MODULE, .ca_name = "description", .ca_mode = S_IRUGO },
+	.show	= childless_description_read,
+};
+
+static struct configfs_attribute *childless_attrs[] = {
+	&childless_attr_showme.attr,
+	&childless_attr_storeme.attr,
+	&childless_attr_description.attr,
+	NULL,
+};
+
+static ssize_t childless_attr_show(struct config_item *item,
+				   struct configfs_attribute *attr,
+				   char *page)
+{
+	struct childless *childless = to_childless(item);
+	struct childless_attribute *childless_attr =
+		container_of(attr, struct childless_attribute, attr);
+	ssize_t ret = 0;
+
+	if (childless_attr->show)
+		ret = childless_attr->show(childless, page);
+	return ret;
+}
+
+static ssize_t childless_attr_store(struct config_item *item,
+				    struct configfs_attribute *attr,
+				    const char *page, size_t count)
+{
+	struct childless *childless = to_childless(item);
+	struct childless_attribute *childless_attr =
+		container_of(attr, struct childless_attribute, attr);
+	ssize_t ret = -EINVAL;
+
+	if (childless_attr->store)
+		ret = childless_attr->store(childless, page, count);
+	return ret;
+}
+
+static struct configfs_item_operations childless_item_ops = {
+	.show_attribute		= childless_attr_show,
+	.store_attribute	= childless_attr_store,
+};
+
+static struct config_item_type childless_type = {
+	.ct_item_ops	= &childless_item_ops,
+	.ct_attrs	= childless_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct childless childless_subsys = {
+	.subsys = {
+		.su_group = {
+			.cg_item = {
+				.ci_namebuf = "01-childless",
+				.ci_type = &childless_type,
+			},
+		},
+	},
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 02-simple-children
+ *
+ * This example merely has a simple one-attribute child.  Note that
+ * there is no extra attribute structure, as the child's attribute is
+ * known from the get-go.  Also, there is no container for the
+ * subsystem, as it has no attributes of its own.
+ */
+
+struct simple_child {
+	struct config_item item;
+	int storeme;
+};
+
+static inline struct simple_child *to_simple_child(struct config_item *item)
+{
+	return item ? container_of(item, struct simple_child, item) : NULL;
+}
+
+static struct configfs_attribute simple_child_attr_storeme = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "storeme",
+	.ca_mode = S_IRUGO | S_IWUSR,
+};
+
+static struct configfs_attribute *simple_child_attrs[] = {
+	&simple_child_attr_storeme,
+	NULL,
+};
+
+static ssize_t simple_child_attr_show(struct config_item *item,
+				      struct configfs_attribute *attr,
+				      char *page)
+{
+	ssize_t count;
+	struct simple_child *simple_child = to_simple_child(item);
+
+	count = sprintf(page, "%d\n", simple_child->storeme);
+
+	return count;
+}
+
+static ssize_t simple_child_attr_store(struct config_item *item,
+				       struct configfs_attribute *attr,
+				       const char *page, size_t count)
+{
+	struct simple_child *simple_child = to_simple_child(item);
+	unsigned long tmp;
+	char *p = (char *) page;
+
+	tmp = simple_strtoul(p, &p, 10);
+	if (!p || (*p && (*p != '\n')))
+		return -EINVAL;
+
+	if (tmp > INT_MAX)
+		return -ERANGE;
+
+	simple_child->storeme = tmp;
+
+	return count;
+}
+
+static void simple_child_release(struct config_item *item)
+{
+	kfree(to_simple_child(item));
+}
+
+static struct configfs_item_operations simple_child_item_ops = {
+	.release		= simple_child_release,
+	.show_attribute		= simple_child_attr_show,
+	.store_attribute	= simple_child_attr_store,
+};
+
+static struct config_item_type simple_child_type = {
+	.ct_item_ops	= &simple_child_item_ops,
+	.ct_attrs	= simple_child_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+
+static struct config_item *simple_children_make_item(struct config_group *group,
+						     const char *name)
+{
+	struct simple_child *simple_child;
+
+	simple_child = kmalloc(sizeof(struct simple_child), GFP_KERNEL);
+	if (!simple_child)
+		return NULL;
+
+	memset(simple_child, 0, sizeof(struct simple_child));
+
+	config_item_init_type_name(&simple_child->item, name,
+				   &simple_child_type);
+
+	simple_child->storeme = 0;
+
+	return &simple_child->item;
+}
+
+static struct configfs_attribute simple_children_attr_description = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "description",
+	.ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *simple_children_attrs[] = {
+	&simple_children_attr_description,
+	NULL,
+};
+
+static ssize_t simple_children_attr_show(struct config_item *item,
+					 struct configfs_attribute *attr,
+					 char *page)
+{
+	return sprintf(page,
+"[02-simple-children]\n"
+"\n"
+"This subsystem allows the creation of child config_items.  These\n"
+"items have only one attribute that is readable and writeable.\n");
+}
+
+static struct configfs_item_operations simple_children_item_ops = {
+	.show_attribute	= simple_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations simple_children_group_ops = {
+	.make_item	= simple_children_make_item,
+};
+
+static struct config_item_type simple_children_type = {
+	.ct_item_ops	= &simple_children_item_ops,
+	.ct_group_ops	= &simple_children_group_ops,
+	.ct_attrs	= simple_children_attrs,
+};
+
+static struct configfs_subsystem simple_children_subsys = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf = "02-simple-children",
+			.ci_type = &simple_children_type,
+		},
+	},
+};
+
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * 03-group-children
+ *
+ * This example reuses the simple_children group from above.  However,
+ * the simple_children group is not the subsystem itself, it is a
+ * child of the subsystem.  Creation of a group in the subsystem creates
+ * a new simple_children group.  That group can then have simple_child
+ * children of its own.
+ */
+
+struct simple_children {
+	struct config_group group;
+};
+
+static struct config_group *group_children_make_group(struct config_group *group,
+						      const char *name)
+{
+	struct simple_children *simple_children;
+
+	simple_children = kmalloc(sizeof(struct simple_children),
+				  GFP_KERNEL);
+	if (!simple_children)
+		return NULL;
+
+	memset(simple_children, 0, sizeof(struct simple_children));
+
+	config_group_init_type_name(&simple_children->group, name,
+				    &simple_children_type);
+
+	return &simple_children->group;
+}
+
+static struct configfs_attribute group_children_attr_description = {
+	.ca_owner = THIS_MODULE,
+	.ca_name = "description",
+	.ca_mode = S_IRUGO,
+};
+
+static struct configfs_attribute *group_children_attrs[] = {
+	&group_children_attr_description,
+	NULL,
+};
+
+static ssize_t group_children_attr_show(struct config_item *item,
+					struct configfs_attribute *attr,
+					char *page)
+{
+	return sprintf(page,
+"[03-group-children]\n"
+"\n"
+"This subsystem allows the creation of child config_groups.  These\n"
+"groups are like the subsystem simple-children.\n");
+}
+
+static struct configfs_item_operations group_children_item_ops = {
+	.show_attribute	= group_children_attr_show,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations group_children_group_ops = {
+	.make_group	= group_children_make_group,
+};
+
+static struct config_item_type group_children_type = {
+	.ct_item_ops	= &group_children_item_ops,
+	.ct_group_ops	= &group_children_group_ops,
+	.ct_attrs	= group_children_attrs,
+};
+
+static struct configfs_subsystem group_children_subsys = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf = "03-group-children",
+			.ci_type = &group_children_type,
+		},
+	},
+};
+
+/* ----------------------------------------------------------------- */
+
+/*
+ * We're now done with our subsystem definitions.
+ * For convenience in this module, here's a list of them all.  It
+ * allows the init function to easily register them.  Most modules
+ * will only have one subsystem, and will only call register_subsystem
+ * on it directly.
+ */
+static struct configfs_subsystem *example_subsys[] = {
+	&childless_subsys.subsys,
+	&simple_children_subsys,
+	&group_children_subsys,
+	NULL,
+};
+
+static int __init configfs_example_init(void)
+{
+	int ret;
+	int i;
+	struct configfs_subsystem *subsys;
+
+	for (i = 0; example_subsys[i]; i++) {
+		subsys = example_subsys[i];
+
+		config_group_init(&subsys->su_group);
+		init_MUTEX(&subsys->su_sem);
+		ret = configfs_register_subsystem(subsys);
+		if (ret) {
+			printk(KERN_ERR "Error %d while registering subsystem %s\n",
+			       ret,
+			       subsys->su_group.cg_item.ci_namebuf);
+			goto out_unregister;
+		}
+	}
+
+	return 0;
+
+out_unregister:
+	/* Unwind only the subsystems that registered successfully;
+	 * example_subsys[i] itself failed to register. */
+	for (i--; i >= 0; i--) {
+		configfs_unregister_subsystem(example_subsys[i]);
+	}
+
+	return ret;
+}
+
+static void __exit configfs_example_exit(void)
+{
+	int i;
+
+	for (i = 0; example_subsys[i]; i++) {
+		configfs_unregister_subsystem(example_subsys[i]);
+	}
+}
+
+module_init(configfs_example_init);
+module_exit(configfs_example_exit);
+MODULE_LICENSE("GPL");
diff --git a/Documentation/filesystems/dlmfs.txt b/Documentation/filesystems/dlmfs.txt
new file mode 100644
index 000000000000..9afab845a906
--- /dev/null
+++ b/Documentation/filesystems/dlmfs.txt
@@ -0,0 +1,130 @@
+dlmfs
+==================
+A minimal DLM userspace interface implemented via a virtual file
+system.
+
+dlmfs is built with OCFS2 as it requires most of its infrastructure.
+
+Project web page:    http://oss.oracle.com/projects/ocfs2
+Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS
+=======
+
+Some code taken from ramfs which is Copyright (C) 2000 Linus Torvalds
+and Transmeta Corp.
+
+Mark Fasheh <mark.fasheh@oracle.com>
+
+Caveats
+=======
+- Right now it only works with the OCFS2 DLM, though support for other
+  DLM implementations should not be a major issue.
+
+Mount options
+=============
+None
+
+Usage
+=====
+
+If you're just interested in OCFS2, then please see ocfs2.txt.  The
+rest of this document is geared towards those who want to use dlmfs
+for easy-to-setup and easy-to-use clustered locking in userspace.
+
+Setup
+=====
+
+dlmfs requires that the OCFS2 cluster infrastructure be in
+place.  Please download ocfs2-tools from the above url and configure a
+cluster.
+
+You'll want to start heartbeating on a volume which all the nodes in
+your lockspace can access.  The easiest way to do this is via
+ocfs2_hb_ctl (distributed with ocfs2-tools).  Right now it requires
+that an OCFS2 file system be in place so that it can automatically
+find its heartbeat area, though it will eventually support heartbeat
+against raw disks.
+
+Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed
+with ocfs2-tools.
+
+Once you're heartbeating, DLM lock 'domains' can be easily created /
+destroyed and locks within them accessed.
+
+Locking
+=======
+
+Users may access dlmfs via standard file system calls, or they can use
+'libo2dlm' (distributed with ocfs2-tools) which abstracts the file
+system calls and presents a more traditional locking api.
+
+dlmfs handles lock caching automatically for the user, so a lock
+request for an already acquired lock will not generate another DLM
+call.  Userspace programs are assumed to handle their own local
+locking.
+
+Two levels of locks are supported - Shared Read, and Exclusive.
+Also supported is a Trylock operation.
+
+For information on the libo2dlm interface, please see o2dlm.h,
+distributed with ocfs2-tools.
+
+Lock value blocks can be read and written to a resource via read(2)
+and write(2) against the fd obtained via your open(2) call.  The
+maximum currently supported LVB length is 64 bytes (though that is an
+OCFS2 DLM limitation).  Through this mechanism, users of dlmfs can
+share small amounts of data amongst their nodes.
+
+mkdir(2) signals dlmfs to join a domain (which will have the same name
+as the resulting directory).
+
+rmdir(2) signals dlmfs to leave the domain.
+
+Locks for a given domain are represented by regular inodes inside the
+domain directory.  Locking against them is done via the open(2) system
+call.
+
+The open(2) call will not return until your lock has been granted or
+an error has occurred, unless it has been instructed to do a trylock
+operation.  If the lock succeeds, you'll get an fd.
+
+Use open(2) with O_CREAT to ensure the resource inode is created -
+dlmfs does not automatically create inodes for existing lock
+resources.
+
+Open Flag     Lock Request Type
+---------     -----------------
+O_RDONLY      Shared Read
+O_RDWR        Exclusive
+
+Open Flag     Resulting Locking Behavior
+---------     --------------------------
+O_NONBLOCK    Trylock operation
+
+You must provide exactly one of O_RDONLY or O_RDWR.
+
+If O_NONBLOCK is also provided and the trylock operation was valid but
+could not lock the resource then open(2) will return ETXTBSY.
+
+close(2) drops the lock associated with your fd.
+
+Modes passed to mkdir(2) or open(2) are adhered to locally.  Chown is
+supported locally as well.  This means you can use them to restrict
+access to the resources via dlmfs on your local node only.
+
+The resource LVB may be read from the fd in either Shared Read or
+Exclusive modes via the read(2) system call.  It can be written via
+write(2) only when open in Exclusive mode.
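+
+Putting the above together, a hedged userspace sketch (assuming dlmfs
+is mounted at /dlm; the domain and lock names are made up):
+
+	#include <stdio.h>
+	#include <errno.h>
+	#include <fcntl.h>
+	#include <unistd.h>
+	#include <sys/stat.h>
+
+	int main(void)
+	{
+		char lvb[64];
+		int fd;
+
+		mkdir("/dlm/mydomain", 0755);	/* join the domain */
+
+		/* Trylock in Exclusive mode, creating the resource
+		 * inode if it does not yet exist. */
+		fd = open("/dlm/mydomain/mylock",
+			  O_CREAT | O_RDWR | O_NONBLOCK, 0644);
+		if (fd < 0) {
+			if (errno == ETXTBSY)
+				fprintf(stderr, "lock is busy\n");
+			return 1;
+		}
+
+		read(fd, lvb, sizeof(lvb));	/* read the LVB */
+		lseek(fd, 0, SEEK_SET);
+		write(fd, "hello", 5);		/* update the LVB */
+
+		close(fd);			/* drop the lock */
+		return 0;
+	}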
+
+Once written, an LVB will be visible to other nodes that obtain Read
+Only or higher level locks on the resource.
+
+See Also
+========
+http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
+
+For more information on the VMS distributed locking API.
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
new file mode 100644
index 000000000000..f2595caf052e
--- /dev/null
+++ b/Documentation/filesystems/ocfs2.txt
@@ -0,0 +1,55 @@
+OCFS2 filesystem
+==================
+OCFS2 is a general purpose extent based shared disk cluster file
+system with many similarities to ext3.  It supports 64 bit inode
+numbers, and has automatically extending metadata groups which may
+also make it attractive for non-clustered use.
+
+You'll want to install the ocfs2-tools package in order to at least
+get "mount.ocfs2" and "ocfs2_hb_ctl".
+
+Project web page:    http://oss.oracle.com/projects/ocfs2
+Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS:
+Lots of code taken from ext3 and other projects.
+
+Authors in alphabetical order:
+Joel Becker   <joel.becker@oracle.com>
+Zach Brown    <zach.brown@oracle.com>
+Mark Fasheh   <mark.fasheh@oracle.com>
+Kurt Hackel   <kurt.hackel@oracle.com>
+Sunil Mushran <sunil.mushran@oracle.com>
+Manish Singh  <manish.singh@oracle.com>
+
+Caveats
+=======
+Features which OCFS2 does not support yet:
+	- sparse files
+	- extended attributes
+	- shared writeable mmap
+	- loopback is supported, but data written will not
+	  be cluster coherent.
+	- quotas
+	- cluster aware flock
+	- Directory change notification (F_NOTIFY)
+	- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
+	- POSIX ACLs
+	- readpages / writepages (not user visible)
+
+Mount options
+=============
+
+OCFS2 supports the following mount options:
+(*) == default
+
+barrier=1		This enables/disables barriers.  barrier=0 disables it,
+			barrier=1 enables it.
+errors=remount-ro(*)	Remount the filesystem read-only on an error.
+errors=panic		Panic and halt the machine if an error occurs.
+intr		(*)	Allow signals to interrupt cluster operations.
+nointr			Do not allow signals to interrupt cluster
+			operations.
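+
+For example, a hedged mount invocation (the device and mount point
+are made up):
+
+	# mount -t ocfs2 -o errors=panic,nointr /dev/sdb1 /mnt/shared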
diff --git a/MAINTAINERS b/MAINTAINERS
index 5daae53bf975..e9db0d6b928a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -554,6 +554,11 @@ W: http://us1.samba.org/samba/Linux_CIFS_client.html
 T:	git kernel.org:/pub/scm/linux/kernel/git/sfrench/cifs-2.6.git
 S:	Supported
 
+CONFIGFS
+P:	Joel Becker
+M:	Joel Becker <joel.becker@oracle.com>
+S:	Supported
+
 CIRRUS LOGIC GENERIC FBDEV DRIVER
 P:	Jeff Garzik
 M:	jgarzik@pobox.com
@@ -1898,6 +1903,15 @@ M: ajoshi@shell.unixbox.com
 L:	linux-nvidia@lists.surfsouth.com
 S:	Maintained
 
+ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
+P:	Mark Fasheh
+M:	mark.fasheh@oracle.com
+P:	Kurt Hackel
+M:	kurt.hackel@oracle.com
+L:	ocfs2-devel@oss.oracle.com
+W:	http://oss.oracle.com/projects/ocfs2/
+S:	Supported
+
 OLYMPIC NETWORK DRIVER
 P:	Peter De Shrijver
 M:	p2@ace.ulyssis.student.kuleuven.ac.be
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 96c664af8d06..a452b13620a2 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -213,7 +213,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 	struct address_space_operations *aops = mapping->a_ops;
 	pgoff_t index;
 	unsigned offset, bv_offs;
-	int len, ret = 0;
+	int len, ret;
 
 	down(&mapping->host->i_sem);
 	index = pos >> PAGE_CACHE_SHIFT;
@@ -232,9 +232,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 		page = grab_cache_page(mapping, index);
 		if (unlikely(!page))
 			goto fail;
-		if (unlikely(aops->prepare_write(file, page, offset,
-				offset + size)))
+		ret = aops->prepare_write(file, page, offset,
+					  offset + size);
+		if (unlikely(ret)) {
+			if (ret == AOP_TRUNCATED_PAGE) {
+				page_cache_release(page);
+				continue;
+			}
 			goto unlock;
+		}
 		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
 				bvec->bv_page, bv_offs, size, IV);
 		if (unlikely(transfer_result)) {
@@ -251,9 +257,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 			kunmap_atomic(kaddr, KM_USER0);
 		}
 		flush_dcache_page(page);
-		if (unlikely(aops->commit_write(file, page, offset,
-				offset + size)))
+		ret = aops->commit_write(file, page, offset,
+					 offset + size);
+		if (unlikely(ret)) {
+			if (ret == AOP_TRUNCATED_PAGE) {
+				page_cache_release(page);
+				continue;
+			}
 			goto unlock;
+		}
 		if (unlikely(transfer_result))
 			goto unlock;
 		bv_offs += size;
@@ -264,6 +276,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 		unlock_page(page);
 		page_cache_release(page);
 	}
+	ret = 0;
 out:
 	up(&mapping->host->i_sem);
 	return ret;
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index 68c60a5bcdab..ffd6abd6d5a0 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -154,7 +154,7 @@ static int ramdisk_commit_write(struct file *file, struct page *page,
 
 /*
  * ->writepage to the the blockdev's mapping has to redirty the page so that the
- * VM doesn't go and steal it.  We return WRITEPAGE_ACTIVATE so that the VM
+ * VM doesn't go and steal it.  We return AOP_WRITEPAGE_ACTIVATE so that the VM
  * won't try to (pointlessly) write the page again for a while.
  *
  * Really, these pages should not be on the LRU at all.
@@ -165,7 +165,7 @@ static int ramdisk_writepage(struct page *page, struct writeback_control *wbc)
 	make_page_uptodate(page);
 	SetPageDirty(page);
 	if (wbc->for_reclaim)
-		return WRITEPAGE_ACTIVATE;
+		return AOP_WRITEPAGE_ACTIVATE;
 	unlock_page(page);
 	return 0;
 }
diff --git a/fs/Kconfig b/fs/Kconfig index d5255e627b5f..382e3b2883d5 100644 --- a/fs/Kconfig +++ b/fs/Kconfig | |||
@@ -70,6 +70,7 @@ config FS_XIP | |||
70 | 70 | ||
71 | config EXT3_FS | 71 | config EXT3_FS |
72 | tristate "Ext3 journalling file system support" | 72 | tristate "Ext3 journalling file system support" |
73 | select JBD | ||
73 | help | 74 | help |
74 | This is the journaling version of the Second extended file system | 75 | This is the journaling version of the Second extended file system |
75 | (often called ext3), the de facto standard Linux file system | 76 | (often called ext3), the de facto standard Linux file system |
@@ -138,23 +139,20 @@ config EXT3_FS_SECURITY | |||
138 | extended attributes for file security labels, say N. | 139 | extended attributes for file security labels, say N. |
139 | 140 | ||
140 | config JBD | 141 | config JBD |
141 | # CONFIG_JBD could be its own option (even modular), but until there are | ||
142 | # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS | ||
143 | # dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS | ||
144 | tristate | 142 | tristate |
145 | default EXT3_FS | ||
146 | help | 143 | help |
147 | This is a generic journaling layer for block devices. It is | 144 | This is a generic journaling layer for block devices. It is |
148 | currently used by the ext3 file system, but it could also be used to | 145 | currently used by the ext3 and OCFS2 file systems, but it could |
149 | add journal support to other file systems or block devices such as | 146 | also be used to add journal support to other file systems or block |
150 | RAID or LVM. | 147 | devices such as RAID or LVM. |
151 | 148 | ||
152 | If you are using the ext3 file system, you need to say Y here. If | 149 | If you are using the ext3 or OCFS2 file systems, you need to |
153 | you are not using ext3 then you will probably want to say N. | 150 | say Y here. If you are not using ext3 or OCFS2 then you will probably |
151 | want to say N. | ||
154 | 152 | ||
155 | To compile this device as a module, choose M here: the module will be | 153 | To compile this device as a module, choose M here: the module will be |
156 | called jbd. If you are compiling ext3 into the kernel, you cannot | 154 | called jbd. If you are compiling ext3 or OCFS2 into the kernel, |
157 | compile this code as a module. | 155 | you cannot compile this code as a module. |
158 | 156 | ||
159 | config JBD_DEBUG | 157 | config JBD_DEBUG |
160 | bool "JBD (ext3) debugging support" | 158 | bool "JBD (ext3) debugging support" |
@@ -326,6 +324,38 @@ config FS_POSIX_ACL | |||
326 | 324 | ||
327 | source "fs/xfs/Kconfig" | 325 | source "fs/xfs/Kconfig" |
328 | 326 | ||
327 | config OCFS2_FS | ||
328 | tristate "OCFS2 file system support (EXPERIMENTAL)" | ||
329 | depends on NET && EXPERIMENTAL | ||
330 | select CONFIGFS_FS | ||
331 | select JBD | ||
332 | select CRC32 | ||
333 | select INET | ||
334 | help | ||
335 | OCFS2 is a general purpose extent based shared disk cluster file | ||
336 | system with many similarities to ext3. It supports 64 bit inode | ||
337 | numbers, and has automatically extending metadata groups which may | ||
338 | also make it attractive for non-clustered use. | ||
339 | |||
340 | You'll want to install the ocfs2-tools package in order to at least | ||
341 | get "mount.ocfs2". | ||
342 | |||
343 | Project web page: http://oss.oracle.com/projects/ocfs2 | ||
344 | Tools web page: http://oss.oracle.com/projects/ocfs2-tools | ||
345 | OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ | ||
346 | |||
347 | Note: Features which OCFS2 does not support yet: | ||
348 | - extended attributes | ||
349 | - shared writeable mmap | ||
350 | - loopback is supported, but data written will not | ||
351 | be cluster coherent. | ||
352 | - quotas | ||
353 | - cluster aware flock | ||
354 | - Directory change notification (F_NOTIFY) | ||
355 | - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease) | ||
356 | - POSIX ACLs | ||
357 | - readpages / writepages (not user visible) | ||
358 | |||
329 | config MINIX_FS | 359 | config MINIX_FS |
330 | tristate "Minix fs support" | 360 | tristate "Minix fs support" |
331 | help | 361 | help |
@@ -841,6 +871,20 @@ config RELAYFS_FS | |||
841 | 871 | ||
842 | If unsure, say N. | 872 | If unsure, say N. |
843 | 873 | ||
874 | config CONFIGFS_FS | ||
875 | tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)" | ||
876 | depends on EXPERIMENTAL | ||
877 | help | ||
878 | configfs is a ram-based filesystem that provides the converse | ||
879 | of sysfs's functionality. Where sysfs is a filesystem-based | ||
880 | view of kernel objects, configfs is a filesystem-based manager | ||
881 | of kernel objects, or config_items. | ||
882 | |||
883 | Both sysfs and configfs can and should exist together on the | ||
884 | same system. One is not a replacement for the other. | ||
885 | |||
886 | If unsure, say N. | ||
887 | |||
844 | endmenu | 888 | endmenu |
845 | 889 | ||
846 | menu "Miscellaneous filesystems" | 890 | menu "Miscellaneous filesystems" |
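
The CONFIGFS_FS entry above describes a filesystem whose objects are driven from userspace; the configfs_mkdir()/configfs_rmdir() handlers added in fs/configfs/dir.c below are what service it. A hedged userspace sketch of that lifecycle (the /config mount point, the subsystem, item, and attribute names are all assumptions for illustration):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>

int main(void)
{
	int fd;

	/* mkdir(2) asks the kernel to create a config_item */
	if (mkdir("/config/example_subsys/myitem", 0755))
		perror("mkdir");

	/* its attributes appear as regular files; write(2) configures them */
	fd = open("/config/example_subsys/myitem/some_attr", O_WRONLY);
	if (fd >= 0) {
		write(fd, "1\n", 2);
		close(fd);
	}

	/* rmdir(2) destroys the item again */
	rmdir("/config/example_subsys/myitem");
	return 0;
}
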
diff --git a/fs/Makefile b/fs/Makefile index 4c2655759078..73676111ebbe 100644 --- a/fs/Makefile +++ b/fs/Makefile | |||
@@ -101,3 +101,5 @@ obj-$(CONFIG_BEFS_FS) += befs/ | |||
101 | obj-$(CONFIG_HOSTFS) += hostfs/ | 101 | obj-$(CONFIG_HOSTFS) += hostfs/ |
102 | obj-$(CONFIG_HPPFS) += hppfs/ | 102 | obj-$(CONFIG_HPPFS) += hppfs/ |
103 | obj-$(CONFIG_DEBUG_FS) += debugfs/ | 103 | obj-$(CONFIG_DEBUG_FS) += debugfs/ |
104 | obj-$(CONFIG_CONFIGFS_FS) += configfs/ | ||
105 | obj-$(CONFIG_OCFS2_FS) += ocfs2/ | ||
diff --git a/fs/configfs/Makefile b/fs/configfs/Makefile new file mode 100644 index 000000000000..00ffb278e98c --- /dev/null +++ b/fs/configfs/Makefile | |||
@@ -0,0 +1,7 @@ | |||
1 | # | ||
2 | # Makefile for the configfs virtual filesystem | ||
3 | # | ||
4 | |||
5 | obj-$(CONFIG_CONFIGFS_FS) += configfs.o | ||
6 | |||
7 | configfs-objs := inode.o file.o dir.o symlink.o mount.o item.o | ||
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h new file mode 100644 index 000000000000..8899d9c5f6bf --- /dev/null +++ b/fs/configfs/configfs_internal.h | |||
@@ -0,0 +1,142 @@ | |||
1 | /* -*- mode: c; c-basic-offset:8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * configfs_internal.h - Internal stuff for configfs | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 02111-1307, USA. | ||
20 | * | ||
21 | * Based on sysfs: | ||
22 | * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel | ||
23 | * | ||
24 | * configfs Copyright (C) 2005 Oracle. All rights reserved. | ||
25 | */ | ||
26 | |||
27 | #include <linux/slab.h> | ||
28 | #include <linux/list.h> | ||
29 | |||
30 | struct configfs_dirent { | ||
31 | atomic_t s_count; | ||
32 | struct list_head s_sibling; | ||
33 | struct list_head s_children; | ||
34 | struct list_head s_links; | ||
35 | void * s_element; | ||
36 | int s_type; | ||
37 | umode_t s_mode; | ||
38 | struct dentry * s_dentry; | ||
39 | }; | ||
40 | |||
41 | #define CONFIGFS_ROOT 0x0001 | ||
42 | #define CONFIGFS_DIR 0x0002 | ||
43 | #define CONFIGFS_ITEM_ATTR 0x0004 | ||
44 | #define CONFIGFS_ITEM_LINK 0x0020 | ||
45 | #define CONFIGFS_USET_DIR 0x0040 | ||
46 | #define CONFIGFS_USET_DEFAULT 0x0080 | ||
47 | #define CONFIGFS_USET_DROPPING 0x0100 | ||
48 | #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) | ||
49 | |||
50 | extern struct vfsmount * configfs_mount; | ||
51 | |||
52 | extern int configfs_is_root(struct config_item *item); | ||
53 | |||
54 | extern struct inode * configfs_new_inode(mode_t mode); | ||
55 | extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *)); | ||
56 | |||
57 | extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); | ||
58 | extern int configfs_make_dirent(struct configfs_dirent *, | ||
59 | struct dentry *, void *, umode_t, int); | ||
60 | |||
61 | extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int); | ||
62 | extern void configfs_hash_and_remove(struct dentry * dir, const char * name); | ||
63 | |||
64 | extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); | ||
65 | extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); | ||
66 | |||
67 | extern int configfs_pin_fs(void); | ||
68 | extern void configfs_release_fs(void); | ||
69 | |||
70 | extern struct rw_semaphore configfs_rename_sem; | ||
71 | extern struct super_block * configfs_sb; | ||
72 | extern struct file_operations configfs_dir_operations; | ||
73 | extern struct file_operations configfs_file_operations; | ||
74 | extern struct file_operations bin_fops; | ||
75 | extern struct inode_operations configfs_dir_inode_operations; | ||
76 | extern struct inode_operations configfs_symlink_inode_operations; | ||
77 | |||
78 | extern int configfs_symlink(struct inode *dir, struct dentry *dentry, | ||
79 | const char *symname); | ||
80 | extern int configfs_unlink(struct inode *dir, struct dentry *dentry); | ||
81 | |||
82 | struct configfs_symlink { | ||
83 | struct list_head sl_list; | ||
84 | struct config_item *sl_target; | ||
85 | }; | ||
86 | |||
87 | extern int configfs_create_link(struct configfs_symlink *sl, | ||
88 | struct dentry *parent, | ||
89 | struct dentry *dentry); | ||
90 | |||
91 | static inline struct config_item * to_item(struct dentry * dentry) | ||
92 | { | ||
93 | struct configfs_dirent * sd = dentry->d_fsdata; | ||
94 | return ((struct config_item *) sd->s_element); | ||
95 | } | ||
96 | |||
97 | static inline struct configfs_attribute * to_attr(struct dentry * dentry) | ||
98 | { | ||
99 | struct configfs_dirent * sd = dentry->d_fsdata; | ||
100 | return ((struct configfs_attribute *) sd->s_element); | ||
101 | } | ||
102 | |||
103 | static inline struct config_item *configfs_get_config_item(struct dentry *dentry) | ||
104 | { | ||
105 | struct config_item * item = NULL; | ||
106 | |||
107 | spin_lock(&dcache_lock); | ||
108 | if (!d_unhashed(dentry)) { | ||
109 | struct configfs_dirent * sd = dentry->d_fsdata; | ||
110 | if (sd->s_type & CONFIGFS_ITEM_LINK) { | ||
111 | struct configfs_symlink * sl = sd->s_element; | ||
112 | item = config_item_get(sl->sl_target); | ||
113 | } else | ||
114 | item = config_item_get(sd->s_element); | ||
115 | } | ||
116 | spin_unlock(&dcache_lock); | ||
117 | |||
118 | return item; | ||
119 | } | ||
120 | |||
121 | static inline void release_configfs_dirent(struct configfs_dirent * sd) | ||
122 | { | ||
123 | if (!(sd->s_type & CONFIGFS_ROOT)) | ||
124 | kfree(sd); | ||
125 | } | ||
126 | |||
127 | static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd) | ||
128 | { | ||
129 | if (sd) { | ||
130 | WARN_ON(!atomic_read(&sd->s_count)); | ||
131 | atomic_inc(&sd->s_count); | ||
132 | } | ||
133 | return sd; | ||
134 | } | ||
135 | |||
136 | static inline void configfs_put(struct configfs_dirent * sd) | ||
137 | { | ||
138 | WARN_ON(!atomic_read(&sd->s_count)); | ||
139 | if (atomic_dec_and_test(&sd->s_count)) | ||
140 | release_configfs_dirent(sd); | ||
141 | } | ||
142 | |||
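
configfs_get() and configfs_put() above implement a plain atomic reference count on configfs_dirent, with the root dirent exempt from kfree(). A short sketch of the discipline the rest of this patch follows when touching a dentry's d_fsdata (illustrative only, not code from the patch):

static void example_use(struct dentry *dentry)
{
	struct configfs_dirent *sd = dentry->d_fsdata;

	sd = configfs_get(sd);	/* pin it before working on it */
	/* ... inspect sd->s_element, walk sd->s_children, etc. ... */
	configfs_put(sd);	/* may kfree(sd) unless it is the root */
}
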
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c new file mode 100644 index 000000000000..e48b539243a1 --- /dev/null +++ b/fs/configfs/dir.c | |||
@@ -0,0 +1,1102 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dir.c - Operations for configfs directories. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 02111-1307, USA. | ||
20 | * | ||
21 | * Based on sysfs: | ||
22 | * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel | ||
23 | * | ||
24 | * configfs Copyright (C) 2005 Oracle. All rights reserved. | ||
25 | */ | ||
26 | |||
27 | #undef DEBUG | ||
28 | |||
29 | #include <linux/fs.h> | ||
30 | #include <linux/mount.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <linux/slab.h> | ||
33 | |||
34 | #include <linux/configfs.h> | ||
35 | #include "configfs_internal.h" | ||
36 | |||
37 | DECLARE_RWSEM(configfs_rename_sem); | ||
38 | |||
39 | static void configfs_d_iput(struct dentry * dentry, | ||
40 | struct inode * inode) | ||
41 | { | ||
42 | struct configfs_dirent * sd = dentry->d_fsdata; | ||
43 | |||
44 | if (sd) { | ||
45 | BUG_ON(sd->s_dentry != dentry); | ||
46 | sd->s_dentry = NULL; | ||
47 | configfs_put(sd); | ||
48 | } | ||
49 | iput(inode); | ||
50 | } | ||
51 | |||
52 | /* | ||
53 | * We _must_ delete our dentries on last dput, as the chain-to-parent | ||
54 | * behavior is required to clear the parents of default_groups. | ||
55 | */ | ||
56 | static int configfs_d_delete(struct dentry *dentry) | ||
57 | { | ||
58 | return 1; | ||
59 | } | ||
60 | |||
61 | static struct dentry_operations configfs_dentry_ops = { | ||
62 | .d_iput = configfs_d_iput, | ||
63 | /* simple_delete_dentry() isn't exported */ | ||
64 | .d_delete = configfs_d_delete, | ||
65 | }; | ||
66 | |||
67 | /* | ||
68 | * Allocates a new configfs_dirent and links it to the parent configfs_dirent | ||
69 | */ | ||
70 | static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * parent_sd, | ||
71 | void * element) | ||
72 | { | ||
73 | struct configfs_dirent * sd; | ||
74 | |||
75 | sd = kmalloc(sizeof(*sd), GFP_KERNEL); | ||
76 | if (!sd) | ||
77 | return NULL; | ||
78 | |||
79 | memset(sd, 0, sizeof(*sd)); | ||
80 | atomic_set(&sd->s_count, 1); | ||
81 | INIT_LIST_HEAD(&sd->s_links); | ||
82 | INIT_LIST_HEAD(&sd->s_children); | ||
83 | list_add(&sd->s_sibling, &parent_sd->s_children); | ||
84 | sd->s_element = element; | ||
85 | |||
86 | return sd; | ||
87 | } | ||
88 | |||
89 | int configfs_make_dirent(struct configfs_dirent * parent_sd, | ||
90 | struct dentry * dentry, void * element, | ||
91 | umode_t mode, int type) | ||
92 | { | ||
93 | struct configfs_dirent * sd; | ||
94 | |||
95 | sd = configfs_new_dirent(parent_sd, element); | ||
96 | if (!sd) | ||
97 | return -ENOMEM; | ||
98 | |||
99 | sd->s_mode = mode; | ||
100 | sd->s_type = type; | ||
101 | sd->s_dentry = dentry; | ||
102 | if (dentry) { | ||
103 | dentry->d_fsdata = configfs_get(sd); | ||
104 | dentry->d_op = &configfs_dentry_ops; | ||
105 | } | ||
106 | |||
107 | return 0; | ||
108 | } | ||
109 | |||
110 | static int init_dir(struct inode * inode) | ||
111 | { | ||
112 | inode->i_op = &configfs_dir_inode_operations; | ||
113 | inode->i_fop = &configfs_dir_operations; | ||
114 | |||
115 | /* directory inodes start off with i_nlink == 2 (for "." entry) */ | ||
116 | inode->i_nlink++; | ||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | static int init_file(struct inode * inode) | ||
121 | { | ||
122 | inode->i_size = PAGE_SIZE; | ||
123 | inode->i_fop = &configfs_file_operations; | ||
124 | return 0; | ||
125 | } | ||
126 | |||
127 | static int init_symlink(struct inode * inode) | ||
128 | { | ||
129 | inode->i_op = &configfs_symlink_inode_operations; | ||
130 | return 0; | ||
131 | } | ||
132 | |||
133 | static int create_dir(struct config_item * k, struct dentry * p, | ||
134 | struct dentry * d) | ||
135 | { | ||
136 | int error; | ||
137 | umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; | ||
138 | |||
139 | error = configfs_create(d, mode, init_dir); | ||
140 | if (!error) { | ||
141 | error = configfs_make_dirent(p->d_fsdata, d, k, mode, | ||
142 | CONFIGFS_DIR); | ||
143 | if (!error) { | ||
144 | p->d_inode->i_nlink++; | ||
145 | d->d_op = &configfs_dentry_ops; | ||
146 | } | ||
147 | } | ||
148 | return error; | ||
149 | } | ||
150 | |||
151 | |||
152 | /** | ||
153 | * configfs_create_dir - create a directory for a config_item. | ||
154 | * @item: config_item we're creating the directory for. | ||
155 | * @dentry: config_item's dentry. | ||
156 | */ | ||
157 | |||
158 | static int configfs_create_dir(struct config_item * item, struct dentry *dentry) | ||
159 | { | ||
160 | struct dentry * parent; | ||
161 | int error = 0; | ||
162 | |||
163 | BUG_ON(!item); | ||
164 | |||
165 | if (item->ci_parent) | ||
166 | parent = item->ci_parent->ci_dentry; | ||
167 | else if (configfs_mount && configfs_mount->mnt_sb) | ||
168 | parent = configfs_mount->mnt_sb->s_root; | ||
169 | else | ||
170 | return -EFAULT; | ||
171 | |||
172 | error = create_dir(item,parent,dentry); | ||
173 | if (!error) | ||
174 | item->ci_dentry = dentry; | ||
175 | return error; | ||
176 | } | ||
177 | |||
178 | int configfs_create_link(struct configfs_symlink *sl, | ||
179 | struct dentry *parent, | ||
180 | struct dentry *dentry) | ||
181 | { | ||
182 | int err = 0; | ||
183 | umode_t mode = S_IFLNK | S_IRWXUGO; | ||
184 | |||
185 | err = configfs_create(dentry, mode, init_symlink); | ||
186 | if (!err) { | ||
187 | err = configfs_make_dirent(parent->d_fsdata, dentry, sl, | ||
188 | mode, CONFIGFS_ITEM_LINK); | ||
189 | if (!err) | ||
190 | dentry->d_op = &configfs_dentry_ops; | ||
191 | } | ||
192 | return err; | ||
193 | } | ||
194 | |||
195 | static void remove_dir(struct dentry * d) | ||
196 | { | ||
197 | struct dentry * parent = dget(d->d_parent); | ||
198 | struct configfs_dirent * sd; | ||
199 | |||
200 | sd = d->d_fsdata; | ||
201 | list_del_init(&sd->s_sibling); | ||
202 | configfs_put(sd); | ||
203 | if (d->d_inode) | ||
204 | simple_rmdir(parent->d_inode,d); | ||
205 | |||
206 | pr_debug(" o %s removing done (%d)\n",d->d_name.name, | ||
207 | atomic_read(&d->d_count)); | ||
208 | |||
209 | dput(parent); | ||
210 | } | ||
211 | |||
212 | /** | ||
213 | * configfs_remove_dir - remove a config_item's directory. | ||
214 | * @item: config_item we're removing. | ||
215 | * | ||
216 | * The only thing special about this is that we remove any files in | ||
217 | * the directory before we remove the directory, and we've inlined | ||
218 | * what used to be configfs_rmdir() below, instead of calling it separately. | ||
219 | */ | ||
220 | |||
221 | static void configfs_remove_dir(struct config_item * item) | ||
222 | { | ||
223 | struct dentry * dentry = dget(item->ci_dentry); | ||
224 | |||
225 | if (!dentry) | ||
226 | return; | ||
227 | |||
228 | remove_dir(dentry); | ||
229 | /** | ||
230 | * Drop reference from dget() on entrance. | ||
231 | */ | ||
232 | dput(dentry); | ||
233 | } | ||
234 | |||
235 | |||
236 | /* attaches attribute's configfs_dirent to the dentry corresponding to the | ||
237 | * attribute file | ||
238 | */ | ||
239 | static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry) | ||
240 | { | ||
241 | struct configfs_attribute * attr = sd->s_element; | ||
242 | int error; | ||
243 | |||
244 | error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file); | ||
245 | if (error) | ||
246 | return error; | ||
247 | |||
248 | dentry->d_op = &configfs_dentry_ops; | ||
249 | dentry->d_fsdata = configfs_get(sd); | ||
250 | sd->s_dentry = dentry; | ||
251 | d_rehash(dentry); | ||
252 | |||
253 | return 0; | ||
254 | } | ||
255 | |||
256 | static struct dentry * configfs_lookup(struct inode *dir, | ||
257 | struct dentry *dentry, | ||
258 | struct nameidata *nd) | ||
259 | { | ||
260 | struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; | ||
261 | struct configfs_dirent * sd; | ||
262 | int found = 0; | ||
263 | int err = 0; | ||
264 | |||
265 | list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { | ||
266 | if (sd->s_type & CONFIGFS_NOT_PINNED) { | ||
267 | const unsigned char * name = configfs_get_name(sd); | ||
268 | |||
269 | if (strcmp(name, dentry->d_name.name)) | ||
270 | continue; | ||
271 | |||
272 | found = 1; | ||
273 | err = configfs_attach_attr(sd, dentry); | ||
274 | break; | ||
275 | } | ||
276 | } | ||
277 | |||
278 | if (!found) { | ||
279 | /* | ||
280 | * If it doesn't exist and it isn't a NOT_PINNED item, | ||
281 | * it must be negative. | ||
282 | */ | ||
283 | return simple_lookup(dir, dentry, nd); | ||
284 | } | ||
285 | |||
286 | return ERR_PTR(err); | ||
287 | } | ||
288 | |||
289 | /* | ||
290 | * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are | ||
291 | * attributes and are removed by rmdir(). We recurse, taking i_sem | ||
292 | * on all children that are candidates for default detach. If the | ||
293 | * result is clean, then configfs_detach_group() will handle dropping | ||
294 | * i_sem. If there is an error, the caller will clean up the i_sem | ||
295 | * holders via configfs_detach_rollback(). | ||
296 | */ | ||
297 | static int configfs_detach_prep(struct dentry *dentry) | ||
298 | { | ||
299 | struct configfs_dirent *parent_sd = dentry->d_fsdata; | ||
300 | struct configfs_dirent *sd; | ||
301 | int ret; | ||
302 | |||
303 | ret = -EBUSY; | ||
304 | if (!list_empty(&parent_sd->s_links)) | ||
305 | goto out; | ||
306 | |||
307 | ret = 0; | ||
308 | list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { | ||
309 | if (sd->s_type & CONFIGFS_NOT_PINNED) | ||
310 | continue; | ||
311 | if (sd->s_type & CONFIGFS_USET_DEFAULT) { | ||
312 | down(&sd->s_dentry->d_inode->i_sem); | ||
313 | /* Mark that we've taken i_sem */ | ||
314 | sd->s_type |= CONFIGFS_USET_DROPPING; | ||
315 | |||
316 | ret = configfs_detach_prep(sd->s_dentry); | ||
317 | if (!ret) | ||
318 | continue; | ||
319 | } else | ||
320 | ret = -ENOTEMPTY; | ||
321 | |||
322 | break; | ||
323 | } | ||
324 | |||
325 | out: | ||
326 | return ret; | ||
327 | } | ||
328 | |||
329 | /* | ||
330 | * Walk the tree, dropping i_sem wherever CONFIGFS_USET_DROPPING is | ||
331 | * set. | ||
332 | */ | ||
333 | static void configfs_detach_rollback(struct dentry *dentry) | ||
334 | { | ||
335 | struct configfs_dirent *parent_sd = dentry->d_fsdata; | ||
336 | struct configfs_dirent *sd; | ||
337 | |||
338 | list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { | ||
339 | if (sd->s_type & CONFIGFS_USET_DEFAULT) { | ||
340 | configfs_detach_rollback(sd->s_dentry); | ||
341 | |||
342 | if (sd->s_type & CONFIGFS_USET_DROPPING) { | ||
343 | sd->s_type &= ~CONFIGFS_USET_DROPPING; | ||
344 | up(&sd->s_dentry->d_inode->i_sem); | ||
345 | } | ||
346 | } | ||
347 | } | ||
348 | } | ||
349 | |||
350 | static void detach_attrs(struct config_item * item) | ||
351 | { | ||
352 | struct dentry * dentry = dget(item->ci_dentry); | ||
353 | struct configfs_dirent * parent_sd; | ||
354 | struct configfs_dirent * sd, * tmp; | ||
355 | |||
356 | if (!dentry) | ||
357 | return; | ||
358 | |||
359 | pr_debug("configfs %s: dropping attrs for dir\n", | ||
360 | dentry->d_name.name); | ||
361 | |||
362 | parent_sd = dentry->d_fsdata; | ||
363 | list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { | ||
364 | if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED)) | ||
365 | continue; | ||
366 | list_del_init(&sd->s_sibling); | ||
367 | configfs_drop_dentry(sd, dentry); | ||
368 | configfs_put(sd); | ||
369 | } | ||
370 | |||
371 | /** | ||
372 | * Drop reference from dget() on entrance. | ||
373 | */ | ||
374 | dput(dentry); | ||
375 | } | ||
376 | |||
377 | static int populate_attrs(struct config_item *item) | ||
378 | { | ||
379 | struct config_item_type *t = item->ci_type; | ||
380 | struct configfs_attribute *attr; | ||
381 | int error = 0; | ||
382 | int i; | ||
383 | |||
384 | if (!t) | ||
385 | return -EINVAL; | ||
386 | if (t->ct_attrs) { | ||
387 | for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) { | ||
388 | if ((error = configfs_create_file(item, attr))) | ||
389 | break; | ||
390 | } | ||
391 | } | ||
392 | |||
393 | if (error) | ||
394 | detach_attrs(item); | ||
395 | |||
396 | return error; | ||
397 | } | ||
398 | |||
399 | static int configfs_attach_group(struct config_item *parent_item, | ||
400 | struct config_item *item, | ||
401 | struct dentry *dentry); | ||
402 | static void configfs_detach_group(struct config_item *item); | ||
403 | |||
404 | static void detach_groups(struct config_group *group) | ||
405 | { | ||
406 | struct dentry * dentry = dget(group->cg_item.ci_dentry); | ||
407 | struct dentry *child; | ||
408 | struct configfs_dirent *parent_sd; | ||
409 | struct configfs_dirent *sd, *tmp; | ||
410 | |||
411 | if (!dentry) | ||
412 | return; | ||
413 | |||
414 | parent_sd = dentry->d_fsdata; | ||
415 | list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { | ||
416 | if (!sd->s_element || | ||
417 | !(sd->s_type & CONFIGFS_USET_DEFAULT)) | ||
418 | continue; | ||
419 | |||
420 | child = sd->s_dentry; | ||
421 | |||
422 | configfs_detach_group(sd->s_element); | ||
423 | child->d_inode->i_flags |= S_DEAD; | ||
424 | |||
425 | /* | ||
426 | * From rmdir/unregister, a configfs_detach_prep() pass | ||
427 | * has taken our i_sem for us. Drop it. | ||
428 | * From mkdir/register cleanup, there is no sem held. | ||
429 | */ | ||
430 | if (sd->s_type & CONFIGFS_USET_DROPPING) | ||
431 | up(&child->d_inode->i_sem); | ||
432 | |||
433 | d_delete(child); | ||
434 | dput(child); | ||
435 | } | ||
436 | |||
437 | /** | ||
438 | * Drop reference from dget() on entrance. | ||
439 | */ | ||
440 | dput(dentry); | ||
441 | } | ||
442 | |||
443 | /* | ||
444 | * This fakes mkdir(2) on a default_groups[] entry. It | ||
445 | * creates a dentry, attaches it, and then does fixup | ||
446 | * on the sd->s_type. | ||
447 | * | ||
448 | * We could, perhaps, tweak our parent's ->mkdir for a minute and | ||
449 | * try using vfs_mkdir. Just a thought. | ||
450 | */ | ||
451 | static int create_default_group(struct config_group *parent_group, | ||
452 | struct config_group *group) | ||
453 | { | ||
454 | int ret; | ||
455 | struct qstr name; | ||
456 | struct configfs_dirent *sd; | ||
457 | /* We trust the caller holds a reference to parent */ | ||
458 | struct dentry *child, *parent = parent_group->cg_item.ci_dentry; | ||
459 | |||
460 | if (!group->cg_item.ci_name) | ||
461 | group->cg_item.ci_name = group->cg_item.ci_namebuf; | ||
462 | name.name = group->cg_item.ci_name; | ||
463 | name.len = strlen(name.name); | ||
464 | name.hash = full_name_hash(name.name, name.len); | ||
465 | |||
466 | ret = -ENOMEM; | ||
467 | child = d_alloc(parent, &name); | ||
468 | if (child) { | ||
469 | d_add(child, NULL); | ||
470 | |||
471 | ret = configfs_attach_group(&parent_group->cg_item, | ||
472 | &group->cg_item, child); | ||
473 | if (!ret) { | ||
474 | sd = child->d_fsdata; | ||
475 | sd->s_type |= CONFIGFS_USET_DEFAULT; | ||
476 | } else { | ||
477 | d_delete(child); | ||
478 | dput(child); | ||
479 | } | ||
480 | } | ||
481 | |||
482 | return ret; | ||
483 | } | ||
484 | |||
485 | static int populate_groups(struct config_group *group) | ||
486 | { | ||
487 | struct config_group *new_group; | ||
488 | struct dentry *dentry = group->cg_item.ci_dentry; | ||
489 | int ret = 0; | ||
490 | int i; | ||
491 | |||
492 | if (group && group->default_groups) { | ||
493 | /* FYI, we're faking mkdir here | ||
494 | * I'm not sure we need this semaphore, as we're called | ||
495 | * from our parent's mkdir. That holds our parent's | ||
496 | * i_sem, so afaik lookup cannot continue through our | ||
497 | * parent to find us, let alone mess with our tree. | ||
498 | * That said, taking our i_sem is closer to mkdir | ||
499 | * emulation, and shouldn't hurt. */ | ||
500 | down(&dentry->d_inode->i_sem); | ||
501 | |||
502 | for (i = 0; group->default_groups[i]; i++) { | ||
503 | new_group = group->default_groups[i]; | ||
504 | |||
505 | ret = create_default_group(group, new_group); | ||
506 | if (ret) | ||
507 | break; | ||
508 | } | ||
509 | |||
510 | up(&dentry->d_inode->i_sem); | ||
511 | } | ||
512 | |||
513 | if (ret) | ||
514 | detach_groups(group); | ||
515 | |||
516 | return ret; | ||
517 | } | ||
518 | |||
519 | /* | ||
520 | * All of link_obj/unlink_obj/link_group/unlink_group require that | ||
521 | * subsys->su_sem is held. | ||
522 | */ | ||
523 | |||
524 | static void unlink_obj(struct config_item *item) | ||
525 | { | ||
526 | struct config_group *group; | ||
527 | |||
528 | group = item->ci_group; | ||
529 | if (group) { | ||
530 | list_del_init(&item->ci_entry); | ||
531 | |||
532 | item->ci_group = NULL; | ||
533 | item->ci_parent = NULL; | ||
534 | config_item_put(item); | ||
535 | |||
536 | config_group_put(group); | ||
537 | } | ||
538 | } | ||
539 | |||
540 | static void link_obj(struct config_item *parent_item, struct config_item *item) | ||
541 | { | ||
542 | /* Parent seems redundant with group, but it makes certain | ||
543 | * traversals much nicer. */ | ||
544 | item->ci_parent = parent_item; | ||
545 | item->ci_group = config_group_get(to_config_group(parent_item)); | ||
546 | list_add_tail(&item->ci_entry, &item->ci_group->cg_children); | ||
547 | |||
548 | config_item_get(item); | ||
549 | } | ||
550 | |||
551 | static void unlink_group(struct config_group *group) | ||
552 | { | ||
553 | int i; | ||
554 | struct config_group *new_group; | ||
555 | |||
556 | if (group->default_groups) { | ||
557 | for (i = 0; group->default_groups[i]; i++) { | ||
558 | new_group = group->default_groups[i]; | ||
559 | unlink_group(new_group); | ||
560 | } | ||
561 | } | ||
562 | |||
563 | group->cg_subsys = NULL; | ||
564 | unlink_obj(&group->cg_item); | ||
565 | } | ||
566 | |||
567 | static void link_group(struct config_group *parent_group, struct config_group *group) | ||
568 | { | ||
569 | int i; | ||
570 | struct config_group *new_group; | ||
571 | struct configfs_subsystem *subsys = NULL; /* gcc is a turd */ | ||
572 | |||
573 | link_obj(&parent_group->cg_item, &group->cg_item); | ||
574 | |||
575 | if (parent_group->cg_subsys) | ||
576 | subsys = parent_group->cg_subsys; | ||
577 | else if (configfs_is_root(&parent_group->cg_item)) | ||
578 | subsys = to_configfs_subsystem(group); | ||
579 | else | ||
580 | BUG(); | ||
581 | group->cg_subsys = subsys; | ||
582 | |||
583 | if (group->default_groups) { | ||
584 | for (i = 0; group->default_groups[i]; i++) { | ||
585 | new_group = group->default_groups[i]; | ||
586 | link_group(group, new_group); | ||
587 | } | ||
588 | } | ||
589 | } | ||
590 | |||
591 | /* | ||
592 | * The goal is that configfs_attach_item() (and | ||
593 | * configfs_attach_group()) can be called from either the VFS or this | ||
594 | * module. That is, they assume that the items have been created, | ||
595 | * the dentry allocated, and the dcache is all ready to go. | ||
596 | * | ||
597 | * If they fail, they must clean up after themselves as if they | ||
598 | * had never been called. The caller (VFS or local function) will | ||
599 | * handle cleaning up the dcache bits. | ||
600 | * | ||
601 | * configfs_detach_group() and configfs_detach_item() behave similarly on | ||
602 | * the way out. They assume that the proper semaphores are held, they | ||
603 | * clean up the configfs items, and they expect their callers will | ||
604 | * handle the dcache bits. | ||
605 | */ | ||
606 | static int configfs_attach_item(struct config_item *parent_item, | ||
607 | struct config_item *item, | ||
608 | struct dentry *dentry) | ||
609 | { | ||
610 | int ret; | ||
611 | |||
612 | ret = configfs_create_dir(item, dentry); | ||
613 | if (!ret) { | ||
614 | ret = populate_attrs(item); | ||
615 | if (ret) { | ||
616 | configfs_remove_dir(item); | ||
617 | d_delete(dentry); | ||
618 | } | ||
619 | } | ||
620 | |||
621 | return ret; | ||
622 | } | ||
623 | |||
624 | static void configfs_detach_item(struct config_item *item) | ||
625 | { | ||
626 | detach_attrs(item); | ||
627 | configfs_remove_dir(item); | ||
628 | } | ||
629 | |||
630 | static int configfs_attach_group(struct config_item *parent_item, | ||
631 | struct config_item *item, | ||
632 | struct dentry *dentry) | ||
633 | { | ||
634 | int ret; | ||
635 | struct configfs_dirent *sd; | ||
636 | |||
637 | ret = configfs_attach_item(parent_item, item, dentry); | ||
638 | if (!ret) { | ||
639 | sd = dentry->d_fsdata; | ||
640 | sd->s_type |= CONFIGFS_USET_DIR; | ||
641 | |||
642 | ret = populate_groups(to_config_group(item)); | ||
643 | if (ret) { | ||
644 | configfs_detach_item(item); | ||
645 | d_delete(dentry); | ||
646 | } | ||
647 | } | ||
648 | |||
649 | return ret; | ||
650 | } | ||
651 | |||
652 | static void configfs_detach_group(struct config_item *item) | ||
653 | { | ||
654 | detach_groups(to_config_group(item)); | ||
655 | configfs_detach_item(item); | ||
656 | } | ||
657 | |||
658 | /* | ||
659 | * Drop the initial reference from make_item()/make_group() | ||
660 | * This function assumes that reference is held on item | ||
661 | * and that item holds a valid reference to the parent. Also, it | ||
662 | * assumes the caller has validated ci_type. | ||
663 | */ | ||
664 | static void client_drop_item(struct config_item *parent_item, | ||
665 | struct config_item *item) | ||
666 | { | ||
667 | struct config_item_type *type; | ||
668 | |||
669 | type = parent_item->ci_type; | ||
670 | BUG_ON(!type); | ||
671 | |||
672 | if (type->ct_group_ops && type->ct_group_ops->drop_item) | ||
673 | type->ct_group_ops->drop_item(to_config_group(parent_item), | ||
674 | item); | ||
675 | else | ||
676 | config_item_put(item); | ||
677 | } | ||
678 | |||
679 | |||
680 | static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) | ||
681 | { | ||
682 | int ret; | ||
683 | struct config_group *group; | ||
684 | struct config_item *item; | ||
685 | struct config_item *parent_item; | ||
686 | struct configfs_subsystem *subsys; | ||
687 | struct configfs_dirent *sd; | ||
688 | struct config_item_type *type; | ||
689 | struct module *owner; | ||
690 | char *name; | ||
691 | |||
692 | if (dentry->d_parent == configfs_sb->s_root) | ||
693 | return -EPERM; | ||
694 | |||
695 | sd = dentry->d_parent->d_fsdata; | ||
696 | if (!(sd->s_type & CONFIGFS_USET_DIR)) | ||
697 | return -EPERM; | ||
698 | |||
699 | parent_item = configfs_get_config_item(dentry->d_parent); | ||
700 | type = parent_item->ci_type; | ||
701 | subsys = to_config_group(parent_item)->cg_subsys; | ||
702 | BUG_ON(!subsys); | ||
703 | |||
704 | if (!type || !type->ct_group_ops || | ||
705 | (!type->ct_group_ops->make_group && | ||
706 | !type->ct_group_ops->make_item)) { | ||
707 | config_item_put(parent_item); | ||
708 | return -EPERM; /* What lack-of-mkdir returns */ | ||
709 | } | ||
710 | |||
711 | name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL); | ||
712 | if (!name) { | ||
713 | config_item_put(parent_item); | ||
714 | return -ENOMEM; | ||
715 | } | ||
716 | snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); | ||
717 | |||
718 | down(&subsys->su_sem); | ||
719 | group = NULL; | ||
720 | item = NULL; | ||
721 | if (type->ct_group_ops->make_group) { | ||
722 | group = type->ct_group_ops->make_group(to_config_group(parent_item), name); | ||
723 | if (group) { | ||
724 | link_group(to_config_group(parent_item), group); | ||
725 | item = &group->cg_item; | ||
726 | } | ||
727 | } else { | ||
728 | item = type->ct_group_ops->make_item(to_config_group(parent_item), name); | ||
729 | if (item) | ||
730 | link_obj(parent_item, item); | ||
731 | } | ||
732 | up(&subsys->su_sem); | ||
733 | |||
734 | kfree(name); | ||
735 | if (!item) { | ||
736 | config_item_put(parent_item); | ||
737 | return -ENOMEM; | ||
738 | } | ||
739 | |||
740 | ret = -EINVAL; | ||
741 | type = item->ci_type; | ||
742 | if (type) { | ||
743 | owner = type->ct_owner; | ||
744 | if (try_module_get(owner)) { | ||
745 | if (group) { | ||
746 | ret = configfs_attach_group(parent_item, | ||
747 | item, | ||
748 | dentry); | ||
749 | } else { | ||
750 | ret = configfs_attach_item(parent_item, | ||
751 | item, | ||
752 | dentry); | ||
753 | } | ||
754 | |||
755 | if (ret) { | ||
756 | down(&subsys->su_sem); | ||
757 | if (group) | ||
758 | unlink_group(group); | ||
759 | else | ||
760 | unlink_obj(item); | ||
761 | client_drop_item(parent_item, item); | ||
762 | up(&subsys->su_sem); | ||
763 | |||
764 | config_item_put(parent_item); | ||
765 | module_put(owner); | ||
766 | } | ||
767 | } | ||
768 | } | ||
769 | |||
770 | return ret; | ||
771 | } | ||
772 | |||
773 | static int configfs_rmdir(struct inode *dir, struct dentry *dentry) | ||
774 | { | ||
775 | struct config_item *parent_item; | ||
776 | struct config_item *item; | ||
777 | struct configfs_subsystem *subsys; | ||
778 | struct configfs_dirent *sd; | ||
779 | struct module *owner = NULL; | ||
780 | int ret; | ||
781 | |||
782 | if (dentry->d_parent == configfs_sb->s_root) | ||
783 | return -EPERM; | ||
784 | |||
785 | sd = dentry->d_fsdata; | ||
786 | if (sd->s_type & CONFIGFS_USET_DEFAULT) | ||
787 | return -EPERM; | ||
788 | |||
789 | parent_item = configfs_get_config_item(dentry->d_parent); | ||
790 | subsys = to_config_group(parent_item)->cg_subsys; | ||
791 | BUG_ON(!subsys); | ||
792 | |||
793 | if (!parent_item->ci_type) { | ||
794 | config_item_put(parent_item); | ||
795 | return -EINVAL; | ||
796 | } | ||
797 | |||
798 | ret = configfs_detach_prep(dentry); | ||
799 | if (ret) { | ||
800 | configfs_detach_rollback(dentry); | ||
801 | config_item_put(parent_item); | ||
802 | return ret; | ||
803 | } | ||
804 | |||
805 | item = configfs_get_config_item(dentry); | ||
806 | |||
807 | /* Drop reference from above, item already holds one. */ | ||
808 | config_item_put(parent_item); | ||
809 | |||
810 | if (item->ci_type) | ||
811 | owner = item->ci_type->ct_owner; | ||
812 | |||
813 | if (sd->s_type & CONFIGFS_USET_DIR) { | ||
814 | configfs_detach_group(item); | ||
815 | |||
816 | down(&subsys->su_sem); | ||
817 | unlink_group(to_config_group(item)); | ||
818 | } else { | ||
819 | configfs_detach_item(item); | ||
820 | |||
821 | down(&subsys->su_sem); | ||
822 | unlink_obj(item); | ||
823 | } | ||
824 | |||
825 | client_drop_item(parent_item, item); | ||
826 | up(&subsys->su_sem); | ||
827 | |||
828 | /* Drop our reference from above */ | ||
829 | config_item_put(item); | ||
830 | |||
831 | module_put(owner); | ||
832 | |||
833 | return 0; | ||
834 | } | ||
835 | |||
836 | struct inode_operations configfs_dir_inode_operations = { | ||
837 | .mkdir = configfs_mkdir, | ||
838 | .rmdir = configfs_rmdir, | ||
839 | .symlink = configfs_symlink, | ||
840 | .unlink = configfs_unlink, | ||
841 | .lookup = configfs_lookup, | ||
842 | }; | ||
843 | |||
844 | #if 0 | ||
845 | int configfs_rename_dir(struct config_item * item, const char *new_name) | ||
846 | { | ||
847 | int error = 0; | ||
848 | struct dentry * new_dentry, * parent; | ||
849 | |||
850 | if (!strcmp(config_item_name(item), new_name)) | ||
851 | return -EINVAL; | ||
852 | |||
853 | if (!item->parent) | ||
854 | return -EINVAL; | ||
855 | |||
856 | down_write(&configfs_rename_sem); | ||
857 | parent = item->parent->dentry; | ||
858 | |||
859 | down(&parent->d_inode->i_sem); | ||
860 | |||
861 | new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); | ||
862 | if (!IS_ERR(new_dentry)) { | ||
863 | if (!new_dentry->d_inode) { | ||
864 | error = config_item_set_name(item, "%s", new_name); | ||
865 | if (!error) { | ||
866 | d_add(new_dentry, NULL); | ||
867 | d_move(item->dentry, new_dentry); | ||
868 | } | ||
869 | else | ||
870 | d_delete(new_dentry); | ||
871 | } else | ||
872 | error = -EEXIST; | ||
873 | dput(new_dentry); | ||
874 | } | ||
875 | up(&parent->d_inode->i_sem); | ||
876 | up_write(&configfs_rename_sem); | ||
877 | |||
878 | return error; | ||
879 | } | ||
880 | #endif | ||
881 | |||
882 | static int configfs_dir_open(struct inode *inode, struct file *file) | ||
883 | { | ||
884 | struct dentry * dentry = file->f_dentry; | ||
885 | struct configfs_dirent * parent_sd = dentry->d_fsdata; | ||
886 | |||
887 | down(&dentry->d_inode->i_sem); | ||
888 | file->private_data = configfs_new_dirent(parent_sd, NULL); | ||
889 | up(&dentry->d_inode->i_sem); | ||
890 | |||
891 | return file->private_data ? 0 : -ENOMEM; | ||
892 | |||
893 | } | ||
894 | |||
895 | static int configfs_dir_close(struct inode *inode, struct file *file) | ||
896 | { | ||
897 | struct dentry * dentry = file->f_dentry; | ||
898 | struct configfs_dirent * cursor = file->private_data; | ||
899 | |||
900 | down(&dentry->d_inode->i_sem); | ||
901 | list_del_init(&cursor->s_sibling); | ||
902 | up(&dentry->d_inode->i_sem); | ||
903 | |||
904 | release_configfs_dirent(cursor); | ||
905 | |||
906 | return 0; | ||
907 | } | ||
908 | |||
909 | /* Relationship between s_mode and the DT_xxx types */ | ||
910 | static inline unsigned char dt_type(struct configfs_dirent *sd) | ||
911 | { | ||
912 | return (sd->s_mode >> 12) & 15; | ||
913 | } | ||
914 | |||
915 | static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir) | ||
916 | { | ||
917 | struct dentry *dentry = filp->f_dentry; | ||
918 | struct configfs_dirent * parent_sd = dentry->d_fsdata; | ||
919 | struct configfs_dirent *cursor = filp->private_data; | ||
920 | struct list_head *p, *q = &cursor->s_sibling; | ||
921 | ino_t ino; | ||
922 | int i = filp->f_pos; | ||
923 | |||
924 | switch (i) { | ||
925 | case 0: | ||
926 | ino = dentry->d_inode->i_ino; | ||
927 | if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) | ||
928 | break; | ||
929 | filp->f_pos++; | ||
930 | i++; | ||
931 | /* fallthrough */ | ||
932 | case 1: | ||
933 | ino = parent_ino(dentry); | ||
934 | if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) | ||
935 | break; | ||
936 | filp->f_pos++; | ||
937 | i++; | ||
938 | /* fallthrough */ | ||
939 | default: | ||
940 | if (filp->f_pos == 2) { | ||
941 | list_del(q); | ||
942 | list_add(q, &parent_sd->s_children); | ||
943 | } | ||
944 | for (p=q->next; p!= &parent_sd->s_children; p=p->next) { | ||
945 | struct configfs_dirent *next; | ||
946 | const char * name; | ||
947 | int len; | ||
948 | |||
949 | next = list_entry(p, struct configfs_dirent, | ||
950 | s_sibling); | ||
951 | if (!next->s_element) | ||
952 | continue; | ||
953 | |||
954 | name = configfs_get_name(next); | ||
955 | len = strlen(name); | ||
956 | if (next->s_dentry) | ||
957 | ino = next->s_dentry->d_inode->i_ino; | ||
958 | else | ||
959 | ino = iunique(configfs_sb, 2); | ||
960 | |||
961 | if (filldir(dirent, name, len, filp->f_pos, ino, | ||
962 | dt_type(next)) < 0) | ||
963 | return 0; | ||
964 | |||
965 | list_del(q); | ||
966 | list_add(q, p); | ||
967 | p = q; | ||
968 | filp->f_pos++; | ||
969 | } | ||
970 | } | ||
971 | return 0; | ||
972 | } | ||
973 | |||
974 | static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) | ||
975 | { | ||
976 | struct dentry * dentry = file->f_dentry; | ||
977 | |||
978 | down(&dentry->d_inode->i_sem); | ||
979 | switch (origin) { | ||
980 | case 1: | ||
981 | offset += file->f_pos; | ||
982 | case 0: | ||
983 | if (offset >= 0) | ||
984 | break; | ||
985 | default: | ||
986 | up(&file->f_dentry->d_inode->i_sem); | ||
987 | return -EINVAL; | ||
988 | } | ||
989 | if (offset != file->f_pos) { | ||
990 | file->f_pos = offset; | ||
991 | if (file->f_pos >= 2) { | ||
992 | struct configfs_dirent *sd = dentry->d_fsdata; | ||
993 | struct configfs_dirent *cursor = file->private_data; | ||
994 | struct list_head *p; | ||
995 | loff_t n = file->f_pos - 2; | ||
996 | |||
997 | list_del(&cursor->s_sibling); | ||
998 | p = sd->s_children.next; | ||
999 | while (n && p != &sd->s_children) { | ||
1000 | struct configfs_dirent *next; | ||
1001 | next = list_entry(p, struct configfs_dirent, | ||
1002 | s_sibling); | ||
1003 | if (next->s_element) | ||
1004 | n--; | ||
1005 | p = p->next; | ||
1006 | } | ||
1007 | list_add_tail(&cursor->s_sibling, p); | ||
1008 | } | ||
1009 | } | ||
1010 | up(&dentry->d_inode->i_sem); | ||
1011 | return offset; | ||
1012 | } | ||
1013 | |||
1014 | struct file_operations configfs_dir_operations = { | ||
1015 | .open = configfs_dir_open, | ||
1016 | .release = configfs_dir_close, | ||
1017 | .llseek = configfs_dir_lseek, | ||
1018 | .read = generic_read_dir, | ||
1019 | .readdir = configfs_readdir, | ||
1020 | }; | ||
1021 | |||
1022 | int configfs_register_subsystem(struct configfs_subsystem *subsys) | ||
1023 | { | ||
1024 | int err; | ||
1025 | struct config_group *group = &subsys->su_group; | ||
1026 | struct qstr name; | ||
1027 | struct dentry *dentry; | ||
1028 | struct configfs_dirent *sd; | ||
1029 | |||
1030 | err = configfs_pin_fs(); | ||
1031 | if (err) | ||
1032 | return err; | ||
1033 | |||
1034 | if (!group->cg_item.ci_name) | ||
1035 | group->cg_item.ci_name = group->cg_item.ci_namebuf; | ||
1036 | |||
1037 | sd = configfs_sb->s_root->d_fsdata; | ||
1038 | link_group(to_config_group(sd->s_element), group); | ||
1039 | |||
1040 | down(&configfs_sb->s_root->d_inode->i_sem); | ||
1041 | |||
1042 | name.name = group->cg_item.ci_name; | ||
1043 | name.len = strlen(name.name); | ||
1044 | name.hash = full_name_hash(name.name, name.len); | ||
1045 | |||
1046 | err = -ENOMEM; | ||
1047 | dentry = d_alloc(configfs_sb->s_root, &name); | ||
1048 | if (!dentry) | ||
1049 | goto out_release; | ||
1050 | |||
1051 | d_add(dentry, NULL); | ||
1052 | |||
1053 | err = configfs_attach_group(sd->s_element, &group->cg_item, | ||
1054 | dentry); | ||
1055 | if (!err) | ||
1056 | dentry = NULL; | ||
1057 | else | ||
1058 | d_delete(dentry); | ||
1059 | |||
1060 | up(&configfs_sb->s_root->d_inode->i_sem); | ||
1061 | |||
1062 | if (dentry) { | ||
1063 | dput(dentry); | ||
1064 | out_release: | ||
1065 | unlink_group(group); | ||
1066 | configfs_release_fs(); | ||
1067 | } | ||
1068 | |||
1069 | return err; | ||
1070 | } | ||
1071 | |||
1072 | void configfs_unregister_subsystem(struct configfs_subsystem *subsys) | ||
1073 | { | ||
1074 | struct config_group *group = &subsys->su_group; | ||
1075 | struct dentry *dentry = group->cg_item.ci_dentry; | ||
1076 | |||
1077 | if (dentry->d_parent != configfs_sb->s_root) { | ||
1078 | printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n"); | ||
1079 | return; | ||
1080 | } | ||
1081 | |||
1082 | down(&configfs_sb->s_root->d_inode->i_sem); | ||
1083 | down(&dentry->d_inode->i_sem); | ||
1084 | if (configfs_detach_prep(dentry)) { | ||
1085 | printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); | ||
1086 | } | ||
1087 | configfs_detach_group(&group->cg_item); | ||
1088 | dentry->d_inode->i_flags |= S_DEAD; | ||
1089 | up(&dentry->d_inode->i_sem); | ||
1090 | |||
1091 | d_delete(dentry); | ||
1092 | |||
1093 | up(&configfs_sb->s_root->d_inode->i_sem); | ||
1094 | |||
1095 | dput(dentry); | ||
1096 | |||
1097 | unlink_group(group); | ||
1098 | configfs_release_fs(); | ||
1099 | } | ||
1100 | |||
1101 | EXPORT_SYMBOL(configfs_register_subsystem); | ||
1102 | EXPORT_SYMBOL(configfs_unregister_subsystem); | ||
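
dir.c ends by exporting configfs_register_subsystem() and configfs_unregister_subsystem(), the only entry points a client needs. A hedged sketch of a minimal client module (the example names are invented; config_group_init() is assumed to come from fs/configfs/item.c, which is not shown in this section):

#include <linux/module.h>
#include <linux/init.h>
#include <linux/configfs.h>
#include <asm/semaphore.h>

static struct config_item_type example_type = {
	.ct_owner = THIS_MODULE,
};

static struct configfs_subsystem example_subsys = {
	.su_group = {
		.cg_item = {
			.ci_namebuf = "example",
			.ci_type = &example_type,
		},
	},
};

static int __init example_init(void)
{
	config_group_init(&example_subsys.su_group);
	init_MUTEX(&example_subsys.su_sem);
	/* shows up as a directory under the configfs mount point */
	return configfs_register_subsystem(&example_subsys);
}

static void __exit example_exit(void)
{
	configfs_unregister_subsystem(&example_subsys);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

Note that configfs_register_subsystem() pins the filesystem via configfs_pin_fs() and fakes a mkdir of the group under the configfs root, which is why unregistering walks the same detach path as rmdir(2).
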
diff --git a/fs/configfs/file.c b/fs/configfs/file.c new file mode 100644 index 000000000000..af1ffc9a15c0 --- /dev/null +++ b/fs/configfs/file.c | |||
@@ -0,0 +1,360 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * file.c - operations for regular (text) files. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 02111-1307, USA. | ||
20 | * | ||
21 | * Based on sysfs: | ||
22 | * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel | ||
23 | * | ||
24 | * configfs Copyright (C) 2005 Oracle. All rights reserved. | ||
25 | */ | ||
26 | |||
27 | #include <linux/fs.h> | ||
28 | #include <linux/module.h> | ||
29 | #include <linux/dnotify.h> | ||
30 | #include <linux/slab.h> | ||
31 | #include <asm/uaccess.h> | ||
32 | #include <asm/semaphore.h> | ||
33 | |||
34 | #include <linux/configfs.h> | ||
35 | #include "configfs_internal.h" | ||
36 | |||
37 | |||
38 | struct configfs_buffer { | ||
39 | size_t count; | ||
40 | loff_t pos; | ||
41 | char * page; | ||
42 | struct configfs_item_operations * ops; | ||
43 | struct semaphore sem; | ||
44 | int needs_read_fill; | ||
45 | }; | ||
46 | |||
47 | |||
48 | /** | ||
49 | * fill_read_buffer - allocate and fill buffer from item. | ||
50 | * @dentry: dentry pointer. | ||
51 | * @buffer: data buffer for file. | ||
52 | * | ||
53 | * Allocate @buffer->page, if it hasn't been already, then call the | ||
54 | * config_item's show() method to fill the buffer with this attribute's | ||
55 | * data. | ||
56 | * This is called only once, on the file's first read. | ||
57 | */ | ||
58 | static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer) | ||
59 | { | ||
60 | struct configfs_attribute * attr = to_attr(dentry); | ||
61 | struct config_item * item = to_item(dentry->d_parent); | ||
62 | struct configfs_item_operations * ops = buffer->ops; | ||
63 | int ret = 0; | ||
64 | ssize_t count; | ||
65 | |||
66 | if (!buffer->page) | ||
67 | buffer->page = (char *) get_zeroed_page(GFP_KERNEL); | ||
68 | if (!buffer->page) | ||
69 | return -ENOMEM; | ||
70 | |||
71 | count = ops->show_attribute(item,attr,buffer->page); | ||
72 | buffer->needs_read_fill = 0; | ||
73 | BUG_ON(count > (ssize_t)PAGE_SIZE); | ||
74 | if (count >= 0) | ||
75 | buffer->count = count; | ||
76 | else | ||
77 | ret = count; | ||
78 | return ret; | ||
79 | } | ||
80 | |||
81 | |||
82 | /** | ||
83 | * flush_read_buffer - push buffer to userspace. | ||
84 | * @buffer: data buffer for file. | ||
85 | * @buf: user-passed buffer. | ||
86 | * @count: number of bytes requested. | ||
87 | * @ppos: file position. | ||
88 | * | ||
89 | * Copy the buffer we filled in fill_read_buffer() to userspace. | ||
90 | * This is done at the reader's leisure, copying and advancing | ||
91 | * the amount they specify each time. | ||
92 | * This may be called continuously until the buffer is empty. | ||
93 | */ | ||
94 | static int flush_read_buffer(struct configfs_buffer * buffer, char __user * buf, | ||
95 | size_t count, loff_t * ppos) | ||
96 | { | ||
97 | int error; | ||
98 | |||
99 | if (*ppos > buffer->count) | ||
100 | return 0; | ||
101 | |||
102 | if (count > (buffer->count - *ppos)) | ||
103 | count = buffer->count - *ppos; | ||
104 | |||
105 | error = copy_to_user(buf,buffer->page + *ppos,count); | ||
106 | if (!error) | ||
107 | *ppos += count; | ||
108 | return error ? -EFAULT : count; | ||
109 | } | ||
110 | |||
111 | /** | ||
112 | * configfs_read_file - read an attribute. | ||
113 | * @file: file pointer. | ||
114 | * @buf: buffer to fill. | ||
115 | * @count: number of bytes to read. | ||
116 | * @ppos: starting offset in file. | ||
117 | * | ||
118 | * Userspace wants to read an attribute file. The attribute descriptor | ||
119 | * is in the file's ->d_fsdata. The target item is in the directory's | ||
120 | * ->d_fsdata. | ||
121 | * | ||
122 | * We call fill_read_buffer() to allocate and fill the buffer from the | ||
123 | * item's show() method exactly once (if the read is happening from | ||
124 | * the beginning of the file). That should fill the entire buffer with | ||
125 | * all the data the item has to offer for that attribute. | ||
126 | * We then call flush_read_buffer() to copy the buffer to userspace | ||
127 | * in the increments specified. | ||
128 | */ | ||
129 | |||
130 | static ssize_t | ||
131 | configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) | ||
132 | { | ||
133 | struct configfs_buffer * buffer = file->private_data; | ||
134 | ssize_t retval = 0; | ||
135 | |||
136 | down(&buffer->sem); | ||
137 | if (buffer->needs_read_fill) { | ||
138 | if ((retval = fill_read_buffer(file->f_dentry,buffer))) | ||
139 | goto out; | ||
140 | } | ||
141 | pr_debug("%s: count = %d, ppos = %lld, buf = %s\n", | ||
142 | __FUNCTION__,count,*ppos,buffer->page); | ||
143 | retval = flush_read_buffer(buffer,buf,count,ppos); | ||
144 | out: | ||
145 | up(&buffer->sem); | ||
146 | return retval; | ||
147 | } | ||
148 | |||
149 | |||
150 | /** | ||
151 | * fill_write_buffer - copy buffer from userspace. | ||
152 | * @buffer: data buffer for file. | ||
153 | * @buf: data from user. | ||
154 | * @count: number of bytes in @userbuf. | ||
155 | * | ||
156 | * Allocate @buffer->page if it hasn't been already, then | ||
157 | * copy the user-supplied buffer into it. | ||
158 | */ | ||
159 | |||
160 | static int | ||
161 | fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count) | ||
162 | { | ||
163 | int error; | ||
164 | |||
165 | if (!buffer->page) | ||
166 | buffer->page = (char *)get_zeroed_page(GFP_KERNEL); | ||
167 | if (!buffer->page) | ||
168 | return -ENOMEM; | ||
169 | |||
170 | if (count > PAGE_SIZE) | ||
171 | count = PAGE_SIZE; | ||
172 | error = copy_from_user(buffer->page,buf,count); | ||
173 | buffer->needs_read_fill = 1; | ||
174 | return error ? -EFAULT : count; | ||
175 | } | ||
176 | |||
177 | |||
178 | /** | ||
179 | * flush_write_buffer - push buffer to config_item. | ||
180 | * @dentry: dentry of the attribute file. | ||
181 | * @buffer: data buffer for file. | ||
182 | * | ||
183 | * Get the correct pointers for the config_item and the attribute we're | ||
184 | * dealing with, then call the store() method for the attribute, | ||
185 | * passing the buffer that we acquired in fill_write_buffer(). | ||
186 | */ | ||
187 | |||
188 | static int | ||
189 | flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count) | ||
190 | { | ||
191 | struct configfs_attribute * attr = to_attr(dentry); | ||
192 | struct config_item * item = to_item(dentry->d_parent); | ||
193 | struct configfs_item_operations * ops = buffer->ops; | ||
194 | |||
195 | return ops->store_attribute(item,attr,buffer->page,count); | ||
196 | } | ||
197 | |||
198 | |||
199 | /** | ||
200 | * configfs_write_file - write an attribute. | ||
201 | * @file: file pointer | ||
202 | * @buf: data to write | ||
203 | * @count: number of bytes | ||
204 | * @ppos: starting offset | ||
205 | * | ||
206 | * Similar to configfs_read_file(), though working in the opposite direction. | ||
207 | * We allocate and fill the data from the user in fill_write_buffer(), | ||
208 | * then push it to the config_item in flush_write_buffer(). | ||
209 | * There is no easy way for us to know if userspace is only doing a partial | ||
210 | * write, so we don't support them. We expect the entire buffer to come | ||
211 | * on the first write. | ||
212 | * Hint: if you're writing a value, first read the file, modify only the | ||
213 | * value you're changing, then write the entire buffer back. | ||
214 | */ | ||
215 | |||
216 | static ssize_t | ||
217 | configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) | ||
218 | { | ||
219 | struct configfs_buffer * buffer = file->private_data; | ||
220 | |||
221 | down(&buffer->sem); | ||
222 | count = fill_write_buffer(buffer,buf,count); | ||
223 | if (count > 0) | ||
224 | count = flush_write_buffer(file->f_dentry,buffer,count); | ||
225 | if (count > 0) | ||
226 | *ppos += count; | ||
227 | up(&buffer->sem); | ||
228 | return count; | ||
229 | } | ||
230 | |||
231 | static int check_perm(struct inode * inode, struct file * file) | ||
232 | { | ||
233 | struct config_item *item = configfs_get_config_item(file->f_dentry->d_parent); | ||
234 | struct configfs_attribute * attr = to_attr(file->f_dentry); | ||
235 | struct configfs_buffer * buffer; | ||
236 | struct configfs_item_operations * ops = NULL; | ||
237 | int error = 0; | ||
238 | |||
239 | if (!item || !attr) | ||
240 | goto Einval; | ||
241 | |||
242 | /* Grab the module reference for this attribute if we have one */ | ||
243 | if (!try_module_get(attr->ca_owner)) { | ||
244 | error = -ENODEV; | ||
245 | goto Done; | ||
246 | } | ||
247 | |||
248 | if (item->ci_type) | ||
249 | ops = item->ci_type->ct_item_ops; | ||
250 | else | ||
251 | goto Eaccess; | ||
252 | |||
253 | /* File needs write support. | ||
254 | * The inode's perms must say it's ok, | ||
255 | * and we must have a store method. | ||
256 | */ | ||
257 | if (file->f_mode & FMODE_WRITE) { | ||
258 | |||
259 | if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute) | ||
260 | goto Eaccess; | ||
261 | |||
262 | } | ||
263 | |||
264 | /* File needs read support. | ||
265 | * The inode's perms must say it's ok, and there | ||
266 | * must be a show method for it. | ||
267 | */ | ||
268 | if (file->f_mode & FMODE_READ) { | ||
269 | if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute) | ||
270 | goto Eaccess; | ||
271 | } | ||
272 | |||
273 | /* No error? Great, allocate a buffer for the file, and store it | ||
274 | * in file->private_data for easy access. | ||
275 | */ | ||
276 | buffer = kmalloc(sizeof(struct configfs_buffer),GFP_KERNEL); | ||
277 | if (buffer) { | ||
278 | memset(buffer,0,sizeof(struct configfs_buffer)); | ||
279 | init_MUTEX(&buffer->sem); | ||
280 | buffer->needs_read_fill = 1; | ||
281 | buffer->ops = ops; | ||
282 | file->private_data = buffer; | ||
283 | } else | ||
284 | error = -ENOMEM; | ||
285 | goto Done; | ||
286 | |||
287 | Einval: | ||
288 | error = -EINVAL; | ||
289 | goto Done; | ||
290 | Eaccess: | ||
291 | error = -EACCES; | ||
292 | module_put(attr->ca_owner); | ||
293 | Done: | ||
294 | if (error && item) | ||
295 | config_item_put(item); | ||
296 | return error; | ||
297 | } | ||
298 | |||
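check_perm() only lets the open succeed if the item's type provides the method matching the requested access mode. A minimal sketch of an item_operations supplying both methods (the widget names are hypothetical), with signatures matching the calls made from flush_write_buffer() and the read path:

	static ssize_t widget_show(struct config_item *item,
				   struct configfs_attribute *attr,
				   char *page)
	{
		/* Fill at most one page; this is buffer->page above. */
		return sprintf(page, "%d\n", 42);
	}

	static ssize_t widget_store(struct config_item *item,
				    struct configfs_attribute *attr,
				    const char *page, size_t count)
	{
		/* Parse the buffer filled in fill_write_buffer(). */
		return count;
	}

	static struct configfs_item_operations widget_item_ops = {
		.show_attribute		= widget_show,
		.store_attribute	= widget_store,
	};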
299 | static int configfs_open_file(struct inode * inode, struct file * filp) | ||
300 | { | ||
301 | return check_perm(inode,filp); | ||
302 | } | ||
303 | |||
304 | static int configfs_release(struct inode * inode, struct file * filp) | ||
305 | { | ||
306 | struct config_item * item = to_item(filp->f_dentry->d_parent); | ||
307 | struct configfs_attribute * attr = to_attr(filp->f_dentry); | ||
308 | struct module * owner = attr->ca_owner; | ||
309 | struct configfs_buffer * buffer = filp->private_data; | ||
310 | |||
311 | if (item) | ||
312 | config_item_put(item); | ||
313 | /* After this point, attr should not be accessed. */ | ||
314 | module_put(owner); | ||
315 | |||
316 | if (buffer) { | ||
317 | if (buffer->page) | ||
318 | free_page((unsigned long)buffer->page); | ||
319 | kfree(buffer); | ||
320 | } | ||
321 | return 0; | ||
322 | } | ||
323 | |||
324 | struct file_operations configfs_file_operations = { | ||
325 | .read = configfs_read_file, | ||
326 | .write = configfs_write_file, | ||
327 | .llseek = generic_file_llseek, | ||
328 | .open = configfs_open_file, | ||
329 | .release = configfs_release, | ||
330 | }; | ||
331 | |||
332 | |||
333 | int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type) | ||
334 | { | ||
335 | struct configfs_dirent * parent_sd = dir->d_fsdata; | ||
336 | umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; | ||
337 | int error = 0; | ||
338 | |||
339 | down(&dir->d_inode->i_sem); | ||
340 | error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type); | ||
341 | up(&dir->d_inode->i_sem); | ||
342 | |||
343 | return error; | ||
344 | } | ||
345 | |||
346 | |||
347 | /** | ||
348 | * configfs_create_file - create an attribute file for an item. | ||
349 | * @item: item we're creating for. | ||
350 | * @attr: attribute descriptor. | ||
351 | */ | ||
352 | |||
353 | int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr) | ||
354 | { | ||
355 | BUG_ON(!item || !item->ci_dentry || !attr); | ||
356 | |||
357 | return configfs_add_file(item->ci_dentry, attr, | ||
358 | CONFIGFS_ITEM_ATTR); | ||
359 | } | ||
360 | |||
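The attribute descriptor passed in here carries the owner module and file mode that check_perm() later tests. A hedged sketch (a subsystem would normally list such attributes in its config_item_type rather than calling configfs_create_file() by hand):

	static struct configfs_attribute widget_attr_size = {
		.ca_owner	= THIS_MODULE,
		.ca_name	= "size",
		.ca_mode	= S_IRUGO | S_IWUSR,
	};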
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
new file mode 100644
index 000000000000..6b274c6d428f
--- /dev/null
+++ b/fs/configfs/inode.c
@@ -0,0 +1,162 @@
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * inode.c - basic inode and dentry operations. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 021110-1307, USA. | ||
20 | * | ||
21 | * Based on sysfs: | ||
22 | * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel | ||
23 | * | ||
24 | * configfs Copyright (C) 2005 Oracle. All rights reserved. | ||
25 | * | ||
26 | * Please see Documentation/filesystems/configfs/configfs.txt for more information. | ||
27 | */ | ||
28 | |||
29 | #undef DEBUG | ||
30 | |||
31 | #include <linux/pagemap.h> | ||
32 | #include <linux/namei.h> | ||
33 | #include <linux/backing-dev.h> | ||
34 | |||
35 | #include <linux/configfs.h> | ||
36 | #include "configfs_internal.h" | ||
37 | |||
38 | extern struct super_block * configfs_sb; | ||
39 | |||
40 | static struct address_space_operations configfs_aops = { | ||
41 | .readpage = simple_readpage, | ||
42 | .prepare_write = simple_prepare_write, | ||
43 | .commit_write = simple_commit_write | ||
44 | }; | ||
45 | |||
46 | static struct backing_dev_info configfs_backing_dev_info = { | ||
47 | .ra_pages = 0, /* No readahead */ | ||
48 | .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, | ||
49 | }; | ||
50 | |||
51 | struct inode * configfs_new_inode(mode_t mode) | ||
52 | { | ||
53 | struct inode * inode = new_inode(configfs_sb); | ||
54 | if (inode) { | ||
55 | inode->i_mode = mode; | ||
56 | inode->i_uid = 0; | ||
57 | inode->i_gid = 0; | ||
58 | inode->i_blksize = PAGE_CACHE_SIZE; | ||
59 | inode->i_blocks = 0; | ||
60 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
61 | inode->i_mapping->a_ops = &configfs_aops; | ||
62 | inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; | ||
63 | } | ||
64 | return inode; | ||
65 | } | ||
66 | |||
67 | int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) | ||
68 | { | ||
69 | int error = 0; | ||
70 | struct inode * inode = NULL; | ||
71 | if (dentry) { | ||
72 | if (!dentry->d_inode) { | ||
73 | if ((inode = configfs_new_inode(mode))) { | ||
74 | if (dentry->d_parent && dentry->d_parent->d_inode) { | ||
75 | struct inode *p_inode = dentry->d_parent->d_inode; | ||
76 | p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; | ||
77 | } | ||
78 | goto Proceed; | ||
79 | } | ||
80 | else | ||
81 | error = -ENOMEM; | ||
82 | } else | ||
83 | error = -EEXIST; | ||
84 | } else | ||
85 | error = -ENOENT; | ||
86 | goto Done; | ||
87 | |||
88 | Proceed: | ||
89 | if (init) | ||
90 | error = init(inode); | ||
91 | if (!error) { | ||
92 | d_instantiate(dentry, inode); | ||
93 | if (S_ISDIR(mode) || S_ISLNK(mode)) | ||
94 | dget(dentry); /* pin link and directory dentries in core */ | ||
95 | } else | ||
96 | iput(inode); | ||
97 | Done: | ||
98 | return error; | ||
99 | } | ||
100 | |||
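The init callback gives the caller a hook to finish inode setup before d_instantiate(). For instance, the symlink code later in this patch could use something along these lines (a sketch under that assumption, not the exact helper it defines):

	static int init_symlink_inode(struct inode *inode)
	{
		/* Point the new inode at configfs' symlink operations. */
		inode->i_op = &configfs_symlink_inode_operations;
		return 0;
	}

	/* ... configfs_create(dentry, S_IFLNK | S_IRWXUGO, init_symlink_inode); */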
101 | /* | ||
102 | * Get the name of the element represented by the given configfs_dirent. | ||
103 | */ | ||
104 | const unsigned char * configfs_get_name(struct configfs_dirent *sd) | ||
105 | { | ||
106 | struct attribute * attr; | ||
107 | |||
108 | if (!sd || !sd->s_element) | ||
109 | BUG(); | ||
110 | |||
111 | /* These always have a dentry, so use that */ | ||
112 | if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK)) | ||
113 | return sd->s_dentry->d_name.name; | ||
114 | |||
115 | if (sd->s_type & CONFIGFS_ITEM_ATTR) { | ||
116 | attr = sd->s_element; | ||
117 | return attr->name; | ||
118 | } | ||
119 | return NULL; | ||
120 | } | ||
121 | |||
122 | |||
123 | /* | ||
124 | * Unhashes the dentry corresponding to the given configfs_dirent. | ||
125 | * Called with parent inode's i_sem held. | ||
126 | */ | ||
127 | void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) | ||
128 | { | ||
129 | struct dentry * dentry = sd->s_dentry; | ||
130 | |||
131 | if (dentry) { | ||
132 | spin_lock(&dcache_lock); | ||
133 | if (!(d_unhashed(dentry) && dentry->d_inode)) { | ||
134 | dget_locked(dentry); | ||
135 | __d_drop(dentry); | ||
136 | spin_unlock(&dcache_lock); | ||
137 | simple_unlink(parent->d_inode, dentry); | ||
138 | } else | ||
139 | spin_unlock(&dcache_lock); | ||
140 | } | ||
141 | } | ||
142 | |||
143 | void configfs_hash_and_remove(struct dentry * dir, const char * name) | ||
144 | { | ||
145 | struct configfs_dirent * sd; | ||
146 | struct configfs_dirent * parent_sd = dir->d_fsdata; | ||
147 | |||
148 | down(&dir->d_inode->i_sem); | ||
149 | list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { | ||
150 | if (!sd->s_element) | ||
151 | continue; | ||
152 | if (!strcmp(configfs_get_name(sd), name)) { | ||
153 | list_del_init(&sd->s_sibling); | ||
154 | configfs_drop_dentry(sd, dir); | ||
155 | configfs_put(sd); | ||
156 | break; | ||
157 | } | ||
158 | } | ||
159 | up(&dir->d_inode->i_sem); | ||
160 | } | ||
161 | |||
162 | |||
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
new file mode 100644
index 000000000000..e07485ac50ad
--- /dev/null
+++ b/fs/configfs/item.c
@@ -0,0 +1,227 @@
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * item.c - library routines for handling generic config items | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 021110-1307, USA. | ||
20 | * | ||
21 | * Based on kobject: | ||
22 | * kobject is Copyright (c) 2002-2003 Patrick Mochel | ||
23 | * | ||
24 | * configfs Copyright (C) 2005 Oracle. All rights reserved. | ||
25 | * | ||
26 | * Please see the file Documentation/filesystems/configfs/configfs.txt for | ||
27 | * critical information about using the config_item interface. | ||
28 | */ | ||
29 | |||
30 | #include <linux/string.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <linux/stat.h> | ||
33 | #include <linux/slab.h> | ||
34 | |||
35 | #include <linux/configfs.h> | ||
36 | |||
37 | |||
38 | static inline struct config_item * to_item(struct list_head * entry) | ||
39 | { | ||
40 | return container_of(entry,struct config_item,ci_entry); | ||
41 | } | ||
42 | |||
43 | /* Evil kernel */ | ||
44 | static void config_item_release(struct kref *kref); | ||
45 | |||
46 | /** | ||
47 | * config_item_init - initialize item. | ||
48 | * @item: item in question. | ||
49 | */ | ||
50 | void config_item_init(struct config_item * item) | ||
51 | { | ||
52 | kref_init(&item->ci_kref); | ||
53 | INIT_LIST_HEAD(&item->ci_entry); | ||
54 | } | ||
55 | |||
56 | /** | ||
57 | * config_item_set_name - Set the name of an item | ||
58 | * @item: item. | ||
59 | * @name: name. | ||
60 | * | ||
61 | * If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a | ||
62 | * dynamically allocated string that @item->ci_name points to. | ||
63 | * Otherwise, use the static @item->ci_namebuf array. | ||
64 | */ | ||
65 | |||
66 | int config_item_set_name(struct config_item * item, const char * fmt, ...) | ||
67 | { | ||
68 | int error = 0; | ||
69 | int limit = CONFIGFS_ITEM_NAME_LEN; | ||
70 | int need; | ||
71 | va_list args; | ||
72 | char * name; | ||
73 | |||
74 | /* | ||
75 | * First, try the static array | ||
76 | */ | ||
77 | va_start(args,fmt); | ||
78 | need = vsnprintf(item->ci_namebuf,limit,fmt,args); | ||
79 | va_end(args); | ||
80 | if (need < limit) | ||
81 | name = item->ci_namebuf; | ||
82 | else { | ||
83 | /* | ||
84 | * Need more space? Allocate it and try again | ||
85 | */ | ||
86 | limit = need + 1; | ||
87 | name = kmalloc(limit,GFP_KERNEL); | ||
88 | if (!name) { | ||
89 | error = -ENOMEM; | ||
90 | goto Done; | ||
91 | } | ||
92 | va_start(args,fmt); | ||
93 | need = vsnprintf(name,limit,fmt,args); | ||
94 | va_end(args); | ||
95 | |||
96 | /* Still? Give up. */ | ||
97 | if (need >= limit) { | ||
98 | kfree(name); | ||
99 | error = -EFAULT; | ||
100 | goto Done; | ||
101 | } | ||
102 | } | ||
103 | |||
104 | /* Free the old name, if necessary. */ | ||
105 | if (item->ci_name && item->ci_name != item->ci_namebuf) | ||
106 | kfree(item->ci_name); | ||
107 | |||
108 | /* Now, set the new name */ | ||
109 | item->ci_name = name; | ||
110 | Done: | ||
111 | return error; | ||
112 | } | ||
113 | |||
114 | EXPORT_SYMBOL(config_item_set_name); | ||
115 | |||
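Since the function takes a printf-style format, callers can build names directly; anything shorter than CONFIGFS_ITEM_NAME_LEN stays in the static ci_namebuf, and longer names are kmalloc'd as described above. For example (hypothetical item):

	/* "widget3" fits in ci_namebuf; no allocation happens. */
	config_item_set_name(item, "widget%d", 3);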
116 | void config_item_init_type_name(struct config_item *item, | ||
117 | const char *name, | ||
118 | struct config_item_type *type) | ||
119 | { | ||
120 | config_item_set_name(item, "%s", name); | ||
121 | item->ci_type = type; | ||
122 | config_item_init(item); | ||
123 | } | ||
124 | EXPORT_SYMBOL(config_item_init_type_name); | ||
125 | |||
126 | void config_group_init_type_name(struct config_group *group, const char *name, | ||
127 | struct config_item_type *type) | ||
128 | { | ||
129 | config_item_set_name(&group->cg_item, "%s", name); | ||
130 | group->cg_item.ci_type = type; | ||
131 | config_group_init(group); | ||
132 | } | ||
133 | EXPORT_SYMBOL(config_group_init_type_name); | ||
134 | |||
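A subsystem would typically pair these initializers with a config_item_type of its own. A hedged sketch (the widget type, ops, and group names are hypothetical):

	static struct config_item_type widget_group_type = {
		.ct_item_ops	= &widget_item_ops,
		.ct_owner	= THIS_MODULE,
	};

	/* Initialize a group that will appear as a "widgets" directory. */
	config_group_init_type_name(&widgets_group, "widgets",
				    &widget_group_type);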
135 | struct config_item * config_item_get(struct config_item * item) | ||
136 | { | ||
137 | if (item) | ||
138 | kref_get(&item->ci_kref); | ||
139 | return item; | ||
140 | } | ||
141 | |||
142 | /** | ||
143 | * config_item_cleanup - free config_item resources. | ||
144 | * @item: item. | ||
145 | */ | ||
146 | |||
147 | void config_item_cleanup(struct config_item * item) | ||
148 | { | ||
149 | struct config_item_type * t = item->ci_type; | ||
150 | struct config_group * s = item->ci_group; | ||
151 | struct config_item * parent = item->ci_parent; | ||
152 | |||
153 | pr_debug("config_item %s: cleaning up\n",config_item_name(item)); | ||
154 | if (item->ci_name != item->ci_namebuf) | ||
155 | kfree(item->ci_name); | ||
156 | item->ci_name = NULL; | ||
157 | if (t && t->ct_item_ops && t->ct_item_ops->release) | ||
158 | t->ct_item_ops->release(item); | ||
159 | if (s) | ||
160 | config_group_put(s); | ||
161 | if (parent) | ||
162 | config_item_put(parent); | ||
163 | } | ||
164 | |||
165 | static void config_item_release(struct kref *kref) | ||
166 | { | ||
167 | config_item_cleanup(container_of(kref, struct config_item, ci_kref)); | ||
168 | } | ||
169 | |||
170 | /** | ||
171 | * config_item_put - decrement refcount for item. | ||
172 | * @item: item. | ||
173 | * | ||
174 | * Decrement the refcount, and if 0, call config_item_cleanup(). | ||
175 | */ | ||
176 | void config_item_put(struct config_item * item) | ||
177 | { | ||
178 | if (item) | ||
179 | kref_put(&item->ci_kref, config_item_release); | ||
180 | } | ||
181 | |||
182 | |||
183 | /** | ||
184 | * config_group_init - initialize a group for use | ||
185 | * @group: group | ||
186 | */ | ||
187 | |||
188 | void config_group_init(struct config_group *group) | ||
189 | { | ||
190 | config_item_init(&group->cg_item); | ||
191 | INIT_LIST_HEAD(&group->cg_children); | ||
192 | } | ||
193 | |||
194 | |||
195 | /** | ||
196 | * config_group_find_obj - search for item in group. | ||
197 | * @group: group we're looking in. | ||
198 | * @name: item's name. | ||
199 | * | ||
200 | * Iterate over @group->cg_children, looking for a matching | ||
201 | * config_item. If a matching item is found, take a reference | ||
202 | * and return the item. Locking is currently left to the caller. | ||
203 | */ | ||
204 | |||
205 | struct config_item * config_group_find_obj(struct config_group * group, const char * name) | ||
206 | { | ||
207 | struct list_head * entry; | ||
208 | struct config_item * ret = NULL; | ||
209 | |||
210 | /* XXX LOCKING! */ | ||
211 | list_for_each(entry,&group->cg_children) { | ||
212 | struct config_item * item = to_item(entry); | ||
213 | if (config_item_name(item) && | ||
214 | !strcmp(config_item_name(item), name)) { | ||
215 | ret = config_item_get(item); | ||
216 | break; | ||
217 | } | ||
218 | } | ||
219 | return ret; | ||
220 | } | ||
221 | |||
222 | |||
223 | EXPORT_SYMBOL(config_item_init); | ||
224 | EXPORT_SYMBOL(config_group_init); | ||
225 | EXPORT_SYMBOL(config_item_get); | ||
226 | EXPORT_SYMBOL(config_item_put); | ||
227 | |||
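Until the locking noted above is settled, callers of config_group_find_obj() must serialize against concurrent mkdir(2)/rmdir(2) themselves. A usage sketch (the child name is hypothetical):

	struct config_item *child;

	child = config_group_find_obj(group, "widget0");
	if (child) {
		/* child carries a reference from config_item_get() */
		config_item_put(child);
	}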
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
new file mode 100644
index 000000000000..1a2f6f6a4d91
--- /dev/null
+++ b/fs/configfs/mount.c
@@ -0,0 +1,159 @@
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * mount.c - operations for initializing and mounting configfs. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 021110-1307, USA. | ||
20 | * | ||
21 | * Based on sysfs: | ||
22 | * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel | ||
23 | * | ||
24 | * configfs Copyright (C) 2005 Oracle. All rights reserved. | ||
25 | */ | ||
26 | |||
27 | #include <linux/fs.h> | ||
28 | #include <linux/module.h> | ||
29 | #include <linux/mount.h> | ||
30 | #include <linux/pagemap.h> | ||
31 | #include <linux/init.h> | ||
32 | |||
33 | #include <linux/configfs.h> | ||
34 | #include "configfs_internal.h" | ||
35 | |||
36 | /* Random magic number */ | ||
37 | #define CONFIGFS_MAGIC 0x62656570 | ||
38 | |||
39 | struct vfsmount * configfs_mount = NULL; | ||
40 | struct super_block * configfs_sb = NULL; | ||
41 | static int configfs_mnt_count = 0; | ||
42 | |||
43 | static struct super_operations configfs_ops = { | ||
44 | .statfs = simple_statfs, | ||
45 | .drop_inode = generic_delete_inode, | ||
46 | }; | ||
47 | |||
48 | static struct config_group configfs_root_group = { | ||
49 | .cg_item = { | ||
50 | .ci_namebuf = "root", | ||
51 | .ci_name = configfs_root_group.cg_item.ci_namebuf, | ||
52 | }, | ||
53 | }; | ||
54 | |||
55 | int configfs_is_root(struct config_item *item) | ||
56 | { | ||
57 | return item == &configfs_root_group.cg_item; | ||
58 | } | ||
59 | |||
60 | static struct configfs_dirent configfs_root = { | ||
61 | .s_sibling = LIST_HEAD_INIT(configfs_root.s_sibling), | ||
62 | .s_children = LIST_HEAD_INIT(configfs_root.s_children), | ||
63 | .s_element = &configfs_root_group.cg_item, | ||
64 | .s_type = CONFIGFS_ROOT, | ||
65 | }; | ||
66 | |||
67 | static int configfs_fill_super(struct super_block *sb, void *data, int silent) | ||
68 | { | ||
69 | struct inode *inode; | ||
70 | struct dentry *root; | ||
71 | |||
72 | sb->s_blocksize = PAGE_CACHE_SIZE; | ||
73 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; | ||
74 | sb->s_magic = CONFIGFS_MAGIC; | ||
75 | sb->s_op = &configfs_ops; | ||
76 | configfs_sb = sb; | ||
77 | |||
78 | inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO); | ||
79 | if (inode) { | ||
80 | inode->i_op = &configfs_dir_inode_operations; | ||
81 | inode->i_fop = &configfs_dir_operations; | ||
82 | /* directory inodes start off with i_nlink == 2 (for "." entry) */ | ||
83 | inode->i_nlink++; | ||
84 | } else { | ||
85 | pr_debug("configfs: could not get root inode\n"); | ||
86 | return -ENOMEM; | ||
87 | } | ||
88 | |||
89 | root = d_alloc_root(inode); | ||
90 | if (!root) { | ||
91 | pr_debug("%s: could not get root dentry!\n",__FUNCTION__); | ||
92 | iput(inode); | ||
93 | return -ENOMEM; | ||
94 | } | ||
95 | config_group_init(&configfs_root_group); | ||
96 | configfs_root_group.cg_item.ci_dentry = root; | ||
97 | root->d_fsdata = &configfs_root; | ||
98 | sb->s_root = root; | ||
99 | return 0; | ||
100 | } | ||
101 | |||
102 | static struct super_block *configfs_get_sb(struct file_system_type *fs_type, | ||
103 | int flags, const char *dev_name, void *data) | ||
104 | { | ||
105 | return get_sb_single(fs_type, flags, data, configfs_fill_super); | ||
106 | } | ||
107 | |||
108 | static struct file_system_type configfs_fs_type = { | ||
109 | .owner = THIS_MODULE, | ||
110 | .name = "configfs", | ||
111 | .get_sb = configfs_get_sb, | ||
112 | .kill_sb = kill_litter_super, | ||
113 | }; | ||
114 | |||
115 | int configfs_pin_fs(void) | ||
116 | { | ||
117 | return simple_pin_fs("configfs", &configfs_mount, | ||
118 | &configfs_mnt_count); | ||
119 | } | ||
120 | |||
121 | void configfs_release_fs(void) | ||
122 | { | ||
123 | simple_release_fs(&configfs_mount, &configfs_mnt_count); | ||
124 | } | ||
125 | |||
126 | |||
127 | static decl_subsys(config, NULL, NULL); | ||
128 | |||
129 | static int __init configfs_init(void) | ||
130 | { | ||
131 | int err; | ||
132 | |||
133 | kset_set_kset_s(&config_subsys, kernel_subsys); | ||
134 | err = subsystem_register(&config_subsys); | ||
135 | if (err) | ||
136 | return err; | ||
137 | |||
138 | err = register_filesystem(&configfs_fs_type); | ||
139 | if (err) { | ||
140 | printk(KERN_ERR "configfs: Unable to register filesystem!\n"); | ||
141 | subsystem_unregister(&config_subsys); | ||
142 | } | ||
143 | |||
144 | return err; | ||
145 | } | ||
146 | |||
147 | static void __exit configfs_exit(void) | ||
148 | { | ||
149 | unregister_filesystem(&configfs_fs_type); | ||
150 | subsystem_unregister(&config_subsys); | ||
151 | } | ||
152 | |||
153 | MODULE_AUTHOR("Oracle"); | ||
154 | MODULE_LICENSE("GPL"); | ||
155 | MODULE_VERSION("0.0.1"); | ||
156 | MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); | ||
157 | |||
158 | module_init(configfs_init); | ||
159 | module_exit(configfs_exit); | ||
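Once the module is loaded, the filesystem registered above can be mounted from userspace; the /config mountpoint below is only a convention, not something this code creates:

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* Equivalent to: mount -t configfs none /config */
		if (mount("none", "/config", "configfs", 0, NULL)) {
			perror("mount configfs");
			return 1;
		}
		return 0;
	}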
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
new file mode 100644
index 000000000000..50f5840521a9
--- /dev/null
+++ b/fs/configfs/symlink.c
@@ -0,0 +1,281 @@
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * symlink.c - operations for configfs symlinks. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 021110-1307, USA. | ||
20 | * | ||
21 | * Based on sysfs: | ||
22 | * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel | ||
23 | * | ||
24 | * configfs Copyright (C) 2005 Oracle. All rights reserved. | ||
25 | */ | ||
26 | |||
27 | #include <linux/fs.h> | ||
28 | #include <linux/module.h> | ||
29 | #include <linux/namei.h> | ||
30 | |||
31 | #include <linux/configfs.h> | ||
32 | #include "configfs_internal.h" | ||
33 | |||
34 | static int item_depth(struct config_item * item) | ||
35 | { | ||
36 | struct config_item * p = item; | ||
37 | int depth = 0; | ||
38 | do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p)); | ||
39 | return depth; | ||
40 | } | ||
41 | |||
42 | static int item_path_length(struct config_item * item) | ||
43 | { | ||
44 | struct config_item * p = item; | ||
45 | int length = 1; | ||
46 | do { | ||
47 | length += strlen(config_item_name(p)) + 1; | ||
48 | p = p->ci_parent; | ||
49 | } while (p && !configfs_is_root(p)); | ||
50 | return length; | ||
51 | } | ||
52 | |||
53 | static void fill_item_path(struct config_item * item, char * buffer, int length) | ||
54 | { | ||
55 | struct config_item * p; | ||
56 | |||
57 | --length; | ||
58 | for (p = item; p && !configfs_is_root(p); p = p->ci_parent) { | ||
59 | int cur = strlen(config_item_name(p)); | ||
60 | |||
61 | /* back up enough to print this item name with '/' */ | ||
62 | length -= cur; | ||
63 | strncpy(buffer + length,config_item_name(p),cur); | ||
64 | *(buffer + --length) = '/'; | ||
65 | } | ||
66 | } | ||
67 | |||
68 | static int create_link(struct config_item *parent_item, | ||
69 | struct config_item *item, | ||
70 | struct dentry *dentry) | ||
71 | { | ||
72 | struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata; | ||
73 | struct configfs_symlink *sl; | ||
74 | int ret; | ||
75 | |||
76 | ret = -ENOMEM; | ||
77 | sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); | ||
78 | if (sl) { | ||
79 | sl->sl_target = config_item_get(item); | ||
80 | /* FIXME: needs a lock, I'd bet */ | ||
81 | list_add(&sl->sl_list, &target_sd->s_links); | ||
82 | ret = configfs_create_link(sl, parent_item->ci_dentry, | ||
83 | dentry); | ||
84 | if (ret) { | ||
85 | list_del_init(&sl->sl_list); | ||
86 | config_item_put(item); | ||
87 | kfree(sl); | ||
88 | } | ||
89 | } | ||
90 | |||
91 | return ret; | ||
92 | } | ||
93 | |||
94 | |||
95 | static int get_target(const char *symname, struct nameidata *nd, | ||
96 | struct config_item **target) | ||
97 | { | ||
98 | int ret; | ||
99 | |||
100 | ret = path_lookup(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, nd); | ||
101 | if (!ret) { | ||
102 | if (nd->dentry->d_sb == configfs_sb) { | ||
103 | *target = configfs_get_config_item(nd->dentry); | ||
104 | if (!*target) { | ||
105 | ret = -ENOENT; | ||
106 | path_release(nd); | ||
107 | } | ||
108 | } else | ||
109 | ret = -EPERM; | ||
110 | } | ||
111 | |||
112 | return ret; | ||
113 | } | ||
114 | |||
115 | |||
116 | int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) | ||
117 | { | ||
118 | int ret; | ||
119 | struct nameidata nd; | ||
120 | struct config_item *parent_item; | ||
121 | struct config_item *target_item; | ||
122 | struct config_item_type *type; | ||
123 | |||
124 | ret = -EPERM; /* What filesystems without symlink support return */ | ||
125 | if (dentry->d_parent == configfs_sb->s_root) | ||
126 | goto out; | ||
127 | |||
128 | parent_item = configfs_get_config_item(dentry->d_parent); | ||
129 | type = parent_item->ci_type; | ||
130 | |||
131 | if (!type || !type->ct_item_ops || | ||
132 | !type->ct_item_ops->allow_link) | ||
133 | goto out_put; | ||
134 | |||
135 | ret = get_target(symname, &nd, &target_item); | ||
136 | if (ret) | ||
137 | goto out_put; | ||
138 | |||
139 | ret = type->ct_item_ops->allow_link(parent_item, target_item); | ||
140 | if (!ret) | ||
141 | ret = create_link(parent_item, target_item, dentry); | ||
142 | |||
143 | config_item_put(target_item); | ||
144 | path_release(&nd); | ||
145 | |||
146 | out_put: | ||
147 | config_item_put(parent_item); | ||
148 | |||
149 | out: | ||
150 | return ret; | ||
151 | } | ||
152 | |||
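From userspace this path is reached through an ordinary symlink(2) between two configfs directories; the parent item's allow_link() method then decides whether to accept it. A sketch with hypothetical item paths:

	#include <unistd.h>

	/* Link the target item into the source item's directory. */
	symlink("/config/target-subsys/target-item",
		"/config/source-subsys/source-item/link0");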
153 | int configfs_unlink(struct inode *dir, struct dentry *dentry) | ||
154 | { | ||
155 | struct configfs_dirent *sd = dentry->d_fsdata; | ||
156 | struct configfs_symlink *sl; | ||
157 | struct config_item *parent_item; | ||
158 | struct config_item_type *type; | ||
159 | int ret; | ||
160 | |||
161 | ret = -EPERM; /* What filesystems without symlink support return */ | ||
162 | if (!(sd->s_type & CONFIGFS_ITEM_LINK)) | ||
163 | goto out; | ||
164 | |||
165 | if (dentry->d_parent == configfs_sb->s_root) | ||
166 | BUG(); | ||
167 | |||
168 | sl = sd->s_element; | ||
169 | |||
170 | parent_item = configfs_get_config_item(dentry->d_parent); | ||
171 | type = parent_item->ci_type; | ||
172 | |||
173 | list_del_init(&sd->s_sibling); | ||
174 | configfs_drop_dentry(sd, dentry->d_parent); | ||
175 | dput(dentry); | ||
176 | configfs_put(sd); | ||
177 | |||
178 | /* | ||
179 | * drop_link() must be called before | ||
180 | * list_del_init(&sl->sl_list), so that the order of | ||
181 | * drop_link(this, target) and drop_item(target) is preserved. | ||
182 | */ | ||
183 | if (type && type->ct_item_ops && | ||
184 | type->ct_item_ops->drop_link) | ||
185 | type->ct_item_ops->drop_link(parent_item, | ||
186 | sl->sl_target); | ||
187 | |||
188 | /* FIXME: Needs lock */ | ||
189 | list_del_init(&sl->sl_list); | ||
190 | |||
191 | /* Put reference from create_link() */ | ||
192 | config_item_put(sl->sl_target); | ||
193 | kfree(sl); | ||
194 | |||
195 | config_item_put(parent_item); | ||
196 | |||
197 | ret = 0; | ||
198 | |||
199 | out: | ||
200 | return ret; | ||
201 | } | ||
202 | |||
203 | static int configfs_get_target_path(struct config_item * item, struct config_item * target, | ||
204 | char *path) | ||
205 | { | ||
206 | char * s; | ||
207 | int depth, size; | ||
208 | |||
209 | depth = item_depth(item); | ||
210 | size = item_path_length(target) + depth * 3 - 1; | ||
211 | if (size > PATH_MAX) | ||
212 | return -ENAMETOOLONG; | ||
213 | |||
214 | pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size); | ||
215 | |||
216 | for (s = path; depth--; s += 3) | ||
217 | strcpy(s,"../"); | ||
218 | |||
219 | fill_item_path(target, path, size); | ||
220 | pr_debug("%s: path = '%s'\n", __FUNCTION__, path); | ||
221 | |||
222 | return 0; | ||
223 | } | ||
224 | |||
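A worked example, with a hypothetical layout: for a link living in an item two levels below the configfs root and pointing at an item /x/y, depth is 2 and item_path_length(target) is 5, so size = 5 + 2*3 - 1 = 10. The loop emits "../../", then fill_item_path() right-aligns "x/y" behind it, yielding the nine-character link body "../../x/y" plus its NUL terminator in the zeroed page.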
225 | static int configfs_getlink(struct dentry *dentry, char * path) | ||
226 | { | ||
227 | struct config_item *item, *target_item; | ||
228 | int error = 0; | ||
229 | |||
230 | item = configfs_get_config_item(dentry->d_parent); | ||
231 | if (!item) | ||
232 | return -EINVAL; | ||
233 | |||
234 | target_item = configfs_get_config_item(dentry); | ||
235 | if (!target_item) { | ||
236 | config_item_put(item); | ||
237 | return -EINVAL; | ||
238 | } | ||
239 | |||
240 | down_read(&configfs_rename_sem); | ||
241 | error = configfs_get_target_path(item, target_item, path); | ||
242 | up_read(&configfs_rename_sem); | ||
243 | |||
244 | config_item_put(item); | ||
245 | config_item_put(target_item); | ||
246 | return error; | ||
247 | |||
248 | } | ||
249 | |||
250 | static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd) | ||
251 | { | ||
252 | int error = -ENOMEM; | ||
253 | unsigned long page = get_zeroed_page(GFP_KERNEL); | ||
254 | |||
255 | if (page) { | ||
256 | error = configfs_getlink(dentry, (char *)page); | ||
257 | if (!error) { | ||
258 | nd_set_link(nd, (char *)page); | ||
259 | return (void *)page; | ||
260 | } | ||
261 | } | ||
262 | |||
263 | nd_set_link(nd, ERR_PTR(error)); | ||
264 | return NULL; | ||
265 | } | ||
266 | |||
267 | static void configfs_put_link(struct dentry *dentry, struct nameidata *nd, | ||
268 | void *cookie) | ||
269 | { | ||
270 | if (cookie) { | ||
271 | unsigned long page = (unsigned long)cookie; | ||
272 | free_page(page); | ||
273 | } | ||
274 | } | ||
275 | |||
276 | struct inode_operations configfs_symlink_inode_operations = { | ||
277 | .follow_link = configfs_follow_link, | ||
278 | .readlink = generic_readlink, | ||
279 | .put_link = configfs_put_link, | ||
280 | }; | ||
281 | |||
diff --git a/fs/mpage.c b/fs/mpage.c
index c5adcdddf3cc..f1d2d02bd4c8 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -721,7 +721,7 @@ retry:
 				&last_block_in_bio, &ret, wbc,
 				page->mapping->a_ops->writepage);
 		}
-		if (unlikely(ret == WRITEPAGE_ACTIVATE))
+		if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
 			unlock_page(page);
 		if (ret || (--(wbc->nr_to_write) <= 0))
 			done = 1;
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
new file mode 100644
index 000000000000..7d3be845a614
--- /dev/null
+++ b/fs/ocfs2/Makefile
@@ -0,0 +1,33 @@
1 | EXTRA_CFLAGS += -Ifs/ocfs2 | ||
2 | |||
3 | EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES | ||
4 | |||
5 | obj-$(CONFIG_OCFS2_FS) += ocfs2.o | ||
6 | |||
7 | ocfs2-objs := \ | ||
8 | alloc.o \ | ||
9 | aops.o \ | ||
10 | buffer_head_io.o \ | ||
11 | dcache.o \ | ||
12 | dir.o \ | ||
13 | dlmglue.o \ | ||
14 | export.o \ | ||
15 | extent_map.o \ | ||
16 | file.o \ | ||
17 | heartbeat.o \ | ||
18 | inode.o \ | ||
19 | journal.o \ | ||
20 | localalloc.o \ | ||
21 | mmap.o \ | ||
22 | namei.o \ | ||
23 | slot_map.o \ | ||
24 | suballoc.o \ | ||
25 | super.o \ | ||
26 | symlink.o \ | ||
27 | sysfile.o \ | ||
28 | uptodate.o \ | ||
29 | ver.o \ | ||
30 | vote.o | ||
31 | |||
32 | obj-$(CONFIG_OCFS2_FS) += cluster/ | ||
33 | obj-$(CONFIG_OCFS2_FS) += dlm/ | ||
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
new file mode 100644
index 000000000000..465f797451ee
--- /dev/null
+++ b/fs/ocfs2/alloc.c
@@ -0,0 +1,2040 @@
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * alloc.c | ||
5 | * | ||
6 | * Extent allocs and frees | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | |||
31 | #define MLOG_MASK_PREFIX ML_DISK_ALLOC | ||
32 | #include <cluster/masklog.h> | ||
33 | |||
34 | #include "ocfs2.h" | ||
35 | |||
36 | #include "alloc.h" | ||
37 | #include "dlmglue.h" | ||
38 | #include "extent_map.h" | ||
39 | #include "inode.h" | ||
40 | #include "journal.h" | ||
41 | #include "localalloc.h" | ||
42 | #include "suballoc.h" | ||
43 | #include "sysfile.h" | ||
44 | #include "file.h" | ||
45 | #include "super.h" | ||
46 | #include "uptodate.h" | ||
47 | |||
48 | #include "buffer_head_io.h" | ||
49 | |||
50 | static int ocfs2_extent_contig(struct inode *inode, | ||
51 | struct ocfs2_extent_rec *ext, | ||
52 | u64 blkno); | ||
53 | |||
54 | static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, | ||
55 | struct ocfs2_journal_handle *handle, | ||
56 | struct inode *inode, | ||
57 | int wanted, | ||
58 | struct ocfs2_alloc_context *meta_ac, | ||
59 | struct buffer_head *bhs[]); | ||
60 | |||
61 | static int ocfs2_add_branch(struct ocfs2_super *osb, | ||
62 | struct ocfs2_journal_handle *handle, | ||
63 | struct inode *inode, | ||
64 | struct buffer_head *fe_bh, | ||
65 | struct buffer_head *eb_bh, | ||
66 | struct buffer_head *last_eb_bh, | ||
67 | struct ocfs2_alloc_context *meta_ac); | ||
68 | |||
69 | static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, | ||
70 | struct ocfs2_journal_handle *handle, | ||
71 | struct inode *inode, | ||
72 | struct buffer_head *fe_bh, | ||
73 | struct ocfs2_alloc_context *meta_ac, | ||
74 | struct buffer_head **ret_new_eb_bh); | ||
75 | |||
76 | static int ocfs2_do_insert_extent(struct ocfs2_super *osb, | ||
77 | struct ocfs2_journal_handle *handle, | ||
78 | struct inode *inode, | ||
79 | struct buffer_head *fe_bh, | ||
80 | u64 blkno, | ||
81 | u32 new_clusters); | ||
82 | |||
83 | static int ocfs2_find_branch_target(struct ocfs2_super *osb, | ||
84 | struct inode *inode, | ||
85 | struct buffer_head *fe_bh, | ||
86 | struct buffer_head **target_bh); | ||
87 | |||
88 | static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, | ||
89 | struct inode *inode, | ||
90 | struct ocfs2_dinode *fe, | ||
91 | unsigned int new_i_clusters, | ||
92 | struct buffer_head *old_last_eb, | ||
93 | struct buffer_head **new_last_eb); | ||
94 | |||
95 | static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); | ||
96 | |||
97 | static int ocfs2_extent_contig(struct inode *inode, | ||
98 | struct ocfs2_extent_rec *ext, | ||
99 | u64 blkno) | ||
100 | { | ||
101 | return blkno == (le64_to_cpu(ext->e_blkno) + | ||
102 | ocfs2_clusters_to_blocks(inode->i_sb, | ||
103 | le32_to_cpu(ext->e_clusters))); | ||
104 | } | ||
105 | |||
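For a concrete (hypothetical) geometry: with 4 KB blocks and 32 KB clusters there are 8 blocks per cluster, so an extent with e_blkno 800 and e_clusters 2 covers blocks 800-815, and ocfs2_extent_contig() reports a new allocation starting at block 816 = 800 + 2 * 8 as contiguous.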
106 | /* | ||
107 | * How many free extents have we got before we need more meta data? | ||
108 | */ | ||
109 | int ocfs2_num_free_extents(struct ocfs2_super *osb, | ||
110 | struct inode *inode, | ||
111 | struct ocfs2_dinode *fe) | ||
112 | { | ||
113 | int retval; | ||
114 | struct ocfs2_extent_list *el; | ||
115 | struct ocfs2_extent_block *eb; | ||
116 | struct buffer_head *eb_bh = NULL; | ||
117 | |||
118 | mlog_entry_void(); | ||
119 | |||
120 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
121 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | ||
122 | retval = -EIO; | ||
123 | goto bail; | ||
124 | } | ||
125 | |||
126 | if (fe->i_last_eb_blk) { | ||
127 | retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), | ||
128 | &eb_bh, OCFS2_BH_CACHED, inode); | ||
129 | if (retval < 0) { | ||
130 | mlog_errno(retval); | ||
131 | goto bail; | ||
132 | } | ||
133 | eb = (struct ocfs2_extent_block *) eb_bh->b_data; | ||
134 | el = &eb->h_list; | ||
135 | } else | ||
136 | el = &fe->id2.i_list; | ||
137 | |||
138 | BUG_ON(el->l_tree_depth != 0); | ||
139 | |||
140 | retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); | ||
141 | bail: | ||
142 | if (eb_bh) | ||
143 | brelse(eb_bh); | ||
144 | |||
145 | mlog_exit(retval); | ||
146 | return retval; | ||
147 | } | ||
148 | |||
149 | /* expects array to already be allocated | ||
150 | * | ||
151 | * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and | ||
152 | * l_count for you | ||
153 | */ | ||
154 | static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, | ||
155 | struct ocfs2_journal_handle *handle, | ||
156 | struct inode *inode, | ||
157 | int wanted, | ||
158 | struct ocfs2_alloc_context *meta_ac, | ||
159 | struct buffer_head *bhs[]) | ||
160 | { | ||
161 | int count, status, i; | ||
162 | u16 suballoc_bit_start; | ||
163 | u32 num_got; | ||
164 | u64 first_blkno; | ||
165 | struct ocfs2_extent_block *eb; | ||
166 | |||
167 | mlog_entry_void(); | ||
168 | |||
169 | count = 0; | ||
170 | while (count < wanted) { | ||
171 | status = ocfs2_claim_metadata(osb, | ||
172 | handle, | ||
173 | meta_ac, | ||
174 | wanted - count, | ||
175 | &suballoc_bit_start, | ||
176 | &num_got, | ||
177 | &first_blkno); | ||
178 | if (status < 0) { | ||
179 | mlog_errno(status); | ||
180 | goto bail; | ||
181 | } | ||
182 | |||
183 | for(i = count; i < (num_got + count); i++) { | ||
184 | bhs[i] = sb_getblk(osb->sb, first_blkno); | ||
185 | if (bhs[i] == NULL) { | ||
186 | status = -EIO; | ||
187 | mlog_errno(status); | ||
188 | goto bail; | ||
189 | } | ||
190 | ocfs2_set_new_buffer_uptodate(inode, bhs[i]); | ||
191 | |||
192 | status = ocfs2_journal_access(handle, inode, bhs[i], | ||
193 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
194 | if (status < 0) { | ||
195 | mlog_errno(status); | ||
196 | goto bail; | ||
197 | } | ||
198 | |||
199 | memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); | ||
200 | eb = (struct ocfs2_extent_block *) bhs[i]->b_data; | ||
201 | /* Ok, set up the minimal stuff here. */ | ||
202 | strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); | ||
203 | eb->h_blkno = cpu_to_le64(first_blkno); | ||
204 | eb->h_fs_generation = cpu_to_le32(osb->fs_generation); | ||
205 | |||
206 | #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS | ||
207 | /* we always use slot zero's suballocator */ | ||
208 | eb->h_suballoc_slot = 0; | ||
209 | #else | ||
210 | eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); | ||
211 | #endif | ||
212 | eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); | ||
213 | eb->h_list.l_count = | ||
214 | cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); | ||
215 | |||
216 | suballoc_bit_start++; | ||
217 | first_blkno++; | ||
218 | |||
219 | /* We'll also be dirtied by the caller, so | ||
220 | * this isn't absolutely necessary. */ | ||
221 | status = ocfs2_journal_dirty(handle, bhs[i]); | ||
222 | if (status < 0) { | ||
223 | mlog_errno(status); | ||
224 | goto bail; | ||
225 | } | ||
226 | } | ||
227 | |||
228 | count += num_got; | ||
229 | } | ||
230 | |||
231 | status = 0; | ||
232 | bail: | ||
233 | if (status < 0) { | ||
234 | for(i = 0; i < wanted; i++) { | ||
235 | if (bhs[i]) | ||
236 | brelse(bhs[i]); | ||
237 | bhs[i] = NULL; | ||
238 | } | ||
239 | } | ||
240 | mlog_exit(status); | ||
241 | return status; | ||
242 | } | ||
243 | |||
244 | /* | ||
245 | * Add an entire tree branch to our inode. eb_bh is the extent block | ||
246 | * to start at, if we don't want to start the branch at the dinode | ||
247 | * structure. | ||
248 | * | ||
249 | * last_eb_bh is required as we have to update its next_leaf pointer | ||
250 | * for the new last extent block. | ||
251 | * | ||
252 | * the new branch will be 'empty' in the sense that every block will | ||
253 | * contain a single record with e_clusters == 0. | ||
254 | */ | ||
255 | static int ocfs2_add_branch(struct ocfs2_super *osb, | ||
256 | struct ocfs2_journal_handle *handle, | ||
257 | struct inode *inode, | ||
258 | struct buffer_head *fe_bh, | ||
259 | struct buffer_head *eb_bh, | ||
260 | struct buffer_head *last_eb_bh, | ||
261 | struct ocfs2_alloc_context *meta_ac) | ||
262 | { | ||
263 | int status, new_blocks, i; | ||
264 | u64 next_blkno, new_last_eb_blk; | ||
265 | struct buffer_head *bh; | ||
266 | struct buffer_head **new_eb_bhs = NULL; | ||
267 | struct ocfs2_dinode *fe; | ||
268 | struct ocfs2_extent_block *eb; | ||
269 | struct ocfs2_extent_list *eb_el; | ||
270 | struct ocfs2_extent_list *el; | ||
271 | |||
272 | mlog_entry_void(); | ||
273 | |||
274 | BUG_ON(!last_eb_bh); | ||
275 | |||
276 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
277 | |||
278 | if (eb_bh) { | ||
279 | eb = (struct ocfs2_extent_block *) eb_bh->b_data; | ||
280 | el = &eb->h_list; | ||
281 | } else | ||
282 | el = &fe->id2.i_list; | ||
283 | |||
284 | /* we never add a branch to a leaf. */ | ||
285 | BUG_ON(!el->l_tree_depth); | ||
286 | |||
287 | new_blocks = le16_to_cpu(el->l_tree_depth); | ||
288 | |||
289 | /* allocate the number of new eb blocks we need */ | ||
290 | new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), | ||
291 | GFP_KERNEL); | ||
292 | if (!new_eb_bhs) { | ||
293 | status = -ENOMEM; | ||
294 | mlog_errno(status); | ||
295 | goto bail; | ||
296 | } | ||
297 | |||
298 | status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, | ||
299 | meta_ac, new_eb_bhs); | ||
300 | if (status < 0) { | ||
301 | mlog_errno(status); | ||
302 | goto bail; | ||
303 | } | ||
304 | |||
305 | /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be | ||
306 | * linked with the rest of the tree. | ||
307 | * Conversely, new_eb_bhs[0] is the new bottommost leaf. | ||
308 | * | ||
309 | * when we leave the loop, new_last_eb_blk will point to the | ||
310 | * newest leaf, and next_blkno will point to the topmost extent | ||
311 | * block. */ | ||
312 | next_blkno = new_last_eb_blk = 0; | ||
313 | for(i = 0; i < new_blocks; i++) { | ||
314 | bh = new_eb_bhs[i]; | ||
315 | eb = (struct ocfs2_extent_block *) bh->b_data; | ||
316 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
317 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
318 | status = -EIO; | ||
319 | goto bail; | ||
320 | } | ||
321 | eb_el = &eb->h_list; | ||
322 | |||
323 | status = ocfs2_journal_access(handle, inode, bh, | ||
324 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
325 | if (status < 0) { | ||
326 | mlog_errno(status); | ||
327 | goto bail; | ||
328 | } | ||
329 | |||
330 | eb->h_next_leaf_blk = 0; | ||
331 | eb_el->l_tree_depth = cpu_to_le16(i); | ||
332 | eb_el->l_next_free_rec = cpu_to_le16(1); | ||
333 | eb_el->l_recs[0].e_cpos = fe->i_clusters; | ||
334 | eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); | ||
335 | eb_el->l_recs[0].e_clusters = cpu_to_le32(0); | ||
336 | if (!eb_el->l_tree_depth) | ||
337 | new_last_eb_blk = le64_to_cpu(eb->h_blkno); | ||
338 | |||
339 | status = ocfs2_journal_dirty(handle, bh); | ||
340 | if (status < 0) { | ||
341 | mlog_errno(status); | ||
342 | goto bail; | ||
343 | } | ||
344 | |||
345 | next_blkno = le64_to_cpu(eb->h_blkno); | ||
346 | } | ||
347 | |||
348 | /* This is a bit hairy. We want to update up to three blocks | ||
349 | * here without leaving any of them in an inconsistent state | ||
350 | * in case of error. We don't have to worry about | ||
351 | * journal_dirty erroring as it won't unless we've aborted the | ||
352 | * handle (in which case we would never be here) so reserving | ||
353 | * the write with journal_access is all we need to do. */ | ||
354 | status = ocfs2_journal_access(handle, inode, last_eb_bh, | ||
355 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
356 | if (status < 0) { | ||
357 | mlog_errno(status); | ||
358 | goto bail; | ||
359 | } | ||
360 | status = ocfs2_journal_access(handle, inode, fe_bh, | ||
361 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
362 | if (status < 0) { | ||
363 | mlog_errno(status); | ||
364 | goto bail; | ||
365 | } | ||
366 | if (eb_bh) { | ||
367 | status = ocfs2_journal_access(handle, inode, eb_bh, | ||
368 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
369 | if (status < 0) { | ||
370 | mlog_errno(status); | ||
371 | goto bail; | ||
372 | } | ||
373 | } | ||
374 | |||
375 | /* Link the new branch into the rest of the tree (el will | ||
376 | * either be on the fe, or the extent block passed in. */ | ||
377 | i = le16_to_cpu(el->l_next_free_rec); | ||
378 | el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); | ||
379 | el->l_recs[i].e_cpos = fe->i_clusters; | ||
380 | el->l_recs[i].e_clusters = 0; | ||
381 | le16_add_cpu(&el->l_next_free_rec, 1); | ||
382 | |||
383 | /* fe needs a new last extent block pointer, as does the | ||
384 | * next_leaf on the previous last extent block. */ | ||
385 | fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); | ||
386 | |||
387 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
388 | eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); | ||
389 | |||
390 | status = ocfs2_journal_dirty(handle, last_eb_bh); | ||
391 | if (status < 0) | ||
392 | mlog_errno(status); | ||
393 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
394 | if (status < 0) | ||
395 | mlog_errno(status); | ||
396 | if (eb_bh) { | ||
397 | status = ocfs2_journal_dirty(handle, eb_bh); | ||
398 | if (status < 0) | ||
399 | mlog_errno(status); | ||
400 | } | ||
401 | |||
402 | status = 0; | ||
403 | bail: | ||
404 | if (new_eb_bhs) { | ||
405 | for (i = 0; i < new_blocks; i++) | ||
406 | if (new_eb_bhs[i]) | ||
407 | brelse(new_eb_bhs[i]); | ||
408 | kfree(new_eb_bhs); | ||
409 | } | ||
410 | |||
411 | mlog_exit(status); | ||
412 | return status; | ||
413 | } | ||
414 | |||
415 | /* | ||
416 | * Adds another level to the allocation tree. | ||
417 | * Returns the new extent block so you can add a branch to it | ||
418 | * after this call. | ||
419 | */ | ||
420 | static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, | ||
421 | struct ocfs2_journal_handle *handle, | ||
422 | struct inode *inode, | ||
423 | struct buffer_head *fe_bh, | ||
424 | struct ocfs2_alloc_context *meta_ac, | ||
425 | struct buffer_head **ret_new_eb_bh) | ||
426 | { | ||
427 | int status, i; | ||
428 | struct buffer_head *new_eb_bh = NULL; | ||
429 | struct ocfs2_dinode *fe; | ||
430 | struct ocfs2_extent_block *eb; | ||
431 | struct ocfs2_extent_list *fe_el; | ||
432 | struct ocfs2_extent_list *eb_el; | ||
433 | |||
434 | mlog_entry_void(); | ||
435 | |||
436 | status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, | ||
437 | &new_eb_bh); | ||
438 | if (status < 0) { | ||
439 | mlog_errno(status); | ||
440 | goto bail; | ||
441 | } | ||
442 | |||
443 | eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; | ||
444 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
445 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
446 | status = -EIO; | ||
447 | goto bail; | ||
448 | } | ||
449 | |||
450 | eb_el = &eb->h_list; | ||
451 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
452 | fe_el = &fe->id2.i_list; | ||
453 | |||
454 | status = ocfs2_journal_access(handle, inode, new_eb_bh, | ||
455 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
456 | if (status < 0) { | ||
457 | mlog_errno(status); | ||
458 | goto bail; | ||
459 | } | ||
460 | |||
461 | /* copy the fe data into the new extent block */ | ||
462 | eb_el->l_tree_depth = fe_el->l_tree_depth; | ||
463 | eb_el->l_next_free_rec = fe_el->l_next_free_rec; | ||
464 | for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { | ||
465 | eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos; | ||
466 | eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters; | ||
467 | eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno; | ||
468 | } | ||
469 | |||
470 | status = ocfs2_journal_dirty(handle, new_eb_bh); | ||
471 | if (status < 0) { | ||
472 | mlog_errno(status); | ||
473 | goto bail; | ||
474 | } | ||
475 | |||
476 | status = ocfs2_journal_access(handle, inode, fe_bh, | ||
477 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
478 | if (status < 0) { | ||
479 | mlog_errno(status); | ||
480 | goto bail; | ||
481 | } | ||
482 | |||
483 | /* update fe now */ | ||
484 | le16_add_cpu(&fe_el->l_tree_depth, 1); | ||
485 | fe_el->l_recs[0].e_cpos = 0; | ||
486 | fe_el->l_recs[0].e_blkno = eb->h_blkno; | ||
487 | fe_el->l_recs[0].e_clusters = fe->i_clusters; | ||
488 | for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { | ||
489 | fe_el->l_recs[i].e_cpos = 0; | ||
490 | fe_el->l_recs[i].e_clusters = 0; | ||
491 | fe_el->l_recs[i].e_blkno = 0; | ||
492 | } | ||
493 | fe_el->l_next_free_rec = cpu_to_le16(1); | ||
494 | |||
495 | /* If this is our 1st tree depth shift, then last_eb_blk | ||
496 | * becomes the allocated extent block */ | ||
497 | if (fe_el->l_tree_depth == cpu_to_le16(1)) | ||
498 | fe->i_last_eb_blk = eb->h_blkno; | ||
499 | |||
500 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
501 | if (status < 0) { | ||
502 | mlog_errno(status); | ||
503 | goto bail; | ||
504 | } | ||
505 | |||
506 | *ret_new_eb_bh = new_eb_bh; | ||
507 | new_eb_bh = NULL; | ||
508 | status = 0; | ||
509 | bail: | ||
510 | if (new_eb_bh) | ||
511 | brelse(new_eb_bh); | ||
512 | |||
513 | mlog_exit(status); | ||
514 | return status; | ||
515 | } | ||
516 | |||
517 | /* | ||
518 | * Expects the tree to already have room in the rightmost leaf for the | ||
519 | * extent. Updates all the extent blocks (and the dinode) on the way | ||
520 | * down. | ||
521 | */ | ||
522 | static int ocfs2_do_insert_extent(struct ocfs2_super *osb, | ||
523 | struct ocfs2_journal_handle *handle, | ||
524 | struct inode *inode, | ||
525 | struct buffer_head *fe_bh, | ||
526 | u64 start_blk, | ||
527 | u32 new_clusters) | ||
528 | { | ||
529 | int status, i, num_bhs = 0; | ||
530 | u64 next_blkno; | ||
531 | u16 next_free; | ||
532 | struct buffer_head **eb_bhs = NULL; | ||
533 | struct ocfs2_dinode *fe; | ||
534 | struct ocfs2_extent_block *eb; | ||
535 | struct ocfs2_extent_list *el; | ||
536 | |||
537 | mlog_entry_void(); | ||
538 | |||
539 | status = ocfs2_journal_access(handle, inode, fe_bh, | ||
540 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
541 | if (status < 0) { | ||
542 | mlog_errno(status); | ||
543 | goto bail; | ||
544 | } | ||
545 | |||
546 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
547 | el = &fe->id2.i_list; | ||
548 | if (el->l_tree_depth) { | ||
549 | /* This is another operation where we want to be | ||
550 | * careful about our tree updates. An error here means | ||
551 | * none of the previous changes we made should roll | ||
552 | * forward. As a result, we have to record the buffers | ||
553 | * for this part of the tree in an array and reserve a | ||
554 | * journal write to them before making any changes. */ | ||
555 | num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth); | ||
556 | eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *), | ||
557 | GFP_KERNEL); | ||
558 | if (!eb_bhs) { | ||
559 | status = -ENOMEM; | ||
560 | mlog_errno(status); | ||
561 | goto bail; | ||
562 | } | ||
563 | |||
564 | i = 0; | ||
565 | while(el->l_tree_depth) { | ||
566 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
567 | if (next_free == 0) { | ||
568 | ocfs2_error(inode->i_sb, | ||
569 | "Dinode %"MLFu64" has a bad " | ||
570 | "extent list", | ||
571 | OCFS2_I(inode)->ip_blkno); | ||
572 | status = -EIO; | ||
573 | goto bail; | ||
574 | } | ||
575 | next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno); | ||
576 | |||
577 | BUG_ON(i >= num_bhs); | ||
578 | status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i], | ||
579 | OCFS2_BH_CACHED, inode); | ||
580 | if (status < 0) { | ||
581 | mlog_errno(status); | ||
582 | goto bail; | ||
583 | } | ||
584 | eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; | ||
585 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
586 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, | ||
587 | eb); | ||
588 | status = -EIO; | ||
589 | goto bail; | ||
590 | } | ||
591 | |||
592 | status = ocfs2_journal_access(handle, inode, eb_bhs[i], | ||
593 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
594 | if (status < 0) { | ||
595 | mlog_errno(status); | ||
596 | goto bail; | ||
597 | } | ||
598 | |||
599 | el = &eb->h_list; | ||
600 | i++; | ||
601 | /* When we leave this loop, eb_bhs[num_bhs - 1] will | ||
602 | * hold the bottom-most leaf extent block. */ | ||
603 | } | ||
604 | BUG_ON(el->l_tree_depth); | ||
605 | |||
606 | el = &fe->id2.i_list; | ||
607 | /* If we have tree depth, then the fe update is | ||
608 | * trivial, and we want to switch el out for the | ||
609 | * bottom-most leaf in order to update it with the | ||
610 | * actual extent data below. */ | ||
611 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
612 | if (next_free == 0) { | ||
613 | ocfs2_error(inode->i_sb, | ||
614 | "Dinode %"MLFu64" has a bad " | ||
615 | "extent list", | ||
616 | OCFS2_I(inode)->ip_blkno); | ||
617 | status = -EIO; | ||
618 | goto bail; | ||
619 | } | ||
620 | le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, | ||
621 | new_clusters); | ||
622 | /* (num_bhs - 1) to avoid the leaf */ | ||
623 | for(i = 0; i < (num_bhs - 1); i++) { | ||
624 | eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; | ||
625 | el = &eb->h_list; | ||
626 | |||
627 | /* finally, make our actual change to the | ||
628 | * intermediate extent blocks. */ | ||
629 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
630 | le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, | ||
631 | new_clusters); | ||
632 | |||
633 | status = ocfs2_journal_dirty(handle, eb_bhs[i]); | ||
634 | if (status < 0) | ||
635 | mlog_errno(status); | ||
636 | } | ||
637 | BUG_ON(i != (num_bhs - 1)); | ||
638 | /* note that the leaf block wasn't touched in | ||
639 | * the loop above */ | ||
640 | eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data; | ||
641 | el = &eb->h_list; | ||
642 | BUG_ON(el->l_tree_depth); | ||
643 | } | ||
644 | |||
645 | /* yay, we can finally add the actual extent now! */ | ||
646 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
647 | if (le16_to_cpu(el->l_next_free_rec) && | ||
648 | ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) { | ||
649 | le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters); | ||
650 | } else if (le16_to_cpu(el->l_next_free_rec) && | ||
651 | (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) { | ||
652 | /* having an empty extent at eof is legal. */ | ||
653 | if (el->l_recs[i].e_cpos != fe->i_clusters) { | ||
654 | ocfs2_error(inode->i_sb, | ||
655 | "Dinode %"MLFu64" trailing extent is bad: " | ||
656 | "cpos (%u) != number of clusters (%u)", | ||
657 | OCFS2_I(inode)->ip_blkno, le32_to_cpu(el->l_recs[i].e_cpos), | ||
658 | le32_to_cpu(fe->i_clusters)); | ||
659 | status = -EIO; | ||
660 | goto bail; | ||
661 | } | ||
662 | el->l_recs[i].e_blkno = cpu_to_le64(start_blk); | ||
663 | el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); | ||
664 | } else { | ||
665 | /* No contiguous record, or no empty record at eof, so | ||
666 | * we add a new one. */ | ||
667 | |||
668 | BUG_ON(le16_to_cpu(el->l_next_free_rec) >= | ||
669 | le16_to_cpu(el->l_count)); | ||
670 | i = le16_to_cpu(el->l_next_free_rec); | ||
671 | |||
672 | el->l_recs[i].e_blkno = cpu_to_le64(start_blk); | ||
673 | el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); | ||
674 | el->l_recs[i].e_cpos = fe->i_clusters; | ||
675 | le16_add_cpu(&el->l_next_free_rec, 1); | ||
676 | } | ||
677 | |||
678 | /* | ||
679 | * extent_map errors are not fatal, so they are ignored outside | ||
680 | * of flushing the thing. | ||
681 | */ | ||
682 | status = ocfs2_extent_map_append(inode, &el->l_recs[i], | ||
683 | new_clusters); | ||
684 | if (status) { | ||
685 | mlog_errno(status); | ||
686 | ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters)); | ||
687 | } | ||
688 | |||
689 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
690 | if (status < 0) | ||
691 | mlog_errno(status); | ||
692 | if (fe->id2.i_list.l_tree_depth) { | ||
693 | status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]); | ||
694 | if (status < 0) | ||
695 | mlog_errno(status); | ||
696 | } | ||
697 | |||
698 | status = 0; | ||
699 | bail: | ||
700 | if (eb_bhs) { | ||
701 | for (i = 0; i < num_bhs; i++) | ||
702 | if (eb_bhs[i]) | ||
703 | brelse(eb_bhs[i]); | ||
704 | kfree(eb_bhs); | ||
705 | } | ||
706 | |||
707 | mlog_exit(status); | ||
708 | return status; | ||
709 | } | ||
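
The append logic above leans on ocfs2_extent_contig() to decide whether the new range can simply be merged into the tail record. That helper is defined earlier in alloc.c and is not part of this hunk; a minimal sketch consistent with its uses here (nonzero when start_blk begins exactly one block past the record's last block) would be:

	/* Sketch only; the real definition lives earlier in alloc.c. */
	static int ocfs2_extent_contig(struct inode *inode,
				       struct ocfs2_extent_rec *ext,
				       u64 start_blk)
	{
		return start_blk == (le64_to_cpu(ext->e_blkno) +
				     ocfs2_clusters_to_blocks(inode->i_sb,
					     le32_to_cpu(ext->e_clusters)));
	}
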
710 | |||
711 | /* | ||
712 | * Should only be called when there is no space left in any of the | ||
713 | * leaf nodes. What we want to do is find the lowest tree depth | ||
714 | * non-leaf extent block with room for new records. There are three | ||
715 | * valid results of this search: | ||
716 | * | ||
717 | * 1) a lowest extent block is found, then we pass it back in | ||
718 | * *target_bh and return '0' | ||
719 | * | ||
720 | * 2) the search fails to find anything, but the dinode has room. We | ||
721 | * pass NULL back in *target_bh, but still return '0' | ||
722 | * | ||
723 | * 3) the search fails to find anything AND the dinode is full, in | ||
724 | * which case we return > 0 | ||
725 | * | ||
726 | * return status < 0 indicates an error. | ||
727 | */ | ||
728 | static int ocfs2_find_branch_target(struct ocfs2_super *osb, | ||
729 | struct inode *inode, | ||
730 | struct buffer_head *fe_bh, | ||
731 | struct buffer_head **target_bh) | ||
732 | { | ||
733 | int status = 0, i; | ||
734 | u64 blkno; | ||
735 | struct ocfs2_dinode *fe; | ||
736 | struct ocfs2_extent_block *eb; | ||
737 | struct ocfs2_extent_list *el; | ||
738 | struct buffer_head *bh = NULL; | ||
739 | struct buffer_head *lowest_bh = NULL; | ||
740 | |||
741 | mlog_entry_void(); | ||
742 | |||
743 | *target_bh = NULL; | ||
744 | |||
745 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
746 | el = &fe->id2.i_list; | ||
747 | |||
748 | while (le16_to_cpu(el->l_tree_depth) > 1) { | ||
749 | if (le16_to_cpu(el->l_next_free_rec) == 0) { | ||
750 | ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty " | ||
751 | "extent list (next_free_rec == 0)", | ||
752 | OCFS2_I(inode)->ip_blkno); | ||
753 | status = -EIO; | ||
754 | goto bail; | ||
755 | } | ||
756 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
757 | blkno = le64_to_cpu(el->l_recs[i].e_blkno); | ||
758 | if (!blkno) { | ||
759 | ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent " | ||
760 | "list where extent # %d has no physical " | ||
761 | "block start", | ||
762 | OCFS2_I(inode)->ip_blkno, i); | ||
763 | status = -EIO; | ||
764 | goto bail; | ||
765 | } | ||
766 | |||
767 | if (bh) { | ||
768 | brelse(bh); | ||
769 | bh = NULL; | ||
770 | } | ||
771 | |||
772 | status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, | ||
773 | inode); | ||
774 | if (status < 0) { | ||
775 | mlog_errno(status); | ||
776 | goto bail; | ||
777 | } | ||
778 | |||
779 | eb = (struct ocfs2_extent_block *) bh->b_data; | ||
780 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
781 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
782 | status = -EIO; | ||
783 | goto bail; | ||
784 | } | ||
785 | el = &eb->h_list; | ||
786 | |||
787 | if (le16_to_cpu(el->l_next_free_rec) < | ||
788 | le16_to_cpu(el->l_count)) { | ||
789 | if (lowest_bh) | ||
790 | brelse(lowest_bh); | ||
791 | lowest_bh = bh; | ||
792 | get_bh(lowest_bh); | ||
793 | } | ||
794 | } | ||
795 | |||
796 | /* If we didn't find one and the fe doesn't have any room, | ||
797 | * then return '1' */ | ||
798 | if (!lowest_bh | ||
799 | && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) | ||
800 | status = 1; | ||
801 | |||
802 | *target_bh = lowest_bh; | ||
803 | bail: | ||
804 | if (bh) | ||
805 | brelse(bh); | ||
806 | |||
807 | mlog_exit(status); | ||
808 | return status; | ||
809 | } | ||
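
A caller-side sketch of the tri-state contract documented above; ocfs2_insert_extent() below consumes it exactly this way:

	struct buffer_head *target = NULL;
	int shift = ocfs2_find_branch_target(osb, inode, fe_bh, &target);

	if (shift < 0)
		;	/* I/O error or on-disk corruption */
	else if (shift > 0)
		;	/* tree full top to bottom: shift tree depth */
	else if (target)
		;	/* add a branch under this interior block */
	else
		;	/* the dinode's own list still has room */
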
810 | |||
811 | /* the caller needs to update fe->i_clusters */ | ||
812 | int ocfs2_insert_extent(struct ocfs2_super *osb, | ||
813 | struct ocfs2_journal_handle *handle, | ||
814 | struct inode *inode, | ||
815 | struct buffer_head *fe_bh, | ||
816 | u64 start_blk, | ||
817 | u32 new_clusters, | ||
818 | struct ocfs2_alloc_context *meta_ac) | ||
819 | { | ||
820 | int status, i, shift; | ||
821 | struct buffer_head *last_eb_bh = NULL; | ||
822 | struct buffer_head *bh = NULL; | ||
823 | struct ocfs2_dinode *fe; | ||
824 | struct ocfs2_extent_block *eb; | ||
825 | struct ocfs2_extent_list *el; | ||
826 | |||
827 | mlog_entry_void(); | ||
828 | |||
829 | mlog(0, "add %u clusters starting at block %"MLFu64" to " | ||
830 | "inode %"MLFu64"\n", | ||
831 | new_clusters, start_blk, OCFS2_I(inode)->ip_blkno); | ||
832 | |||
833 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
834 | el = &fe->id2.i_list; | ||
835 | |||
836 | if (el->l_tree_depth) { | ||
837 | /* jump to end of tree */ | ||
838 | status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), | ||
839 | &last_eb_bh, OCFS2_BH_CACHED, inode); | ||
840 | if (status < 0) { | ||
841 | mlog_errno(status); | ||
842 | goto bail; | ||
843 | } | ||
844 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
845 | el = &eb->h_list; | ||
846 | } | ||
847 | |||
848 | /* Can we allocate without adding/shifting tree bits? */ | ||
849 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
850 | if (le16_to_cpu(el->l_next_free_rec) == 0 | ||
851 | || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) | ||
852 | || le32_to_cpu(el->l_recs[i].e_clusters) == 0 | ||
853 | || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) | ||
854 | goto out_add; | ||
855 | |||
856 | mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing " | ||
857 | "tree now.\n"); | ||
858 | |||
859 | shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); | ||
860 | if (shift < 0) { | ||
861 | status = shift; | ||
862 | mlog_errno(status); | ||
863 | goto bail; | ||
864 | } | ||
865 | |||
866 | /* We traveled all the way to the bottom of the allocation tree | ||
867 | * and didn't find room for any more extents - we need to add | ||
868 | * another tree level */ | ||
869 | if (shift) { | ||
870 | /* if we hit a leaf, it had better be completely full :) */ | ||
871 | BUG_ON(le16_to_cpu(el->l_next_free_rec) != | ||
872 | le16_to_cpu(el->l_count)); | ||
873 | BUG_ON(bh); | ||
874 | mlog(0, "ocfs2_allocate_extent: need to shift tree depth " | ||
875 | "(current = %u)\n", | ||
876 | le16_to_cpu(fe->id2.i_list.l_tree_depth)); | ||
877 | |||
878 | /* ocfs2_shift_tree_depth will return us a buffer with | ||
879 | * the new extent block (so we can pass that to | ||
880 | * ocfs2_add_branch). */ | ||
881 | status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh, | ||
882 | meta_ac, &bh); | ||
883 | if (status < 0) { | ||
884 | mlog_errno(status); | ||
885 | goto bail; | ||
886 | } | ||
887 | /* Special case: we have room now if we shifted from | ||
888 | * tree_depth 0 */ | ||
889 | if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1)) | ||
890 | goto out_add; | ||
891 | } | ||
892 | |||
893 | /* call ocfs2_add_branch to add the final part of the tree with | ||
894 | * the new data. */ | ||
895 | mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh); | ||
896 | status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, | ||
897 | meta_ac); | ||
898 | if (status < 0) { | ||
899 | mlog_errno(status); | ||
900 | goto bail; | ||
901 | } | ||
902 | |||
903 | out_add: | ||
904 | /* Finally, we can add clusters. */ | ||
905 | status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, | ||
906 | start_blk, new_clusters); | ||
907 | if (status < 0) | ||
908 | mlog_errno(status); | ||
909 | |||
910 | bail: | ||
911 | if (bh) | ||
912 | brelse(bh); | ||
913 | |||
914 | if (last_eb_bh) | ||
915 | brelse(last_eb_bh); | ||
916 | |||
917 | mlog_exit(status); | ||
918 | return status; | ||
919 | } | ||
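
To summarize the three paths through ocfs2_insert_extent() above (a paraphrase of the code, not new behavior):

	/*
	 * 1) Rightmost leaf has room, its tail record is empty, or the
	 *    new range is contiguous with it -> out_add, simple append.
	 * 2) shift > 0, i.e. every level is full
	 *                                     -> ocfs2_shift_tree_depth();
	 *    a shift from depth 0 already made room -> out_add.
	 * 3) Otherwise (some interior block has room, possibly after a
	 *    deeper shift)                    -> ocfs2_add_branch().
	 */
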
920 | |||
921 | static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) | ||
922 | { | ||
923 | struct buffer_head *tl_bh = osb->osb_tl_bh; | ||
924 | struct ocfs2_dinode *di; | ||
925 | struct ocfs2_truncate_log *tl; | ||
926 | |||
927 | di = (struct ocfs2_dinode *) tl_bh->b_data; | ||
928 | tl = &di->id2.i_dealloc; | ||
929 | |||
930 | mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count), | ||
931 | "slot %d, invalid truncate log parameters: used = " | ||
932 | "%u, count = %u\n", osb->slot_num, | ||
933 | le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); | ||
934 | return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); | ||
935 | } | ||
936 | |||
937 | static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, | ||
938 | unsigned int new_start) | ||
939 | { | ||
940 | unsigned int tail_index; | ||
941 | unsigned int current_tail; | ||
942 | |||
943 | /* No records, nothing to coalesce */ | ||
944 | if (!le16_to_cpu(tl->tl_used)) | ||
945 | return 0; | ||
946 | |||
947 | tail_index = le16_to_cpu(tl->tl_used) - 1; | ||
948 | current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); | ||
949 | current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); | ||
950 | |||
951 | return current_tail == new_start; | ||
952 | } | ||
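
A worked example of the coalescing rule, with illustrative numbers: if the tail record is { t_start = 100, t_clusters = 8 }, it covers clusters [100, 108), so:

	ocfs2_truncate_log_can_coalesce(tl, 108);	/* -> 1: adjacent */
	ocfs2_truncate_log_can_coalesce(tl, 109);	/* -> 0: gap */
	ocfs2_truncate_log_can_coalesce(tl, 100);	/* -> 0: not past the tail */
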
953 | |||
954 | static int ocfs2_truncate_log_append(struct ocfs2_super *osb, | ||
955 | struct ocfs2_journal_handle *handle, | ||
956 | u64 start_blk, | ||
957 | unsigned int num_clusters) | ||
958 | { | ||
959 | int status, index; | ||
960 | unsigned int start_cluster, tl_count; | ||
961 | struct inode *tl_inode = osb->osb_tl_inode; | ||
962 | struct buffer_head *tl_bh = osb->osb_tl_bh; | ||
963 | struct ocfs2_dinode *di; | ||
964 | struct ocfs2_truncate_log *tl; | ||
965 | |||
966 | mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk, | ||
967 | num_clusters); | ||
968 | |||
969 | BUG_ON(!down_trylock(&tl_inode->i_sem)); | ||
970 | |||
971 | start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); | ||
972 | |||
973 | di = (struct ocfs2_dinode *) tl_bh->b_data; | ||
974 | tl = &di->id2.i_dealloc; | ||
975 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
976 | OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); | ||
977 | status = -EIO; | ||
978 | goto bail; | ||
979 | } | ||
980 | |||
981 | tl_count = le16_to_cpu(tl->tl_count); | ||
982 | mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || | ||
983 | tl_count == 0, | ||
984 | "Truncate record count on #%"MLFu64" invalid (" | ||
985 | "wanted %u, actual %u\n", OCFS2_I(tl_inode)->ip_blkno, | ||
986 | ocfs2_truncate_recs_per_inode(osb->sb), | ||
987 | le16_to_cpu(tl->tl_count)); | ||
988 | |||
989 | /* Caller should have known to flush before calling us. */ | ||
990 | index = le16_to_cpu(tl->tl_used); | ||
991 | if (index >= tl_count) { | ||
992 | status = -ENOSPC; | ||
993 | mlog_errno(status); | ||
994 | goto bail; | ||
995 | } | ||
996 | |||
997 | status = ocfs2_journal_access(handle, tl_inode, tl_bh, | ||
998 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
999 | if (status < 0) { | ||
1000 | mlog_errno(status); | ||
1001 | goto bail; | ||
1002 | } | ||
1003 | |||
1004 | mlog(0, "Log truncate of %u clusters starting at cluster %u to " | ||
1005 | "%"MLFu64" (index = %d)\n", num_clusters, start_cluster, | ||
1006 | OCFS2_I(tl_inode)->ip_blkno, index); | ||
1007 | |||
1008 | if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { | ||
1009 | /* | ||
1010 | * Move index back to the record we are coalescing with. | ||
1011 | * ocfs2_truncate_log_can_coalesce() guarantees tl_used != 0, so index >= 1 | ||
1012 | */ | ||
1013 | index--; | ||
1014 | |||
1015 | num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); | ||
1016 | mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", | ||
1017 | index, le32_to_cpu(tl->tl_recs[index].t_start), | ||
1018 | num_clusters); | ||
1019 | } else { | ||
1020 | tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); | ||
1021 | tl->tl_used = cpu_to_le16(index + 1); | ||
1022 | } | ||
1023 | tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); | ||
1024 | |||
1025 | status = ocfs2_journal_dirty(handle, tl_bh); | ||
1026 | if (status < 0) { | ||
1027 | mlog_errno(status); | ||
1028 | goto bail; | ||
1029 | } | ||
1030 | |||
1031 | bail: | ||
1032 | mlog_exit(status); | ||
1033 | return status; | ||
1034 | } | ||
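
The BUG_ON(!down_trylock(&tl_inode->i_sem)) at the top of this function (and of __ocfs2_flush_truncate_log() below) asserts the locking contract rather than taking the lock:

	/* down_trylock() returns nonzero when the semaphore is already
	 * held, so this fires only if the caller forgot to take i_sem.
	 * Subtlety: if the semaphore were actually free, the trylock
	 * would briefly acquire it just before the BUG triggers. */
	BUG_ON(!down_trylock(&tl_inode->i_sem));
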
1035 | |||
1036 | static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, | ||
1037 | struct ocfs2_journal_handle *handle, | ||
1038 | struct inode *data_alloc_inode, | ||
1039 | struct buffer_head *data_alloc_bh) | ||
1040 | { | ||
1041 | int status = 0; | ||
1042 | int i; | ||
1043 | unsigned int num_clusters; | ||
1044 | u64 start_blk; | ||
1045 | struct ocfs2_truncate_rec rec; | ||
1046 | struct ocfs2_dinode *di; | ||
1047 | struct ocfs2_truncate_log *tl; | ||
1048 | struct inode *tl_inode = osb->osb_tl_inode; | ||
1049 | struct buffer_head *tl_bh = osb->osb_tl_bh; | ||
1050 | |||
1051 | mlog_entry_void(); | ||
1052 | |||
1053 | di = (struct ocfs2_dinode *) tl_bh->b_data; | ||
1054 | tl = &di->id2.i_dealloc; | ||
1055 | i = le16_to_cpu(tl->tl_used) - 1; | ||
1056 | while (i >= 0) { | ||
1057 | /* Caller has given us at least enough credits to | ||
1058 | * update the truncate log dinode */ | ||
1059 | status = ocfs2_journal_access(handle, tl_inode, tl_bh, | ||
1060 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1061 | if (status < 0) { | ||
1062 | mlog_errno(status); | ||
1063 | goto bail; | ||
1064 | } | ||
1065 | |||
1066 | tl->tl_used = cpu_to_le16(i); | ||
1067 | |||
1068 | status = ocfs2_journal_dirty(handle, tl_bh); | ||
1069 | if (status < 0) { | ||
1070 | mlog_errno(status); | ||
1071 | goto bail; | ||
1072 | } | ||
1073 | |||
1074 | /* TODO: Perhaps we can calculate the bulk of the | ||
1075 | * credits up front rather than extending like | ||
1076 | * this. */ | ||
1077 | status = ocfs2_extend_trans(handle, | ||
1078 | OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); | ||
1079 | if (status < 0) { | ||
1080 | mlog_errno(status); | ||
1081 | goto bail; | ||
1082 | } | ||
1083 | |||
1084 | rec = tl->tl_recs[i]; | ||
1085 | start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, | ||
1086 | le32_to_cpu(rec.t_start)); | ||
1087 | num_clusters = le32_to_cpu(rec.t_clusters); | ||
1088 | |||
1089 | /* if start_blk is not set, we ignore the record as | ||
1090 | * invalid. */ | ||
1091 | if (start_blk) { | ||
1092 | mlog(0, "free record %d, start = %u, clusters = %u\n", | ||
1093 | i, le32_to_cpu(rec.t_start), num_clusters); | ||
1094 | |||
1095 | status = ocfs2_free_clusters(handle, data_alloc_inode, | ||
1096 | data_alloc_bh, start_blk, | ||
1097 | num_clusters); | ||
1098 | if (status < 0) { | ||
1099 | mlog_errno(status); | ||
1100 | goto bail; | ||
1101 | } | ||
1102 | } | ||
1103 | i--; | ||
1104 | } | ||
1105 | |||
1106 | bail: | ||
1107 | mlog_exit(status); | ||
1108 | return status; | ||
1109 | } | ||
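
The loop above grows the running transaction by OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC credits per record. The TODO notes the alternative of reserving everything up front, which (hypothetically, sketched from that comment) would look like:

	/* Hypothetical up-front reservation, not what the code above does: */
	credits = OCFS2_TRUNCATE_LOG_UPDATE +
		  le16_to_cpu(tl->tl_used) * OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC;
	handle = ocfs2_start_trans(osb, NULL, credits);
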
1110 | |||
1111 | /* Expects you to already be holding tl_inode->i_sem */ | ||
1112 | static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) | ||
1113 | { | ||
1114 | int status; | ||
1115 | unsigned int num_to_flush; | ||
1116 | struct ocfs2_journal_handle *handle = NULL; | ||
1117 | struct inode *tl_inode = osb->osb_tl_inode; | ||
1118 | struct inode *data_alloc_inode = NULL; | ||
1119 | struct buffer_head *tl_bh = osb->osb_tl_bh; | ||
1120 | struct buffer_head *data_alloc_bh = NULL; | ||
1121 | struct ocfs2_dinode *di; | ||
1122 | struct ocfs2_truncate_log *tl; | ||
1123 | |||
1124 | mlog_entry_void(); | ||
1125 | |||
1126 | BUG_ON(!down_trylock(&tl_inode->i_sem)); | ||
1127 | |||
1128 | di = (struct ocfs2_dinode *) tl_bh->b_data; | ||
1129 | tl = &di->id2.i_dealloc; | ||
1130 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
1131 | OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); | ||
1132 | status = -EIO; | ||
1133 | goto bail; | ||
1134 | } | ||
1135 | |||
1136 | num_to_flush = le16_to_cpu(tl->tl_used); | ||
1137 | mlog(0, "Flush %u records from truncate log #%"MLFu64"\n", | ||
1138 | num_to_flush, OCFS2_I(tl_inode)->ip_blkno); | ||
1139 | if (!num_to_flush) { | ||
1140 | status = 0; | ||
1141 | goto bail; | ||
1142 | } | ||
1143 | |||
1144 | handle = ocfs2_alloc_handle(osb); | ||
1145 | if (!handle) { | ||
1146 | status = -ENOMEM; | ||
1147 | mlog_errno(status); | ||
1148 | goto bail; | ||
1149 | } | ||
1150 | |||
1151 | data_alloc_inode = ocfs2_get_system_file_inode(osb, | ||
1152 | GLOBAL_BITMAP_SYSTEM_INODE, | ||
1153 | OCFS2_INVALID_SLOT); | ||
1154 | if (!data_alloc_inode) { | ||
1155 | status = -EINVAL; | ||
1156 | mlog(ML_ERROR, "Could not get bitmap inode!\n"); | ||
1157 | goto bail; | ||
1158 | } | ||
1159 | |||
1160 | ocfs2_handle_add_inode(handle, data_alloc_inode); | ||
1161 | status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1); | ||
1162 | if (status < 0) { | ||
1163 | mlog_errno(status); | ||
1164 | goto bail; | ||
1165 | } | ||
1166 | |||
1167 | handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE); | ||
1168 | if (IS_ERR(handle)) { | ||
1169 | status = PTR_ERR(handle); | ||
1170 | handle = NULL; | ||
1171 | mlog_errno(status); | ||
1172 | goto bail; | ||
1173 | } | ||
1174 | |||
1175 | status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, | ||
1176 | data_alloc_bh); | ||
1177 | if (status < 0) { | ||
1178 | mlog_errno(status); | ||
1179 | goto bail; | ||
1180 | } | ||
1181 | |||
1182 | bail: | ||
1183 | if (handle) | ||
1184 | ocfs2_commit_trans(handle); | ||
1185 | |||
1186 | if (data_alloc_inode) | ||
1187 | iput(data_alloc_inode); | ||
1188 | |||
1189 | if (data_alloc_bh) | ||
1190 | brelse(data_alloc_bh); | ||
1191 | |||
1192 | mlog_exit(status); | ||
1193 | return status; | ||
1194 | } | ||
1195 | |||
1196 | int ocfs2_flush_truncate_log(struct ocfs2_super *osb) | ||
1197 | { | ||
1198 | int status; | ||
1199 | struct inode *tl_inode = osb->osb_tl_inode; | ||
1200 | |||
1201 | down(&tl_inode->i_sem); | ||
1202 | status = __ocfs2_flush_truncate_log(osb); | ||
1203 | up(&tl_inode->i_sem); | ||
1204 | |||
1205 | return status; | ||
1206 | } | ||
1207 | |||
1208 | static void ocfs2_truncate_log_worker(void *data) | ||
1209 | { | ||
1210 | int status; | ||
1211 | struct ocfs2_super *osb = data; | ||
1212 | |||
1213 | mlog_entry_void(); | ||
1214 | |||
1215 | status = ocfs2_flush_truncate_log(osb); | ||
1216 | if (status < 0) | ||
1217 | mlog_errno(status); | ||
1218 | |||
1219 | mlog_exit(status); | ||
1220 | } | ||
1221 | |||
1222 | #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) | ||
1223 | void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, | ||
1224 | int cancel) | ||
1225 | { | ||
1226 | if (osb->osb_tl_inode) { | ||
1227 | /* We want to push off log flushes while truncates are | ||
1228 | * still running. */ | ||
1229 | if (cancel) | ||
1230 | cancel_delayed_work(&osb->osb_truncate_log_wq); | ||
1231 | |||
1232 | queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, | ||
1233 | OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); | ||
1234 | } | ||
1235 | } | ||
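
Callers that have just queued new truncate records pass cancel = 1 to debounce the periodic flush, e.g. as ocfs2_commit_truncate() does further down:

	/* push any pending flush back out by a full interval */
	ocfs2_schedule_truncate_log_flush(osb, 1);
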
1236 | |||
1237 | static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, | ||
1238 | int slot_num, | ||
1239 | struct inode **tl_inode, | ||
1240 | struct buffer_head **tl_bh) | ||
1241 | { | ||
1242 | int status; | ||
1243 | struct inode *inode = NULL; | ||
1244 | struct buffer_head *bh = NULL; | ||
1245 | |||
1246 | inode = ocfs2_get_system_file_inode(osb, | ||
1247 | TRUNCATE_LOG_SYSTEM_INODE, | ||
1248 | slot_num); | ||
1249 | if (!inode) { | ||
1250 | status = -EINVAL; | ||
1251 | mlog(ML_ERROR, "Could not load truncate log inode!\n"); | ||
1252 | goto bail; | ||
1253 | } | ||
1254 | |||
1255 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, | ||
1256 | OCFS2_BH_CACHED, inode); | ||
1257 | if (status < 0) { | ||
1258 | iput(inode); | ||
1259 | mlog_errno(status); | ||
1260 | goto bail; | ||
1261 | } | ||
1262 | |||
1263 | *tl_inode = inode; | ||
1264 | *tl_bh = bh; | ||
1265 | bail: | ||
1266 | mlog_exit(status); | ||
1267 | return status; | ||
1268 | } | ||
1269 | |||
1270 | /* called during the 1st stage of node recovery. we stamp a clean | ||
1271 | * truncate log and pass back a copy for processing later. if the | ||
1272 | * truncate log does not require processing, *tl_copy is set to | ||
1273 | * NULL. */ | ||
1274 | int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, | ||
1275 | int slot_num, | ||
1276 | struct ocfs2_dinode **tl_copy) | ||
1277 | { | ||
1278 | int status; | ||
1279 | struct inode *tl_inode = NULL; | ||
1280 | struct buffer_head *tl_bh = NULL; | ||
1281 | struct ocfs2_dinode *di; | ||
1282 | struct ocfs2_truncate_log *tl; | ||
1283 | |||
1284 | *tl_copy = NULL; | ||
1285 | |||
1286 | mlog(0, "recover truncate log from slot %d\n", slot_num); | ||
1287 | |||
1288 | status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); | ||
1289 | if (status < 0) { | ||
1290 | mlog_errno(status); | ||
1291 | goto bail; | ||
1292 | } | ||
1293 | |||
1294 | di = (struct ocfs2_dinode *) tl_bh->b_data; | ||
1295 | tl = &di->id2.i_dealloc; | ||
1296 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
1297 | OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di); | ||
1298 | status = -EIO; | ||
1299 | goto bail; | ||
1300 | } | ||
1301 | |||
1302 | if (le16_to_cpu(tl->tl_used)) { | ||
1303 | mlog(0, "We'll have %u logs to recover\n", | ||
1304 | le16_to_cpu(tl->tl_used)); | ||
1305 | |||
1306 | *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); | ||
1307 | if (!(*tl_copy)) { | ||
1308 | status = -ENOMEM; | ||
1309 | mlog_errno(status); | ||
1310 | goto bail; | ||
1311 | } | ||
1312 | |||
1313 | /* Assuming the write-out below goes well, this copy | ||
1314 | * will be passed back to recovery for processing. */ | ||
1315 | memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); | ||
1316 | |||
1317 | /* All we need to do to clear the truncate log is set | ||
1318 | * tl_used. */ | ||
1319 | tl->tl_used = 0; | ||
1320 | |||
1321 | status = ocfs2_write_block(osb, tl_bh, tl_inode); | ||
1322 | if (status < 0) { | ||
1323 | mlog_errno(status); | ||
1324 | goto bail; | ||
1325 | } | ||
1326 | } | ||
1327 | |||
1328 | bail: | ||
1329 | if (tl_inode) | ||
1330 | iput(tl_inode); | ||
1331 | if (tl_bh) | ||
1332 | brelse(tl_bh); | ||
1333 | |||
1334 | if (status < 0 && (*tl_copy)) { | ||
1335 | kfree(*tl_copy); | ||
1336 | *tl_copy = NULL; | ||
1337 | } | ||
1338 | |||
1339 | mlog_exit(status); | ||
1340 | return status; | ||
1341 | } | ||
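
Putting the two recovery entry points together, the expected calling sequence is roughly the following sketch (the actual caller lives in the recovery code elsewhere in this patch):

	struct ocfs2_dinode *tl_copy = NULL;

	status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
	if (status < 0)
		goto out;
	if (tl_copy) {
		/* the dead slot had pending records; replay them locally */
		status = ocfs2_complete_truncate_log_recovery(osb, tl_copy);
		kfree(tl_copy);		/* the copy belongs to the caller */
	}
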
1342 | |||
1343 | int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, | ||
1344 | struct ocfs2_dinode *tl_copy) | ||
1345 | { | ||
1346 | int status = 0; | ||
1347 | int i; | ||
1348 | unsigned int clusters, num_recs, start_cluster; | ||
1349 | u64 start_blk; | ||
1350 | struct ocfs2_journal_handle *handle; | ||
1351 | struct inode *tl_inode = osb->osb_tl_inode; | ||
1352 | struct ocfs2_truncate_log *tl; | ||
1353 | |||
1354 | mlog_entry_void(); | ||
1355 | |||
1356 | if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { | ||
1357 | mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); | ||
1358 | return -EINVAL; | ||
1359 | } | ||
1360 | |||
1361 | tl = &tl_copy->id2.i_dealloc; | ||
1362 | num_recs = le16_to_cpu(tl->tl_used); | ||
1363 | mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs, | ||
1364 | tl_copy->i_blkno); | ||
1365 | |||
1366 | down(&tl_inode->i_sem); | ||
1367 | for(i = 0; i < num_recs; i++) { | ||
1368 | if (ocfs2_truncate_log_needs_flush(osb)) { | ||
1369 | status = __ocfs2_flush_truncate_log(osb); | ||
1370 | if (status < 0) { | ||
1371 | mlog_errno(status); | ||
1372 | goto bail_up; | ||
1373 | } | ||
1374 | } | ||
1375 | |||
1376 | handle = ocfs2_start_trans(osb, NULL, | ||
1377 | OCFS2_TRUNCATE_LOG_UPDATE); | ||
1378 | if (IS_ERR(handle)) { | ||
1379 | status = PTR_ERR(handle); | ||
1380 | mlog_errno(status); | ||
1381 | goto bail_up; | ||
1382 | } | ||
1383 | |||
1384 | clusters = le32_to_cpu(tl->tl_recs[i].t_clusters); | ||
1385 | start_cluster = le32_to_cpu(tl->tl_recs[i].t_start); | ||
1386 | start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster); | ||
1387 | |||
1388 | status = ocfs2_truncate_log_append(osb, handle, | ||
1389 | start_blk, clusters); | ||
1390 | ocfs2_commit_trans(handle); | ||
1391 | if (status < 0) { | ||
1392 | mlog_errno(status); | ||
1393 | goto bail_up; | ||
1394 | } | ||
1395 | } | ||
1396 | |||
1397 | bail_up: | ||
1398 | up(&tl_inode->i_sem); | ||
1399 | |||
1400 | mlog_exit(status); | ||
1401 | return status; | ||
1402 | } | ||
1403 | |||
1404 | void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) | ||
1405 | { | ||
1406 | int status; | ||
1407 | struct inode *tl_inode = osb->osb_tl_inode; | ||
1408 | |||
1409 | mlog_entry_void(); | ||
1410 | |||
1411 | if (tl_inode) { | ||
1412 | cancel_delayed_work(&osb->osb_truncate_log_wq); | ||
1413 | flush_workqueue(ocfs2_wq); | ||
1414 | |||
1415 | status = ocfs2_flush_truncate_log(osb); | ||
1416 | if (status < 0) | ||
1417 | mlog_errno(status); | ||
1418 | |||
1419 | brelse(osb->osb_tl_bh); | ||
1420 | iput(osb->osb_tl_inode); | ||
1421 | } | ||
1422 | |||
1423 | mlog_exit_void(); | ||
1424 | } | ||
1425 | |||
1426 | int ocfs2_truncate_log_init(struct ocfs2_super *osb) | ||
1427 | { | ||
1428 | int status; | ||
1429 | struct inode *tl_inode = NULL; | ||
1430 | struct buffer_head *tl_bh = NULL; | ||
1431 | |||
1432 | mlog_entry_void(); | ||
1433 | |||
1434 | status = ocfs2_get_truncate_log_info(osb, | ||
1435 | osb->slot_num, | ||
1436 | &tl_inode, | ||
1437 | &tl_bh); | ||
1438 | if (status < 0) | ||
1439 | mlog_errno(status); | ||
1440 | |||
1441 | /* ocfs2_truncate_log_shutdown keys on the existence of | ||
1442 | * osb->osb_tl_inode so we don't set any of the osb variables | ||
1443 | * until we're sure all is well. */ | ||
1444 | INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb); | ||
1445 | osb->osb_tl_bh = tl_bh; | ||
1446 | osb->osb_tl_inode = tl_inode; | ||
1447 | |||
1448 | mlog_exit(status); | ||
1449 | return status; | ||
1450 | } | ||
1451 | |||
1452 | /* This function will figure out whether the currently last extent | ||
1453 | * block will be deleted, and if it will, what the new last extent | ||
1454 | * block will be so we can update its h_next_leaf_blk field, as well | ||
1455 | * as the dinode's i_last_eb_blk */ | ||
1456 | static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, | ||
1457 | struct inode *inode, | ||
1458 | struct ocfs2_dinode *fe, | ||
1459 | u32 new_i_clusters, | ||
1460 | struct buffer_head *old_last_eb, | ||
1461 | struct buffer_head **new_last_eb) | ||
1462 | { | ||
1463 | int i, status = 0; | ||
1464 | u64 block = 0; | ||
1465 | struct ocfs2_extent_block *eb; | ||
1466 | struct ocfs2_extent_list *el; | ||
1467 | struct buffer_head *bh = NULL; | ||
1468 | |||
1469 | *new_last_eb = NULL; | ||
1470 | |||
1471 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
1472 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | ||
1473 | status = -EIO; | ||
1474 | goto bail; | ||
1475 | } | ||
1476 | |||
1477 | /* we have no tree, so of course, no last_eb. */ | ||
1478 | if (!fe->id2.i_list.l_tree_depth) | ||
1479 | goto bail; | ||
1480 | |||
1481 | /* truncate to zero is a special case - it collapses the tree | ||
1482 | * to depth 0, so there is no new last_eb to find. */ | ||
1483 | if (!new_i_clusters) | ||
1484 | goto bail; | ||
1485 | |||
1486 | eb = (struct ocfs2_extent_block *) old_last_eb->b_data; | ||
1487 | el = &(eb->h_list); | ||
1488 | BUG_ON(!el->l_next_free_rec); | ||
1489 | |||
1490 | /* Make sure that this guy will actually be empty after we | ||
1491 | * clear away the data. */ | ||
1492 | if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters) | ||
1493 | goto bail; | ||
1494 | |||
1495 | /* Ok, at this point, we know that last_eb will definitely | ||
1496 | * change, so lets traverse the tree and find the second to | ||
1497 | * last extent block. */ | ||
1498 | el = &(fe->id2.i_list); | ||
1499 | /* go down the tree, */ | ||
1500 | do { | ||
1501 | for (i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) { | ||
1502 | if (le32_to_cpu(el->l_recs[i].e_cpos) < | ||
1503 | new_i_clusters) { | ||
1504 | block = le64_to_cpu(el->l_recs[i].e_blkno); | ||
1505 | break; | ||
1506 | } | ||
1507 | } | ||
1508 | BUG_ON(i < 0); | ||
1509 | |||
1510 | if (bh) { | ||
1511 | brelse(bh); | ||
1512 | bh = NULL; | ||
1513 | } | ||
1514 | |||
1515 | status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED, | ||
1516 | inode); | ||
1517 | if (status < 0) { | ||
1518 | mlog_errno(status); | ||
1519 | goto bail; | ||
1520 | } | ||
1521 | eb = (struct ocfs2_extent_block *) bh->b_data; | ||
1522 | el = &eb->h_list; | ||
1523 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
1524 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
1525 | status = -EIO; | ||
1526 | goto bail; | ||
1527 | } | ||
1528 | } while (el->l_tree_depth); | ||
1529 | |||
1530 | *new_last_eb = bh; | ||
1531 | get_bh(*new_last_eb); | ||
1532 | mlog(0, "returning block %"MLFu64"\n", le64_to_cpu(eb->h_blkno)); | ||
1533 | bail: | ||
1534 | if (bh) | ||
1535 | brelse(bh); | ||
1536 | |||
1537 | return status; | ||
1538 | } | ||
1539 | |||
1540 | static int ocfs2_do_truncate(struct ocfs2_super *osb, | ||
1541 | unsigned int clusters_to_del, | ||
1542 | struct inode *inode, | ||
1543 | struct buffer_head *fe_bh, | ||
1544 | struct buffer_head *old_last_eb_bh, | ||
1545 | struct ocfs2_journal_handle *handle, | ||
1546 | struct ocfs2_truncate_context *tc) | ||
1547 | { | ||
1548 | int status, i, depth; | ||
1549 | struct ocfs2_dinode *fe; | ||
1550 | struct ocfs2_extent_block *eb; | ||
1551 | struct ocfs2_extent_block *last_eb = NULL; | ||
1552 | struct ocfs2_extent_list *el; | ||
1553 | struct buffer_head *eb_bh = NULL; | ||
1554 | struct buffer_head *last_eb_bh = NULL; | ||
1555 | u64 next_eb = 0; | ||
1556 | u64 delete_blk = 0; | ||
1557 | |||
1558 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
1559 | |||
1560 | status = ocfs2_find_new_last_ext_blk(osb, | ||
1561 | inode, | ||
1562 | fe, | ||
1563 | le32_to_cpu(fe->i_clusters) - | ||
1564 | clusters_to_del, | ||
1565 | old_last_eb_bh, | ||
1566 | &last_eb_bh); | ||
1567 | if (status < 0) { | ||
1568 | mlog_errno(status); | ||
1569 | goto bail; | ||
1570 | } | ||
1571 | if (last_eb_bh) | ||
1572 | last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
1573 | |||
1574 | status = ocfs2_journal_access(handle, inode, fe_bh, | ||
1575 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1576 | if (status < 0) { | ||
1577 | mlog_errno(status); | ||
1578 | goto bail; | ||
1579 | } | ||
1580 | el = &(fe->id2.i_list); | ||
1581 | |||
1582 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
1583 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - | ||
1584 | clusters_to_del; | ||
1585 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1586 | le32_add_cpu(&fe->i_clusters, -clusters_to_del); | ||
1587 | fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec); | ||
1588 | fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec); | ||
1589 | |||
1590 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
1591 | |||
1592 | BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); | ||
1593 | le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); | ||
1594 | /* tree depth zero, we can just delete the clusters, otherwise | ||
1595 | * we need to record the offset of the next level extent block | ||
1596 | * as we may overwrite it. */ | ||
1597 | if (!el->l_tree_depth) | ||
1598 | delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) | ||
1599 | + ocfs2_clusters_to_blocks(osb->sb, | ||
1600 | le32_to_cpu(el->l_recs[i].e_clusters)); | ||
1601 | else | ||
1602 | next_eb = le64_to_cpu(el->l_recs[i].e_blkno); | ||
1603 | |||
1604 | if (!el->l_recs[i].e_clusters) { | ||
1605 | /* if we deleted the whole extent record, then clear | ||
1606 | * out the other fields and update the extent | ||
1607 | * list. For depth > 0 trees, we've already recorded | ||
1608 | * the extent block in 'next_eb' */ | ||
1609 | el->l_recs[i].e_cpos = 0; | ||
1610 | el->l_recs[i].e_blkno = 0; | ||
1611 | BUG_ON(!el->l_next_free_rec); | ||
1612 | le16_add_cpu(&el->l_next_free_rec, -1); | ||
1613 | } | ||
1614 | |||
1615 | depth = le16_to_cpu(el->l_tree_depth); | ||
1616 | if (!fe->i_clusters) { | ||
1617 | /* trunc to zero is a special case. */ | ||
1618 | el->l_tree_depth = 0; | ||
1619 | fe->i_last_eb_blk = 0; | ||
1620 | } else if (last_eb) | ||
1621 | fe->i_last_eb_blk = last_eb->h_blkno; | ||
1622 | |||
1623 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
1624 | if (status < 0) { | ||
1625 | mlog_errno(status); | ||
1626 | goto bail; | ||
1627 | } | ||
1628 | |||
1629 | if (last_eb) { | ||
1630 | /* If there will be a new last extent block, then by | ||
1631 | * definition, there cannot be any leaves to the right of | ||
1632 | * it. */ | ||
1633 | status = ocfs2_journal_access(handle, inode, last_eb_bh, | ||
1634 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1635 | if (status < 0) { | ||
1636 | mlog_errno(status); | ||
1637 | goto bail; | ||
1638 | } | ||
1639 | last_eb->h_next_leaf_blk = 0; | ||
1640 | status = ocfs2_journal_dirty(handle, last_eb_bh); | ||
1641 | if (status < 0) { | ||
1642 | mlog_errno(status); | ||
1643 | goto bail; | ||
1644 | } | ||
1645 | } | ||
1646 | |||
1647 | /* if our tree depth > 0, update all the tree blocks below us. */ | ||
1648 | while (depth) { | ||
1649 | mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64")\n", | ||
1650 | depth, next_eb); | ||
1651 | status = ocfs2_read_block(osb, next_eb, &eb_bh, | ||
1652 | OCFS2_BH_CACHED, inode); | ||
1653 | if (status < 0) { | ||
1654 | mlog_errno(status); | ||
1655 | goto bail; | ||
1656 | } | ||
1657 | eb = (struct ocfs2_extent_block *)eb_bh->b_data; | ||
1658 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
1659 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
1660 | status = -EIO; | ||
1661 | goto bail; | ||
1662 | } | ||
1663 | el = &(eb->h_list); | ||
1664 | |||
1665 | status = ocfs2_journal_access(handle, inode, eb_bh, | ||
1666 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1667 | if (status < 0) { | ||
1668 | mlog_errno(status); | ||
1669 | goto bail; | ||
1670 | } | ||
1671 | |||
1672 | BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); | ||
1673 | BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1)); | ||
1674 | |||
1675 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
1676 | |||
1677 | mlog(0, "extent block %"MLFu64", before: record %d: " | ||
1678 | "(%u, %u, %"MLFu64"), next = %u\n", | ||
1679 | le64_to_cpu(eb->h_blkno), i, | ||
1680 | le32_to_cpu(el->l_recs[i].e_cpos), | ||
1681 | le32_to_cpu(el->l_recs[i].e_clusters), | ||
1682 | le64_to_cpu(el->l_recs[i].e_blkno), | ||
1683 | le16_to_cpu(el->l_next_free_rec)); | ||
1684 | |||
1685 | BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); | ||
1686 | le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); | ||
1687 | |||
1688 | next_eb = le64_to_cpu(el->l_recs[i].e_blkno); | ||
1689 | /* bottom-most block requires us to delete data.*/ | ||
1690 | if (!el->l_tree_depth) | ||
1691 | delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) | ||
1692 | + ocfs2_clusters_to_blocks(osb->sb, | ||
1693 | le32_to_cpu(el->l_recs[i].e_clusters)); | ||
1694 | if (!el->l_recs[i].e_clusters) { | ||
1695 | el->l_recs[i].e_cpos = 0; | ||
1696 | el->l_recs[i].e_blkno = 0; | ||
1697 | BUG_ON(!el->l_next_free_rec); | ||
1698 | le16_add_cpu(&el->l_next_free_rec, -1); | ||
1699 | } | ||
1700 | mlog(0, "extent block %"MLFu64", after: record %d: " | ||
1701 | "(%u, %u, %"MLFu64"), next = %u\n", | ||
1702 | le64_to_cpu(eb->h_blkno), i, | ||
1703 | le32_to_cpu(el->l_recs[i].e_cpos), | ||
1704 | le32_to_cpu(el->l_recs[i].e_clusters), | ||
1705 | le64_to_cpu(el->l_recs[i].e_blkno), | ||
1706 | le16_to_cpu(el->l_next_free_rec)); | ||
1707 | |||
1708 | status = ocfs2_journal_dirty(handle, eb_bh); | ||
1709 | if (status < 0) { | ||
1710 | mlog_errno(status); | ||
1711 | goto bail; | ||
1712 | } | ||
1713 | |||
1714 | if (!el->l_next_free_rec) { | ||
1715 | mlog(0, "deleting this extent block.\n"); | ||
1716 | |||
1717 | ocfs2_remove_from_cache(inode, eb_bh); | ||
1718 | |||
1719 | BUG_ON(eb->h_suballoc_slot); | ||
1720 | BUG_ON(el->l_recs[0].e_clusters); | ||
1721 | BUG_ON(el->l_recs[0].e_cpos); | ||
1722 | BUG_ON(el->l_recs[0].e_blkno); | ||
1723 | status = ocfs2_free_extent_block(handle, | ||
1724 | tc->tc_ext_alloc_inode, | ||
1725 | tc->tc_ext_alloc_bh, | ||
1726 | eb); | ||
1727 | if (status < 0) { | ||
1728 | mlog_errno(status); | ||
1729 | goto bail; | ||
1730 | } | ||
1731 | } | ||
1732 | brelse(eb_bh); | ||
1733 | eb_bh = NULL; | ||
1734 | depth--; | ||
1735 | } | ||
1736 | |||
1737 | BUG_ON(!delete_blk); | ||
1738 | status = ocfs2_truncate_log_append(osb, handle, delete_blk, | ||
1739 | clusters_to_del); | ||
1740 | if (status < 0) { | ||
1741 | mlog_errno(status); | ||
1742 | goto bail; | ||
1743 | } | ||
1744 | status = 0; | ||
1745 | bail: | ||
1746 | if (!status) | ||
1747 | ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters)); | ||
1748 | else | ||
1749 | ocfs2_extent_map_drop(inode, 0); | ||
1750 | mlog_exit(status); | ||
1751 | return status; | ||
1752 | } | ||
1753 | |||
1754 | /* | ||
1755 | * It is expected, that by the time you call this function, | ||
1756 | * inode->i_size and fe->i_size have been adjusted. | ||
1757 | * | ||
1758 | * WARNING: This will kfree the truncate context | ||
1759 | */ | ||
1760 | int ocfs2_commit_truncate(struct ocfs2_super *osb, | ||
1761 | struct inode *inode, | ||
1762 | struct buffer_head *fe_bh, | ||
1763 | struct ocfs2_truncate_context *tc) | ||
1764 | { | ||
1765 | int status, i, credits, tl_sem = 0; | ||
1766 | u32 clusters_to_del, target_i_clusters; | ||
1767 | u64 last_eb = 0; | ||
1768 | struct ocfs2_dinode *fe; | ||
1769 | struct ocfs2_extent_block *eb; | ||
1770 | struct ocfs2_extent_list *el; | ||
1771 | struct buffer_head *last_eb_bh; | ||
1772 | struct ocfs2_journal_handle *handle = NULL; | ||
1773 | struct inode *tl_inode = osb->osb_tl_inode; | ||
1774 | |||
1775 | mlog_entry_void(); | ||
1776 | |||
1777 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1778 | |||
1779 | target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, | ||
1780 | i_size_read(inode)); | ||
1781 | |||
1782 | last_eb_bh = tc->tc_last_eb_bh; | ||
1783 | tc->tc_last_eb_bh = NULL; | ||
1784 | |||
1785 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
1786 | |||
1787 | if (fe->id2.i_list.l_tree_depth) { | ||
1788 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
1789 | el = &eb->h_list; | ||
1790 | } else | ||
1791 | el = &fe->id2.i_list; | ||
1792 | last_eb = le64_to_cpu(fe->i_last_eb_blk); | ||
1793 | start: | ||
1794 | mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " | ||
1795 | "last_eb = %"MLFu64", fe->i_last_eb_blk = %"MLFu64", " | ||
1796 | "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", | ||
1797 | le32_to_cpu(fe->i_clusters), last_eb, | ||
1798 | le64_to_cpu(fe->i_last_eb_blk), | ||
1799 | le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); | ||
1800 | |||
1801 | if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) { | ||
1802 | mlog(0, "last_eb changed!\n"); | ||
1803 | BUG_ON(!fe->id2.i_list.l_tree_depth); | ||
1804 | last_eb = le64_to_cpu(fe->i_last_eb_blk); | ||
1805 | /* i_last_eb_blk may have changed, read it if | ||
1806 | * necessary. We don't have to worry about the | ||
1807 | * truncate to zero case here (where there becomes no | ||
1808 | * last_eb) because we never loop back after our work | ||
1809 | * is done. */ | ||
1810 | if (last_eb_bh) { | ||
1811 | brelse(last_eb_bh); | ||
1812 | last_eb_bh = NULL; | ||
1813 | } | ||
1814 | |||
1815 | status = ocfs2_read_block(osb, last_eb, | ||
1816 | &last_eb_bh, OCFS2_BH_CACHED, | ||
1817 | inode); | ||
1818 | if (status < 0) { | ||
1819 | mlog_errno(status); | ||
1820 | goto bail; | ||
1821 | } | ||
1822 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
1823 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
1824 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
1825 | status = -EIO; | ||
1826 | goto bail; | ||
1827 | } | ||
1828 | el = &(eb->h_list); | ||
1829 | } | ||
1830 | |||
1831 | /* by now, el will point to the extent list on the bottom most | ||
1832 | * portion of this tree. */ | ||
1833 | i = le16_to_cpu(el->l_next_free_rec) - 1; | ||
1834 | if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) | ||
1835 | clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); | ||
1836 | else | ||
1837 | clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + | ||
1838 | le32_to_cpu(el->l_recs[i].e_cpos)) - | ||
1839 | target_i_clusters; | ||
1840 | |||
1841 | mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); | ||
1842 | |||
1843 | down(&tl_inode->i_sem); | ||
1844 | tl_sem = 1; | ||
1845 | /* We need at least one free truncate log record. If | ||
1846 | * ocfs2_truncate_log_needs_flush() says the log is full, we | ||
1847 | * flush it to start from an empty one. */ | ||
1848 | if (ocfs2_truncate_log_needs_flush(osb)) { | ||
1849 | status = __ocfs2_flush_truncate_log(osb); | ||
1850 | if (status < 0) { | ||
1851 | mlog_errno(status); | ||
1852 | goto bail; | ||
1853 | } | ||
1854 | } | ||
1855 | |||
1856 | credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, | ||
1857 | fe, el); | ||
1858 | handle = ocfs2_start_trans(osb, NULL, credits); | ||
1859 | if (IS_ERR(handle)) { | ||
1860 | status = PTR_ERR(handle); | ||
1861 | handle = NULL; | ||
1862 | mlog_errno(status); | ||
1863 | goto bail; | ||
1864 | } | ||
1865 | |||
1866 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | ||
1867 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | ||
1868 | if (status < 0) | ||
1869 | mlog_errno(status); | ||
1870 | |||
1871 | status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, | ||
1872 | last_eb_bh, handle, tc); | ||
1873 | if (status < 0) { | ||
1874 | mlog_errno(status); | ||
1875 | goto bail; | ||
1876 | } | ||
1877 | |||
1878 | up(&tl_inode->i_sem); | ||
1879 | tl_sem = 0; | ||
1880 | |||
1881 | ocfs2_commit_trans(handle); | ||
1882 | handle = NULL; | ||
1883 | |||
1884 | BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); | ||
1885 | if (le32_to_cpu(fe->i_clusters) > target_i_clusters) | ||
1886 | goto start; | ||
1887 | bail: | ||
1888 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1889 | |||
1890 | ocfs2_schedule_truncate_log_flush(osb, 1); | ||
1891 | |||
1892 | if (tl_sem) | ||
1893 | up(&tl_inode->i_sem); | ||
1894 | |||
1895 | if (handle) | ||
1896 | ocfs2_commit_trans(handle); | ||
1897 | |||
1898 | if (last_eb_bh) | ||
1899 | brelse(last_eb_bh); | ||
1900 | |||
1901 | /* This will drop the ext_alloc cluster lock for us */ | ||
1902 | ocfs2_free_truncate_context(tc); | ||
1903 | |||
1904 | mlog_exit(status); | ||
1905 | return status; | ||
1906 | } | ||
1907 | |||
1908 | |||
1909 | /* | ||
1910 | * Expects the inode to already be locked. This will figure out which | ||
1911 | * inodes need to be locked and will put them on the returned truncate | ||
1912 | * context. | ||
1913 | */ | ||
1914 | int ocfs2_prepare_truncate(struct ocfs2_super *osb, | ||
1915 | struct inode *inode, | ||
1916 | struct buffer_head *fe_bh, | ||
1917 | struct ocfs2_truncate_context **tc) | ||
1918 | { | ||
1919 | int status, metadata_delete; | ||
1920 | unsigned int new_i_clusters; | ||
1921 | struct ocfs2_dinode *fe; | ||
1922 | struct ocfs2_extent_block *eb; | ||
1923 | struct ocfs2_extent_list *el; | ||
1924 | struct buffer_head *last_eb_bh = NULL; | ||
1925 | struct inode *ext_alloc_inode = NULL; | ||
1926 | struct buffer_head *ext_alloc_bh = NULL; | ||
1927 | |||
1928 | mlog_entry_void(); | ||
1929 | |||
1930 | *tc = NULL; | ||
1931 | |||
1932 | new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, | ||
1933 | i_size_read(inode)); | ||
1934 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
1935 | |||
1936 | mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" | ||
1937 | "%"MLFu64"\n", fe->i_clusters, new_i_clusters, fe->i_size); | ||
1938 | |||
1939 | if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) { | ||
1940 | ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has cluster count " | ||
1941 | "%u and size %"MLFu64" whereas struct inode has " | ||
1942 | "cluster count %u and size %llu which caused an " | ||
1943 | "invalid truncate to %u clusters.", | ||
1944 | le64_to_cpu(fe->i_blkno), | ||
1945 | le32_to_cpu(fe->i_clusters), | ||
1946 | le64_to_cpu(fe->i_size), | ||
1947 | OCFS2_I(inode)->ip_clusters, i_size_read(inode), | ||
1948 | new_i_clusters); | ||
1949 | mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres); | ||
1950 | status = -EIO; | ||
1951 | goto bail; | ||
1952 | } | ||
1953 | |||
1954 | *tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL); | ||
1955 | if (!(*tc)) { | ||
1956 | status = -ENOMEM; | ||
1957 | mlog_errno(status); | ||
1958 | goto bail; | ||
1959 | } | ||
1960 | |||
1961 | metadata_delete = 0; | ||
1962 | if (fe->id2.i_list.l_tree_depth) { | ||
1963 | /* If we have a tree, then the truncate may result in | ||
1964 | * metadata deletes. Figure this out from the | ||
1965 | * rightmost leaf block.*/ | ||
1966 | status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), | ||
1967 | &last_eb_bh, OCFS2_BH_CACHED, inode); | ||
1968 | if (status < 0) { | ||
1969 | mlog_errno(status); | ||
1970 | goto bail; | ||
1971 | } | ||
1972 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
1973 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
1974 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
1975 | |||
1976 | brelse(last_eb_bh); | ||
1977 | status = -EIO; | ||
1978 | goto bail; | ||
1979 | } | ||
1980 | el = &(eb->h_list); | ||
1981 | if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) | ||
1982 | metadata_delete = 1; | ||
1983 | } | ||
1984 | |||
1985 | (*tc)->tc_last_eb_bh = last_eb_bh; | ||
1986 | |||
1987 | if (metadata_delete) { | ||
1988 | mlog(0, "Will have to delete metadata for this trunc. " | ||
1989 | "locking allocator.\n"); | ||
1990 | ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0); | ||
1991 | if (!ext_alloc_inode) { | ||
1992 | status = -ENOMEM; | ||
1993 | mlog_errno(status); | ||
1994 | goto bail; | ||
1995 | } | ||
1996 | |||
1997 | down(&ext_alloc_inode->i_sem); | ||
1998 | (*tc)->tc_ext_alloc_inode = ext_alloc_inode; | ||
1999 | |||
2000 | status = ocfs2_meta_lock(ext_alloc_inode, | ||
2001 | NULL, | ||
2002 | &ext_alloc_bh, | ||
2003 | 1); | ||
2004 | if (status < 0) { | ||
2005 | mlog_errno(status); | ||
2006 | goto bail; | ||
2007 | } | ||
2008 | (*tc)->tc_ext_alloc_bh = ext_alloc_bh; | ||
2009 | (*tc)->tc_ext_alloc_locked = 1; | ||
2010 | } | ||
2011 | |||
2012 | status = 0; | ||
2013 | bail: | ||
2014 | if (status < 0) { | ||
2015 | if (*tc) | ||
2016 | ocfs2_free_truncate_context(*tc); | ||
2017 | *tc = NULL; | ||
2018 | } | ||
2019 | mlog_exit_void(); | ||
2020 | return status; | ||
2021 | } | ||
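
A caller-side sketch of the prepare/commit pairing defined by this file (the real caller is the file-truncate path elsewhere in this patch):

	struct ocfs2_truncate_context *tc = NULL;

	/* inode->i_size and fe->i_size must already be adjusted */
	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
	if (status < 0)
		goto out;
	/* loops one transaction at a time; kfrees tc for us */
	status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
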
2022 | |||
2023 | static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) | ||
2024 | { | ||
2025 | if (tc->tc_ext_alloc_inode) { | ||
2026 | if (tc->tc_ext_alloc_locked) | ||
2027 | ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); | ||
2028 | |||
2029 | up(&tc->tc_ext_alloc_inode->i_sem); | ||
2030 | iput(tc->tc_ext_alloc_inode); | ||
2031 | } | ||
2032 | |||
2033 | if (tc->tc_ext_alloc_bh) | ||
2034 | brelse(tc->tc_ext_alloc_bh); | ||
2035 | |||
2036 | if (tc->tc_last_eb_bh) | ||
2037 | brelse(tc->tc_last_eb_bh); | ||
2038 | |||
2039 | kfree(tc); | ||
2040 | } | ||
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h new file mode 100644 index 000000000000..12ba897743f4 --- /dev/null +++ b/fs/ocfs2/alloc.h | |||
@@ -0,0 +1,82 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * alloc.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_ALLOC_H | ||
27 | #define OCFS2_ALLOC_H | ||
28 | |||
29 | struct ocfs2_alloc_context; | ||
30 | int ocfs2_insert_extent(struct ocfs2_super *osb, | ||
31 | struct ocfs2_journal_handle *handle, | ||
32 | struct inode *inode, | ||
33 | struct buffer_head *fe_bh, | ||
34 | u64 blkno, | ||
35 | u32 new_clusters, | ||
36 | struct ocfs2_alloc_context *meta_ac); | ||
37 | int ocfs2_num_free_extents(struct ocfs2_super *osb, | ||
38 | struct inode *inode, | ||
39 | struct ocfs2_dinode *fe); | ||
40 | /* how many new metadata chunks would an allocation need at maximum? */ | ||
41 | static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) | ||
42 | { | ||
43 | /* | ||
44 | * Rather than do all the work of determining how much we need | ||
45 | * (involves a ton of reads and locks), just ask for the | ||
46 | * maximal limit. That's a tree depth shift. So, one block for | ||
47 | * level of the tree (current l_tree_depth), one block for the | ||
48 | * new tree_depth==0 extent_block, and one block at the new | ||
49 | * top-of-the tree. | ||
50 | */ | ||
51 | return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2; | ||
52 | } | ||
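
Illustrative numbers for the bound above: a dinode whose list is at l_tree_depth == 2 reserves 2 + 2 = 4 metadata blocks, one per existing tree level, one for the new depth-0 extent block, and one for the new top of the tree.
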
53 | |||
54 | int ocfs2_truncate_log_init(struct ocfs2_super *osb); | ||
55 | void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb); | ||
56 | void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, | ||
57 | int cancel); | ||
58 | int ocfs2_flush_truncate_log(struct ocfs2_super *osb); | ||
59 | int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, | ||
60 | int slot_num, | ||
61 | struct ocfs2_dinode **tl_copy); | ||
62 | int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, | ||
63 | struct ocfs2_dinode *tl_copy); | ||
64 | |||
65 | struct ocfs2_truncate_context { | ||
66 | struct inode *tc_ext_alloc_inode; | ||
67 | struct buffer_head *tc_ext_alloc_bh; | ||
68 | int tc_ext_alloc_locked; /* is it cluster locked? */ | ||
69 | /* these get destroyed once it's passed to ocfs2_commit_truncate. */ | ||
70 | struct buffer_head *tc_last_eb_bh; | ||
71 | }; | ||
72 | |||
73 | int ocfs2_prepare_truncate(struct ocfs2_super *osb, | ||
74 | struct inode *inode, | ||
75 | struct buffer_head *fe_bh, | ||
76 | struct ocfs2_truncate_context **tc); | ||
77 | int ocfs2_commit_truncate(struct ocfs2_super *osb, | ||
78 | struct inode *inode, | ||
79 | struct buffer_head *fe_bh, | ||
80 | struct ocfs2_truncate_context *tc); | ||
81 | |||
82 | #endif /* OCFS2_ALLOC_H */ | ||
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c new file mode 100644 index 000000000000..8f4467a930a5 --- /dev/null +++ b/fs/ocfs2/aops.c | |||
@@ -0,0 +1,643 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 021110-1307, USA. | ||
20 | */ | ||
21 | |||
22 | #include <linux/fs.h> | ||
23 | #include <linux/slab.h> | ||
24 | #include <linux/highmem.h> | ||
25 | #include <linux/pagemap.h> | ||
26 | #include <asm/byteorder.h> | ||
27 | |||
28 | #define MLOG_MASK_PREFIX ML_FILE_IO | ||
29 | #include <cluster/masklog.h> | ||
30 | |||
31 | #include "ocfs2.h" | ||
32 | |||
33 | #include "alloc.h" | ||
34 | #include "aops.h" | ||
35 | #include "dlmglue.h" | ||
36 | #include "extent_map.h" | ||
37 | #include "file.h" | ||
38 | #include "inode.h" | ||
39 | #include "journal.h" | ||
40 | #include "super.h" | ||
41 | #include "symlink.h" | ||
42 | |||
43 | #include "buffer_head_io.h" | ||
44 | |||
45 | static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, | ||
46 | struct buffer_head *bh_result, int create) | ||
47 | { | ||
48 | int err = -EIO; | ||
49 | int status; | ||
50 | struct ocfs2_dinode *fe = NULL; | ||
51 | struct buffer_head *bh = NULL; | ||
52 | struct buffer_head *buffer_cache_bh = NULL; | ||
53 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
54 | void *kaddr; | ||
55 | |||
56 | mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, | ||
57 | (unsigned long long)iblock, bh_result, create); | ||
58 | |||
59 | BUG_ON(ocfs2_inode_is_fast_symlink(inode)); | ||
60 | |||
61 | if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) { | ||
62 | mlog(ML_ERROR, "block offset > PATH_MAX: %llu", | ||
63 | (unsigned long long)iblock); | ||
64 | goto bail; | ||
65 | } | ||
66 | |||
67 | status = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
68 | OCFS2_I(inode)->ip_blkno, | ||
69 | &bh, OCFS2_BH_CACHED, inode); | ||
70 | if (status < 0) { | ||
71 | mlog_errno(status); | ||
72 | goto bail; | ||
73 | } | ||
74 | fe = (struct ocfs2_dinode *) bh->b_data; | ||
75 | |||
76 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
77 | mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n", | ||
78 | le64_to_cpu(fe->i_blkno), 7, fe->i_signature); | ||
79 | goto bail; | ||
80 | } | ||
81 | |||
82 | if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, | ||
83 | le32_to_cpu(fe->i_clusters))) { | ||
84 | mlog(ML_ERROR, "block offset is outside the allocated size: " | ||
85 | "%llu\n", (unsigned long long)iblock); | ||
86 | goto bail; | ||
87 | } | ||
88 | |||
89 | /* We don't use the page cache to create symlink data, so if | ||
90 | * need be, copy it over from the buffer cache. */ | ||
91 | if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) { | ||
92 | u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + | ||
93 | iblock; | ||
94 | buffer_cache_bh = sb_getblk(osb->sb, blkno); | ||
95 | if (!buffer_cache_bh) { | ||
96 | mlog(ML_ERROR, "couldn't getblock for symlink!\n"); | ||
97 | goto bail; | ||
98 | } | ||
99 | |||
100 | /* we haven't locked out transactions, so a commit | ||
101 | * could've happened. Since we've got a reference on | ||
102 | * the bh, even if it commits while we're doing the | ||
103 | * copy, the data is still good. */ | ||
104 | if (buffer_jbd(buffer_cache_bh) | ||
105 | && ocfs2_inode_is_new(inode)) { | ||
106 | kaddr = kmap_atomic(bh_result->b_page, KM_USER0); | ||
107 | if (!kaddr) { | ||
108 | mlog(ML_ERROR, "couldn't kmap!\n"); | ||
109 | goto bail; | ||
110 | } | ||
111 | memcpy(kaddr + (bh_result->b_size * iblock), | ||
112 | buffer_cache_bh->b_data, | ||
113 | bh_result->b_size); | ||
114 | kunmap_atomic(kaddr, KM_USER0); | ||
115 | set_buffer_uptodate(bh_result); | ||
116 | } | ||
117 | brelse(buffer_cache_bh); | ||
118 | } | ||
119 | |||
120 | map_bh(bh_result, inode->i_sb, | ||
121 | le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock); | ||
122 | |||
123 | err = 0; | ||
124 | |||
125 | bail: | ||
126 | if (bh) | ||
127 | brelse(bh); | ||
128 | |||
129 | mlog_exit(err); | ||
130 | return err; | ||
131 | } | ||
132 | |||
133 | static int ocfs2_get_block(struct inode *inode, sector_t iblock, | ||
134 | struct buffer_head *bh_result, int create) | ||
135 | { | ||
136 | int err = 0; | ||
137 | u64 p_blkno, past_eof; | ||
138 | |||
139 | mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, | ||
140 | (unsigned long long)iblock, bh_result, create); | ||
141 | |||
142 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) | ||
143 | mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", | ||
144 | inode, inode->i_ino); | ||
145 | |||
146 | if (S_ISLNK(inode->i_mode)) { | ||
147 | /* this always does I/O for some reason. */ | ||
148 | err = ocfs2_symlink_get_block(inode, iblock, bh_result, create); | ||
149 | goto bail; | ||
150 | } | ||
151 | |||
152 | /* this can happen if another node truncates after our extend! */ | ||
153 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
154 | if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb, | ||
155 | OCFS2_I(inode)->ip_clusters)) | ||
156 | err = -EIO; | ||
157 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
158 | if (err) | ||
159 | goto bail; | ||
160 | |||
161 | err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, | ||
162 | NULL); | ||
163 | if (err) { | ||
164 | mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " | ||
165 | "%"MLFu64", NULL)\n", err, inode, | ||
166 | (unsigned long long)iblock, p_blkno); | ||
167 | goto bail; | ||
168 | } | ||
169 | |||
170 | map_bh(bh_result, inode->i_sb, p_blkno); | ||
171 | |||
172 | if (bh_result->b_blocknr == 0) { | ||
173 | err = -EIO; | ||
174 | mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" " | ||
175 | "blkno=(%"MLFu64")\n", (unsigned long long)iblock, | ||
176 | p_blkno, OCFS2_I(inode)->ip_blkno); | ||
177 | } | ||
178 | |||
179 | past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); | ||
180 | mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof); | ||
181 | |||
182 | if (create && (iblock >= past_eof)) | ||
183 | set_buffer_new(bh_result); | ||
184 | |||
185 | bail: | ||
186 | if (err < 0) | ||
187 | err = -EIO; | ||
188 | |||
189 | mlog_exit(err); | ||
190 | return err; | ||
191 | } | ||
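The ->get_block() contract implemented above is worth seeing in isolation. A minimal sketch follows; example_logical_to_physical() and example_is_past_eof() are hypothetical stand-ins for the extent-map and i_size checks above, not ocfs2 functions:

	static int example_get_block(struct inode *inode, sector_t iblock,
				     struct buffer_head *bh_result, int create)
	{
		/* translate the logical block to a physical one */
		u64 phys = example_logical_to_physical(inode, iblock);

		if (!phys)
			return -EIO;

		/* record the mapping for the caller */
		map_bh(bh_result, inode->i_sb, phys);

		/* a freshly allocated block must be flagged so the caller
		 * zeroes it instead of reading stale disk contents */
		if (create && example_is_past_eof(inode, iblock))
			set_buffer_new(bh_result);
		return 0;
	}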
192 | |||
193 | static int ocfs2_readpage(struct file *file, struct page *page) | ||
194 | { | ||
195 | struct inode *inode = page->mapping->host; | ||
196 | loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; | ||
197 | int ret, unlock = 1; | ||
198 | |||
199 | mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); | ||
200 | |||
201 | ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page); | ||
202 | if (ret != 0) { | ||
203 | if (ret == AOP_TRUNCATED_PAGE) | ||
204 | unlock = 0; | ||
205 | mlog_errno(ret); | ||
206 | goto out; | ||
207 | } | ||
208 | |||
209 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
210 | |||
211 | /* | ||
212 | * i_size might have just been updated as we grabbed the meta lock. We | ||
213 | * might now be discovering a truncate that hit on another node. | ||
214 | * block_read_full_page->get_block freaks out if it is asked to read | ||
215 | * beyond the end of a file, so we check here. Callers | ||
216 | * (generic_file_read, fault->nopage) are clever enough to check i_size | ||
217 | * and notice that the page they just read isn't needed. | ||
218 | * | ||
219 | * XXX sys_readahead() seems to get that wrong? | ||
220 | */ | ||
221 | if (start >= i_size_read(inode)) { | ||
222 | char *addr = kmap(page); | ||
223 | memset(addr, 0, PAGE_SIZE); | ||
224 | flush_dcache_page(page); | ||
225 | kunmap(page); | ||
226 | SetPageUptodate(page); | ||
227 | ret = 0; | ||
228 | goto out_alloc; | ||
229 | } | ||
230 | |||
231 | ret = ocfs2_data_lock_with_page(inode, 0, page); | ||
232 | if (ret != 0) { | ||
233 | if (ret == AOP_TRUNCATED_PAGE) | ||
234 | unlock = 0; | ||
235 | mlog_errno(ret); | ||
236 | goto out_alloc; | ||
237 | } | ||
238 | |||
239 | ret = block_read_full_page(page, ocfs2_get_block); | ||
240 | unlock = 0; | ||
241 | |||
242 | ocfs2_data_unlock(inode, 0); | ||
243 | out_alloc: | ||
244 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
245 | ocfs2_meta_unlock(inode, 0); | ||
246 | out: | ||
247 | if (unlock) | ||
248 | unlock_page(page); | ||
249 | mlog_exit(ret); | ||
250 | return ret; | ||
251 | } | ||
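The "unlock" flag above encodes the AOP_TRUNCATED_PAGE convention: a lock helper that returns AOP_TRUNCATED_PAGE has already unlocked the page, and the VFS will look the page up again and retry, so the caller must not unlock it a second time. The bare pattern, with a hypothetical helper name:

	ret = example_lock_with_page(inode, page);
	if (ret == AOP_TRUNCATED_PAGE)
		return ret;	/* page already unlocked by the helper */
	/* ... */
	unlock_page(page);	/* every other path unlocks here */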
252 | |||
253 | /* Note: Because we don't support holes, our allocation has | ||
254 | * already happened (allocation writes zeros to the file data) | ||
255 | * so we don't have to worry about ordered writes in | ||
256 | * ocfs2_writepage. | ||
257 | * | ||
258 | * ->writepage is called during the process of invalidating the page cache | ||
259 | * during blocked lock processing. It can't block on any cluster locks | ||
260 | * during block mapping. It's relying on the fact that the block | ||
261 | * mapping can't have disappeared under the dirty pages that it is | ||
262 | * being asked to write back. | ||
263 | */ | ||
264 | static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) | ||
265 | { | ||
266 | int ret; | ||
267 | |||
268 | mlog_entry("(0x%p)\n", page); | ||
269 | |||
270 | ret = block_write_full_page(page, ocfs2_get_block, wbc); | ||
271 | |||
272 | mlog_exit(ret); | ||
273 | |||
274 | return ret; | ||
275 | } | ||
276 | |||
277 | /* | ||
278 | * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called | ||
279 | * from loopback. It must be able to perform its own locking around | ||
280 | * ocfs2_get_block(). | ||
281 | */ | ||
282 | int ocfs2_prepare_write(struct file *file, struct page *page, | ||
283 | unsigned from, unsigned to) | ||
284 | { | ||
285 | struct inode *inode = page->mapping->host; | ||
286 | int ret; | ||
287 | |||
288 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); | ||
289 | |||
290 | ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page); | ||
291 | if (ret != 0) { | ||
292 | mlog_errno(ret); | ||
293 | goto out; | ||
294 | } | ||
295 | |||
296 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
297 | |||
298 | ret = block_prepare_write(page, from, to, ocfs2_get_block); | ||
299 | |||
300 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
301 | |||
302 | ocfs2_meta_unlock(inode, 0); | ||
303 | out: | ||
304 | mlog_exit(ret); | ||
305 | return ret; | ||
306 | } | ||
307 | |||
308 | /* Taken from ext3. We don't necessarily need the full blown | ||
309 | * functionality yet, but IMHO it's better to cut and paste the whole | ||
310 | * thing so we can avoid introducing our own bugs (and easily pick up | ||
311 | * their fixes when they happen) --Mark */ | ||
312 | static int walk_page_buffers( handle_t *handle, | ||
313 | struct buffer_head *head, | ||
314 | unsigned from, | ||
315 | unsigned to, | ||
316 | int *partial, | ||
317 | int (*fn)( handle_t *handle, | ||
318 | struct buffer_head *bh)) | ||
319 | { | ||
320 | struct buffer_head *bh; | ||
321 | unsigned block_start, block_end; | ||
322 | unsigned blocksize = head->b_size; | ||
323 | int err, ret = 0; | ||
324 | struct buffer_head *next; | ||
325 | |||
326 | for ( bh = head, block_start = 0; | ||
327 | ret == 0 && (bh != head || !block_start); | ||
328 | block_start = block_end, bh = next) | ||
329 | { | ||
330 | next = bh->b_this_page; | ||
331 | block_end = block_start + blocksize; | ||
332 | if (block_end <= from || block_start >= to) { | ||
333 | if (partial && !buffer_uptodate(bh)) | ||
334 | *partial = 1; | ||
335 | continue; | ||
336 | } | ||
337 | err = (*fn)(handle, bh); | ||
338 | if (!ret) | ||
339 | ret = err; | ||
340 | } | ||
341 | return ret; | ||
342 | } | ||
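The loop condition in walk_page_buffers() leans on a page's buffers forming a circular list: b_this_page eventually points back to the head. A bare walk of the same ring, for reference:

	struct buffer_head *bh = head;
	do {
		/* visit bh */
		bh = bh->b_this_page;
	} while (bh != head);

The "(bh != head || !block_start)" form above is the same traversal, arranged so the head buffer is visited on the first pass.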
343 | |||
344 | struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode, | ||
345 | struct page *page, | ||
346 | unsigned from, | ||
347 | unsigned to) | ||
348 | { | ||
349 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
350 | struct ocfs2_journal_handle *handle = NULL; | ||
351 | int ret = 0; | ||
352 | |||
353 | handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); | ||
354 | if (!handle) { | ||
355 | ret = -ENOMEM; | ||
356 | mlog_errno(ret); | ||
357 | goto out; | ||
358 | } | ||
359 | |||
360 | if (ocfs2_should_order_data(inode)) { | ||
361 | ret = walk_page_buffers(handle->k_handle, | ||
362 | page_buffers(page), | ||
363 | from, to, NULL, | ||
364 | ocfs2_journal_dirty_data); | ||
365 | if (ret < 0) | ||
366 | mlog_errno(ret); | ||
367 | } | ||
368 | out: | ||
369 | if (ret) { | ||
370 | if (handle) | ||
371 | ocfs2_commit_trans(handle); | ||
372 | handle = ERR_PTR(ret); | ||
373 | } | ||
374 | return handle; | ||
375 | } | ||
376 | |||
377 | static int ocfs2_commit_write(struct file *file, struct page *page, | ||
378 | unsigned from, unsigned to) | ||
379 | { | ||
380 | int ret, extending = 0, locklevel = 0; | ||
381 | loff_t new_i_size; | ||
382 | struct buffer_head *di_bh = NULL; | ||
383 | struct inode *inode = page->mapping->host; | ||
384 | struct ocfs2_journal_handle *handle = NULL; | ||
385 | |||
386 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); | ||
387 | |||
388 | /* NOTE: ocfs2_file_aio_write has ensured that it's safe for | ||
389 | * us to sample inode->i_size here without the metadata lock: | ||
390 | * | ||
391 | * 1) We're currently holding the inode alloc lock, so no | ||
392 | * nodes can change it underneath us. | ||
393 | * | ||
394 | * 2) We've had to take the metadata lock at least once | ||
395 | * already to check for extending writes, hence ensuring | ||
396 | * that our current copy is also up to date. | ||
397 | */ | ||
398 | new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; | ||
399 | if (new_i_size > i_size_read(inode)) { | ||
400 | extending = 1; | ||
401 | locklevel = 1; | ||
402 | } | ||
403 | |||
404 | ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page); | ||
405 | if (ret != 0) { | ||
406 | mlog_errno(ret); | ||
407 | goto out; | ||
408 | } | ||
409 | |||
410 | ret = ocfs2_data_lock_with_page(inode, 1, page); | ||
411 | if (ret != 0) { | ||
412 | mlog_errno(ret); | ||
413 | goto out_unlock_meta; | ||
414 | } | ||
415 | |||
416 | if (extending) { | ||
417 | handle = ocfs2_start_walk_page_trans(inode, page, from, to); | ||
418 | if (IS_ERR(handle)) { | ||
419 | ret = PTR_ERR(handle); | ||
420 | handle = NULL; | ||
421 | goto out_unlock_data; | ||
422 | } | ||
423 | |||
424 | /* Mark our buffer early. We'd rather catch this error up here | ||
425 | * as opposed to after a successful commit_write which would | ||
426 | * require us to set back inode->i_size. */ | ||
427 | ret = ocfs2_journal_access(handle, inode, di_bh, | ||
428 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
429 | if (ret < 0) { | ||
430 | mlog_errno(ret); | ||
431 | goto out_commit; | ||
432 | } | ||
433 | } | ||
434 | |||
435 | /* might update i_size */ | ||
436 | ret = generic_commit_write(file, page, from, to); | ||
437 | if (ret < 0) { | ||
438 | mlog_errno(ret); | ||
439 | goto out_commit; | ||
440 | } | ||
441 | |||
442 | if (extending) { | ||
443 | loff_t size = (u64) i_size_read(inode); | ||
444 | struct ocfs2_dinode *di = | ||
445 | (struct ocfs2_dinode *)di_bh->b_data; | ||
446 | |||
447 | /* ocfs2_mark_inode_dirty is too heavy to use here. */ | ||
448 | inode->i_blocks = ocfs2_align_bytes_to_sectors(size); | ||
449 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | ||
450 | |||
451 | di->i_size = cpu_to_le64(size); | ||
452 | di->i_ctime = di->i_mtime = | ||
453 | cpu_to_le64(inode->i_mtime.tv_sec); | ||
454 | di->i_ctime_nsec = di->i_mtime_nsec = | ||
455 | cpu_to_le32(inode->i_mtime.tv_nsec); | ||
456 | |||
457 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
458 | if (ret < 0) { | ||
459 | mlog_errno(ret); | ||
460 | goto out_commit; | ||
461 | } | ||
462 | } | ||
463 | |||
464 | BUG_ON(extending && (i_size_read(inode) != new_i_size)); | ||
465 | |||
466 | out_commit: | ||
467 | if (handle) | ||
468 | ocfs2_commit_trans(handle); | ||
469 | out_unlock_data: | ||
470 | ocfs2_data_unlock(inode, 1); | ||
471 | out_unlock_meta: | ||
472 | ocfs2_meta_unlock(inode, locklevel); | ||
473 | out: | ||
474 | if (di_bh) | ||
475 | brelse(di_bh); | ||
476 | |||
477 | mlog_exit(ret); | ||
478 | return ret; | ||
479 | } | ||
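A summary of the ordering ocfs2_commit_write() uses for the extending case, since each step depends on the previous one:

	ocfs2_journal_access(...);	/* 1. declare intent to modify the
					 *    dinode before anything changes */
	generic_commit_write(...);	/* 2. may update inode->i_size */
	di->i_size = cpu_to_le64(...);	/* 3. copy size and times into the
					 *    on-disk dinode, little-endian */
	ocfs2_journal_dirty(...);	/* 4. hand the dinode buffer to the
					 *    journal */

Declaring journal access first lets a failure surface before i_size has moved, as the comment above the call notes.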
480 | |||
481 | static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) | ||
482 | { | ||
483 | sector_t status; | ||
484 | u64 p_blkno = 0; | ||
485 | int err = 0; | ||
486 | struct inode *inode = mapping->host; | ||
487 | |||
488 | mlog_entry("(block = %llu)\n", (unsigned long long)block); | ||
489 | |||
490 | /* We don't need to lock journal system files, since they aren't | ||
491 | * accessed concurrently from multiple nodes. | ||
492 | */ | ||
493 | if (!INODE_JOURNAL(inode)) { | ||
494 | err = ocfs2_meta_lock(inode, NULL, NULL, 0); | ||
495 | if (err) { | ||
496 | if (err != -ENOENT) | ||
497 | mlog_errno(err); | ||
498 | goto bail; | ||
499 | } | ||
500 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
501 | } | ||
502 | |||
503 | err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, | ||
504 | NULL); | ||
505 | |||
506 | if (!INODE_JOURNAL(inode)) { | ||
507 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
508 | ocfs2_meta_unlock(inode, 0); | ||
509 | } | ||
510 | |||
511 | if (err) { | ||
512 | mlog(ML_ERROR, "get_blocks() failed, block = %llu\n", | ||
513 | (unsigned long long)block); | ||
514 | mlog_errno(err); | ||
515 | goto bail; | ||
516 | } | ||
517 | |||
518 | |||
519 | bail: | ||
520 | status = err ? 0 : p_blkno; | ||
521 | |||
522 | mlog_exit((int)status); | ||
523 | |||
524 | return status; | ||
525 | } | ||
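->bmap() is what backs the FIBMAP ioctl, which is why failure is reported as block 0 -- never a valid data block. A minimal userspace probe (illustrative; assumes fd is an open descriptor on an ocfs2 file, and note FIBMAP requires CAP_SYS_RAWIO):

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int blk = 0;	/* logical block in, physical block out */
	if (ioctl(fd, FIBMAP, &blk) == 0 && blk != 0)
		printf("logical block 0 -> disk block %d\n", blk);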
526 | |||
527 | /* | ||
528 | * TODO: Make this into a generic get_blocks function. | ||
529 | * | ||
530 | * From do_direct_io in direct-io.c: | ||
531 | * "So what we do is to permit the ->get_blocks function to populate | ||
532 | * bh.b_size with the size of IO which is permitted at this offset and | ||
533 | * this i_blkbits." | ||
534 | * | ||
535 | * This function is called directly from get_more_blocks in direct-io.c. | ||
536 | * | ||
537 | * called like this: dio->get_blocks(dio->inode, fs_startblk, | ||
538 | * fs_count, map_bh, dio->rw == WRITE); | ||
539 | */ | ||
540 | static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | ||
541 | unsigned long max_blocks, | ||
542 | struct buffer_head *bh_result, int create) | ||
543 | { | ||
544 | int ret; | ||
545 | u64 vbo_max; /* file offset, max_blocks from iblock */ | ||
546 | u64 p_blkno; | ||
547 | int contig_blocks; | ||
548 | unsigned char blocksize_bits; | ||
549 | |||
550 | if (!inode || !bh_result) { | ||
551 | mlog(ML_ERROR, "inode or bh_result is null\n"); | ||
552 | return -EIO; | ||
553 | } | ||
554 | |||
555 | blocksize_bits = inode->i_sb->s_blocksize_bits; | ||
556 | |||
557 | /* This function won't even be called if the request isn't all | ||
558 | * nicely aligned and of the right size, so there's no need | ||
559 | * for us to check any of that. */ | ||
560 | |||
561 | vbo_max = ((u64)iblock + max_blocks) << blocksize_bits; | ||
562 | |||
563 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
564 | if ((iblock + max_blocks) > | ||
565 | ocfs2_clusters_to_blocks(inode->i_sb, | ||
566 | OCFS2_I(inode)->ip_clusters)) { | ||
567 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
568 | ret = -EIO; | ||
569 | goto bail; | ||
570 | } | ||
571 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
572 | |||
573 | /* This figures out the size of the next contiguous block, and | ||
574 | * our logical offset */ | ||
575 | ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, | ||
576 | &contig_blocks); | ||
577 | if (ret) { | ||
578 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", | ||
579 | (unsigned long long)iblock); | ||
580 | ret = -EIO; | ||
581 | goto bail; | ||
582 | } | ||
583 | |||
584 | map_bh(bh_result, inode->i_sb, p_blkno); | ||
585 | |||
586 | /* make sure we don't map more than max_blocks blocks here as | ||
587 | * that's all the kernel will handle at this point. */ | ||
588 | if (max_blocks < contig_blocks) | ||
589 | contig_blocks = max_blocks; | ||
590 | bh_result->b_size = contig_blocks << blocksize_bits; | ||
591 | bail: | ||
592 | return ret; | ||
593 | } | ||
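A worked example of the b_size contract, assuming 4K filesystem blocks (blocksize_bits == 12): if the extent map reports contig_blocks == 8 and the caller passed max_blocks == 16, then

	bh_result->b_size = 8 << 12 = 32768

and the direct-io core may issue up to 32K of contiguous I/O at this offset before calling back for the next extent.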
594 | |||
595 | /* | ||
596 | * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're | ||
597 | * particularly interested in the aio/dio case. Like the core uses | ||
598 | * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from | ||
599 | * truncation on another. | ||
600 | */ | ||
601 | static void ocfs2_dio_end_io(struct kiocb *iocb, | ||
602 | loff_t offset, | ||
603 | ssize_t bytes, | ||
604 | void *private) | ||
605 | { | ||
606 | struct inode *inode = iocb->ki_filp->f_dentry->d_inode; | ||
607 | |||
608 | /* this io's submitter should not have unlocked this before we could */ | ||
609 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); | ||
610 | ocfs2_iocb_clear_rw_locked(iocb); | ||
611 | up_read(&inode->i_alloc_sem); | ||
612 | ocfs2_rw_unlock(inode, 0); | ||
613 | } | ||
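The three unlocks above mirror three steps taken on the submission side before the direct I/O is issued. The pairing, sketched (the real code lives in the ocfs2 file read/write paths, so the exact sequence here is illustrative):

	ocfs2_rw_lock(inode, 0);
	down_read(&inode->i_alloc_sem);
	ocfs2_iocb_set_rw_locked(iocb);
	/* ...submit the dio; ocfs2_dio_end_io() undoes all three... */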
614 | |||
615 | static ssize_t ocfs2_direct_IO(int rw, | ||
616 | struct kiocb *iocb, | ||
617 | const struct iovec *iov, | ||
618 | loff_t offset, | ||
619 | unsigned long nr_segs) | ||
620 | { | ||
621 | struct file *file = iocb->ki_filp; | ||
622 | struct inode *inode = file->f_dentry->d_inode->i_mapping->host; | ||
623 | int ret; | ||
624 | |||
625 | mlog_entry_void(); | ||
626 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, | ||
627 | inode->i_sb->s_bdev, iov, offset, | ||
628 | nr_segs, | ||
629 | ocfs2_direct_IO_get_blocks, | ||
630 | ocfs2_dio_end_io); | ||
631 | mlog_exit(ret); | ||
632 | return ret; | ||
633 | } | ||
634 | |||
635 | struct address_space_operations ocfs2_aops = { | ||
636 | .readpage = ocfs2_readpage, | ||
637 | .writepage = ocfs2_writepage, | ||
638 | .prepare_write = ocfs2_prepare_write, | ||
639 | .commit_write = ocfs2_commit_write, | ||
640 | .bmap = ocfs2_bmap, | ||
641 | .sync_page = block_sync_page, | ||
642 | .direct_IO = ocfs2_direct_IO | ||
643 | }; | ||
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h new file mode 100644 index 000000000000..d40456d509a0 --- /dev/null +++ b/fs/ocfs2/aops.h | |||
@@ -0,0 +1,41 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 02111-1307, USA. | ||
20 | */ | ||
21 | |||
22 | #ifndef OCFS2_AOPS_H | ||
23 | #define OCFS2_AOPS_H | ||
24 | |||
25 | int ocfs2_prepare_write(struct file *file, struct page *page, | ||
26 | unsigned from, unsigned to); | ||
27 | |||
28 | struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode, | ||
29 | struct page *page, | ||
30 | unsigned from, | ||
31 | unsigned to); | ||
32 | |||
33 | /* all ocfs2_dio_end_io()'s fault */ | ||
34 | #define ocfs2_iocb_is_rw_locked(iocb) \ | ||
35 | test_bit(0, (unsigned long *)&iocb->private) | ||
36 | #define ocfs2_iocb_set_rw_locked(iocb) \ | ||
37 | set_bit(0, (unsigned long *)&iocb->private) | ||
38 | #define ocfs2_iocb_clear_rw_locked(iocb) \ | ||
39 | clear_bit(0, (unsigned long *)&iocb->private) | ||
40 | |||
41 | #endif /* OCFS2_AOPS_H */ | ||
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c new file mode 100644 index 000000000000..d424041b38e9 --- /dev/null +++ b/fs/ocfs2/buffer_head_io.c | |||
@@ -0,0 +1,232 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * io.c | ||
5 | * | ||
6 | * Buffer cache handling | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | |||
31 | #include <cluster/masklog.h> | ||
32 | |||
33 | #include "ocfs2.h" | ||
34 | |||
35 | #include "alloc.h" | ||
36 | #include "inode.h" | ||
37 | #include "journal.h" | ||
38 | #include "uptodate.h" | ||
39 | |||
40 | #include "buffer_head_io.h" | ||
41 | |||
42 | int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, | ||
43 | struct inode *inode) | ||
44 | { | ||
45 | int ret = 0; | ||
46 | |||
47 | mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n", | ||
48 | (unsigned long long)bh->b_blocknr, inode); | ||
49 | |||
50 | BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); | ||
51 | BUG_ON(buffer_jbd(bh)); | ||
52 | |||
53 | /* No need to check for a soft readonly file system here. non | ||
54 | * journalled writes are only ever done on system files which | ||
55 | * can get modified during recovery even if read-only. */ | ||
56 | if (ocfs2_is_hard_readonly(osb)) { | ||
57 | ret = -EROFS; | ||
58 | goto out; | ||
59 | } | ||
60 | |||
61 | down(&OCFS2_I(inode)->ip_io_sem); | ||
62 | |||
63 | lock_buffer(bh); | ||
64 | set_buffer_uptodate(bh); | ||
65 | |||
66 | /* remove from dirty list before I/O. */ | ||
67 | clear_buffer_dirty(bh); | ||
68 | |||
69 | get_bh(bh); /* for end_buffer_write_sync() */ | ||
70 | bh->b_end_io = end_buffer_write_sync; | ||
71 | submit_bh(WRITE, bh); | ||
72 | |||
73 | wait_on_buffer(bh); | ||
74 | |||
75 | if (buffer_uptodate(bh)) { | ||
76 | ocfs2_set_buffer_uptodate(inode, bh); | ||
77 | } else { | ||
78 | /* We don't need to remove the clustered uptodate | ||
79 | * information for this bh as it's not marked locally | ||
80 | * uptodate. */ | ||
81 | ret = -EIO; | ||
82 | brelse(bh); | ||
83 | } | ||
84 | |||
85 | up(&OCFS2_I(inode)->ip_io_sem); | ||
86 | out: | ||
87 | mlog_exit(ret); | ||
88 | return ret; | ||
89 | } | ||
90 | |||
91 | int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, | ||
92 | struct buffer_head *bhs[], int flags, | ||
93 | struct inode *inode) | ||
94 | { | ||
95 | int status = 0; | ||
96 | struct super_block *sb; | ||
97 | int i, ignore_cache = 0; | ||
98 | struct buffer_head *bh; | ||
99 | |||
100 | mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n", | ||
101 | block, nr, flags, inode); | ||
102 | |||
103 | if (osb == NULL || osb->sb == NULL || bhs == NULL) { | ||
104 | status = -EINVAL; | ||
105 | mlog_errno(status); | ||
106 | goto bail; | ||
107 | } | ||
108 | |||
109 | if (nr < 0) { | ||
110 | mlog(ML_ERROR, "asked to read %d blocks!\n", nr); | ||
111 | status = -EINVAL; | ||
112 | mlog_errno(status); | ||
113 | goto bail; | ||
114 | } | ||
115 | |||
116 | if (nr == 0) { | ||
117 | mlog(ML_BH_IO, "No buffers will be read!\n"); | ||
118 | status = 0; | ||
119 | goto bail; | ||
120 | } | ||
121 | |||
122 | sb = osb->sb; | ||
123 | |||
124 | if (flags & OCFS2_BH_CACHED && !inode) | ||
125 | flags &= ~OCFS2_BH_CACHED; | ||
126 | |||
127 | if (inode) | ||
128 | down(&OCFS2_I(inode)->ip_io_sem); | ||
129 | for (i = 0 ; i < nr ; i++) { | ||
130 | if (bhs[i] == NULL) { | ||
131 | bhs[i] = sb_getblk(sb, block++); | ||
132 | if (bhs[i] == NULL) { | ||
133 | if (inode) | ||
134 | up(&OCFS2_I(inode)->ip_io_sem); | ||
135 | status = -EIO; | ||
136 | mlog_errno(status); | ||
137 | goto bail; | ||
138 | } | ||
139 | } | ||
140 | bh = bhs[i]; | ||
141 | ignore_cache = 0; | ||
142 | |||
143 | if (flags & OCFS2_BH_CACHED && | ||
144 | !ocfs2_buffer_uptodate(inode, bh)) { | ||
145 | mlog(ML_UPTODATE, | ||
146 | "bh (%llu), inode %"MLFu64" not uptodate\n", | ||
147 | (unsigned long long)bh->b_blocknr, | ||
148 | OCFS2_I(inode)->ip_blkno); | ||
149 | ignore_cache = 1; | ||
150 | } | ||
151 | |||
152 | /* XXX: Can we ever get this and *not* have the cached | ||
153 | * flag set? */ | ||
154 | if (buffer_jbd(bh)) { | ||
155 | if (!(flags & OCFS2_BH_CACHED) || ignore_cache) | ||
156 | mlog(ML_BH_IO, "trying to sync read a jbd " | ||
157 | "managed bh (blocknr = %llu)\n", | ||
158 | (unsigned long long)bh->b_blocknr); | ||
159 | continue; | ||
160 | } | ||
161 | |||
162 | if (!(flags & OCFS2_BH_CACHED) || ignore_cache) { | ||
163 | if (buffer_dirty(bh)) { | ||
164 | /* This should probably be a BUG, or | ||
165 | * at least return an error. */ | ||
166 | mlog(ML_BH_IO, "asking me to sync read a dirty " | ||
167 | "buffer! (blocknr = %llu)\n", | ||
168 | (unsigned long long)bh->b_blocknr); | ||
169 | continue; | ||
170 | } | ||
171 | |||
172 | lock_buffer(bh); | ||
173 | if (buffer_jbd(bh)) { | ||
174 | #ifdef CATCH_BH_JBD_RACES | ||
175 | mlog(ML_ERROR, "block %llu had the JBD bit set " | ||
176 | "while I was in lock_buffer!", | ||
177 | (unsigned long long)bh->b_blocknr); | ||
178 | BUG(); | ||
179 | #else | ||
180 | unlock_buffer(bh); | ||
181 | continue; | ||
182 | #endif | ||
183 | } | ||
184 | clear_buffer_uptodate(bh); | ||
185 | get_bh(bh); /* for end_buffer_read_sync() */ | ||
186 | bh->b_end_io = end_buffer_read_sync; | ||
187 | if (flags & OCFS2_BH_READAHEAD) | ||
188 | submit_bh(READA, bh); | ||
189 | else | ||
190 | submit_bh(READ, bh); | ||
191 | continue; | ||
192 | } | ||
193 | } | ||
194 | |||
195 | status = 0; | ||
196 | |||
197 | for (i = (nr - 1); i >= 0; i--) { | ||
198 | bh = bhs[i]; | ||
199 | |||
200 | /* We know this can't have changed as we hold the | ||
201 | * inode sem. Avoid doing any work on the bh if the | ||
202 | * journal has it. */ | ||
203 | if (!buffer_jbd(bh)) | ||
204 | wait_on_buffer(bh); | ||
205 | |||
206 | if (!buffer_uptodate(bh)) { | ||
207 | /* Status won't be cleared from here on out, | ||
208 | * so we can safely record this and loop back | ||
209 | * to cleanup the other buffers. Don't need to | ||
210 | * remove the clustered uptodate information | ||
211 | * for this bh as it's not marked locally | ||
212 | * uptodate. */ | ||
213 | status = -EIO; | ||
214 | brelse(bh); | ||
215 | bhs[i] = NULL; | ||
216 | continue; | ||
217 | } | ||
218 | |||
219 | if (inode) | ||
220 | ocfs2_set_buffer_uptodate(inode, bh); | ||
221 | } | ||
222 | if (inode) | ||
223 | up(&OCFS2_I(inode)->ip_io_sem); | ||
224 | |||
225 | mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr, | ||
226 | (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes"); | ||
227 | |||
228 | bail: | ||
229 | |||
230 | mlog_exit(status); | ||
231 | return status; | ||
232 | } | ||
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h new file mode 100644 index 000000000000..6ecb90937b68 --- /dev/null +++ b/fs/ocfs2/buffer_head_io.h | |||
@@ -0,0 +1,73 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ocfs2_buffer_head.h | ||
5 | * | ||
6 | * Buffer cache handling functions defined | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_BUFFER_HEAD_IO_H | ||
27 | #define OCFS2_BUFFER_HEAD_IO_H | ||
28 | |||
29 | #include <linux/buffer_head.h> | ||
30 | |||
31 | void ocfs2_end_buffer_io_sync(struct buffer_head *bh, | ||
32 | int uptodate); | ||
33 | |||
34 | static inline int ocfs2_read_block(struct ocfs2_super *osb, | ||
35 | u64 off, | ||
36 | struct buffer_head **bh, | ||
37 | int flags, | ||
38 | struct inode *inode); | ||
39 | |||
40 | int ocfs2_write_block(struct ocfs2_super *osb, | ||
41 | struct buffer_head *bh, | ||
42 | struct inode *inode); | ||
43 | int ocfs2_read_blocks(struct ocfs2_super *osb, | ||
44 | u64 block, | ||
45 | int nr, | ||
46 | struct buffer_head *bhs[], | ||
47 | int flags, | ||
48 | struct inode *inode); | ||
49 | |||
50 | |||
51 | #define OCFS2_BH_CACHED 1 | ||
52 | #define OCFS2_BH_READAHEAD 8 /* use this to pass READA down to submit_bh */ | ||
53 | |||
54 | static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, | ||
55 | struct buffer_head **bh, int flags, | ||
56 | struct inode *inode) | ||
57 | { | ||
58 | int status = 0; | ||
59 | |||
60 | if (bh == NULL) { | ||
61 | printk(KERN_ERR "ocfs2: bh == NULL\n"); | ||
62 | status = -EINVAL; | ||
63 | goto bail; | ||
64 | } | ||
65 | |||
66 | status = ocfs2_read_blocks(osb, off, 1, bh, | ||
67 | flags, inode); | ||
68 | |||
69 | bail: | ||
70 | return status; | ||
71 | } | ||
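Typical use of this wrapper for a single metadata block (a sketch; the error handling is elided):

	struct buffer_head *bh = NULL;

	status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, inode);
	if (status < 0)
		goto out;
	/* ...use bh->b_data... */
	brelse(bh);

Passing *bh == NULL asks ocfs2_read_blocks() to sb_getblk() the buffer itself; a non-NULL *bh is reused as-is.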
72 | |||
73 | #endif /* OCFS2_BUFFER_HEAD_IO_H */ | ||
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile new file mode 100644 index 000000000000..cdd162f13650 --- /dev/null +++ b/fs/ocfs2/cluster/Makefile | |||
@@ -0,0 +1,4 @@ | |||
1 | obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o | ||
2 | |||
3 | ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ | ||
4 | quorum.o tcp.o ver.o | ||
diff --git a/fs/ocfs2/cluster/endian.h b/fs/ocfs2/cluster/endian.h new file mode 100644 index 000000000000..2df9082f4e35 --- /dev/null +++ b/fs/ocfs2/cluster/endian.h | |||
@@ -0,0 +1,30 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * Copyright (C) 2005 Oracle. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 02111-1307, USA. | ||
20 | */ | ||
21 | |||
22 | #ifndef OCFS2_CLUSTER_ENDIAN_H | ||
23 | #define OCFS2_CLUSTER_ENDIAN_H | ||
24 | |||
25 | static inline void be32_add_cpu(__be32 *var, u32 val) | ||
26 | { | ||
27 | *var = cpu_to_be32(be32_to_cpu(*var) + val); | ||
28 | } | ||
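For example, bumping a big-endian on-disk counter in place (h_count here is a hypothetical __be32 field, shown only to illustrate usage):

	be32_add_cpu(&hdr->h_count, 1);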
29 | |||
30 | #endif /* OCFS2_CLUSTER_ENDIAN_H */ | ||
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c new file mode 100644 index 000000000000..7307ba528913 --- /dev/null +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -0,0 +1,1797 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * Copyright (C) 2004, 2005 Oracle. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 02111-1307, USA. | ||
20 | */ | ||
21 | |||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/sched.h> | ||
24 | #include <linux/jiffies.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/fs.h> | ||
27 | #include <linux/bio.h> | ||
28 | #include <linux/blkdev.h> | ||
29 | #include <linux/delay.h> | ||
30 | #include <linux/file.h> | ||
31 | #include <linux/kthread.h> | ||
32 | #include <linux/configfs.h> | ||
33 | #include <linux/random.h> | ||
34 | #include <linux/crc32.h> | ||
35 | #include <linux/time.h> | ||
36 | |||
37 | #include "heartbeat.h" | ||
38 | #include "tcp.h" | ||
39 | #include "nodemanager.h" | ||
40 | #include "quorum.h" | ||
41 | |||
42 | #include "masklog.h" | ||
43 | |||
44 | |||
45 | /* | ||
46 | * The first heartbeat pass had one global thread that would serialize all hb | ||
47 | * callback calls. This global serializing sem should only be removed once | ||
48 | * we've made sure that all callees can deal with being called concurrently | ||
49 | * from multiple hb region threads. | ||
50 | */ | ||
51 | static DECLARE_RWSEM(o2hb_callback_sem); | ||
52 | |||
53 | /* | ||
54 | * multiple hb threads are watching multiple regions. A node is live | ||
55 | * whenever any of the threads sees activity from the node in its region. | ||
56 | */ | ||
57 | static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED; | ||
58 | static struct list_head o2hb_live_slots[O2NM_MAX_NODES]; | ||
59 | static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
60 | static LIST_HEAD(o2hb_node_events); | ||
61 | static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); | ||
62 | |||
63 | static LIST_HEAD(o2hb_all_regions); | ||
64 | |||
65 | static struct o2hb_callback { | ||
66 | struct list_head list; | ||
67 | } o2hb_callbacks[O2HB_NUM_CB]; | ||
68 | |||
69 | static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); | ||
70 | |||
71 | #define O2HB_DEFAULT_BLOCK_BITS 9 | ||
72 | |||
73 | unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; | ||
74 | |||
75 | /* Only sets a new threshold if there are no active regions. | ||
76 | * | ||
77 | * No locking or otherwise interesting code is required for reading | ||
78 | * o2hb_dead_threshold as it can't change once regions are active and | ||
79 | * it's not interesting to anyone until then anyway. */ | ||
80 | static void o2hb_dead_threshold_set(unsigned int threshold) | ||
81 | { | ||
82 | if (threshold > O2HB_MIN_DEAD_THRESHOLD) { | ||
83 | spin_lock(&o2hb_live_lock); | ||
84 | if (list_empty(&o2hb_all_regions)) | ||
85 | o2hb_dead_threshold = threshold; | ||
86 | spin_unlock(&o2hb_live_lock); | ||
87 | } | ||
88 | } | ||
89 | |||
90 | struct o2hb_node_event { | ||
91 | struct list_head hn_item; | ||
92 | enum o2hb_callback_type hn_event_type; | ||
93 | struct o2nm_node *hn_node; | ||
94 | int hn_node_num; | ||
95 | }; | ||
96 | |||
97 | struct o2hb_disk_slot { | ||
98 | struct o2hb_disk_heartbeat_block *ds_raw_block; | ||
99 | u8 ds_node_num; | ||
100 | u64 ds_last_time; | ||
101 | u64 ds_last_generation; | ||
102 | u16 ds_equal_samples; | ||
103 | u16 ds_changed_samples; | ||
104 | struct list_head ds_live_item; | ||
105 | }; | ||
106 | |||
107 | /* each thread owns a region.. when we're asked to tear down the region | ||
108 | * we ask the thread to stop, who cleans up the region */ | ||
109 | struct o2hb_region { | ||
110 | struct config_item hr_item; | ||
111 | |||
112 | struct list_head hr_all_item; | ||
113 | unsigned hr_unclean_stop:1; | ||
114 | |||
115 | /* protected by the hr_callback_sem */ | ||
116 | struct task_struct *hr_task; | ||
117 | |||
118 | unsigned int hr_blocks; | ||
119 | unsigned long long hr_start_block; | ||
120 | |||
121 | unsigned int hr_block_bits; | ||
122 | unsigned int hr_block_bytes; | ||
123 | |||
124 | unsigned int hr_slots_per_page; | ||
125 | unsigned int hr_num_pages; | ||
126 | |||
127 | struct page **hr_slot_data; | ||
128 | struct block_device *hr_bdev; | ||
129 | struct o2hb_disk_slot *hr_slots; | ||
130 | |||
131 | /* let the person setting up hb wait for it to return until it | ||
132 | * has reached a 'steady' state. This will be fixed when we have | ||
133 | * a more complete api that doesn't lead to this sort of fragility. */ | ||
134 | atomic_t hr_steady_iterations; | ||
135 | |||
136 | char hr_dev_name[BDEVNAME_SIZE]; | ||
137 | |||
138 | unsigned int hr_timeout_ms; | ||
139 | |||
140 | /* randomized as the region goes up and down so that a node | ||
141 | * recognizes a node going up and down in one iteration */ | ||
142 | u64 hr_generation; | ||
143 | |||
144 | struct work_struct hr_write_timeout_work; | ||
145 | unsigned long hr_last_timeout_start; | ||
146 | |||
147 | /* Used during o2hb_check_slot to hold a copy of the block | ||
148 | * being checked because we temporarily have to zero out the | ||
149 | * crc field. */ | ||
150 | struct o2hb_disk_heartbeat_block *hr_tmp_block; | ||
151 | }; | ||
152 | |||
153 | struct o2hb_bio_wait_ctxt { | ||
154 | atomic_t wc_num_reqs; | ||
155 | struct completion wc_io_complete; | ||
156 | }; | ||
157 | |||
158 | static void o2hb_write_timeout(void *arg) | ||
159 | { | ||
160 | struct o2hb_region *reg = arg; | ||
161 | |||
162 | mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " | ||
163 | "milliseconds\n", reg->hr_dev_name, | ||
164 | jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); | ||
165 | o2quo_disk_timeout(); | ||
166 | } | ||
167 | |||
168 | static void o2hb_arm_write_timeout(struct o2hb_region *reg) | ||
169 | { | ||
170 | mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); | ||
171 | |||
172 | cancel_delayed_work(®->hr_write_timeout_work); | ||
173 | reg->hr_last_timeout_start = jiffies; | ||
174 | schedule_delayed_work(®->hr_write_timeout_work, | ||
175 | msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS)); | ||
176 | } | ||
177 | |||
178 | static void o2hb_disarm_write_timeout(struct o2hb_region *reg) | ||
179 | { | ||
180 | cancel_delayed_work(®->hr_write_timeout_work); | ||
181 | flush_scheduled_work(); | ||
182 | } | ||
183 | |||
184 | static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, | ||
185 | unsigned int num_ios) | ||
186 | { | ||
187 | atomic_set(&wc->wc_num_reqs, num_ios); | ||
188 | init_completion(&wc->wc_io_complete); | ||
189 | } | ||
190 | |||
191 | /* Used in error paths too */ | ||
192 | static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc, | ||
193 | unsigned int num) | ||
194 | { | ||
195 | /* sadly atomic_sub_and_test() isn't available on all platforms. The | ||
196 | * good news is that the fast path only completes one at a time */ | ||
197 | while(num--) { | ||
198 | if (atomic_dec_and_test(&wc->wc_num_reqs)) { | ||
199 | BUG_ON(num > 0); | ||
200 | complete(&wc->wc_io_complete); | ||
201 | } | ||
202 | } | ||
203 | } | ||
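This is a counting-completion pattern: the context is armed with the number of bios to be issued, each bio completion decrements the count, and the submitter blocks until it reaches zero:

	o2hb_bio_wait_init(&wc, num_bios);	/* count = num_bios */
	/* each bio's end_io: */
	o2hb_bio_wait_dec(&wc, 1);
	/* submitter: */
	wait_for_completion(&wc.wc_io_complete);

On a partial submission failure the error path decrements by the number of bios that were never issued, so the waiter cannot hang.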
204 | |||
205 | static void o2hb_wait_on_io(struct o2hb_region *reg, | ||
206 | struct o2hb_bio_wait_ctxt *wc) | ||
207 | { | ||
208 | struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping; | ||
209 | |||
210 | blk_run_address_space(mapping); | ||
211 | |||
212 | wait_for_completion(&wc->wc_io_complete); | ||
213 | } | ||
214 | |||
215 | static int o2hb_bio_end_io(struct bio *bio, | ||
216 | unsigned int bytes_done, | ||
217 | int error) | ||
218 | { | ||
219 | struct o2hb_bio_wait_ctxt *wc = bio->bi_private; | ||
220 | |||
221 | if (error) | ||
222 | mlog(ML_ERROR, "IO Error %d\n", error); | ||
223 | |||
224 | if (bio->bi_size) | ||
225 | return 1; | ||
226 | |||
227 | o2hb_bio_wait_dec(wc, 1); | ||
228 | return 0; | ||
229 | } | ||
230 | |||
231 | /* Setup a Bio to cover I/O against num_slots slots starting at | ||
232 | * start_slot. */ | ||
233 | static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, | ||
234 | struct o2hb_bio_wait_ctxt *wc, | ||
235 | unsigned int start_slot, | ||
236 | unsigned int num_slots) | ||
237 | { | ||
238 | int i, nr_vecs, len, first_page, last_page; | ||
239 | unsigned int vec_len, vec_start; | ||
240 | unsigned int bits = reg->hr_block_bits; | ||
241 | unsigned int spp = reg->hr_slots_per_page; | ||
242 | struct bio *bio; | ||
243 | struct page *page; | ||
244 | |||
245 | nr_vecs = (num_slots + spp - 1) / spp; | ||
246 | |||
247 | /* Testing has shown this allocation to take long enough under | ||
248 | * GFP_KERNEL that the local node can get fenced. It would be | ||
249 | * nicest if we could pre-allocate these bios and avoid this | ||
250 | * altogether. */ | ||
251 | bio = bio_alloc(GFP_ATOMIC, nr_vecs); | ||
252 | if (!bio) { | ||
253 | mlog(ML_ERROR, "Could not alloc slots BIO!\n"); | ||
254 | bio = ERR_PTR(-ENOMEM); | ||
255 | goto bail; | ||
256 | } | ||
257 | |||
258 | /* Must put everything in 512 byte sectors for the bio... */ | ||
259 | bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9); | ||
260 | bio->bi_bdev = reg->hr_bdev; | ||
261 | bio->bi_private = wc; | ||
262 | bio->bi_end_io = o2hb_bio_end_io; | ||
263 | |||
264 | first_page = start_slot / spp; | ||
265 | last_page = first_page + nr_vecs; | ||
266 | vec_start = (start_slot << bits) % PAGE_CACHE_SIZE; | ||
267 | for(i = first_page; i < last_page; i++) { | ||
268 | page = reg->hr_slot_data[i]; | ||
269 | |||
270 | vec_len = PAGE_CACHE_SIZE; | ||
271 | /* last page might be short */ | ||
272 | if (((i + 1) * spp) > (start_slot + num_slots)) | ||
273 | vec_len = ((num_slots + start_slot) % spp) << bits; | ||
274 | vec_len -= vec_start; | ||
275 | |||
276 | mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", | ||
277 | i, vec_len, vec_start); | ||
278 | |||
279 | len = bio_add_page(bio, page, vec_len, vec_start); | ||
280 | if (len != vec_len) { | ||
281 | bio_put(bio); | ||
282 | bio = ERR_PTR(-EIO); | ||
283 | |||
284 | mlog(ML_ERROR, "Error adding page to bio i = %d, " | ||
285 | "vec_len = %u, len = %d\n, start = %u\n", | ||
286 | i, vec_len, len, vec_start); | ||
287 | goto bail; | ||
288 | } | ||
289 | |||
290 | vec_start = 0; | ||
291 | } | ||
292 | |||
293 | bail: | ||
294 | return bio; | ||
295 | } | ||
296 | |||
297 | /* | ||
298 | * Compute the maximum number of sectors the bdev can handle in one bio, | ||
299 | * as a power of two. | ||
300 | * | ||
301 | * Stolen from oracleasm, thanks Joel! | ||
302 | */ | ||
303 | static int compute_max_sectors(struct block_device *bdev) | ||
304 | { | ||
305 | int max_pages, max_sectors, pow_two_sectors; | ||
306 | |||
307 | struct request_queue *q; | ||
308 | |||
309 | q = bdev_get_queue(bdev); | ||
310 | max_pages = q->max_sectors >> (PAGE_SHIFT - 9); | ||
311 | if (max_pages > BIO_MAX_PAGES) | ||
312 | max_pages = BIO_MAX_PAGES; | ||
313 | if (max_pages > q->max_phys_segments) | ||
314 | max_pages = q->max_phys_segments; | ||
315 | if (max_pages > q->max_hw_segments) | ||
316 | max_pages = q->max_hw_segments; | ||
317 | max_pages--; /* Handle I/Os that straddle a page */ | ||
318 | |||
319 | max_sectors = max_pages << (PAGE_SHIFT - 9); | ||
320 | |||
321 | /* Why is fls() 1-based???? */ | ||
322 | pow_two_sectors = 1 << (fls(max_sectors) - 1); | ||
323 | |||
324 | return pow_two_sectors; | ||
325 | } | ||
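A concrete example: a queue that allows 300 sectors per bio gives fls(300) == 9 (300 needs nine bits), so pow_two_sectors == 1 << 8 == 256 -- the largest power of two not exceeding the limit.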
326 | |||
327 | static inline void o2hb_compute_request_limits(struct o2hb_region *reg, | ||
328 | unsigned int num_slots, | ||
329 | unsigned int *num_bios, | ||
330 | unsigned int *slots_per_bio) | ||
331 | { | ||
332 | unsigned int max_sectors, io_sectors; | ||
333 | |||
334 | max_sectors = compute_max_sectors(reg->hr_bdev); | ||
335 | |||
336 | io_sectors = num_slots << (reg->hr_block_bits - 9); | ||
337 | |||
338 | *num_bios = (io_sectors + max_sectors - 1) / max_sectors; | ||
339 | *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9); | ||
340 | |||
341 | mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This " | ||
342 | "device can handle %u sectors of I/O\n", io_sectors, num_slots, | ||
343 | max_sectors); | ||
344 | mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n", | ||
345 | *num_bios, *slots_per_bio); | ||
346 | } | ||
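Worked example, assuming 512-byte heartbeat blocks (hr_block_bits == 9) and a device limit of 256 sectors per bio: reading 255 slots gives io_sectors == 255 << 0 == 255, so

	num_bios      = (255 + 256 - 1) / 256 = 1
	slots_per_bio = 256 >> 0              = 256

i.e. all 255 slots fit in a single bio.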
347 | |||
348 | static int o2hb_read_slots(struct o2hb_region *reg, | ||
349 | unsigned int max_slots) | ||
350 | { | ||
351 | unsigned int num_bios, slots_per_bio, start_slot, num_slots; | ||
352 | int i, status; | ||
353 | struct o2hb_bio_wait_ctxt wc; | ||
354 | struct bio **bios; | ||
355 | struct bio *bio; | ||
356 | |||
357 | o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio); | ||
358 | |||
359 | bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL); | ||
360 | if (!bios) { | ||
361 | status = -ENOMEM; | ||
362 | mlog_errno(status); | ||
363 | return status; | ||
364 | } | ||
365 | |||
366 | o2hb_bio_wait_init(&wc, num_bios); | ||
367 | |||
368 | num_slots = slots_per_bio; | ||
369 | for(i = 0; i < num_bios; i++) { | ||
370 | start_slot = i * slots_per_bio; | ||
371 | |||
372 | /* adjust num_slots at last bio */ | ||
373 | if (max_slots < (start_slot + num_slots)) | ||
374 | num_slots = max_slots - start_slot; | ||
375 | |||
376 | bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots); | ||
377 | if (IS_ERR(bio)) { | ||
378 | o2hb_bio_wait_dec(&wc, num_bios - i); | ||
379 | |||
380 | status = PTR_ERR(bio); | ||
381 | mlog_errno(status); | ||
382 | goto bail_and_wait; | ||
383 | } | ||
384 | bios[i] = bio; | ||
385 | |||
386 | submit_bio(READ, bio); | ||
387 | } | ||
388 | |||
389 | status = 0; | ||
390 | |||
391 | bail_and_wait: | ||
392 | o2hb_wait_on_io(reg, &wc); | ||
393 | |||
394 | if (bios) { | ||
395 | for(i = 0; i < num_bios; i++) | ||
396 | if (bios[i]) | ||
397 | bio_put(bios[i]); | ||
398 | kfree(bios); | ||
399 | } | ||
400 | |||
401 | return status; | ||
402 | } | ||
403 | |||
404 | static int o2hb_issue_node_write(struct o2hb_region *reg, | ||
405 | struct bio **write_bio, | ||
406 | struct o2hb_bio_wait_ctxt *write_wc) | ||
407 | { | ||
408 | int status; | ||
409 | unsigned int slot; | ||
410 | struct bio *bio; | ||
411 | |||
412 | o2hb_bio_wait_init(write_wc, 1); | ||
413 | |||
414 | slot = o2nm_this_node(); | ||
415 | |||
416 | bio = o2hb_setup_one_bio(reg, write_wc, slot, 1); | ||
417 | if (IS_ERR(bio)) { | ||
418 | status = PTR_ERR(bio); | ||
419 | mlog_errno(status); | ||
420 | goto bail; | ||
421 | } | ||
422 | |||
423 | submit_bio(WRITE, bio); | ||
424 | |||
425 | *write_bio = bio; | ||
426 | status = 0; | ||
427 | bail: | ||
428 | return status; | ||
429 | } | ||
430 | |||
431 | static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg, | ||
432 | struct o2hb_disk_heartbeat_block *hb_block) | ||
433 | { | ||
434 | __le32 old_cksum; | ||
435 | u32 ret; | ||
436 | |||
437 | /* We want to compute the block crc with a 0 value in the | ||
438 | * hb_cksum field. Save it off here and replace after the | ||
439 | * crc. */ | ||
440 | old_cksum = hb_block->hb_cksum; | ||
441 | hb_block->hb_cksum = 0; | ||
442 | |||
443 | ret = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes); | ||
444 | |||
445 | hb_block->hb_cksum = old_cksum; | ||
446 | |||
447 | return ret; | ||
448 | } | ||
449 | |||
450 | static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block) | ||
451 | { | ||
452 | mlog(ML_ERROR, "Dump slot information: seq = 0x%"MLFx64", node = %u, " | ||
453 | "cksum = 0x%x, generation 0x%"MLFx64"\n", | ||
454 | le64_to_cpu(hb_block->hb_seq), hb_block->hb_node, | ||
455 | le32_to_cpu(hb_block->hb_cksum), | ||
456 | le64_to_cpu(hb_block->hb_generation)); | ||
457 | } | ||
458 | |||
459 | static int o2hb_verify_crc(struct o2hb_region *reg, | ||
460 | struct o2hb_disk_heartbeat_block *hb_block) | ||
461 | { | ||
462 | u32 read, computed; | ||
463 | |||
464 | read = le32_to_cpu(hb_block->hb_cksum); | ||
465 | computed = o2hb_compute_block_crc_le(reg, hb_block); | ||
466 | |||
467 | return read == computed; | ||
468 | } | ||
469 | |||
470 | /* We want to make sure that nobody is heartbeating on top of us -- | ||
471 | * this will help detect an invalid configuration. */ | ||
472 | static int o2hb_check_last_timestamp(struct o2hb_region *reg) | ||
473 | { | ||
474 | int node_num, ret; | ||
475 | struct o2hb_disk_slot *slot; | ||
476 | struct o2hb_disk_heartbeat_block *hb_block; | ||
477 | |||
478 | node_num = o2nm_this_node(); | ||
479 | |||
480 | ret = 1; | ||
481 | slot = ®->hr_slots[node_num]; | ||
482 | /* Don't check on our 1st timestamp */ | ||
483 | if (slot->ds_last_time) { | ||
484 | hb_block = slot->ds_raw_block; | ||
485 | |||
486 | if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time) | ||
487 | ret = 0; | ||
488 | } | ||
489 | |||
490 | return ret; | ||
491 | } | ||
492 | |||
493 | static inline void o2hb_prepare_block(struct o2hb_region *reg, | ||
494 | u64 generation) | ||
495 | { | ||
496 | int node_num; | ||
497 | u64 cputime; | ||
498 | struct o2hb_disk_slot *slot; | ||
499 | struct o2hb_disk_heartbeat_block *hb_block; | ||
500 | |||
501 | node_num = o2nm_this_node(); | ||
502 | slot = ®->hr_slots[node_num]; | ||
503 | |||
504 | hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block; | ||
505 | memset(hb_block, 0, reg->hr_block_bytes); | ||
506 | /* TODO: time stuff */ | ||
507 | cputime = CURRENT_TIME.tv_sec; | ||
508 | if (!cputime) | ||
509 | cputime = 1; | ||
510 | |||
511 | hb_block->hb_seq = cpu_to_le64(cputime); | ||
512 | hb_block->hb_node = node_num; | ||
513 | hb_block->hb_generation = cpu_to_le64(generation); | ||
514 | |||
515 | /* This step must always happen last! */ | ||
516 | hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg, | ||
517 | hb_block)); | ||
518 | |||
519 | mlog(ML_HB_BIO, "our node generation = 0x%"MLFx64", cksum = 0x%x\n", | ||
520 | cpu_to_le64(generation), le32_to_cpu(hb_block->hb_cksum)); | ||
521 | } | ||
522 | |||
523 | static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, | ||
524 | struct o2nm_node *node, | ||
525 | int idx) | ||
526 | { | ||
527 | struct list_head *iter; | ||
528 | struct o2hb_callback_func *f; | ||
529 | |||
530 | list_for_each(iter, &hbcall->list) { | ||
531 | f = list_entry(iter, struct o2hb_callback_func, hc_item); | ||
532 | mlog(ML_HEARTBEAT, "calling funcs %p\n", f); | ||
533 | (f->hc_func)(node, idx, f->hc_data); | ||
534 | } | ||
535 | } | ||
536 | |||
537 | /* Will run the list in order until we process the passed event */ | ||
538 | static void o2hb_run_event_list(struct o2hb_node_event *queued_event) | ||
539 | { | ||
540 | int empty; | ||
541 | struct o2hb_callback *hbcall; | ||
542 | struct o2hb_node_event *event; | ||
543 | |||
544 | spin_lock(&o2hb_live_lock); | ||
545 | empty = list_empty(&queued_event->hn_item); | ||
546 | spin_unlock(&o2hb_live_lock); | ||
547 | if (empty) | ||
548 | return; | ||
549 | |||
550 | /* Holding callback sem assures we don't alter the callback | ||
551 | * lists when doing this, and serializes ourselves with other | ||
552 | * processes wanting callbacks. */ | ||
553 | down_write(&o2hb_callback_sem); | ||
554 | |||
555 | spin_lock(&o2hb_live_lock); | ||
556 | while (!list_empty(&o2hb_node_events) | ||
557 | && !list_empty(&queued_event->hn_item)) { | ||
558 | event = list_entry(o2hb_node_events.next, | ||
559 | struct o2hb_node_event, | ||
560 | hn_item); | ||
561 | list_del_init(&event->hn_item); | ||
562 | spin_unlock(&o2hb_live_lock); | ||
563 | |||
564 | mlog(ML_HEARTBEAT, "Node %s event for %d\n", | ||
565 | event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN", | ||
566 | event->hn_node_num); | ||
567 | |||
568 | hbcall = hbcall_from_type(event->hn_event_type); | ||
569 | |||
570 | /* We should *never* have gotten on to the list with a | ||
571 | * bad type... This isn't something that we should try | ||
572 | * to recover from. */ | ||
573 | BUG_ON(IS_ERR(hbcall)); | ||
574 | |||
575 | o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num); | ||
576 | |||
577 | spin_lock(&o2hb_live_lock); | ||
578 | } | ||
579 | spin_unlock(&o2hb_live_lock); | ||
580 | |||
581 | up_write(&o2hb_callback_sem); | ||
582 | } | ||
583 | |||
584 | static void o2hb_queue_node_event(struct o2hb_node_event *event, | ||
585 | enum o2hb_callback_type type, | ||
586 | struct o2nm_node *node, | ||
587 | int node_num) | ||
588 | { | ||
589 | assert_spin_locked(&o2hb_live_lock); | ||
590 | |||
591 | event->hn_event_type = type; | ||
592 | event->hn_node = node; | ||
593 | event->hn_node_num = node_num; | ||
594 | |||
595 | mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n", | ||
596 | type == O2HB_NODE_UP_CB ? "UP" : "DOWN", node_num); | ||
597 | |||
598 | list_add_tail(&event->hn_item, &o2hb_node_events); | ||
599 | } | ||
600 | |||
601 | static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) | ||
602 | { | ||
603 | struct o2hb_node_event event = | ||
604 | { .hn_item = LIST_HEAD_INIT(event.hn_item), }; | ||
605 | struct o2nm_node *node; | ||
606 | |||
607 | node = o2nm_get_node_by_num(slot->ds_node_num); | ||
608 | if (!node) | ||
609 | return; | ||
610 | |||
611 | spin_lock(&o2hb_live_lock); | ||
612 | if (!list_empty(&slot->ds_live_item)) { | ||
613 | mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n", | ||
614 | slot->ds_node_num); | ||
615 | |||
616 | list_del_init(&slot->ds_live_item); | ||
617 | |||
618 | if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { | ||
619 | clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); | ||
620 | |||
621 | o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, | ||
622 | slot->ds_node_num); | ||
623 | } | ||
624 | } | ||
625 | spin_unlock(&o2hb_live_lock); | ||
626 | |||
627 | o2hb_run_event_list(&event); | ||
628 | |||
629 | o2nm_node_put(node); | ||
630 | } | ||
631 | |||
632 | static int o2hb_check_slot(struct o2hb_region *reg, | ||
633 | struct o2hb_disk_slot *slot) | ||
634 | { | ||
635 | int changed = 0, gen_changed = 0; | ||
636 | struct o2hb_node_event event = | ||
637 | { .hn_item = LIST_HEAD_INIT(event.hn_item), }; | ||
638 | struct o2nm_node *node; | ||
639 | struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block; | ||
640 | u64 cputime; | ||
641 | |||
642 | memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); | ||
643 | |||
644 | /* Is this correct? Do we assume that the node doesn't exist | ||
645 | * if we're not configured for it? */ | ||
646 | node = o2nm_get_node_by_num(slot->ds_node_num); | ||
647 | if (!node) | ||
648 | return 0; | ||
649 | |||
650 | if (!o2hb_verify_crc(reg, hb_block)) { | ||
651 | /* all paths from here will drop o2hb_live_lock for | ||
652 | * us. */ | ||
653 | spin_lock(&o2hb_live_lock); | ||
654 | |||
655 | /* Don't print an error on the console in this case - | ||
656 | * a freshly formatted heartbeat area will not have a | ||
657 | * crc set on it. */ | ||
658 | if (list_empty(&slot->ds_live_item)) | ||
659 | goto out; | ||
660 | |||
661 | /* The node is live but pushed out a bad crc. We | ||
662 | * consider it a transient miss but don't populate any | ||
663 | * other values as they may be junk. */ | ||
664 | mlog(ML_ERROR, "Node %d has written a bad crc to %s\n", | ||
665 | slot->ds_node_num, reg->hr_dev_name); | ||
666 | o2hb_dump_slot(hb_block); | ||
667 | |||
668 | slot->ds_equal_samples++; | ||
669 | goto fire_callbacks; | ||
670 | } | ||
671 | |||
672 | /* we don't care if these wrap.. the state transitions below | ||
673 | * clear at the right places */ | ||
674 | cputime = le64_to_cpu(hb_block->hb_seq); | ||
675 | if (slot->ds_last_time != cputime) | ||
676 | slot->ds_changed_samples++; | ||
677 | else | ||
678 | slot->ds_equal_samples++; | ||
679 | slot->ds_last_time = cputime; | ||
680 | |||
681 | /* The node changed heartbeat generations. We assume this to | ||
682 | * mean it dropped off but came back before we timed out. We | ||
683 | * want to consider it down for the time being but don't want | ||
684 | * to lose any changed_samples state we might build up to | ||
685 | * considering it live again. */ | ||
686 | if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) { | ||
687 | gen_changed = 1; | ||
688 | slot->ds_equal_samples = 0; | ||
689 | mlog(ML_HEARTBEAT, "Node %d changed generation (0x%"MLFx64" " | ||
690 | "to 0x%"MLFx64")\n", slot->ds_node_num, | ||
691 | slot->ds_last_generation, | ||
692 | le64_to_cpu(hb_block->hb_generation)); | ||
693 | } | ||
694 | |||
695 | slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation); | ||
696 | |||
697 | mlog(ML_HEARTBEAT, "Slot %d gen 0x%"MLFx64" cksum 0x%x " | ||
698 | "seq %"MLFu64" last %"MLFu64" changed %u equal %u\n", | ||
699 | slot->ds_node_num, slot->ds_last_generation, | ||
700 | le32_to_cpu(hb_block->hb_cksum), le64_to_cpu(hb_block->hb_seq), | ||
701 | slot->ds_last_time, slot->ds_changed_samples, | ||
702 | slot->ds_equal_samples); | ||
703 | |||
704 | spin_lock(&o2hb_live_lock); | ||
705 | |||
706 | fire_callbacks: | ||
707 | /* a dead node only comes back to life after enough changed | ||
708 | * samples, seen at any time during its dead period */ | ||
709 | if (list_empty(&slot->ds_live_item) && | ||
710 | slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) { | ||
711 | mlog(ML_HEARTBEAT, "Node %d (id 0x%"MLFx64") joined my " | ||
712 | "region\n", slot->ds_node_num, slot->ds_last_generation); | ||
713 | |||
714 | /* first on the list generates a callback */ | ||
715 | if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { | ||
716 | set_bit(slot->ds_node_num, o2hb_live_node_bitmap); | ||
717 | |||
718 | o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, | ||
719 | slot->ds_node_num); | ||
720 | |||
721 | changed = 1; | ||
722 | } | ||
723 | |||
724 | list_add_tail(&slot->ds_live_item, | ||
725 | &o2hb_live_slots[slot->ds_node_num]); | ||
726 | |||
727 | slot->ds_equal_samples = 0; | ||
728 | goto out; | ||
729 | } | ||
730 | |||
731 | /* if the list is dead, we're done.. */ | ||
732 | if (list_empty(&slot->ds_live_item)) | ||
733 | goto out; | ||
734 | |||
735 | /* live nodes only go dead after enough consecutive missed | ||
736 | * samples.. reset the missed counter whenever we see | ||
737 | * activity */ | ||
738 | if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) { | ||
739 | mlog(ML_HEARTBEAT, "Node %d left my region\n", | ||
740 | slot->ds_node_num); | ||
741 | |||
742 | /* last off the live_slot generates a callback */ | ||
743 | list_del_init(&slot->ds_live_item); | ||
744 | if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { | ||
745 | clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); | ||
746 | |||
747 | o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, | ||
748 | slot->ds_node_num); | ||
749 | |||
750 | changed = 1; | ||
751 | } | ||
752 | |||
753 | /* We don't clear this because the node is still | ||
754 | * actually writing new blocks. */ | ||
755 | if (!gen_changed) | ||
756 | slot->ds_changed_samples = 0; | ||
757 | goto out; | ||
758 | } | ||
759 | if (slot->ds_changed_samples) { | ||
760 | slot->ds_changed_samples = 0; | ||
761 | slot->ds_equal_samples = 0; | ||
762 | } | ||
763 | out: | ||
764 | spin_unlock(&o2hb_live_lock); | ||
765 | |||
766 | o2hb_run_event_list(&event); | ||
767 | |||
768 | o2nm_node_put(node); | ||
769 | return changed; | ||
770 | } | ||
771 | |||
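The liveness logic above is a small state machine: a slot comes alive once it accumulates O2HB_LIVE_THRESHOLD changed samples, and a live slot dies after o2hb_dead_threshold equal samples (or a generation change). Here is a minimal userspace sketch of that machine, assuming the default thresholds from heartbeat.h; the toy_* names are illustrative stand-ins, not kernel code.

/* Toy model of the per-slot liveness state machine (sketch only). */
#define TOY_LIVE_THRESHOLD      2       /* O2HB_LIVE_THRESHOLD */
#define TOY_DEAD_THRESHOLD      7       /* O2HB_DEFAULT_DEAD_THRESHOLD */

struct toy_slot {
        unsigned long long      last_seq;       /* last hb_seq observed */
        unsigned int            changed_samples;
        unsigned int            equal_samples;
        int                     live;
};

/* Feed one observed sequence number; returns 1 if liveness flipped. */
static int toy_check_slot(struct toy_slot *s, unsigned long long seq)
{
        if (seq != s->last_seq)
                s->changed_samples++;
        else
                s->equal_samples++;
        s->last_seq = seq;

        if (!s->live && s->changed_samples >= TOY_LIVE_THRESHOLD) {
                s->live = 1;
                s->equal_samples = 0;
                return 1;               /* node came up */
        }
        if (s->live && s->equal_samples >= TOY_DEAD_THRESHOLD) {
                s->live = 0;
                s->changed_samples = 0;
                return 1;               /* node went down */
        }
        /* any activity on a live node resets the miss counter */
        if (s->live && s->changed_samples) {
                s->changed_samples = 0;
                s->equal_samples = 0;
        }
        return 0;
}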
772 | /* This could be faster if we just implemented a find_last_bit, but I | ||
773 | * don't think the circumstances warrant it. */ | ||
774 | static int o2hb_highest_node(unsigned long *nodes, | ||
775 | int numbits) | ||
776 | { | ||
777 | int highest, node; | ||
778 | |||
779 | highest = numbits; | ||
780 | node = -1; | ||
781 | while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) { | ||
782 | if (node >= numbits) | ||
783 | break; | ||
784 | |||
785 | highest = node; | ||
786 | } | ||
787 | |||
788 | return highest; | ||
789 | } | ||
790 | |||
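As the comment says, a reverse scan in the spirit of find_last_bit would avoid visiting every set bit. A hedged sketch of that alternative (find_last_bit itself is not assumed to exist here, which is why the forward scan is used):

/* Hypothetical reverse-scan variant; same "numbits means none set"
 * sentinel as o2hb_highest_node() above. */
static int o2hb_highest_node_reverse(unsigned long *nodes, int numbits)
{
        int node;

        for (node = numbits - 1; node >= 0; node--)
                if (test_bit(node, nodes))
                        return node;

        return numbits;
}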
791 | static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) | ||
792 | { | ||
793 | int i, ret, highest_node, change = 0; | ||
794 | unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
795 | struct bio *write_bio; | ||
796 | struct o2hb_bio_wait_ctxt write_wc; | ||
797 | |||
798 | if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes))) | ||
799 | return; | ||
800 | |||
801 | highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); | ||
802 | if (highest_node >= O2NM_MAX_NODES) { | ||
803 | mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); | ||
804 | return; | ||
805 | } | ||
806 | |||
807 | /* No sense in reading the slots of nodes that don't exist | ||
808 | * yet. Of course, if the node definitions have holes in them | ||
809 | * then we're reading an empty slot anyway... Consider this | ||
810 | * best-effort. */ | ||
811 | ret = o2hb_read_slots(reg, highest_node + 1); | ||
812 | if (ret < 0) { | ||
813 | mlog_errno(ret); | ||
814 | return; | ||
815 | } | ||
816 | |||
817 | /* With an up to date view of the slots, we can check that no | ||
818 | * other node has been improperly configured to heartbeat in | ||
819 | * our slot. */ | ||
820 | if (!o2hb_check_last_timestamp(reg)) | ||
821 | mlog(ML_ERROR, "Device \"%s\": another node is heartbeating " | ||
822 | "in our slot!\n", reg->hr_dev_name); | ||
823 | |||
824 | /* fill in the proper info for our next heartbeat */ | ||
825 | o2hb_prepare_block(reg, reg->hr_generation); | ||
826 | |||
827 | /* And fire off the write. Note that we don't wait on this I/O | ||
828 | * until later. */ | ||
829 | ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); | ||
830 | if (ret < 0) { | ||
831 | mlog_errno(ret); | ||
832 | return; | ||
833 | } | ||
834 | |||
835 | i = -1; | ||
836 | while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { | ||
837 | |||
838 | change |= o2hb_check_slot(reg, ®->hr_slots[i]); | ||
839 | } | ||
840 | |||
841 | /* | ||
842 | * We have to be sure we've advertised ourselves on disk | ||
843 | * before we can go to steady state. This ensures that | ||
844 | * people we find in our steady state have seen us. | ||
845 | */ | ||
846 | o2hb_wait_on_io(reg, &write_wc); | ||
847 | bio_put(write_bio); | ||
848 | o2hb_arm_write_timeout(reg); | ||
849 | |||
850 | /* let the person who launched us know when things are steady */ | ||
851 | if (!change && (atomic_read(®->hr_steady_iterations) != 0)) { | ||
852 | if (atomic_dec_and_test(®->hr_steady_iterations)) | ||
853 | wake_up(&o2hb_steady_queue); | ||
854 | } | ||
855 | } | ||
856 | |||
857 | /* Subtract b from a, storing the result in a. a *must* have a larger | ||
858 | * value than b. */ | ||
859 | static void o2hb_tv_subtract(struct timeval *a, | ||
860 | struct timeval *b) | ||
861 | { | ||
862 | /* clamp the result to zero when b is after a */ | ||
863 | if (a->tv_sec < b->tv_sec || | ||
864 | (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) { | ||
865 | a->tv_sec = 0; | ||
866 | a->tv_usec = 0; | ||
867 | return; | ||
868 | } | ||
869 | |||
870 | a->tv_sec -= b->tv_sec; | ||
871 | a->tv_usec -= b->tv_usec; | ||
872 | while ( a->tv_usec < 0 ) { | ||
873 | a->tv_sec--; | ||
874 | a->tv_usec += 1000000; | ||
875 | } | ||
876 | } | ||
877 | |||
878 | static unsigned int o2hb_elapsed_msecs(struct timeval *start, | ||
879 | struct timeval *end) | ||
880 | { | ||
881 | struct timeval res = *end; | ||
882 | |||
883 | o2hb_tv_subtract(&res, start); | ||
884 | |||
885 | return res.tv_sec * 1000 + res.tv_usec / 1000; | ||
886 | } | ||
887 | |||
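/* Worked example of the two helpers above: start = 10.900000 and
 * end = 11.100000 first give tv_sec = 1, tv_usec = -800000; the borrow
 * loop normalizes that to 0.200000, so o2hb_elapsed_msecs() returns
 * 0 * 1000 + 200000 / 1000 = 200 ms. */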
888 | /* | ||
889 | * we ride the region ref that the region dir holds. before the region | ||
890 | * dir is removed and drops its ref it will wait to tear down this | ||
891 | * thread. | ||
892 | */ | ||
893 | static int o2hb_thread(void *data) | ||
894 | { | ||
895 | int i, ret; | ||
896 | struct o2hb_region *reg = data; | ||
897 | struct bio *write_bio; | ||
898 | struct o2hb_bio_wait_ctxt write_wc; | ||
899 | struct timeval before_hb, after_hb; | ||
900 | unsigned int elapsed_msec; | ||
901 | |||
902 | mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); | ||
903 | |||
904 | set_user_nice(current, -20); | ||
905 | |||
906 | while (!kthread_should_stop() && !reg->hr_unclean_stop) { | ||
907 | /* We track the time spent inside | ||
908 | * o2hb_do_disk_heartbeat so that we avoid more than | ||
909 | * hr_timeout_ms between disk writes. On busy systems | ||
910 | * this should result in a heartbeat which is less | ||
911 | * likely to time itself out. */ | ||
912 | do_gettimeofday(&before_hb); | ||
913 | |||
914 | o2hb_do_disk_heartbeat(reg); | ||
915 | |||
916 | do_gettimeofday(&after_hb); | ||
917 | elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); | ||
918 | |||
919 | mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n", | ||
920 | before_hb.tv_sec, before_hb.tv_usec, | ||
921 | after_hb.tv_sec, after_hb.tv_usec, elapsed_msec); | ||
922 | |||
923 | if (elapsed_msec < reg->hr_timeout_ms) { | ||
924 | /* the kthread api has blocked signals for us so no | ||
925 | * need to record the return value. */ | ||
926 | msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); | ||
927 | } | ||
928 | } | ||
929 | |||
930 | o2hb_disarm_write_timeout(reg); | ||
931 | |||
932 | /* unclean stop is only used in very bad situations */ | ||
933 | for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++) | ||
934 | o2hb_shutdown_slot(®->hr_slots[i]); | ||
935 | |||
936 | /* Explicit down notification - avoid forcing the other nodes | ||
937 | * to timeout on this region when we could just as easily | ||
938 | * write a clear generation - thus indicating to them that | ||
939 | * this node has left this region. | ||
940 | * | ||
941 | * XXX: Should we skip this on unclean_stop? */ | ||
942 | o2hb_prepare_block(reg, 0); | ||
943 | ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); | ||
944 | if (ret == 0) { | ||
945 | o2hb_wait_on_io(reg, &write_wc); | ||
946 | bio_put(write_bio); | ||
947 | } else { | ||
948 | mlog_errno(ret); | ||
949 | } | ||
950 | |||
951 | mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); | ||
952 | |||
953 | return 0; | ||
954 | } | ||
955 | |||
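/* Concretely, with the default hr_timeout_ms of 2000: a heartbeat pass
 * that takes 300 ms is followed by a 1700 ms sleep, keeping successive
 * disk writes roughly 2000 ms apart. A pass that overruns 2000 ms gets
 * no sleep at all and the next write starts immediately. */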
956 | void o2hb_init(void) | ||
957 | { | ||
958 | int i; | ||
959 | |||
960 | for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++) | ||
961 | INIT_LIST_HEAD(&o2hb_callbacks[i].list); | ||
962 | |||
963 | for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) | ||
964 | INIT_LIST_HEAD(&o2hb_live_slots[i]); | ||
965 | |||
966 | INIT_LIST_HEAD(&o2hb_node_events); | ||
967 | |||
968 | memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); | ||
969 | } | ||
970 | |||
971 | /* if we're already in a callback then we're already serialized by the sem */ | ||
972 | static void o2hb_fill_node_map_from_callback(unsigned long *map, | ||
973 | unsigned bytes) | ||
974 | { | ||
975 | BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); | ||
976 | |||
977 | memcpy(map, &o2hb_live_node_bitmap, bytes); | ||
978 | } | ||
979 | |||
980 | /* | ||
981 | * get a map of all nodes that are heartbeating in any regions | ||
982 | */ | ||
983 | void o2hb_fill_node_map(unsigned long *map, unsigned bytes) | ||
984 | { | ||
985 | /* callers want to serialize this map and callbacks so that they | ||
986 | * can trust that they don't miss nodes coming to the party */ | ||
987 | down_read(&o2hb_callback_sem); | ||
988 | spin_lock(&o2hb_live_lock); | ||
989 | o2hb_fill_node_map_from_callback(map, bytes); | ||
990 | spin_unlock(&o2hb_live_lock); | ||
991 | up_read(&o2hb_callback_sem); | ||
992 | } | ||
993 | EXPORT_SYMBOL_GPL(o2hb_fill_node_map); | ||
994 | |||
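A hedged usage sketch of the exported map API (the caller below is hypothetical):

/* Snapshot the live-node map and walk the set bits (sketch only). */
static void example_walk_live_nodes(void)
{
        unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
        int node = -1;

        o2hb_fill_node_map(live_map, sizeof(live_map));

        while ((node = find_next_bit(live_map, O2NM_MAX_NODES,
                                     node + 1)) < O2NM_MAX_NODES)
                mlog(ML_NOTICE, "node %d is heartbeating\n", node);
}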
995 | /* | ||
996 | * heartbeat configfs bits. The heartbeat set is a default set under | ||
997 | * the cluster set in nodemanager.c. | ||
998 | */ | ||
999 | |||
1000 | static struct o2hb_region *to_o2hb_region(struct config_item *item) | ||
1001 | { | ||
1002 | return item ? container_of(item, struct o2hb_region, hr_item) : NULL; | ||
1003 | } | ||
1004 | |||
1005 | /* drop_item only drops its ref after killing the thread, nothing should | ||
1006 | * be using the region anymore. this has to clean up any state that | ||
1007 | * attributes might have built up. */ | ||
1008 | static void o2hb_region_release(struct config_item *item) | ||
1009 | { | ||
1010 | int i; | ||
1011 | struct page *page; | ||
1012 | struct o2hb_region *reg = to_o2hb_region(item); | ||
1013 | |||
1014 | if (reg->hr_tmp_block) | ||
1015 | kfree(reg->hr_tmp_block); | ||
1016 | |||
1017 | if (reg->hr_slot_data) { | ||
1018 | for (i = 0; i < reg->hr_num_pages; i++) { | ||
1019 | page = reg->hr_slot_data[i]; | ||
1020 | if (page) | ||
1021 | __free_page(page); | ||
1022 | } | ||
1023 | kfree(reg->hr_slot_data); | ||
1024 | } | ||
1025 | |||
1026 | if (reg->hr_bdev) | ||
1027 | blkdev_put(reg->hr_bdev); | ||
1028 | |||
1029 | if (reg->hr_slots) | ||
1030 | kfree(reg->hr_slots); | ||
1031 | |||
1032 | spin_lock(&o2hb_live_lock); | ||
1033 | list_del(®->hr_all_item); | ||
1034 | spin_unlock(&o2hb_live_lock); | ||
1035 | |||
1036 | kfree(reg); | ||
1037 | } | ||
1038 | |||
1039 | static int o2hb_read_block_input(struct o2hb_region *reg, | ||
1040 | const char *page, | ||
1041 | size_t count, | ||
1042 | unsigned long *ret_bytes, | ||
1043 | unsigned int *ret_bits) | ||
1044 | { | ||
1045 | unsigned long bytes; | ||
1046 | char *p = (char *)page; | ||
1047 | |||
1048 | bytes = simple_strtoul(p, &p, 0); | ||
1049 | if (!p || (*p && (*p != '\n'))) | ||
1050 | return -EINVAL; | ||
1051 | |||
1052 | /* Heartbeat and fs min / max block sizes are the same. */ | ||
1053 | if (bytes > 4096 || bytes < 512) | ||
1054 | return -ERANGE; | ||
1055 | if (hweight16(bytes) != 1) | ||
1056 | return -EINVAL; | ||
1057 | |||
1058 | if (ret_bytes) | ||
1059 | *ret_bytes = bytes; | ||
1060 | if (ret_bits) | ||
1061 | *ret_bits = ffs(bytes) - 1; | ||
1062 | |||
1063 | return 0; | ||
1064 | } | ||
1065 | |||
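/* Worked examples for the validation above: writing "512" yields
 * *ret_bytes = 512 and *ret_bits = ffs(512) - 1 = 9; "4096" yields 12
 * bits; "1000" fails with -EINVAL (hweight16(1000) != 1, not a power
 * of two); "8192" fails with -ERANGE. */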
1066 | static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg, | ||
1067 | char *page) | ||
1068 | { | ||
1069 | return sprintf(page, "%u\n", reg->hr_block_bytes); | ||
1070 | } | ||
1071 | |||
1072 | static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg, | ||
1073 | const char *page, | ||
1074 | size_t count) | ||
1075 | { | ||
1076 | int status; | ||
1077 | unsigned long block_bytes; | ||
1078 | unsigned int block_bits; | ||
1079 | |||
1080 | if (reg->hr_bdev) | ||
1081 | return -EINVAL; | ||
1082 | |||
1083 | status = o2hb_read_block_input(reg, page, count, | ||
1084 | &block_bytes, &block_bits); | ||
1085 | if (status) | ||
1086 | return status; | ||
1087 | |||
1088 | reg->hr_block_bytes = (unsigned int)block_bytes; | ||
1089 | reg->hr_block_bits = block_bits; | ||
1090 | |||
1091 | return count; | ||
1092 | } | ||
1093 | |||
1094 | static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg, | ||
1095 | char *page) | ||
1096 | { | ||
1097 | return sprintf(page, "%llu\n", reg->hr_start_block); | ||
1098 | } | ||
1099 | |||
1100 | static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg, | ||
1101 | const char *page, | ||
1102 | size_t count) | ||
1103 | { | ||
1104 | unsigned long long tmp; | ||
1105 | char *p = (char *)page; | ||
1106 | |||
1107 | if (reg->hr_bdev) | ||
1108 | return -EINVAL; | ||
1109 | |||
1110 | tmp = simple_strtoull(p, &p, 0); | ||
1111 | if (!p || (*p && (*p != '\n'))) | ||
1112 | return -EINVAL; | ||
1113 | |||
1114 | reg->hr_start_block = tmp; | ||
1115 | |||
1116 | return count; | ||
1117 | } | ||
1118 | |||
1119 | static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg, | ||
1120 | char *page) | ||
1121 | { | ||
1122 | return sprintf(page, "%d\n", reg->hr_blocks); | ||
1123 | } | ||
1124 | |||
1125 | static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg, | ||
1126 | const char *page, | ||
1127 | size_t count) | ||
1128 | { | ||
1129 | unsigned long tmp; | ||
1130 | char *p = (char *)page; | ||
1131 | |||
1132 | if (reg->hr_bdev) | ||
1133 | return -EINVAL; | ||
1134 | |||
1135 | tmp = simple_strtoul(p, &p, 0); | ||
1136 | if (!p || (*p && (*p != '\n'))) | ||
1137 | return -EINVAL; | ||
1138 | |||
1139 | if (tmp > O2NM_MAX_NODES || tmp == 0) | ||
1140 | return -ERANGE; | ||
1141 | |||
1142 | reg->hr_blocks = (unsigned int)tmp; | ||
1143 | |||
1144 | return count; | ||
1145 | } | ||
1146 | |||
1147 | static ssize_t o2hb_region_dev_read(struct o2hb_region *reg, | ||
1148 | char *page) | ||
1149 | { | ||
1150 | unsigned int ret = 0; | ||
1151 | |||
1152 | if (reg->hr_bdev) | ||
1153 | ret = sprintf(page, "%s\n", reg->hr_dev_name); | ||
1154 | |||
1155 | return ret; | ||
1156 | } | ||
1157 | |||
1158 | static void o2hb_init_region_params(struct o2hb_region *reg) | ||
1159 | { | ||
1160 | reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits; | ||
1161 | reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS; | ||
1162 | |||
1163 | mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n", | ||
1164 | reg->hr_start_block, reg->hr_blocks); | ||
1165 | mlog(ML_HEARTBEAT, "hr_block_bytes = %u, hr_block_bits = %u\n", | ||
1166 | reg->hr_block_bytes, reg->hr_block_bits); | ||
1167 | mlog(ML_HEARTBEAT, "hr_timeout_ms = %u\n", reg->hr_timeout_ms); | ||
1168 | mlog(ML_HEARTBEAT, "dead threshold = %u\n", o2hb_dead_threshold); | ||
1169 | } | ||
1170 | |||
1171 | static int o2hb_map_slot_data(struct o2hb_region *reg) | ||
1172 | { | ||
1173 | int i, j; | ||
1174 | unsigned int last_slot; | ||
1175 | unsigned int spp = reg->hr_slots_per_page; | ||
1176 | struct page *page; | ||
1177 | char *raw; | ||
1178 | struct o2hb_disk_slot *slot; | ||
1179 | |||
1180 | reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); | ||
1181 | if (reg->hr_tmp_block == NULL) { | ||
1182 | mlog_errno(-ENOMEM); | ||
1183 | return -ENOMEM; | ||
1184 | } | ||
1185 | |||
1186 | reg->hr_slots = kcalloc(reg->hr_blocks, | ||
1187 | sizeof(struct o2hb_disk_slot), GFP_KERNEL); | ||
1188 | if (reg->hr_slots == NULL) { | ||
1189 | mlog_errno(-ENOMEM); | ||
1190 | return -ENOMEM; | ||
1191 | } | ||
1192 | |||
1193 | for(i = 0; i < reg->hr_blocks; i++) { | ||
1194 | slot = ®->hr_slots[i]; | ||
1195 | slot->ds_node_num = i; | ||
1196 | INIT_LIST_HEAD(&slot->ds_live_item); | ||
1197 | slot->ds_raw_block = NULL; | ||
1198 | } | ||
1199 | |||
1200 | reg->hr_num_pages = (reg->hr_blocks + spp - 1) / spp; | ||
1201 | mlog(ML_HEARTBEAT, "Going to require %u pages to cover %u blocks " | ||
1202 | "at %u blocks per page\n", | ||
1203 | reg->hr_num_pages, reg->hr_blocks, spp); | ||
1204 | |||
1205 | reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), | ||
1206 | GFP_KERNEL); | ||
1207 | if (!reg->hr_slot_data) { | ||
1208 | mlog_errno(-ENOMEM); | ||
1209 | return -ENOMEM; | ||
1210 | } | ||
1211 | |||
1212 | for(i = 0; i < reg->hr_num_pages; i++) { | ||
1213 | page = alloc_page(GFP_KERNEL); | ||
1214 | if (!page) { | ||
1215 | mlog_errno(-ENOMEM); | ||
1216 | return -ENOMEM; | ||
1217 | } | ||
1218 | |||
1219 | reg->hr_slot_data[i] = page; | ||
1220 | |||
1221 | last_slot = i * spp; | ||
1222 | raw = page_address(page); | ||
1223 | for (j = 0; | ||
1224 | (j < spp) && ((j + last_slot) < reg->hr_blocks); | ||
1225 | j++) { | ||
1226 | BUG_ON((j + last_slot) >= reg->hr_blocks); | ||
1227 | |||
1228 | slot = ®->hr_slots[j + last_slot]; | ||
1229 | slot->ds_raw_block = | ||
1230 | (struct o2hb_disk_heartbeat_block *) raw; | ||
1231 | |||
1232 | raw += reg->hr_block_bytes; | ||
1233 | } | ||
1234 | } | ||
1235 | |||
1236 | return 0; | ||
1237 | } | ||
1238 | |||
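/* Worked example of the packing above: with 4096-byte pages and
 * 512-byte heartbeat blocks, spp = 4096 >> 9 = 8, so a 255-block
 * region needs (255 + 8 - 1) / 8 = 32 pages, the last page holding
 * only 7 slots. */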
1239 | /* Read in all the slots available and populate the tracking | ||
1240 | * structures so that we can start with a baseline idea of what's | ||
1241 | * there. */ | ||
1242 | static int o2hb_populate_slot_data(struct o2hb_region *reg) | ||
1243 | { | ||
1244 | int ret, i; | ||
1245 | struct o2hb_disk_slot *slot; | ||
1246 | struct o2hb_disk_heartbeat_block *hb_block; | ||
1247 | |||
1248 | mlog_entry_void(); | ||
1249 | |||
1250 | ret = o2hb_read_slots(reg, reg->hr_blocks); | ||
1251 | if (ret) { | ||
1252 | mlog_errno(ret); | ||
1253 | goto out; | ||
1254 | } | ||
1255 | |||
1256 | /* We only want to get an idea of the values initially in each | ||
1257 | * slot, so we do no verification - o2hb_check_slot will | ||
1258 | * actually determine if each configured slot is valid and | ||
1259 | * whether any values have changed. */ | ||
1260 | for(i = 0; i < reg->hr_blocks; i++) { | ||
1261 | slot = ®->hr_slots[i]; | ||
1262 | hb_block = (struct o2hb_disk_heartbeat_block *) slot->ds_raw_block; | ||
1263 | |||
1264 | /* Only fill the values that o2hb_check_slot uses to | ||
1265 | * determine changing slots */ | ||
1266 | slot->ds_last_time = le64_to_cpu(hb_block->hb_seq); | ||
1267 | slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation); | ||
1268 | } | ||
1269 | |||
1270 | out: | ||
1271 | mlog_exit(ret); | ||
1272 | return ret; | ||
1273 | } | ||
1274 | |||
1275 | /* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */ | ||
1276 | static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, | ||
1277 | const char *page, | ||
1278 | size_t count) | ||
1279 | { | ||
1280 | long fd; | ||
1281 | int sectsize; | ||
1282 | char *p = (char *)page; | ||
1283 | struct file *filp = NULL; | ||
1284 | struct inode *inode = NULL; | ||
1285 | ssize_t ret = -EINVAL; | ||
1286 | |||
1287 | if (reg->hr_bdev) | ||
1288 | goto out; | ||
1289 | |||
1290 | /* We can't heartbeat without having had our node number | ||
1291 | * configured yet. */ | ||
1292 | if (o2nm_this_node() == O2NM_MAX_NODES) | ||
1293 | goto out; | ||
1294 | |||
1295 | fd = simple_strtol(p, &p, 0); | ||
1296 | if (!p || (*p && (*p != '\n'))) | ||
1297 | goto out; | ||
1298 | |||
1299 | if (fd < 0 || fd >= INT_MAX) | ||
1300 | goto out; | ||
1301 | |||
1302 | filp = fget(fd); | ||
1303 | if (filp == NULL) | ||
1304 | goto out; | ||
1305 | |||
1306 | if (reg->hr_blocks == 0 || reg->hr_start_block == 0 || | ||
1307 | reg->hr_block_bytes == 0) | ||
1308 | goto out; | ||
1309 | |||
1310 | inode = igrab(filp->f_mapping->host); | ||
1311 | if (inode == NULL) | ||
1312 | goto out; | ||
1313 | |||
1314 | if (!S_ISBLK(inode->i_mode)) | ||
1315 | goto out; | ||
1316 | |||
1317 | reg->hr_bdev = I_BDEV(filp->f_mapping->host); | ||
1318 | ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, 0); | ||
1319 | if (ret) { | ||
1320 | reg->hr_bdev = NULL; | ||
1321 | goto out; | ||
1322 | } | ||
1323 | inode = NULL; | ||
1324 | |||
1325 | bdevname(reg->hr_bdev, reg->hr_dev_name); | ||
1326 | |||
1327 | sectsize = bdev_hardsect_size(reg->hr_bdev); | ||
1328 | if (sectsize != reg->hr_block_bytes) { | ||
1329 | mlog(ML_ERROR, | ||
1330 | "blocksize %u incorrect for device, expected %d\n", | ||
1331 | reg->hr_block_bytes, sectsize); | ||
1332 | ret = -EINVAL; | ||
1333 | goto out; | ||
1334 | } | ||
1335 | |||
1336 | o2hb_init_region_params(reg); | ||
1337 | |||
1338 | /* Generation of zero is invalid */ | ||
1339 | do { | ||
1340 | get_random_bytes(®->hr_generation, | ||
1341 | sizeof(reg->hr_generation)); | ||
1342 | } while (reg->hr_generation == 0); | ||
1343 | |||
1344 | ret = o2hb_map_slot_data(reg); | ||
1345 | if (ret) { | ||
1346 | mlog_errno(ret); | ||
1347 | goto out; | ||
1348 | } | ||
1349 | |||
1350 | ret = o2hb_populate_slot_data(reg); | ||
1351 | if (ret) { | ||
1352 | mlog_errno(ret); | ||
1353 | goto out; | ||
1354 | } | ||
1355 | |||
1356 | INIT_WORK(®->hr_write_timeout_work, o2hb_write_timeout, reg); | ||
1357 | |||
1358 | /* | ||
1359 | * A node is considered live after it has beaten LIVE_THRESHOLD | ||
1360 | * times. We're not steady until we've given them a chance | ||
1361 | * _after_ our first read. | ||
1362 | */ | ||
1363 | atomic_set(®->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1); | ||
1364 | |||
1365 | reg->hr_task = kthread_run(o2hb_thread, reg, "o2hb-%s", | ||
1366 | reg->hr_item.ci_name); | ||
1367 | if (IS_ERR(reg->hr_task)) { | ||
1368 | ret = PTR_ERR(reg->hr_task); | ||
1369 | mlog_errno(ret); | ||
1370 | reg->hr_task = NULL; | ||
1371 | goto out; | ||
1372 | } | ||
1373 | |||
1374 | ret = wait_event_interruptible(o2hb_steady_queue, | ||
1375 | atomic_read(®->hr_steady_iterations) == 0); | ||
1376 | if (ret) { | ||
1377 | kthread_stop(reg->hr_task); | ||
1378 | reg->hr_task = NULL; | ||
1379 | goto out; | ||
1380 | } | ||
1381 | |||
1382 | ret = count; | ||
1383 | out: | ||
1384 | if (filp) | ||
1385 | fput(filp); | ||
1386 | if (inode) | ||
1387 | iput(inode); | ||
1388 | if (ret < 0) { | ||
1389 | if (reg->hr_bdev) { | ||
1390 | blkdev_put(reg->hr_bdev); | ||
1391 | reg->hr_bdev = NULL; | ||
1392 | } | ||
1393 | } | ||
1394 | return ret; | ||
1395 | } | ||
1396 | |||
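From userspace, this commit point is driven through configfs: create the region directory, fill in the sizing attributes, then write the number of an open file descriptor on the block device to "dev" last (the store above rejects the fd while blocks, start_block, or block_bytes are still zero, and does not return until the region reaches steady state or a signal arrives). A hedged sketch; the configfs mount point, cluster name, region name, and device path are all assumptions:

/* Userspace sketch only; paths and names are hypothetical. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

#define REGION "/config/cluster/mycluster/heartbeat/myregion"

static void write_attr(const char *attr, const char *val)
{
        char path[256];
        int fd;

        snprintf(path, sizeof(path), REGION "/%s", attr);
        fd = open(path, O_WRONLY);
        if (fd >= 0) {
                write(fd, val, strlen(val));
                close(fd);
        }
}

int main(void)
{
        char buf[16];
        int dev_fd;

        mkdir(REGION, 0755);            /* mkdir(2) creates the region item */

        write_attr("block_bytes", "512");
        write_attr("start_block", "1");
        write_attr("blocks", "255");

        /* the decimal fd number itself is what "dev" wants */
        dev_fd = open("/dev/sdb1", O_RDWR);
        snprintf(buf, sizeof(buf), "%d", dev_fd);
        write_attr("dev", buf);         /* blocks until steady state */

        return 0;
}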
1397 | struct o2hb_region_attribute { | ||
1398 | struct configfs_attribute attr; | ||
1399 | ssize_t (*show)(struct o2hb_region *, char *); | ||
1400 | ssize_t (*store)(struct o2hb_region *, const char *, size_t); | ||
1401 | }; | ||
1402 | |||
1403 | static struct o2hb_region_attribute o2hb_region_attr_block_bytes = { | ||
1404 | .attr = { .ca_owner = THIS_MODULE, | ||
1405 | .ca_name = "block_bytes", | ||
1406 | .ca_mode = S_IRUGO | S_IWUSR }, | ||
1407 | .show = o2hb_region_block_bytes_read, | ||
1408 | .store = o2hb_region_block_bytes_write, | ||
1409 | }; | ||
1410 | |||
1411 | static struct o2hb_region_attribute o2hb_region_attr_start_block = { | ||
1412 | .attr = { .ca_owner = THIS_MODULE, | ||
1413 | .ca_name = "start_block", | ||
1414 | .ca_mode = S_IRUGO | S_IWUSR }, | ||
1415 | .show = o2hb_region_start_block_read, | ||
1416 | .store = o2hb_region_start_block_write, | ||
1417 | }; | ||
1418 | |||
1419 | static struct o2hb_region_attribute o2hb_region_attr_blocks = { | ||
1420 | .attr = { .ca_owner = THIS_MODULE, | ||
1421 | .ca_name = "blocks", | ||
1422 | .ca_mode = S_IRUGO | S_IWUSR }, | ||
1423 | .show = o2hb_region_blocks_read, | ||
1424 | .store = o2hb_region_blocks_write, | ||
1425 | }; | ||
1426 | |||
1427 | static struct o2hb_region_attribute o2hb_region_attr_dev = { | ||
1428 | .attr = { .ca_owner = THIS_MODULE, | ||
1429 | .ca_name = "dev", | ||
1430 | .ca_mode = S_IRUGO | S_IWUSR }, | ||
1431 | .show = o2hb_region_dev_read, | ||
1432 | .store = o2hb_region_dev_write, | ||
1433 | }; | ||
1434 | |||
1435 | static struct configfs_attribute *o2hb_region_attrs[] = { | ||
1436 | &o2hb_region_attr_block_bytes.attr, | ||
1437 | &o2hb_region_attr_start_block.attr, | ||
1438 | &o2hb_region_attr_blocks.attr, | ||
1439 | &o2hb_region_attr_dev.attr, | ||
1440 | NULL, | ||
1441 | }; | ||
1442 | |||
1443 | static ssize_t o2hb_region_show(struct config_item *item, | ||
1444 | struct configfs_attribute *attr, | ||
1445 | char *page) | ||
1446 | { | ||
1447 | struct o2hb_region *reg = to_o2hb_region(item); | ||
1448 | struct o2hb_region_attribute *o2hb_region_attr = | ||
1449 | container_of(attr, struct o2hb_region_attribute, attr); | ||
1450 | ssize_t ret = 0; | ||
1451 | |||
1452 | if (o2hb_region_attr->show) | ||
1453 | ret = o2hb_region_attr->show(reg, page); | ||
1454 | return ret; | ||
1455 | } | ||
1456 | |||
1457 | static ssize_t o2hb_region_store(struct config_item *item, | ||
1458 | struct configfs_attribute *attr, | ||
1459 | const char *page, size_t count) | ||
1460 | { | ||
1461 | struct o2hb_region *reg = to_o2hb_region(item); | ||
1462 | struct o2hb_region_attribute *o2hb_region_attr = | ||
1463 | container_of(attr, struct o2hb_region_attribute, attr); | ||
1464 | ssize_t ret = -EINVAL; | ||
1465 | |||
1466 | if (o2hb_region_attr->store) | ||
1467 | ret = o2hb_region_attr->store(reg, page, count); | ||
1468 | return ret; | ||
1469 | } | ||
1470 | |||
1471 | static struct configfs_item_operations o2hb_region_item_ops = { | ||
1472 | .release = o2hb_region_release, | ||
1473 | .show_attribute = o2hb_region_show, | ||
1474 | .store_attribute = o2hb_region_store, | ||
1475 | }; | ||
1476 | |||
1477 | static struct config_item_type o2hb_region_type = { | ||
1478 | .ct_item_ops = &o2hb_region_item_ops, | ||
1479 | .ct_attrs = o2hb_region_attrs, | ||
1480 | .ct_owner = THIS_MODULE, | ||
1481 | }; | ||
1482 | |||
1483 | /* heartbeat set */ | ||
1484 | |||
1485 | struct o2hb_heartbeat_group { | ||
1486 | struct config_group hs_group; | ||
1487 | /* some stuff? */ | ||
1488 | }; | ||
1489 | |||
1490 | static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group *group) | ||
1491 | { | ||
1492 | return group ? | ||
1493 | container_of(group, struct o2hb_heartbeat_group, hs_group) | ||
1494 | : NULL; | ||
1495 | } | ||
1496 | |||
1497 | static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, | ||
1498 | const char *name) | ||
1499 | { | ||
1500 | struct o2hb_region *reg = NULL; | ||
1501 | struct config_item *ret = NULL; | ||
1502 | |||
1503 | reg = kcalloc(1, sizeof(struct o2hb_region), GFP_KERNEL); | ||
1504 | if (reg == NULL) | ||
1505 | goto out; /* ENOMEM */ | ||
1506 | |||
1507 | config_item_init_type_name(®->hr_item, name, &o2hb_region_type); | ||
1508 | |||
1509 | ret = ®->hr_item; | ||
1510 | |||
1511 | spin_lock(&o2hb_live_lock); | ||
1512 | list_add_tail(®->hr_all_item, &o2hb_all_regions); | ||
1513 | spin_unlock(&o2hb_live_lock); | ||
1514 | out: | ||
1515 | if (ret == NULL) | ||
1516 | kfree(reg); | ||
1517 | |||
1518 | return ret; | ||
1519 | } | ||
1520 | |||
1521 | static void o2hb_heartbeat_group_drop_item(struct config_group *group, | ||
1522 | struct config_item *item) | ||
1523 | { | ||
1524 | struct o2hb_region *reg = to_o2hb_region(item); | ||
1525 | |||
1526 | /* stop the thread when the user removes the region dir */ | ||
1527 | if (reg->hr_task) { | ||
1528 | kthread_stop(reg->hr_task); | ||
1529 | reg->hr_task = NULL; | ||
1530 | } | ||
1531 | |||
1532 | config_item_put(item); | ||
1533 | } | ||
1534 | |||
1535 | struct o2hb_heartbeat_group_attribute { | ||
1536 | struct configfs_attribute attr; | ||
1537 | ssize_t (*show)(struct o2hb_heartbeat_group *, char *); | ||
1538 | ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t); | ||
1539 | }; | ||
1540 | |||
1541 | static ssize_t o2hb_heartbeat_group_show(struct config_item *item, | ||
1542 | struct configfs_attribute *attr, | ||
1543 | char *page) | ||
1544 | { | ||
1545 | struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); | ||
1546 | struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = | ||
1547 | container_of(attr, struct o2hb_heartbeat_group_attribute, attr); | ||
1548 | ssize_t ret = 0; | ||
1549 | |||
1550 | if (o2hb_heartbeat_group_attr->show) | ||
1551 | ret = o2hb_heartbeat_group_attr->show(reg, page); | ||
1552 | return ret; | ||
1553 | } | ||
1554 | |||
1555 | static ssize_t o2hb_heartbeat_group_store(struct config_item *item, | ||
1556 | struct configfs_attribute *attr, | ||
1557 | const char *page, size_t count) | ||
1558 | { | ||
1559 | struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); | ||
1560 | struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = | ||
1561 | container_of(attr, struct o2hb_heartbeat_group_attribute, attr); | ||
1562 | ssize_t ret = -EINVAL; | ||
1563 | |||
1564 | if (o2hb_heartbeat_group_attr->store) | ||
1565 | ret = o2hb_heartbeat_group_attr->store(reg, page, count); | ||
1566 | return ret; | ||
1567 | } | ||
1568 | |||
1569 | static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group, | ||
1570 | char *page) | ||
1571 | { | ||
1572 | return sprintf(page, "%u\n", o2hb_dead_threshold); | ||
1573 | } | ||
1574 | |||
1575 | static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group, | ||
1576 | const char *page, | ||
1577 | size_t count) | ||
1578 | { | ||
1579 | unsigned long tmp; | ||
1580 | char *p = (char *)page; | ||
1581 | |||
1582 | tmp = simple_strtoul(p, &p, 10); | ||
1583 | if (!p || (*p && (*p != '\n'))) | ||
1584 | return -EINVAL; | ||
1585 | |||
1586 | /* this will validate ranges for us. */ | ||
1587 | o2hb_dead_threshold_set((unsigned int) tmp); | ||
1588 | |||
1589 | return count; | ||
1590 | } | ||
1591 | |||
1592 | static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { | ||
1593 | .attr = { .ca_owner = THIS_MODULE, | ||
1594 | .ca_name = "dead_threshold", | ||
1595 | .ca_mode = S_IRUGO | S_IWUSR }, | ||
1596 | .show = o2hb_heartbeat_group_threshold_show, | ||
1597 | .store = o2hb_heartbeat_group_threshold_store, | ||
1598 | }; | ||
1599 | |||
1600 | static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { | ||
1601 | &o2hb_heartbeat_group_attr_threshold.attr, | ||
1602 | NULL, | ||
1603 | }; | ||
1604 | |||
1605 | static struct configfs_item_operations o2hb_heartbeat_group_item_ops = { | ||
1606 | .show_attribute = o2hb_heartbeat_group_show, | ||
1607 | .store_attribute = o2hb_heartbeat_group_store, | ||
1608 | }; | ||
1609 | |||
1610 | static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { | ||
1611 | .make_item = o2hb_heartbeat_group_make_item, | ||
1612 | .drop_item = o2hb_heartbeat_group_drop_item, | ||
1613 | }; | ||
1614 | |||
1615 | static struct config_item_type o2hb_heartbeat_group_type = { | ||
1616 | .ct_group_ops = &o2hb_heartbeat_group_group_ops, | ||
1617 | .ct_item_ops = &o2hb_heartbeat_group_item_ops, | ||
1618 | .ct_attrs = o2hb_heartbeat_group_attrs, | ||
1619 | .ct_owner = THIS_MODULE, | ||
1620 | }; | ||
1621 | |||
1622 | /* this is just here to avoid touching group in heartbeat.h which the | ||
1623 | * entire damn world #includes */ | ||
1624 | struct config_group *o2hb_alloc_hb_set(void) | ||
1625 | { | ||
1626 | struct o2hb_heartbeat_group *hs = NULL; | ||
1627 | struct config_group *ret = NULL; | ||
1628 | |||
1629 | hs = kcalloc(1, sizeof(struct o2hb_heartbeat_group), GFP_KERNEL); | ||
1630 | if (hs == NULL) | ||
1631 | goto out; | ||
1632 | |||
1633 | config_group_init_type_name(&hs->hs_group, "heartbeat", | ||
1634 | &o2hb_heartbeat_group_type); | ||
1635 | |||
1636 | ret = &hs->hs_group; | ||
1637 | out: | ||
1638 | if (ret == NULL) | ||
1639 | kfree(hs); | ||
1640 | return ret; | ||
1641 | } | ||
1642 | |||
1643 | void o2hb_free_hb_set(struct config_group *group) | ||
1644 | { | ||
1645 | struct o2hb_heartbeat_group *hs = to_o2hb_heartbeat_group(group); | ||
1646 | kfree(hs); | ||
1647 | } | ||
1648 | |||
1649 | /* hb callback registration and issuing */ | ||
1650 | |||
1651 | static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type) | ||
1652 | { | ||
1653 | if (type == O2HB_NUM_CB) | ||
1654 | return ERR_PTR(-EINVAL); | ||
1655 | |||
1656 | return &o2hb_callbacks[type]; | ||
1657 | } | ||
1658 | |||
1659 | void o2hb_setup_callback(struct o2hb_callback_func *hc, | ||
1660 | enum o2hb_callback_type type, | ||
1661 | o2hb_cb_func *func, | ||
1662 | void *data, | ||
1663 | int priority) | ||
1664 | { | ||
1665 | INIT_LIST_HEAD(&hc->hc_item); | ||
1666 | hc->hc_func = func; | ||
1667 | hc->hc_data = data; | ||
1668 | hc->hc_priority = priority; | ||
1669 | hc->hc_type = type; | ||
1670 | hc->hc_magic = O2HB_CB_MAGIC; | ||
1671 | } | ||
1672 | EXPORT_SYMBOL_GPL(o2hb_setup_callback); | ||
1673 | |||
1674 | int o2hb_register_callback(struct o2hb_callback_func *hc) | ||
1675 | { | ||
1676 | struct o2hb_callback_func *tmp; | ||
1677 | struct list_head *iter; | ||
1678 | struct o2hb_callback *hbcall; | ||
1679 | int ret; | ||
1680 | |||
1681 | BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); | ||
1682 | BUG_ON(!list_empty(&hc->hc_item)); | ||
1683 | |||
1684 | hbcall = hbcall_from_type(hc->hc_type); | ||
1685 | if (IS_ERR(hbcall)) { | ||
1686 | ret = PTR_ERR(hbcall); | ||
1687 | goto out; | ||
1688 | } | ||
1689 | |||
1690 | down_write(&o2hb_callback_sem); | ||
1691 | |||
1692 | list_for_each(iter, &hbcall->list) { | ||
1693 | tmp = list_entry(iter, struct o2hb_callback_func, hc_item); | ||
1694 | if (hc->hc_priority < tmp->hc_priority) { | ||
1695 | list_add_tail(&hc->hc_item, iter); | ||
1696 | break; | ||
1697 | } | ||
1698 | } | ||
1699 | if (list_empty(&hc->hc_item)) | ||
1700 | list_add_tail(&hc->hc_item, &hbcall->list); | ||
1701 | |||
1702 | up_write(&o2hb_callback_sem); | ||
1703 | ret = 0; | ||
1704 | out: | ||
1705 | mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n", | ||
1706 | ret, __builtin_return_address(0), hc); | ||
1707 | return ret; | ||
1708 | } | ||
1709 | EXPORT_SYMBOL_GPL(o2hb_register_callback); | ||
1710 | |||
1711 | int o2hb_unregister_callback(struct o2hb_callback_func *hc) | ||
1712 | { | ||
1713 | BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); | ||
1714 | |||
1715 | mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", | ||
1716 | __builtin_return_address(0), hc); | ||
1717 | |||
1718 | if (list_empty(&hc->hc_item)) | ||
1719 | return 0; | ||
1720 | |||
1721 | down_write(&o2hb_callback_sem); | ||
1722 | |||
1723 | list_del_init(&hc->hc_item); | ||
1724 | |||
1725 | up_write(&o2hb_callback_sem); | ||
1726 | |||
1727 | return 0; | ||
1728 | } | ||
1729 | EXPORT_SYMBOL_GPL(o2hb_unregister_callback); | ||
1730 | |||
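A hedged sketch of a consumer of the callback API above (all example_* names are hypothetical):

static void example_node_down(struct o2nm_node *node, int node_num,
                              void *data)
{
        mlog(ML_NOTICE, "node %d went down\n", node_num);
}

static struct o2hb_callback_func example_hc;

static int example_start(void)
{
        /* lower hc_priority values sort earlier in the callback list */
        o2hb_setup_callback(&example_hc, O2HB_NODE_DOWN_CB,
                            example_node_down, NULL, 0);
        return o2hb_register_callback(&example_hc);
}

static void example_stop(void)
{
        o2hb_unregister_callback(&example_hc);
}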
1731 | int o2hb_check_node_heartbeating(u8 node_num) | ||
1732 | { | ||
1733 | unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
1734 | |||
1735 | o2hb_fill_node_map(testing_map, sizeof(testing_map)); | ||
1736 | if (!test_bit(node_num, testing_map)) { | ||
1737 | mlog(ML_HEARTBEAT, | ||
1738 | "node (%u) does not have heartbeating enabled.\n", | ||
1739 | node_num); | ||
1740 | return 0; | ||
1741 | } | ||
1742 | |||
1743 | return 1; | ||
1744 | } | ||
1745 | EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); | ||
1746 | |||
1747 | int o2hb_check_node_heartbeating_from_callback(u8 node_num) | ||
1748 | { | ||
1749 | unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
1750 | |||
1751 | o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); | ||
1752 | if (!test_bit(node_num, testing_map)) { | ||
1753 | mlog(ML_HEARTBEAT, | ||
1754 | "node (%u) does not have heartbeating enabled.\n", | ||
1755 | node_num); | ||
1756 | return 0; | ||
1757 | } | ||
1758 | |||
1759 | return 1; | ||
1760 | } | ||
1761 | EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); | ||
1762 | |||
1763 | /* Makes sure our local node is configured with a node number, and is | ||
1764 | * heartbeating. */ | ||
1765 | int o2hb_check_local_node_heartbeating(void) | ||
1766 | { | ||
1767 | u8 node_num; | ||
1768 | |||
1769 | /* if this node was set then we have networking */ | ||
1770 | node_num = o2nm_this_node(); | ||
1771 | if (node_num == O2NM_MAX_NODES) { | ||
1772 | mlog(ML_HEARTBEAT, "this node has not been configured.\n"); | ||
1773 | return 0; | ||
1774 | } | ||
1775 | |||
1776 | return o2hb_check_node_heartbeating(node_num); | ||
1777 | } | ||
1778 | EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating); | ||
1779 | |||
1780 | /* | ||
1781 | * this is just a hack until we get the plumbing which flips file systems | ||
1782 | * read only and drops the hb ref instead of killing the node dead. | ||
1783 | */ | ||
1784 | void o2hb_stop_all_regions(void) | ||
1785 | { | ||
1786 | struct o2hb_region *reg; | ||
1787 | |||
1788 | mlog(ML_ERROR, "stopping heartbeat on all active regions.\n"); | ||
1789 | |||
1790 | spin_lock(&o2hb_live_lock); | ||
1791 | |||
1792 | list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) | ||
1793 | reg->hr_unclean_stop = 1; | ||
1794 | |||
1795 | spin_unlock(&o2hb_live_lock); | ||
1796 | } | ||
1797 | EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); | ||
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h new file mode 100644 index 000000000000..cac6223206a9 --- /dev/null +++ b/fs/ocfs2/cluster/heartbeat.h | |||
@@ -0,0 +1,82 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * heartbeat.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | #ifndef O2CLUSTER_HEARTBEAT_H | ||
28 | #define O2CLUSTER_HEARTBEAT_H | ||
29 | |||
30 | #include "ocfs2_heartbeat.h" | ||
31 | |||
32 | #define O2HB_REGION_TIMEOUT_MS 2000 | ||
33 | |||
34 | /* number of changes to be seen as live */ | ||
35 | #define O2HB_LIVE_THRESHOLD 2 | ||
36 | /* number of equal samples to be seen as dead */ | ||
37 | extern unsigned int o2hb_dead_threshold; | ||
38 | #define O2HB_DEFAULT_DEAD_THRESHOLD 7 | ||
39 | /* Otherwise MAX_WRITE_TIMEOUT will be zero... */ | ||
40 | #define O2HB_MIN_DEAD_THRESHOLD 2 | ||
41 | #define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1)) | ||
42 | |||
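/* With the defaults this works out to 2000 ms * (7 - 1) = 12000 ms;
 * the floor of O2HB_MIN_DEAD_THRESHOLD = 2 keeps the write timeout
 * from collapsing below 2000 ms. */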
43 | #define O2HB_CB_MAGIC 0x51d1e4ec | ||
44 | |||
45 | /* callback stuff */ | ||
46 | enum o2hb_callback_type { | ||
47 | O2HB_NODE_DOWN_CB = 0, | ||
48 | O2HB_NODE_UP_CB, | ||
49 | O2HB_NUM_CB | ||
50 | }; | ||
51 | |||
52 | struct o2nm_node; | ||
53 | typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *); | ||
54 | |||
55 | struct o2hb_callback_func { | ||
56 | u32 hc_magic; | ||
57 | struct list_head hc_item; | ||
58 | o2hb_cb_func *hc_func; | ||
59 | void *hc_data; | ||
60 | int hc_priority; | ||
61 | enum o2hb_callback_type hc_type; | ||
62 | }; | ||
63 | |||
64 | struct config_group *o2hb_alloc_hb_set(void); | ||
65 | void o2hb_free_hb_set(struct config_group *group); | ||
66 | |||
67 | void o2hb_setup_callback(struct o2hb_callback_func *hc, | ||
68 | enum o2hb_callback_type type, | ||
69 | o2hb_cb_func *func, | ||
70 | void *data, | ||
71 | int priority); | ||
72 | int o2hb_register_callback(struct o2hb_callback_func *hc); | ||
73 | int o2hb_unregister_callback(struct o2hb_callback_func *hc); | ||
74 | void o2hb_fill_node_map(unsigned long *map, | ||
75 | unsigned bytes); | ||
76 | void o2hb_init(void); | ||
77 | int o2hb_check_node_heartbeating(u8 node_num); | ||
78 | int o2hb_check_node_heartbeating_from_callback(u8 node_num); | ||
79 | int o2hb_check_local_node_heartbeating(void); | ||
80 | void o2hb_stop_all_regions(void); | ||
81 | |||
82 | #endif /* O2CLUSTER_HEARTBEAT_H */ | ||
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c new file mode 100644 index 000000000000..fd741cea5705 --- /dev/null +++ b/fs/ocfs2/cluster/masklog.c | |||
@@ -0,0 +1,166 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * Copyright (C) 2004, 2005 Oracle. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 021110-1307, USA. | ||
20 | */ | ||
21 | |||
22 | #include <linux/module.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/proc_fs.h> | ||
25 | #include <linux/seq_file.h> | ||
26 | #include <linux/string.h> | ||
27 | #include <asm/uaccess.h> | ||
28 | |||
29 | #include "masklog.h" | ||
30 | |||
31 | struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK); | ||
32 | EXPORT_SYMBOL_GPL(mlog_and_bits); | ||
33 | struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK); | ||
34 | EXPORT_SYMBOL_GPL(mlog_not_bits); | ||
35 | |||
36 | static ssize_t mlog_mask_show(u64 mask, char *buf) | ||
37 | { | ||
38 | char *state; | ||
39 | |||
40 | if (__mlog_test_u64(mask, mlog_and_bits)) | ||
41 | state = "allow"; | ||
42 | else if (__mlog_test_u64(mask, mlog_not_bits)) | ||
43 | state = "deny"; | ||
44 | else | ||
45 | state = "off"; | ||
46 | |||
47 | return snprintf(buf, PAGE_SIZE, "%s\n", state); | ||
48 | } | ||
49 | |||
50 | static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) | ||
51 | { | ||
52 | if (!strnicmp(buf, "allow", 5)) { | ||
53 | __mlog_set_u64(mask, mlog_and_bits); | ||
54 | __mlog_clear_u64(mask, mlog_not_bits); | ||
55 | } else if (!strnicmp(buf, "deny", 4)) { | ||
56 | __mlog_set_u64(mask, mlog_not_bits); | ||
57 | __mlog_clear_u64(mask, mlog_and_bits); | ||
58 | } else if (!strnicmp(buf, "off", 3)) { | ||
59 | __mlog_clear_u64(mask, mlog_not_bits); | ||
60 | __mlog_clear_u64(mask, mlog_and_bits); | ||
61 | } else | ||
62 | return -EINVAL; | ||
63 | |||
64 | return count; | ||
65 | } | ||
66 | |||
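/* The tri-state above feeds the mlog() macro in masklog.h: a message
 * is emitted only when its mask overlaps the allow bits and misses the
 * deny bits. With HEARTBEAT allowed and ENTRY denied, for example,
 * mlog(ML_HEARTBEAT, ...) prints while mlog(ML_HEARTBEAT | ML_ENTRY,
 * ...) is suppressed. */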
67 | struct mlog_attribute { | ||
68 | struct attribute attr; | ||
69 | u64 mask; | ||
70 | }; | ||
71 | |||
72 | #define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr) | ||
73 | |||
74 | #define define_mask(_name) { \ | ||
75 | .attr = { \ | ||
76 | .name = #_name, \ | ||
77 | .mode = S_IRUGO | S_IWUSR, \ | ||
78 | }, \ | ||
79 | .mask = ML_##_name, \ | ||
80 | } | ||
81 | |||
82 | static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { | ||
83 | define_mask(ENTRY), | ||
84 | define_mask(EXIT), | ||
85 | define_mask(TCP), | ||
86 | define_mask(MSG), | ||
87 | define_mask(SOCKET), | ||
88 | define_mask(HEARTBEAT), | ||
89 | define_mask(HB_BIO), | ||
90 | define_mask(DLMFS), | ||
91 | define_mask(DLM), | ||
92 | define_mask(DLM_DOMAIN), | ||
93 | define_mask(DLM_THREAD), | ||
94 | define_mask(DLM_MASTER), | ||
95 | define_mask(DLM_RECOVERY), | ||
96 | define_mask(AIO), | ||
97 | define_mask(JOURNAL), | ||
98 | define_mask(DISK_ALLOC), | ||
99 | define_mask(SUPER), | ||
100 | define_mask(FILE_IO), | ||
101 | define_mask(EXTENT_MAP), | ||
102 | define_mask(DLM_GLUE), | ||
103 | define_mask(BH_IO), | ||
104 | define_mask(UPTODATE), | ||
105 | define_mask(NAMEI), | ||
106 | define_mask(INODE), | ||
107 | define_mask(VOTE), | ||
108 | define_mask(DCACHE), | ||
109 | define_mask(CONN), | ||
110 | define_mask(QUORUM), | ||
111 | define_mask(EXPORT), | ||
112 | define_mask(ERROR), | ||
113 | define_mask(NOTICE), | ||
114 | define_mask(KTHREAD), | ||
115 | }; | ||
116 | |||
117 | static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; | ||
118 | |||
119 | static ssize_t mlog_show(struct kobject *obj, struct attribute *attr, | ||
120 | char *buf) | ||
121 | { | ||
122 | struct mlog_attribute *mlog_attr = to_mlog_attr(attr); | ||
123 | |||
124 | return mlog_mask_show(mlog_attr->mask, buf); | ||
125 | } | ||
126 | |||
127 | static ssize_t mlog_store(struct kobject *obj, struct attribute *attr, | ||
128 | const char *buf, size_t count) | ||
129 | { | ||
130 | struct mlog_attribute *mlog_attr = to_mlog_attr(attr); | ||
131 | |||
132 | return mlog_mask_store(mlog_attr->mask, buf, count); | ||
133 | } | ||
134 | |||
135 | static struct sysfs_ops mlog_attr_ops = { | ||
136 | .show = mlog_show, | ||
137 | .store = mlog_store, | ||
138 | }; | ||
139 | |||
140 | static struct kobj_type mlog_ktype = { | ||
141 | .default_attrs = mlog_attr_ptrs, | ||
142 | .sysfs_ops = &mlog_attr_ops, | ||
143 | }; | ||
144 | |||
145 | static struct kset mlog_kset = { | ||
146 | .kobj = {.name = "logmask", .ktype = &mlog_ktype}, | ||
147 | }; | ||
148 | |||
149 | int mlog_sys_init(struct subsystem *o2cb_subsys) | ||
150 | { | ||
151 | int i = 0; | ||
152 | |||
153 | while (mlog_attrs[i].attr.mode) { | ||
154 | mlog_attr_ptrs[i] = &mlog_attrs[i].attr; | ||
155 | i++; | ||
156 | } | ||
157 | mlog_attr_ptrs[i] = NULL; | ||
158 | |||
159 | mlog_kset.subsys = o2cb_subsys; | ||
160 | return kset_register(&mlog_kset); | ||
161 | } | ||
162 | |||
163 | void mlog_sys_shutdown(void) | ||
164 | { | ||
165 | kset_unregister(&mlog_kset); | ||
166 | } | ||
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h new file mode 100644 index 000000000000..f5ef5ea61a05 --- /dev/null +++ b/fs/ocfs2/cluster/masklog.h | |||
@@ -0,0 +1,275 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * Copyright (C) 2005 Oracle. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 021110-1307, USA. | ||
20 | */ | ||
21 | |||
22 | #ifndef O2CLUSTER_MASKLOG_H | ||
23 | #define O2CLUSTER_MASKLOG_H | ||
24 | |||
25 | /* | ||
26 | * For now this is a trivial wrapper around printk() that gives the critical | ||
27 | * ability to enable sets of debugging output at run-time. In the future this | ||
28 | * will almost certainly be redirected to relayfs so that it can pay a | ||
29 | * substantially lower heisenberg tax. | ||
30 | * | ||
31 | * Callers associate the message with a bitmask and a global bitmask is | ||
32 | * maintained with help from /proc. If any of the bits match the message is | ||
33 | * output. | ||
34 | * | ||
35 | * We must have efficient bit tests on i386 and it seems gcc still emits crazy | ||
36 | * code for the 64bit compare. It emits very good code for the dual unsigned | ||
37 | * long tests, though, completely avoiding tests that can never pass if the | ||
38 | * caller gives a constant bitmask that fills one of the longs with all 0s. So | ||
39 | * the desire is to have almost all of the calls decided on by comparing just | ||
40 | * one of the longs. This leads to having infrequently given bits that are | ||
41 | * frequently matched in the high bits. | ||
42 | * | ||
43 | * _ERROR and _NOTICE are used for messages that always go to the console and | ||
44 | * have appropriate KERN_ prefixes. We wrap these in our function instead of | ||
45 | * just calling printk() so that this can eventually make its way through | ||
46 | * relayfs along with the debugging messages. Everything else gets KERN_DEBUG. | ||
47 | * The inline tests and macro dance give GCC the opportunity to quite cleverly | ||
48 | * only emit the appropriate printk() when the caller passes in a constant | ||
49 | * mask, as is almost always the case. | ||
50 | * | ||
51 | * All this bitmask nonsense is hidden from the /proc interface so that Joel | ||
52 | * doesn't have an aneurysm. Reading the file gives a straightforward | ||
53 | * indication of which bits are on or off: | ||
54 | * ENTRY off | ||
55 | * EXIT off | ||
56 | * TCP off | ||
57 | * MSG off | ||
58 | * SOCKET off | ||
59 | * ERROR off | ||
60 | * NOTICE on | ||
61 | * | ||
62 | * Writing changes the state of a given bit and requires a strictly formatted | ||
63 | * single write() call: | ||
64 | * | ||
65 | * write(fd, "ENTRY on", 8); | ||
66 | * | ||
67 | * would turn the entry bit on. "1" is also accepted in the place of "on", and | ||
68 | * "off" and "0" behave as expected. | ||
69 | * | ||
70 | * Some trivial shell can flip all the bits on or off: | ||
71 | * | ||
72 | * log_mask="/proc/fs/ocfs2_nodemanager/log_mask" | ||
73 | * cat $log_mask | ( | ||
74 | * while read bit status; do | ||
75 | * # $1 is "on" or "off", say | ||
76 | * echo "$bit $1" > $log_mask | ||
77 | * done | ||
78 | * ) | ||
79 | */ | ||
80 | |||
81 | /* for task_struct */ | ||
82 | #include <linux/sched.h> | ||
83 | |||
84 | /* bits that are frequently given and infrequently matched in the low word */ | ||
85 | /* NOTE: If you add a flag, you need to also update mlog.c! */ | ||
86 | #define ML_ENTRY 0x0000000000000001ULL /* func call entry */ | ||
87 | #define ML_EXIT 0x0000000000000002ULL /* func call exit */ | ||
88 | #define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */ | ||
89 | #define ML_MSG 0x0000000000000008ULL /* net network messages */ | ||
90 | #define ML_SOCKET 0x0000000000000010ULL /* net socket lifetime */ | ||
91 | #define ML_HEARTBEAT 0x0000000000000020ULL /* hb all heartbeat tracking */ | ||
92 | #define ML_HB_BIO 0x0000000000000040ULL /* hb io tracing */ | ||
93 | #define ML_DLMFS 0x0000000000000080ULL /* dlm user dlmfs */ | ||
94 | #define ML_DLM 0x0000000000000100ULL /* dlm general debugging */ | ||
95 | #define ML_DLM_DOMAIN 0x0000000000000200ULL /* dlm domain debugging */ | ||
96 | #define ML_DLM_THREAD 0x0000000000000400ULL /* dlm domain thread */ | ||
97 | #define ML_DLM_MASTER 0x0000000000000800ULL /* dlm master functions */ | ||
98 | #define ML_DLM_RECOVERY 0x0000000000001000ULL /* dlm recovery functions */ | ||
99 | #define ML_AIO 0x0000000000002000ULL /* ocfs2 aio read and write */ | ||
100 | #define ML_JOURNAL 0x0000000000004000ULL /* ocfs2 journalling functions */ | ||
101 | #define ML_DISK_ALLOC 0x0000000000008000ULL /* ocfs2 disk allocation */ | ||
102 | #define ML_SUPER 0x0000000000010000ULL /* ocfs2 mount / umount */ | ||
103 | #define ML_FILE_IO 0x0000000000020000ULL /* ocfs2 file I/O */ | ||
104 | #define ML_EXTENT_MAP 0x0000000000040000ULL /* ocfs2 extent map caching */ | ||
105 | #define ML_DLM_GLUE 0x0000000000080000ULL /* ocfs2 dlm glue layer */ | ||
106 | #define ML_BH_IO 0x0000000000100000ULL /* ocfs2 buffer I/O */ | ||
107 | #define ML_UPTODATE 0x0000000000200000ULL /* ocfs2 caching sequence #'s */ | ||
108 | #define ML_NAMEI 0x0000000000400000ULL /* ocfs2 directory / namespace */ | ||
109 | #define ML_INODE 0x0000000000800000ULL /* ocfs2 inode manipulation */ | ||
110 | #define ML_VOTE 0x0000000001000000ULL /* ocfs2 node messaging */ | ||
111 | #define ML_DCACHE 0x0000000002000000ULL /* ocfs2 dcache operations */ | ||
112 | #define ML_CONN 0x0000000004000000ULL /* net connection management */ | ||
113 | #define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ | ||
114 | #define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ | ||
115 | /* bits that are infrequently given and frequently matched in the high word */ | ||
116 | #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ | ||
117 | #define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ | ||
118 | #define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ | ||
119 | |||
120 | #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) | ||
121 | #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) | ||
122 | #ifndef MLOG_MASK_PREFIX | ||
123 | #define MLOG_MASK_PREFIX 0 | ||
124 | #endif | ||
125 | |||
126 | #define MLOG_MAX_BITS 64 | ||
127 | |||
128 | struct mlog_bits { | ||
129 | unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG]; | ||
130 | }; | ||
131 | |||
132 | extern struct mlog_bits mlog_and_bits, mlog_not_bits; | ||
133 | |||
134 | #if BITS_PER_LONG == 32 | ||
135 | |||
136 | #define __mlog_test_u64(mask, bits) \ | ||
137 | ( (u32)(mask & 0xffffffff) & bits.words[0] || \ | ||
138 | ((u64)(mask) >> 32) & bits.words[1] ) | ||
139 | #define __mlog_set_u64(mask, bits) do { \ | ||
140 | bits.words[0] |= (u32)(mask & 0xffffffff); \ | ||
141 | bits.words[1] |= (u64)(mask) >> 32; \ | ||
142 | } while (0) | ||
143 | #define __mlog_clear_u64(mask, bits) do { \ | ||
144 | bits.words[0] &= ~((u32)(mask & 0xffffffff)); \ | ||
145 | bits.words[1] &= ~((u64)(mask) >> 32); \ | ||
146 | } while (0) | ||
147 | #define MLOG_BITS_RHS(mask) { \ | ||
148 | { \ | ||
149 | [0] = (u32)(mask & 0xffffffff), \ | ||
150 | [1] = (u64)(mask) >> 32, \ | ||
151 | } \ | ||
152 | } | ||
153 | |||
154 | #else /* 32bit long above, 64bit long below */ | ||
155 | |||
156 | #define __mlog_test_u64(mask, bits) ((mask) & bits.words[0]) | ||
157 | #define __mlog_set_u64(mask, bits) do { \ | ||
158 | bits.words[0] |= (mask); \ | ||
159 | } while (0) | ||
160 | #define __mlog_clear_u64(mask, bits) do { \ | ||
161 | bits.words[0] &= ~(mask); \ | ||
162 | } while (0) | ||
163 | #define MLOG_BITS_RHS(mask) { { (mask) } } | ||
164 | |||
165 | #endif | ||
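
The 32-bit variants split a u64 mask across two longs. A plain-C
restatement of __mlog_test_u64 (a sketch only; the assumed layout is the
low half in words[0] and the high half in words[1], matching
MLOG_BITS_RHS above):

	static int mlog_test_u64_sketch(unsigned long long mask,
					const unsigned long words[2])
	{
		/* a bit matches if it is set in either half of the mask */
		return ((unsigned long)(mask & 0xffffffffULL) & words[0]) ||
		       ((unsigned long)(mask >> 32) & words[1]);
	}
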
166 | |||
167 | /* | ||
168 | * smp_processor_id() "helpfully" screams when called outside preemptible | ||
169 | * regions in current kernels. sles doesn't have the variants that don't | ||
170 | * scream. just do this instead of trying to guess which we're building | ||
171 | * against.. *sigh*. | ||
172 | */ | ||
173 | #define __mlog_cpu_guess ({ \ | ||
174 | unsigned long _cpu = get_cpu(); \ | ||
175 | put_cpu(); \ | ||
176 | _cpu; \ | ||
177 | }) | ||
178 | |||
179 | /* In the following two macros, the whitespace after the ',' just | ||
180 | * before ##args is intentional. Otherwise, gcc 2.95 will eat the | ||
181 | * previous token if args expands to nothing. | ||
182 | */ | ||
183 | #define __mlog_printk(level, fmt, args...) \ | ||
184 | printk(level "(%u,%lu):%s:%d " fmt, current->pid, \ | ||
185 | __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \ | ||
186 | ##args) | ||
187 | |||
188 | #define mlog(mask, fmt, args...) do { \ | ||
189 | u64 __m = MLOG_MASK_PREFIX | (mask); \ | ||
190 | if (__mlog_test_u64(__m, mlog_and_bits) && \ | ||
191 | !__mlog_test_u64(__m, mlog_not_bits)) { \ | ||
192 | if (__m & ML_ERROR) \ | ||
193 | __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ | ||
194 | else if (__m & ML_NOTICE) \ | ||
195 | __mlog_printk(KERN_NOTICE, fmt , ##args); \ | ||
196 | else __mlog_printk(KERN_INFO, fmt , ##args); \ | ||
197 | } \ | ||
198 | } while (0) | ||
199 | |||
200 | #define mlog_errno(st) do { \ | ||
201 | int _st = (st); \ | ||
202 | if (_st != -ERESTARTSYS && _st != -EINTR && \ | ||
203 | _st != AOP_TRUNCATED_PAGE) \ | ||
204 | mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ | ||
205 | } while (0) | ||
206 | |||
207 | #define mlog_entry(fmt, args...) do { \ | ||
208 | mlog(ML_ENTRY, "ENTRY:" fmt , ##args); \ | ||
209 | } while (0) | ||
210 | |||
211 | #define mlog_entry_void() do { \ | ||
212 | mlog(ML_ENTRY, "ENTRY:\n"); \ | ||
213 | } while (0) | ||
214 | |||
215 | /* We disable this for old compilers since they don't have support for | ||
216 | * __builtin_types_compatible_p. | ||
217 | */ | ||
218 | #if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) && \ | ||
219 | !defined(__CHECKER__) | ||
220 | #define mlog_exit(st) do { \ | ||
221 | if (__builtin_types_compatible_p(typeof(st), unsigned long)) \ | ||
222 | mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st)); \ | ||
223 | else if (__builtin_types_compatible_p(typeof(st), signed long)) \ | ||
224 | mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st)); \ | ||
225 | else if (__builtin_types_compatible_p(typeof(st), unsigned int) \ | ||
226 | || __builtin_types_compatible_p(typeof(st), unsigned short) \ | ||
227 | || __builtin_types_compatible_p(typeof(st), unsigned char)) \ | ||
228 | mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st)); \ | ||
229 | else if (__builtin_types_compatible_p(typeof(st), signed int) \ | ||
230 | || __builtin_types_compatible_p(typeof(st), signed short) \ | ||
231 | || __builtin_types_compatible_p(typeof(st), signed char)) \ | ||
232 | mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st)); \ | ||
233 | else if (__builtin_types_compatible_p(typeof(st), long long)) \ | ||
234 | mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \ | ||
235 | else \ | ||
236 | mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st)); \ | ||
237 | } while (0) | ||
238 | #else | ||
239 | #define mlog_exit(st) do { \ | ||
240 | mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \ | ||
241 | } while (0) | ||
242 | #endif | ||
243 | |||
244 | #define mlog_exit_ptr(ptr) do { \ | ||
245 | mlog(ML_EXIT, "EXIT: %p\n", ptr); \ | ||
246 | } while (0) | ||
247 | |||
248 | #define mlog_exit_void() do { \ | ||
249 | mlog(ML_EXIT, "EXIT\n"); \ | ||
250 | } while (0) | ||
251 | |||
252 | #define mlog_bug_on_msg(cond, fmt, args...) do { \ | ||
253 | if (cond) { \ | ||
254 | mlog(ML_ERROR, "bug expression: " #cond "\n"); \ | ||
255 | mlog(ML_ERROR, fmt, ##args); \ | ||
256 | BUG(); \ | ||
257 | } \ | ||
258 | } while (0) | ||
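
Putting the pieces together, a typical caller looks something like the
sketch below; ocfs2_demo_read is hypothetical and not part of this patch,
but the MLOG_MASK_PREFIX-before-include pattern matches the real callers
elsewhere in this series:

	#define MLOG_MASK_PREFIX ML_FILE_IO
	#include "masklog.h"

	static int ocfs2_demo_read(int block)
	{
		int status = 0;

		mlog_entry("(block = %d)\n", block);

		if (block < 0) {
			status = -EIO;
			/* logged at ML_ERROR unless the errno is benign */
			mlog_errno(status);
		}

		mlog_exit(status);
		return status;
	}
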
259 | |||
260 | #if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64) | ||
261 | #define MLFi64 "lld" | ||
262 | #define MLFu64 "llu" | ||
263 | #define MLFx64 "llx" | ||
264 | #else | ||
265 | #define MLFi64 "ld" | ||
266 | #define MLFu64 "lu" | ||
267 | #define MLFx64 "lx" | ||
268 | #endif | ||
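
The MLF* specifiers exist because the underlying base type of u64 differs
between architectures; a one-line usage sketch (blkno is a hypothetical
u64 variable):

	mlog(ML_BH_IO, "reading block %"MLFu64"\n", blkno);
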
269 | |||
270 | #include <linux/kobject.h> | ||
271 | #include <linux/sysfs.h> | ||
272 | int mlog_sys_init(struct subsystem *o2cb_subsys); | ||
273 | void mlog_sys_shutdown(void); | ||
274 | |||
275 | #endif /* O2CLUSTER_MASKLOG_H */ | ||
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c new file mode 100644 index 000000000000..5fd60c105913 --- /dev/null +++ b/fs/ocfs2/cluster/nodemanager.c | |||
@@ -0,0 +1,791 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * Copyright (C) 2004, 2005 Oracle. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 021110-1307, USA. | ||
20 | */ | ||
21 | |||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/sysctl.h> | ||
25 | #include <linux/configfs.h> | ||
26 | |||
27 | #include "endian.h" | ||
28 | #include "tcp.h" | ||
29 | #include "nodemanager.h" | ||
30 | #include "heartbeat.h" | ||
31 | #include "masklog.h" | ||
32 | #include "sys.h" | ||
33 | #include "ver.h" | ||
34 | |||
35 | /* for now we operate under the assertion that there can be only one | ||
36 | * cluster active at a time. Changing this will require trickling | ||
37 | * cluster references throughout where nodes are looked up */ | ||
38 | static struct o2nm_cluster *o2nm_single_cluster = NULL; | ||
39 | |||
40 | #define OCFS2_MAX_HB_CTL_PATH 256 | ||
41 | static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; | ||
42 | |||
43 | static ctl_table ocfs2_nm_table[] = { | ||
44 | { | ||
45 | .ctl_name = 1, | ||
46 | .procname = "hb_ctl_path", | ||
47 | .data = ocfs2_hb_ctl_path, | ||
48 | .maxlen = OCFS2_MAX_HB_CTL_PATH, | ||
49 | .mode = 0644, | ||
50 | .proc_handler = &proc_dostring, | ||
51 | .strategy = &sysctl_string, | ||
52 | }, | ||
53 | { .ctl_name = 0 } | ||
54 | }; | ||
55 | |||
56 | static ctl_table ocfs2_mod_table[] = { | ||
57 | { | ||
58 | .ctl_name = KERN_OCFS2_NM, | ||
59 | .procname = "nm", | ||
60 | .data = NULL, | ||
61 | .maxlen = 0, | ||
62 | .mode = 0555, | ||
63 | .child = ocfs2_nm_table | ||
64 | }, | ||
65 | { .ctl_name = 0} | ||
66 | }; | ||
67 | |||
68 | static ctl_table ocfs2_kern_table[] = { | ||
69 | { | ||
70 | .ctl_name = KERN_OCFS2, | ||
71 | .procname = "ocfs2", | ||
72 | .data = NULL, | ||
73 | .maxlen = 0, | ||
74 | .mode = 0555, | ||
75 | .child = ocfs2_mod_table | ||
76 | }, | ||
77 | { .ctl_name = 0} | ||
78 | }; | ||
79 | |||
80 | static ctl_table ocfs2_root_table[] = { | ||
81 | { | ||
82 | .ctl_name = CTL_FS, | ||
83 | .procname = "fs", | ||
84 | .data = NULL, | ||
85 | .maxlen = 0, | ||
86 | .mode = 0555, | ||
87 | .child = ocfs2_kern_table | ||
88 | }, | ||
89 | { .ctl_name = 0 } | ||
90 | }; | ||
91 | |||
92 | static struct ctl_table_header *ocfs2_table_header = NULL; | ||
93 | |||
94 | const char *o2nm_get_hb_ctl_path(void) | ||
95 | { | ||
96 | return ocfs2_hb_ctl_path; | ||
97 | } | ||
98 | EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path); | ||
99 | |||
100 | struct o2nm_cluster { | ||
101 | struct config_group cl_group; | ||
102 | unsigned cl_has_local:1; | ||
103 | u8 cl_local_node; | ||
104 | rwlock_t cl_nodes_lock; | ||
105 | struct o2nm_node *cl_nodes[O2NM_MAX_NODES]; | ||
106 | struct rb_root cl_node_ip_tree; | ||
107 | /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */ | ||
108 | unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
109 | }; | ||
110 | |||
111 | struct o2nm_node *o2nm_get_node_by_num(u8 node_num) | ||
112 | { | ||
113 | struct o2nm_node *node = NULL; | ||
114 | |||
115 | if (node_num >= O2NM_MAX_NODES || o2nm_single_cluster == NULL) | ||
116 | goto out; | ||
117 | |||
118 | read_lock(&o2nm_single_cluster->cl_nodes_lock); | ||
119 | node = o2nm_single_cluster->cl_nodes[node_num]; | ||
120 | if (node) | ||
121 | config_item_get(&node->nd_item); | ||
122 | read_unlock(&o2nm_single_cluster->cl_nodes_lock); | ||
123 | out: | ||
124 | return node; | ||
125 | } | ||
126 | EXPORT_SYMBOL_GPL(o2nm_get_node_by_num); | ||
127 | |||
128 | int o2nm_configured_node_map(unsigned long *map, unsigned bytes) | ||
129 | { | ||
130 | struct o2nm_cluster *cluster = o2nm_single_cluster; | ||
131 | |||
132 | BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap))); | ||
133 | |||
134 | if (cluster == NULL) | ||
135 | return -EINVAL; | ||
136 | |||
137 | read_lock(&cluster->cl_nodes_lock); | ||
138 | memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap)); | ||
139 | read_unlock(&cluster->cl_nodes_lock); | ||
140 | |||
141 | return 0; | ||
142 | } | ||
143 | EXPORT_SYMBOL_GPL(o2nm_configured_node_map); | ||
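
A hedged caller-side sketch of taking a snapshot of the configured-node
map (the caller must supply a buffer at least as large as the cluster
bitmap, per the BUG_ON above):

	static void demo_walk_nodes(void)
	{
		unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];

		if (o2nm_configured_node_map(map, sizeof(map)))
			return;	/* no cluster configured yet */
		/* map is now a private snapshot; walk it with test_bit() */
	}
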
144 | |||
145 | static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster, | ||
146 | __be32 ip_needle, | ||
147 | struct rb_node ***ret_p, | ||
148 | struct rb_node **ret_parent) | ||
149 | { | ||
150 | struct rb_node **p = &cluster->cl_node_ip_tree.rb_node; | ||
151 | struct rb_node *parent = NULL; | ||
152 | struct o2nm_node *node, *ret = NULL; | ||
153 | |||
154 | while (*p) { | ||
155 | parent = *p; | ||
156 | node = rb_entry(parent, struct o2nm_node, nd_ip_node); | ||
157 | |||
158 | if (memcmp(&ip_needle, &node->nd_ipv4_address, | ||
159 | sizeof(ip_needle)) < 0) | ||
160 | p = &(*p)->rb_left; | ||
161 | else if (memcmp(&ip_needle, &node->nd_ipv4_address, | ||
162 | sizeof(ip_needle)) > 0) | ||
163 | p = &(*p)->rb_right; | ||
164 | else { | ||
165 | ret = node; | ||
166 | break; | ||
167 | } | ||
168 | } | ||
169 | |||
170 | if (ret_p != NULL) | ||
171 | *ret_p = p; | ||
172 | if (ret_parent != NULL) | ||
173 | *ret_parent = parent; | ||
174 | |||
175 | return ret; | ||
176 | } | ||
177 | |||
178 | struct o2nm_node *o2nm_get_node_by_ip(__be32 addr) | ||
179 | { | ||
180 | struct o2nm_node *node = NULL; | ||
181 | struct o2nm_cluster *cluster = o2nm_single_cluster; | ||
182 | |||
183 | if (cluster == NULL) | ||
184 | goto out; | ||
185 | |||
186 | read_lock(&cluster->cl_nodes_lock); | ||
187 | node = o2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL); | ||
188 | if (node) | ||
189 | config_item_get(&node->nd_item); | ||
190 | read_unlock(&cluster->cl_nodes_lock); | ||
191 | |||
192 | out: | ||
193 | return node; | ||
194 | } | ||
195 | EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip); | ||
196 | |||
197 | void o2nm_node_put(struct o2nm_node *node) | ||
198 | { | ||
199 | config_item_put(&node->nd_item); | ||
200 | } | ||
201 | EXPORT_SYMBOL_GPL(o2nm_node_put); | ||
202 | |||
203 | void o2nm_node_get(struct o2nm_node *node) | ||
204 | { | ||
205 | config_item_get(&node->nd_item); | ||
206 | } | ||
207 | EXPORT_SYMBOL_GPL(o2nm_node_get); | ||
208 | |||
209 | u8 o2nm_this_node(void) | ||
210 | { | ||
211 | u8 node_num = O2NM_MAX_NODES; | ||
212 | |||
213 | if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local) | ||
214 | node_num = o2nm_single_cluster->cl_local_node; | ||
215 | |||
216 | return node_num; | ||
217 | } | ||
218 | EXPORT_SYMBOL_GPL(o2nm_this_node); | ||
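
The lookup functions above return the node with a config_item reference
held, so every successful lookup must be paired with o2nm_node_put(); a
hedged sketch:

	static void demo_peek_node(u8 num)
	{
		struct o2nm_node *node = o2nm_get_node_by_num(num);

		if (!node)
			return;
		/* node is pinned here; safe to read its attributes */
		o2nm_node_put(node);
	}
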
219 | |||
220 | /* node configfs bits */ | ||
221 | |||
222 | static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item) | ||
223 | { | ||
224 | return item ? | ||
225 | container_of(to_config_group(item), struct o2nm_cluster, | ||
226 | cl_group) | ||
227 | : NULL; | ||
228 | } | ||
229 | |||
230 | static struct o2nm_node *to_o2nm_node(struct config_item *item) | ||
231 | { | ||
232 | return item ? container_of(item, struct o2nm_node, nd_item) : NULL; | ||
233 | } | ||
234 | |||
235 | static void o2nm_node_release(struct config_item *item) | ||
236 | { | ||
237 | struct o2nm_node *node = to_o2nm_node(item); | ||
238 | kfree(node); | ||
239 | } | ||
240 | |||
241 | static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page) | ||
242 | { | ||
243 | return sprintf(page, "%d\n", node->nd_num); | ||
244 | } | ||
245 | |||
246 | static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node) | ||
247 | { | ||
248 | /* through the first node_set .parent | ||
249 | * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */ | ||
250 | return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); | ||
251 | } | ||
252 | |||
253 | enum { | ||
254 | O2NM_NODE_ATTR_NUM = 0, | ||
255 | O2NM_NODE_ATTR_PORT, | ||
256 | O2NM_NODE_ATTR_ADDRESS, | ||
257 | O2NM_NODE_ATTR_LOCAL, | ||
258 | }; | ||
259 | |||
260 | static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page, | ||
261 | size_t count) | ||
262 | { | ||
263 | struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); | ||
264 | unsigned long tmp; | ||
265 | char *p = (char *)page; | ||
266 | |||
267 | tmp = simple_strtoul(p, &p, 0); | ||
268 | if (!p || (*p && (*p != '\n'))) | ||
269 | return -EINVAL; | ||
270 | |||
271 | if (tmp >= O2NM_MAX_NODES) | ||
272 | return -ERANGE; | ||
273 | |||
274 | /* once we're in the cl_nodes tree networking can look us up by | ||
275 | * node number and try to use our address and port attributes | ||
276 | * to connect to this node.. so make sure that they've been set | ||
277 | * before the node number is written. */ | ||
278 | if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || | ||
279 | !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) | ||
280 | return -EINVAL; /* XXX */ | ||
281 | |||
282 | write_lock(&cluster->cl_nodes_lock); | ||
283 | if (cluster->cl_nodes[tmp]) | ||
284 | p = NULL; | ||
285 | else { | ||
286 | cluster->cl_nodes[tmp] = node; | ||
287 | node->nd_num = tmp; | ||
288 | set_bit(tmp, cluster->cl_nodes_bitmap); | ||
289 | } | ||
290 | write_unlock(&cluster->cl_nodes_lock); | ||
291 | if (p == NULL) | ||
292 | return -EEXIST; | ||
293 | |||
294 | return count; | ||
295 | } | ||
296 | static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page) | ||
297 | { | ||
298 | return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port)); | ||
299 | } | ||
300 | |||
301 | static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node, | ||
302 | const char *page, size_t count) | ||
303 | { | ||
304 | unsigned long tmp; | ||
305 | char *p = (char *)page; | ||
306 | |||
307 | tmp = simple_strtoul(p, &p, 0); | ||
308 | if (!p || (*p && (*p != '\n'))) | ||
309 | return -EINVAL; | ||
310 | |||
311 | if (tmp == 0) | ||
312 | return -EINVAL; | ||
313 | if (tmp >= (u16)-1) | ||
314 | return -ERANGE; | ||
315 | |||
316 | node->nd_ipv4_port = htons(tmp); | ||
317 | |||
318 | return count; | ||
319 | } | ||
320 | |||
321 | static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page) | ||
322 | { | ||
323 | return sprintf(page, "%u.%u.%u.%u\n", NIPQUAD(node->nd_ipv4_address)); | ||
324 | } | ||
325 | |||
326 | static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, | ||
327 | const char *page, | ||
328 | size_t count) | ||
329 | { | ||
330 | struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); | ||
331 | int ret, i; | ||
332 | struct rb_node **p, *parent; | ||
333 | unsigned int octets[4]; | ||
334 | __be32 ipv4_addr = 0; | ||
335 | |||
336 | ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2], | ||
337 | &octets[1], &octets[0]); | ||
338 | if (ret != 4) | ||
339 | return -EINVAL; | ||
340 | |||
341 | for (i = 0; i < ARRAY_SIZE(octets); i++) { | ||
342 | if (octets[i] > 255) | ||
343 | return -ERANGE; | ||
344 | be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); | ||
345 | } | ||
346 | |||
347 | ret = 0; | ||
348 | write_lock(&cluster->cl_nodes_lock); | ||
349 | if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) | ||
350 | ret = -EEXIST; | ||
351 | else { | ||
352 | rb_link_node(&node->nd_ip_node, parent, p); | ||
353 | rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); | ||
354 | } | ||
355 | write_unlock(&cluster->cl_nodes_lock); | ||
356 | if (ret) | ||
357 | return ret; | ||
358 | |||
359 | memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr)); | ||
360 | |||
361 | return count; | ||
362 | } | ||
363 | |||
364 | static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page) | ||
365 | { | ||
366 | return sprintf(page, "%d\n", node->nd_local); | ||
367 | } | ||
368 | |||
369 | static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page, | ||
370 | size_t count) | ||
371 | { | ||
372 | struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); | ||
373 | unsigned long tmp; | ||
374 | char *p = (char *)page; | ||
375 | ssize_t ret; | ||
376 | |||
377 | tmp = simple_strtoul(p, &p, 0); | ||
378 | if (!p || (*p && (*p != '\n'))) | ||
379 | return -EINVAL; | ||
380 | |||
381 | tmp = !!tmp; /* boolean of whether this node wants to be local */ | ||
382 | |||
383 | /* setting local turns on networking rx for now so we require having | ||
384 | * set everything else first */ | ||
385 | if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || | ||
386 | !test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) || | ||
387 | !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) | ||
388 | return -EINVAL; /* XXX */ | ||
389 | |||
390 | /* the only failure case is trying to set a new local node | ||
391 | * when a different one is already set */ | ||
392 | if (tmp && tmp == cluster->cl_has_local && | ||
393 | cluster->cl_local_node != node->nd_num) | ||
394 | return -EBUSY; | ||
395 | |||
396 | /* bring up the rx thread if we're setting the new local node. */ | ||
397 | if (tmp && !cluster->cl_has_local) { | ||
398 | ret = o2net_start_listening(node); | ||
399 | if (ret) | ||
400 | return ret; | ||
401 | } | ||
402 | |||
403 | if (!tmp && cluster->cl_has_local && | ||
404 | cluster->cl_local_node == node->nd_num) { | ||
405 | o2net_stop_listening(node); | ||
406 | cluster->cl_local_node = O2NM_INVALID_NODE_NUM; | ||
407 | } | ||
408 | |||
409 | node->nd_local = tmp; | ||
410 | if (node->nd_local) { | ||
411 | cluster->cl_has_local = tmp; | ||
412 | cluster->cl_local_node = node->nd_num; | ||
413 | } | ||
414 | |||
415 | return count; | ||
416 | } | ||
417 | |||
418 | struct o2nm_node_attribute { | ||
419 | struct configfs_attribute attr; | ||
420 | ssize_t (*show)(struct o2nm_node *, char *); | ||
421 | ssize_t (*store)(struct o2nm_node *, const char *, size_t); | ||
422 | }; | ||
423 | |||
424 | static struct o2nm_node_attribute o2nm_node_attr_num = { | ||
425 | .attr = { .ca_owner = THIS_MODULE, | ||
426 | .ca_name = "num", | ||
427 | .ca_mode = S_IRUGO | S_IWUSR }, | ||
428 | .show = o2nm_node_num_read, | ||
429 | .store = o2nm_node_num_write, | ||
430 | }; | ||
431 | |||
432 | static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = { | ||
433 | .attr = { .ca_owner = THIS_MODULE, | ||
434 | .ca_name = "ipv4_port", | ||
435 | .ca_mode = S_IRUGO | S_IWUSR }, | ||
436 | .show = o2nm_node_ipv4_port_read, | ||
437 | .store = o2nm_node_ipv4_port_write, | ||
438 | }; | ||
439 | |||
440 | static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = { | ||
441 | .attr = { .ca_owner = THIS_MODULE, | ||
442 | .ca_name = "ipv4_address", | ||
443 | .ca_mode = S_IRUGO | S_IWUSR }, | ||
444 | .show = o2nm_node_ipv4_address_read, | ||
445 | .store = o2nm_node_ipv4_address_write, | ||
446 | }; | ||
447 | |||
448 | static struct o2nm_node_attribute o2nm_node_attr_local = { | ||
449 | .attr = { .ca_owner = THIS_MODULE, | ||
450 | .ca_name = "local", | ||
451 | .ca_mode = S_IRUGO | S_IWUSR }, | ||
452 | .show = o2nm_node_local_read, | ||
453 | .store = o2nm_node_local_write, | ||
454 | }; | ||
455 | |||
456 | static struct configfs_attribute *o2nm_node_attrs[] = { | ||
457 | [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr, | ||
458 | [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr, | ||
459 | [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr, | ||
460 | [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr, | ||
461 | NULL, | ||
462 | }; | ||
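
From userspace, all of the above is driven through configfs with plain
mkdir(2) and write(2). A hedged sketch of the sequence; the
/sys/kernel/config mount point is an assumption, "cluster" and "node"
are the subsystem and group names registered later in this file, and the
write ordering follows the checks in the store handlers above:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/stat.h>
	#include <unistd.h>

	static void put_attr(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);

		if (fd >= 0) {
			write(fd, val, strlen(val)); /* one write per attribute */
			close(fd);
		}
	}

	int main(void)
	{
		const char *node = "/sys/kernel/config/cluster/demo/node/node0";
		char p[256];

		mkdir("/sys/kernel/config/cluster/demo", 0755);
		mkdir(node, 0755);

		/* address and port must precede num; all three precede local */
		snprintf(p, sizeof(p), "%s/ipv4_address", node);
		put_attr(p, "192.168.0.1");
		snprintf(p, sizeof(p), "%s/ipv4_port", node);
		put_attr(p, "7777");
		snprintf(p, sizeof(p), "%s/num", node);
		put_attr(p, "0");
		snprintf(p, sizeof(p), "%s/local", node);
		put_attr(p, "1");
		return 0;
	}
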
463 | |||
464 | static int o2nm_attr_index(struct configfs_attribute *attr) | ||
465 | { | ||
466 | int i; | ||
467 | for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) { | ||
468 | if (attr == o2nm_node_attrs[i]) | ||
469 | return i; | ||
470 | } | ||
471 | BUG(); | ||
472 | return 0; | ||
473 | } | ||
474 | |||
475 | static ssize_t o2nm_node_show(struct config_item *item, | ||
476 | struct configfs_attribute *attr, | ||
477 | char *page) | ||
478 | { | ||
479 | struct o2nm_node *node = to_o2nm_node(item); | ||
480 | struct o2nm_node_attribute *o2nm_node_attr = | ||
481 | container_of(attr, struct o2nm_node_attribute, attr); | ||
482 | ssize_t ret = 0; | ||
483 | |||
484 | if (o2nm_node_attr->show) | ||
485 | ret = o2nm_node_attr->show(node, page); | ||
486 | return ret; | ||
487 | } | ||
488 | |||
489 | static ssize_t o2nm_node_store(struct config_item *item, | ||
490 | struct configfs_attribute *attr, | ||
491 | const char *page, size_t count) | ||
492 | { | ||
493 | struct o2nm_node *node = to_o2nm_node(item); | ||
494 | struct o2nm_node_attribute *o2nm_node_attr = | ||
495 | container_of(attr, struct o2nm_node_attribute, attr); | ||
496 | ssize_t ret; | ||
497 | int attr_index = o2nm_attr_index(attr); | ||
498 | |||
499 | if (o2nm_node_attr->store == NULL) { | ||
500 | ret = -EINVAL; | ||
501 | goto out; | ||
502 | } | ||
503 | |||
504 | if (test_bit(attr_index, &node->nd_set_attributes)) | ||
505 | return -EBUSY; | ||
506 | |||
507 | ret = o2nm_node_attr->store(node, page, count); | ||
508 | if (ret < count) | ||
509 | goto out; | ||
510 | |||
511 | set_bit(attr_index, &node->nd_set_attributes); | ||
512 | out: | ||
513 | return ret; | ||
514 | } | ||
515 | |||
516 | static struct configfs_item_operations o2nm_node_item_ops = { | ||
517 | .release = o2nm_node_release, | ||
518 | .show_attribute = o2nm_node_show, | ||
519 | .store_attribute = o2nm_node_store, | ||
520 | }; | ||
521 | |||
522 | static struct config_item_type o2nm_node_type = { | ||
523 | .ct_item_ops = &o2nm_node_item_ops, | ||
524 | .ct_attrs = o2nm_node_attrs, | ||
525 | .ct_owner = THIS_MODULE, | ||
526 | }; | ||
527 | |||
528 | /* node set */ | ||
529 | |||
530 | struct o2nm_node_group { | ||
531 | struct config_group ns_group; | ||
532 | /* some stuff? */ | ||
533 | }; | ||
534 | |||
535 | #if 0 | ||
536 | static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group) | ||
537 | { | ||
538 | return group ? | ||
539 | container_of(group, struct o2nm_node_group, ns_group) | ||
540 | : NULL; | ||
541 | } | ||
542 | #endif | ||
543 | |||
544 | static struct config_item *o2nm_node_group_make_item(struct config_group *group, | ||
545 | const char *name) | ||
546 | { | ||
547 | struct o2nm_node *node = NULL; | ||
548 | struct config_item *ret = NULL; | ||
549 | |||
550 | if (strlen(name) > O2NM_MAX_NAME_LEN) | ||
551 | goto out; /* ENAMETOOLONG */ | ||
552 | |||
553 | node = kcalloc(1, sizeof(struct o2nm_node), GFP_KERNEL); | ||
554 | if (node == NULL) | ||
555 | goto out; /* ENOMEM */ | ||
556 | |||
557 | strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */ | ||
558 | config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); | ||
559 | spin_lock_init(&node->nd_lock); | ||
560 | |||
561 | ret = &node->nd_item; | ||
562 | |||
563 | out: | ||
564 | if (ret == NULL) | ||
565 | kfree(node); | ||
566 | |||
567 | return ret; | ||
568 | } | ||
569 | |||
570 | static void o2nm_node_group_drop_item(struct config_group *group, | ||
571 | struct config_item *item) | ||
572 | { | ||
573 | struct o2nm_node *node = to_o2nm_node(item); | ||
574 | struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); | ||
575 | |||
576 | o2net_disconnect_node(node); | ||
577 | |||
578 | if (cluster->cl_has_local && | ||
579 | (cluster->cl_local_node == node->nd_num)) { | ||
580 | cluster->cl_has_local = 0; | ||
581 | cluster->cl_local_node = O2NM_INVALID_NODE_NUM; | ||
582 | o2net_stop_listening(node); | ||
583 | } | ||
584 | |||
585 | /* XXX call into net to stop this node from trading messages */ | ||
586 | |||
587 | write_lock(&cluster->cl_nodes_lock); | ||
588 | |||
589 | /* XXX sloppy */ | ||
590 | if (node->nd_ipv4_address) | ||
591 | rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree); | ||
592 | |||
593 | /* nd_num might be 0 if the node number hasn't been set.. */ | ||
594 | if (cluster->cl_nodes[node->nd_num] == node) { | ||
595 | cluster->cl_nodes[node->nd_num] = NULL; | ||
596 | clear_bit(node->nd_num, cluster->cl_nodes_bitmap); | ||
597 | } | ||
598 | write_unlock(&cluster->cl_nodes_lock); | ||
599 | |||
600 | config_item_put(item); | ||
601 | } | ||
602 | |||
603 | static struct configfs_group_operations o2nm_node_group_group_ops = { | ||
604 | .make_item = o2nm_node_group_make_item, | ||
605 | .drop_item = o2nm_node_group_drop_item, | ||
606 | }; | ||
607 | |||
608 | static struct config_item_type o2nm_node_group_type = { | ||
609 | .ct_group_ops = &o2nm_node_group_group_ops, | ||
610 | .ct_owner = THIS_MODULE, | ||
611 | }; | ||
612 | |||
613 | /* cluster */ | ||
614 | |||
615 | static void o2nm_cluster_release(struct config_item *item) | ||
616 | { | ||
617 | struct o2nm_cluster *cluster = to_o2nm_cluster(item); | ||
618 | |||
619 | kfree(cluster->cl_group.default_groups); | ||
620 | kfree(cluster); | ||
621 | } | ||
622 | |||
623 | static struct configfs_item_operations o2nm_cluster_item_ops = { | ||
624 | .release = o2nm_cluster_release, | ||
625 | }; | ||
626 | |||
627 | static struct config_item_type o2nm_cluster_type = { | ||
628 | .ct_item_ops = &o2nm_cluster_item_ops, | ||
629 | .ct_owner = THIS_MODULE, | ||
630 | }; | ||
631 | |||
632 | /* cluster set */ | ||
633 | |||
634 | struct o2nm_cluster_group { | ||
635 | struct configfs_subsystem cs_subsys; | ||
636 | /* some stuff? */ | ||
637 | }; | ||
638 | |||
639 | #if 0 | ||
640 | static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group) | ||
641 | { | ||
642 | return group ? | ||
643 | container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys) | ||
644 | : NULL; | ||
645 | } | ||
646 | #endif | ||
647 | |||
648 | static struct config_group *o2nm_cluster_group_make_group(struct config_group *group, | ||
649 | const char *name) | ||
650 | { | ||
651 | struct o2nm_cluster *cluster = NULL; | ||
652 | struct o2nm_node_group *ns = NULL; | ||
653 | struct config_group *o2hb_group = NULL, *ret = NULL; | ||
654 | void *defs = NULL; | ||
655 | |||
656 | /* this runs under the parent dir's i_sem; there can be only | ||
657 | * one caller in here at a time */ | ||
658 | if (o2nm_single_cluster) | ||
659 | goto out; /* ENOSPC */ | ||
660 | |||
661 | cluster = kcalloc(1, sizeof(struct o2nm_cluster), GFP_KERNEL); | ||
662 | ns = kcalloc(1, sizeof(struct o2nm_node_group), GFP_KERNEL); | ||
663 | defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); | ||
664 | o2hb_group = o2hb_alloc_hb_set(); | ||
665 | if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) | ||
666 | goto out; | ||
667 | |||
668 | config_group_init_type_name(&cluster->cl_group, name, | ||
669 | &o2nm_cluster_type); | ||
670 | config_group_init_type_name(&ns->ns_group, "node", | ||
671 | &o2nm_node_group_type); | ||
672 | |||
673 | cluster->cl_group.default_groups = defs; | ||
674 | cluster->cl_group.default_groups[0] = &ns->ns_group; | ||
675 | cluster->cl_group.default_groups[1] = o2hb_group; | ||
676 | cluster->cl_group.default_groups[2] = NULL; | ||
677 | rwlock_init(&cluster->cl_nodes_lock); | ||
678 | cluster->cl_node_ip_tree = RB_ROOT; | ||
679 | |||
680 | ret = &cluster->cl_group; | ||
681 | o2nm_single_cluster = cluster; | ||
682 | |||
683 | out: | ||
684 | if (ret == NULL) { | ||
685 | kfree(cluster); | ||
686 | kfree(ns); | ||
687 | o2hb_free_hb_set(o2hb_group); | ||
688 | kfree(defs); | ||
689 | } | ||
690 | |||
691 | return ret; | ||
692 | } | ||
693 | |||
694 | static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item) | ||
695 | { | ||
696 | struct o2nm_cluster *cluster = to_o2nm_cluster(item); | ||
697 | int i; | ||
698 | struct config_item *killme; | ||
699 | |||
700 | BUG_ON(o2nm_single_cluster != cluster); | ||
701 | o2nm_single_cluster = NULL; | ||
702 | |||
703 | for (i = 0; cluster->cl_group.default_groups[i]; i++) { | ||
704 | killme = &cluster->cl_group.default_groups[i]->cg_item; | ||
705 | cluster->cl_group.default_groups[i] = NULL; | ||
706 | config_item_put(killme); | ||
707 | } | ||
708 | |||
709 | config_item_put(item); | ||
710 | } | ||
711 | |||
712 | static struct configfs_group_operations o2nm_cluster_group_group_ops = { | ||
713 | .make_group = o2nm_cluster_group_make_group, | ||
714 | .drop_item = o2nm_cluster_group_drop_item, | ||
715 | }; | ||
716 | |||
717 | static struct config_item_type o2nm_cluster_group_type = { | ||
718 | .ct_group_ops = &o2nm_cluster_group_group_ops, | ||
719 | .ct_owner = THIS_MODULE, | ||
720 | }; | ||
721 | |||
722 | static struct o2nm_cluster_group o2nm_cluster_group = { | ||
723 | .cs_subsys = { | ||
724 | .su_group = { | ||
725 | .cg_item = { | ||
726 | .ci_namebuf = "cluster", | ||
727 | .ci_type = &o2nm_cluster_group_type, | ||
728 | }, | ||
729 | }, | ||
730 | }, | ||
731 | }; | ||
732 | |||
733 | static void __exit exit_o2nm(void) | ||
734 | { | ||
735 | if (ocfs2_table_header) | ||
736 | unregister_sysctl_table(ocfs2_table_header); | ||
737 | |||
738 | /* XXX sync with hb callbacks and shut down hb? */ | ||
739 | o2net_unregister_hb_callbacks(); | ||
740 | configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); | ||
741 | o2cb_sys_shutdown(); | ||
742 | |||
743 | o2net_exit(); | ||
744 | } | ||
745 | |||
746 | static int __init init_o2nm(void) | ||
747 | { | ||
748 | int ret = -1; | ||
749 | |||
750 | cluster_print_version(); | ||
751 | |||
752 | o2hb_init(); | ||
753 | o2net_init(); | ||
754 | |||
755 | ocfs2_table_header = register_sysctl_table(ocfs2_root_table, 0); | ||
756 | if (!ocfs2_table_header) { | ||
757 | printk(KERN_ERR "nodemanager: unable to register sysctl\n"); | ||
758 | ret = -ENOMEM; /* or something. */ | ||
759 | goto out; | ||
760 | } | ||
761 | |||
762 | ret = o2net_register_hb_callbacks(); | ||
763 | if (ret) | ||
764 | goto out_sysctl; | ||
765 | |||
766 | config_group_init(&o2nm_cluster_group.cs_subsys.su_group); | ||
767 | init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem); | ||
768 | ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys); | ||
769 | if (ret) { | ||
770 | printk(KERN_ERR "nodemanager: Registration returned %d\n", ret); | ||
771 | goto out_callbacks; | ||
772 | } | ||
773 | |||
774 | ret = o2cb_sys_init(); | ||
775 | if (!ret) | ||
776 | goto out; | ||
777 | |||
778 | configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); | ||
779 | out_callbacks: | ||
780 | o2net_unregister_hb_callbacks(); | ||
781 | out_sysctl: | ||
782 | unregister_sysctl_table(ocfs2_table_header); | ||
783 | out: | ||
784 | return ret; | ||
785 | } | ||
786 | |||
787 | MODULE_AUTHOR("Oracle"); | ||
788 | MODULE_LICENSE("GPL"); | ||
789 | |||
790 | module_init(init_o2nm) | ||
791 | module_exit(exit_o2nm) | ||
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h new file mode 100644 index 000000000000..fce8033c310f --- /dev/null +++ b/fs/ocfs2/cluster/nodemanager.h | |||
@@ -0,0 +1,64 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * nodemanager.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | #ifndef O2CLUSTER_NODEMANAGER_H | ||
28 | #define O2CLUSTER_NODEMANAGER_H | ||
29 | |||
30 | #include "ocfs2_nodemanager.h" | ||
31 | |||
32 | /* This totally doesn't belong here. */ | ||
33 | #include <linux/configfs.h> | ||
34 | #include <linux/rbtree.h> | ||
35 | |||
36 | #define KERN_OCFS2 988 | ||
37 | #define KERN_OCFS2_NM 1 | ||
38 | |||
39 | const char *o2nm_get_hb_ctl_path(void); | ||
40 | |||
41 | struct o2nm_node { | ||
42 | spinlock_t nd_lock; | ||
43 | struct config_item nd_item; | ||
44 | char nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */ | ||
45 | __u8 nd_num; | ||
46 | /* only one address per node, as attributes, for now. */ | ||
47 | __be32 nd_ipv4_address; | ||
48 | __be16 nd_ipv4_port; | ||
49 | struct rb_node nd_ip_node; | ||
50 | /* there can be only one local node for now */ | ||
51 | int nd_local; | ||
52 | |||
53 | unsigned long nd_set_attributes; | ||
54 | }; | ||
55 | |||
56 | u8 o2nm_this_node(void); | ||
57 | |||
58 | int o2nm_configured_node_map(unsigned long *map, unsigned bytes); | ||
59 | struct o2nm_node *o2nm_get_node_by_num(u8 node_num); | ||
60 | struct o2nm_node *o2nm_get_node_by_ip(__be32 addr); | ||
61 | void o2nm_node_get(struct o2nm_node *node); | ||
62 | void o2nm_node_put(struct o2nm_node *node); | ||
63 | |||
64 | #endif /* O2CLUSTER_NODEMANAGER_H */ | ||
diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h new file mode 100644 index 000000000000..94096069cb43 --- /dev/null +++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h | |||
@@ -0,0 +1,37 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ocfs2_heartbeat.h | ||
5 | * | ||
6 | * On-disk structures for ocfs2_heartbeat | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef _OCFS2_HEARTBEAT_H | ||
27 | #define _OCFS2_HEARTBEAT_H | ||
28 | |||
29 | struct o2hb_disk_heartbeat_block { | ||
30 | __le64 hb_seq; | ||
31 | __u8 hb_node; | ||
32 | __u8 hb_pad1[3]; | ||
33 | __le32 hb_cksum; | ||
34 | __le64 hb_generation; | ||
35 | }; | ||
36 | |||
37 | #endif /* _OCFS2_HEARTBEAT_H */ | ||
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h new file mode 100644 index 000000000000..5b9854bad571 --- /dev/null +++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h | |||
@@ -0,0 +1,39 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ocfs2_nodemanager.h | ||
5 | * | ||
6 | * Header describing the interface between userspace and the kernel | ||
7 | * for the ocfs2_nodemanager module. | ||
8 | * | ||
9 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public | ||
13 | * License as published by the Free Software Foundation; either | ||
14 | * version 2 of the License, or (at your option) any later version. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
19 | * General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public | ||
22 | * License along with this program; if not, write to the | ||
23 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
24 | * Boston, MA 021110-1307, USA. | ||
25 | * | ||
26 | */ | ||
27 | |||
28 | #ifndef _OCFS2_NODEMANAGER_H | ||
29 | #define _OCFS2_NODEMANAGER_H | ||
30 | |||
31 | #define O2NM_API_VERSION 5 | ||
32 | |||
33 | #define O2NM_MAX_NODES 255 | ||
34 | #define O2NM_INVALID_NODE_NUM 255 | ||
35 | |||
36 | /* host name, group name, cluster name all 64 bytes */ | ||
37 | #define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN | ||
38 | |||
39 | #endif /* _OCFS2_NODEMANAGER_H */ | ||
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c new file mode 100644 index 000000000000..7bba98fbfc15 --- /dev/null +++ b/fs/ocfs2/cluster/quorum.c | |||
@@ -0,0 +1,315 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * | ||
3 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
4 | * | ||
5 | * Copyright (C) 2005 Oracle. All rights reserved. | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public | ||
9 | * License as published by the Free Software Foundation; either | ||
10 | * version 2 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
15 | * General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public | ||
18 | * License along with this program; if not, write to the | ||
19 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
20 | * Boston, MA 021110-1307, USA. | ||
21 | */ | ||
22 | |||
23 | /* This quorum hack is only here until we transition to some more rational | ||
24 | * approach that is driven from userspace. Honest. No foolin'. | ||
25 | * | ||
26 | * Imagine two nodes lose network connectivity to each other but they're still | ||
27 | * up and operating in every other way. Presumably a network timeout indicates | ||
28 | * that a node is broken and should be recovered. They can't both recover each | ||
29 | * other and both carry on without serialising their access to the file system. | ||
30 | * They need to decide who is authoritative. Now extend that problem to | ||
31 | * arbitrary groups of nodes losing connectivity between each other. | ||
32 | * | ||
33 | * So we declare that a node which has given up on connecting to a majority | ||
34 | * of nodes who are still heartbeating will fence itself. | ||
35 | * | ||
36 | * There are huge opportunities for races here. After we give up on a node's | ||
37 | * connection we need to wait long enough to give heartbeat an opportunity | ||
38 | * to declare the node as truly dead. We also need to be careful with the | ||
39 | * race between when we see a node start heartbeating and when we connect | ||
40 | * to it. | ||
41 | * | ||
42 | * So nodes that are in this transition put a hold on the quorum decision | ||
43 | * with a counter. As they fall out of this transition they drop the count | ||
44 | * and if they're the last, they fire off the decision. | ||
45 | */ | ||
46 | #include <linux/kernel.h> | ||
47 | #include <linux/slab.h> | ||
48 | #include <linux/workqueue.h> | ||
49 | |||
50 | #include "heartbeat.h" | ||
51 | #include "nodemanager.h" | ||
52 | #define MLOG_MASK_PREFIX ML_QUORUM | ||
53 | #include "masklog.h" | ||
54 | #include "quorum.h" | ||
55 | |||
56 | static struct o2quo_state { | ||
57 | spinlock_t qs_lock; | ||
58 | struct work_struct qs_work; | ||
59 | int qs_pending; | ||
60 | int qs_heartbeating; | ||
61 | unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
62 | int qs_connected; | ||
63 | unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
64 | int qs_holds; | ||
65 | unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
66 | } o2quo_state; | ||
67 | |||
68 | /* this is horribly heavy-handed. It should instead flip the file | ||
69 | * system RO and call some userspace script. */ | ||
70 | static void o2quo_fence_self(void) | ||
71 | { | ||
72 | /* panic spins with interrupts enabled. with preempt | ||
73 | * threads can still schedule, etc, etc */ | ||
74 | o2hb_stop_all_regions(); | ||
75 | panic("ocfs2 is very sorry to be fencing this system by panicking\n"); | ||
76 | } | ||
77 | |||
78 | /* Indicate that a timeout occurred on a heartbeat region write. The | ||
79 | * other nodes in the cluster may consider us dead at that time so we | ||
80 | * want to "fence" ourselves so that we don't scribble on the disk | ||
81 | * after they think they've recovered us. This can't solve all | ||
82 | * problems related to writeout after recovery but this hack can at | ||
83 | * least close some of those gaps. When we have real fencing, this can | ||
84 | * go away as our node would be fenced externally before other nodes | ||
85 | * begin recovery. */ | ||
86 | void o2quo_disk_timeout(void) | ||
87 | { | ||
88 | o2quo_fence_self(); | ||
89 | } | ||
90 | |||
91 | static void o2quo_make_decision(void *arg) | ||
92 | { | ||
93 | int quorum; | ||
94 | int lowest_hb, lowest_reachable = 0, fence = 0; | ||
95 | struct o2quo_state *qs = &o2quo_state; | ||
96 | |||
97 | spin_lock(&qs->qs_lock); | ||
98 | |||
99 | lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES); | ||
100 | if (lowest_hb != O2NM_MAX_NODES) | ||
101 | lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm); | ||
102 | |||
103 | mlog(0, "heartbeating: %d, connected: %d, " | ||
104 | "lowest: %d (%sreachable)\n", qs->qs_heartbeating, | ||
105 | qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un"); | ||
106 | |||
107 | if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) || | ||
108 | qs->qs_heartbeating == 1) | ||
109 | goto out; | ||
110 | |||
111 | if (qs->qs_heartbeating & 1) { | ||
112 | /* the odd-numbered cluster case is straightforward -- | ||
113 | * if we can't talk to the majority we're hosed */ | ||
114 | quorum = (qs->qs_heartbeating + 1)/2; | ||
115 | if (qs->qs_connected < quorum) { | ||
116 | mlog(ML_ERROR, "fencing this node because it is " | ||
117 | "only connected to %u nodes and %u is needed " | ||
118 | "to make a quorum out of %u heartbeating nodes\n", | ||
119 | qs->qs_connected, quorum, | ||
120 | qs->qs_heartbeating); | ||
121 | fence = 1; | ||
122 | } | ||
123 | } else { | ||
124 | /* the even-numbered cluster adds the possibility of each half | ||
125 | * of the cluster being able to talk amongst themselves.. in | ||
126 | * that case we're hosed if we can't talk to the group that has | ||
127 | * the lowest numbered node */ | ||
128 | quorum = qs->qs_heartbeating / 2; | ||
129 | if (qs->qs_connected < quorum) { | ||
130 | mlog(ML_ERROR, "fencing this node because it is " | ||
131 | "only connected to %u nodes and %u is needed " | ||
132 | "to make a quorum out of %u heartbeating nodes\n", | ||
133 | qs->qs_connected, quorum, | ||
134 | qs->qs_heartbeating); | ||
135 | fence = 1; | ||
136 | } | ||
137 | else if ((qs->qs_connected == quorum) && | ||
138 | !lowest_reachable) { | ||
139 | mlog(ML_ERROR, "fencing this node because it is " | ||
140 | "connected to a half-quorum of %u out of %u " | ||
141 | "nodes which doesn't include the lowest active " | ||
142 | "node %u\n", quorum, qs->qs_heartbeating, | ||
143 | lowest_hb); | ||
144 | fence = 1; | ||
145 | } | ||
146 | } | ||
147 | |||
148 | out: | ||
149 | spin_unlock(&qs->qs_lock); | ||
150 | if (fence) | ||
151 | o2quo_fence_self(); | ||
152 | } | ||
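
The decision rule above condenses to a small predicate. A hedged
restatement that mirrors the branches (not a replacement for them): an
odd-sized cluster needs a strict majority, an even-sized cluster needs
exactly half, and on an exact half-split only the half that can reach
the lowest-numbered heartbeating node survives:

	static int o2quo_should_fence_sketch(int heartbeating, int connected,
					     int lowest_reachable)
	{
		if (heartbeating & 1)
			return connected < (heartbeating + 1) / 2;
		if (connected < heartbeating / 2)
			return 1;
		return connected == heartbeating / 2 && !lowest_reachable;
	}
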
153 | |||
154 | static void o2quo_set_hold(struct o2quo_state *qs, u8 node) | ||
155 | { | ||
156 | assert_spin_locked(&qs->qs_lock); | ||
157 | |||
158 | if (!test_and_set_bit(node, qs->qs_hold_bm)) { | ||
159 | qs->qs_holds++; | ||
160 | mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES, | ||
161 | "node %u\n", node); | ||
162 | mlog(0, "node %u, %d total\n", node, qs->qs_holds); | ||
163 | } | ||
164 | } | ||
165 | |||
166 | static void o2quo_clear_hold(struct o2quo_state *qs, u8 node) | ||
167 | { | ||
168 | assert_spin_locked(&qs->qs_lock); | ||
169 | |||
170 | if (test_and_clear_bit(node, qs->qs_hold_bm)) { | ||
171 | mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1); | ||
172 | if (--qs->qs_holds == 0) { | ||
173 | if (qs->qs_pending) { | ||
174 | qs->qs_pending = 0; | ||
175 | schedule_work(&qs->qs_work); | ||
176 | } | ||
177 | } | ||
178 | mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n", | ||
179 | node, qs->qs_holds); | ||
180 | } | ||
181 | } | ||
182 | |||
183 | /* as a node comes up we delay the quorum decision until we know the fate of | ||
184 | * the connection. the hold will be dropped in conn_up or hb_down. it might be | ||
185 | * perpetuated by conn_err until hb_down. if we already have a conn, we might | ||
186 | * be dropping a hold that conn_up got. */ | ||
187 | void o2quo_hb_up(u8 node) | ||
188 | { | ||
189 | struct o2quo_state *qs = &o2quo_state; | ||
190 | |||
191 | spin_lock(&qs->qs_lock); | ||
192 | |||
193 | qs->qs_heartbeating++; | ||
194 | mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES, | ||
195 | "node %u\n", node); | ||
196 | mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node); | ||
197 | set_bit(node, qs->qs_hb_bm); | ||
198 | |||
199 | mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating); | ||
200 | |||
201 | if (!test_bit(node, qs->qs_conn_bm)) | ||
202 | o2quo_set_hold(qs, node); | ||
203 | else | ||
204 | o2quo_clear_hold(qs, node); | ||
205 | |||
206 | spin_unlock(&qs->qs_lock); | ||
207 | } | ||
208 | |||
209 | /* hb going down releases any holds we might have had due to this node from | ||
210 | * conn_up, conn_err, or hb_up */ | ||
211 | void o2quo_hb_down(u8 node) | ||
212 | { | ||
213 | struct o2quo_state *qs = &o2quo_state; | ||
214 | |||
215 | spin_lock(&qs->qs_lock); | ||
216 | |||
217 | qs->qs_heartbeating--; | ||
218 | mlog_bug_on_msg(qs->qs_heartbeating < 0, | ||
219 | "node %u, %d heartbeating\n", | ||
220 | node, qs->qs_heartbeating); | ||
221 | mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node); | ||
222 | clear_bit(node, qs->qs_hb_bm); | ||
223 | |||
224 | mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating); | ||
225 | |||
226 | o2quo_clear_hold(qs, node); | ||
227 | |||
228 | spin_unlock(&qs->qs_lock); | ||
229 | } | ||
230 | |||
231 | /* this tells us that we've decided that the node is still heartbeating | ||
232 | * even though we've lost its connection. it must only be called after conn_err | ||
233 | * and indicates that we must now make a quorum decision in the future, | ||
234 | * though we might be doing so after waiting for holds to drain. Here | ||
235 | * we'll be dropping the hold from conn_err. */ | ||
236 | void o2quo_hb_still_up(u8 node) | ||
237 | { | ||
238 | struct o2quo_state *qs = &o2quo_state; | ||
239 | |||
240 | spin_lock(&qs->qs_lock); | ||
241 | |||
242 | mlog(0, "node %u\n", node); | ||
243 | |||
244 | qs->qs_pending = 1; | ||
245 | o2quo_clear_hold(qs, node); | ||
246 | |||
247 | spin_unlock(&qs->qs_lock); | ||
248 | } | ||
249 | |||
250 | /* This is analogous to hb_up. as a node's connection comes up we delay the | ||
251 | * quorum decision until we see it heartbeating. the hold will be dropped in | ||
252 | * hb_up or hb_down. it might be perpetuated by conn_err until hb_down. if | ||
253 | * it's already heartbeating we might be dropping a hold that conn_up got. | ||
254 | */ | ||
255 | void o2quo_conn_up(u8 node) | ||
256 | { | ||
257 | struct o2quo_state *qs = &o2quo_state; | ||
258 | |||
259 | spin_lock(&qs->qs_lock); | ||
260 | |||
261 | qs->qs_connected++; | ||
262 | mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES, | ||
263 | "node %u\n", node); | ||
264 | mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node); | ||
265 | set_bit(node, qs->qs_conn_bm); | ||
266 | |||
267 | mlog(0, "node %u, %d total\n", node, qs->qs_connected); | ||
268 | |||
269 | if (!test_bit(node, qs->qs_hb_bm)) | ||
270 | o2quo_set_hold(qs, node); | ||
271 | else | ||
272 | o2quo_clear_hold(qs, node); | ||
273 | |||
274 | spin_unlock(&qs->qs_lock); | ||
275 | } | ||
276 | |||
277 | /* we've decided that we won't ever be connecting to the node again. if it's | ||
278 | * still heartbeating we grab a hold that will delay decisions until either the | ||
279 | * node stops heartbeating from hb_down or the caller decides that the node is | ||
280 | * still up and calls still_up */ | ||
281 | void o2quo_conn_err(u8 node) | ||
282 | { | ||
283 | struct o2quo_state *qs = &o2quo_state; | ||
284 | |||
285 | spin_lock(&qs->qs_lock); | ||
286 | |||
287 | if (test_bit(node, qs->qs_conn_bm)) { | ||
288 | qs->qs_connected--; | ||
289 | mlog_bug_on_msg(qs->qs_connected < 0, | ||
290 | "node %u, connected %d\n", | ||
291 | node, qs->qs_connected); | ||
292 | |||
293 | clear_bit(node, qs->qs_conn_bm); | ||
294 | } | ||
295 | |||
296 | mlog(0, "node %u, %d total\n", node, qs->qs_connected); | ||
297 | |||
298 | if (test_bit(node, qs->qs_hb_bm)) | ||
299 | o2quo_set_hold(qs, node); | ||
300 | |||
301 | spin_unlock(&qs->qs_lock); | ||
302 | } | ||
303 | |||
304 | void o2quo_init(void) | ||
305 | { | ||
306 | struct o2quo_state *qs = &o2quo_state; | ||
307 | |||
308 | spin_lock_init(&qs->qs_lock); | ||
309 | INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL); | ||
310 | } | ||
311 | |||
312 | void o2quo_exit(void) | ||
313 | { | ||
314 | flush_scheduled_work(); | ||
315 | } | ||
diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h new file mode 100644 index 000000000000..6649cc6f67c9 --- /dev/null +++ b/fs/ocfs2/cluster/quorum.h | |||
@@ -0,0 +1,36 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * Copyright (C) 2005 Oracle. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 021110-1307, USA. | ||
20 | * | ||
21 | */ | ||
22 | |||
23 | #ifndef O2CLUSTER_QUORUM_H | ||
24 | #define O2CLUSTER_QUORUM_H | ||
25 | |||
26 | void o2quo_init(void); | ||
27 | void o2quo_exit(void); | ||
28 | |||
29 | void o2quo_hb_up(u8 node); | ||
30 | void o2quo_hb_down(u8 node); | ||
31 | void o2quo_hb_still_up(u8 node); | ||
32 | void o2quo_conn_up(u8 node); | ||
33 | void o2quo_conn_err(u8 node); | ||
34 | void o2quo_disk_timeout(void); | ||
35 | |||
36 | #endif /* O2CLUSTER_QUORUM_H */ | ||
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c new file mode 100644 index 000000000000..1d9f6acafa2e --- /dev/null +++ b/fs/ocfs2/cluster/sys.c | |||
@@ -0,0 +1,124 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * sys.c | ||
5 | * | ||
6 | * OCFS2 cluster sysfs interface | ||
7 | * | ||
8 | * Copyright (C) 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation, | ||
13 | * version 2 of the License. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | #include <linux/kernel.h> | ||
28 | #include <linux/module.h> | ||
29 | #include <linux/kobject.h> | ||
30 | #include <linux/sysfs.h> | ||
31 | |||
32 | #include "ocfs2_nodemanager.h" | ||
33 | #include "masklog.h" | ||
34 | #include "sys.h" | ||
35 | |||
36 | struct o2cb_attribute { | ||
37 | struct attribute attr; | ||
38 | ssize_t (*show)(char *buf); | ||
39 | ssize_t (*store)(const char *buf, size_t count); | ||
40 | }; | ||
41 | |||
42 | #define O2CB_ATTR(_name, _mode, _show, _store) \ | ||
43 | struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store) | ||
44 | |||
45 | #define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset) | ||
46 | #define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr) | ||
47 | |||
48 | static ssize_t o2cb_interface_revision_show(char *buf) | ||
49 | { | ||
50 | return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); | ||
51 | } | ||
52 | |||
53 | static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL); | ||
54 | |||
55 | static struct attribute *o2cb_attrs[] = { | ||
56 | &o2cb_attr_interface_revision.attr, | ||
57 | NULL, | ||
58 | }; | ||
59 | |||
60 | static ssize_t | ||
61 | o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer); | ||
62 | static ssize_t | ||
63 | o2cb_store(struct kobject * kobj, struct attribute * attr, | ||
64 | const char * buffer, size_t count); | ||
65 | static struct sysfs_ops o2cb_sysfs_ops = { | ||
66 | .show = o2cb_show, | ||
67 | .store = o2cb_store, | ||
68 | }; | ||
69 | |||
70 | static struct kobj_type o2cb_subsys_type = { | ||
71 | .default_attrs = o2cb_attrs, | ||
72 | .sysfs_ops = &o2cb_sysfs_ops, | ||
73 | }; | ||
74 | |||
75 | /* gives us o2cb_subsys */ | ||
76 | static decl_subsys(o2cb, NULL, NULL); | ||
77 | |||
78 | static ssize_t | ||
79 | o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer) | ||
80 | { | ||
81 | struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr); | ||
82 | struct subsystem *sbs = to_o2cb_subsys(kobj); | ||
83 | |||
84 | BUG_ON(sbs != &o2cb_subsys); | ||
85 | |||
86 | if (o2cb_attr->show) | ||
87 | return o2cb_attr->show(buffer); | ||
88 | return -EIO; | ||
89 | } | ||
90 | |||
91 | static ssize_t | ||
92 | o2cb_store(struct kobject * kobj, struct attribute * attr, | ||
93 | const char * buffer, size_t count) | ||
94 | { | ||
95 | struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr); | ||
96 | struct subsystem *sbs = to_o2cb_subsys(kobj); | ||
97 | |||
98 | BUG_ON(sbs != &o2cb_subsys); | ||
99 | |||
100 | if (o2cb_attr->store) | ||
101 | return o2cb_attr->store(buffer, count); | ||
102 | return -EIO; | ||
103 | } | ||
104 | |||
105 | void o2cb_sys_shutdown(void) | ||
106 | { | ||
107 | mlog_sys_shutdown(); | ||
108 | subsystem_unregister(&o2cb_subsys); | ||
109 | } | ||
110 | |||
111 | int o2cb_sys_init(void) | ||
112 | { | ||
113 | int ret; | ||
114 | |||
115 | o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type; | ||
116 | ret = subsystem_register(&o2cb_subsys); | ||
117 | if (ret) | ||
118 | return ret; | ||
119 | |||
120 | ret = mlog_sys_init(&o2cb_subsys); | ||
121 | if (ret) | ||
122 | subsystem_unregister(&o2cb_subsys); | ||
123 | return ret; | ||
124 | } | ||
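The O2CB_ATTR/o2cb_show plumbing above reduces adding a new attribute to a few lines. A hypothetical sketch of the pattern (the attribute name and its contents are invented; only the shape mirrors interface_revision above):

	/* hypothetical second attribute, not part of the patch */
	static ssize_t o2cb_example_show(char *buf)
	{
		return snprintf(buf, PAGE_SIZE, "%s\n", "example");
	}

	static O2CB_ATTR(example, S_IFREG | S_IRUGO, o2cb_example_show, NULL);

	/* then &o2cb_attr_example.attr would be added to o2cb_attrs[] and the
	 * file would appear next to interface_revision in the o2cb subsystem */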
diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h new file mode 100644 index 000000000000..d66b8ab0045e --- /dev/null +++ b/fs/ocfs2/cluster/sys.h | |||
@@ -0,0 +1,33 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * sys.h | ||
5 | * | ||
6 | * Function prototypes for o2cb sysfs interface | ||
7 | * | ||
8 | * Copyright (C) 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation, | ||
13 | * version 2 of the License. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | #ifndef O2CLUSTER_SYS_H | ||
28 | #define O2CLUSTER_SYS_H | ||
29 | |||
30 | void o2cb_sys_shutdown(void); | ||
31 | int o2cb_sys_init(void); | ||
32 | |||
33 | #endif /* O2CLUSTER_SYS_H */ | ||
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c new file mode 100644 index 000000000000..35d92c01a972 --- /dev/null +++ b/fs/ocfs2/cluster/tcp.c | |||
@@ -0,0 +1,1829 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * | ||
3 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
4 | * | ||
5 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public | ||
9 | * License as published by the Free Software Foundation; either | ||
10 | * version 2 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
15 | * General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public | ||
18 | * License along with this program; if not, write to the | ||
19 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
20 | * Boston, MA 02111-1307, USA. | ||
21 | * | ||
22 | * ---- | ||
23 | * | ||
24 | * Callers for this were originally written against a very simple synchronous | ||
25 | * API. This implementation reflects those simple callers. Some day I'm sure | ||
26 | * we'll need to move to a more robust posting/callback mechanism. | ||
27 | * | ||
28 | * Transmit calls pass in kernel virtual addresses and block copying them into | ||
29 | * the socket's tx buffers via the usual blocking sendmsg. They'll block waiting | ||
30 | * for a failed socket to time out. TX callers can also pass in a pointer to an | ||
31 | * 'int' which gets filled with an errno off the wire in response to the | ||
32 | * message they send. | ||
33 | * | ||
34 | * Handlers for unsolicited messages are registered. Each socket has a page | ||
35 | * that incoming data is copied into. First the header, then the data. | ||
36 | * Handlers are called from only one thread with a reference to this per-socket | ||
37 | * page. This page is destroyed after the handler call, so it can't be | ||
38 | * referenced beyond the call. Handlers may block but are discouraged from | ||
39 | * doing so. | ||
40 | * | ||
41 | * Any framing errors (bad magic, large payload lengths) close a connection. | ||
42 | * | ||
43 | * Our sock_container holds the state we associate with a socket. Its current | ||
44 | * framing state is held there as well as the refcounting we do around when it | ||
45 | * is safe to tear down the socket. The socket is only finally torn down from | ||
46 | * the container when the container loses all of its references -- so as long | ||
47 | * as you hold a ref on the container you can trust that the socket is valid | ||
48 | * for use with kernel socket APIs. | ||
49 | * | ||
50 | * Connections are initiated between a pair of nodes when the node with the | ||
51 | * higher node number gets a heartbeat callback which indicates that the lower | ||
52 | * numbered node has started heartbeating. The lower numbered node is passive | ||
53 | * and only accepts the connection if the higher numbered node is heartbeating. | ||
54 | */ | ||
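Given the synchronous model described in the comment above, a typical transmit looks like the following sketch. EXAMPLE_MSG_TYPE, EXAMPLE_KEY and example_send() are invented names; o2net_send_message() and its status convention come from this file:

	#include <linux/types.h>
	#include "tcp.h"

	#define EXAMPLE_MSG_TYPE	1		/* invented for illustration */
	#define EXAMPLE_KEY		0x12345678	/* invented for illustration */

	static int example_send(u8 node, void *payload, u32 len)
	{
		int status = 0;
		int ret;

		/* blocks while the message is copied to the socket and until
		 * the remote handler's status comes back off the wire */
		ret = o2net_send_message(EXAMPLE_MSG_TYPE, EXAMPLE_KEY,
					 payload, len, node, &status);
		if (ret < 0)
			return ret;	/* local or system-level error */
		return status;		/* the remote handler's return value */
	}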
55 | |||
56 | #include <linux/kernel.h> | ||
57 | #include <linux/jiffies.h> | ||
58 | #include <linux/slab.h> | ||
59 | #include <linux/idr.h> | ||
60 | #include <linux/kref.h> | ||
61 | #include <net/tcp.h> | ||
62 | |||
63 | #include <asm/uaccess.h> | ||
64 | |||
65 | #include "heartbeat.h" | ||
66 | #include "tcp.h" | ||
67 | #include "nodemanager.h" | ||
68 | #define MLOG_MASK_PREFIX ML_TCP | ||
69 | #include "masklog.h" | ||
70 | #include "quorum.h" | ||
71 | |||
72 | #include "tcp_internal.h" | ||
73 | |||
74 | /* | ||
75 | * The linux network stack isn't sparse endian clean.. It has macros like | ||
76 | * ntohs() which perform the endian checks and structs like sockaddr_in | ||
77 | * which aren't annotated. So __force is found here to get the build | ||
78 | * clean. When they emerge from the dark ages and annotate the code | ||
79 | * we can remove these. | ||
80 | */ | ||
81 | |||
82 | #define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u" | ||
83 | #define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \ | ||
84 | NIPQUAD(sc->sc_node->nd_ipv4_address), \ | ||
85 | ntohs(sc->sc_node->nd_ipv4_port) | ||
86 | |||
87 | /* | ||
88 | * In the following two log macros, the whitespace after the ',' just | ||
89 | * before ##args is intentional. Otherwise, gcc 2.95 will eat the | ||
90 | * previous token if args expands to nothing. | ||
91 | */ | ||
92 | #define msglog(hdr, fmt, args...) do { \ | ||
93 | typeof(hdr) __hdr = (hdr); \ | ||
94 | mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d " \ | ||
95 | "key %08x num %u] " fmt, \ | ||
96 | be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), \ | ||
97 | be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status), \ | ||
98 | be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key), \ | ||
99 | be32_to_cpu(__hdr->msg_num) , ##args); \ | ||
100 | } while (0) | ||
101 | |||
102 | #define sclog(sc, fmt, args...) do { \ | ||
103 | typeof(sc) __sc = (sc); \ | ||
104 | mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \ | ||
105 | "pg_off %zu] " fmt, __sc, \ | ||
106 | atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \ | ||
107 | __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \ | ||
108 | ##args); \ | ||
109 | } while (0) | ||
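To make the warning above concrete: the GNU cpp ", ##args" extension deletes the trailing comma when args expands to nothing, so both invocations below are safe, while writing ",##args" without the space trips up gcc 2.95 in the empty case:

	/* illustration only, not part of the patch:
	 *
	 *   sclog(sc, "empty\n");        args empty   -> ", ##args" drops the comma
	 *   sclog(sc, "ret %d\n", ret);  args present -> comma kept as usual
	 */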
110 | |||
111 | static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED; | ||
112 | static struct rb_root o2net_handler_tree = RB_ROOT; | ||
113 | |||
114 | static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; | ||
115 | |||
116 | /* XXX someday we'll need better accounting */ | ||
117 | static struct socket *o2net_listen_sock = NULL; | ||
118 | |||
119 | /* | ||
120 | * listen work is only queued by the listening socket callbacks on the | ||
121 | * o2net_wq. teardown detaches the callbacks before destroying the workqueue. | ||
122 | * quorum work is queued as sock containers are shut down.. stop_listening | ||
123 | * tears down all the node's sock containers, preventing future shutdowns | ||
124 | * and queued quorum work, before canceling delayed quorum work and | ||
125 | * destroying the work queue. | ||
126 | */ | ||
127 | static struct workqueue_struct *o2net_wq; | ||
128 | static struct work_struct o2net_listen_work; | ||
129 | |||
130 | static struct o2hb_callback_func o2net_hb_up, o2net_hb_down; | ||
131 | #define O2NET_HB_PRI 0x1 | ||
132 | |||
133 | static struct o2net_handshake *o2net_hand; | ||
134 | static struct o2net_msg *o2net_keep_req, *o2net_keep_resp; | ||
135 | |||
136 | static int o2net_sys_err_translations[O2NET_ERR_MAX] = | ||
137 | {[O2NET_ERR_NONE] = 0, | ||
138 | [O2NET_ERR_NO_HNDLR] = -ENOPROTOOPT, | ||
139 | [O2NET_ERR_OVERFLOW] = -EOVERFLOW, | ||
140 | [O2NET_ERR_DIED] = -EHOSTDOWN,}; | ||
141 | |||
142 | /* can't quite avoid *all* internal declarations :/ */ | ||
143 | static void o2net_sc_connect_completed(void *arg); | ||
144 | static void o2net_rx_until_empty(void *arg); | ||
145 | static void o2net_shutdown_sc(void *arg); | ||
146 | static void o2net_listen_data_ready(struct sock *sk, int bytes); | ||
147 | static void o2net_sc_send_keep_req(void *arg); | ||
148 | static void o2net_idle_timer(unsigned long data); | ||
149 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); | ||
150 | |||
151 | static inline int o2net_sys_err_to_errno(enum o2net_system_error err) | ||
152 | { | ||
153 | int trans; | ||
154 | BUG_ON(err >= O2NET_ERR_MAX); | ||
155 | trans = o2net_sys_err_translations[err]; | ||
156 | |||
157 | /* Just in case we mess up the translation table above */ | ||
158 | BUG_ON(err != O2NET_ERR_NONE && trans == 0); | ||
159 | return trans; | ||
160 | } | ||
161 | |||
162 | static struct o2net_node * o2net_nn_from_num(u8 node_num) | ||
163 | { | ||
164 | BUG_ON(node_num >= ARRAY_SIZE(o2net_nodes)); | ||
165 | return &o2net_nodes[node_num]; | ||
166 | } | ||
167 | |||
168 | static u8 o2net_num_from_nn(struct o2net_node *nn) | ||
169 | { | ||
170 | BUG_ON(nn == NULL); | ||
171 | return nn - o2net_nodes; | ||
172 | } | ||
173 | |||
174 | /* ------------------------------------------------------------ */ | ||
175 | |||
176 | static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw) | ||
177 | { | ||
178 | int ret = 0; | ||
179 | |||
180 | do { | ||
181 | if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) { | ||
182 | ret = -EAGAIN; | ||
183 | break; | ||
184 | } | ||
185 | spin_lock(&nn->nn_lock); | ||
186 | ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id); | ||
187 | if (ret == 0) | ||
188 | list_add_tail(&nsw->ns_node_item, | ||
189 | &nn->nn_status_list); | ||
190 | spin_unlock(&nn->nn_lock); | ||
191 | } while (ret == -EAGAIN); | ||
192 | |||
193 | if (ret == 0) { | ||
194 | init_waitqueue_head(&nsw->ns_wq); | ||
195 | nsw->ns_sys_status = O2NET_ERR_NONE; | ||
196 | nsw->ns_status = 0; | ||
197 | } | ||
198 | |||
199 | return ret; | ||
200 | } | ||
201 | |||
202 | static void o2net_complete_nsw_locked(struct o2net_node *nn, | ||
203 | struct o2net_status_wait *nsw, | ||
204 | enum o2net_system_error sys_status, | ||
205 | s32 status) | ||
206 | { | ||
207 | assert_spin_locked(&nn->nn_lock); | ||
208 | |||
209 | if (!list_empty(&nsw->ns_node_item)) { | ||
210 | list_del_init(&nsw->ns_node_item); | ||
211 | nsw->ns_sys_status = sys_status; | ||
212 | nsw->ns_status = status; | ||
213 | idr_remove(&nn->nn_status_idr, nsw->ns_id); | ||
214 | wake_up(&nsw->ns_wq); | ||
215 | } | ||
216 | } | ||
217 | |||
218 | static void o2net_complete_nsw(struct o2net_node *nn, | ||
219 | struct o2net_status_wait *nsw, | ||
220 | u64 id, enum o2net_system_error sys_status, | ||
221 | s32 status) | ||
222 | { | ||
223 | spin_lock(&nn->nn_lock); | ||
224 | if (nsw == NULL) { | ||
225 | if (id > INT_MAX) | ||
226 | goto out; | ||
227 | |||
228 | nsw = idr_find(&nn->nn_status_idr, id); | ||
229 | if (nsw == NULL) | ||
230 | goto out; | ||
231 | } | ||
232 | |||
233 | o2net_complete_nsw_locked(nn, nsw, sys_status, status); | ||
234 | |||
235 | out: | ||
236 | spin_unlock(&nn->nn_lock); | ||
237 | return; | ||
238 | } | ||
239 | |||
240 | static void o2net_complete_nodes_nsw(struct o2net_node *nn) | ||
241 | { | ||
242 | struct list_head *iter, *tmp; | ||
243 | unsigned int num_kills = 0; | ||
244 | struct o2net_status_wait *nsw; | ||
245 | |||
246 | assert_spin_locked(&nn->nn_lock); | ||
247 | |||
248 | list_for_each_safe(iter, tmp, &nn->nn_status_list) { | ||
249 | nsw = list_entry(iter, struct o2net_status_wait, ns_node_item); | ||
250 | o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0); | ||
251 | num_kills++; | ||
252 | } | ||
253 | |||
254 | mlog(0, "completed %d messages for node %u\n", num_kills, | ||
255 | o2net_num_from_nn(nn)); | ||
256 | } | ||
257 | |||
258 | static int o2net_nsw_completed(struct o2net_node *nn, | ||
259 | struct o2net_status_wait *nsw) | ||
260 | { | ||
261 | int completed; | ||
262 | spin_lock(&nn->nn_lock); | ||
263 | completed = list_empty(&nsw->ns_node_item); | ||
264 | spin_unlock(&nn->nn_lock); | ||
265 | return completed; | ||
266 | } | ||
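Taken together, these helpers give each in-flight message a small lifecycle. A summary sketch, pieced together from the functions above and their callers later in the file:

	/* status-wait (nsw) lifecycle, illustration only:
	 *
	 *   o2net_prep_nsw()           sender gets an id in nn_status_idr and
	 *                              goes on nn_status_list
	 *   ...message sent with msg_num = nsw.ns_id...
	 *   o2net_complete_nsw()       rx path matches a STATUS_MAGIC reply by
	 *                              id and wakes the sender, or
	 *   o2net_complete_nodes_nsw() connection death fails every waiter
	 *                              with O2NET_ERR_DIED
	 *   o2net_nsw_completed()      the sender's wait_event() condition
	 */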
267 | |||
268 | /* ------------------------------------------------------------ */ | ||
269 | |||
270 | static void sc_kref_release(struct kref *kref) | ||
271 | { | ||
272 | struct o2net_sock_container *sc = container_of(kref, | ||
273 | struct o2net_sock_container, sc_kref); | ||
274 | sclog(sc, "releasing\n"); | ||
275 | |||
276 | if (sc->sc_sock) { | ||
277 | sock_release(sc->sc_sock); | ||
278 | sc->sc_sock = NULL; | ||
279 | } | ||
280 | |||
281 | o2nm_node_put(sc->sc_node); | ||
282 | sc->sc_node = NULL; | ||
283 | |||
284 | kfree(sc); | ||
285 | } | ||
286 | |||
287 | static void sc_put(struct o2net_sock_container *sc) | ||
288 | { | ||
289 | sclog(sc, "put\n"); | ||
290 | kref_put(&sc->sc_kref, sc_kref_release); | ||
291 | } | ||
292 | static void sc_get(struct o2net_sock_container *sc) | ||
293 | { | ||
294 | sclog(sc, "get\n"); | ||
295 | kref_get(&sc->sc_kref); | ||
296 | } | ||
297 | static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) | ||
298 | { | ||
299 | struct o2net_sock_container *sc, *ret = NULL; | ||
300 | struct page *page = NULL; | ||
301 | |||
302 | page = alloc_page(GFP_NOFS); | ||
303 | sc = kcalloc(1, sizeof(*sc), GFP_NOFS); | ||
304 | if (sc == NULL || page == NULL) | ||
305 | goto out; | ||
306 | |||
307 | kref_init(&sc->sc_kref); | ||
308 | o2nm_node_get(node); | ||
309 | sc->sc_node = node; | ||
310 | |||
311 | INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed, sc); | ||
312 | INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty, sc); | ||
313 | INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc, sc); | ||
314 | INIT_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req, sc); | ||
315 | |||
316 | init_timer(&sc->sc_idle_timeout); | ||
317 | sc->sc_idle_timeout.function = o2net_idle_timer; | ||
318 | sc->sc_idle_timeout.data = (unsigned long)sc; | ||
319 | |||
320 | sclog(sc, "alloced\n"); | ||
321 | |||
322 | ret = sc; | ||
323 | sc->sc_page = page; | ||
324 | sc = NULL; | ||
325 | page = NULL; | ||
326 | |||
327 | out: | ||
328 | if (page) | ||
329 | __free_page(page); | ||
330 | kfree(sc); | ||
331 | |||
332 | return ret; | ||
333 | } | ||
334 | |||
335 | /* ------------------------------------------------------------ */ | ||
336 | |||
337 | static void o2net_sc_queue_work(struct o2net_sock_container *sc, | ||
338 | struct work_struct *work) | ||
339 | { | ||
340 | sc_get(sc); | ||
341 | if (!queue_work(o2net_wq, work)) | ||
342 | sc_put(sc); | ||
343 | } | ||
344 | static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc, | ||
345 | struct work_struct *work, | ||
346 | int delay) | ||
347 | { | ||
348 | sc_get(sc); | ||
349 | if (!queue_delayed_work(o2net_wq, work, delay)) | ||
350 | sc_put(sc); | ||
351 | } | ||
352 | static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc, | ||
353 | struct work_struct *work) | ||
354 | { | ||
355 | if (cancel_delayed_work(work)) | ||
356 | sc_put(sc); | ||
357 | } | ||
358 | |||
359 | static void o2net_set_nn_state(struct o2net_node *nn, | ||
360 | struct o2net_sock_container *sc, | ||
361 | unsigned valid, int err) | ||
362 | { | ||
363 | int was_valid = nn->nn_sc_valid; | ||
364 | int was_err = nn->nn_persistent_error; | ||
365 | struct o2net_sock_container *old_sc = nn->nn_sc; | ||
366 | |||
367 | assert_spin_locked(&nn->nn_lock); | ||
368 | |||
369 | /* the node num comparison and single connect/accept path should stop | ||
370 | * a non-null sc from being overwritten with another */ | ||
371 | BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc); | ||
372 | mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); | ||
373 | mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); | ||
374 | |||
375 | /* we won't reconnect after our valid conn goes away for | ||
376 | * this hb iteration.. here so it shows up in the logs */ | ||
377 | if (was_valid && !valid && err == 0) | ||
378 | err = -ENOTCONN; | ||
379 | |||
380 | mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n", | ||
381 | o2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid, | ||
382 | nn->nn_persistent_error, err); | ||
383 | |||
384 | nn->nn_sc = sc; | ||
385 | nn->nn_sc_valid = valid ? 1 : 0; | ||
386 | nn->nn_persistent_error = err; | ||
387 | |||
388 | /* mirrors o2net_tx_can_proceed() */ | ||
389 | if (nn->nn_persistent_error || nn->nn_sc_valid) | ||
390 | wake_up(&nn->nn_sc_wq); | ||
391 | |||
392 | if (!was_err && nn->nn_persistent_error) { | ||
393 | o2quo_conn_err(o2net_num_from_nn(nn)); | ||
394 | queue_delayed_work(o2net_wq, &nn->nn_still_up, | ||
395 | msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); | ||
396 | } | ||
397 | |||
398 | if (was_valid && !valid) { | ||
399 | mlog(ML_NOTICE, "no longer connected to " SC_NODEF_FMT "\n", | ||
400 | SC_NODEF_ARGS(old_sc)); | ||
401 | o2net_complete_nodes_nsw(nn); | ||
402 | } | ||
403 | |||
404 | if (!was_valid && valid) { | ||
405 | o2quo_conn_up(o2net_num_from_nn(nn)); | ||
406 | /* this is a bit of a hack. we only try reconnecting | ||
407 | * when heartbeating starts until we get a connection. | ||
408 | * if that connection then dies we don't try reconnecting. | ||
409 | * the only way to start connecting again is to down | ||
410 | * heartbeat and bring it back up. */ | ||
411 | cancel_delayed_work(&nn->nn_connect_expired); | ||
412 | mlog(ML_NOTICE, "%s " SC_NODEF_FMT "\n", | ||
413 | o2nm_this_node() > sc->sc_node->nd_num ? | ||
414 | "connected to" : "accepted connection from", | ||
415 | SC_NODEF_ARGS(sc)); | ||
416 | } | ||
417 | |||
418 | /* trigger the connecting worker func as long as we're not valid, | ||
419 | * it will back off if it shouldn't connect. This can be called | ||
420 | * from node config teardown and so needs to be careful about | ||
421 | * the work queue actually being up. */ | ||
422 | if (!valid && o2net_wq) { | ||
423 | unsigned long delay; | ||
424 | * delay if we're within a RECONNECT_DELAY of the | ||
425 | * last attempt */ | ||
426 | delay = (nn->nn_last_connect_attempt + | ||
427 | msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS)) | ||
428 | - jiffies; | ||
429 | if (delay > msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS)) | ||
430 | delay = 0; | ||
431 | mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); | ||
432 | queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); | ||
433 | } | ||
434 | |||
435 | /* keep track of the nn's sc ref for the caller */ | ||
436 | if ((old_sc == NULL) && sc) | ||
437 | sc_get(sc); | ||
438 | if (old_sc && (old_sc != sc)) { | ||
439 | o2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work); | ||
440 | sc_put(old_sc); | ||
441 | } | ||
442 | } | ||
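The delay computation near the end of the function leans on unsigned jiffies arithmetic. A worked example with illustrative numbers:

	/* illustration only: suppose O2NET_RECONNECT_DELAY_MS converts to
	 * 2000 jiffies
	 *
	 *   last attempt at jiffies 1000, now 1500:
	 *       delay = (1000 + 2000) - 1500 = 1500   -> wait out the window
	 *   last attempt at jiffies 1000, now 4000:
	 *       delay = (1000 + 2000) - 4000 wraps to a huge unsigned value
	 *             -> caught by "delay > msecs_to_jiffies(...)" and
	 *                clamped to 0
	 *
	 * so an attempt older than the window reconnects immediately */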
443 | |||
444 | /* see o2net_register_callbacks() */ | ||
445 | static void o2net_data_ready(struct sock *sk, int bytes) | ||
446 | { | ||
447 | void (*ready)(struct sock *sk, int bytes); | ||
448 | |||
449 | read_lock(&sk->sk_callback_lock); | ||
450 | if (sk->sk_user_data) { | ||
451 | struct o2net_sock_container *sc = sk->sk_user_data; | ||
452 | sclog(sc, "data_ready hit\n"); | ||
453 | do_gettimeofday(&sc->sc_tv_data_ready); | ||
454 | o2net_sc_queue_work(sc, &sc->sc_rx_work); | ||
455 | ready = sc->sc_data_ready; | ||
456 | } else { | ||
457 | ready = sk->sk_data_ready; | ||
458 | } | ||
459 | read_unlock(&sk->sk_callback_lock); | ||
460 | |||
461 | ready(sk, bytes); | ||
462 | } | ||
463 | |||
464 | /* see o2net_register_callbacks() */ | ||
465 | static void o2net_state_change(struct sock *sk) | ||
466 | { | ||
467 | void (*state_change)(struct sock *sk); | ||
468 | struct o2net_sock_container *sc; | ||
469 | |||
470 | read_lock(&sk->sk_callback_lock); | ||
471 | sc = sk->sk_user_data; | ||
472 | if (sc == NULL) { | ||
473 | state_change = sk->sk_state_change; | ||
474 | goto out; | ||
475 | } | ||
476 | |||
477 | sclog(sc, "state_change to %d\n", sk->sk_state); | ||
478 | |||
479 | state_change = sc->sc_state_change; | ||
480 | |||
481 | switch(sk->sk_state) { | ||
482 | /* ignore connecting sockets as they make progress */ | ||
483 | case TCP_SYN_SENT: | ||
484 | case TCP_SYN_RECV: | ||
485 | break; | ||
486 | case TCP_ESTABLISHED: | ||
487 | o2net_sc_queue_work(sc, &sc->sc_connect_work); | ||
488 | break; | ||
489 | default: | ||
490 | o2net_sc_queue_work(sc, &sc->sc_shutdown_work); | ||
491 | break; | ||
492 | } | ||
493 | out: | ||
494 | read_unlock(&sk->sk_callback_lock); | ||
495 | state_change(sk); | ||
496 | } | ||
497 | |||
498 | /* | ||
499 | * we register callbacks so we can queue work on events before calling | ||
500 | * the original callbacks. our callbacks are careful to test user_data | ||
501 | * to discover when they've raced with o2net_unregister_callbacks(). | ||
502 | */ | ||
503 | static void o2net_register_callbacks(struct sock *sk, | ||
504 | struct o2net_sock_container *sc) | ||
505 | { | ||
506 | write_lock_bh(&sk->sk_callback_lock); | ||
507 | |||
508 | /* accepted sockets inherit the old listen socket data ready */ | ||
509 | if (sk->sk_data_ready == o2net_listen_data_ready) { | ||
510 | sk->sk_data_ready = sk->sk_user_data; | ||
511 | sk->sk_user_data = NULL; | ||
512 | } | ||
513 | |||
514 | BUG_ON(sk->sk_user_data != NULL); | ||
515 | sk->sk_user_data = sc; | ||
516 | sc_get(sc); | ||
517 | |||
518 | sc->sc_data_ready = sk->sk_data_ready; | ||
519 | sc->sc_state_change = sk->sk_state_change; | ||
520 | sk->sk_data_ready = o2net_data_ready; | ||
521 | sk->sk_state_change = o2net_state_change; | ||
522 | |||
523 | write_unlock_bh(&sk->sk_callback_lock); | ||
524 | } | ||
525 | |||
526 | static int o2net_unregister_callbacks(struct sock *sk, | ||
527 | struct o2net_sock_container *sc) | ||
528 | { | ||
529 | int ret = 0; | ||
530 | |||
531 | write_lock_bh(&sk->sk_callback_lock); | ||
532 | if (sk->sk_user_data == sc) { | ||
533 | ret = 1; | ||
534 | sk->sk_user_data = NULL; | ||
535 | sk->sk_data_ready = sc->sc_data_ready; | ||
536 | sk->sk_state_change = sc->sc_state_change; | ||
537 | } | ||
538 | write_unlock_bh(&sk->sk_callback_lock); | ||
539 | |||
540 | return ret; | ||
541 | } | ||
542 | |||
543 | /* | ||
544 | * this is a little helper that is called by callers who have seen a problem | ||
545 | * with an sc and want to detach it from the nn if someone already hasn't beat | ||
546 | * them to it. if an error is given then the shutdown will be persistent | ||
547 | * and pending transmits will be canceled. | ||
548 | */ | ||
549 | static void o2net_ensure_shutdown(struct o2net_node *nn, | ||
550 | struct o2net_sock_container *sc, | ||
551 | int err) | ||
552 | { | ||
553 | spin_lock(&nn->nn_lock); | ||
554 | if (nn->nn_sc == sc) | ||
555 | o2net_set_nn_state(nn, NULL, 0, err); | ||
556 | spin_unlock(&nn->nn_lock); | ||
557 | } | ||
558 | |||
559 | /* | ||
560 | * This work queue function performs the blocking parts of socket shutdown. A | ||
561 | * few paths lead here. set_nn_state will trigger this callback if it sees an | ||
562 | * sc detached from the nn. state_change will also trigger this callback | ||
563 | * directly when it sees errors. In that case we need to call set_nn_state | ||
564 | * ourselves as state_change couldn't get the nn_lock and call set_nn_state | ||
565 | * itself. | ||
566 | */ | ||
567 | static void o2net_shutdown_sc(void *arg) | ||
568 | { | ||
569 | struct o2net_sock_container *sc = arg; | ||
570 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); | ||
571 | |||
572 | sclog(sc, "shutting down\n"); | ||
573 | |||
574 | /* drop the callbacks ref and call shutdown only once */ | ||
575 | if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) { | ||
576 | /* we shouldn't flush as we're in the thread, the | ||
577 | * races with pending sc work structs are harmless */ | ||
578 | del_timer_sync(&sc->sc_idle_timeout); | ||
579 | o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); | ||
580 | sc_put(sc); | ||
581 | sc->sc_sock->ops->shutdown(sc->sc_sock, | ||
582 | RCV_SHUTDOWN|SEND_SHUTDOWN); | ||
583 | } | ||
584 | |||
585 | /* not fatal so failed connects before the other guy has our | ||
586 | * heartbeat can be retried */ | ||
587 | o2net_ensure_shutdown(nn, sc, 0); | ||
588 | sc_put(sc); | ||
589 | } | ||
590 | |||
591 | /* ------------------------------------------------------------ */ | ||
592 | |||
593 | static int o2net_handler_cmp(struct o2net_msg_handler *nmh, u32 msg_type, | ||
594 | u32 key) | ||
595 | { | ||
596 | int ret = memcmp(&nmh->nh_key, &key, sizeof(key)); | ||
597 | |||
598 | if (ret == 0) | ||
599 | ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type)); | ||
600 | |||
601 | return ret; | ||
602 | } | ||
603 | |||
604 | static struct o2net_msg_handler * | ||
605 | o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, | ||
606 | struct rb_node **ret_parent) | ||
607 | { | ||
608 | struct rb_node **p = &o2net_handler_tree.rb_node; | ||
609 | struct rb_node *parent = NULL; | ||
610 | struct o2net_msg_handler *nmh, *ret = NULL; | ||
611 | int cmp; | ||
612 | |||
613 | while (*p) { | ||
614 | parent = *p; | ||
615 | nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); | ||
616 | cmp = o2net_handler_cmp(nmh, msg_type, key); | ||
617 | |||
618 | if (cmp < 0) | ||
619 | p = &(*p)->rb_left; | ||
620 | else if (cmp > 0) | ||
621 | p = &(*p)->rb_right; | ||
622 | else { | ||
623 | ret = nmh; | ||
624 | break; | ||
625 | } | ||
626 | } | ||
627 | |||
628 | if (ret_p != NULL) | ||
629 | *ret_p = p; | ||
630 | if (ret_parent != NULL) | ||
631 | *ret_parent = parent; | ||
632 | |||
633 | return ret; | ||
634 | } | ||
635 | |||
636 | static void o2net_handler_kref_release(struct kref *kref) | ||
637 | { | ||
638 | struct o2net_msg_handler *nmh; | ||
639 | nmh = container_of(kref, struct o2net_msg_handler, nh_kref); | ||
640 | |||
641 | kfree(nmh); | ||
642 | } | ||
643 | |||
644 | static void o2net_handler_put(struct o2net_msg_handler *nmh) | ||
645 | { | ||
646 | kref_put(&nmh->nh_kref, o2net_handler_kref_release); | ||
647 | } | ||
648 | |||
649 | /* max_len is protection for the handler func. incoming messages won't | ||
650 | * be given to the handler if their payload is longer than the max. */ | ||
651 | int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, | ||
652 | o2net_msg_handler_func *func, void *data, | ||
653 | struct list_head *unreg_list) | ||
654 | { | ||
655 | struct o2net_msg_handler *nmh = NULL; | ||
656 | struct rb_node **p, *parent; | ||
657 | int ret = 0; | ||
658 | |||
659 | if (max_len > O2NET_MAX_PAYLOAD_BYTES) { | ||
660 | mlog(0, "max_len for message handler out of range: %u\n", | ||
661 | max_len); | ||
662 | ret = -EINVAL; | ||
663 | goto out; | ||
664 | } | ||
665 | |||
666 | if (!msg_type) { | ||
667 | mlog(0, "no message type provided: %u, %p\n", msg_type, func); | ||
668 | ret = -EINVAL; | ||
669 | goto out; | ||
670 | |||
671 | } | ||
672 | if (!func) { | ||
673 | mlog(0, "no message handler provided: %u, %p\n", | ||
674 | msg_type, func); | ||
675 | ret = -EINVAL; | ||
676 | goto out; | ||
677 | } | ||
678 | |||
679 | nmh = kcalloc(1, sizeof(struct o2net_msg_handler), GFP_NOFS); | ||
680 | if (nmh == NULL) { | ||
681 | ret = -ENOMEM; | ||
682 | goto out; | ||
683 | } | ||
684 | |||
685 | nmh->nh_func = func; | ||
686 | nmh->nh_func_data = data; | ||
687 | nmh->nh_msg_type = msg_type; | ||
688 | nmh->nh_max_len = max_len; | ||
689 | nmh->nh_key = key; | ||
690 | /* the tree and list get this ref.. they're both removed in | ||
691 | * unregister when this ref is dropped */ | ||
692 | kref_init(&nmh->nh_kref); | ||
693 | INIT_LIST_HEAD(&nmh->nh_unregister_item); | ||
694 | |||
695 | write_lock(&o2net_handler_lock); | ||
696 | if (o2net_handler_tree_lookup(msg_type, key, &p, &parent)) | ||
697 | ret = -EEXIST; | ||
698 | else { | ||
699 | rb_link_node(&nmh->nh_node, parent, p); | ||
700 | rb_insert_color(&nmh->nh_node, &o2net_handler_tree); | ||
701 | list_add_tail(&nmh->nh_unregister_item, unreg_list); | ||
702 | |||
703 | mlog(ML_TCP, "registered handler func %p type %u key %08x\n", | ||
704 | func, msg_type, key); | ||
705 | /* we've had some trouble with handlers seemingly vanishing. */ | ||
706 | mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p, | ||
707 | &parent) == NULL, | ||
708 | "couldn't find handler we *just* registered " | ||
709 | "for type %u key %08x\n", msg_type, key); | ||
710 | } | ||
711 | write_unlock(&o2net_handler_lock); | ||
712 | if (ret) | ||
713 | goto out; | ||
714 | |||
715 | out: | ||
716 | if (ret) | ||
717 | kfree(nmh); | ||
718 | |||
719 | return ret; | ||
720 | } | ||
721 | EXPORT_SYMBOL_GPL(o2net_register_handler); | ||
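Registering a handler for the rx path follows a fixed pattern. A hypothetical sketch (the names and the 64-byte max_len are invented; the handler signature is inferred from the call site in o2net_process_message() later in this file):

	#include <linux/list.h>
	#include "tcp.h"

	static LIST_HEAD(example_unreg_list);

	/* the return value travels back to the sender as its *status */
	static int example_handler(struct o2net_msg *msg, u32 len, void *data)
	{
		/* the message lives in the per-socket page; don't keep
		 * pointers into it after returning */
		return 0;
	}

	static int example_register(void)
	{
		return o2net_register_handler(EXAMPLE_MSG_TYPE, EXAMPLE_KEY,
					      64 /* max payload bytes */,
					      example_handler, NULL,
					      &example_unreg_list);
	}

	/* teardown: o2net_unregister_handler_list(&example_unreg_list); */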
722 | |||
723 | void o2net_unregister_handler_list(struct list_head *list) | ||
724 | { | ||
725 | struct list_head *pos, *n; | ||
726 | struct o2net_msg_handler *nmh; | ||
727 | |||
728 | write_lock(&o2net_handler_lock); | ||
729 | list_for_each_safe(pos, n, list) { | ||
730 | nmh = list_entry(pos, struct o2net_msg_handler, | ||
731 | nh_unregister_item); | ||
732 | mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", | ||
733 | nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); | ||
734 | rb_erase(&nmh->nh_node, &o2net_handler_tree); | ||
735 | list_del_init(&nmh->nh_unregister_item); | ||
736 | kref_put(&nmh->nh_kref, o2net_handler_kref_release); | ||
737 | } | ||
738 | write_unlock(&o2net_handler_lock); | ||
739 | } | ||
740 | EXPORT_SYMBOL_GPL(o2net_unregister_handler_list); | ||
741 | |||
742 | static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) | ||
743 | { | ||
744 | struct o2net_msg_handler *nmh; | ||
745 | |||
746 | read_lock(&o2net_handler_lock); | ||
747 | nmh = o2net_handler_tree_lookup(msg_type, key, NULL, NULL); | ||
748 | if (nmh) | ||
749 | kref_get(&nmh->nh_kref); | ||
750 | read_unlock(&o2net_handler_lock); | ||
751 | |||
752 | return nmh; | ||
753 | } | ||
754 | |||
755 | /* ------------------------------------------------------------ */ | ||
756 | |||
757 | static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) | ||
758 | { | ||
759 | int ret; | ||
760 | mm_segment_t oldfs; | ||
761 | struct kvec vec = { | ||
762 | .iov_len = len, | ||
763 | .iov_base = data, | ||
764 | }; | ||
765 | struct msghdr msg = { | ||
766 | .msg_iovlen = 1, | ||
767 | .msg_iov = (struct iovec *)&vec, | ||
768 | .msg_flags = MSG_DONTWAIT, | ||
769 | }; | ||
770 | |||
771 | oldfs = get_fs(); | ||
772 | set_fs(get_ds()); | ||
773 | ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); | ||
774 | set_fs(oldfs); | ||
775 | |||
776 | return ret; | ||
777 | } | ||
778 | |||
779 | static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, | ||
780 | size_t veclen, size_t total) | ||
781 | { | ||
782 | int ret; | ||
783 | mm_segment_t oldfs; | ||
784 | struct msghdr msg = { | ||
785 | .msg_iov = (struct iovec *)vec, | ||
786 | .msg_iovlen = veclen, | ||
787 | }; | ||
788 | |||
789 | if (sock == NULL) { | ||
790 | ret = -EINVAL; | ||
791 | goto out; | ||
792 | } | ||
793 | |||
794 | oldfs = get_fs(); | ||
795 | set_fs(get_ds()); | ||
796 | ret = sock_sendmsg(sock, &msg, total); | ||
797 | set_fs(oldfs); | ||
798 | if (ret != total) { | ||
799 | mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, | ||
800 | total); | ||
801 | if (ret >= 0) | ||
802 | ret = -EPIPE; /* should be smarter, I bet */ | ||
803 | goto out; | ||
804 | } | ||
805 | |||
806 | ret = 0; | ||
807 | out: | ||
808 | if (ret < 0) | ||
809 | mlog(0, "returning error: %d\n", ret); | ||
810 | return ret; | ||
811 | } | ||
812 | |||
813 | static void o2net_sendpage(struct o2net_sock_container *sc, | ||
814 | void *kmalloced_virt, | ||
815 | size_t size) | ||
816 | { | ||
817 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); | ||
818 | ssize_t ret; | ||
819 | |||
820 | |||
821 | ret = sc->sc_sock->ops->sendpage(sc->sc_sock, | ||
822 | virt_to_page(kmalloced_virt), | ||
823 | (long)kmalloced_virt & ~PAGE_MASK, | ||
824 | size, MSG_DONTWAIT); | ||
825 | if (ret != size) { | ||
826 | mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT | ||
827 | " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); | ||
828 | o2net_ensure_shutdown(nn, sc, 0); | ||
829 | } | ||
830 | } | ||
831 | |||
832 | static void o2net_init_msg(struct o2net_msg *msg, u16 data_len, u16 msg_type, u32 key) | ||
833 | { | ||
834 | memset(msg, 0, sizeof(struct o2net_msg)); | ||
835 | msg->magic = cpu_to_be16(O2NET_MSG_MAGIC); | ||
836 | msg->data_len = cpu_to_be16(data_len); | ||
837 | msg->msg_type = cpu_to_be16(msg_type); | ||
838 | msg->sys_status = cpu_to_be32(O2NET_ERR_NONE); | ||
839 | msg->status = 0; | ||
840 | msg->key = cpu_to_be32(key); | ||
841 | } | ||
842 | |||
843 | static int o2net_tx_can_proceed(struct o2net_node *nn, | ||
844 | struct o2net_sock_container **sc_ret, | ||
845 | int *error) | ||
846 | { | ||
847 | int ret = 0; | ||
848 | |||
849 | spin_lock(&nn->nn_lock); | ||
850 | if (nn->nn_persistent_error) { | ||
851 | ret = 1; | ||
852 | *sc_ret = NULL; | ||
853 | *error = nn->nn_persistent_error; | ||
854 | } else if (nn->nn_sc_valid) { | ||
855 | kref_get(&nn->nn_sc->sc_kref); | ||
856 | |||
857 | ret = 1; | ||
858 | *sc_ret = nn->nn_sc; | ||
859 | *error = 0; | ||
860 | } | ||
861 | spin_unlock(&nn->nn_lock); | ||
862 | |||
863 | return ret; | ||
864 | } | ||
865 | |||
866 | int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | ||
867 | size_t caller_veclen, u8 target_node, int *status) | ||
868 | { | ||
869 | int ret, error = 0; | ||
870 | struct o2net_msg *msg = NULL; | ||
871 | size_t veclen, caller_bytes = 0; | ||
872 | struct kvec *vec = NULL; | ||
873 | struct o2net_sock_container *sc = NULL; | ||
874 | struct o2net_node *nn = o2net_nn_from_num(target_node); | ||
875 | struct o2net_status_wait nsw = { | ||
876 | .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), | ||
877 | }; | ||
878 | |||
879 | if (o2net_wq == NULL) { | ||
880 | mlog(0, "attempt to tx without o2netd running\n"); | ||
881 | ret = -ESRCH; | ||
882 | goto out; | ||
883 | } | ||
884 | |||
885 | if (caller_veclen == 0) { | ||
886 | mlog(0, "bad kvec array length\n"); | ||
887 | ret = -EINVAL; | ||
888 | goto out; | ||
889 | } | ||
890 | |||
891 | caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen); | ||
892 | if (caller_bytes > O2NET_MAX_PAYLOAD_BYTES) { | ||
893 | mlog(0, "total payload len %zu too large\n", caller_bytes); | ||
894 | ret = -EINVAL; | ||
895 | goto out; | ||
896 | } | ||
897 | |||
898 | if (target_node == o2nm_this_node()) { | ||
899 | ret = -ELOOP; | ||
900 | goto out; | ||
901 | } | ||
902 | |||
903 | ret = wait_event_interruptible(nn->nn_sc_wq, | ||
904 | o2net_tx_can_proceed(nn, &sc, &error)); | ||
905 | if (!ret && error) | ||
906 | ret = error; | ||
907 | if (ret) | ||
908 | goto out; | ||
909 | |||
910 | veclen = caller_veclen + 1; | ||
911 | vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); | ||
912 | if (vec == NULL) { | ||
913 | mlog(0, "failed to allocate %zu element kvec!\n", veclen); | ||
914 | ret = -ENOMEM; | ||
915 | goto out; | ||
916 | } | ||
917 | |||
918 | msg = kmalloc(sizeof(struct o2net_msg), GFP_ATOMIC); | ||
919 | if (!msg) { | ||
920 | mlog(0, "failed to allocate an o2net_msg!\n"); | ||
921 | ret = -ENOMEM; | ||
922 | goto out; | ||
923 | } | ||
924 | |||
925 | o2net_init_msg(msg, caller_bytes, msg_type, key); | ||
926 | |||
927 | vec[0].iov_len = sizeof(struct o2net_msg); | ||
928 | vec[0].iov_base = msg; | ||
929 | memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec)); | ||
930 | |||
931 | ret = o2net_prep_nsw(nn, &nsw); | ||
932 | if (ret) | ||
933 | goto out; | ||
934 | |||
935 | msg->msg_num = cpu_to_be32(nsw.ns_id); | ||
936 | |||
937 | /* finally, convert the message header to network byte-order | ||
938 | * and send */ | ||
939 | ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen, | ||
940 | sizeof(struct o2net_msg) + caller_bytes); | ||
941 | msglog(msg, "sending returned %d\n", ret); | ||
942 | if (ret < 0) { | ||
943 | mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret); | ||
944 | goto out; | ||
945 | } | ||
946 | |||
947 | /* wait on other node's handler */ | ||
948 | wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); | ||
949 | |||
950 | /* Note that we avoid overwriting the callers status return | ||
951 | * variable if a system error was reported on the other | ||
952 | * side. Callers beware. */ | ||
953 | ret = o2net_sys_err_to_errno(nsw.ns_sys_status); | ||
954 | if (status && !ret) | ||
955 | *status = nsw.ns_status; | ||
956 | |||
957 | mlog(0, "woken, returning system status %d, user status %d\n", | ||
958 | ret, nsw.ns_status); | ||
959 | out: | ||
960 | if (sc) | ||
961 | sc_put(sc); | ||
962 | if (vec) | ||
963 | kfree(vec); | ||
964 | if (msg) | ||
965 | kfree(msg); | ||
966 | o2net_complete_nsw(nn, &nsw, 0, 0, 0); | ||
967 | return ret; | ||
968 | } | ||
969 | EXPORT_SYMBOL_GPL(o2net_send_message_vec); | ||
970 | |||
971 | int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len, | ||
972 | u8 target_node, int *status) | ||
973 | { | ||
974 | struct kvec vec = { | ||
975 | .iov_base = data, | ||
976 | .iov_len = len, | ||
977 | }; | ||
978 | return o2net_send_message_vec(msg_type, key, &vec, 1, | ||
979 | target_node, status); | ||
980 | } | ||
981 | EXPORT_SYMBOL_GPL(o2net_send_message); | ||
982 | |||
983 | static int o2net_send_status_magic(struct socket *sock, struct o2net_msg *hdr, | ||
984 | enum o2net_system_error syserr, int err) | ||
985 | { | ||
986 | struct kvec vec = { | ||
987 | .iov_base = hdr, | ||
988 | .iov_len = sizeof(struct o2net_msg), | ||
989 | }; | ||
990 | |||
991 | BUG_ON(syserr >= O2NET_ERR_MAX); | ||
992 | |||
993 | /* leave other fields intact from the incoming message, msg_num | ||
994 | * in particular */ | ||
995 | hdr->sys_status = cpu_to_be32(syserr); | ||
996 | hdr->status = cpu_to_be32(err); | ||
997 | hdr->magic = cpu_to_be16(O2NET_MSG_STATUS_MAGIC); /* twiddle the magic */ | ||
998 | hdr->data_len = 0; | ||
999 | |||
1000 | msglog(hdr, "about to send status magic %d\n", err); | ||
1001 | /* hdr has been in host byteorder this whole time */ | ||
1002 | return o2net_send_tcp_msg(sock, &vec, 1, sizeof(struct o2net_msg)); | ||
1003 | } | ||
1004 | |||
1005 | /* this returns -errno if the header was unknown or too large, etc. | ||
1006 | * after this is called the buffer is reused for the next message */ | ||
1007 | static int o2net_process_message(struct o2net_sock_container *sc, | ||
1008 | struct o2net_msg *hdr) | ||
1009 | { | ||
1010 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); | ||
1011 | int ret = 0, handler_status; | ||
1012 | enum o2net_system_error syserr; | ||
1013 | struct o2net_msg_handler *nmh = NULL; | ||
1014 | |||
1015 | msglog(hdr, "processing message\n"); | ||
1016 | |||
1017 | o2net_sc_postpone_idle(sc); | ||
1018 | |||
1019 | switch(be16_to_cpu(hdr->magic)) { | ||
1020 | case O2NET_MSG_STATUS_MAGIC: | ||
1021 | /* special type for returning message status */ | ||
1022 | o2net_complete_nsw(nn, NULL, | ||
1023 | be32_to_cpu(hdr->msg_num), | ||
1024 | be32_to_cpu(hdr->sys_status), | ||
1025 | be32_to_cpu(hdr->status)); | ||
1026 | goto out; | ||
1027 | case O2NET_MSG_KEEP_REQ_MAGIC: | ||
1028 | o2net_sendpage(sc, o2net_keep_resp, | ||
1029 | sizeof(*o2net_keep_resp)); | ||
1030 | goto out; | ||
1031 | case O2NET_MSG_KEEP_RESP_MAGIC: | ||
1032 | goto out; | ||
1033 | case O2NET_MSG_MAGIC: | ||
1034 | break; | ||
1035 | default: | ||
1036 | msglog(hdr, "bad magic\n"); | ||
1037 | ret = -EINVAL; | ||
1038 | goto out; | ||
1039 | break; | ||
1040 | } | ||
1041 | |||
1042 | /* find a handler for it */ | ||
1043 | handler_status = 0; | ||
1044 | nmh = o2net_handler_get(be16_to_cpu(hdr->msg_type), | ||
1045 | be32_to_cpu(hdr->key)); | ||
1046 | if (!nmh) { | ||
1047 | mlog(ML_TCP, "couldn't find handler for type %u key %08x\n", | ||
1048 | be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key)); | ||
1049 | syserr = O2NET_ERR_NO_HNDLR; | ||
1050 | goto out_respond; | ||
1051 | } | ||
1052 | |||
1053 | syserr = O2NET_ERR_NONE; | ||
1054 | |||
1055 | if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len) | ||
1056 | syserr = O2NET_ERR_OVERFLOW; | ||
1057 | |||
1058 | if (syserr != O2NET_ERR_NONE) | ||
1059 | goto out_respond; | ||
1060 | |||
1061 | do_gettimeofday(&sc->sc_tv_func_start); | ||
1062 | sc->sc_msg_key = be32_to_cpu(hdr->key); | ||
1063 | sc->sc_msg_type = be16_to_cpu(hdr->msg_type); | ||
1064 | handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + | ||
1065 | be16_to_cpu(hdr->data_len), | ||
1066 | nmh->nh_func_data); | ||
1067 | do_gettimeofday(&sc->sc_tv_func_stop); | ||
1068 | |||
1069 | out_respond: | ||
1070 | /* this destroys the hdr, so don't use it after this */ | ||
1071 | ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr, | ||
1072 | handler_status); | ||
1073 | hdr = NULL; | ||
1074 | mlog(0, "sending handler status %d, syserr %d returned %d\n", | ||
1075 | handler_status, syserr, ret); | ||
1076 | |||
1077 | out: | ||
1078 | if (nmh) | ||
1079 | o2net_handler_put(nmh); | ||
1080 | return ret; | ||
1081 | } | ||
1082 | |||
1083 | static int o2net_check_handshake(struct o2net_sock_container *sc) | ||
1084 | { | ||
1085 | struct o2net_handshake *hand = page_address(sc->sc_page); | ||
1086 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); | ||
1087 | |||
1088 | if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { | ||
1089 | mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol " | ||
1090 | "version %llu but %llu is required, disconnecting\n", | ||
1091 | SC_NODEF_ARGS(sc), | ||
1092 | (unsigned long long)be64_to_cpu(hand->protocol_version), | ||
1093 | O2NET_PROTOCOL_VERSION); | ||
1094 | |||
1095 | /* don't bother reconnecting if it's the wrong version. */ | ||
1096 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); | ||
1097 | return -1; | ||
1098 | } | ||
1099 | |||
1100 | sc->sc_handshake_ok = 1; | ||
1101 | |||
1102 | spin_lock(&nn->nn_lock); | ||
1103 | /* set valid and queue the idle timers only if it hasn't been | ||
1104 | * shut down already */ | ||
1105 | if (nn->nn_sc == sc) { | ||
1106 | o2net_sc_postpone_idle(sc); | ||
1107 | o2net_set_nn_state(nn, sc, 1, 0); | ||
1108 | } | ||
1109 | spin_unlock(&nn->nn_lock); | ||
1110 | |||
1111 | /* shift everything up as though it wasn't there */ | ||
1112 | sc->sc_page_off -= sizeof(struct o2net_handshake); | ||
1113 | if (sc->sc_page_off) | ||
1114 | memmove(hand, hand + 1, sc->sc_page_off); | ||
1115 | |||
1116 | return 0; | ||
1117 | } | ||
1118 | |||
1119 | /* this demuxes the queued rx bytes into header or payload bits and calls | ||
1120 | * handlers as each full message is read off the socket. it returns -error, | ||
1121 | * == 0 eof, or > 0 for progress made.*/ | ||
1122 | static int o2net_advance_rx(struct o2net_sock_container *sc) | ||
1123 | { | ||
1124 | struct o2net_msg *hdr; | ||
1125 | int ret = 0; | ||
1126 | void *data; | ||
1127 | size_t datalen; | ||
1128 | |||
1129 | sclog(sc, "receiving\n"); | ||
1130 | do_gettimeofday(&sc->sc_tv_advance_start); | ||
1131 | |||
1132 | /* do we need more header? */ | ||
1133 | if (sc->sc_page_off < sizeof(struct o2net_msg)) { | ||
1134 | data = page_address(sc->sc_page) + sc->sc_page_off; | ||
1135 | datalen = sizeof(struct o2net_msg) - sc->sc_page_off; | ||
1136 | ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); | ||
1137 | if (ret > 0) { | ||
1138 | sc->sc_page_off += ret; | ||
1139 | |||
1140 | /* this working relies on the handshake being | ||
1141 | * smaller than the normal message header */ | ||
1142 | if (sc->sc_page_off >= sizeof(struct o2net_handshake) && | ||
1143 | !sc->sc_handshake_ok && o2net_check_handshake(sc)) { | ||
1144 | ret = -EPROTO; | ||
1145 | goto out; | ||
1146 | } | ||
1147 | |||
1148 | /* only swab incoming here.. we can | ||
1149 | * only get here once as we cross from | ||
1150 | * being under to over */ | ||
1151 | if (sc->sc_page_off == sizeof(struct o2net_msg)) { | ||
1152 | hdr = page_address(sc->sc_page); | ||
1153 | if (be16_to_cpu(hdr->data_len) > | ||
1154 | O2NET_MAX_PAYLOAD_BYTES) | ||
1155 | ret = -EOVERFLOW; | ||
1156 | } | ||
1157 | } | ||
1158 | if (ret <= 0) | ||
1159 | goto out; | ||
1160 | } | ||
1161 | |||
1162 | if (sc->sc_page_off < sizeof(struct o2net_msg)) { | ||
1163 | /* oof, still don't have a header */ | ||
1164 | goto out; | ||
1165 | } | ||
1166 | |||
1167 | /* this was swabbed above when we first read it */ | ||
1168 | hdr = page_address(sc->sc_page); | ||
1169 | |||
1170 | msglog(hdr, "at page_off %zu\n", sc->sc_page_off); | ||
1171 | |||
1172 | /* do we need more payload? */ | ||
1173 | if (sc->sc_page_off - sizeof(struct o2net_msg) < be16_to_cpu(hdr->data_len)) { | ||
1174 | /* need more payload */ | ||
1175 | data = page_address(sc->sc_page) + sc->sc_page_off; | ||
1176 | datalen = (sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len)) - | ||
1177 | sc->sc_page_off; | ||
1178 | ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); | ||
1179 | if (ret > 0) | ||
1180 | sc->sc_page_off += ret; | ||
1181 | if (ret <= 0) | ||
1182 | goto out; | ||
1183 | } | ||
1184 | |||
1185 | if (sc->sc_page_off - sizeof(struct o2net_msg) == be16_to_cpu(hdr->data_len)) { | ||
1186 | /* we can only get here once, the first time we read | ||
1187 | * the payload.. so set ret to progress if the handler | ||
1188 | * works out. after calling this the message is toast */ | ||
1189 | ret = o2net_process_message(sc, hdr); | ||
1190 | if (ret == 0) | ||
1191 | ret = 1; | ||
1192 | sc->sc_page_off = 0; | ||
1193 | } | ||
1194 | |||
1195 | out: | ||
1196 | sclog(sc, "ret = %d\n", ret); | ||
1197 | do_gettimeofday(&sc->sc_tv_advance_stop); | ||
1198 | return ret; | ||
1199 | } | ||
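Everything above consumes data through a single per-socket page. A sketch of the layout the function maintains, assuming (as the handshake shift and the register-time max_len check suggest) that header plus maximum payload fit in one page:

	/* illustration of the receive page; sc_page_off marks fill progress:
	 *
	 *   page_address(sc->sc_page)
	 *   v
	 *   [ o2net_handshake ]                  seen once, then memmove()d away
	 *   [ o2net_msg header | payload ... ]   demuxed repeatedly after that
	 *
	 * a message is dispatched once sc_page_off reaches header + data_len,
	 * then sc_page_off resets to 0 for the next message */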
1200 | |||
1201 | /* this work func is triggered by data ready. it reads until it can read no | ||
1202 | * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing | ||
1203 | * our work the work struct will be marked and we'll be called again. */ | ||
1204 | static void o2net_rx_until_empty(void *arg) | ||
1205 | { | ||
1206 | struct o2net_sock_container *sc = arg; | ||
1207 | int ret; | ||
1208 | |||
1209 | do { | ||
1210 | ret = o2net_advance_rx(sc); | ||
1211 | } while (ret > 0); | ||
1212 | |||
1213 | if (ret <= 0 && ret != -EAGAIN) { | ||
1214 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); | ||
1215 | sclog(sc, "saw error %d, closing\n", ret); | ||
1216 | /* not permanent so read failed handshake can retry */ | ||
1217 | o2net_ensure_shutdown(nn, sc, 0); | ||
1218 | } | ||
1219 | |||
1220 | sc_put(sc); | ||
1221 | } | ||
1222 | |||
1223 | static int o2net_set_nodelay(struct socket *sock) | ||
1224 | { | ||
1225 | int ret, val = 1; | ||
1226 | mm_segment_t oldfs; | ||
1227 | |||
1228 | oldfs = get_fs(); | ||
1229 | set_fs(KERNEL_DS); | ||
1230 | |||
1231 | /* | ||
1232 | * Dear unsuspecting programmer, | ||
1233 | * | ||
1234 | * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level | ||
1235 | * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will | ||
1236 | * silently turn into SO_DEBUG. | ||
1237 | * | ||
1238 | * Yours, | ||
1239 | * Keeper of hilariously fragile interfaces. | ||
1240 | */ | ||
1241 | ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, | ||
1242 | (char __user *)&val, sizeof(val)); | ||
1243 | |||
1244 | set_fs(oldfs); | ||
1245 | return ret; | ||
1246 | } | ||
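The trap the comment warns about comes down to shared option numbers: TCP_NODELAY and SO_DEBUG are both 1, so a path that ignores the level argument flips the wrong switch. Illustration:

	/* illustration only:
	 *
	 *   sock_setsockopt(sock, SOL_TCP, TCP_NODELAY, ...)
	 *       ignores the SOL_TCP level and interprets option 1 as
	 *       SO_DEBUG;
	 *
	 *   sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, ...)
	 *       dispatches through the protocol ops and reaches TCP,
	 *       as done above
	 */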
1247 | |||
1248 | /* ------------------------------------------------------------ */ | ||
1249 | |||
1250 | /* called when a connect completes and after a sock is accepted. the | ||
1251 | * rx path will see the response and mark the sc valid */ | ||
1252 | static void o2net_sc_connect_completed(void *arg) | ||
1253 | { | ||
1254 | struct o2net_sock_container *sc = arg; | ||
1255 | |||
1256 | mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n", | ||
1257 | (unsigned long long)O2NET_PROTOCOL_VERSION, | ||
1258 | (unsigned long long)be64_to_cpu(o2net_hand->connector_id)); | ||
1259 | |||
1260 | o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); | ||
1261 | sc_put(sc); | ||
1262 | } | ||
1263 | |||
1264 | /* this is called as a work_struct func. */ | ||
1265 | static void o2net_sc_send_keep_req(void *arg) | ||
1266 | { | ||
1267 | struct o2net_sock_container *sc = arg; | ||
1268 | |||
1269 | o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req)); | ||
1270 | sc_put(sc); | ||
1271 | } | ||
1272 | |||
1273 | /* socket shutdown does a del_timer_sync against this as it tears down. | ||
1274 | * we can't start this timer until we've got to the point in sc buildup | ||
1275 | * where shutdown is going to be involved */ | ||
1276 | static void o2net_idle_timer(unsigned long data) | ||
1277 | { | ||
1278 | struct o2net_sock_container *sc = (struct o2net_sock_container *)data; | ||
1279 | struct timeval now; | ||
1280 | |||
1281 | do_gettimeofday(&now); | ||
1282 | |||
1283 | mlog(ML_NOTICE, "connection to " SC_NODEF_FMT " has been idle for 10 " | ||
1284 | "seconds, shutting it down.\n", SC_NODEF_ARGS(sc)); | ||
1285 | mlog(ML_NOTICE, "here are some times that might help debug the " | ||
1286 | "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " | ||
1287 | "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", | ||
1288 | sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec, | ||
1289 | now.tv_sec, now.tv_usec, | ||
1290 | sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec, | ||
1291 | sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec, | ||
1292 | sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec, | ||
1293 | sc->sc_msg_key, sc->sc_msg_type, | ||
1294 | sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec, | ||
1295 | sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec); | ||
1296 | |||
1297 | o2net_sc_queue_work(sc, &sc->sc_shutdown_work); | ||
1298 | } | ||
1299 | |||
1300 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) | ||
1301 | { | ||
1302 | o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); | ||
1303 | o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, | ||
1304 | O2NET_KEEPALIVE_DELAY_SECS * HZ); | ||
1305 | do_gettimeofday(&sc->sc_tv_timer); | ||
1306 | mod_timer(&sc->sc_idle_timeout, | ||
1307 | jiffies + (O2NET_IDLE_TIMEOUT_SECS * HZ)); | ||
1308 | } | ||
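Every received message funnels through this function (via o2net_process_message() and o2net_check_handshake()), keeping two clocks in tension. A timeline sketch, using the 10 second figure from o2net_idle_timer()'s message and assuming the keepalive interval is the shorter of the two:

	/* illustration only:
	 *
	 *   any rx -> postpone_idle(): re-arm the keepalive work and push
	 *             the idle timer out to now + O2NET_IDLE_TIMEOUT_SECS
	 *   quiet for the keepalive interval
	 *          -> send a keep req; the peer's keep resp arrives as rx,
	 *             postponing our idle timer again
	 *   quiet for the full idle timeout (peer really gone)
	 *          -> o2net_idle_timer() fires and queues sc_shutdown_work
	 */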
1309 | |||
1310 | /* this work func is kicked whenever a path sets the nn state which doesn't | ||
1311 | * have valid set. This includes seeing hb come up, losing a connection, | ||
1312 | * having a connect attempt fail, etc. This centralizes the logic which decides | ||
1313 | * if a connect attempt should be made or if we should give up and all future | ||
1314 | * transmit attempts should fail */ | ||
1315 | static void o2net_start_connect(void *arg) | ||
1316 | { | ||
1317 | struct o2net_node *nn = arg; | ||
1318 | struct o2net_sock_container *sc = NULL; | ||
1319 | struct o2nm_node *node = NULL; | ||
1320 | struct socket *sock = NULL; | ||
1321 | struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; | ||
1322 | int ret = 0; | ||
1323 | |||
1324 | /* if we're greater we initiate tx, otherwise we accept */ | ||
1325 | if (o2nm_this_node() <= o2net_num_from_nn(nn)) | ||
1326 | goto out; | ||
1327 | |||
1328 | /* watch for racing with tearing a node down */ | ||
1329 | node = o2nm_get_node_by_num(o2net_num_from_nn(nn)); | ||
1330 | if (node == NULL) { | ||
1331 | ret = 0; | ||
1332 | goto out; | ||
1333 | } | ||
1334 | |||
1335 | spin_lock(&nn->nn_lock); | ||
1336 | /* see if we already have one pending or have given up */ | ||
1337 | if (nn->nn_sc || nn->nn_persistent_error) | ||
1338 | arg = NULL; | ||
1339 | spin_unlock(&nn->nn_lock); | ||
1340 | if (arg == NULL) /* *shrug*, needed some indicator */ | ||
1341 | goto out; | ||
1342 | |||
1343 | nn->nn_last_connect_attempt = jiffies; | ||
1344 | |||
1345 | sc = sc_alloc(node); | ||
1346 | if (sc == NULL) { | ||
1347 | mlog(0, "couldn't allocate sc\n"); | ||
1348 | ret = -ENOMEM; | ||
1349 | goto out; | ||
1350 | } | ||
1351 | |||
1352 | ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); | ||
1353 | if (ret < 0) { | ||
1354 | mlog(0, "can't create socket: %d\n", ret); | ||
1355 | goto out; | ||
1356 | } | ||
1357 | sc->sc_sock = sock; /* freed by sc_kref_release */ | ||
1358 | |||
1359 | sock->sk->sk_allocation = GFP_ATOMIC; | ||
1360 | |||
1361 | myaddr.sin_family = AF_INET; | ||
1362 | myaddr.sin_port = (__force u16)htons(0); /* any port */ | ||
1363 | |||
1364 | ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, | ||
1365 | sizeof(myaddr)); | ||
1366 | if (ret) { | ||
1367 | mlog(0, "bind failed: %d\n", ret); | ||
1368 | goto out; | ||
1369 | } | ||
1370 | |||
1371 | ret = o2net_set_nodelay(sc->sc_sock); | ||
1372 | if (ret) { | ||
1373 | mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); | ||
1374 | goto out; | ||
1375 | } | ||
1376 | |||
1377 | o2net_register_callbacks(sc->sc_sock->sk, sc); | ||
1378 | |||
1379 | spin_lock(&nn->nn_lock); | ||
1380 | /* handshake completion will set nn->nn_sc_valid */ | ||
1381 | o2net_set_nn_state(nn, sc, 0, 0); | ||
1382 | spin_unlock(&nn->nn_lock); | ||
1383 | |||
1384 | remoteaddr.sin_family = AF_INET; | ||
1385 | remoteaddr.sin_addr.s_addr = (__force u32)node->nd_ipv4_address; | ||
1386 | remoteaddr.sin_port = (__force u16)node->nd_ipv4_port; | ||
1387 | |||
1388 | ret = sc->sc_sock->ops->connect(sc->sc_sock, | ||
1389 | (struct sockaddr *)&remoteaddr, | ||
1390 | sizeof(remoteaddr), | ||
1391 | O_NONBLOCK); | ||
1392 | if (ret == -EINPROGRESS) | ||
1393 | ret = 0; | ||
1394 | |||
1395 | out: | ||
1396 | if (ret) { | ||
1397 | mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed " | ||
1398 | "with errno %d\n", SC_NODEF_ARGS(sc), ret); | ||
1399 | /* 0 err so that another will be queued and attempted | ||
1400 | * from set_nn_state */ | ||
1401 | if (sc) | ||
1402 | o2net_ensure_shutdown(nn, sc, 0); | ||
1403 | } | ||
1404 | if (sc) | ||
1405 | sc_put(sc); | ||
1406 | if (node) | ||
1407 | o2nm_node_put(node); | ||
1408 | |||
1409 | return; | ||
1410 | } | ||
1411 | |||
1412 | static void o2net_connect_expired(void *arg) | ||
1413 | { | ||
1414 | struct o2net_node *nn = arg; | ||
1415 | |||
1416 | spin_lock(&nn->nn_lock); | ||
1417 | if (!nn->nn_sc_valid) { | ||
1418 | mlog(ML_ERROR, "no connection established with node %u after " | ||
1419 | "%u seconds, giving up and returning errors.\n", | ||
1420 | o2net_num_from_nn(nn), O2NET_IDLE_TIMEOUT_SECS); | ||
1421 | |||
1422 | o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); | ||
1423 | } | ||
1424 | spin_unlock(&nn->nn_lock); | ||
1425 | } | ||
1426 | |||
1427 | static void o2net_still_up(void *arg) | ||
1428 | { | ||
1429 | struct o2net_node *nn = arg; | ||
1430 | |||
1431 | o2quo_hb_still_up(o2net_num_from_nn(nn)); | ||
1432 | } | ||
1433 | |||
1434 | /* ------------------------------------------------------------ */ | ||
1435 | |||
1436 | void o2net_disconnect_node(struct o2nm_node *node) | ||
1437 | { | ||
1438 | struct o2net_node *nn = o2net_nn_from_num(node->nd_num); | ||
1439 | |||
1440 | /* don't reconnect until it's heartbeating again */ | ||
1441 | spin_lock(&nn->nn_lock); | ||
1442 | o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); | ||
1443 | spin_unlock(&nn->nn_lock); | ||
1444 | |||
1445 | if (o2net_wq) { | ||
1446 | cancel_delayed_work(&nn->nn_connect_expired); | ||
1447 | cancel_delayed_work(&nn->nn_connect_work); | ||
1448 | cancel_delayed_work(&nn->nn_still_up); | ||
1449 | flush_workqueue(o2net_wq); | ||
1450 | } | ||
1451 | } | ||
1452 | |||
1453 | static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num, | ||
1454 | void *data) | ||
1455 | { | ||
1456 | o2quo_hb_down(node_num); | ||
1457 | |||
1458 | if (node_num != o2nm_this_node()) | ||
1459 | o2net_disconnect_node(node); | ||
1460 | } | ||
1461 | |||
1462 | static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, | ||
1463 | void *data) | ||
1464 | { | ||
1465 | struct o2net_node *nn = o2net_nn_from_num(node_num); | ||
1466 | |||
1467 | o2quo_hb_up(node_num); | ||
1468 | |||
1469 | /* ensure an immediate connect attempt */ | ||
1470 | nn->nn_last_connect_attempt = jiffies - | ||
1471 | (msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1); | ||
1472 | |||
1473 | if (node_num != o2nm_this_node()) { | ||
1474 | /* heartbeat doesn't work unless a local node number is | ||
1475 | * configured and doing so brings up the o2net_wq, so we can | ||
1476 | * use it.. */ | ||
1477 | queue_delayed_work(o2net_wq, &nn->nn_connect_expired, | ||
1478 | O2NET_IDLE_TIMEOUT_SECS * HZ); | ||
1479 | |||
1480 | /* believe it or not, accept and node heartbeating testing | ||
1481 | * can succeed for this node before we got here.. so | ||
1482 | * only use set_nn_state to clear the persistent error | ||
1483 | * if that hasn't already happened */ | ||
1484 | spin_lock(&nn->nn_lock); | ||
1485 | if (nn->nn_persistent_error) | ||
1486 | o2net_set_nn_state(nn, NULL, 0, 0); | ||
1487 | spin_unlock(&nn->nn_lock); | ||
1488 | } | ||
1489 | } | ||
1490 | |||
1491 | void o2net_unregister_hb_callbacks(void) | ||
1492 | { | ||
1493 | int ret; | ||
1494 | |||
1495 | ret = o2hb_unregister_callback(&o2net_hb_up); | ||
1496 | if (ret < 0) | ||
1497 | mlog(ML_ERROR, "Status return %d unregistering heartbeat up " | ||
1498 | "callback!\n", ret); | ||
1499 | |||
1500 | ret = o2hb_unregister_callback(&o2net_hb_down); | ||
1501 | if (ret < 0) | ||
1502 | mlog(ML_ERROR, "Status return %d unregistering heartbeat down " | ||
1503 | "callback!\n", ret); | ||
1504 | } | ||
1505 | |||
1506 | int o2net_register_hb_callbacks(void) | ||
1507 | { | ||
1508 | int ret; | ||
1509 | |||
1510 | o2hb_setup_callback(&o2net_hb_down, O2HB_NODE_DOWN_CB, | ||
1511 | o2net_hb_node_down_cb, NULL, O2NET_HB_PRI); | ||
1512 | o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB, | ||
1513 | o2net_hb_node_up_cb, NULL, O2NET_HB_PRI); | ||
1514 | |||
1515 | ret = o2hb_register_callback(&o2net_hb_up); | ||
1516 | if (ret == 0) | ||
1517 | ret = o2hb_register_callback(&o2net_hb_down); | ||
1518 | |||
1519 | if (ret) | ||
1520 | o2net_unregister_hb_callbacks(); | ||
1521 | |||
1522 | return ret; | ||
1523 | } | ||
1524 | |||
1525 | /* ------------------------------------------------------------ */ | ||
1526 | |||
1527 | static int o2net_accept_one(struct socket *sock) | ||
1528 | { | ||
1529 | int ret, slen; | ||
1530 | struct sockaddr_in sin; | ||
1531 | struct socket *new_sock = NULL; | ||
1532 | struct o2nm_node *node = NULL; | ||
1533 | struct o2net_sock_container *sc = NULL; | ||
1534 | struct o2net_node *nn; | ||
1535 | |||
1536 | BUG_ON(sock == NULL); | ||
1537 | ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, | ||
1538 | sock->sk->sk_protocol, &new_sock); | ||
1539 | if (ret) | ||
1540 | goto out; | ||
1541 | |||
1542 | new_sock->type = sock->type; | ||
1543 | new_sock->ops = sock->ops; | ||
1544 | ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); | ||
1545 | if (ret < 0) | ||
1546 | goto out; | ||
1547 | |||
1548 | new_sock->sk->sk_allocation = GFP_ATOMIC; | ||
1549 | |||
1550 | ret = o2net_set_nodelay(new_sock); | ||
1551 | if (ret) { | ||
1552 | mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); | ||
1553 | goto out; | ||
1554 | } | ||
1555 | |||
1556 | slen = sizeof(sin); | ||
1557 | ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, | ||
1558 | &slen, 1); | ||
1559 | if (ret < 0) | ||
1560 | goto out; | ||
1561 | |||
1562 | node = o2nm_get_node_by_ip((__force __be32)sin.sin_addr.s_addr); | ||
1563 | if (node == NULL) { | ||
1564 | mlog(ML_NOTICE, "attempt to connect from unknown node at " | ||
1565 | "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr), | ||
1566 | ntohs((__force __be16)sin.sin_port)); | ||
1567 | ret = -EINVAL; | ||
1568 | goto out; | ||
1569 | } | ||
1570 | |||
1571 | if (o2nm_this_node() > node->nd_num) { | ||
1572 | mlog(ML_NOTICE, "unexpected connect attempted from a lower " | ||
1573 | "numbered node '%s' at " "%u.%u.%u.%u:%d with num %u\n", | ||
1574 | node->nd_name, NIPQUAD(sin.sin_addr.s_addr), | ||
1575 | ntohs((__force __be16)sin.sin_port), node->nd_num); | ||
1576 | ret = -EINVAL; | ||
1577 | goto out; | ||
1578 | } | ||
1579 | |||
1580 | /* this happens all the time when the other node sees our heartbeat | ||
1581 | * and tries to connect before we see their heartbeat */ | ||
1582 | if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) { | ||
1583 | mlog(ML_CONN, "attempt to connect from node '%s' at " | ||
1584 | "%u.%u.%u.%u:%d but it isn't heartbeating\n", | ||
1585 | node->nd_name, NIPQUAD(sin.sin_addr.s_addr), | ||
1586 | ntohs((__force __be16)sin.sin_port)); | ||
1587 | ret = -EINVAL; | ||
1588 | goto out; | ||
1589 | } | ||
1590 | |||
1591 | nn = o2net_nn_from_num(node->nd_num); | ||
1592 | |||
1593 | spin_lock(&nn->nn_lock); | ||
1594 | if (nn->nn_sc) | ||
1595 | ret = -EBUSY; | ||
1596 | else | ||
1597 | ret = 0; | ||
1598 | spin_unlock(&nn->nn_lock); | ||
1599 | if (ret) { | ||
1600 | mlog(ML_NOTICE, "attempt to connect from node '%s' at " | ||
1601 | "%u.%u.%u.%u:%d but it already has an open connection\n", | ||
1602 | node->nd_name, NIPQUAD(sin.sin_addr.s_addr), | ||
1603 | ntohs((__force __be16)sin.sin_port)); | ||
1604 | goto out; | ||
1605 | } | ||
1606 | |||
1607 | sc = sc_alloc(node); | ||
1608 | if (sc == NULL) { | ||
1609 | ret = -ENOMEM; | ||
1610 | goto out; | ||
1611 | } | ||
1612 | |||
1613 | sc->sc_sock = new_sock; | ||
1614 | new_sock = NULL; | ||
1615 | |||
1616 | spin_lock(&nn->nn_lock); | ||
1617 | o2net_set_nn_state(nn, sc, 0, 0); | ||
1618 | spin_unlock(&nn->nn_lock); | ||
1619 | |||
1620 | o2net_register_callbacks(sc->sc_sock->sk, sc); | ||
1621 | o2net_sc_queue_work(sc, &sc->sc_rx_work); | ||
1622 | |||
1623 | o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); | ||
1624 | |||
1625 | out: | ||
1626 | if (new_sock) | ||
1627 | sock_release(new_sock); | ||
1628 | if (node) | ||
1629 | o2nm_node_put(node); | ||
1630 | if (sc) | ||
1631 | sc_put(sc); | ||
1632 | return ret; | ||
1633 | } | ||
1634 | |||
1635 | static void o2net_accept_many(void *arg) | ||
1636 | { | ||
1637 | struct socket *sock = arg; | ||
1638 | while (o2net_accept_one(sock) == 0) | ||
1639 | cond_resched(); | ||
1640 | } | ||
1641 | |||
1642 | static void o2net_listen_data_ready(struct sock *sk, int bytes) | ||
1643 | { | ||
1644 | void (*ready)(struct sock *sk, int bytes); | ||
1645 | |||
1646 | read_lock(&sk->sk_callback_lock); | ||
1647 | ready = sk->sk_user_data; | ||
1648 | if (ready == NULL) { /* check for teardown race */ | ||
1649 | ready = sk->sk_data_ready; | ||
1650 | goto out; | ||
1651 | } | ||
1652 | |||
1653 | /* ->sk_data_ready is also called for a newly established child socket | ||
1654 | * before it has been accepted and the acceptor has set up their | ||
1655 | * data_ready.. we only want to queue listen work for our listening | ||
1656 | * socket */ | ||
1657 | if (sk->sk_state == TCP_LISTEN) { | ||
1658 | mlog(ML_TCP, "bytes: %d\n", bytes); | ||
1659 | queue_work(o2net_wq, &o2net_listen_work); | ||
1660 | } | ||
1661 | |||
1662 | out: | ||
1663 | read_unlock(&sk->sk_callback_lock); | ||
1664 | ready(sk, bytes); | ||
1665 | } | ||
1666 | |||
1667 | static int o2net_open_listening_sock(__be16 port) | ||
1668 | { | ||
1669 | struct socket *sock = NULL; | ||
1670 | int ret; | ||
1671 | struct sockaddr_in sin = { | ||
1672 | .sin_family = PF_INET, | ||
1673 | .sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) }, | ||
1674 | .sin_port = (__force u16)port, | ||
1675 | }; | ||
1676 | |||
1677 | ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); | ||
1678 | if (ret < 0) { | ||
1679 | mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret); | ||
1680 | goto out; | ||
1681 | } | ||
1682 | |||
1683 | sock->sk->sk_allocation = GFP_ATOMIC; | ||
1684 | |||
1685 | write_lock_bh(&sock->sk->sk_callback_lock); | ||
1686 | sock->sk->sk_user_data = sock->sk->sk_data_ready; | ||
1687 | sock->sk->sk_data_ready = o2net_listen_data_ready; | ||
1688 | write_unlock_bh(&sock->sk->sk_callback_lock); | ||
1689 | |||
1690 | o2net_listen_sock = sock; | ||
1691 | INIT_WORK(&o2net_listen_work, o2net_accept_many, sock); | ||
1692 | |||
1693 | sock->sk->sk_reuse = 1; | ||
1694 | ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); | ||
1695 | if (ret < 0) { | ||
1696 | mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n", | ||
1697 | ntohs(port), ret); | ||
1698 | goto out; | ||
1699 | } | ||
1700 | |||
1701 | ret = sock->ops->listen(sock, 64); | ||
1702 | if (ret < 0) { | ||
1703 | mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n", | ||
1704 | ntohs(port), ret); | ||
1705 | } | ||
1706 | |||
1707 | out: | ||
1708 | if (ret) { | ||
1709 | o2net_listen_sock = NULL; | ||
1710 | if (sock) | ||
1711 | sock_release(sock); | ||
1712 | } | ||
1713 | return ret; | ||
1714 | } | ||
1715 | |||
1716 | /* | ||
1717 | * called from node manager when we should bring up our network listening | ||
1718 | * socket. node manager handles all the serialization to only call this | ||
1719 | * once and to match it with o2net_stop_listening(). note, | ||
1720 | * o2nm_this_node() doesn't work yet as we're being called while it | ||
1721 | * is being set up. | ||
1722 | */ | ||
1723 | int o2net_start_listening(struct o2nm_node *node) | ||
1724 | { | ||
1725 | int ret = 0; | ||
1726 | |||
1727 | BUG_ON(o2net_wq != NULL); | ||
1728 | BUG_ON(o2net_listen_sock != NULL); | ||
1729 | |||
1730 | mlog(ML_KTHREAD, "starting o2net thread...\n"); | ||
1731 | o2net_wq = create_singlethread_workqueue("o2net"); | ||
1732 | if (o2net_wq == NULL) { | ||
1733 | mlog(ML_ERROR, "unable to launch o2net thread\n"); | ||
1734 | return -ENOMEM; /* ? */ | ||
1735 | } | ||
1736 | |||
1737 | ret = o2net_open_listening_sock(node->nd_ipv4_port); | ||
1738 | if (ret) { | ||
1739 | destroy_workqueue(o2net_wq); | ||
1740 | o2net_wq = NULL; | ||
1741 | } else | ||
1742 | o2quo_conn_up(node->nd_num); | ||
1743 | |||
1744 | return ret; | ||
1745 | } | ||
1746 | |||
1747 | /* again, o2nm_this_node() doesn't work here as we're involved in | ||
1748 | * tearing it down */ | ||
1749 | void o2net_stop_listening(struct o2nm_node *node) | ||
1750 | { | ||
1751 | struct socket *sock = o2net_listen_sock; | ||
1752 | size_t i; | ||
1753 | |||
1754 | BUG_ON(o2net_wq == NULL); | ||
1755 | BUG_ON(o2net_listen_sock == NULL); | ||
1756 | |||
1757 | /* stop the listening socket from generating work */ | ||
1758 | write_lock_bh(&sock->sk->sk_callback_lock); | ||
1759 | sock->sk->sk_data_ready = sock->sk->sk_user_data; | ||
1760 | sock->sk->sk_user_data = NULL; | ||
1761 | write_unlock_bh(&sock->sk->sk_callback_lock); | ||
1762 | |||
1763 | for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { | ||
1764 | struct o2nm_node *node = o2nm_get_node_by_num(i); | ||
1765 | if (node) { | ||
1766 | o2net_disconnect_node(node); | ||
1767 | o2nm_node_put(node); | ||
1768 | } | ||
1769 | } | ||
1770 | |||
1771 | /* finish all work and tear down the work queue */ | ||
1772 | mlog(ML_KTHREAD, "waiting for o2net thread to exit....\n"); | ||
1773 | destroy_workqueue(o2net_wq); | ||
1774 | o2net_wq = NULL; | ||
1775 | |||
1776 | sock_release(o2net_listen_sock); | ||
1777 | o2net_listen_sock = NULL; | ||
1778 | |||
1779 | o2quo_conn_err(node->nd_num); | ||
1780 | } | ||
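The restore at the top of this function undoes the callback swap made in o2net_open_listening_sock() above; the pairing is easier to see side by side (a sketch assembled from the two functions, with sk standing in for sock->sk):

        /* install (o2net_open_listening_sock): stash the original callback
         * in sk_user_data, then point sk_data_ready at the trampoline */
        sk->sk_user_data = sk->sk_data_ready;
        sk->sk_data_ready = o2net_listen_data_ready;

        /* uninstall (o2net_stop_listening): put the original back and clear
         * sk_user_data, which o2net_listen_data_ready() treats as teardown */
        sk->sk_data_ready = sk->sk_user_data;
        sk->sk_user_data = NULL;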
1781 | |||
1782 | /* ------------------------------------------------------------ */ | ||
1783 | |||
1784 | int o2net_init(void) | ||
1785 | { | ||
1786 | unsigned long i; | ||
1787 | |||
1788 | o2quo_init(); | ||
1789 | |||
1790 | o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL); | ||
1791 | o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL); | ||
1792 | o2net_keep_resp = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL); | ||
1793 | if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) { | ||
1794 | kfree(o2net_hand); | ||
1795 | kfree(o2net_keep_req); | ||
1796 | kfree(o2net_keep_resp); | ||
1797 | return -ENOMEM; | ||
1798 | } | ||
1799 | |||
1800 | o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); | ||
1801 | o2net_hand->connector_id = cpu_to_be64(1); | ||
1802 | |||
1803 | o2net_keep_req->magic = cpu_to_be16(O2NET_MSG_KEEP_REQ_MAGIC); | ||
1804 | o2net_keep_resp->magic = cpu_to_be16(O2NET_MSG_KEEP_RESP_MAGIC); | ||
1805 | |||
1806 | for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { | ||
1807 | struct o2net_node *nn = o2net_nn_from_num(i); | ||
1808 | |||
1809 | spin_lock_init(&nn->nn_lock); | ||
1810 | INIT_WORK(&nn->nn_connect_work, o2net_start_connect, nn); | ||
1811 | INIT_WORK(&nn->nn_connect_expired, o2net_connect_expired, nn); | ||
1812 | INIT_WORK(&nn->nn_still_up, o2net_still_up, nn); | ||
1813 | /* until we see hb from a node we'll return -ENOTCONN */ | ||
1814 | nn->nn_persistent_error = -ENOTCONN; | ||
1815 | init_waitqueue_head(&nn->nn_sc_wq); | ||
1816 | idr_init(&nn->nn_status_idr); | ||
1817 | INIT_LIST_HEAD(&nn->nn_status_list); | ||
1818 | } | ||
1819 | |||
1820 | return 0; | ||
1821 | } | ||
1822 | |||
1823 | void o2net_exit(void) | ||
1824 | { | ||
1825 | o2quo_exit(); | ||
1826 | kfree(o2net_hand); | ||
1827 | kfree(o2net_keep_req); | ||
1828 | kfree(o2net_keep_resp); | ||
1829 | } | ||
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h new file mode 100644 index 000000000000..a6f4585501c8 --- /dev/null +++ b/fs/ocfs2/cluster/tcp.h | |||
@@ -0,0 +1,113 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * tcp.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | #ifndef O2CLUSTER_TCP_H | ||
28 | #define O2CLUSTER_TCP_H | ||
29 | |||
30 | #include <linux/socket.h> | ||
31 | #ifdef __KERNEL__ | ||
32 | #include <net/sock.h> | ||
33 | #include <linux/tcp.h> | ||
34 | #else | ||
35 | #include <sys/socket.h> | ||
36 | #endif | ||
37 | #include <linux/inet.h> | ||
38 | #include <linux/in.h> | ||
39 | |||
40 | struct o2net_msg | ||
41 | { | ||
42 | __be16 magic; | ||
43 | __be16 data_len; | ||
44 | __be16 msg_type; | ||
45 | __be16 pad1; | ||
46 | __be32 sys_status; | ||
47 | __be32 status; | ||
48 | __be32 key; | ||
49 | __be32 msg_num; | ||
50 | __u8 buf[0]; | ||
51 | }; | ||
52 | |||
53 | typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data); | ||
54 | |||
55 | #define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg)) | ||
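The header above is eight fixed-width fields totalling 24 bytes, so O2NET_MAX_PAYLOAD_BYTES works out to 4072. A self-contained userspace check of that arithmetic (the mirror struct is illustrative only, not part of the patch):

        #include <assert.h>
        #include <stdint.h>

        struct o2net_msg_mirror {               /* same layout as o2net_msg */
                uint16_t magic, data_len, msg_type, pad1;
                uint32_t sys_status, status, key, msg_num;
        };

        int main(void)
        {
                assert(sizeof(struct o2net_msg_mirror) == 24);
                assert(4096 - sizeof(struct o2net_msg_mirror) == 4072);
                return 0;
        }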
56 | |||
57 | /* TODO: figure this out.... */ | ||
58 | static inline int o2net_link_down(int err, struct socket *sock) | ||
59 | { | ||
60 | if (sock) { | ||
61 | if (sock->sk->sk_state != TCP_ESTABLISHED && | ||
62 | sock->sk->sk_state != TCP_CLOSE_WAIT) | ||
63 | return 1; | ||
64 | } | ||
65 | |||
66 | if (err >= 0) | ||
67 | return 0; | ||
68 | switch (err) { | ||
69 | /* ????????????????????????? */ | ||
70 | case -ERESTARTSYS: | ||
71 | case -EBADF: | ||
72 | /* When the server has died, an ICMP port unreachable | ||
73 | * message prompts ECONNREFUSED. */ | ||
74 | case -ECONNREFUSED: | ||
75 | case -ENOTCONN: | ||
76 | case -ECONNRESET: | ||
77 | case -EPIPE: | ||
78 | return 1; | ||
79 | } | ||
80 | return 0; | ||
81 | } | ||
82 | |||
83 | enum { | ||
84 | O2NET_DRIVER_UNINITED, | ||
85 | O2NET_DRIVER_READY, | ||
86 | }; | ||
87 | |||
88 | int o2net_init_tcp_sock(struct inode *inode); | ||
89 | int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len, | ||
90 | u8 target_node, int *status); | ||
91 | int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec, | ||
92 | size_t veclen, u8 target_node, int *status); | ||
93 | int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len, | ||
94 | struct inode *group); | ||
95 | |||
96 | int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, | ||
97 | o2net_msg_handler_func *func, void *data, | ||
98 | struct list_head *unreg_list); | ||
99 | void o2net_unregister_handler_list(struct list_head *list); | ||
100 | |||
101 | struct o2nm_node; | ||
102 | int o2net_register_hb_callbacks(void); | ||
103 | void o2net_unregister_hb_callbacks(void); | ||
104 | int o2net_start_listening(struct o2nm_node *node); | ||
105 | void o2net_stop_listening(struct o2nm_node *node); | ||
106 | void o2net_disconnect_node(struct o2nm_node *node); | ||
107 | |||
108 | int o2net_init(void); | ||
109 | void o2net_exit(void); | ||
110 | int o2net_proc_init(struct proc_dir_entry *parent); | ||
111 | void o2net_proc_exit(struct proc_dir_entry *parent); | ||
112 | |||
113 | #endif /* O2CLUSTER_TCP_H */ | ||
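To make the send/handler API above concrete, a hypothetical caller follows; the message type, key, max length, and payload are invented for illustration, and only the prototypes are taken from this header:

        #define EX_MSG_TYPE     1               /* made-up message type */
        #define EX_KEY          0x1234          /* made-up shared key */

        static int ex_handler(struct o2net_msg *msg, u32 len, void *data)
        {
                /* msg->buf carries the payload; the return value travels
                 * back to the sender as *status */
                return 0;
        }

        static int ex_ping(u8 peer)
        {
                LIST_HEAD(unreg);
                char payload[5] = "ping";
                int status = 0, ret;

                ret = o2net_register_handler(EX_MSG_TYPE, EX_KEY, 16,
                                             ex_handler, NULL, &unreg);
                if (ret)
                        return ret;

                ret = o2net_send_message(EX_MSG_TYPE, EX_KEY, payload,
                                         sizeof(payload), peer, &status);
                o2net_unregister_handler_list(&unreg);

                return ret ? ret : status;
        }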
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h new file mode 100644 index 000000000000..ff9e2e2104c2 --- /dev/null +++ b/fs/ocfs2/cluster/tcp_internal.h | |||
@@ -0,0 +1,174 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * Copyright (C) 2005 Oracle. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 02111-1307, USA. | ||
20 | */ | ||
21 | |||
22 | #ifndef O2CLUSTER_TCP_INTERNAL_H | ||
23 | #define O2CLUSTER_TCP_INTERNAL_H | ||
24 | |||
25 | #define O2NET_MSG_MAGIC ((u16)0xfa55) | ||
26 | #define O2NET_MSG_STATUS_MAGIC ((u16)0xfa56) | ||
27 | #define O2NET_MSG_KEEP_REQ_MAGIC ((u16)0xfa57) | ||
28 | #define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58) | ||
29 | |||
30 | /* same as hb delay, we're waiting for another node to recognize our hb */ | ||
31 | #define O2NET_RECONNECT_DELAY_MS O2HB_REGION_TIMEOUT_MS | ||
32 | |||
33 | /* we're delaying our quorum decision so that heartbeat will have timed | ||
34 | * out truly dead nodes by the time we come around to making decisions | ||
35 | * on their number */ | ||
36 | #define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS) | ||
37 | |||
38 | #define O2NET_KEEPALIVE_DELAY_SECS 5 | ||
39 | #define O2NET_IDLE_TIMEOUT_SECS 10 | ||
40 | |||
41 | /* | ||
42 | * This version number represents quite a lot, unfortunately. It not | ||
43 | * only represents the raw network message protocol on the wire but also | ||
44 | * locking semantics of the file system using the protocol. It should | ||
45 | * be somewhere else, I'm sure, but right now it isn't. | ||
46 | * | ||
47 | * New in version 2: | ||
48 | * - full 64 bit i_size in the metadata lock lvbs | ||
49 | * - introduction of "rw" lock and pushing meta/data locking down | ||
50 | */ | ||
51 | #define O2NET_PROTOCOL_VERSION 2ULL | ||
52 | struct o2net_handshake { | ||
53 | __be64 protocol_version; | ||
54 | __be64 connector_id; | ||
55 | }; | ||
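Because the version also encodes locking semantics, a mismatch cannot be negotiated away; the only safe response is to refuse the peer. A sketch of the check implied here (the function name is invented; the real validation lives in tcp.c's handshake path):

        static int o2net_handshake_compatible(struct o2net_handshake *hand)
        {
                /* a peer on another version may also lock differently,
                 * so the socket must be torn down, not accommodated */
                return be64_to_cpu(hand->protocol_version) ==
                       O2NET_PROTOCOL_VERSION;
        }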
56 | |||
57 | struct o2net_node { | ||
58 | /* this is never called from int/bh */ | ||
59 | spinlock_t nn_lock; | ||
60 | |||
61 | /* set the moment an sc is allocated and a connect is started */ | ||
62 | struct o2net_sock_container *nn_sc; | ||
63 | /* _valid is only set after the handshake passes and tx can happen */ | ||
64 | unsigned nn_sc_valid:1; | ||
65 | /* if this is set tx just returns it */ | ||
66 | int nn_persistent_error; | ||
67 | |||
68 | /* threads waiting for an sc to arrive wait on the wq for generation | ||
69 | * to increase. it is increased when a connecting socket succeeds | ||
70 | * or fails or when an accepted socket is attached. */ | ||
71 | wait_queue_head_t nn_sc_wq; | ||
72 | |||
73 | struct idr nn_status_idr; | ||
74 | struct list_head nn_status_list; | ||
75 | |||
76 | /* connects are attempted from when heartbeat comes up until either hb | ||
77 | * goes down, the node is unconfigured, no connect attempts succeed | ||
78 | * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work | ||
79 | * is queued from set_nn_state both from hb up and from itself if a | ||
80 | * connect attempt fails and so can be self-arming. shutdown is | ||
81 | * careful to first mark the nn such that no connects will be attempted | ||
82 | * before canceling delayed connect work and flushing the queue. */ | ||
83 | struct work_struct nn_connect_work; | ||
84 | unsigned long nn_last_connect_attempt; | ||
85 | |||
86 | /* this is queued as nodes come up and is canceled when a connection is | ||
87 | * established. this expiring gives up on the node and errors out | ||
88 | * transmits */ | ||
89 | struct work_struct nn_connect_expired; | ||
90 | |||
91 | /* after we give up on a socket we wait a while before deciding | ||
92 | * that it is still heartbeating and that we should do some | ||
93 | * quorum work */ | ||
94 | struct work_struct nn_still_up; | ||
95 | }; | ||
96 | |||
97 | struct o2net_sock_container { | ||
98 | struct kref sc_kref; | ||
99 | /* the next two are valid for the lifetime of the sc */ | ||
100 | struct socket *sc_sock; | ||
101 | struct o2nm_node *sc_node; | ||
102 | |||
103 | /* all of these sc work structs hold refs on the sc while they are | ||
104 | * queued. they should not be able to ref a freed sc. the teardown | ||
105 | * race is with o2net_wq destruction in o2net_stop_listening() */ | ||
106 | |||
107 | /* rx and connect work are generated from socket callbacks. sc | ||
108 | * shutdown removes the callbacks and then flushes the work queue */ | ||
109 | struct work_struct sc_rx_work; | ||
110 | struct work_struct sc_connect_work; | ||
111 | /* shutdown work is triggered in two ways. the simple way is | ||
112 | * for a code path to call ensure_shutdown, which gets a lock, removes | ||
113 | * the sc from the nn, and queues the work. in this case the | ||
114 | * work is single-shot. the work is also queued from a sock | ||
115 | * callback, though, and in this case the work will find the sc | ||
116 | * still on the nn and will call ensure_shutdown itself.. this | ||
117 | * ends up triggering the shutdown work again, though nothing | ||
118 | * will be done in that second iteration. so work queue teardown | ||
119 | * has to be careful to remove the sc from the nn before waiting | ||
120 | * on the work queue so that the shutdown work doesn't remove the | ||
121 | * sc and rearm itself. | ||
122 | */ | ||
123 | struct work_struct sc_shutdown_work; | ||
124 | |||
125 | struct timer_list sc_idle_timeout; | ||
126 | struct work_struct sc_keepalive_work; | ||
127 | |||
128 | unsigned sc_handshake_ok:1; | ||
129 | |||
130 | struct page *sc_page; | ||
131 | size_t sc_page_off; | ||
132 | |||
133 | /* original handlers for the sockets */ | ||
134 | void (*sc_state_change)(struct sock *sk); | ||
135 | void (*sc_data_ready)(struct sock *sk, int bytes); | ||
136 | |||
137 | struct timeval sc_tv_timer; | ||
138 | struct timeval sc_tv_data_ready; | ||
139 | struct timeval sc_tv_advance_start; | ||
140 | struct timeval sc_tv_advance_stop; | ||
141 | struct timeval sc_tv_func_start; | ||
142 | struct timeval sc_tv_func_stop; | ||
143 | u32 sc_msg_key; | ||
144 | u16 sc_msg_type; | ||
145 | }; | ||
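The "work holds a ref" rule in the comments above is easiest to see as the queueing helper used on the accept path earlier (o2net_sc_queue_work); a sketch of that idiom, with sc_get()/sc_put() assumed to be the kref get/put helpers from tcp.c:

        static void o2net_sc_queue_work(struct o2net_sock_container *sc,
                                        struct work_struct *work)
        {
                sc_get(sc);             /* pin sc while the work is queued */
                if (!queue_work(o2net_wq, work))
                        sc_put(sc);     /* was already queued; drop our ref */
        }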
146 | |||
147 | struct o2net_msg_handler { | ||
148 | struct rb_node nh_node; | ||
149 | u32 nh_max_len; | ||
150 | u32 nh_msg_type; | ||
151 | u32 nh_key; | ||
152 | o2net_msg_handler_func *nh_func; | ||
153 | o2net_msg_handler_func *nh_func_data; | ||
154 | struct kref nh_kref; | ||
155 | struct list_head nh_unregister_item; | ||
156 | }; | ||
157 | |||
158 | enum o2net_system_error { | ||
159 | O2NET_ERR_NONE = 0, | ||
160 | O2NET_ERR_NO_HNDLR, | ||
161 | O2NET_ERR_OVERFLOW, | ||
162 | O2NET_ERR_DIED, | ||
163 | O2NET_ERR_MAX | ||
164 | }; | ||
165 | |||
166 | struct o2net_status_wait { | ||
167 | enum o2net_system_error ns_sys_status; | ||
168 | s32 ns_status; | ||
169 | int ns_id; | ||
170 | wait_queue_head_t ns_wq; | ||
171 | struct list_head ns_node_item; | ||
172 | }; | ||
173 | |||
174 | #endif /* O2CLUSTER_TCP_INTERNAL_H */ | ||
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c new file mode 100644 index 000000000000..7286c48bb30d --- /dev/null +++ b/fs/ocfs2/cluster/ver.c | |||
@@ -0,0 +1,42 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ver.c | ||
5 | * | ||
6 | * version string | ||
7 | * | ||
8 | * Copyright (C) 2002, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/kernel.h> | ||
28 | |||
29 | #include "ver.h" | ||
30 | |||
31 | #define CLUSTER_BUILD_VERSION "1.3.3" | ||
32 | |||
33 | #define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION | ||
34 | |||
35 | void cluster_print_version(void) | ||
36 | { | ||
37 | printk(KERN_INFO "%s\n", VERSION_STR); | ||
38 | } | ||
39 | |||
40 | MODULE_DESCRIPTION(VERSION_STR); | ||
41 | |||
42 | MODULE_VERSION(CLUSTER_BUILD_VERSION); | ||
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h new file mode 100644 index 000000000000..32554c3382c2 --- /dev/null +++ b/fs/ocfs2/cluster/ver.h | |||
@@ -0,0 +1,31 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ver.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef O2CLUSTER_VER_H | ||
27 | #define O2CLUSTER_VER_H | ||
28 | |||
29 | void cluster_print_version(void); | ||
30 | |||
31 | #endif /* O2CLUSTER_VER_H */ | ||
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c new file mode 100644 index 000000000000..bd85182e97bc --- /dev/null +++ b/fs/ocfs2/dcache.c | |||
@@ -0,0 +1,91 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dcache.c | ||
5 | * | ||
6 | * dentry cache handling code | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/namei.h> | ||
30 | |||
31 | #define MLOG_MASK_PREFIX ML_DCACHE | ||
32 | #include <cluster/masklog.h> | ||
33 | |||
34 | #include "ocfs2.h" | ||
35 | |||
36 | #include "alloc.h" | ||
37 | #include "dcache.h" | ||
38 | #include "file.h" | ||
39 | #include "inode.h" | ||
40 | |||
41 | static int ocfs2_dentry_revalidate(struct dentry *dentry, | ||
42 | struct nameidata *nd) | ||
43 | { | ||
44 | struct inode *inode = dentry->d_inode; | ||
45 | int ret = 0; /* if all else fails, just return false */ | ||
46 | struct ocfs2_super *osb; | ||
47 | |||
48 | mlog_entry("(0x%p, '%.*s')\n", dentry, | ||
49 | dentry->d_name.len, dentry->d_name.name); | ||
50 | |||
51 | /* Never trust a negative dentry - force a new lookup. */ | ||
52 | if (inode == NULL) { | ||
53 | mlog(0, "negative dentry: %.*s\n", dentry->d_name.len, | ||
54 | dentry->d_name.name); | ||
55 | goto bail; | ||
56 | } | ||
57 | |||
58 | osb = OCFS2_SB(inode->i_sb); | ||
59 | |||
60 | BUG_ON(!osb); | ||
61 | |||
62 | if (inode != osb->root_inode) { | ||
63 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
64 | /* did we or someone else delete this inode? */ | ||
65 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { | ||
66 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
67 | mlog(0, "inode (%"MLFu64") deleted, returning false\n", | ||
68 | OCFS2_I(inode)->ip_blkno); | ||
69 | goto bail; | ||
70 | } | ||
71 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
72 | |||
73 | if (!inode->i_nlink) { | ||
74 | mlog(0, "Inode %"MLFu64" orphaned, returning false " | ||
75 | "dir = %d\n", OCFS2_I(inode)->ip_blkno, | ||
76 | S_ISDIR(inode->i_mode)); | ||
77 | goto bail; | ||
78 | } | ||
79 | } | ||
80 | |||
81 | ret = 1; | ||
82 | |||
83 | bail: | ||
84 | mlog_exit(ret); | ||
85 | |||
86 | return ret; | ||
87 | } | ||
88 | |||
89 | struct dentry_operations ocfs2_dentry_ops = { | ||
90 | .d_revalidate = ocfs2_dentry_revalidate, | ||
91 | }; | ||
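These operations only take effect if lookup attaches them to each dentry; the expected wiring (a sketch of what ocfs2's ->lookup() elsewhere in the patch is assumed to do, not a quote from it):

        /* in ocfs2_lookup(), before the dentry goes live: */
        dentry->d_op = &ocfs2_dentry_ops;       /* revalidate on every walk */
        d_add(dentry, inode);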
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h new file mode 100644 index 000000000000..90072771114b --- /dev/null +++ b/fs/ocfs2/dcache.h | |||
@@ -0,0 +1,31 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dcache.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_DCACHE_H | ||
27 | #define OCFS2_DCACHE_H | ||
28 | |||
29 | extern struct dentry_operations ocfs2_dentry_ops; | ||
30 | |||
31 | #endif /* OCFS2_DCACHE_H */ | ||
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c new file mode 100644 index 000000000000..856e20ae8263 --- /dev/null +++ b/fs/ocfs2/dir.c | |||
@@ -0,0 +1,618 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dir.c | ||
5 | * | ||
6 | * Creates, reads, walks and deletes directory-nodes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * Portions of this code from linux/fs/ext3/dir.c | ||
11 | * | ||
12 | * Copyright (C) 1992, 1993, 1994, 1995 | ||
13 | * Remy Card (card@masi.ibp.fr) | ||
14 | * Laboratoire MASI - Institut Blaise Pascal | ||
15 | * Universite Pierre et Marie Curie (Paris VI) | ||
16 | * | ||
17 | * from | ||
18 | * | ||
19 | * linux/fs/minix/dir.c | ||
20 | * | ||
21 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
22 | * | ||
23 | * This program is free software; you can redistribute it and/or | ||
24 | * modify it under the terms of the GNU General Public | ||
25 | * License as published by the Free Software Foundation; either | ||
26 | * version 2 of the License, or (at your option) any later version. | ||
27 | * | ||
28 | * This program is distributed in the hope that it will be useful, | ||
29 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
30 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
31 | * General Public License for more details. | ||
32 | * | ||
33 | * You should have received a copy of the GNU General Public | ||
34 | * License along with this program; if not, write to the | ||
35 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
36 | * Boston, MA 02111-1307, USA. | ||
37 | */ | ||
38 | |||
39 | #include <linux/fs.h> | ||
40 | #include <linux/types.h> | ||
41 | #include <linux/slab.h> | ||
42 | #include <linux/highmem.h> | ||
43 | |||
44 | #define MLOG_MASK_PREFIX ML_NAMEI | ||
45 | #include <cluster/masklog.h> | ||
46 | |||
47 | #include "ocfs2.h" | ||
48 | |||
49 | #include "alloc.h" | ||
50 | #include "dir.h" | ||
51 | #include "dlmglue.h" | ||
52 | #include "extent_map.h" | ||
53 | #include "file.h" | ||
54 | #include "inode.h" | ||
55 | #include "journal.h" | ||
56 | #include "namei.h" | ||
57 | #include "suballoc.h" | ||
58 | #include "uptodate.h" | ||
59 | |||
60 | #include "buffer_head_io.h" | ||
61 | |||
62 | static unsigned char ocfs2_filetype_table[] = { | ||
63 | DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK | ||
64 | }; | ||
65 | |||
66 | static int ocfs2_extend_dir(struct ocfs2_super *osb, | ||
67 | struct inode *dir, | ||
68 | struct buffer_head *parent_fe_bh, | ||
69 | struct buffer_head **new_de_bh); | ||
70 | /* | ||
71 | * ocfs2_readdir() | ||
72 | * | ||
73 | */ | ||
74 | int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) | ||
75 | { | ||
76 | int error = 0; | ||
77 | unsigned long offset, blk; | ||
78 | int i, num, stored; | ||
79 | struct buffer_head * bh, * tmp; | ||
80 | struct ocfs2_dir_entry * de; | ||
81 | int err; | ||
82 | struct inode *inode = filp->f_dentry->d_inode; | ||
83 | struct super_block * sb = inode->i_sb; | ||
84 | int have_disk_lock = 0; | ||
85 | |||
86 | mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno); | ||
87 | |||
88 | stored = 0; | ||
89 | bh = NULL; | ||
90 | |||
91 | error = ocfs2_meta_lock(inode, NULL, NULL, 0); | ||
92 | if (error < 0) { | ||
93 | if (error != -ENOENT) | ||
94 | mlog_errno(error); | ||
95 | /* we haven't got any yet, so propagate the error. */ | ||
96 | stored = error; | ||
97 | goto bail; | ||
98 | } | ||
99 | have_disk_lock = 1; | ||
100 | |||
101 | offset = filp->f_pos & (sb->s_blocksize - 1); | ||
102 | |||
103 | while (!error && !stored && filp->f_pos < i_size_read(inode)) { | ||
104 | blk = (filp->f_pos) >> sb->s_blocksize_bits; | ||
105 | bh = ocfs2_bread(inode, blk, &err, 0); | ||
106 | if (!bh) { | ||
107 | mlog(ML_ERROR, "directory #%"MLFu64" contains a hole " | ||
108 | "at offset %lld\n", | ||
109 | OCFS2_I(inode)->ip_blkno, | ||
110 | filp->f_pos); | ||
111 | filp->f_pos += sb->s_blocksize - offset; | ||
112 | continue; | ||
113 | } | ||
114 | |||
115 | /* | ||
116 | * Do the readahead (8k) | ||
117 | */ | ||
118 | if (!offset) { | ||
119 | for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0; | ||
120 | i > 0; i--) { | ||
121 | tmp = ocfs2_bread(inode, ++blk, &err, 1); | ||
122 | if (tmp) | ||
123 | brelse(tmp); | ||
124 | } | ||
125 | } | ||
126 | |||
127 | revalidate: | ||
128 | /* If the dir block has changed since the last call to | ||
129 | * readdir(2), then we might be pointing to an invalid | ||
130 | * dirent right now. Scan from the start of the block | ||
131 | * to make sure. */ | ||
132 | if (filp->f_version != inode->i_version) { | ||
133 | for (i = 0; i < sb->s_blocksize && i < offset; ) { | ||
134 | de = (struct ocfs2_dir_entry *) (bh->b_data + i); | ||
135 | /* It's too expensive to do a full | ||
136 | * dirent test each time round this | ||
137 | * loop, but we do have to test at | ||
138 | * least that it is non-zero. A | ||
139 | * failure will be detected in the | ||
140 | * dirent test below. */ | ||
141 | if (le16_to_cpu(de->rec_len) < | ||
142 | OCFS2_DIR_REC_LEN(1)) | ||
143 | break; | ||
144 | i += le16_to_cpu(de->rec_len); | ||
145 | } | ||
146 | offset = i; | ||
147 | filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) | ||
148 | | offset; | ||
149 | filp->f_version = inode->i_version; | ||
150 | } | ||
151 | |||
152 | while (!error && filp->f_pos < i_size_read(inode) | ||
153 | && offset < sb->s_blocksize) { | ||
154 | de = (struct ocfs2_dir_entry *) (bh->b_data + offset); | ||
155 | if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { | ||
156 | /* On error, skip the f_pos to the | ||
157 | next block. */ | ||
158 | filp->f_pos = (filp->f_pos | | ||
159 | (sb->s_blocksize - 1)) + 1; | ||
160 | brelse(bh); | ||
161 | goto bail; | ||
162 | } | ||
163 | offset += le16_to_cpu(de->rec_len); | ||
164 | if (le64_to_cpu(de->inode)) { | ||
165 | /* We might block in the next section | ||
166 | * if the data destination is | ||
167 | * currently swapped out. So, use a | ||
168 | * version stamp to detect whether or | ||
169 | * not the directory has been modified | ||
170 | * during the copy operation. | ||
171 | */ | ||
172 | unsigned long version = filp->f_version; | ||
173 | unsigned char d_type = DT_UNKNOWN; | ||
174 | |||
175 | if (de->file_type < OCFS2_FT_MAX) | ||
176 | d_type = ocfs2_filetype_table[de->file_type]; | ||
177 | error = filldir(dirent, de->name, | ||
178 | de->name_len, | ||
179 | filp->f_pos, | ||
180 | ino_from_blkno(sb, le64_to_cpu(de->inode)), | ||
181 | d_type); | ||
182 | if (error) | ||
183 | break; | ||
184 | if (version != filp->f_version) | ||
185 | goto revalidate; | ||
186 | stored++; | ||
187 | } | ||
188 | filp->f_pos += le16_to_cpu(de->rec_len); | ||
189 | } | ||
190 | offset = 0; | ||
191 | brelse(bh); | ||
192 | } | ||
193 | |||
194 | stored = 0; | ||
195 | bail: | ||
196 | if (have_disk_lock) | ||
197 | ocfs2_meta_unlock(inode, 0); | ||
198 | |||
199 | mlog_exit(stored); | ||
200 | |||
201 | return stored; | ||
202 | } | ||
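The f_pos bookkeeping above packs a block index and an in-block byte offset into one file position; the three manipulations the function leans on, in isolation (a sketch, s_blocksize being a power of two):

        blk    = pos >> sb->s_blocksize_bits;       /* which directory block */
        offset = pos & (sb->s_blocksize - 1);       /* byte within the block */
        pos    = (pos | (sb->s_blocksize - 1)) + 1; /* skip to next block */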
203 | |||
204 | /* | ||
205 | * NOTE: this should always be called with parent dir i_sem taken. | ||
206 | */ | ||
207 | int ocfs2_find_files_on_disk(const char *name, | ||
208 | int namelen, | ||
209 | u64 *blkno, | ||
210 | struct inode *inode, | ||
211 | struct buffer_head **dirent_bh, | ||
212 | struct ocfs2_dir_entry **dirent) | ||
213 | { | ||
214 | int status = -ENOENT; | ||
215 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
216 | |||
217 | mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, " | ||
218 | "inode=%p)\n", | ||
219 | osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode); | ||
220 | |||
221 | *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent); | ||
222 | if (!*dirent_bh || !*dirent) { | ||
223 | status = -ENOENT; | ||
224 | goto leave; | ||
225 | } | ||
226 | |||
227 | *blkno = le64_to_cpu((*dirent)->inode); | ||
228 | |||
229 | status = 0; | ||
230 | leave: | ||
231 | if (status < 0) { | ||
232 | *dirent = NULL; | ||
233 | if (*dirent_bh) { | ||
234 | brelse(*dirent_bh); | ||
235 | *dirent_bh = NULL; | ||
236 | } | ||
237 | } | ||
238 | |||
239 | mlog_exit(status); | ||
240 | return status; | ||
241 | } | ||
242 | |||
243 | /* Check for a name within a directory. | ||
244 | * | ||
245 | * Return 0 if the name does not exist | ||
246 | * Return -EEXIST if the directory contains the name | ||
247 | * | ||
248 | * Callers should have i_sem + a cluster lock on dir | ||
249 | */ | ||
250 | int ocfs2_check_dir_for_entry(struct inode *dir, | ||
251 | const char *name, | ||
252 | int namelen) | ||
253 | { | ||
254 | int ret; | ||
255 | struct buffer_head *dirent_bh = NULL; | ||
256 | struct ocfs2_dir_entry *dirent = NULL; | ||
257 | |||
258 | mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno, | ||
259 | namelen, name); | ||
260 | |||
261 | ret = -EEXIST; | ||
262 | dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent); | ||
263 | if (dirent_bh) | ||
264 | goto bail; | ||
265 | |||
266 | ret = 0; | ||
267 | bail: | ||
268 | if (dirent_bh) | ||
269 | brelse(dirent_bh); | ||
270 | |||
271 | mlog_exit(ret); | ||
272 | return ret; | ||
273 | } | ||
274 | |||
275 | /* | ||
276 | * routine to check that the specified directory is empty (for rmdir) | ||
277 | */ | ||
278 | int ocfs2_empty_dir(struct inode *inode) | ||
279 | { | ||
280 | unsigned long offset; | ||
281 | struct buffer_head * bh; | ||
282 | struct ocfs2_dir_entry * de, * de1; | ||
283 | struct super_block * sb; | ||
284 | int err; | ||
285 | |||
286 | sb = inode->i_sb; | ||
287 | if ((i_size_read(inode) < | ||
288 | (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) || | ||
289 | !(bh = ocfs2_bread(inode, 0, &err, 0))) { | ||
290 | mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - " | ||
291 | "no data block\n", | ||
292 | OCFS2_I(inode)->ip_blkno); | ||
293 | return 1; | ||
294 | } | ||
295 | |||
296 | de = (struct ocfs2_dir_entry *) bh->b_data; | ||
297 | de1 = (struct ocfs2_dir_entry *) | ||
298 | ((char *)de + le16_to_cpu(de->rec_len)); | ||
299 | if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) || | ||
300 | !le64_to_cpu(de1->inode) || | ||
301 | strcmp(".", de->name) || | ||
302 | strcmp("..", de1->name)) { | ||
303 | mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - " | ||
304 | "no `.' or `..'\n", | ||
305 | OCFS2_I(inode)->ip_blkno); | ||
306 | brelse(bh); | ||
307 | return 1; | ||
308 | } | ||
309 | offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); | ||
310 | de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len)); | ||
311 | while (offset < i_size_read(inode) ) { | ||
312 | if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) { | ||
313 | brelse(bh); | ||
314 | bh = ocfs2_bread(inode, | ||
315 | offset >> sb->s_blocksize_bits, &err, 0); | ||
316 | if (!bh) { | ||
317 | mlog(ML_ERROR, "directory #%"MLFu64" contains " | ||
318 | "a hole at offset %lu\n", | ||
319 | OCFS2_I(inode)->ip_blkno, offset); | ||
320 | offset += sb->s_blocksize; | ||
321 | continue; | ||
322 | } | ||
323 | de = (struct ocfs2_dir_entry *) bh->b_data; | ||
324 | } | ||
325 | if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { | ||
326 | brelse(bh); | ||
327 | return 1; | ||
328 | } | ||
329 | if (le64_to_cpu(de->inode)) { | ||
330 | brelse(bh); | ||
331 | return 0; | ||
332 | } | ||
333 | offset += le16_to_cpu(de->rec_len); | ||
334 | de = (struct ocfs2_dir_entry *) | ||
335 | ((char *)de + le16_to_cpu(de->rec_len)); | ||
336 | } | ||
337 | brelse(bh); | ||
338 | return 1; | ||
339 | } | ||
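This function and ocfs2_readdir() walk a block the same way: entries are variable-length records chained by rec_len, so "next entry" is pointer arithmetic rather than array indexing. The traversal step, pulled out as a sketch:

        static inline struct ocfs2_dir_entry *
        ocfs2_next_entry(struct ocfs2_dir_entry *de)
        {
                /* rec_len covers the name and any padding, so it is the
                 * full distance to the following record in the block */
                return (struct ocfs2_dir_entry *)((char *)de +
                                                  le16_to_cpu(de->rec_len));
        }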
340 | |||
341 | /* returns a bh of the 1st new block in the allocation. */ | ||
342 | int ocfs2_do_extend_dir(struct super_block *sb, | ||
343 | struct ocfs2_journal_handle *handle, | ||
344 | struct inode *dir, | ||
345 | struct buffer_head *parent_fe_bh, | ||
346 | struct ocfs2_alloc_context *data_ac, | ||
347 | struct ocfs2_alloc_context *meta_ac, | ||
348 | struct buffer_head **new_bh) | ||
349 | { | ||
350 | int status; | ||
351 | int extend; | ||
352 | u64 p_blkno; | ||
353 | |||
354 | spin_lock(&OCFS2_I(dir)->ip_lock); | ||
355 | extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); | ||
356 | spin_unlock(&OCFS2_I(dir)->ip_lock); | ||
357 | |||
358 | if (extend) { | ||
359 | status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1, | ||
360 | parent_fe_bh, handle, | ||
361 | data_ac, meta_ac, NULL); | ||
362 | BUG_ON(status == -EAGAIN); | ||
363 | if (status < 0) { | ||
364 | mlog_errno(status); | ||
365 | goto bail; | ||
366 | } | ||
367 | } | ||
368 | |||
369 | status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> | ||
370 | (sb->s_blocksize_bits - 9)), | ||
371 | 1, &p_blkno, NULL); | ||
372 | if (status < 0) { | ||
373 | mlog_errno(status); | ||
374 | goto bail; | ||
375 | } | ||
376 | |||
377 | *new_bh = sb_getblk(sb, p_blkno); | ||
378 | if (!*new_bh) { | ||
379 | status = -EIO; | ||
380 | mlog_errno(status); | ||
381 | goto bail; | ||
382 | } | ||
383 | status = 0; | ||
384 | bail: | ||
385 | mlog_exit(status); | ||
386 | return status; | ||
387 | } | ||
388 | |||
389 | /* assumes you already have a cluster lock on the directory. */ | ||
390 | static int ocfs2_extend_dir(struct ocfs2_super *osb, | ||
391 | struct inode *dir, | ||
392 | struct buffer_head *parent_fe_bh, | ||
393 | struct buffer_head **new_de_bh) | ||
394 | { | ||
395 | int status = 0; | ||
396 | int credits, num_free_extents; | ||
397 | loff_t dir_i_size; | ||
398 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; | ||
399 | struct ocfs2_alloc_context *data_ac = NULL; | ||
400 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
401 | struct ocfs2_journal_handle *handle = NULL; | ||
402 | struct buffer_head *new_bh = NULL; | ||
403 | struct ocfs2_dir_entry * de; | ||
404 | struct super_block *sb = osb->sb; | ||
405 | |||
406 | mlog_entry_void(); | ||
407 | |||
408 | dir_i_size = i_size_read(dir); | ||
409 | mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n", | ||
410 | OCFS2_I(dir)->ip_blkno, dir_i_size); | ||
411 | |||
412 | handle = ocfs2_alloc_handle(osb); | ||
413 | if (handle == NULL) { | ||
414 | status = -ENOMEM; | ||
415 | mlog_errno(status); | ||
416 | goto bail; | ||
417 | } | ||
418 | |||
419 | /* dir->i_size is always block aligned. */ | ||
420 | spin_lock(&OCFS2_I(dir)->ip_lock); | ||
421 | if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { | ||
422 | spin_unlock(&OCFS2_I(dir)->ip_lock); | ||
423 | num_free_extents = ocfs2_num_free_extents(osb, dir, fe); | ||
424 | if (num_free_extents < 0) { | ||
425 | status = num_free_extents; | ||
426 | mlog_errno(status); | ||
427 | goto bail; | ||
428 | } | ||
429 | |||
430 | if (!num_free_extents) { | ||
431 | status = ocfs2_reserve_new_metadata(osb, handle, | ||
432 | fe, &meta_ac); | ||
433 | if (status < 0) { | ||
434 | if (status != -ENOSPC) | ||
435 | mlog_errno(status); | ||
436 | goto bail; | ||
437 | } | ||
438 | } | ||
439 | |||
440 | status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac); | ||
441 | if (status < 0) { | ||
442 | if (status != -ENOSPC) | ||
443 | mlog_errno(status); | ||
444 | goto bail; | ||
445 | } | ||
446 | |||
447 | credits = ocfs2_calc_extend_credits(sb, fe, 1); | ||
448 | } else { | ||
449 | spin_unlock(&OCFS2_I(dir)->ip_lock); | ||
450 | credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; | ||
451 | } | ||
452 | |||
453 | handle = ocfs2_start_trans(osb, handle, credits); | ||
454 | if (IS_ERR(handle)) { | ||
455 | status = PTR_ERR(handle); | ||
456 | handle = NULL; | ||
457 | mlog_errno(status); | ||
458 | goto bail; | ||
459 | } | ||
460 | |||
461 | status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh, | ||
462 | data_ac, meta_ac, &new_bh); | ||
463 | if (status < 0) { | ||
464 | mlog_errno(status); | ||
465 | goto bail; | ||
466 | } | ||
467 | |||
468 | ocfs2_set_new_buffer_uptodate(dir, new_bh); | ||
469 | |||
470 | status = ocfs2_journal_access(handle, dir, new_bh, | ||
471 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
472 | if (status < 0) { | ||
473 | mlog_errno(status); | ||
474 | goto bail; | ||
475 | } | ||
476 | memset(new_bh->b_data, 0, sb->s_blocksize); | ||
477 | de = (struct ocfs2_dir_entry *) new_bh->b_data; | ||
478 | de->inode = 0; | ||
479 | de->rec_len = cpu_to_le16(sb->s_blocksize); | ||
480 | status = ocfs2_journal_dirty(handle, new_bh); | ||
481 | if (status < 0) { | ||
482 | mlog_errno(status); | ||
483 | goto bail; | ||
484 | } | ||
485 | |||
486 | dir_i_size += dir->i_sb->s_blocksize; | ||
487 | i_size_write(dir, dir_i_size); | ||
488 | dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size); | ||
489 | status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); | ||
490 | if (status < 0) { | ||
491 | mlog_errno(status); | ||
492 | goto bail; | ||
493 | } | ||
494 | |||
495 | *new_de_bh = new_bh; | ||
496 | get_bh(*new_de_bh); | ||
497 | bail: | ||
498 | if (handle) | ||
499 | ocfs2_commit_trans(handle); | ||
500 | |||
501 | if (data_ac) | ||
502 | ocfs2_free_alloc_context(data_ac); | ||
503 | if (meta_ac) | ||
504 | ocfs2_free_alloc_context(meta_ac); | ||
505 | |||
506 | if (new_bh) | ||
507 | brelse(new_bh); | ||
508 | |||
509 | mlog_exit(status); | ||
510 | return status; | ||
511 | } | ||
512 | |||
513 | /* | ||
514 | * Search the dir for a good spot, extending it if necessary. The | ||
515 | * block containing an appropriate record is returned in ret_de_bh. | ||
516 | */ | ||
517 | int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, | ||
518 | struct inode *dir, | ||
519 | struct buffer_head *parent_fe_bh, | ||
520 | const char *name, | ||
521 | int namelen, | ||
522 | struct buffer_head **ret_de_bh) | ||
523 | { | ||
524 | unsigned long offset; | ||
525 | struct buffer_head * bh = NULL; | ||
526 | unsigned short rec_len; | ||
527 | struct ocfs2_dinode *fe; | ||
528 | struct ocfs2_dir_entry *de; | ||
529 | struct super_block *sb; | ||
530 | int status; | ||
531 | |||
532 | mlog_entry_void(); | ||
533 | |||
534 | mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n", | ||
535 | namelen, OCFS2_I(dir)->ip_blkno); | ||
536 | |||
537 | BUG_ON(!S_ISDIR(dir->i_mode)); | ||
538 | fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; | ||
539 | BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir)); | ||
540 | |||
541 | sb = dir->i_sb; | ||
542 | |||
543 | if (!namelen) { | ||
544 | status = -EINVAL; | ||
545 | mlog_errno(status); | ||
546 | goto bail; | ||
547 | } | ||
548 | |||
549 | bh = ocfs2_bread(dir, 0, &status, 0); | ||
550 | if (!bh) { | ||
551 | mlog_errno(status); | ||
552 | goto bail; | ||
553 | } | ||
554 | |||
555 | rec_len = OCFS2_DIR_REC_LEN(namelen); | ||
556 | offset = 0; | ||
557 | de = (struct ocfs2_dir_entry *) bh->b_data; | ||
558 | while (1) { | ||
559 | if ((char *)de >= sb->s_blocksize + bh->b_data) { | ||
560 | brelse(bh); | ||
561 | bh = NULL; | ||
562 | |||
563 | if (i_size_read(dir) <= offset) { | ||
564 | status = ocfs2_extend_dir(osb, | ||
565 | dir, | ||
566 | parent_fe_bh, | ||
567 | &bh); | ||
568 | if (status < 0) { | ||
569 | mlog_errno(status); | ||
570 | goto bail; | ||
571 | } | ||
572 | BUG_ON(!bh); | ||
573 | *ret_de_bh = bh; | ||
574 | get_bh(*ret_de_bh); | ||
575 | goto bail; | ||
576 | } | ||
577 | bh = ocfs2_bread(dir, | ||
578 | offset >> sb->s_blocksize_bits, | ||
579 | &status, | ||
580 | 0); | ||
581 | if (!bh) { | ||
582 | mlog_errno(status); | ||
583 | goto bail; | ||
584 | } | ||
585 | /* move to next block */ | ||
586 | de = (struct ocfs2_dir_entry *) bh->b_data; | ||
587 | } | ||
588 | if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { | ||
589 | status = -ENOENT; | ||
590 | goto bail; | ||
591 | } | ||
592 | if (ocfs2_match(namelen, name, de)) { | ||
593 | status = -EEXIST; | ||
594 | goto bail; | ||
595 | } | ||
596 | if (((le64_to_cpu(de->inode) == 0) && | ||
597 | (le16_to_cpu(de->rec_len) >= rec_len)) || | ||
598 | (le16_to_cpu(de->rec_len) >= | ||
599 | (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) { | ||
600 | /* Ok, we found a spot. Return this bh and let | ||
601 | * the caller actually fill it in. */ | ||
602 | *ret_de_bh = bh; | ||
603 | get_bh(*ret_de_bh); | ||
604 | status = 0; | ||
605 | goto bail; | ||
606 | } | ||
607 | offset += le16_to_cpu(de->rec_len); | ||
608 | de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); | ||
609 | } | ||
610 | |||
611 | status = 0; | ||
612 | bail: | ||
613 | if (bh) | ||
614 | brelse(bh); | ||
615 | |||
616 | mlog_exit(status); | ||
617 | return status; | ||
618 | } | ||
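The fit test inside the loop above is the heart of the insert path; restated as a predicate (a sketch; the patch inlines the condition):

        /* a slot fits if it is an unused record that is big enough outright,
         * or a live record with enough slack past its own name to split */
        static int ocfs2_dirent_has_room(struct ocfs2_dir_entry *de,
                                         unsigned short rec_len)
        {
                if (le64_to_cpu(de->inode) == 0 &&
                    le16_to_cpu(de->rec_len) >= rec_len)
                        return 1;
                return le16_to_cpu(de->rec_len) >=
                       OCFS2_DIR_REC_LEN(de->name_len) + rec_len;
        }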
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h new file mode 100644 index 000000000000..5f614ec9649c --- /dev/null +++ b/fs/ocfs2/dir.h | |||
@@ -0,0 +1,54 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dir.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_DIR_H | ||
27 | #define OCFS2_DIR_H | ||
28 | |||
29 | int ocfs2_check_dir_for_entry(struct inode *dir, | ||
30 | const char *name, | ||
31 | int namelen); | ||
32 | int ocfs2_empty_dir(struct inode *inode); /* FIXME: to namei.c */ | ||
33 | int ocfs2_find_files_on_disk(const char *name, | ||
34 | int namelen, | ||
35 | u64 *blkno, | ||
36 | struct inode *inode, | ||
37 | struct buffer_head **dirent_bh, | ||
38 | struct ocfs2_dir_entry **dirent); | ||
39 | int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); | ||
40 | int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, | ||
41 | struct inode *dir, | ||
42 | struct buffer_head *parent_fe_bh, | ||
43 | const char *name, | ||
44 | int namelen, | ||
45 | struct buffer_head **ret_de_bh); | ||
46 | struct ocfs2_alloc_context; | ||
47 | int ocfs2_do_extend_dir(struct super_block *sb, | ||
48 | struct ocfs2_journal_handle *handle, | ||
49 | struct inode *dir, | ||
50 | struct buffer_head *parent_fe_bh, | ||
51 | struct ocfs2_alloc_context *data_ac, | ||
52 | struct ocfs2_alloc_context *meta_ac, | ||
53 | struct buffer_head **new_bh); | ||
54 | #endif /* OCFS2_DIR_H */ | ||
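Note the reference-counting contract implied by the get_bh() calls in ocfs2_prepare_dir_for_insert() above: on success the caller owns an extra reference on *ret_de_bh and must brelse() it when done. A hedged sketch of the call pattern (fill_dir_entry() is a hypothetical stand-in for writing the actual entry, not a symbol from this patch):

    static int add_entry(struct ocfs2_super *osb, struct inode *dir,
                         struct buffer_head *parent_fe_bh,
                         const char *name, int namelen)
    {
            struct buffer_head *de_bh = NULL;
            int status;

            status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
                                                  name, namelen, &de_bh);
            if (status < 0)
                    return status;

            /* de_bh references a block with room for the new entry */
            fill_dir_entry(de_bh, name, namelen);

            brelse(de_bh);      /* drop the reference taken for us */
            return 0;
    }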
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile new file mode 100644 index 000000000000..ce3f7c29d270 --- /dev/null +++ b/fs/ocfs2/dlm/Makefile | |||
@@ -0,0 +1,8 @@ | |||
1 | EXTRA_CFLAGS += -Ifs/ocfs2 | ||
2 | |||
3 | obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o | ||
4 | |||
5 | ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ | ||
6 | dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o | ||
7 | |||
8 | ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o | ||
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h new file mode 100644 index 000000000000..53652f51c0e1 --- /dev/null +++ b/fs/ocfs2/dlm/dlmapi.h | |||
@@ -0,0 +1,214 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmapi.h | ||
5 | * | ||
6 | * externally exported dlm interfaces | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | #ifndef DLMAPI_H | ||
28 | #define DLMAPI_H | ||
29 | |||
30 | struct dlm_lock; | ||
31 | struct dlm_ctxt; | ||
32 | |||
33 | /* NOTE: changes made to this enum should be reflected in dlmdebug.c */ | ||
34 | enum dlm_status { | ||
35 | DLM_NORMAL = 0, /* 0: request in progress */ | ||
36 | DLM_GRANTED, /* 1: request granted */ | ||
37 | DLM_DENIED, /* 2: request denied */ | ||
38 | DLM_DENIED_NOLOCKS, /* 3: request denied, out of system resources */ | ||
39 | DLM_WORKING, /* 4: async request in progress */ | ||
40 | DLM_BLOCKED, /* 5: lock request blocked */ | ||
41 | DLM_BLOCKED_ORPHAN, /* 6: lock request blocked by an orphan lock */ | ||
42 | DLM_DENIED_GRACE_PERIOD, /* 7: topological change in progress */ | ||
43 | DLM_SYSERR, /* 8: system error */ | ||
44 | DLM_NOSUPPORT, /* 9: unsupported */ | ||
45 | DLM_CANCELGRANT, /* 10: can't cancel convert: already granted */ | ||
46 | DLM_IVLOCKID, /* 11: bad lockid */ | ||
47 | DLM_SYNC, /* 12: synchronous request granted */ | ||
48 | DLM_BADTYPE, /* 13: bad resource type */ | ||
49 | DLM_BADRESOURCE, /* 14: bad resource handle */ | ||
50 | DLM_MAXHANDLES, /* 15: no more resource handles */ | ||
51 | DLM_NOCLINFO, /* 16: can't contact cluster manager */ | ||
52 | DLM_NOLOCKMGR, /* 17: can't contact lock manager */ | ||
53 | DLM_NOPURGED, /* 18: can't contact purge daemon */ | ||
54 | DLM_BADARGS, /* 19: bad api args */ | ||
55 | DLM_VOID, /* 20: no status */ | ||
56 | DLM_NOTQUEUED, /* 21: NOQUEUE was specified and request failed */ | ||
57 | DLM_IVBUFLEN, /* 22: invalid resource name length */ | ||
58 | DLM_CVTUNGRANT, /* 23: attempted to convert ungranted lock */ | ||
59 | DLM_BADPARAM, /* 24: invalid lock mode specified */ | ||
60 | DLM_VALNOTVALID, /* 25: value block has been invalidated */ | ||
61 | DLM_REJECTED, /* 26: request rejected, unrecognized client */ | ||
62 | DLM_ABORT, /* 27: blocked lock request cancelled */ | ||
63 | DLM_CANCEL, /* 28: conversion request cancelled */ | ||
64 | DLM_IVRESHANDLE, /* 29: invalid resource handle */ | ||
65 | DLM_DEADLOCK, /* 30: deadlock recovery refused this request */ | ||
66 | DLM_DENIED_NOASTS, /* 31: failed to allocate AST */ | ||
67 | DLM_FORWARD, /* 32: request must wait for primary's response */ | ||
68 | DLM_TIMEOUT, /* 33: timeout value for lock has expired */ | ||
69 | DLM_IVGROUPID, /* 34: invalid group specification */ | ||
70 | DLM_VERS_CONFLICT, /* 35: version conflicts prevent request handling */ | ||
71 | DLM_BAD_DEVICE_PATH, /* 36: Locks device does not exist or path wrong */ | ||
72 | DLM_NO_DEVICE_PERMISSION, /* 37: Client has insufficient perms for device */ | ||
73 | DLM_NO_CONTROL_DEVICE, /* 38: Cannot set options on opened device */ | ||
74 | |||
75 | DLM_RECOVERING, /* 39: extension, allows caller to fail a lock | ||
76 | request if it is being recovered */ | ||
77 | DLM_MIGRATING, /* 40: extension, allows caller to fail a lock | ||
78 | request if it is being migrated */ | ||
79 | DLM_MAXSTATS, /* 41: upper limit for return code validation */ | ||
80 | }; | ||
81 | |||
82 | /* for pretty-printing dlm_status error messages */ | ||
83 | const char *dlm_errmsg(enum dlm_status err); | ||
84 | /* for pretty-printing dlm_status error names */ | ||
85 | const char *dlm_errname(enum dlm_status err); | ||
86 | |||
87 | /* Eventually the DLM will use standard errno values, but in the | ||
88 | * meantime this lets us track dlm errors as they bubble up. When we | ||
89 | * bring its error reporting into line with the rest of the stack, | ||
90 | * these can just be replaced with calls to mlog_errno. */ | ||
91 | #define dlm_error(st) do { \ | ||
92 | if ((st) != DLM_RECOVERING && \ | ||
93 | (st) != DLM_MIGRATING && \ | ||
94 | (st) != DLM_FORWARD) \ | ||
95 | mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \ | ||
96 | } while (0) | ||
97 | |||
98 | #define DLM_LKSB_UNUSED1 0x01 | ||
99 | #define DLM_LKSB_PUT_LVB 0x02 | ||
100 | #define DLM_LKSB_GET_LVB 0x04 | ||
101 | #define DLM_LKSB_UNUSED2 0x08 | ||
102 | #define DLM_LKSB_UNUSED3 0x10 | ||
103 | #define DLM_LKSB_UNUSED4 0x20 | ||
104 | #define DLM_LKSB_UNUSED5 0x40 | ||
105 | #define DLM_LKSB_UNUSED6 0x80 | ||
106 | |||
107 | #define DLM_LVB_LEN 64 | ||
108 | |||
109 | /* Callers are only allowed access to the lvb and status members of | ||
110 | * this struct. */ | ||
111 | struct dlm_lockstatus { | ||
112 | enum dlm_status status; | ||
113 | u32 flags; | ||
114 | struct dlm_lock *lockid; | ||
115 | char lvb[DLM_LVB_LEN]; | ||
116 | }; | ||
117 | |||
118 | /* Valid lock modes. */ | ||
119 | #define LKM_IVMODE (-1) /* invalid mode */ | ||
120 | #define LKM_NLMODE 0 /* null lock */ | ||
121 | #define LKM_CRMODE 1 /* concurrent read (unsupported) */ | ||
122 | #define LKM_CWMODE 2 /* concurrent write (unsupported) */ | ||
123 | #define LKM_PRMODE 3 /* protected read */ | ||
124 | #define LKM_PWMODE 4 /* protected write (unsupported) */ | ||
125 | #define LKM_EXMODE 5 /* exclusive */ | ||
126 | #define LKM_MAXMODE 5 | ||
127 | #define LKM_MODEMASK 0xff | ||
128 | |||
129 | /* Flags passed to dlmlock and dlmunlock: | ||
130 | * reserved: flags used by the "real" dlm | ||
131 | * only a few are supported by this dlm | ||
132 | * (U) = unsupported by ocfs2 dlm */ | ||
133 | #define LKM_ORPHAN 0x00000010 /* this lock is orphanable (U) */ | ||
134 | #define LKM_PARENTABLE 0x00000020 /* this lock was orphaned (U) */ | ||
135 | #define LKM_BLOCK 0x00000040 /* blocking lock request (U) */ | ||
136 | #define LKM_LOCAL 0x00000080 /* local lock request */ | ||
137 | #define LKM_VALBLK 0x00000100 /* lock value block request */ | ||
138 | #define LKM_NOQUEUE 0x00000200 /* non blocking request */ | ||
139 | #define LKM_CONVERT 0x00000400 /* conversion request */ | ||
140 | #define LKM_NODLCKWT 0x00000800 /* this lock won't deadlock (U) */ | ||
141 | #define LKM_UNLOCK 0x00001000 /* deallocate this lock */ | ||
142 | #define LKM_CANCEL 0x00002000 /* cancel conversion request */ | ||
143 | #define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */ | ||
144 | #define LKM_INVVALBLK 0x00008000 /* invalidate lock value block */ | ||
145 | #define LKM_SYNCSTS 0x00010000 /* return synchronous status if possible (U) */ | ||
146 | #define LKM_TIMEOUT 0x00020000 /* lock request contains timeout (U) */ | ||
147 | #define LKM_SNGLDLCK 0x00040000 /* request can self-deadlock (U) */ | ||
148 | #define LKM_FINDLOCAL 0x00080000 /* find local lock request (U) */ | ||
149 | #define LKM_PROC_OWNED 0x00100000 /* owned by process, not group (U) */ | ||
150 | #define LKM_XID 0x00200000 /* use transaction id for deadlock (U) */ | ||
151 | #define LKM_XID_CONFLICT 0x00400000 /* do not allow lock inheritance (U) */ | ||
152 | #define LKM_FORCE 0x00800000 /* force unlock flag */ | ||
153 | #define LKM_REVVALBLK 0x01000000 /* temporary solution: re-validate | ||
154 | lock value block (U) */ | ||
155 | /* unused */ | ||
156 | #define LKM_UNUSED1 0x00000001 /* unused */ | ||
157 | #define LKM_UNUSED2 0x00000002 /* unused */ | ||
158 | #define LKM_UNUSED3 0x00000004 /* unused */ | ||
159 | #define LKM_UNUSED4 0x00000008 /* unused */ | ||
160 | #define LKM_UNUSED5 0x02000000 /* unused */ | ||
161 | #define LKM_UNUSED6 0x04000000 /* unused */ | ||
162 | #define LKM_UNUSED7 0x08000000 /* unused */ | ||
163 | |||
164 | /* ocfs2 extensions: internal only | ||
165 | * should never be used by caller */ | ||
166 | #define LKM_MIGRATION 0x10000000 /* extension: lockres is to be migrated | ||
167 | to another node */ | ||
168 | #define LKM_PUT_LVB 0x20000000 /* extension: lvb is being passed | ||
169 | should be applied to lockres */ | ||
170 | #define LKM_GET_LVB 0x40000000 /* extension: lvb should be copied | ||
171 | from lockres when lock is granted */ | ||
172 | #define LKM_RECOVERY 0x80000000 /* extension: flag for recovery lock | ||
173 | used to avoid recovery rwsem */ | ||
174 | |||
175 | |||
176 | typedef void (dlm_astlockfunc_t)(void *); | ||
177 | typedef void (dlm_bastlockfunc_t)(void *, int); | ||
178 | typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status); | ||
179 | |||
180 | enum dlm_status dlmlock(struct dlm_ctxt *dlm, | ||
181 | int mode, | ||
182 | struct dlm_lockstatus *lksb, | ||
183 | int flags, | ||
184 | const char *name, | ||
185 | dlm_astlockfunc_t *ast, | ||
186 | void *data, | ||
187 | dlm_bastlockfunc_t *bast); | ||
188 | |||
189 | enum dlm_status dlmunlock(struct dlm_ctxt *dlm, | ||
190 | struct dlm_lockstatus *lksb, | ||
191 | int flags, | ||
192 | dlm_astunlockfunc_t *unlockast, | ||
193 | void *data); | ||
194 | |||
195 | struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key); | ||
196 | |||
197 | void dlm_unregister_domain(struct dlm_ctxt *dlm); | ||
198 | |||
199 | void dlm_print_one_lock(struct dlm_lock *lockid); | ||
200 | |||
201 | typedef void (dlm_eviction_func)(int, void *); | ||
202 | struct dlm_eviction_cb { | ||
203 | struct list_head ec_item; | ||
204 | dlm_eviction_func *ec_func; | ||
205 | void *ec_data; | ||
206 | }; | ||
207 | void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb, | ||
208 | dlm_eviction_func *f, | ||
209 | void *data); | ||
210 | void dlm_register_eviction_cb(struct dlm_ctxt *dlm, | ||
211 | struct dlm_eviction_cb *cb); | ||
212 | void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb); | ||
213 | |||
214 | #endif /* DLMAPI_H */ | ||
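Taken together, the interface above is used asynchronously: dlmlock() returns once the request is queued, and the grant (or a blocking notification) arrives later through the ast and bast callbacks. A hedged sketch of a hypothetical in-kernel client, assuming ERR_PTR-style returns from dlm_register_domain() and with all waiting and error handling elided (the domain name, key, and lock name are made up for illustration):

    static struct dlm_lockstatus my_lksb;

    static void my_ast(void *astdata)
    {
            /* lock granted; my_lksb.status carries the dlm_status */
    }

    static void my_bast(void *astdata, int blocked_type)
    {
            /* another node wants an incompatible mode; release soon */
    }

    static void my_unlock_ast(void *astdata, enum dlm_status st)
    {
    }

    static void example(void)
    {
            struct dlm_ctxt *dlm;
            enum dlm_status st;

            dlm = dlm_register_domain("exampledomain", 0x45584d50);
            if (IS_ERR(dlm))
                    return;

            st = dlmlock(dlm, LKM_EXMODE, &my_lksb, LKM_NOQUEUE,
                         "mylock", my_ast, NULL, my_bast);
            if (st == DLM_NORMAL) {
                    /* ... wait for my_ast, use the resource ... */
                    dlmunlock(dlm, &my_lksb, 0, my_unlock_ast, NULL);
            }

            dlm_unregister_domain(dlm);
    }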
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c new file mode 100644 index 000000000000..8d17d28ef91c --- /dev/null +++ b/fs/ocfs2/dlm/dlmast.c | |||
@@ -0,0 +1,466 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmast.c | ||
5 | * | ||
6 | * AST and BAST functionality for local and remote nodes | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | |||
28 | #include <linux/module.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/types.h> | ||
31 | #include <linux/slab.h> | ||
32 | #include <linux/highmem.h> | ||
33 | #include <linux/utsname.h> | ||
34 | #include <linux/init.h> | ||
35 | #include <linux/sysctl.h> | ||
36 | #include <linux/random.h> | ||
37 | #include <linux/blkdev.h> | ||
38 | #include <linux/socket.h> | ||
39 | #include <linux/inet.h> | ||
40 | #include <linux/spinlock.h> | ||
41 | |||
42 | |||
43 | #include "cluster/heartbeat.h" | ||
44 | #include "cluster/nodemanager.h" | ||
45 | #include "cluster/tcp.h" | ||
46 | #include "cluster/endian.h" | ||
47 | |||
48 | #include "dlmapi.h" | ||
49 | #include "dlmcommon.h" | ||
50 | |||
51 | #define MLOG_MASK_PREFIX ML_DLM | ||
52 | #include "cluster/masklog.h" | ||
53 | |||
54 | static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | ||
55 | struct dlm_lock *lock); | ||
56 | static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); | ||
57 | |||
58 | /* Should be called as an ast gets queued to see if the new | ||
59 | * lock level will obsolete a pending bast. | ||
60 | * For example, if dlm_thread queued a bast for an EX lock that | ||
61 | * was blocking another EX, but before sending the bast the | ||
62 | * lock owner downconverted to NL, the bast is now obsolete. | ||
63 | * Only the ast should be sent. | ||
64 | * This is needed because the lock and convert paths can queue | ||
65 | * asts out-of-band (not waiting for dlm_thread) in order to | ||
66 | * allow for LKM_NOQUEUE to get immediate responses. */ | ||
67 | static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) | ||
68 | { | ||
69 | assert_spin_locked(&dlm->ast_lock); | ||
70 | assert_spin_locked(&lock->spinlock); | ||
71 | |||
72 | if (lock->ml.highest_blocked == LKM_IVMODE) | ||
73 | return 0; | ||
74 | BUG_ON(lock->ml.highest_blocked == LKM_NLMODE); | ||
75 | |||
76 | if (lock->bast_pending && | ||
77 | list_empty(&lock->bast_list)) | ||
78 | /* old bast already sent, ok */ | ||
79 | return 0; | ||
80 | |||
81 | if (lock->ml.type == LKM_EXMODE) | ||
82 | /* EX blocks anything left, any bast still valid */ | ||
83 | return 0; | ||
84 | else if (lock->ml.type == LKM_NLMODE) | ||
85 | /* NL blocks nothing, no reason to send any bast, cancel it */ | ||
86 | return 1; | ||
87 | else if (lock->ml.highest_blocked != LKM_EXMODE) | ||
88 | /* PR only blocks EX */ | ||
89 | return 1; | ||
90 | |||
91 | return 0; | ||
92 | } | ||
93 | |||
94 | static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) | ||
95 | { | ||
96 | mlog_entry_void(); | ||
97 | |||
98 | BUG_ON(!dlm); | ||
99 | BUG_ON(!lock); | ||
100 | |||
101 | assert_spin_locked(&dlm->ast_lock); | ||
102 | if (!list_empty(&lock->ast_list)) { | ||
103 | mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n", | ||
104 | lock->ast_pending, lock->ml.type); | ||
105 | BUG(); | ||
106 | } | ||
107 | BUG_ON(!list_empty(&lock->ast_list)); | ||
108 | if (lock->ast_pending) | ||
109 | mlog(0, "lock has an ast getting flushed right now\n"); | ||
110 | |||
111 | /* putting lock on list, add a ref */ | ||
112 | dlm_lock_get(lock); | ||
113 | spin_lock(&lock->spinlock); | ||
114 | |||
115 | /* check to see if this ast obsoletes the bast */ | ||
116 | if (dlm_should_cancel_bast(dlm, lock)) { | ||
117 | struct dlm_lock_resource *res = lock->lockres; | ||
118 | mlog(0, "%s: cancelling bast for %.*s\n", | ||
119 | dlm->name, res->lockname.len, res->lockname.name); | ||
120 | lock->bast_pending = 0; | ||
121 | list_del_init(&lock->bast_list); | ||
122 | lock->ml.highest_blocked = LKM_IVMODE; | ||
123 | /* removing lock from list, remove a ref. guaranteed | ||
124 | * this won't be the last ref because of the get above, | ||
125 | * so res->spinlock will not be taken here */ | ||
126 | dlm_lock_put(lock); | ||
127 | /* free up the reserved bast that we are cancelling. | ||
128 | * guaranteed that this will not be the last reserved | ||
129 | * ast because *both* an ast and a bast were reserved | ||
130 | * to get to this point. the res->spinlock will not be | ||
131 | * taken here */ | ||
132 | dlm_lockres_release_ast(dlm, res); | ||
133 | } | ||
134 | list_add_tail(&lock->ast_list, &dlm->pending_asts); | ||
135 | lock->ast_pending = 1; | ||
136 | spin_unlock(&lock->spinlock); | ||
137 | } | ||
138 | |||
139 | void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) | ||
140 | { | ||
141 | mlog_entry_void(); | ||
142 | |||
143 | BUG_ON(!dlm); | ||
144 | BUG_ON(!lock); | ||
145 | |||
146 | spin_lock(&dlm->ast_lock); | ||
147 | __dlm_queue_ast(dlm, lock); | ||
148 | spin_unlock(&dlm->ast_lock); | ||
149 | } | ||
150 | |||
151 | |||
152 | static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) | ||
153 | { | ||
154 | mlog_entry_void(); | ||
155 | |||
156 | BUG_ON(!dlm); | ||
157 | BUG_ON(!lock); | ||
158 | assert_spin_locked(&dlm->ast_lock); | ||
159 | |||
160 | BUG_ON(!list_empty(&lock->bast_list)); | ||
161 | if (lock->bast_pending) | ||
162 | mlog(0, "lock has a bast getting flushed right now\n"); | ||
163 | |||
164 | /* putting lock on list, add a ref */ | ||
165 | dlm_lock_get(lock); | ||
166 | spin_lock(&lock->spinlock); | ||
167 | list_add_tail(&lock->bast_list, &dlm->pending_basts); | ||
168 | lock->bast_pending = 1; | ||
169 | spin_unlock(&lock->spinlock); | ||
170 | } | ||
171 | |||
172 | void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) | ||
173 | { | ||
174 | mlog_entry_void(); | ||
175 | |||
176 | BUG_ON(!dlm); | ||
177 | BUG_ON(!lock); | ||
178 | |||
179 | spin_lock(&dlm->ast_lock); | ||
180 | __dlm_queue_bast(dlm, lock); | ||
181 | spin_unlock(&dlm->ast_lock); | ||
182 | } | ||
183 | |||
184 | static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | ||
185 | struct dlm_lock *lock) | ||
186 | { | ||
187 | struct dlm_lockstatus *lksb = lock->lksb; | ||
188 | BUG_ON(!lksb); | ||
189 | |||
190 | /* only updates if this node masters the lockres */ | ||
191 | if (res->owner == dlm->node_num) { | ||
192 | |||
193 | spin_lock(&res->spinlock); | ||
194 | /* check the lksb flags for the direction */ | ||
195 | if (lksb->flags & DLM_LKSB_GET_LVB) { | ||
196 | mlog(0, "getting lvb from lockres for %s node\n", | ||
197 | lock->ml.node == dlm->node_num ? "master" : | ||
198 | "remote"); | ||
199 | memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN); | ||
200 | } else if (lksb->flags & DLM_LKSB_PUT_LVB) { | ||
201 | mlog(0, "setting lvb from lockres for %s node\n", | ||
202 | lock->ml.node == dlm->node_num ? "master" : | ||
203 | "remote"); | ||
204 | memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); | ||
205 | } | ||
206 | spin_unlock(&res->spinlock); | ||
207 | } | ||
208 | |||
209 | /* reset any lvb flags on the lksb */ | ||
210 | lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); | ||
211 | } | ||
212 | |||
213 | void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | ||
214 | struct dlm_lock *lock) | ||
215 | { | ||
216 | dlm_astlockfunc_t *fn; | ||
217 | struct dlm_lockstatus *lksb; | ||
218 | |||
219 | mlog_entry_void(); | ||
220 | |||
221 | lksb = lock->lksb; | ||
222 | fn = lock->ast; | ||
223 | BUG_ON(lock->ml.node != dlm->node_num); | ||
224 | |||
225 | dlm_update_lvb(dlm, res, lock); | ||
226 | (*fn)(lock->astdata); | ||
227 | } | ||
228 | |||
229 | |||
230 | int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | ||
231 | struct dlm_lock *lock) | ||
232 | { | ||
233 | int ret; | ||
234 | struct dlm_lockstatus *lksb; | ||
235 | int lksbflags; | ||
236 | |||
237 | mlog_entry_void(); | ||
238 | |||
239 | lksb = lock->lksb; | ||
240 | BUG_ON(lock->ml.node == dlm->node_num); | ||
241 | |||
242 | lksbflags = lksb->flags; | ||
243 | dlm_update_lvb(dlm, res, lock); | ||
244 | |||
245 | /* lock request came from another node | ||
246 | * go do the ast over there */ | ||
247 | ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags); | ||
248 | return ret; | ||
249 | } | ||
250 | |||
251 | void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | ||
252 | struct dlm_lock *lock, int blocked_type) | ||
253 | { | ||
254 | dlm_bastlockfunc_t *fn = lock->bast; | ||
255 | |||
256 | mlog_entry_void(); | ||
257 | BUG_ON(lock->ml.node != dlm->node_num); | ||
258 | |||
259 | (*fn)(lock->astdata, blocked_type); | ||
260 | } | ||
261 | |||
262 | |||
263 | |||
264 | int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data) | ||
265 | { | ||
266 | int ret; | ||
267 | unsigned int locklen; | ||
268 | struct dlm_ctxt *dlm = data; | ||
269 | struct dlm_lock_resource *res = NULL; | ||
270 | struct dlm_lock *lock = NULL; | ||
271 | struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; | ||
272 | char *name; | ||
273 | struct list_head *iter, *head=NULL; | ||
274 | u64 cookie; | ||
275 | u32 flags; | ||
276 | |||
277 | if (!dlm_grab(dlm)) { | ||
278 | dlm_error(DLM_REJECTED); | ||
279 | return DLM_REJECTED; | ||
280 | } | ||
281 | |||
282 | mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), | ||
283 | "Domain %s not fully joined!\n", dlm->name); | ||
284 | |||
285 | name = past->name; | ||
286 | locklen = past->namelen; | ||
287 | cookie = be64_to_cpu(past->cookie); | ||
288 | flags = be32_to_cpu(past->flags); | ||
289 | |||
290 | if (locklen > DLM_LOCKID_NAME_MAX) { | ||
291 | ret = DLM_IVBUFLEN; | ||
292 | mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n"); | ||
293 | goto leave; | ||
294 | } | ||
295 | |||
296 | if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == | ||
297 | (LKM_PUT_LVB|LKM_GET_LVB)) { | ||
298 | mlog(ML_ERROR, "both PUT and GET lvb specified\n"); | ||
299 | ret = DLM_BADARGS; | ||
300 | goto leave; | ||
301 | } | ||
302 | |||
303 | mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : | ||
304 | (flags & LKM_GET_LVB ? "get lvb" : "none")); | ||
305 | |||
306 | mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type); | ||
307 | |||
308 | if (past->type != DLM_AST && | ||
309 | past->type != DLM_BAST) { | ||
310 | mlog(ML_ERROR, "Unknown ast type! %d, cookie=%"MLFu64", " | ||
311 | "name=%.*s\n", past->type, cookie, locklen, name); | ||
312 | ret = DLM_IVLOCKID; | ||
313 | goto leave; | ||
314 | } | ||
315 | |||
316 | res = dlm_lookup_lockres(dlm, name, locklen); | ||
317 | if (!res) { | ||
318 | mlog(ML_ERROR, "got %sast for unknown lockres! " | ||
319 | "cookie=%"MLFu64", name=%.*s, namelen=%u\n", | ||
320 | past->type == DLM_AST ? "" : "b", | ||
321 | cookie, locklen, name, locklen); | ||
322 | ret = DLM_IVLOCKID; | ||
323 | goto leave; | ||
324 | } | ||
325 | |||
326 | /* cannot get a proxy ast message if this node owns it */ | ||
327 | BUG_ON(res->owner == dlm->node_num); | ||
328 | |||
329 | mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name); | ||
330 | |||
331 | spin_lock(&res->spinlock); | ||
332 | if (res->state & DLM_LOCK_RES_RECOVERING) { | ||
333 | mlog(0, "responding with DLM_RECOVERING!\n"); | ||
334 | ret = DLM_RECOVERING; | ||
335 | goto unlock_out; | ||
336 | } | ||
337 | if (res->state & DLM_LOCK_RES_MIGRATING) { | ||
338 | mlog(0, "responding with DLM_MIGRATING!\n"); | ||
339 | ret = DLM_MIGRATING; | ||
340 | goto unlock_out; | ||
341 | } | ||
342 | /* try convert queue for both ast/bast */ | ||
343 | head = &res->converting; | ||
344 | lock = NULL; | ||
345 | list_for_each(iter, head) { | ||
346 | lock = list_entry(iter, struct dlm_lock, list); | ||
347 | if (be64_to_cpu(lock->ml.cookie) == cookie) | ||
348 | goto do_ast; | ||
349 | } | ||
350 | |||
351 | /* if not on convert, try blocked for ast, granted for bast */ | ||
352 | if (past->type == DLM_AST) | ||
353 | head = &res->blocked; | ||
354 | else | ||
355 | head = &res->granted; | ||
356 | |||
357 | list_for_each(iter, head) { | ||
358 | lock = list_entry(iter, struct dlm_lock, list); | ||
359 | if (be64_to_cpu(lock->ml.cookie) == cookie) | ||
360 | goto do_ast; | ||
361 | } | ||
362 | |||
363 | mlog(ML_ERROR, "got %sast for unknown lock! cookie=%"MLFu64", " | ||
364 | "name=%.*s, namelen=%u\n", | ||
365 | past->type == DLM_AST ? "" : "b", cookie, locklen, name, locklen); | ||
366 | |||
367 | ret = DLM_NORMAL; | ||
368 | unlock_out: | ||
369 | spin_unlock(&res->spinlock); | ||
370 | goto leave; | ||
371 | |||
372 | do_ast: | ||
373 | ret = DLM_NORMAL; | ||
374 | if (past->type == DLM_AST) { | ||
375 | /* do not alter lock refcount. switching lists. */ | ||
376 | list_del_init(&lock->list); | ||
377 | list_add_tail(&lock->list, &res->granted); | ||
378 | mlog(0, "ast: adding to granted list... type=%d, " | ||
379 | "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); | ||
380 | if (lock->ml.convert_type != LKM_IVMODE) { | ||
381 | lock->ml.type = lock->ml.convert_type; | ||
382 | lock->ml.convert_type = LKM_IVMODE; | ||
383 | } else { | ||
384 | // should already be there.... | ||
385 | } | ||
386 | |||
387 | lock->lksb->status = DLM_NORMAL; | ||
388 | |||
389 | /* if we requested the lvb, fetch it into our lksb now */ | ||
390 | if (flags & LKM_GET_LVB) { | ||
391 | BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB)); | ||
392 | memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN); | ||
393 | } | ||
394 | } | ||
395 | spin_unlock(&res->spinlock); | ||
396 | |||
397 | if (past->type == DLM_AST) | ||
398 | dlm_do_local_ast(dlm, res, lock); | ||
399 | else | ||
400 | dlm_do_local_bast(dlm, res, lock, past->blocked_type); | ||
401 | |||
402 | leave: | ||
403 | |||
404 | if (res) | ||
405 | dlm_lockres_put(res); | ||
406 | |||
407 | dlm_put(dlm); | ||
408 | return ret; | ||
409 | } | ||
410 | |||
411 | |||
412 | |||
413 | int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | ||
414 | struct dlm_lock *lock, int msg_type, | ||
415 | int blocked_type, int flags) | ||
416 | { | ||
417 | int ret = 0; | ||
418 | struct dlm_proxy_ast past; | ||
419 | struct kvec vec[2]; | ||
420 | size_t veclen = 1; | ||
421 | int status; | ||
422 | |||
423 | mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n", | ||
424 | res->lockname.len, res->lockname.name, lock->ml.node, | ||
425 | msg_type, blocked_type); | ||
426 | |||
427 | memset(&past, 0, sizeof(struct dlm_proxy_ast)); | ||
428 | past.node_idx = dlm->node_num; | ||
429 | past.type = msg_type; | ||
430 | past.blocked_type = blocked_type; | ||
431 | past.namelen = res->lockname.len; | ||
432 | memcpy(past.name, res->lockname.name, past.namelen); | ||
433 | past.cookie = lock->ml.cookie; | ||
434 | |||
435 | vec[0].iov_len = sizeof(struct dlm_proxy_ast); | ||
436 | vec[0].iov_base = &past; | ||
437 | if (flags & DLM_LKSB_GET_LVB) { | ||
438 | mlog(0, "returning requested LVB data\n"); | ||
439 | be32_add_cpu(&past.flags, LKM_GET_LVB); | ||
440 | vec[1].iov_len = DLM_LVB_LEN; | ||
441 | vec[1].iov_base = lock->lksb->lvb; | ||
442 | veclen++; | ||
443 | } | ||
444 | |||
445 | ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, | ||
446 | lock->ml.node, &status); | ||
447 | if (ret < 0) | ||
448 | mlog_errno(ret); | ||
449 | else { | ||
450 | if (status == DLM_RECOVERING) { | ||
451 | mlog(ML_ERROR, "sent AST to node %u, it thinks this " | ||
452 | "node is dead!\n", lock->ml.node); | ||
453 | BUG(); | ||
454 | } else if (status == DLM_MIGRATING) { | ||
455 | mlog(ML_ERROR, "sent AST to node %u, it returned " | ||
456 | "DLM_MIGRATING!\n", lock->ml.node); | ||
457 | BUG(); | ||
458 | } else if (status != DLM_NORMAL) { | ||
459 | mlog(ML_ERROR, "AST to node %u returned %d!\n", | ||
460 | lock->ml.node, status); | ||
461 | /* ignore it */ | ||
462 | } | ||
463 | ret = 0; | ||
464 | } | ||
465 | return ret; | ||
466 | } | ||
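The rule implemented by dlm_should_cancel_bast() above reduces to a small decision on the holder's (possibly downconverted) mode versus the highest mode it was blocking. Restated over plain ints as a sketch (it drops the pending-list bookkeeping the kernel version also checks):

    #define IVMODE (-1)     /* stand-ins for the LKM_*MODE constants */
    #define NLMODE 0
    #define PRMODE 3
    #define EXMODE 5

    /* nonzero if a queued bast has become obsolete for this lock */
    static int should_cancel_bast(int holder_mode, int highest_blocked)
    {
            if (highest_blocked == IVMODE)
                    return 0;   /* nothing was blocked: no bast queued */
            if (holder_mode == EXMODE)
                    return 0;   /* EX still blocks everything */
            if (holder_mode == NLMODE)
                    return 1;   /* NL blocks nothing: bast is pointless */
            /* holder is PR, which only blocks EX requests */
            return highest_blocked != EXMODE;
    }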
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h new file mode 100644 index 000000000000..3fecba0a6023 --- /dev/null +++ b/fs/ocfs2/dlm/dlmcommon.h | |||
@@ -0,0 +1,884 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmcommon.h | ||
5 | * | ||
6 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public | ||
10 | * License as published by the Free Software Foundation; either | ||
11 | * version 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
16 | * General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public | ||
19 | * License along with this program; if not, write to the | ||
20 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
21 | * Boston, MA 02111-1307, USA. | ||
22 | * | ||
23 | */ | ||
24 | |||
25 | #ifndef DLMCOMMON_H | ||
26 | #define DLMCOMMON_H | ||
27 | |||
28 | #include <linux/kref.h> | ||
29 | |||
30 | #define DLM_HB_NODE_DOWN_PRI (0xf000000) | ||
31 | #define DLM_HB_NODE_UP_PRI (0x8000000) | ||
32 | |||
33 | #define DLM_LOCKID_NAME_MAX 32 | ||
34 | |||
35 | #define DLM_DOMAIN_NAME_MAX_LEN 255 | ||
36 | #define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES | ||
37 | #define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes | ||
38 | #define DLM_THREAD_MS 200 // flush at least every 200 ms | ||
39 | |||
40 | #define DLM_HASH_BITS 7 | ||
41 | #define DLM_HASH_SIZE (1 << DLM_HASH_BITS) | ||
42 | #define DLM_HASH_MASK (DLM_HASH_SIZE - 1) | ||
43 | |||
44 | enum dlm_ast_type { | ||
45 | DLM_AST = 0, | ||
46 | DLM_BAST, | ||
47 | DLM_ASTUNLOCK | ||
48 | }; | ||
49 | |||
50 | |||
51 | #define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \ | ||
52 | LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \ | ||
53 | LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE) | ||
54 | |||
55 | #define DLM_RECOVERY_LOCK_NAME "$RECOVERY" | ||
56 | #define DLM_RECOVERY_LOCK_NAME_LEN 9 | ||
57 | |||
58 | static inline int dlm_is_recovery_lock(const char *lock_name, int name_len) | ||
59 | { | ||
60 | if (name_len == DLM_RECOVERY_LOCK_NAME_LEN && | ||
61 | memcmp(lock_name, DLM_RECOVERY_LOCK_NAME, name_len)==0) | ||
62 | return 1; | ||
63 | return 0; | ||
64 | } | ||
65 | |||
66 | #define DLM_RECO_STATE_ACTIVE 0x0001 | ||
67 | |||
68 | struct dlm_recovery_ctxt | ||
69 | { | ||
70 | struct list_head resources; | ||
71 | struct list_head received; | ||
72 | struct list_head node_data; | ||
73 | u8 new_master; | ||
74 | u8 dead_node; | ||
75 | u16 state; | ||
76 | unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
77 | wait_queue_head_t event; | ||
78 | }; | ||
79 | |||
80 | enum dlm_ctxt_state { | ||
81 | DLM_CTXT_NEW = 0, | ||
82 | DLM_CTXT_JOINED, | ||
83 | DLM_CTXT_IN_SHUTDOWN, | ||
84 | DLM_CTXT_LEAVING, | ||
85 | }; | ||
86 | |||
87 | struct dlm_ctxt | ||
88 | { | ||
89 | struct list_head list; | ||
90 | struct list_head *resources; | ||
91 | struct list_head dirty_list; | ||
92 | struct list_head purge_list; | ||
93 | struct list_head pending_asts; | ||
94 | struct list_head pending_basts; | ||
95 | unsigned int purge_count; | ||
96 | spinlock_t spinlock; | ||
97 | spinlock_t ast_lock; | ||
98 | char *name; | ||
99 | u8 node_num; | ||
100 | u32 key; | ||
101 | u8 joining_node; | ||
102 | wait_queue_head_t dlm_join_events; | ||
103 | unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
104 | unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
105 | unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
106 | struct dlm_recovery_ctxt reco; | ||
107 | spinlock_t master_lock; | ||
108 | struct list_head master_list; | ||
109 | struct list_head mle_hb_events; | ||
110 | |||
111 | /* these give a really vague idea of the system load */ | ||
112 | atomic_t local_resources; | ||
113 | atomic_t remote_resources; | ||
114 | atomic_t unknown_resources; | ||
115 | |||
116 | /* NOTE: Next three are protected by dlm_domain_lock */ | ||
117 | struct kref dlm_refs; | ||
118 | enum dlm_ctxt_state dlm_state; | ||
119 | unsigned int num_joins; | ||
120 | |||
121 | struct o2hb_callback_func dlm_hb_up; | ||
122 | struct o2hb_callback_func dlm_hb_down; | ||
123 | struct task_struct *dlm_thread_task; | ||
124 | struct task_struct *dlm_reco_thread_task; | ||
125 | wait_queue_head_t dlm_thread_wq; | ||
126 | wait_queue_head_t dlm_reco_thread_wq; | ||
127 | wait_queue_head_t ast_wq; | ||
128 | wait_queue_head_t migration_wq; | ||
129 | |||
130 | struct work_struct dispatched_work; | ||
131 | struct list_head work_list; | ||
132 | spinlock_t work_lock; | ||
133 | struct list_head dlm_domain_handlers; | ||
134 | struct list_head dlm_eviction_callbacks; | ||
135 | }; | ||
136 | |||
137 | /* these keventd work queue items are for less-frequently | ||
138 | * called functions that cannot be directly called from the | ||
139 | * net message handlers for some reason, usually because | ||
140 | * they need to send net messages of their own. */ | ||
141 | void dlm_dispatch_work(void *data); | ||
142 | |||
143 | struct dlm_lock_resource; | ||
144 | struct dlm_work_item; | ||
145 | |||
146 | typedef void (dlm_workfunc_t)(struct dlm_work_item *, void *); | ||
147 | |||
148 | struct dlm_request_all_locks_priv | ||
149 | { | ||
150 | u8 reco_master; | ||
151 | u8 dead_node; | ||
152 | }; | ||
153 | |||
154 | struct dlm_mig_lockres_priv | ||
155 | { | ||
156 | struct dlm_lock_resource *lockres; | ||
157 | u8 real_master; | ||
158 | }; | ||
159 | |||
160 | struct dlm_assert_master_priv | ||
161 | { | ||
162 | struct dlm_lock_resource *lockres; | ||
163 | u8 request_from; | ||
164 | u32 flags; | ||
165 | unsigned ignore_higher:1; | ||
166 | }; | ||
167 | |||
168 | |||
169 | struct dlm_work_item | ||
170 | { | ||
171 | struct list_head list; | ||
172 | dlm_workfunc_t *func; | ||
173 | struct dlm_ctxt *dlm; | ||
174 | void *data; | ||
175 | union { | ||
176 | struct dlm_request_all_locks_priv ral; | ||
177 | struct dlm_mig_lockres_priv ml; | ||
178 | struct dlm_assert_master_priv am; | ||
179 | } u; | ||
180 | }; | ||
181 | |||
182 | static inline void dlm_init_work_item(struct dlm_ctxt *dlm, | ||
183 | struct dlm_work_item *i, | ||
184 | dlm_workfunc_t *f, void *data) | ||
185 | { | ||
186 | memset(i, 0, sizeof(*i)); | ||
187 | i->func = f; | ||
188 | INIT_LIST_HEAD(&i->list); | ||
189 | i->data = data; | ||
190 | i->dlm = dlm; /* must have already done a dlm_grab on this! */ | ||
191 | } | ||
192 | |||
193 | |||
194 | |||
195 | static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, | ||
196 | u8 node) | ||
197 | { | ||
198 | assert_spin_locked(&dlm->spinlock); | ||
199 | |||
200 | dlm->joining_node = node; | ||
201 | wake_up(&dlm->dlm_join_events); | ||
202 | } | ||
203 | |||
204 | #define DLM_LOCK_RES_UNINITED 0x00000001 | ||
205 | #define DLM_LOCK_RES_RECOVERING 0x00000002 | ||
206 | #define DLM_LOCK_RES_READY 0x00000004 | ||
207 | #define DLM_LOCK_RES_DIRTY 0x00000008 | ||
208 | #define DLM_LOCK_RES_IN_PROGRESS 0x00000010 | ||
209 | #define DLM_LOCK_RES_MIGRATING 0x00000020 | ||
210 | |||
211 | #define DLM_PURGE_INTERVAL_MS (8 * 1000) | ||
212 | |||
213 | struct dlm_lock_resource | ||
214 | { | ||
215 | /* WARNING: Please see the comment in dlm_init_lockres before | ||
216 | * adding fields here. */ | ||
217 | struct list_head list; | ||
218 | struct kref refs; | ||
219 | |||
220 | /* please keep these next 3 in this order | ||
221 | * some funcs want to iterate over all lists */ | ||
222 | struct list_head granted; | ||
223 | struct list_head converting; | ||
224 | struct list_head blocked; | ||
225 | |||
226 | struct list_head dirty; | ||
227 | struct list_head recovering; // dlm_recovery_ctxt.resources list | ||
228 | |||
229 | /* unused lock resources have their last_used stamped and are | ||
230 | * put on a list for the dlm thread to run. */ | ||
231 | struct list_head purge; | ||
232 | unsigned long last_used; | ||
233 | |||
234 | unsigned migration_pending:1; | ||
235 | atomic_t asts_reserved; | ||
236 | spinlock_t spinlock; | ||
237 | wait_queue_head_t wq; | ||
238 | u8 owner; //node which owns the lock resource, or unknown | ||
239 | u16 state; | ||
240 | struct qstr lockname; | ||
241 | char lvb[DLM_LVB_LEN]; | ||
242 | }; | ||
243 | |||
244 | struct dlm_migratable_lock | ||
245 | { | ||
246 | __be64 cookie; | ||
247 | |||
248 | /* these 3 are just padding for the in-memory structure, but | ||
249 | * list and flags are actually used when sent over the wire */ | ||
250 | __be16 pad1; | ||
251 | u8 list; // 0=granted, 1=converting, 2=blocked | ||
252 | u8 flags; | ||
253 | |||
254 | s8 type; | ||
255 | s8 convert_type; | ||
256 | s8 highest_blocked; | ||
257 | u8 node; | ||
258 | }; // 16 bytes | ||
259 | |||
260 | struct dlm_lock | ||
261 | { | ||
262 | struct dlm_migratable_lock ml; | ||
263 | |||
264 | struct list_head list; | ||
265 | struct list_head ast_list; | ||
266 | struct list_head bast_list; | ||
267 | struct dlm_lock_resource *lockres; | ||
268 | spinlock_t spinlock; | ||
269 | struct kref lock_refs; | ||
270 | |||
271 | // ast and bast must be callable while holding a spinlock! | ||
272 | dlm_astlockfunc_t *ast; | ||
273 | dlm_bastlockfunc_t *bast; | ||
274 | void *astdata; | ||
275 | struct dlm_lockstatus *lksb; | ||
276 | unsigned ast_pending:1, | ||
277 | bast_pending:1, | ||
278 | convert_pending:1, | ||
279 | lock_pending:1, | ||
280 | cancel_pending:1, | ||
281 | unlock_pending:1, | ||
282 | lksb_kernel_allocated:1; | ||
283 | }; | ||
284 | |||
285 | |||
286 | #define DLM_LKSB_UNUSED1 0x01 | ||
287 | #define DLM_LKSB_PUT_LVB 0x02 | ||
288 | #define DLM_LKSB_GET_LVB 0x04 | ||
289 | #define DLM_LKSB_UNUSED2 0x08 | ||
290 | #define DLM_LKSB_UNUSED3 0x10 | ||
291 | #define DLM_LKSB_UNUSED4 0x20 | ||
292 | #define DLM_LKSB_UNUSED5 0x40 | ||
293 | #define DLM_LKSB_UNUSED6 0x80 | ||
294 | |||
295 | |||
296 | enum dlm_lockres_list { | ||
297 | DLM_GRANTED_LIST = 0, | ||
298 | DLM_CONVERTING_LIST, | ||
299 | DLM_BLOCKED_LIST | ||
300 | }; | ||
301 | |||
302 | static inline struct list_head * | ||
303 | dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) | ||
304 | { | ||
305 | struct list_head *ret = NULL; | ||
306 | if (idx == DLM_GRANTED_LIST) | ||
307 | ret = &res->granted; | ||
308 | else if (idx == DLM_CONVERTING_LIST) | ||
309 | ret = &res->converting; | ||
310 | else if (idx == DLM_BLOCKED_LIST) | ||
311 | ret = &res->blocked; | ||
312 | else | ||
313 | BUG(); | ||
314 | return ret; | ||
315 | } | ||
316 | |||
317 | |||
318 | |||
319 | |||
320 | struct dlm_node_iter | ||
321 | { | ||
322 | unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
323 | int curnode; | ||
324 | }; | ||
325 | |||
326 | |||
327 | enum { | ||
328 | DLM_MASTER_REQUEST_MSG = 500, | ||
329 | DLM_UNUSED_MSG1, /* 501 */ | ||
330 | DLM_ASSERT_MASTER_MSG, /* 502 */ | ||
331 | DLM_CREATE_LOCK_MSG, /* 503 */ | ||
332 | DLM_CONVERT_LOCK_MSG, /* 504 */ | ||
333 | DLM_PROXY_AST_MSG, /* 505 */ | ||
334 | DLM_UNLOCK_LOCK_MSG, /* 506 */ | ||
335 | DLM_UNUSED_MSG2, /* 507 */ | ||
336 | DLM_MIGRATE_REQUEST_MSG, /* 508 */ | ||
337 | DLM_MIG_LOCKRES_MSG, /* 509 */ | ||
338 | DLM_QUERY_JOIN_MSG, /* 510 */ | ||
339 | DLM_ASSERT_JOINED_MSG, /* 511 */ | ||
340 | DLM_CANCEL_JOIN_MSG, /* 512 */ | ||
341 | DLM_EXIT_DOMAIN_MSG, /* 513 */ | ||
342 | DLM_MASTER_REQUERY_MSG, /* 514 */ | ||
343 | DLM_LOCK_REQUEST_MSG, /* 515 */ | ||
344 | DLM_RECO_DATA_DONE_MSG, /* 516 */ | ||
345 | DLM_BEGIN_RECO_MSG, /* 517 */ | ||
346 | DLM_FINALIZE_RECO_MSG /* 518 */ | ||
347 | }; | ||
348 | |||
349 | struct dlm_reco_node_data | ||
350 | { | ||
351 | int state; | ||
352 | u8 node_num; | ||
353 | struct list_head list; | ||
354 | }; | ||
355 | |||
356 | enum { | ||
357 | DLM_RECO_NODE_DATA_DEAD = -1, | ||
358 | DLM_RECO_NODE_DATA_INIT = 0, | ||
359 | DLM_RECO_NODE_DATA_REQUESTING, | ||
360 | DLM_RECO_NODE_DATA_REQUESTED, | ||
361 | DLM_RECO_NODE_DATA_RECEIVING, | ||
362 | DLM_RECO_NODE_DATA_DONE, | ||
363 | DLM_RECO_NODE_DATA_FINALIZE_SENT, | ||
364 | }; | ||
365 | |||
366 | |||
367 | enum { | ||
368 | DLM_MASTER_RESP_NO = 0, | ||
369 | DLM_MASTER_RESP_YES, | ||
370 | DLM_MASTER_RESP_MAYBE, | ||
371 | DLM_MASTER_RESP_ERROR | ||
372 | }; | ||
373 | |||
374 | |||
375 | struct dlm_master_request | ||
376 | { | ||
377 | u8 node_idx; | ||
378 | u8 namelen; | ||
379 | __be16 pad1; | ||
380 | __be32 flags; | ||
381 | |||
382 | u8 name[O2NM_MAX_NAME_LEN]; | ||
383 | }; | ||
384 | |||
385 | #define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001 | ||
386 | #define DLM_ASSERT_MASTER_REQUERY 0x00000002 | ||
387 | #define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004 | ||
388 | struct dlm_assert_master | ||
389 | { | ||
390 | u8 node_idx; | ||
391 | u8 namelen; | ||
392 | __be16 pad1; | ||
393 | __be32 flags; | ||
394 | |||
395 | u8 name[O2NM_MAX_NAME_LEN]; | ||
396 | }; | ||
397 | |||
398 | struct dlm_migrate_request | ||
399 | { | ||
400 | u8 master; | ||
401 | u8 new_master; | ||
402 | u8 namelen; | ||
403 | u8 pad1; | ||
404 | __be32 pad2; | ||
405 | u8 name[O2NM_MAX_NAME_LEN]; | ||
406 | }; | ||
407 | |||
408 | struct dlm_master_requery | ||
409 | { | ||
410 | u8 pad1; | ||
411 | u8 pad2; | ||
412 | u8 node_idx; | ||
413 | u8 namelen; | ||
414 | __be32 pad3; | ||
415 | u8 name[O2NM_MAX_NAME_LEN]; | ||
416 | }; | ||
417 | |||
418 | #define DLM_MRES_RECOVERY 0x01 | ||
419 | #define DLM_MRES_MIGRATION 0x02 | ||
420 | #define DLM_MRES_ALL_DONE 0x04 | ||
421 | |||
422 | /* | ||
423 | * We would like to get one whole lockres into a single network | ||
424 | * message whenever possible. Generally speaking, there will be | ||
425 | * at most one dlm_lock on a lockres for each node in the cluster, | ||
426 | * plus (infrequently) any additional locks coming in from userdlm. | ||
427 | * | ||
428 | * struct _dlm_lockres_page | ||
429 | * { | ||
430 | * dlm_migratable_lockres mres; | ||
431 | * dlm_migratable_lock ml[DLM_MAX_MIGRATABLE_LOCKS]; | ||
432 | * u8 pad[DLM_MIG_LOCKRES_RESERVED]; | ||
433 | * }; | ||
434 | * | ||
435 | * from ../cluster/tcp.h | ||
436 | * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) | ||
437 | * (roughly 4080 bytes) | ||
438 | * and sizeof(dlm_migratable_lockres) = 112 bytes | ||
439 | * and sizeof(dlm_migratable_lock) = 16 bytes | ||
440 | * | ||
441 | * Choosing DLM_MAX_MIGRATABLE_LOCKS=240 and | ||
442 | * DLM_MIG_LOCKRES_RESERVED=128 means we have this: | ||
443 | * | ||
444 | * (DLM_MAX_MIGRATABLE_LOCKS * sizeof(dlm_migratable_lock)) + | ||
445 | * sizeof(dlm_migratable_lockres) + DLM_MIG_LOCKRES_RESERVED = | ||
446 | * NET_MAX_PAYLOAD_BYTES | ||
447 | * (240 * 16) + 112 + 128 = 4080 | ||
448 | * | ||
449 | * So a lockres would need more than 240 locks before it would | ||
450 | * use more than one network packet to recover. Not too bad. | ||
451 | */ | ||
452 | #define DLM_MAX_MIGRATABLE_LOCKS 240 | ||
453 | |||
454 | struct dlm_migratable_lockres | ||
455 | { | ||
456 | u8 master; | ||
457 | u8 lockname_len; | ||
458 | u8 num_locks; // locks sent in this structure | ||
459 | u8 flags; | ||
460 | __be32 total_locks; // locks to be sent for this migration cookie | ||
461 | __be64 mig_cookie; // cookie for this lockres migration | ||
462 | // or zero if not needed | ||
463 | // 16 bytes | ||
464 | u8 lockname[DLM_LOCKID_NAME_MAX]; | ||
465 | // 48 bytes | ||
466 | u8 lvb[DLM_LVB_LEN]; | ||
467 | // 112 bytes | ||
468 | struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112 | ||
469 | }; | ||
470 | #define DLM_MIG_LOCKRES_MAX_LEN \ | ||
471 | (sizeof(struct dlm_migratable_lockres) + \ | ||
472 | (sizeof(struct dlm_migratable_lock) * \ | ||
473 | DLM_MAX_MIGRATABLE_LOCKS) ) | ||
474 | |||
475 | /* from above, 128 bytes | ||
476 | * for some undetermined future use */ | ||
477 | #define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \ | ||
478 | DLM_MIG_LOCKRES_MAX_LEN) | ||
479 | |||
480 | struct dlm_create_lock | ||
481 | { | ||
482 | __be64 cookie; | ||
483 | |||
484 | __be32 flags; | ||
485 | u8 pad1; | ||
486 | u8 node_idx; | ||
487 | s8 requested_type; | ||
488 | u8 namelen; | ||
489 | |||
490 | u8 name[O2NM_MAX_NAME_LEN]; | ||
491 | }; | ||
492 | |||
493 | struct dlm_convert_lock | ||
494 | { | ||
495 | __be64 cookie; | ||
496 | |||
497 | __be32 flags; | ||
498 | u8 pad1; | ||
499 | u8 node_idx; | ||
500 | s8 requested_type; | ||
501 | u8 namelen; | ||
502 | |||
503 | u8 name[O2NM_MAX_NAME_LEN]; | ||
504 | |||
505 | s8 lvb[0]; | ||
506 | }; | ||
507 | #define DLM_CONVERT_LOCK_MAX_LEN (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN) | ||
508 | |||
509 | struct dlm_unlock_lock | ||
510 | { | ||
511 | __be64 cookie; | ||
512 | |||
513 | __be32 flags; | ||
514 | __be16 pad1; | ||
515 | u8 node_idx; | ||
516 | u8 namelen; | ||
517 | |||
518 | u8 name[O2NM_MAX_NAME_LEN]; | ||
519 | |||
520 | s8 lvb[0]; | ||
521 | }; | ||
522 | #define DLM_UNLOCK_LOCK_MAX_LEN (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN) | ||
523 | |||
524 | struct dlm_proxy_ast | ||
525 | { | ||
526 | __be64 cookie; | ||
527 | |||
528 | __be32 flags; | ||
529 | u8 node_idx; | ||
530 | u8 type; | ||
531 | u8 blocked_type; | ||
532 | u8 namelen; | ||
533 | |||
534 | u8 name[O2NM_MAX_NAME_LEN]; | ||
535 | |||
536 | s8 lvb[0]; | ||
537 | }; | ||
538 | #define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN) | ||
539 | |||
540 | #define DLM_MOD_KEY (0x666c6172) | ||
541 | enum dlm_query_join_response { | ||
542 | JOIN_DISALLOW = 0, | ||
543 | JOIN_OK, | ||
544 | JOIN_OK_NO_MAP, | ||
545 | }; | ||
546 | |||
547 | struct dlm_lock_request | ||
548 | { | ||
549 | u8 node_idx; | ||
550 | u8 dead_node; | ||
551 | __be16 pad1; | ||
552 | __be32 pad2; | ||
553 | }; | ||
554 | |||
555 | struct dlm_reco_data_done | ||
556 | { | ||
557 | u8 node_idx; | ||
558 | u8 dead_node; | ||
559 | __be16 pad1; | ||
560 | __be32 pad2; | ||
561 | |||
562 | /* unused for now */ | ||
563 | /* eventually we can use this to attempt | ||
564 | * lvb recovery based on each node's info */ | ||
565 | u8 reco_lvb[DLM_LVB_LEN]; | ||
566 | }; | ||
567 | |||
568 | struct dlm_begin_reco | ||
569 | { | ||
570 | u8 node_idx; | ||
571 | u8 dead_node; | ||
572 | __be16 pad1; | ||
573 | __be32 pad2; | ||
574 | }; | ||
575 | |||
576 | |||
577 | struct dlm_query_join_request | ||
578 | { | ||
579 | u8 node_idx; | ||
580 | u8 pad1[2]; | ||
581 | u8 name_len; | ||
582 | u8 domain[O2NM_MAX_NAME_LEN]; | ||
583 | }; | ||
584 | |||
585 | struct dlm_assert_joined | ||
586 | { | ||
587 | u8 node_idx; | ||
588 | u8 pad1[2]; | ||
589 | u8 name_len; | ||
590 | u8 domain[O2NM_MAX_NAME_LEN]; | ||
591 | }; | ||
592 | |||
593 | struct dlm_cancel_join | ||
594 | { | ||
595 | u8 node_idx; | ||
596 | u8 pad1[2]; | ||
597 | u8 name_len; | ||
598 | u8 domain[O2NM_MAX_NAME_LEN]; | ||
599 | }; | ||
600 | |||
601 | struct dlm_exit_domain | ||
602 | { | ||
603 | u8 node_idx; | ||
604 | u8 pad1[3]; | ||
605 | }; | ||
606 | |||
607 | struct dlm_finalize_reco | ||
608 | { | ||
609 | u8 node_idx; | ||
610 | u8 dead_node; | ||
611 | __be16 pad1; | ||
612 | __be32 pad2; | ||
613 | }; | ||
614 | |||
615 | static inline enum dlm_status | ||
616 | __dlm_lockres_state_to_status(struct dlm_lock_resource *res) | ||
617 | { | ||
618 | enum dlm_status status = DLM_NORMAL; | ||
619 | |||
620 | assert_spin_locked(&res->spinlock); | ||
621 | |||
622 | if (res->state & DLM_LOCK_RES_RECOVERING) | ||
623 | status = DLM_RECOVERING; | ||
624 | else if (res->state & DLM_LOCK_RES_MIGRATING) | ||
625 | status = DLM_MIGRATING; | ||
626 | else if (res->state & DLM_LOCK_RES_IN_PROGRESS) | ||
627 | status = DLM_FORWARD; | ||
628 | |||
629 | return status; | ||
630 | } | ||
631 | |||
632 | struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, | ||
633 | struct dlm_lockstatus *lksb); | ||
634 | void dlm_lock_get(struct dlm_lock *lock); | ||
635 | void dlm_lock_put(struct dlm_lock *lock); | ||
636 | |||
637 | void dlm_lock_attach_lockres(struct dlm_lock *lock, | ||
638 | struct dlm_lock_resource *res); | ||
639 | |||
640 | int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data); | ||
641 | int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data); | ||
642 | int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data); | ||
643 | |||
644 | void dlm_revert_pending_convert(struct dlm_lock_resource *res, | ||
645 | struct dlm_lock *lock); | ||
646 | void dlm_revert_pending_lock(struct dlm_lock_resource *res, | ||
647 | struct dlm_lock *lock); | ||
648 | |||
649 | int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data); | ||
650 | void dlm_commit_pending_cancel(struct dlm_lock_resource *res, | ||
651 | struct dlm_lock *lock); | ||
652 | void dlm_commit_pending_unlock(struct dlm_lock_resource *res, | ||
653 | struct dlm_lock *lock); | ||
654 | |||
655 | int dlm_launch_thread(struct dlm_ctxt *dlm); | ||
656 | void dlm_complete_thread(struct dlm_ctxt *dlm); | ||
657 | int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); | ||
658 | void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); | ||
659 | void dlm_wait_for_recovery(struct dlm_ctxt *dlm); | ||
660 | |||
661 | void dlm_put(struct dlm_ctxt *dlm); | ||
662 | struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); | ||
663 | int dlm_domain_fully_joined(struct dlm_ctxt *dlm); | ||
664 | |||
665 | void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, | ||
666 | struct dlm_lock_resource *res); | ||
667 | void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, | ||
668 | struct dlm_lock_resource *res); | ||
669 | void dlm_purge_lockres(struct dlm_ctxt *dlm, | ||
670 | struct dlm_lock_resource *lockres); | ||
671 | void dlm_lockres_get(struct dlm_lock_resource *res); | ||
672 | void dlm_lockres_put(struct dlm_lock_resource *res); | ||
673 | void __dlm_unhash_lockres(struct dlm_lock_resource *res); | ||
674 | void __dlm_insert_lockres(struct dlm_ctxt *dlm, | ||
675 | struct dlm_lock_resource *res); | ||
676 | struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, | ||
677 | const char *name, | ||
678 | unsigned int len); | ||
679 | struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, | ||
680 | const char *name, | ||
681 | unsigned int len); | ||
682 | |||
683 | int dlm_is_host_down(int errno); | ||
684 | void dlm_change_lockres_owner(struct dlm_ctxt *dlm, | ||
685 | struct dlm_lock_resource *res, | ||
686 | u8 owner); | ||
687 | struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, | ||
688 | const char *lockid, | ||
689 | int flags); | ||
690 | struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, | ||
691 | const char *name, | ||
692 | unsigned int namelen); | ||
693 | |||
694 | void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); | ||
695 | void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); | ||
696 | void dlm_do_local_ast(struct dlm_ctxt *dlm, | ||
697 | struct dlm_lock_resource *res, | ||
698 | struct dlm_lock *lock); | ||
699 | int dlm_do_remote_ast(struct dlm_ctxt *dlm, | ||
700 | struct dlm_lock_resource *res, | ||
701 | struct dlm_lock *lock); | ||
702 | void dlm_do_local_bast(struct dlm_ctxt *dlm, | ||
703 | struct dlm_lock_resource *res, | ||
704 | struct dlm_lock *lock, | ||
705 | int blocked_type); | ||
706 | int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, | ||
707 | struct dlm_lock_resource *res, | ||
708 | struct dlm_lock *lock, | ||
709 | int msg_type, | ||
710 | int blocked_type, int flags); | ||
711 | static inline int dlm_send_proxy_bast(struct dlm_ctxt *dlm, | ||
712 | struct dlm_lock_resource *res, | ||
713 | struct dlm_lock *lock, | ||
714 | int blocked_type) | ||
715 | { | ||
716 | return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_BAST, | ||
717 | blocked_type, 0); | ||
718 | } | ||
719 | |||
720 | static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm, | ||
721 | struct dlm_lock_resource *res, | ||
722 | struct dlm_lock *lock, | ||
723 | int flags) | ||
724 | { | ||
725 | return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_AST, | ||
726 | 0, flags); | ||
727 | } | ||
728 | |||
729 | void dlm_print_one_lock_resource(struct dlm_lock_resource *res); | ||
730 | void __dlm_print_one_lock_resource(struct dlm_lock_resource *res); | ||
731 | |||
732 | u8 dlm_nm_this_node(struct dlm_ctxt *dlm); | ||
733 | void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); | ||
734 | void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); | ||
735 | |||
736 | |||
737 | int dlm_nm_init(struct dlm_ctxt *dlm); | ||
738 | int dlm_heartbeat_init(struct dlm_ctxt *dlm); | ||
739 | void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data); | ||
740 | void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data); | ||
741 | |||
742 | int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); | ||
743 | int dlm_migrate_lockres(struct dlm_ctxt *dlm, | ||
744 | struct dlm_lock_resource *res, | ||
745 | u8 target); | ||
746 | int dlm_finish_migration(struct dlm_ctxt *dlm, | ||
747 | struct dlm_lock_resource *res, | ||
748 | u8 old_master); | ||
749 | void dlm_lockres_release_ast(struct dlm_ctxt *dlm, | ||
750 | struct dlm_lock_resource *res); | ||
751 | void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res); | ||
752 | |||
753 | int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data); | ||
754 | int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data); | ||
755 | int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data); | ||
756 | int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data); | ||
757 | int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data); | ||
758 | int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data); | ||
759 | int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data); | ||
760 | int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data); | ||
761 | int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data); | ||
762 | |||
763 | int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, | ||
764 | struct dlm_lock_resource *res, | ||
765 | int ignore_higher, | ||
766 | u8 request_from, | ||
767 | u32 flags); | ||
768 | |||
769 | |||
770 | int dlm_send_one_lockres(struct dlm_ctxt *dlm, | ||
771 | struct dlm_lock_resource *res, | ||
772 | struct dlm_migratable_lockres *mres, | ||
773 | u8 send_to, | ||
774 | u8 flags); | ||
775 | void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, | ||
776 | struct dlm_lock_resource *res); | ||
777 | |||
778 | /* will exit holding res->spinlock, but may drop in function */ | ||
779 | void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags); | ||
780 | void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags); | ||
781 | |||
782 | /* will exit holding res->spinlock, but may drop in function */ | ||
783 | static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) | ||
784 | { | ||
785 | __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS| | ||
786 | DLM_LOCK_RES_RECOVERING| | ||
787 | DLM_LOCK_RES_MIGRATING)); | ||
788 | } | ||
789 | |||
790 | |||
791 | int dlm_init_mle_cache(void); | ||
792 | void dlm_destroy_mle_cache(void); | ||
793 | void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); | ||
794 | void dlm_clean_master_list(struct dlm_ctxt *dlm, | ||
795 | u8 dead_node); | ||
796 | int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); | ||
797 | |||
798 | |||
799 | static inline const char * dlm_lock_mode_name(int mode) | ||
800 | { | ||
801 | switch (mode) { | ||
802 | case LKM_EXMODE: | ||
803 | return "EX"; | ||
804 | case LKM_PRMODE: | ||
805 | return "PR"; | ||
806 | case LKM_NLMODE: | ||
807 | return "NL"; | ||
808 | } | ||
809 | return "UNKNOWN"; | ||
810 | } | ||
811 | |||
812 | |||
813 | static inline int dlm_lock_compatible(int existing, int request) | ||
814 | { | ||
815 | /* NO_LOCK compatible with all */ | ||
816 | if (request == LKM_NLMODE || | ||
817 | existing == LKM_NLMODE) | ||
818 | return 1; | ||
819 | |||
820 | /* EX incompatible with all non-NO_LOCK */ | ||
821 | if (request == LKM_EXMODE) | ||
822 | return 0; | ||
823 | |||
824 | /* request must be PR, which is compatible with PR */ | ||
825 | if (existing == LKM_PRMODE) | ||
826 | return 1; | ||
827 | |||
828 | return 0; | ||
829 | } | ||
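
The checks above encode the standard three-mode compatibility matrix: NL is compatible with everything, PR with PR and NL, and EX only with NL. A minimal userspace sketch of the same matrix, illustrative only and using its own mode numbering rather than the kernel's LKM_* values:

#include <stdio.h>

enum mode { NL, PR, EX };

/* compatible[existing][requested]: 1 if the two modes can coexist */
static const int compatible[3][3] = {
	/*          NL  PR  EX   <- requested */
	/* NL */  { 1,  1,  1 },
	/* PR */  { 1,  1,  0 },
	/* EX */  { 1,  0,  0 },
};

int main(void)
{
	printf("PR vs PR: %d\n", compatible[PR][PR]);	/* 1: shared readers */
	printf("PR vs EX: %d\n", compatible[PR][EX]);	/* 0: EX excludes PR */
	return 0;
}

Evaluating dlm_lock_compatible() over all nine mode pairs gives the same answers as this table.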
830 | |||
831 | static inline int dlm_lock_on_list(struct list_head *head, | ||
832 | struct dlm_lock *lock) | ||
833 | { | ||
834 | struct list_head *iter; | ||
835 | struct dlm_lock *tmplock; | ||
836 | |||
837 | list_for_each(iter, head) { | ||
838 | tmplock = list_entry(iter, struct dlm_lock, list); | ||
839 | if (tmplock == lock) | ||
840 | return 1; | ||
841 | } | ||
842 | return 0; | ||
843 | } | ||
844 | |||
845 | |||
846 | static inline enum dlm_status dlm_err_to_dlm_status(int err) | ||
847 | { | ||
848 | enum dlm_status ret; | ||
849 | if (err == -ENOMEM) | ||
850 | ret = DLM_SYSERR; | ||
851 | else if (err == -ETIMEDOUT || o2net_link_down(err, NULL)) | ||
852 | ret = DLM_NOLOCKMGR; | ||
853 | else if (err == -EINVAL) | ||
854 | ret = DLM_BADPARAM; | ||
855 | else if (err == -ENAMETOOLONG) | ||
856 | ret = DLM_IVBUFLEN; | ||
857 | else | ||
858 | ret = DLM_BADARGS; | ||
859 | return ret; | ||
860 | } | ||
861 | |||
862 | |||
863 | static inline void dlm_node_iter_init(unsigned long *map, | ||
864 | struct dlm_node_iter *iter) | ||
865 | { | ||
866 | memcpy(iter->node_map, map, sizeof(iter->node_map)); | ||
867 | iter->curnode = -1; | ||
868 | } | ||
869 | |||
870 | static inline int dlm_node_iter_next(struct dlm_node_iter *iter) | ||
871 | { | ||
872 | int bit; | ||
873 | bit = find_next_bit(iter->node_map, O2NM_MAX_NODES, iter->curnode+1); | ||
874 | if (bit >= O2NM_MAX_NODES) { | ||
875 | iter->curnode = O2NM_MAX_NODES; | ||
876 | return -ENOENT; | ||
877 | } | ||
878 | iter->curnode = bit; | ||
879 | return bit; | ||
880 | } | ||
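
dlm_node_iter_next() hands back each set bit in turn and -ENOENT once the map is exhausted, so callers simply loop until the return value goes negative. A standalone sketch of the same pattern; the single-word 64-bit map and the names here are illustrative, not the kernel's O2NM_MAX_NODES bitmap:

#include <errno.h>
#include <stdio.h>

#define MAX_NODES 64

struct node_iter {
	unsigned long long map;
	int curnode;
};

static void iter_init(unsigned long long map, struct node_iter *it)
{
	it->map = map;
	it->curnode = -1;	/* so the first next() scans from bit 0 */
}

static int iter_next(struct node_iter *it)
{
	for (int bit = it->curnode + 1; bit < MAX_NODES; bit++) {
		if (it->map & (1ULL << bit)) {
			it->curnode = bit;
			return bit;
		}
	}
	it->curnode = MAX_NODES;	/* park the cursor past the end */
	return -ENOENT;
}

int main(void)
{
	struct node_iter it;
	int node;

	iter_init((1ULL << 2) | (1ULL << 5), &it);
	while ((node = iter_next(&it)) >= 0)
		printf("node %d is in the map\n", node);	/* 2, then 5 */
	return 0;
}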
881 | |||
882 | |||
883 | |||
884 | #endif /* DLMCOMMON_H */ | ||
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c new file mode 100644 index 000000000000..6001b22a997d --- /dev/null +++ b/fs/ocfs2/dlm/dlmconvert.c | |||
@@ -0,0 +1,530 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmconvert.c | ||
5 | * | ||
6 | * underlying calls for lock conversion | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | |||
28 | #include <linux/module.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/types.h> | ||
31 | #include <linux/slab.h> | ||
32 | #include <linux/highmem.h> | ||
33 | #include <linux/utsname.h> | ||
34 | #include <linux/init.h> | ||
35 | #include <linux/sysctl.h> | ||
36 | #include <linux/random.h> | ||
37 | #include <linux/blkdev.h> | ||
38 | #include <linux/socket.h> | ||
39 | #include <linux/inet.h> | ||
40 | #include <linux/spinlock.h> | ||
41 | |||
42 | |||
43 | #include "cluster/heartbeat.h" | ||
44 | #include "cluster/nodemanager.h" | ||
45 | #include "cluster/tcp.h" | ||
46 | |||
47 | #include "dlmapi.h" | ||
48 | #include "dlmcommon.h" | ||
49 | |||
50 | #include "dlmconvert.h" | ||
51 | |||
52 | #define MLOG_MASK_PREFIX ML_DLM | ||
53 | #include "cluster/masklog.h" | ||
54 | |||
55 | /* NOTE: __dlmconvert_master is the only function in here that | ||
56 | * needs a spinlock held on entry (res->spinlock) and it is the | ||
57 | * only one that holds a lock on exit (res->spinlock). | ||
58 | * All other functions in here need no locks and drop all of | ||
59 | * the locks that they acquire. */ | ||
60 | static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, | ||
61 | struct dlm_lock_resource *res, | ||
62 | struct dlm_lock *lock, int flags, | ||
63 | int type, int *call_ast, | ||
64 | int *kick_thread); | ||
65 | static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, | ||
66 | struct dlm_lock_resource *res, | ||
67 | struct dlm_lock *lock, int flags, int type); | ||
68 | |||
69 | /* | ||
70 | * this is only called directly by dlmlock(), and only when the | ||
71 | * local node is the owner of the lockres | ||
72 | * locking: | ||
73 | * caller needs: none | ||
74 | * taken: takes and drops res->spinlock | ||
75 | * held on exit: none | ||
76 | * returns: see __dlmconvert_master | ||
77 | */ | ||
78 | enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm, | ||
79 | struct dlm_lock_resource *res, | ||
80 | struct dlm_lock *lock, int flags, int type) | ||
81 | { | ||
82 | int call_ast = 0, kick_thread = 0; | ||
83 | enum dlm_status status; | ||
84 | |||
85 | spin_lock(&res->spinlock); | ||
86 | /* we are not in a network handler, this is fine */ | ||
87 | __dlm_wait_on_lockres(res); | ||
88 | __dlm_lockres_reserve_ast(res); | ||
89 | res->state |= DLM_LOCK_RES_IN_PROGRESS; | ||
90 | |||
91 | status = __dlmconvert_master(dlm, res, lock, flags, type, | ||
92 | &call_ast, &kick_thread); | ||
93 | |||
94 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; | ||
95 | spin_unlock(&res->spinlock); | ||
96 | wake_up(&res->wq); | ||
97 | if (status != DLM_NORMAL && status != DLM_NOTQUEUED) | ||
98 | dlm_error(status); | ||
99 | |||
100 | /* either queue the ast or release it */ | ||
101 | if (call_ast) | ||
102 | dlm_queue_ast(dlm, lock); | ||
103 | else | ||
104 | dlm_lockres_release_ast(dlm, res); | ||
105 | |||
106 | if (kick_thread) | ||
107 | dlm_kick_thread(dlm, res); | ||
108 | |||
109 | return status; | ||
110 | } | ||
111 | |||
112 | /* performs lock conversion at the lockres master site | ||
113 | * locking: | ||
114 | * caller needs: res->spinlock | ||
115 | * taken: takes and drops lock->spinlock | ||
116 | * held on exit: res->spinlock | ||
117 | * returns: DLM_NORMAL, DLM_NOTQUEUED, DLM_DENIED | ||
118 | * call_ast: whether ast should be called for this lock | ||
119 | * kick_thread: whether dlm_kick_thread should be called | ||
120 | */ | ||
121 | static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, | ||
122 | struct dlm_lock_resource *res, | ||
123 | struct dlm_lock *lock, int flags, | ||
124 | int type, int *call_ast, | ||
125 | int *kick_thread) | ||
126 | { | ||
127 | enum dlm_status status = DLM_NORMAL; | ||
128 | struct list_head *iter; | ||
129 | struct dlm_lock *tmplock=NULL; | ||
130 | |||
131 | assert_spin_locked(&res->spinlock); | ||
132 | |||
133 | mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n", | ||
134 | lock->ml.type, lock->ml.convert_type, type); | ||
135 | |||
136 | spin_lock(&lock->spinlock); | ||
137 | |||
138 | /* already converting? */ | ||
139 | if (lock->ml.convert_type != LKM_IVMODE) { | ||
140 | mlog(ML_ERROR, "attempted to convert a lock with a lock " | ||
141 | "conversion pending\n"); | ||
142 | status = DLM_DENIED; | ||
143 | goto unlock_exit; | ||
144 | } | ||
145 | |||
146 | /* must be on grant queue to convert */ | ||
147 | if (!dlm_lock_on_list(&res->granted, lock)) { | ||
148 | mlog(ML_ERROR, "attempted to convert a lock not on grant " | ||
149 | "queue\n"); | ||
150 | status = DLM_DENIED; | ||
151 | goto unlock_exit; | ||
152 | } | ||
153 | |||
154 | if (flags & LKM_VALBLK) { | ||
155 | switch (lock->ml.type) { | ||
156 | case LKM_EXMODE: | ||
157 | /* EX + LKM_VALBLK + convert == set lvb */ | ||
158 | mlog(0, "will set lvb: converting %s->%s\n", | ||
159 | dlm_lock_mode_name(lock->ml.type), | ||
160 | dlm_lock_mode_name(type)); | ||
161 | lock->lksb->flags |= DLM_LKSB_PUT_LVB; | ||
162 | break; | ||
163 | case LKM_PRMODE: | ||
164 | case LKM_NLMODE: | ||
165 | /* refetch if new level is not NL */ | ||
166 | if (type > LKM_NLMODE) { | ||
167 | mlog(0, "will fetch new value into " | ||
168 | "lvb: converting %s->%s\n", | ||
169 | dlm_lock_mode_name(lock->ml.type), | ||
170 | dlm_lock_mode_name(type)); | ||
171 | lock->lksb->flags |= DLM_LKSB_GET_LVB; | ||
172 | } else { | ||
173 | mlog(0, "will NOT fetch new value " | ||
174 | "into lvb: converting %s->%s\n", | ||
175 | dlm_lock_mode_name(lock->ml.type), | ||
176 | dlm_lock_mode_name(type)); | ||
177 | flags &= ~(LKM_VALBLK); | ||
178 | } | ||
179 | break; | ||
180 | } | ||
181 | } | ||
182 | |||
183 | |||
184 | /* in-place downconvert? */ | ||
185 | if (type <= lock->ml.type) | ||
186 | goto grant; | ||
187 | |||
188 | /* upconvert from here on */ | ||
189 | status = DLM_NORMAL; | ||
190 | list_for_each(iter, &res->granted) { | ||
191 | tmplock = list_entry(iter, struct dlm_lock, list); | ||
192 | if (tmplock == lock) | ||
193 | continue; | ||
194 | if (!dlm_lock_compatible(tmplock->ml.type, type)) | ||
195 | goto switch_queues; | ||
196 | } | ||
197 | |||
198 | list_for_each(iter, &res->converting) { | ||
199 | tmplock = list_entry(iter, struct dlm_lock, list); | ||
200 | if (!dlm_lock_compatible(tmplock->ml.type, type)) | ||
201 | goto switch_queues; | ||
202 | /* existing conversion requests take precedence */ | ||
203 | if (!dlm_lock_compatible(tmplock->ml.convert_type, type)) | ||
204 | goto switch_queues; | ||
205 | } | ||
206 | |||
207 | /* fall thru to grant */ | ||
208 | |||
209 | grant: | ||
210 | mlog(0, "res %.*s, granting %s lock\n", res->lockname.len, | ||
211 | res->lockname.name, dlm_lock_mode_name(type)); | ||
212 | /* immediately grant the new lock type */ | ||
213 | lock->lksb->status = DLM_NORMAL; | ||
214 | if (lock->ml.node == dlm->node_num) | ||
215 | mlog(0, "doing in-place convert for local lock\n"); | ||
216 | lock->ml.type = type; | ||
217 | status = DLM_NORMAL; | ||
218 | *call_ast = 1; | ||
219 | goto unlock_exit; | ||
220 | |||
221 | switch_queues: | ||
222 | if (flags & LKM_NOQUEUE) { | ||
223 | mlog(0, "failed to convert NOQUEUE lock %.*s from " | ||
224 | "%d to %d...\n", res->lockname.len, res->lockname.name, | ||
225 | lock->ml.type, type); | ||
226 | status = DLM_NOTQUEUED; | ||
227 | goto unlock_exit; | ||
228 | } | ||
229 | mlog(0, "res %.*s, queueing...\n", res->lockname.len, | ||
230 | res->lockname.name); | ||
231 | |||
232 | lock->ml.convert_type = type; | ||
233 | /* do not alter lock refcount. switching lists. */ | ||
234 | list_del_init(&lock->list); | ||
235 | list_add_tail(&lock->list, &res->converting); | ||
236 | |||
237 | unlock_exit: | ||
238 | spin_unlock(&lock->spinlock); | ||
239 | if (status == DLM_DENIED) { | ||
240 | __dlm_print_one_lock_resource(res); | ||
241 | } | ||
242 | if (status == DLM_NORMAL) | ||
243 | *kick_thread = 1; | ||
244 | return status; | ||
245 | } | ||
246 | |||
247 | void dlm_revert_pending_convert(struct dlm_lock_resource *res, | ||
248 | struct dlm_lock *lock) | ||
249 | { | ||
250 | /* do not alter lock refcount. switching lists. */ | ||
251 | list_del_init(&lock->list); | ||
252 | list_add_tail(&lock->list, &res->granted); | ||
253 | lock->ml.convert_type = LKM_IVMODE; | ||
254 | lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); | ||
255 | } | ||
256 | |||
257 | /* messages the master site to do lock conversion | ||
258 | * locking: | ||
259 | * caller needs: none | ||
260 | * taken: takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS | ||
261 | * held on exit: none | ||
262 | * returns: DLM_NORMAL, DLM_RECOVERING, status from remote node | ||
263 | */ | ||
264 | enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, | ||
265 | struct dlm_lock_resource *res, | ||
266 | struct dlm_lock *lock, int flags, int type) | ||
267 | { | ||
268 | enum dlm_status status; | ||
269 | |||
270 | mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, | ||
271 | lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); | ||
272 | |||
273 | spin_lock(&res->spinlock); | ||
274 | if (res->state & DLM_LOCK_RES_RECOVERING) { | ||
275 | mlog(0, "bailing out early since res is RECOVERING " | ||
276 | "on secondary queue\n"); | ||
277 | /* __dlm_print_one_lock_resource(res); */ | ||
278 | status = DLM_RECOVERING; | ||
279 | goto bail; | ||
280 | } | ||
281 | /* will exit this call with spinlock held */ | ||
282 | __dlm_wait_on_lockres(res); | ||
283 | |||
284 | if (lock->ml.convert_type != LKM_IVMODE) { | ||
285 | __dlm_print_one_lock_resource(res); | ||
286 | mlog(ML_ERROR, "converting a remote lock that is already " | ||
287 | "converting! (cookie=%"MLFu64", conv=%d)\n", | ||
288 | lock->ml.cookie, lock->ml.convert_type); | ||
289 | status = DLM_DENIED; | ||
290 | goto bail; | ||
291 | } | ||
292 | res->state |= DLM_LOCK_RES_IN_PROGRESS; | ||
293 | /* move lock to local convert queue */ | ||
294 | /* do not alter lock refcount. switching lists. */ | ||
295 | list_del_init(&lock->list); | ||
296 | list_add_tail(&lock->list, &res->converting); | ||
297 | lock->convert_pending = 1; | ||
298 | lock->ml.convert_type = type; | ||
299 | |||
300 | if (flags & LKM_VALBLK) { | ||
301 | if (lock->ml.type == LKM_EXMODE) { | ||
302 | flags |= LKM_PUT_LVB; | ||
303 | lock->lksb->flags |= DLM_LKSB_PUT_LVB; | ||
304 | } else { | ||
305 | if (lock->ml.convert_type == LKM_NLMODE) | ||
306 | flags &= ~LKM_VALBLK; | ||
307 | else { | ||
308 | flags |= LKM_GET_LVB; | ||
309 | lock->lksb->flags |= DLM_LKSB_GET_LVB; | ||
310 | } | ||
311 | } | ||
312 | } | ||
313 | spin_unlock(&res->spinlock); | ||
314 | |||
315 | /* no locks held here. | ||
316 | * need to wait for a reply as to whether it got queued or not. */ | ||
317 | status = dlm_send_remote_convert_request(dlm, res, lock, flags, type); | ||
318 | |||
319 | spin_lock(&res->spinlock); | ||
320 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; | ||
321 | lock->convert_pending = 0; | ||
322 | /* if it failed, move it back to granted queue */ | ||
323 | if (status != DLM_NORMAL) { | ||
324 | if (status != DLM_NOTQUEUED) | ||
325 | dlm_error(status); | ||
326 | dlm_revert_pending_convert(res, lock); | ||
327 | } | ||
328 | bail: | ||
329 | spin_unlock(&res->spinlock); | ||
330 | |||
331 | /* TODO: should this be a wake_one? */ | ||
332 | /* wake up any IN_PROGRESS waiters */ | ||
333 | wake_up(&res->wq); | ||
334 | |||
335 | return status; | ||
336 | } | ||
337 | |||
338 | /* sends DLM_CONVERT_LOCK_MSG to master site | ||
339 | * locking: | ||
340 | * caller needs: none | ||
341 | * taken: none | ||
342 | * held on exit: none | ||
343 | * returns: DLM_NOLOCKMGR, status from remote node | ||
344 | */ | ||
345 | static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, | ||
346 | struct dlm_lock_resource *res, | ||
347 | struct dlm_lock *lock, int flags, int type) | ||
348 | { | ||
349 | struct dlm_convert_lock convert; | ||
350 | int tmpret; | ||
351 | enum dlm_status ret; | ||
352 | int status = 0; | ||
353 | struct kvec vec[2]; | ||
354 | size_t veclen = 1; | ||
355 | |||
356 | mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); | ||
357 | |||
358 | memset(&convert, 0, sizeof(struct dlm_convert_lock)); | ||
359 | convert.node_idx = dlm->node_num; | ||
360 | convert.requested_type = type; | ||
361 | convert.cookie = lock->ml.cookie; | ||
362 | convert.namelen = res->lockname.len; | ||
363 | convert.flags = cpu_to_be32(flags); | ||
364 | memcpy(convert.name, res->lockname.name, convert.namelen); | ||
365 | |||
366 | vec[0].iov_len = sizeof(struct dlm_convert_lock); | ||
367 | vec[0].iov_base = &convert; | ||
368 | |||
369 | if (flags & LKM_PUT_LVB) { | ||
370 | /* extra data to send if we are updating lvb */ | ||
371 | vec[1].iov_len = DLM_LVB_LEN; | ||
372 | vec[1].iov_base = lock->lksb->lvb; | ||
373 | veclen++; | ||
374 | } | ||
375 | |||
376 | tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key, | ||
377 | vec, veclen, res->owner, &status); | ||
378 | if (tmpret >= 0) { | ||
379 | // successfully sent and received | ||
380 | ret = status; // this is already a dlm_status | ||
381 | if (ret == DLM_RECOVERING) { | ||
382 | mlog(0, "node %u returned DLM_RECOVERING from convert " | ||
383 | "message!\n", res->owner); | ||
384 | } else if (ret == DLM_MIGRATING) { | ||
385 | mlog(0, "node %u returned DLM_MIGRATING from convert " | ||
386 | "message!\n", res->owner); | ||
387 | } else if (ret == DLM_FORWARD) { | ||
388 | mlog(0, "node %u returned DLM_FORWARD from convert " | ||
389 | "message!\n", res->owner); | ||
390 | } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED) | ||
391 | dlm_error(ret); | ||
392 | } else { | ||
393 | mlog_errno(tmpret); | ||
394 | if (dlm_is_host_down(tmpret)) { | ||
395 | ret = DLM_RECOVERING; | ||
396 | mlog(0, "node %u died so returning DLM_RECOVERING " | ||
397 | "from convert message!\n", res->owner); | ||
398 | } else { | ||
399 | ret = dlm_err_to_dlm_status(tmpret); | ||
400 | } | ||
401 | } | ||
402 | |||
403 | return ret; | ||
404 | } | ||
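
The sender packs the fixed-size convert header and the optional LVB payload into one kvec so both go out as a single message with no intermediate copy. A userspace sketch of the same optional-second-segment idea using writev(2); the header layout and names are illustrative, not the wire format above:

#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

struct convert_hdr {
	unsigned char node_idx;
	unsigned char requested_type;
};

static ssize_t send_convert(int fd, const struct convert_hdr *hdr,
			    const char *lvb, size_t lvb_len)
{
	struct iovec vec[2];
	int veclen = 1;

	vec[0].iov_base = (void *)hdr;
	vec[0].iov_len = sizeof(*hdr);
	if (lvb) {
		/* second segment only when the caller updates the LVB */
		vec[1].iov_base = (void *)lvb;
		vec[1].iov_len = lvb_len;
		veclen++;
	}
	return writev(fd, vec, veclen);	/* both segments, one syscall */
}

int main(void)
{
	struct convert_hdr hdr = { .node_idx = 1, .requested_type = 3 };
	char lvb[64];

	memset(lvb, 'x', sizeof(lvb));
	return send_convert(STDOUT_FILENO, &hdr, lvb, sizeof(lvb)) < 0;
}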
405 | |||
406 | /* handler for DLM_CONVERT_LOCK_MSG on master site | ||
407 | * locking: | ||
408 | * caller needs: none | ||
409 | * taken: takes and drops res->spinlock | ||
410 | * held on exit: none | ||
411 | * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS, | ||
412 | * status from __dlmconvert_master | ||
413 | */ | ||
414 | int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) | ||
415 | { | ||
416 | struct dlm_ctxt *dlm = data; | ||
417 | struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; | ||
418 | struct dlm_lock_resource *res = NULL; | ||
419 | struct list_head *iter; | ||
420 | struct dlm_lock *lock = NULL; | ||
421 | struct dlm_lockstatus *lksb; | ||
422 | enum dlm_status status = DLM_NORMAL; | ||
423 | u32 flags; | ||
424 | int call_ast = 0, kick_thread = 0; | ||
425 | |||
426 | if (!dlm_grab(dlm)) { | ||
427 | dlm_error(DLM_REJECTED); | ||
428 | return DLM_REJECTED; | ||
429 | } | ||
430 | |||
431 | mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), | ||
432 | "Domain %s not fully joined!\n", dlm->name); | ||
433 | |||
434 | if (cnv->namelen > DLM_LOCKID_NAME_MAX) { | ||
435 | status = DLM_IVBUFLEN; | ||
436 | dlm_error(status); | ||
437 | goto leave; | ||
438 | } | ||
439 | |||
440 | flags = be32_to_cpu(cnv->flags); | ||
441 | |||
442 | if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == | ||
443 | (LKM_PUT_LVB|LKM_GET_LVB)) { | ||
444 | mlog(ML_ERROR, "both PUT and GET lvb specified\n"); | ||
445 | status = DLM_BADARGS; | ||
446 | goto leave; | ||
447 | } | ||
448 | |||
449 | mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : | ||
450 | (flags & LKM_GET_LVB ? "get lvb" : "none")); | ||
451 | |||
452 | status = DLM_IVLOCKID; | ||
453 | res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen); | ||
454 | if (!res) { | ||
455 | dlm_error(status); | ||
456 | goto leave; | ||
457 | } | ||
458 | |||
459 | spin_lock(&res->spinlock); | ||
460 | list_for_each(iter, &res->granted) { | ||
461 | lock = list_entry(iter, struct dlm_lock, list); | ||
462 | if (lock->ml.cookie == cnv->cookie && | ||
463 | lock->ml.node == cnv->node_idx) { | ||
464 | dlm_lock_get(lock); | ||
465 | break; | ||
466 | } | ||
467 | lock = NULL; | ||
468 | } | ||
469 | spin_unlock(&res->spinlock); | ||
470 | if (!lock) { | ||
471 | status = DLM_IVLOCKID; | ||
472 | dlm_error(status); | ||
473 | goto leave; | ||
474 | } | ||
475 | |||
476 | /* found the lock */ | ||
477 | lksb = lock->lksb; | ||
478 | |||
479 | /* see if caller needed to get/put lvb */ | ||
480 | if (flags & LKM_PUT_LVB) { | ||
481 | BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); | ||
482 | lksb->flags |= DLM_LKSB_PUT_LVB; | ||
483 | memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN); | ||
484 | } else if (flags & LKM_GET_LVB) { | ||
485 | BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); | ||
486 | lksb->flags |= DLM_LKSB_GET_LVB; | ||
487 | } | ||
488 | |||
489 | spin_lock(&res->spinlock); | ||
490 | status = __dlm_lockres_state_to_status(res); | ||
491 | if (status == DLM_NORMAL) { | ||
492 | __dlm_lockres_reserve_ast(res); | ||
493 | res->state |= DLM_LOCK_RES_IN_PROGRESS; | ||
494 | status = __dlmconvert_master(dlm, res, lock, flags, | ||
495 | cnv->requested_type, | ||
496 | &call_ast, &kick_thread); | ||
497 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; | ||
498 | } | ||
499 | spin_unlock(&res->spinlock); | ||
500 | |||
501 | if (status != DLM_NORMAL) { | ||
502 | if (status != DLM_NOTQUEUED) | ||
503 | dlm_error(status); | ||
504 | lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); | ||
505 | } | ||
506 | |||
507 | leave: | ||
508 | if (!lock) | ||
509 | mlog(ML_ERROR, "did not find lock to convert on grant queue! " | ||
510 | "cookie=%"MLFu64"\n", | ||
511 | cnv->cookie); | ||
512 | else | ||
513 | dlm_lock_put(lock); | ||
514 | |||
515 | /* either queue the ast or release it; res may be NULL here */ | ||
516 | if (call_ast) | ||
517 | dlm_queue_ast(dlm, lock); | ||
518 | else if (res) | ||
519 | dlm_lockres_release_ast(dlm, res); | ||
520 | |||
521 | if (kick_thread) | ||
522 | dlm_kick_thread(dlm, res); | ||
523 | |||
524 | if (res) | ||
525 | dlm_lockres_put(res); | ||
526 | |||
527 | dlm_put(dlm); | ||
528 | |||
529 | return status; | ||
530 | } | ||
diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h new file mode 100644 index 000000000000..b2e3677df878 --- /dev/null +++ b/fs/ocfs2/dlm/dlmconvert.h | |||
@@ -0,0 +1,35 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmconvert.h | ||
5 | * | ||
6 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public | ||
10 | * License as published by the Free Software Foundation; either | ||
11 | * version 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
16 | * General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public | ||
19 | * License along with this program; if not, write to the | ||
20 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
21 | * Boston, MA 02111-1307, USA. | ||
22 | * | ||
23 | */ | ||
24 | |||
25 | #ifndef DLMCONVERT_H | ||
26 | #define DLMCONVERT_H | ||
27 | |||
28 | enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm, | ||
29 | struct dlm_lock_resource *res, | ||
30 | struct dlm_lock *lock, int flags, int type); | ||
31 | enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, | ||
32 | struct dlm_lock_resource *res, | ||
33 | struct dlm_lock *lock, int flags, int type); | ||
34 | |||
35 | #endif | ||
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c new file mode 100644 index 000000000000..f339fe27975a --- /dev/null +++ b/fs/ocfs2/dlm/dlmdebug.c | |||
@@ -0,0 +1,246 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmdebug.c | ||
5 | * | ||
6 | * debug functionality for the dlm | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/utsname.h> | ||
31 | #include <linux/sysctl.h> | ||
32 | #include <linux/spinlock.h> | ||
33 | |||
34 | #include "cluster/heartbeat.h" | ||
35 | #include "cluster/nodemanager.h" | ||
36 | #include "cluster/tcp.h" | ||
37 | |||
38 | #include "dlmapi.h" | ||
39 | #include "dlmcommon.h" | ||
40 | #include "dlmdebug.h" | ||
41 | |||
42 | #include "dlmdomain.h" | ||
43 | #include "dlmdebug.h" | ||
44 | |||
45 | #define MLOG_MASK_PREFIX ML_DLM | ||
46 | #include "cluster/masklog.h" | ||
47 | |||
48 | void dlm_print_one_lock_resource(struct dlm_lock_resource *res) | ||
49 | { | ||
50 | mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n", | ||
51 | res->lockname.len, res->lockname.name, | ||
52 | res->owner, res->state); | ||
53 | spin_lock(&res->spinlock); | ||
54 | __dlm_print_one_lock_resource(res); | ||
55 | spin_unlock(&res->spinlock); | ||
56 | } | ||
57 | |||
58 | void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) | ||
59 | { | ||
60 | struct list_head *iter2; | ||
61 | struct dlm_lock *lock; | ||
62 | |||
63 | assert_spin_locked(&res->spinlock); | ||
64 | |||
65 | mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n", | ||
66 | res->lockname.len, res->lockname.name, | ||
67 | res->owner, res->state); | ||
68 | mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n", | ||
69 | res->last_used, list_empty(&res->purge) ? "no" : "yes"); | ||
70 | mlog(ML_NOTICE, " granted queue: \n"); | ||
71 | list_for_each(iter2, &res->granted) { | ||
72 | lock = list_entry(iter2, struct dlm_lock, list); | ||
73 | spin_lock(&lock->spinlock); | ||
74 | mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " | ||
75 | "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", | ||
76 | lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie, | ||
77 | list_empty(&lock->ast_list) ? 'y' : 'n', | ||
78 | lock->ast_pending ? 'y' : 'n', | ||
79 | list_empty(&lock->bast_list) ? 'y' : 'n', | ||
80 | lock->bast_pending ? 'y' : 'n'); | ||
81 | spin_unlock(&lock->spinlock); | ||
82 | } | ||
83 | mlog(ML_NOTICE, " converting queue: \n"); | ||
84 | list_for_each(iter2, &res->converting) { | ||
85 | lock = list_entry(iter2, struct dlm_lock, list); | ||
86 | spin_lock(&lock->spinlock); | ||
87 | mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " | ||
88 | "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", | ||
89 | lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie, | ||
90 | list_empty(&lock->ast_list) ? 'y' : 'n', | ||
91 | lock->ast_pending ? 'y' : 'n', | ||
92 | list_empty(&lock->bast_list) ? 'y' : 'n', | ||
93 | lock->bast_pending ? 'y' : 'n'); | ||
94 | spin_unlock(&lock->spinlock); | ||
95 | } | ||
96 | mlog(ML_NOTICE, " blocked queue: \n"); | ||
97 | list_for_each(iter2, &res->blocked) { | ||
98 | lock = list_entry(iter2, struct dlm_lock, list); | ||
99 | spin_lock(&lock->spinlock); | ||
100 | mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " | ||
101 | "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", | ||
102 | lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie, | ||
103 | list_empty(&lock->ast_list) ? 'y' : 'n', | ||
104 | lock->ast_pending ? 'y' : 'n', | ||
105 | list_empty(&lock->bast_list) ? 'y' : 'n', | ||
106 | lock->bast_pending ? 'y' : 'n'); | ||
107 | spin_unlock(&lock->spinlock); | ||
108 | } | ||
109 | } | ||
110 | |||
111 | void dlm_print_one_lock(struct dlm_lock *lockid) | ||
112 | { | ||
113 | dlm_print_one_lock_resource(lockid->lockres); | ||
114 | } | ||
115 | EXPORT_SYMBOL_GPL(dlm_print_one_lock); | ||
116 | |||
117 | void dlm_dump_lock_resources(struct dlm_ctxt *dlm) | ||
118 | { | ||
119 | struct dlm_lock_resource *res; | ||
120 | struct list_head *iter; | ||
121 | struct list_head *bucket; | ||
122 | int i; | ||
123 | |||
124 | if (!dlm || !dlm->name) { | ||
125 | mlog(ML_ERROR, "dlm=%p\n", dlm); | ||
126 | return; | ||
127 | } | ||
128 | mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n", | ||
129 | dlm->name, dlm->node_num, dlm->key); | ||
130 | |||
131 | spin_lock(&dlm->spinlock); | ||
132 | for (i=0; i<DLM_HASH_SIZE; i++) { | ||
133 | bucket = &(dlm->resources[i]); | ||
134 | list_for_each(iter, bucket) { | ||
135 | res = list_entry(iter, struct dlm_lock_resource, list); | ||
136 | dlm_print_one_lock_resource(res); | ||
137 | } | ||
138 | } | ||
139 | spin_unlock(&dlm->spinlock); | ||
140 | } | ||
141 | |||
142 | static const char *dlm_errnames[] = { | ||
143 | [DLM_NORMAL] = "DLM_NORMAL", | ||
144 | [DLM_GRANTED] = "DLM_GRANTED", | ||
145 | [DLM_DENIED] = "DLM_DENIED", | ||
146 | [DLM_DENIED_NOLOCKS] = "DLM_DENIED_NOLOCKS", | ||
147 | [DLM_WORKING] = "DLM_WORKING", | ||
148 | [DLM_BLOCKED] = "DLM_BLOCKED", | ||
149 | [DLM_BLOCKED_ORPHAN] = "DLM_BLOCKED_ORPHAN", | ||
150 | [DLM_DENIED_GRACE_PERIOD] = "DLM_DENIED_GRACE_PERIOD", | ||
151 | [DLM_SYSERR] = "DLM_SYSERR", | ||
152 | [DLM_NOSUPPORT] = "DLM_NOSUPPORT", | ||
153 | [DLM_CANCELGRANT] = "DLM_CANCELGRANT", | ||
154 | [DLM_IVLOCKID] = "DLM_IVLOCKID", | ||
155 | [DLM_SYNC] = "DLM_SYNC", | ||
156 | [DLM_BADTYPE] = "DLM_BADTYPE", | ||
157 | [DLM_BADRESOURCE] = "DLM_BADRESOURCE", | ||
158 | [DLM_MAXHANDLES] = "DLM_MAXHANDLES", | ||
159 | [DLM_NOCLINFO] = "DLM_NOCLINFO", | ||
160 | [DLM_NOLOCKMGR] = "DLM_NOLOCKMGR", | ||
161 | [DLM_NOPURGED] = "DLM_NOPURGED", | ||
162 | [DLM_BADARGS] = "DLM_BADARGS", | ||
163 | [DLM_VOID] = "DLM_VOID", | ||
164 | [DLM_NOTQUEUED] = "DLM_NOTQUEUED", | ||
165 | [DLM_IVBUFLEN] = "DLM_IVBUFLEN", | ||
166 | [DLM_CVTUNGRANT] = "DLM_CVTUNGRANT", | ||
167 | [DLM_BADPARAM] = "DLM_BADPARAM", | ||
168 | [DLM_VALNOTVALID] = "DLM_VALNOTVALID", | ||
169 | [DLM_REJECTED] = "DLM_REJECTED", | ||
170 | [DLM_ABORT] = "DLM_ABORT", | ||
171 | [DLM_CANCEL] = "DLM_CANCEL", | ||
172 | [DLM_IVRESHANDLE] = "DLM_IVRESHANDLE", | ||
173 | [DLM_DEADLOCK] = "DLM_DEADLOCK", | ||
174 | [DLM_DENIED_NOASTS] = "DLM_DENIED_NOASTS", | ||
175 | [DLM_FORWARD] = "DLM_FORWARD", | ||
176 | [DLM_TIMEOUT] = "DLM_TIMEOUT", | ||
177 | [DLM_IVGROUPID] = "DLM_IVGROUPID", | ||
178 | [DLM_VERS_CONFLICT] = "DLM_VERS_CONFLICT", | ||
179 | [DLM_BAD_DEVICE_PATH] = "DLM_BAD_DEVICE_PATH", | ||
180 | [DLM_NO_DEVICE_PERMISSION] = "DLM_NO_DEVICE_PERMISSION", | ||
181 | [DLM_NO_CONTROL_DEVICE] = "DLM_NO_CONTROL_DEVICE", | ||
182 | [DLM_RECOVERING] = "DLM_RECOVERING", | ||
183 | [DLM_MIGRATING] = "DLM_MIGRATING", | ||
184 | [DLM_MAXSTATS] = "DLM_MAXSTATS", | ||
185 | }; | ||
186 | |||
187 | static const char *dlm_errmsgs[] = { | ||
188 | [DLM_NORMAL] = "request in progress", | ||
189 | [DLM_GRANTED] = "request granted", | ||
190 | [DLM_DENIED] = "request denied", | ||
191 | [DLM_DENIED_NOLOCKS] = "request denied, out of system resources", | ||
192 | [DLM_WORKING] = "async request in progress", | ||
193 | [DLM_BLOCKED] = "lock request blocked", | ||
194 | [DLM_BLOCKED_ORPHAN] = "lock request blocked by an orphan lock", | ||
195 | [DLM_DENIED_GRACE_PERIOD] = "topological change in progress", | ||
196 | [DLM_SYSERR] = "system error", | ||
197 | [DLM_NOSUPPORT] = "unsupported", | ||
198 | [DLM_CANCELGRANT] = "can't cancel convert: already granted", | ||
199 | [DLM_IVLOCKID] = "bad lockid", | ||
200 | [DLM_SYNC] = "synchronous request granted", | ||
201 | [DLM_BADTYPE] = "bad resource type", | ||
202 | [DLM_BADRESOURCE] = "bad resource handle", | ||
203 | [DLM_MAXHANDLES] = "no more resource handles", | ||
204 | [DLM_NOCLINFO] = "can't contact cluster manager", | ||
205 | [DLM_NOLOCKMGR] = "can't contact lock manager", | ||
206 | [DLM_NOPURGED] = "can't contact purge daemon", | ||
207 | [DLM_BADARGS] = "bad api args", | ||
208 | [DLM_VOID] = "no status", | ||
209 | [DLM_NOTQUEUED] = "NOQUEUE was specified and request failed", | ||
210 | [DLM_IVBUFLEN] = "invalid resource name length", | ||
211 | [DLM_CVTUNGRANT] = "attempted to convert ungranted lock", | ||
212 | [DLM_BADPARAM] = "invalid lock mode specified", | ||
213 | [DLM_VALNOTVALID] = "value block has been invalidated", | ||
214 | [DLM_REJECTED] = "request rejected, unrecognized client", | ||
215 | [DLM_ABORT] = "blocked lock request cancelled", | ||
216 | [DLM_CANCEL] = "conversion request cancelled", | ||
217 | [DLM_IVRESHANDLE] = "invalid resource handle", | ||
218 | [DLM_DEADLOCK] = "deadlock recovery refused this request", | ||
219 | [DLM_DENIED_NOASTS] = "failed to allocate AST", | ||
220 | [DLM_FORWARD] = "request must wait for primary's response", | ||
221 | [DLM_TIMEOUT] = "timeout value for lock has expired", | ||
222 | [DLM_IVGROUPID] = "invalid group specification", | ||
223 | [DLM_VERS_CONFLICT] = "version conflicts prevent request handling", | ||
224 | [DLM_BAD_DEVICE_PATH] = "Locks device does not exist or path wrong", | ||
225 | [DLM_NO_DEVICE_PERMISSION] = "Client has insufficient perms for device", | ||
226 | [DLM_NO_CONTROL_DEVICE] = "Cannot set options on opened device", | ||
227 | [DLM_RECOVERING] = "lock resource being recovered", | ||
228 | [DLM_MIGRATING] = "lock resource being migrated", | ||
229 | [DLM_MAXSTATS] = "invalid error number", | ||
230 | }; | ||
231 | |||
232 | const char *dlm_errmsg(enum dlm_status err) | ||
233 | { | ||
234 | if (err >= DLM_MAXSTATS || err < 0) | ||
235 | return dlm_errmsgs[DLM_MAXSTATS]; | ||
236 | return dlm_errmsgs[err]; | ||
237 | } | ||
238 | EXPORT_SYMBOL_GPL(dlm_errmsg); | ||
239 | |||
240 | const char *dlm_errname(enum dlm_status err) | ||
241 | { | ||
242 | if (err >= DLM_MAXSTATS || err < 0) | ||
243 | return dlm_errnames[DLM_MAXSTATS]; | ||
244 | return dlm_errnames[err]; | ||
245 | } | ||
246 | EXPORT_SYMBOL_GPL(dlm_errname); | ||
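
Both tables above rely on designated initializers so that every dlm_status value indexes its own name and message, with DLM_MAXSTATS doubling as the out-of-range sentinel. A compact userspace sketch of the same lookup pattern, using a hypothetical three-entry enum:

#include <stdio.h>

enum status { ST_OK, ST_DENIED, ST_MAX };

static const char *names[] = {
	[ST_OK]     = "ST_OK",
	[ST_DENIED] = "ST_DENIED",
	[ST_MAX]    = "invalid status",	/* sentinel, like DLM_MAXSTATS */
};

static const char *errname(int s)
{
	if (s < 0 || s >= ST_MAX)
		return names[ST_MAX];	/* clamp bad values to the sentinel */
	return names[s];
}

int main(void)
{
	printf("%s\n", errname(ST_DENIED));	/* ST_DENIED */
	printf("%s\n", errname(42));		/* invalid status */
	return 0;
}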
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h new file mode 100644 index 000000000000..6858510c3ccd --- /dev/null +++ b/fs/ocfs2/dlm/dlmdebug.h | |||
@@ -0,0 +1,30 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmdebug.h | ||
5 | * | ||
6 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public | ||
10 | * License as published by the Free Software Foundation; either | ||
11 | * version 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
16 | * General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public | ||
19 | * License along with this program; if not, write to the | ||
20 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
21 | * Boston, MA 02111-1307, USA. | ||
22 | * | ||
23 | */ | ||
24 | |||
25 | #ifndef DLMDEBUG_H | ||
26 | #define DLMDEBUG_H | ||
27 | |||
28 | void dlm_dump_lock_resources(struct dlm_ctxt *dlm); | ||
29 | |||
30 | #endif | ||
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c new file mode 100644 index 000000000000..da3c22045f89 --- /dev/null +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -0,0 +1,1469 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmdomain.c | ||
5 | * | ||
6 | * defines domain join / leave apis | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | #include <linux/module.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/highmem.h> | ||
31 | #include <linux/utsname.h> | ||
32 | #include <linux/init.h> | ||
33 | #include <linux/spinlock.h> | ||
34 | #include <linux/delay.h> | ||
35 | #include <linux/err.h> | ||
36 | |||
37 | #include "cluster/heartbeat.h" | ||
38 | #include "cluster/nodemanager.h" | ||
39 | #include "cluster/tcp.h" | ||
40 | |||
41 | #include "dlmapi.h" | ||
42 | #include "dlmcommon.h" | ||
43 | |||
44 | #include "dlmdebug.h" | ||
45 | #include "dlmdomain.h" | ||
46 | |||
47 | #include "dlmver.h" | ||
48 | |||
49 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) | ||
50 | #include "cluster/masklog.h" | ||
51 | |||
52 | /* | ||
53 | * | ||
54 | * spinlock lock ordering: if multiple locks are needed, obey this ordering: | ||
55 | * dlm_domain_lock | ||
56 | * struct dlm_ctxt->spinlock | ||
57 | * struct dlm_lock_resource->spinlock | ||
58 | * struct dlm_ctxt->master_lock | ||
59 | * struct dlm_ctxt->ast_lock | ||
60 | * dlm_master_list_entry->spinlock | ||
61 | * dlm_lock->spinlock | ||
62 | * | ||
63 | */ | ||
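
A fixed acquisition order like this is what keeps nested spinlocks deadlock-free: any path that needs two of these locks must take the one listed higher first. A userspace sketch of the rule using pthread mutexes; all names are illustrative:

#include <pthread.h>
#include <stdio.h>

/* outer lock in the hierarchy: always taken first when both are needed */
static pthread_mutex_t domain_lock = PTHREAD_MUTEX_INITIALIZER;
/* inner lock: taken alone, or only while domain_lock is already held */
static pthread_mutex_t ctxt_lock = PTHREAD_MUTEX_INITIALIZER;

static void update_under_both(void)
{
	pthread_mutex_lock(&domain_lock);
	pthread_mutex_lock(&ctxt_lock);
	/* ... touch state protected by both locks ... */
	pthread_mutex_unlock(&ctxt_lock);
	pthread_mutex_unlock(&domain_lock);
}

int main(void)
{
	update_under_both();
	printf("done\n");
	return 0;
}

Because every thread orders the two locks the same way, no two threads can each hold one lock while waiting for the other.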
64 | |||
65 | spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED; | ||
66 | LIST_HEAD(dlm_domains); | ||
67 | static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); | ||
68 | |||
69 | #define DLM_DOMAIN_BACKOFF_MS 200 | ||
70 | |||
71 | static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data); | ||
72 | static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data); | ||
73 | static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data); | ||
74 | static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data); | ||
75 | |||
76 | static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); | ||
77 | |||
78 | void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) | ||
79 | { | ||
80 | list_del_init(&lockres->list); | ||
81 | dlm_lockres_put(lockres); | ||
82 | } | ||
83 | |||
84 | void __dlm_insert_lockres(struct dlm_ctxt *dlm, | ||
85 | struct dlm_lock_resource *res) | ||
86 | { | ||
87 | struct list_head *bucket; | ||
88 | struct qstr *q; | ||
89 | |||
90 | assert_spin_locked(&dlm->spinlock); | ||
91 | |||
92 | q = &res->lockname; | ||
93 | q->hash = full_name_hash(q->name, q->len); | ||
94 | bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]); | ||
95 | |||
96 | /* get a reference for our hashtable */ | ||
97 | dlm_lockres_get(res); | ||
98 | |||
99 | list_add_tail(&res->list, bucket); | ||
100 | } | ||
101 | |||
102 | struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, | ||
103 | const char *name, | ||
104 | unsigned int len) | ||
105 | { | ||
106 | unsigned int hash; | ||
107 | struct list_head *iter; | ||
108 | struct dlm_lock_resource *tmpres=NULL; | ||
109 | struct list_head *bucket; | ||
110 | |||
111 | mlog_entry("%.*s\n", len, name); | ||
112 | |||
113 | assert_spin_locked(&dlm->spinlock); | ||
114 | |||
115 | hash = full_name_hash(name, len); | ||
116 | |||
117 | bucket = &(dlm->resources[hash & DLM_HASH_MASK]); | ||
118 | |||
119 | /* check for pre-existing lock */ | ||
120 | list_for_each(iter, bucket) { | ||
121 | tmpres = list_entry(iter, struct dlm_lock_resource, list); | ||
122 | if (tmpres->lockname.len == len && | ||
123 | memcmp(tmpres->lockname.name, name, len) == 0) { | ||
124 | dlm_lockres_get(tmpres); | ||
125 | break; | ||
126 | } | ||
127 | |||
128 | tmpres = NULL; | ||
129 | } | ||
130 | return tmpres; | ||
131 | } | ||
132 | |||
133 | struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, | ||
134 | const char *name, | ||
135 | unsigned int len) | ||
136 | { | ||
137 | struct dlm_lock_resource *res; | ||
138 | |||
139 | spin_lock(&dlm->spinlock); | ||
140 | res = __dlm_lookup_lockres(dlm, name, len); | ||
141 | spin_unlock(&dlm->spinlock); | ||
142 | return res; | ||
143 | } | ||
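
Insert and lookup agree on bucket placement because both mask the same full_name_hash() of the lock name with DLM_HASH_MASK. A userspace sketch of that bucket selection; djb2 stands in for the kernel's full_name_hash(), and the table size is illustrative:

#include <stdio.h>
#include <string.h>

#define HASH_SIZE 256u			/* a power of two, like DLM_HASH_SIZE */
#define HASH_MASK (HASH_SIZE - 1)

static unsigned int name_hash(const char *name, unsigned int len)
{
	unsigned int h = 5381;

	while (len--)
		h = (h * 33) ^ (unsigned char)*name++;
	return h;
}

int main(void)
{
	const char *lockname = "M000000000000000abcd";	/* hypothetical name */
	unsigned int bucket;

	bucket = name_hash(lockname, (unsigned int)strlen(lockname)) & HASH_MASK;
	/* any later lookup of the same name lands on the same bucket */
	printf("lockres -> bucket %u of %u\n", bucket, HASH_SIZE);
	return 0;
}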
144 | |||
145 | static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len) | ||
146 | { | ||
147 | struct dlm_ctxt *tmp = NULL; | ||
148 | struct list_head *iter; | ||
149 | |||
150 | assert_spin_locked(&dlm_domain_lock); | ||
151 | |||
152 | /* tmp->name here is always NULL terminated, | ||
153 | * but domain may not be! */ | ||
154 | list_for_each(iter, &dlm_domains) { | ||
155 | tmp = list_entry (iter, struct dlm_ctxt, list); | ||
156 | if (strlen(tmp->name) == len && | ||
157 | memcmp(tmp->name, domain, len)==0) | ||
158 | break; | ||
159 | tmp = NULL; | ||
160 | } | ||
161 | |||
162 | return tmp; | ||
163 | } | ||
164 | |||
165 | /* For null terminated domain strings ONLY */ | ||
166 | static struct dlm_ctxt * __dlm_lookup_domain(const char *domain) | ||
167 | { | ||
168 | assert_spin_locked(&dlm_domain_lock); | ||
169 | |||
170 | return __dlm_lookup_domain_full(domain, strlen(domain)); | ||
171 | } | ||
172 | |||
173 | |||
174 | /* returns true on one of two conditions: | ||
175 | * 1) the domain does not exist | ||
176 | * 2) the domain exists and it's state is "joined" */ | ||
177 | static int dlm_wait_on_domain_helper(const char *domain) | ||
178 | { | ||
179 | int ret = 0; | ||
180 | struct dlm_ctxt *tmp = NULL; | ||
181 | |||
182 | spin_lock(&dlm_domain_lock); | ||
183 | |||
184 | tmp = __dlm_lookup_domain(domain); | ||
185 | if (!tmp) | ||
186 | ret = 1; | ||
187 | else if (tmp->dlm_state == DLM_CTXT_JOINED) | ||
188 | ret = 1; | ||
189 | |||
190 | spin_unlock(&dlm_domain_lock); | ||
191 | return ret; | ||
192 | } | ||
193 | |||
194 | static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) | ||
195 | { | ||
196 | if (dlm->resources) | ||
197 | free_page((unsigned long) dlm->resources); | ||
198 | |||
199 | if (dlm->name) | ||
200 | kfree(dlm->name); | ||
201 | |||
202 | kfree(dlm); | ||
203 | } | ||
204 | |||
205 | /* A little strange - this function will be called while holding | ||
206 | * dlm_domain_lock and is expected to be holding it on the way out. We | ||
207 | * will however drop and reacquire it multiple times */ | ||
208 | static void dlm_ctxt_release(struct kref *kref) | ||
209 | { | ||
210 | struct dlm_ctxt *dlm; | ||
211 | |||
212 | dlm = container_of(kref, struct dlm_ctxt, dlm_refs); | ||
213 | |||
214 | BUG_ON(dlm->num_joins); | ||
215 | BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED); | ||
216 | |||
217 | /* we may still be in the list if we hit an error during join. */ | ||
218 | list_del_init(&dlm->list); | ||
219 | |||
220 | spin_unlock(&dlm_domain_lock); | ||
221 | |||
222 | mlog(0, "freeing memory from domain %s\n", dlm->name); | ||
223 | |||
224 | wake_up(&dlm_domain_events); | ||
225 | |||
226 | dlm_free_ctxt_mem(dlm); | ||
227 | |||
228 | spin_lock(&dlm_domain_lock); | ||
229 | } | ||
230 | |||
231 | void dlm_put(struct dlm_ctxt *dlm) | ||
232 | { | ||
233 | spin_lock(&dlm_domain_lock); | ||
234 | kref_put(&dlm->dlm_refs, dlm_ctxt_release); | ||
235 | spin_unlock(&dlm_domain_lock); | ||
236 | } | ||
237 | |||
238 | static void __dlm_get(struct dlm_ctxt *dlm) | ||
239 | { | ||
240 | kref_get(&dlm->dlm_refs); | ||
241 | } | ||
242 | |||
243 | /* given a questionable reference to a dlm object, gets a reference if | ||
244 | * it can find it in the list, otherwise returns NULL in which case | ||
245 | * you shouldn't trust your pointer. */ | ||
246 | struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm) | ||
247 | { | ||
248 | struct list_head *iter; | ||
249 | struct dlm_ctxt *target = NULL; | ||
250 | |||
251 | spin_lock(&dlm_domain_lock); | ||
252 | |||
253 | list_for_each(iter, &dlm_domains) { | ||
254 | target = list_entry (iter, struct dlm_ctxt, list); | ||
255 | |||
256 | if (target == dlm) { | ||
257 | __dlm_get(target); | ||
258 | break; | ||
259 | } | ||
260 | |||
261 | target = NULL; | ||
262 | } | ||
263 | |||
264 | spin_unlock(&dlm_domain_lock); | ||
265 | |||
266 | return target; | ||
267 | } | ||
268 | |||
269 | int dlm_domain_fully_joined(struct dlm_ctxt *dlm) | ||
270 | { | ||
271 | int ret; | ||
272 | |||
273 | spin_lock(&dlm_domain_lock); | ||
274 | ret = (dlm->dlm_state == DLM_CTXT_JOINED) || | ||
275 | (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN); | ||
276 | spin_unlock(&dlm_domain_lock); | ||
277 | |||
278 | return ret; | ||
279 | } | ||
280 | |||
281 | static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) | ||
282 | { | ||
283 | dlm_unregister_domain_handlers(dlm); | ||
284 | dlm_complete_thread(dlm); | ||
285 | dlm_complete_recovery_thread(dlm); | ||
286 | |||
287 | /* We've left the domain. Now we can take ourselves out of the | ||
288 | * list and allow the kref stuff to help us free the | ||
289 | * memory. */ | ||
290 | spin_lock(&dlm_domain_lock); | ||
291 | list_del_init(&dlm->list); | ||
292 | spin_unlock(&dlm_domain_lock); | ||
293 | |||
294 | /* Wake up anyone waiting for us to remove this domain */ | ||
295 | wake_up(&dlm_domain_events); | ||
296 | } | ||
297 | |||
298 | static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) | ||
299 | { | ||
300 | int i; | ||
301 | struct dlm_lock_resource *res; | ||
302 | |||
303 | mlog(0, "Migrating locks from domain %s\n", dlm->name); | ||
304 | restart: | ||
305 | spin_lock(&dlm->spinlock); | ||
306 | for (i=0; i<DLM_HASH_SIZE; i++) { | ||
307 | while (!list_empty(&dlm->resources[i])) { | ||
308 | res = list_entry(dlm->resources[i].next, | ||
309 | struct dlm_lock_resource, list); | ||
310 | /* need reference when manually grabbing lockres */ | ||
311 | dlm_lockres_get(res); | ||
312 | /* this should unhash the lockres | ||
313 | * and exit with dlm->spinlock */ | ||
314 | mlog(0, "purging res=%p\n", res); | ||
315 | if (dlm_lockres_is_dirty(dlm, res)) { | ||
316 | /* HACK! this should absolutely go. | ||
317 | * need to figure out why some empty | ||
318 | * lockreses are still marked dirty */ | ||
319 | mlog(ML_ERROR, "lockres %.*s dirty!\n", | ||
320 | res->lockname.len, res->lockname.name); | ||
321 | |||
322 | spin_unlock(&dlm->spinlock); | ||
323 | dlm_kick_thread(dlm, res); | ||
324 | wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); | ||
325 | dlm_lockres_put(res); | ||
326 | goto restart; | ||
327 | } | ||
328 | dlm_purge_lockres(dlm, res); | ||
329 | dlm_lockres_put(res); | ||
330 | } | ||
331 | } | ||
332 | spin_unlock(&dlm->spinlock); | ||
333 | |||
334 | mlog(0, "DONE Migrating locks from domain %s\n", dlm->name); | ||
335 | } | ||
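
dlm_migrate_all_locks() uses a classic drop-and-restart loop: it cannot sleep while holding dlm->spinlock, so when it must wait for a dirty lockres to be flushed it releases the lock, sleeps, and rescans the table from the top instead of trusting a stale iterator. A single-threaded userspace sketch of the shape of that loop; the dirty flags and the wait helper are stand-ins:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NBUCKETS 4

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static bool dirty[NBUCKETS] = { false, true, false, false };

/* stand-in for kicking the flusher thread and sleeping until it's done */
static void wait_until_flushed(int i)
{
	dirty[i] = false;
}

static void purge_all(void)
{
restart:
	pthread_mutex_lock(&table_lock);
	for (int i = 0; i < NBUCKETS; i++) {
		if (dirty[i]) {
			/* can't sleep with the lock held: drop it, wait,
			 * then rescan from the top because the table may
			 * have changed while we slept */
			pthread_mutex_unlock(&table_lock);
			wait_until_flushed(i);
			goto restart;
		}
		printf("purged bucket %d\n", i);
	}
	pthread_mutex_unlock(&table_lock);
}

int main(void)
{
	purge_all();
	return 0;
}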
336 | |||
337 | static int dlm_no_joining_node(struct dlm_ctxt *dlm) | ||
338 | { | ||
339 | int ret; | ||
340 | |||
341 | spin_lock(&dlm->spinlock); | ||
342 | ret = dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN; | ||
343 | spin_unlock(&dlm->spinlock); | ||
344 | |||
345 | return ret; | ||
346 | } | ||
347 | |||
348 | static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm) | ||
349 | { | ||
350 | /* Yikes, a double spinlock! I need domain_lock for the dlm | ||
351 | * state and the dlm spinlock for join state... Sorry! */ | ||
352 | again: | ||
353 | spin_lock(&dlm_domain_lock); | ||
354 | spin_lock(&dlm->spinlock); | ||
355 | |||
356 | if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
357 | mlog(0, "Node %d is joining, we wait on it.\n", | ||
358 | dlm->joining_node); | ||
359 | spin_unlock(&dlm->spinlock); | ||
360 | spin_unlock(&dlm_domain_lock); | ||
361 | |||
362 | wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm)); | ||
363 | goto again; | ||
364 | } | ||
365 | |||
366 | dlm->dlm_state = DLM_CTXT_LEAVING; | ||
367 | spin_unlock(&dlm->spinlock); | ||
368 | spin_unlock(&dlm_domain_lock); | ||
369 | } | ||
370 | |||
371 | static void __dlm_print_nodes(struct dlm_ctxt *dlm) | ||
372 | { | ||
373 | int node = -1; | ||
374 | |||
375 | assert_spin_locked(&dlm->spinlock); | ||
376 | |||
377 | mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name); | ||
378 | |||
379 | while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, | ||
380 | node + 1)) < O2NM_MAX_NODES) { | ||
381 | mlog(ML_NOTICE, " node %d\n", node); | ||
382 | } | ||
383 | } | ||
384 | |||
385 | static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data) | ||
386 | { | ||
387 | struct dlm_ctxt *dlm = data; | ||
388 | unsigned int node; | ||
389 | struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf; | ||
390 | |||
391 | mlog_entry("%p %u %p", msg, len, data); | ||
392 | |||
393 | if (!dlm_grab(dlm)) | ||
394 | return 0; | ||
395 | |||
396 | node = exit_msg->node_idx; | ||
397 | |||
398 | mlog(0, "Node %u leaves domain %s\n", node, dlm->name); | ||
399 | |||
400 | spin_lock(&dlm->spinlock); | ||
401 | clear_bit(node, dlm->domain_map); | ||
402 | __dlm_print_nodes(dlm); | ||
403 | |||
404 | /* notify anything attached to the heartbeat events */ | ||
405 | dlm_hb_event_notify_attached(dlm, node, 0); | ||
406 | |||
407 | spin_unlock(&dlm->spinlock); | ||
408 | |||
409 | dlm_put(dlm); | ||
410 | |||
411 | return 0; | ||
412 | } | ||
413 | |||
414 | static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, | ||
415 | unsigned int node) | ||
416 | { | ||
417 | int status; | ||
418 | struct dlm_exit_domain leave_msg; | ||
419 | |||
420 | mlog(0, "Telling node %u that we are leaving domain %s, me = %u\n", | ||
421 | node, dlm->name, dlm->node_num); | ||
422 | |||
423 | memset(&leave_msg, 0, sizeof(leave_msg)); | ||
424 | leave_msg.node_idx = dlm->node_num; | ||
425 | |||
426 | status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, | ||
427 | &leave_msg, sizeof(leave_msg), node, | ||
428 | NULL); | ||
429 | |||
430 | mlog(0, "status return %d from o2net_send_message\n", status); | ||
431 | |||
432 | return status; | ||
433 | } | ||
434 | |||
435 | |||
436 | static void dlm_leave_domain(struct dlm_ctxt *dlm) | ||
437 | { | ||
438 | int node, clear_node, status; | ||
439 | |||
440 | /* At this point we've migrated away all our locks and won't | ||
441 | * accept mastership of new ones. The dlm is responsible for | ||
442 | * almost nothing now. We make sure not to confuse any joining | ||
443 | * nodes and then commence shutdown procedure. */ | ||
444 | |||
445 | spin_lock(&dlm->spinlock); | ||
446 | /* Clear ourselves from the domain map */ | ||
447 | clear_bit(dlm->node_num, dlm->domain_map); | ||
448 | while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, | ||
449 | 0)) < O2NM_MAX_NODES) { | ||
450 | /* Drop the dlm spinlock. This is safe wrt the domain_map. | ||
451 | * -nodes cannot be added now as the | ||
452 | * query_join_handlers knows to respond with OK_NO_MAP | ||
453 | * -we catch the right network errors if a node is | ||
454 | * removed from the map while we're sending him the | ||
455 | * exit message. */ | ||
456 | spin_unlock(&dlm->spinlock); | ||
457 | |||
458 | clear_node = 1; | ||
459 | |||
460 | status = dlm_send_one_domain_exit(dlm, node); | ||
461 | if (status < 0 && | ||
462 | status != -ENOPROTOOPT && | ||
463 | status != -ENOTCONN) { | ||
464 | mlog(ML_NOTICE, "Error %d sending domain exit message " | ||
465 | "to node %d\n", status, node); | ||
466 | |||
467 | /* Not sure what to do here but lets sleep for | ||
468 | * a bit in case this was a transient | ||
469 | * error... */ | ||
470 | msleep(DLM_DOMAIN_BACKOFF_MS); | ||
471 | clear_node = 0; | ||
472 | } | ||
473 | |||
474 | spin_lock(&dlm->spinlock); | ||
475 | /* If we're not clearing the node bit then we intend | ||
476 | * to loop back around to try again. */ | ||
477 | if (clear_node) | ||
478 | clear_bit(node, dlm->domain_map); | ||
479 | } | ||
480 | spin_unlock(&dlm->spinlock); | ||
481 | } | ||
482 | |||
483 | int dlm_joined(struct dlm_ctxt *dlm) | ||
484 | { | ||
485 | int ret = 0; | ||
486 | |||
487 | spin_lock(&dlm_domain_lock); | ||
488 | |||
489 | if (dlm->dlm_state == DLM_CTXT_JOINED) | ||
490 | ret = 1; | ||
491 | |||
492 | spin_unlock(&dlm_domain_lock); | ||
493 | |||
494 | return ret; | ||
495 | } | ||
496 | |||
497 | int dlm_shutting_down(struct dlm_ctxt *dlm) | ||
498 | { | ||
499 | int ret = 0; | ||
500 | |||
501 | spin_lock(&dlm_domain_lock); | ||
502 | |||
503 | if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) | ||
504 | ret = 1; | ||
505 | |||
506 | spin_unlock(&dlm_domain_lock); | ||
507 | |||
508 | return ret; | ||
509 | } | ||
510 | |||
511 | void dlm_unregister_domain(struct dlm_ctxt *dlm) | ||
512 | { | ||
513 | int leave = 0; | ||
514 | |||
515 | spin_lock(&dlm_domain_lock); | ||
516 | BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); | ||
517 | BUG_ON(!dlm->num_joins); | ||
518 | |||
519 | dlm->num_joins--; | ||
520 | if (!dlm->num_joins) { | ||
521 | /* We mark it "in shutdown" now so new register | ||
522 | * requests wait until we've completely left the | ||
523 | * domain. Don't use DLM_CTXT_LEAVING yet as we still | ||
524 | * want new domain joins to communicate with us at | ||
525 | * least until we've completed migration of our | ||
526 | * resources. */ | ||
527 | dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN; | ||
528 | leave = 1; | ||
529 | } | ||
530 | spin_unlock(&dlm_domain_lock); | ||
531 | |||
532 | if (leave) { | ||
533 | mlog(0, "shutting down domain %s\n", dlm->name); | ||
534 | |||
535 | /* We changed dlm state, notify the thread */ | ||
536 | dlm_kick_thread(dlm, NULL); | ||
537 | |||
538 | dlm_migrate_all_locks(dlm); | ||
539 | dlm_mark_domain_leaving(dlm); | ||
540 | dlm_leave_domain(dlm); | ||
541 | dlm_complete_dlm_shutdown(dlm); | ||
542 | } | ||
543 | dlm_put(dlm); | ||
544 | } | ||
545 | EXPORT_SYMBOL_GPL(dlm_unregister_domain); | ||
546 | |||
547 | static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) | ||
548 | { | ||
549 | struct dlm_query_join_request *query; | ||
550 | enum dlm_query_join_response response; | ||
551 | struct dlm_ctxt *dlm = NULL; | ||
552 | |||
553 | query = (struct dlm_query_join_request *) msg->buf; | ||
554 | |||
555 | mlog(0, "node %u wants to join domain %s\n", query->node_idx, | ||
556 | query->domain); | ||
557 | |||
558 | /* | ||
559 | * If heartbeat doesn't consider the node live, tell it | ||
560 | * to back off and try again. This gives heartbeat a chance | ||
561 | * to catch up. | ||
562 | */ | ||
563 | if (!o2hb_check_node_heartbeating(query->node_idx)) { | ||
564 | mlog(0, "node %u is not in our live map yet\n", | ||
565 | query->node_idx); | ||
566 | |||
567 | response = JOIN_DISALLOW; | ||
568 | goto respond; | ||
569 | } | ||
570 | |||
571 | response = JOIN_OK_NO_MAP; | ||
572 | |||
573 | spin_lock(&dlm_domain_lock); | ||
574 | dlm = __dlm_lookup_domain_full(query->domain, query->name_len); | ||
575 | /* Once the dlm ctxt is marked as leaving then we don't want | ||
576 | * to be put in someone's domain map. */ | ||
577 | if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { | ||
578 | spin_lock(&dlm->spinlock); | ||
579 | |||
580 | if (dlm->dlm_state == DLM_CTXT_NEW && | ||
581 | dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
582 | /* If this is a brand new context and we | ||
583 | * haven't started our join process yet, then | ||
584 | * the other node won the race. */ | ||
585 | response = JOIN_OK_NO_MAP; | ||
586 | } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
587 | /* Disallow parallel joins. */ | ||
588 | response = JOIN_DISALLOW; | ||
589 | } else { | ||
590 | /* Alright, we're fully a part of this domain | ||
591 | * so we keep some state as to who's joining | ||
592 | * and indicate to him what needs to be fixed | ||
593 | * up. */ | ||
594 | response = JOIN_OK; | ||
595 | __dlm_set_joining_node(dlm, query->node_idx); | ||
596 | } | ||
597 | |||
598 | spin_unlock(&dlm->spinlock); | ||
599 | } | ||
600 | spin_unlock(&dlm_domain_lock); | ||
601 | |||
602 | respond: | ||
603 | mlog(0, "We respond with %u\n", response); | ||
604 | |||
605 | return response; | ||
606 | } | ||
607 | |||
608 | static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data) | ||
609 | { | ||
610 | struct dlm_assert_joined *assert; | ||
611 | struct dlm_ctxt *dlm = NULL; | ||
612 | |||
613 | assert = (struct dlm_assert_joined *) msg->buf; | ||
614 | |||
615 | mlog(0, "node %u asserts join on domain %s\n", assert->node_idx, | ||
616 | assert->domain); | ||
617 | |||
618 | spin_lock(&dlm_domain_lock); | ||
619 | dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len); | ||
620 | /* XXX should we consider no dlm ctxt an error? */ | ||
621 | if (dlm) { | ||
622 | spin_lock(&dlm->spinlock); | ||
623 | |||
624 | /* Alright, this node has officially joined our | ||
625 | * domain. Set him in the map and clean up our | ||
626 | * leftover join state. */ | ||
627 | BUG_ON(dlm->joining_node != assert->node_idx); | ||
628 | set_bit(assert->node_idx, dlm->domain_map); | ||
629 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | ||
630 | |||
631 | __dlm_print_nodes(dlm); | ||
632 | |||
633 | /* notify anything attached to the heartbeat events */ | ||
634 | dlm_hb_event_notify_attached(dlm, assert->node_idx, 1); | ||
635 | |||
636 | spin_unlock(&dlm->spinlock); | ||
637 | } | ||
638 | spin_unlock(&dlm_domain_lock); | ||
639 | |||
640 | return 0; | ||
641 | } | ||
642 | |||
643 | static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data) | ||
644 | { | ||
645 | struct dlm_cancel_join *cancel; | ||
646 | struct dlm_ctxt *dlm = NULL; | ||
647 | |||
648 | cancel = (struct dlm_cancel_join *) msg->buf; | ||
649 | |||
650 | mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx, | ||
651 | cancel->domain); | ||
652 | |||
653 | spin_lock(&dlm_domain_lock); | ||
654 | dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len); | ||
655 | |||
656 | if (dlm) { | ||
657 | spin_lock(&dlm->spinlock); | ||
658 | |||
659 | /* Yikes, this guy wants to cancel his join. No | ||
660 | * problem, we simply clean up our join state. */ | ||
661 | BUG_ON(dlm->joining_node != cancel->node_idx); | ||
662 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | ||
663 | |||
664 | spin_unlock(&dlm->spinlock); | ||
665 | } | ||
666 | spin_unlock(&dlm_domain_lock); | ||
667 | |||
668 | return 0; | ||
669 | } | ||
670 | |||
671 | static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm, | ||
672 | unsigned int node) | ||
673 | { | ||
674 | int status; | ||
675 | struct dlm_cancel_join cancel_msg; | ||
676 | |||
677 | memset(&cancel_msg, 0, sizeof(cancel_msg)); | ||
678 | cancel_msg.node_idx = dlm->node_num; | ||
679 | cancel_msg.name_len = strlen(dlm->name); | ||
680 | memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len); | ||
681 | |||
682 | status = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, | ||
683 | &cancel_msg, sizeof(cancel_msg), node, | ||
684 | NULL); | ||
685 | if (status < 0) { | ||
686 | mlog_errno(status); | ||
687 | goto bail; | ||
688 | } | ||
689 | |||
690 | bail: | ||
691 | return status; | ||
692 | } | ||
693 | |||
694 | /* map_size should be in bytes. */ | ||
695 | static int dlm_send_join_cancels(struct dlm_ctxt *dlm, | ||
696 | unsigned long *node_map, | ||
697 | unsigned int map_size) | ||
698 | { | ||
699 | int status, tmpstat; | ||
700 | unsigned int node; | ||
701 | |||
702 | if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * | ||
703 | sizeof(unsigned long))) { | ||
704 | mlog(ML_ERROR, | ||
705 | "map_size %u != expected %zu bytes\n", | ||
706 | map_size, BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)); | ||
707 | return -EINVAL; | ||
708 | } | ||
709 | |||
710 | status = 0; | ||
711 | node = -1; | ||
712 | while ((node = find_next_bit(node_map, O2NM_MAX_NODES, | ||
713 | node + 1)) < O2NM_MAX_NODES) { | ||
714 | if (node == dlm->node_num) | ||
715 | continue; | ||
716 | |||
717 | tmpstat = dlm_send_one_join_cancel(dlm, node); | ||
718 | if (tmpstat) { | ||
719 | mlog(ML_ERROR, "Error return %d cancelling join on " | ||
720 | "node %d\n", tmpstat, node); | ||
721 | if (!status) | ||
722 | status = tmpstat; | ||
723 | } | ||
724 | } | ||
725 | |||
726 | if (status) | ||
727 | mlog_errno(status); | ||
728 | return status; | ||
729 | } | ||
730 | |||
731 | static int dlm_request_join(struct dlm_ctxt *dlm, | ||
732 | int node, | ||
733 | enum dlm_query_join_response *response) | ||
734 | { | ||
735 | int status, retval; | ||
736 | struct dlm_query_join_request join_msg; | ||
737 | |||
738 | mlog(0, "querying node %d\n", node); | ||
739 | |||
740 | memset(&join_msg, 0, sizeof(join_msg)); | ||
741 | join_msg.node_idx = dlm->node_num; | ||
742 | join_msg.name_len = strlen(dlm->name); | ||
743 | memcpy(join_msg.domain, dlm->name, join_msg.name_len); | ||
744 | |||
745 | status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, | ||
746 | sizeof(join_msg), node, &retval); | ||
747 | if (status < 0 && status != -ENOPROTOOPT) { | ||
748 | mlog_errno(status); | ||
749 | goto bail; | ||
750 | } | ||
751 | |||
752 | /* -ENOPROTOOPT from the net code means the other side isn't | ||
753 | listening for our message type -- that's fine, it means | ||
754 | his dlm isn't up, so we can consider him a 'yes' but not | ||
755 | joined into the domain. */ | ||
756 | if (status == -ENOPROTOOPT) { | ||
757 | status = 0; | ||
758 | *response = JOIN_OK_NO_MAP; | ||
759 | } else if (retval == JOIN_DISALLOW || | ||
760 | retval == JOIN_OK || | ||
761 | retval == JOIN_OK_NO_MAP) { | ||
762 | *response = retval; | ||
763 | } else { | ||
764 | status = -EINVAL; | ||
765 | mlog(ML_ERROR, "invalid response %d from node %u\n", retval, | ||
766 | node); | ||
767 | } | ||
768 | |||
769 | mlog(0, "status %d, node %d response is %d\n", status, node, | ||
770 | *response); | ||
771 | |||
772 | bail: | ||
773 | return status; | ||
774 | } | ||
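/*
 * Summary of the join handshake implemented by the message handlers
 * and senders in this file:
 *
 *	joiner					existing node
 *	------					-------------
 *	DLM_QUERY_JOIN_MSG	---->	dlm_query_join_handler()
 *				<----	JOIN_OK, JOIN_OK_NO_MAP or
 *					JOIN_DISALLOW
 *
 * Once every live node has answered, the joiner sends
 * DLM_ASSERT_JOINED_MSG to each JOIN_OK responder, or
 * DLM_CANCEL_JOIN_MSG to them if the attempt failed or must be
 * restarted (see dlm_try_to_join_domain() below).
 */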
775 | |||
776 | static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, | ||
777 | unsigned int node) | ||
778 | { | ||
779 | int status; | ||
780 | struct dlm_assert_joined assert_msg; | ||
781 | |||
782 | mlog(0, "Sending join assert to node %u\n", node); | ||
783 | |||
784 | memset(&assert_msg, 0, sizeof(assert_msg)); | ||
785 | assert_msg.node_idx = dlm->node_num; | ||
786 | assert_msg.name_len = strlen(dlm->name); | ||
787 | memcpy(assert_msg.domain, dlm->name, assert_msg.name_len); | ||
788 | |||
789 | status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, | ||
790 | &assert_msg, sizeof(assert_msg), node, | ||
791 | NULL); | ||
792 | if (status < 0) | ||
793 | mlog_errno(status); | ||
794 | |||
795 | return status; | ||
796 | } | ||
797 | |||
798 | static void dlm_send_join_asserts(struct dlm_ctxt *dlm, | ||
799 | unsigned long *node_map) | ||
800 | { | ||
801 | int status, node, live; | ||
802 | |||
803 | status = 0; | ||
804 | node = -1; | ||
805 | while ((node = find_next_bit(node_map, O2NM_MAX_NODES, | ||
806 | node + 1)) < O2NM_MAX_NODES) { | ||
807 | if (node == dlm->node_num) | ||
808 | continue; | ||
809 | |||
810 | do { | ||
811 | /* It is very important that this message be | ||
812 | * received so we spin until either the node | ||
813 | * has died or it gets the message. */ | ||
814 | status = dlm_send_one_join_assert(dlm, node); | ||
815 | |||
816 | spin_lock(&dlm->spinlock); | ||
817 | live = test_bit(node, dlm->live_nodes_map); | ||
818 | spin_unlock(&dlm->spinlock); | ||
819 | |||
820 | if (status) { | ||
821 | mlog(ML_ERROR, "Error return %d asserting " | ||
822 | "join on node %d\n", status, node); | ||
823 | |||
824 | /* give us some time between errors... */ | ||
825 | if (live) | ||
826 | msleep(DLM_DOMAIN_BACKOFF_MS); | ||
827 | } | ||
828 | } while (status && live); | ||
829 | } | ||
830 | } | ||
831 | |||
832 | struct domain_join_ctxt { | ||
833 | unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
834 | unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
835 | }; | ||
836 | |||
837 | static int dlm_should_restart_join(struct dlm_ctxt *dlm, | ||
838 | struct domain_join_ctxt *ctxt, | ||
839 | enum dlm_query_join_response response) | ||
840 | { | ||
841 | int ret; | ||
842 | |||
843 | if (response == JOIN_DISALLOW) { | ||
844 | mlog(0, "Latest response of disallow -- should restart\n"); | ||
845 | return 1; | ||
846 | } | ||
847 | |||
848 | spin_lock(&dlm->spinlock); | ||
849 | /* For now, we restart the process if the node maps have | ||
850 | * changed at all */ | ||
851 | ret = memcmp(ctxt->live_map, dlm->live_nodes_map, | ||
852 | sizeof(dlm->live_nodes_map)); | ||
853 | spin_unlock(&dlm->spinlock); | ||
854 | |||
855 | if (ret) | ||
856 | mlog(0, "Node maps changed -- should restart\n"); | ||
857 | |||
858 | return ret; | ||
859 | } | ||
860 | |||
861 | static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) | ||
862 | { | ||
863 | int status = 0, tmpstat, node; | ||
864 | struct domain_join_ctxt *ctxt; | ||
865 | enum dlm_query_join_response response; | ||
866 | |||
867 | mlog_entry("%p", dlm); | ||
868 | |||
869 | ctxt = kcalloc(1, sizeof(*ctxt), GFP_KERNEL); | ||
870 | if (!ctxt) { | ||
871 | status = -ENOMEM; | ||
872 | mlog_errno(status); | ||
873 | goto bail; | ||
874 | } | ||
875 | |||
876 | /* group sem locking should work for us here -- we're already | ||
877 | * registered for heartbeat events so filling this should be | ||
878 | * atomic wrt getting those handlers called. */ | ||
879 | o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map)); | ||
880 | |||
881 | spin_lock(&dlm->spinlock); | ||
882 | memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); | ||
883 | |||
884 | __dlm_set_joining_node(dlm, dlm->node_num); | ||
885 | |||
886 | spin_unlock(&dlm->spinlock); | ||
887 | |||
888 | node = -1; | ||
889 | while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES, | ||
890 | node + 1)) < O2NM_MAX_NODES) { | ||
891 | if (node == dlm->node_num) | ||
892 | continue; | ||
893 | |||
894 | status = dlm_request_join(dlm, node, &response); | ||
895 | if (status < 0) { | ||
896 | mlog_errno(status); | ||
897 | goto bail; | ||
898 | } | ||
899 | |||
900 | /* Ok, either we got a response or the node doesn't have a | ||
901 | * dlm up. */ | ||
902 | if (response == JOIN_OK) | ||
903 | set_bit(node, ctxt->yes_resp_map); | ||
904 | |||
905 | if (dlm_should_restart_join(dlm, ctxt, response)) { | ||
906 | status = -EAGAIN; | ||
907 | goto bail; | ||
908 | } | ||
909 | } | ||
910 | |||
911 | mlog(0, "Yay, done querying nodes!\n"); | ||
912 | |||
913 | /* Yay, everyone agrees we can join the domain. My domain | ||
914 | * comprises all nodes who were put in the | ||
915 | * yes_resp_map. Copy that into our domain map and send a join | ||
916 | * assert message to clean up everyone else's state. */ | ||
917 | spin_lock(&dlm->spinlock); | ||
918 | memcpy(dlm->domain_map, ctxt->yes_resp_map, | ||
919 | sizeof(ctxt->yes_resp_map)); | ||
920 | set_bit(dlm->node_num, dlm->domain_map); | ||
921 | spin_unlock(&dlm->spinlock); | ||
922 | |||
923 | dlm_send_join_asserts(dlm, ctxt->yes_resp_map); | ||
924 | |||
925 | /* Joined state *must* be set before the joining node | ||
926 | * information, otherwise the query_join handler may read no | ||
927 | * current joiner but a state of NEW and tell joining nodes | ||
928 | * we're not in the domain. */ | ||
929 | spin_lock(&dlm_domain_lock); | ||
930 | dlm->dlm_state = DLM_CTXT_JOINED; | ||
931 | dlm->num_joins++; | ||
932 | spin_unlock(&dlm_domain_lock); | ||
933 | |||
934 | bail: | ||
935 | spin_lock(&dlm->spinlock); | ||
936 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | ||
937 | if (!status) | ||
938 | __dlm_print_nodes(dlm); | ||
939 | spin_unlock(&dlm->spinlock); | ||
940 | |||
941 | if (ctxt) { | ||
942 | /* Do we need to send a cancel message to any nodes? */ | ||
943 | if (status < 0) { | ||
944 | tmpstat = dlm_send_join_cancels(dlm, | ||
945 | ctxt->yes_resp_map, | ||
946 | sizeof(ctxt->yes_resp_map)); | ||
947 | if (tmpstat < 0) | ||
948 | mlog_errno(tmpstat); | ||
949 | } | ||
950 | kfree(ctxt); | ||
951 | } | ||
952 | |||
953 | mlog(0, "returning %d\n", status); | ||
954 | return status; | ||
955 | } | ||
956 | |||
957 | static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) | ||
958 | { | ||
959 | o2hb_unregister_callback(&dlm->dlm_hb_up); | ||
960 | o2hb_unregister_callback(&dlm->dlm_hb_down); | ||
961 | o2net_unregister_handler_list(&dlm->dlm_domain_handlers); | ||
962 | } | ||
963 | |||
964 | static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) | ||
965 | { | ||
966 | int status; | ||
967 | |||
968 | mlog(0, "registering handlers.\n"); | ||
969 | |||
970 | o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, | ||
971 | dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); | ||
972 | status = o2hb_register_callback(&dlm->dlm_hb_down); | ||
973 | if (status) | ||
974 | goto bail; | ||
975 | |||
976 | o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, | ||
977 | dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); | ||
978 | status = o2hb_register_callback(&dlm->dlm_hb_up); | ||
979 | if (status) | ||
980 | goto bail; | ||
981 | |||
982 | status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key, | ||
983 | sizeof(struct dlm_master_request), | ||
984 | dlm_master_request_handler, | ||
985 | dlm, &dlm->dlm_domain_handlers); | ||
986 | if (status) | ||
987 | goto bail; | ||
988 | |||
989 | status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key, | ||
990 | sizeof(struct dlm_assert_master), | ||
991 | dlm_assert_master_handler, | ||
992 | dlm, &dlm->dlm_domain_handlers); | ||
993 | if (status) | ||
994 | goto bail; | ||
995 | |||
996 | status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key, | ||
997 | sizeof(struct dlm_create_lock), | ||
998 | dlm_create_lock_handler, | ||
999 | dlm, &dlm->dlm_domain_handlers); | ||
1000 | if (status) | ||
1001 | goto bail; | ||
1002 | |||
1003 | status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key, | ||
1004 | DLM_CONVERT_LOCK_MAX_LEN, | ||
1005 | dlm_convert_lock_handler, | ||
1006 | dlm, &dlm->dlm_domain_handlers); | ||
1007 | if (status) | ||
1008 | goto bail; | ||
1009 | |||
1010 | status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key, | ||
1011 | DLM_UNLOCK_LOCK_MAX_LEN, | ||
1012 | dlm_unlock_lock_handler, | ||
1013 | dlm, &dlm->dlm_domain_handlers); | ||
1014 | if (status) | ||
1015 | goto bail; | ||
1016 | |||
1017 | status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key, | ||
1018 | DLM_PROXY_AST_MAX_LEN, | ||
1019 | dlm_proxy_ast_handler, | ||
1020 | dlm, &dlm->dlm_domain_handlers); | ||
1021 | if (status) | ||
1022 | goto bail; | ||
1023 | |||
1024 | status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key, | ||
1025 | sizeof(struct dlm_exit_domain), | ||
1026 | dlm_exit_domain_handler, | ||
1027 | dlm, &dlm->dlm_domain_handlers); | ||
1028 | if (status) | ||
1029 | goto bail; | ||
1030 | |||
1031 | status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key, | ||
1032 | sizeof(struct dlm_migrate_request), | ||
1033 | dlm_migrate_request_handler, | ||
1034 | dlm, &dlm->dlm_domain_handlers); | ||
1035 | if (status) | ||
1036 | goto bail; | ||
1037 | |||
1038 | status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key, | ||
1039 | DLM_MIG_LOCKRES_MAX_LEN, | ||
1040 | dlm_mig_lockres_handler, | ||
1041 | dlm, &dlm->dlm_domain_handlers); | ||
1042 | if (status) | ||
1043 | goto bail; | ||
1044 | |||
1045 | status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key, | ||
1046 | sizeof(struct dlm_master_requery), | ||
1047 | dlm_master_requery_handler, | ||
1048 | dlm, &dlm->dlm_domain_handlers); | ||
1049 | if (status) | ||
1050 | goto bail; | ||
1051 | |||
1052 | status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key, | ||
1053 | sizeof(struct dlm_lock_request), | ||
1054 | dlm_request_all_locks_handler, | ||
1055 | dlm, &dlm->dlm_domain_handlers); | ||
1056 | if (status) | ||
1057 | goto bail; | ||
1058 | |||
1059 | status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key, | ||
1060 | sizeof(struct dlm_reco_data_done), | ||
1061 | dlm_reco_data_done_handler, | ||
1062 | dlm, &dlm->dlm_domain_handlers); | ||
1063 | if (status) | ||
1064 | goto bail; | ||
1065 | |||
1066 | status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key, | ||
1067 | sizeof(struct dlm_begin_reco), | ||
1068 | dlm_begin_reco_handler, | ||
1069 | dlm, &dlm->dlm_domain_handlers); | ||
1070 | if (status) | ||
1071 | goto bail; | ||
1072 | |||
1073 | status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key, | ||
1074 | sizeof(struct dlm_finalize_reco), | ||
1075 | dlm_finalize_reco_handler, | ||
1076 | dlm, &dlm->dlm_domain_handlers); | ||
1077 | if (status) | ||
1078 | goto bail; | ||
1079 | |||
1080 | bail: | ||
1081 | if (status) | ||
1082 | dlm_unregister_domain_handlers(dlm); | ||
1083 | |||
1084 | return status; | ||
1085 | } | ||
1086 | |||
1087 | static int dlm_join_domain(struct dlm_ctxt *dlm) | ||
1088 | { | ||
1089 | int status; | ||
1090 | |||
1091 | BUG_ON(!dlm); | ||
1092 | |||
1093 | mlog(0, "Join domain %s\n", dlm->name); | ||
1094 | |||
1095 | status = dlm_register_domain_handlers(dlm); | ||
1096 | if (status) { | ||
1097 | mlog_errno(status); | ||
1098 | goto bail; | ||
1099 | } | ||
1100 | |||
1101 | status = dlm_launch_thread(dlm); | ||
1102 | if (status < 0) { | ||
1103 | mlog_errno(status); | ||
1104 | goto bail; | ||
1105 | } | ||
1106 | |||
1107 | status = dlm_launch_recovery_thread(dlm); | ||
1108 | if (status < 0) { | ||
1109 | mlog_errno(status); | ||
1110 | goto bail; | ||
1111 | } | ||
1112 | |||
1113 | do { | ||
1114 | unsigned int backoff; | ||
1115 | status = dlm_try_to_join_domain(dlm); | ||
1116 | |||
1117 | /* If we're racing another node to the join, then we | ||
1118 | * need to back off temporarily and let them | ||
1119 | * complete. */ | ||
1120 | if (status == -EAGAIN) { | ||
1121 | if (signal_pending(current)) { | ||
1122 | status = -ERESTARTSYS; | ||
1123 | goto bail; | ||
1124 | } | ||
1125 | |||
1126 | /* | ||
1127 | * <chip> After you! | ||
1128 | * <dale> No, after you! | ||
1129 | * <chip> I insist! | ||
1130 | * <dale> But you first! | ||
1131 | * ... | ||
1132 | */ | ||
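/* Pick a pseudo-random backoff of 0-3 DLM_DOMAIN_BACKOFF_MS
 * intervals from the low bits of jiffies, so racing nodes are
 * unlikely to retry in lockstep. */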
1133 | backoff = (unsigned int)(jiffies & 0x3); | ||
1134 | backoff *= DLM_DOMAIN_BACKOFF_MS; | ||
1135 | mlog(0, "backoff %d\n", backoff); | ||
1136 | msleep(backoff); | ||
1137 | } | ||
1138 | } while (status == -EAGAIN); | ||
1139 | |||
1140 | if (status < 0) { | ||
1141 | mlog_errno(status); | ||
1142 | goto bail; | ||
1143 | } | ||
1144 | |||
1145 | status = 0; | ||
1146 | bail: | ||
1147 | wake_up(&dlm_domain_events); | ||
1148 | |||
1149 | if (status) { | ||
1150 | dlm_unregister_domain_handlers(dlm); | ||
1151 | dlm_complete_thread(dlm); | ||
1152 | dlm_complete_recovery_thread(dlm); | ||
1153 | } | ||
1154 | |||
1155 | return status; | ||
1156 | } | ||
1157 | |||
1158 | static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | ||
1159 | u32 key) | ||
1160 | { | ||
1161 | int i; | ||
1162 | struct dlm_ctxt *dlm = NULL; | ||
1163 | |||
1164 | dlm = kcalloc(1, sizeof(*dlm), GFP_KERNEL); | ||
1165 | if (!dlm) { | ||
1166 | mlog_errno(-ENOMEM); | ||
1167 | goto leave; | ||
1168 | } | ||
1169 | |||
1170 | dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL); | ||
1171 | if (dlm->name == NULL) { | ||
1172 | mlog_errno(-ENOMEM); | ||
1173 | kfree(dlm); | ||
1174 | dlm = NULL; | ||
1175 | goto leave; | ||
1176 | } | ||
1177 | |||
1178 | dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL); | ||
1179 | if (!dlm->resources) { | ||
1180 | mlog_errno(-ENOMEM); | ||
1181 | kfree(dlm->name); | ||
1182 | kfree(dlm); | ||
1183 | dlm = NULL; | ||
1184 | goto leave; | ||
1185 | } | ||
1186 | memset(dlm->resources, 0, PAGE_SIZE); | ||
1187 | |||
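/* The page allocated above is used as an array of DLM_HASH_SIZE
 * list heads -- the hash table that dlm lock resources live in. */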
1188 | for (i=0; i<DLM_HASH_SIZE; i++) | ||
1189 | INIT_LIST_HEAD(&dlm->resources[i]); | ||
1190 | |||
1191 | strcpy(dlm->name, domain); | ||
1192 | dlm->key = key; | ||
1193 | dlm->node_num = o2nm_this_node(); | ||
1194 | |||
1195 | spin_lock_init(&dlm->spinlock); | ||
1196 | spin_lock_init(&dlm->master_lock); | ||
1197 | spin_lock_init(&dlm->ast_lock); | ||
1198 | INIT_LIST_HEAD(&dlm->list); | ||
1199 | INIT_LIST_HEAD(&dlm->dirty_list); | ||
1200 | INIT_LIST_HEAD(&dlm->reco.resources); | ||
1201 | INIT_LIST_HEAD(&dlm->reco.received); | ||
1202 | INIT_LIST_HEAD(&dlm->reco.node_data); | ||
1203 | INIT_LIST_HEAD(&dlm->purge_list); | ||
1204 | INIT_LIST_HEAD(&dlm->dlm_domain_handlers); | ||
1205 | dlm->reco.state = 0; | ||
1206 | |||
1207 | INIT_LIST_HEAD(&dlm->pending_asts); | ||
1208 | INIT_LIST_HEAD(&dlm->pending_basts); | ||
1209 | |||
1210 | mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n", | ||
1211 | dlm->recovery_map, &(dlm->recovery_map[0])); | ||
1212 | |||
1213 | memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map)); | ||
1214 | memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map)); | ||
1215 | memset(dlm->domain_map, 0, sizeof(dlm->domain_map)); | ||
1216 | |||
1217 | dlm->dlm_thread_task = NULL; | ||
1218 | dlm->dlm_reco_thread_task = NULL; | ||
1219 | init_waitqueue_head(&dlm->dlm_thread_wq); | ||
1220 | init_waitqueue_head(&dlm->dlm_reco_thread_wq); | ||
1221 | init_waitqueue_head(&dlm->reco.event); | ||
1222 | init_waitqueue_head(&dlm->ast_wq); | ||
1223 | init_waitqueue_head(&dlm->migration_wq); | ||
1224 | INIT_LIST_HEAD(&dlm->master_list); | ||
1225 | INIT_LIST_HEAD(&dlm->mle_hb_events); | ||
1226 | |||
1227 | dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; | ||
1228 | init_waitqueue_head(&dlm->dlm_join_events); | ||
1229 | |||
1230 | dlm->reco.new_master = O2NM_INVALID_NODE_NUM; | ||
1231 | dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; | ||
1232 | atomic_set(&dlm->local_resources, 0); | ||
1233 | atomic_set(&dlm->remote_resources, 0); | ||
1234 | atomic_set(&dlm->unknown_resources, 0); | ||
1235 | |||
1236 | spin_lock_init(&dlm->work_lock); | ||
1237 | INIT_LIST_HEAD(&dlm->work_list); | ||
1238 | INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work, dlm); | ||
1239 | |||
1240 | kref_init(&dlm->dlm_refs); | ||
1241 | dlm->dlm_state = DLM_CTXT_NEW; | ||
1242 | |||
1243 | INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks); | ||
1244 | |||
1245 | mlog(0, "context init: refcount %u\n", | ||
1246 | atomic_read(&dlm->dlm_refs.refcount)); | ||
1247 | |||
1248 | leave: | ||
1249 | return dlm; | ||
1250 | } | ||
1251 | |||
1252 | /* | ||
1253 | * dlm_register_domain: one-time setup per "domain" | ||
1254 | */ | ||
1255 | struct dlm_ctxt * dlm_register_domain(const char *domain, | ||
1256 | u32 key) | ||
1257 | { | ||
1258 | int ret; | ||
1259 | struct dlm_ctxt *dlm = NULL; | ||
1260 | struct dlm_ctxt *new_ctxt = NULL; | ||
1261 | |||
1262 | if (strlen(domain) > O2NM_MAX_NAME_LEN) { | ||
1263 | ret = -ENAMETOOLONG; | ||
1264 | mlog(ML_ERROR, "domain name too long\n"); | ||
1265 | goto leave; | ||
1266 | } | ||
1267 | |||
1268 | if (!o2hb_check_local_node_heartbeating()) { | ||
1269 | mlog(ML_ERROR, "the local node has not been configured, or is " | ||
1270 | "not heartbeating\n"); | ||
1271 | ret = -EPROTO; | ||
1272 | goto leave; | ||
1273 | } | ||
1274 | |||
1275 | mlog(0, "register called for domain \"%s\"\n", domain); | ||
1276 | |||
1277 | retry: | ||
1278 | dlm = NULL; | ||
1279 | if (signal_pending(current)) { | ||
1280 | ret = -ERESTARTSYS; | ||
1281 | mlog_errno(ret); | ||
1282 | goto leave; | ||
1283 | } | ||
1284 | |||
1285 | spin_lock(&dlm_domain_lock); | ||
1286 | |||
1287 | dlm = __dlm_lookup_domain(domain); | ||
1288 | if (dlm) { | ||
1289 | if (dlm->dlm_state != DLM_CTXT_JOINED) { | ||
1290 | spin_unlock(&dlm_domain_lock); | ||
1291 | |||
1292 | mlog(0, "This ctxt is not joined yet!\n"); | ||
1293 | wait_event_interruptible(dlm_domain_events, | ||
1294 | dlm_wait_on_domain_helper( | ||
1295 | domain)); | ||
1296 | goto retry; | ||
1297 | } | ||
1298 | |||
1299 | __dlm_get(dlm); | ||
1300 | dlm->num_joins++; | ||
1301 | |||
1302 | spin_unlock(&dlm_domain_lock); | ||
1303 | |||
1304 | ret = 0; | ||
1305 | goto leave; | ||
1306 | } | ||
1307 | |||
1308 | /* doesn't exist */ | ||
1309 | if (!new_ctxt) { | ||
1310 | spin_unlock(&dlm_domain_lock); | ||
1311 | |||
1312 | new_ctxt = dlm_alloc_ctxt(domain, key); | ||
1313 | if (new_ctxt) | ||
1314 | goto retry; | ||
1315 | |||
1316 | ret = -ENOMEM; | ||
1317 | mlog_errno(ret); | ||
1318 | goto leave; | ||
1319 | } | ||
1320 | |||
1321 | /* a little variable switch-a-roo here... */ | ||
1322 | dlm = new_ctxt; | ||
1323 | new_ctxt = NULL; | ||
1324 | |||
1325 | /* add the new domain */ | ||
1326 | list_add_tail(&dlm->list, &dlm_domains); | ||
1327 | spin_unlock(&dlm_domain_lock); | ||
1328 | |||
1329 | ret = dlm_join_domain(dlm); | ||
1330 | if (ret) { | ||
1331 | mlog_errno(ret); | ||
1332 | dlm_put(dlm); | ||
1333 | goto leave; | ||
1334 | } | ||
1335 | |||
1336 | ret = 0; | ||
1337 | leave: | ||
1338 | if (new_ctxt) | ||
1339 | dlm_free_ctxt_mem(new_ctxt); | ||
1340 | |||
1341 | if (ret < 0) | ||
1342 | dlm = ERR_PTR(ret); | ||
1343 | |||
1344 | return dlm; | ||
1345 | } | ||
1346 | EXPORT_SYMBOL_GPL(dlm_register_domain); | ||
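/*
 * A minimal sketch of a typical caller of the exported API above.
 * The domain name, key and function name are illustrative only;
 * error handling follows the ERR_PTR convention that
 * dlm_register_domain() uses.
 */
static int example_attach_to_dlm(void)
{
	struct dlm_ctxt *dlm;

	/* Join (or create) the domain; this blocks until joined. */
	dlm = dlm_register_domain("example-domain", 0x12345678);
	if (IS_ERR(dlm))
		return PTR_ERR(dlm);

	/* ... create and use locks against "dlm" here ... */

	/* Drop our join; the last unregister tears the domain down
	 * after migrating its lock resources to the other nodes. */
	dlm_unregister_domain(dlm);
	return 0;
}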
1347 | |||
1348 | static LIST_HEAD(dlm_join_handlers); | ||
1349 | |||
1350 | static void dlm_unregister_net_handlers(void) | ||
1351 | { | ||
1352 | o2net_unregister_handler_list(&dlm_join_handlers); | ||
1353 | } | ||
1354 | |||
1355 | static int dlm_register_net_handlers(void) | ||
1356 | { | ||
1357 | int status = 0; | ||
1358 | |||
1359 | status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, | ||
1360 | sizeof(struct dlm_query_join_request), | ||
1361 | dlm_query_join_handler, | ||
1362 | NULL, &dlm_join_handlers); | ||
1363 | if (status) | ||
1364 | goto bail; | ||
1365 | |||
1366 | status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, | ||
1367 | sizeof(struct dlm_assert_joined), | ||
1368 | dlm_assert_joined_handler, | ||
1369 | NULL, &dlm_join_handlers); | ||
1370 | if (status) | ||
1371 | goto bail; | ||
1372 | |||
1373 | status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, | ||
1374 | sizeof(struct dlm_cancel_join), | ||
1375 | dlm_cancel_join_handler, | ||
1376 | NULL, &dlm_join_handlers); | ||
1377 | |||
1378 | bail: | ||
1379 | if (status < 0) | ||
1380 | dlm_unregister_net_handlers(); | ||
1381 | |||
1382 | return status; | ||
1383 | } | ||
1384 | |||
1385 | /* Domain eviction callback handling. | ||
1386 | * | ||
1387 | * The file system requires notification of node death *before* the | ||
1388 | * dlm completes its recovery work, otherwise it may be able to | ||
1389 | * acquire locks on resources requiring recovery. Since the dlm can | ||
1390 | * evict a node from its domain *before* heartbeat fires, a similar | ||
1391 | * mechanism is required. */ | ||
1392 | |||
1393 | /* Eviction is not expected to happen often, so a per-domain lock is | ||
1394 | * not necessary. Eviction callbacks are allowed to sleep for short | ||
1395 | * periods of time. */ | ||
1396 | static DECLARE_RWSEM(dlm_callback_sem); | ||
1397 | |||
1398 | void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, | ||
1399 | int node_num) | ||
1400 | { | ||
1401 | struct list_head *iter; | ||
1402 | struct dlm_eviction_cb *cb; | ||
1403 | |||
1404 | down_read(&dlm_callback_sem); | ||
1405 | list_for_each(iter, &dlm->dlm_eviction_callbacks) { | ||
1406 | cb = list_entry(iter, struct dlm_eviction_cb, ec_item); | ||
1407 | |||
1408 | cb->ec_func(node_num, cb->ec_data); | ||
1409 | } | ||
1410 | up_read(&dlm_callback_sem); | ||
1411 | } | ||
1412 | |||
1413 | void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb, | ||
1414 | dlm_eviction_func *f, | ||
1415 | void *data) | ||
1416 | { | ||
1417 | INIT_LIST_HEAD(&cb->ec_item); | ||
1418 | cb->ec_func = f; | ||
1419 | cb->ec_data = data; | ||
1420 | } | ||
1421 | EXPORT_SYMBOL_GPL(dlm_setup_eviction_cb); | ||
1422 | |||
1423 | void dlm_register_eviction_cb(struct dlm_ctxt *dlm, | ||
1424 | struct dlm_eviction_cb *cb) | ||
1425 | { | ||
1426 | down_write(&dlm_callback_sem); | ||
1427 | list_add_tail(&cb->ec_item, &dlm->dlm_eviction_callbacks); | ||
1428 | up_write(&dlm_callback_sem); | ||
1429 | } | ||
1430 | EXPORT_SYMBOL_GPL(dlm_register_eviction_cb); | ||
1431 | |||
1432 | void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb) | ||
1433 | { | ||
1434 | down_write(&dlm_callback_sem); | ||
1435 | list_del_init(&cb->ec_item); | ||
1436 | up_write(&dlm_callback_sem); | ||
1437 | } | ||
1438 | EXPORT_SYMBOL_GPL(dlm_unregister_eviction_cb); | ||
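/*
 * A minimal sketch of hooking the eviction API above. The callback
 * signature is inferred from the cb->ec_func(node_num, cb->ec_data)
 * call in dlm_fire_domain_eviction_callbacks(); names prefixed
 * "example_" are hypothetical.
 */
static void example_node_evicted(int node_num, void *data)
{
	/* Called under dlm_callback_sem (read); may sleep briefly. */
	printk(KERN_INFO "node %d was evicted from our domain\n",
	       node_num);
}

static struct dlm_eviction_cb example_cb;

static void example_watch_evictions(struct dlm_ctxt *dlm)
{
	dlm_setup_eviction_cb(&example_cb, example_node_evicted, NULL);
	dlm_register_eviction_cb(dlm, &example_cb);
	/* ... and dlm_unregister_eviction_cb(&example_cb) on teardown. */
}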
1439 | |||
1440 | static int __init dlm_init(void) | ||
1441 | { | ||
1442 | int status; | ||
1443 | |||
1444 | dlm_print_version(); | ||
1445 | |||
1446 | status = dlm_init_mle_cache(); | ||
1447 | if (status) | ||
1448 | return -1; | ||
1449 | |||
1450 | status = dlm_register_net_handlers(); | ||
1451 | if (status) { | ||
1452 | dlm_destroy_mle_cache(); | ||
1453 | return -1; | ||
1454 | } | ||
1455 | |||
1456 | return 0; | ||
1457 | } | ||
1458 | |||
1459 | static void __exit dlm_exit (void) | ||
1460 | { | ||
1461 | dlm_unregister_net_handlers(); | ||
1462 | dlm_destroy_mle_cache(); | ||
1463 | } | ||
1464 | |||
1465 | MODULE_AUTHOR("Oracle"); | ||
1466 | MODULE_LICENSE("GPL"); | ||
1467 | |||
1468 | module_init(dlm_init); | ||
1469 | module_exit(dlm_exit); | ||
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h new file mode 100644 index 000000000000..2f7f60bfeb3b --- /dev/null +++ b/fs/ocfs2/dlm/dlmdomain.h | |||
@@ -0,0 +1,36 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmdomain.h | ||
5 | * | ||
6 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public | ||
10 | * License as published by the Free Software Foundation; either | ||
11 | * version 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
16 | * General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public | ||
19 | * License along with this program; if not, write to the | ||
20 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
21 | * Boston, MA 02111-1307, USA. | ||
22 | * | ||
23 | */ | ||
24 | |||
25 | #ifndef DLMDOMAIN_H | ||
26 | #define DLMDOMAIN_H | ||
27 | |||
28 | extern spinlock_t dlm_domain_lock; | ||
29 | extern struct list_head dlm_domains; | ||
30 | |||
31 | int dlm_joined(struct dlm_ctxt *dlm); | ||
32 | int dlm_shutting_down(struct dlm_ctxt *dlm); | ||
33 | void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, | ||
34 | int node_num); | ||
35 | |||
36 | #endif | ||
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c new file mode 100644 index 000000000000..dd2d24dc25e0 --- /dev/null +++ b/fs/ocfs2/dlm/dlmfs.c | |||
@@ -0,0 +1,640 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmfs.c | ||
5 | * | ||
6 | * Code which implements the kernel side of a minimal userspace | ||
7 | * interface to our DLM. This file handles the virtual file system | ||
8 | * used for communication with userspace. Credit should go to ramfs, | ||
9 | * which was a template for the fs side of this module. | ||
10 | * | ||
11 | * Copyright (C) 2003, 2004 Oracle. All rights reserved. | ||
12 | * | ||
13 | * This program is free software; you can redistribute it and/or | ||
14 | * modify it under the terms of the GNU General Public | ||
15 | * License as published by the Free Software Foundation; either | ||
16 | * version 2 of the License, or (at your option) any later version. | ||
17 | * | ||
18 | * This program is distributed in the hope that it will be useful, | ||
19 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
20 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
21 | * General Public License for more details. | ||
22 | * | ||
23 | * You should have received a copy of the GNU General Public | ||
24 | * License along with this program; if not, write to the | ||
25 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
26 | * Boston, MA 02111-1307, USA. | ||
27 | */ | ||
28 | |||
29 | /* Simple VFS hooks based on: */ | ||
30 | /* | ||
31 | * Resizable simple ram filesystem for Linux. | ||
32 | * | ||
33 | * Copyright (C) 2000 Linus Torvalds. | ||
34 | * 2000 Transmeta Corp. | ||
35 | */ | ||
36 | |||
37 | #include <linux/module.h> | ||
38 | #include <linux/fs.h> | ||
39 | #include <linux/pagemap.h> | ||
40 | #include <linux/types.h> | ||
41 | #include <linux/slab.h> | ||
42 | #include <linux/highmem.h> | ||
43 | #include <linux/init.h> | ||
44 | #include <linux/string.h> | ||
45 | #include <linux/smp_lock.h> | ||
46 | #include <linux/backing-dev.h> | ||
47 | |||
48 | #include <asm/uaccess.h> | ||
49 | |||
50 | |||
51 | #include "cluster/nodemanager.h" | ||
52 | #include "cluster/heartbeat.h" | ||
53 | #include "cluster/tcp.h" | ||
54 | |||
55 | #include "dlmapi.h" | ||
56 | |||
57 | #include "userdlm.h" | ||
58 | |||
59 | #include "dlmfsver.h" | ||
60 | |||
61 | #define MLOG_MASK_PREFIX ML_DLMFS | ||
62 | #include "cluster/masklog.h" | ||
63 | |||
64 | static struct super_operations dlmfs_ops; | ||
65 | static struct file_operations dlmfs_file_operations; | ||
66 | static struct inode_operations dlmfs_dir_inode_operations; | ||
67 | static struct inode_operations dlmfs_root_inode_operations; | ||
68 | static struct inode_operations dlmfs_file_inode_operations; | ||
69 | static kmem_cache_t *dlmfs_inode_cache; | ||
70 | |||
71 | struct workqueue_struct *user_dlm_worker; | ||
72 | |||
73 | /* | ||
74 | * decodes a set of open flags into a valid lock level and a set of flags. | ||
75 | * returns < 0 if we have invalid flags | ||
76 | * flags which mean something to us: | ||
77 | * O_RDONLY -> PRMODE level | ||
78 | * O_WRONLY -> EXMODE level | ||
79 | * | ||
80 | * O_NONBLOCK -> LKM_NOQUEUE | ||
81 | */ | ||
82 | static int dlmfs_decode_open_flags(int open_flags, | ||
83 | int *level, | ||
84 | int *flags) | ||
85 | { | ||
86 | if (open_flags & (O_WRONLY|O_RDWR)) | ||
87 | *level = LKM_EXMODE; | ||
88 | else | ||
89 | *level = LKM_PRMODE; | ||
90 | |||
91 | *flags = 0; | ||
92 | if (open_flags & O_NONBLOCK) | ||
93 | *flags |= LKM_NOQUEUE; | ||
94 | |||
95 | return 0; | ||
96 | } | ||
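/*
 * From userspace, the mapping above means a dlm lock is taken at
 * open(2) time and dropped at close(2). A hedged sketch, assuming
 * dlmfs is mounted at /dlm and the domain directory "mydomain"
 * already exists (both paths are illustrative; a lock file is
 * created on first open(2) with O_CREAT via dlmfs_create() below):
 *
 *	fd = open("/dlm/mydomain/mylock", O_RDONLY);
 *		// blocking shared (PRMODE) lock
 *
 *	fd = open("/dlm/mydomain/mylock", O_WRONLY | O_NONBLOCK);
 *		// exclusive (EXMODE) trylock; a contended lock
 *		// fails with ETXTBSY, see dlmfs_file_open() below
 */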
97 | |||
98 | static int dlmfs_file_open(struct inode *inode, | ||
99 | struct file *file) | ||
100 | { | ||
101 | int status, level, flags; | ||
102 | struct dlmfs_filp_private *fp = NULL; | ||
103 | struct dlmfs_inode_private *ip; | ||
104 | |||
105 | if (S_ISDIR(inode->i_mode)) | ||
106 | BUG(); | ||
107 | |||
108 | mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino, | ||
109 | file->f_flags); | ||
110 | |||
111 | status = dlmfs_decode_open_flags(file->f_flags, &level, &flags); | ||
112 | if (status < 0) | ||
113 | goto bail; | ||
114 | |||
115 | /* We don't want to honor O_APPEND at read/write time as it | ||
116 | * doesn't make sense for LVB writes. */ | ||
117 | file->f_flags &= ~O_APPEND; | ||
118 | |||
119 | fp = kmalloc(sizeof(*fp), GFP_KERNEL); | ||
120 | if (!fp) { | ||
121 | status = -ENOMEM; | ||
122 | goto bail; | ||
123 | } | ||
124 | fp->fp_lock_level = level; | ||
125 | |||
126 | ip = DLMFS_I(inode); | ||
127 | |||
128 | status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags); | ||
129 | if (status < 0) { | ||
130 | /* this is a strange error to return here but I want | ||
131 | * userspace to be able to distinguish a | ||
132 | * valid lock request from one that simply couldn't be | ||
133 | * granted. */ | ||
134 | if (flags & LKM_NOQUEUE && status == -EAGAIN) | ||
135 | status = -ETXTBSY; | ||
136 | kfree(fp); | ||
137 | goto bail; | ||
138 | } | ||
139 | |||
140 | file->private_data = fp; | ||
141 | bail: | ||
142 | return status; | ||
143 | } | ||
144 | |||
145 | static int dlmfs_file_release(struct inode *inode, | ||
146 | struct file *file) | ||
147 | { | ||
148 | int level, status; | ||
149 | struct dlmfs_inode_private *ip = DLMFS_I(inode); | ||
150 | struct dlmfs_filp_private *fp = | ||
151 | (struct dlmfs_filp_private *) file->private_data; | ||
152 | |||
153 | if (S_ISDIR(inode->i_mode)) | ||
154 | BUG(); | ||
155 | |||
156 | mlog(0, "close called on inode %lu\n", inode->i_ino); | ||
157 | |||
158 | status = 0; | ||
159 | if (fp) { | ||
160 | level = fp->fp_lock_level; | ||
161 | if (level != LKM_IVMODE) | ||
162 | user_dlm_cluster_unlock(&ip->ip_lockres, level); | ||
163 | |||
164 | kfree(fp); | ||
165 | file->private_data = NULL; | ||
166 | } | ||
167 | |||
168 | return 0; | ||
169 | } | ||
170 | |||
171 | static ssize_t dlmfs_file_read(struct file *filp, | ||
172 | char __user *buf, | ||
173 | size_t count, | ||
174 | loff_t *ppos) | ||
175 | { | ||
176 | int bytes_left; | ||
177 | ssize_t readlen; | ||
178 | char *lvb_buf; | ||
179 | struct inode *inode = filp->f_dentry->d_inode; | ||
180 | |||
181 | mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", | ||
182 | inode->i_ino, count, *ppos); | ||
183 | |||
184 | if (*ppos >= i_size_read(inode)) | ||
185 | return 0; | ||
186 | |||
187 | if (!count) | ||
188 | return 0; | ||
189 | |||
190 | if (!access_ok(VERIFY_WRITE, buf, count)) | ||
191 | return -EFAULT; | ||
192 | |||
193 | /* don't read past the lvb */ | ||
194 | if ((count + *ppos) > i_size_read(inode)) | ||
195 | readlen = i_size_read(inode) - *ppos; | ||
196 | else | ||
197 | readlen = count; | ||
198 | |||
199 | lvb_buf = kmalloc(readlen, GFP_KERNEL); | ||
200 | if (!lvb_buf) | ||
201 | return -ENOMEM; | ||
202 | |||
203 | user_dlm_read_lvb(inode, lvb_buf, readlen); | ||
204 | bytes_left = __copy_to_user(buf, lvb_buf, readlen); | ||
205 | readlen -= bytes_left; | ||
206 | |||
207 | kfree(lvb_buf); | ||
208 | |||
209 | *ppos = *ppos + readlen; | ||
210 | |||
211 | mlog(0, "read %zd bytes\n", readlen); | ||
212 | return readlen; | ||
213 | } | ||
214 | |||
215 | static ssize_t dlmfs_file_write(struct file *filp, | ||
216 | const char __user *buf, | ||
217 | size_t count, | ||
218 | loff_t *ppos) | ||
219 | { | ||
220 | int bytes_left; | ||
221 | ssize_t writelen; | ||
222 | char *lvb_buf; | ||
223 | struct inode *inode = filp->f_dentry->d_inode; | ||
224 | |||
225 | mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", | ||
226 | inode->i_ino, count, *ppos); | ||
227 | |||
228 | if (*ppos >= i_size_read(inode)) | ||
229 | return -ENOSPC; | ||
230 | |||
231 | if (!count) | ||
232 | return 0; | ||
233 | |||
234 | if (!access_ok(VERIFY_READ, buf, count)) | ||
235 | return -EFAULT; | ||
236 | |||
237 | /* don't write past the lvb */ | ||
238 | if ((count + *ppos) > i_size_read(inode)) | ||
239 | writelen = i_size_read(inode) - *ppos; | ||
240 | else | ||
241 | writelen = count; | ||
242 | |||
243 | lvb_buf = kmalloc(writelen, GFP_KERNEL); | ||
244 | if (!lvb_buf) | ||
245 | return -ENOMEM; | ||
246 | |||
247 | bytes_left = copy_from_user(lvb_buf, buf, writelen); | ||
248 | writelen -= bytes_left; | ||
249 | if (writelen) | ||
250 | user_dlm_write_lvb(inode, lvb_buf, writelen); | ||
251 | |||
252 | kfree(lvb_buf); | ||
253 | |||
254 | *ppos = *ppos + writelen; | ||
255 | mlog(0, "wrote %zd bytes\n", writelen); | ||
256 | return writelen; | ||
257 | } | ||
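/*
 * The read/write paths above expose the lock value block (LVB) as
 * the file's contents -- dlmfs_get_inode() below sets i_size to
 * DLM_LVB_LEN, so transfers are clamped to that many bytes. A hedged
 * userspace sketch (path illustrative; the buffer is sized to
 * DLM_LVB_LEN, assumed to be 64 bytes here):
 *
 *	char lvb[64];
 *	fd = open("/dlm/mydomain/mylock", O_WRONLY);	// EX lock
 *	write(fd, lvb, sizeof(lvb));	// publish the LVB
 *	close(fd);			// drop the lock
 */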
258 | |||
259 | static void dlmfs_init_once(void *foo, | ||
260 | kmem_cache_t *cachep, | ||
261 | unsigned long flags) | ||
262 | { | ||
263 | struct dlmfs_inode_private *ip = | ||
264 | (struct dlmfs_inode_private *) foo; | ||
265 | |||
266 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | ||
267 | SLAB_CTOR_CONSTRUCTOR) { | ||
268 | ip->ip_dlm = NULL; | ||
269 | ip->ip_parent = NULL; | ||
270 | |||
271 | inode_init_once(&ip->ip_vfs_inode); | ||
272 | } | ||
273 | } | ||
274 | |||
275 | static struct inode *dlmfs_alloc_inode(struct super_block *sb) | ||
276 | { | ||
277 | struct dlmfs_inode_private *ip; | ||
278 | |||
279 | ip = kmem_cache_alloc(dlmfs_inode_cache, SLAB_NOFS); | ||
280 | if (!ip) | ||
281 | return NULL; | ||
282 | |||
283 | return &ip->ip_vfs_inode; | ||
284 | } | ||
285 | |||
286 | static void dlmfs_destroy_inode(struct inode *inode) | ||
287 | { | ||
288 | kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); | ||
289 | } | ||
290 | |||
291 | static void dlmfs_clear_inode(struct inode *inode) | ||
292 | { | ||
293 | int status; | ||
294 | struct dlmfs_inode_private *ip; | ||
295 | |||
296 | if (!inode) | ||
297 | return; | ||
298 | |||
299 | mlog(0, "inode %lu\n", inode->i_ino); | ||
300 | |||
301 | ip = DLMFS_I(inode); | ||
302 | |||
303 | if (S_ISREG(inode->i_mode)) { | ||
304 | status = user_dlm_destroy_lock(&ip->ip_lockres); | ||
305 | if (status < 0) | ||
306 | mlog_errno(status); | ||
307 | iput(ip->ip_parent); | ||
308 | goto clear_fields; | ||
309 | } | ||
310 | |||
311 | mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); | ||
312 | /* we must be a directory. If required, let's unregister the | ||
313 | * dlm context now. */ | ||
314 | if (ip->ip_dlm) | ||
315 | user_dlm_unregister_context(ip->ip_dlm); | ||
316 | clear_fields: | ||
317 | ip->ip_parent = NULL; | ||
318 | ip->ip_dlm = NULL; | ||
319 | } | ||
320 | |||
321 | static struct backing_dev_info dlmfs_backing_dev_info = { | ||
322 | .ra_pages = 0, /* No readahead */ | ||
323 | .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, | ||
324 | }; | ||
325 | |||
326 | static struct inode *dlmfs_get_root_inode(struct super_block *sb) | ||
327 | { | ||
328 | struct inode *inode = new_inode(sb); | ||
329 | int mode = S_IFDIR | 0755; | ||
330 | struct dlmfs_inode_private *ip; | ||
331 | |||
332 | if (inode) { | ||
333 | ip = DLMFS_I(inode); | ||
334 | |||
335 | inode->i_mode = mode; | ||
336 | inode->i_uid = current->fsuid; | ||
337 | inode->i_gid = current->fsgid; | ||
338 | inode->i_blksize = PAGE_CACHE_SIZE; | ||
339 | inode->i_blocks = 0; | ||
340 | inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; | ||
341 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
342 | inode->i_nlink++; | ||
343 | |||
344 | inode->i_fop = &simple_dir_operations; | ||
345 | inode->i_op = &dlmfs_root_inode_operations; | ||
346 | } | ||
347 | |||
348 | return inode; | ||
349 | } | ||
350 | |||
351 | static struct inode *dlmfs_get_inode(struct inode *parent, | ||
352 | struct dentry *dentry, | ||
353 | int mode) | ||
354 | { | ||
355 | struct super_block *sb = parent->i_sb; | ||
356 | struct inode * inode = new_inode(sb); | ||
357 | struct dlmfs_inode_private *ip; | ||
358 | |||
359 | if (!inode) | ||
360 | return NULL; | ||
361 | |||
362 | inode->i_mode = mode; | ||
363 | inode->i_uid = current->fsuid; | ||
364 | inode->i_gid = current->fsgid; | ||
365 | inode->i_blksize = PAGE_CACHE_SIZE; | ||
366 | inode->i_blocks = 0; | ||
367 | inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; | ||
368 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
369 | |||
370 | ip = DLMFS_I(inode); | ||
371 | ip->ip_dlm = DLMFS_I(parent)->ip_dlm; | ||
372 | |||
373 | switch (mode & S_IFMT) { | ||
374 | default: | ||
375 | /* for now we don't support anything other than | ||
376 | * directories and regular files. */ | ||
377 | BUG(); | ||
378 | break; | ||
379 | case S_IFREG: | ||
380 | inode->i_op = &dlmfs_file_inode_operations; | ||
381 | inode->i_fop = &dlmfs_file_operations; | ||
382 | |||
383 | i_size_write(inode, DLM_LVB_LEN); | ||
384 | |||
385 | user_dlm_lock_res_init(&ip->ip_lockres, dentry); | ||
386 | |||
387 | /* released at clear_inode time, this ensures that we | ||
388 | * get to drop the dlm reference on each lock *before* | ||
389 | * we call the unregister code for releasing parent | ||
390 | * directories. */ | ||
391 | ip->ip_parent = igrab(parent); | ||
392 | BUG_ON(!ip->ip_parent); | ||
393 | break; | ||
394 | case S_IFDIR: | ||
395 | inode->i_op = &dlmfs_dir_inode_operations; | ||
396 | inode->i_fop = &simple_dir_operations; | ||
397 | |||
398 | /* directory inodes start off with i_nlink == | ||
399 | * 2 (for "." entry) */ | ||
400 | inode->i_nlink++; | ||
401 | break; | ||
402 | } | ||
403 | |||
404 | if (parent->i_mode & S_ISGID) { | ||
405 | inode->i_gid = parent->i_gid; | ||
406 | if (S_ISDIR(mode)) | ||
407 | inode->i_mode |= S_ISGID; | ||
408 | } | ||
409 | |||
410 | return inode; | ||
411 | } | ||
412 | |||
413 | /* | ||
414 | * File creation. Allocate an inode, and we're done. | ||
415 | */ | ||
416 | /* SMP-safe */ | ||
417 | static int dlmfs_mkdir(struct inode * dir, | ||
418 | struct dentry * dentry, | ||
419 | int mode) | ||
420 | { | ||
421 | int status; | ||
422 | struct inode *inode = NULL; | ||
423 | struct qstr *domain = &dentry->d_name; | ||
424 | struct dlmfs_inode_private *ip; | ||
425 | struct dlm_ctxt *dlm; | ||
426 | |||
427 | mlog(0, "mkdir %.*s\n", domain->len, domain->name); | ||
428 | |||
429 | /* verify that we have a proper domain */ | ||
430 | if (domain->len >= O2NM_MAX_NAME_LEN) { | ||
431 | status = -EINVAL; | ||
432 | mlog(ML_ERROR, "invalid domain name for directory.\n"); | ||
433 | goto bail; | ||
434 | } | ||
435 | |||
436 | inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR); | ||
437 | if (!inode) { | ||
438 | status = -ENOMEM; | ||
439 | mlog_errno(status); | ||
440 | goto bail; | ||
441 | } | ||
442 | |||
443 | ip = DLMFS_I(inode); | ||
444 | |||
445 | dlm = user_dlm_register_context(domain); | ||
446 | if (IS_ERR(dlm)) { | ||
447 | status = PTR_ERR(dlm); | ||
448 | mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", | ||
449 | status, domain->len, domain->name); | ||
450 | goto bail; | ||
451 | } | ||
452 | ip->ip_dlm = dlm; | ||
453 | |||
454 | dir->i_nlink++; | ||
455 | d_instantiate(dentry, inode); | ||
456 | dget(dentry); /* Extra count - pin the dentry in core */ | ||
457 | |||
458 | status = 0; | ||
459 | bail: | ||
460 | if (status < 0) | ||
461 | iput(inode); | ||
462 | return status; | ||
463 | } | ||
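/*
 * From userspace, `mkdir /dlm/mydomain` (mount point and name
 * illustrative) registers -- creating or joining -- the dlm domain
 * "mydomain", and rmdir(2) of the empty directory drops it again
 * via dlmfs_clear_inode() above.
 */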
464 | |||
465 | static int dlmfs_create(struct inode *dir, | ||
466 | struct dentry *dentry, | ||
467 | int mode, | ||
468 | struct nameidata *nd) | ||
469 | { | ||
470 | int status = 0; | ||
471 | struct inode *inode; | ||
472 | struct qstr *name = &dentry->d_name; | ||
473 | |||
474 | mlog(0, "create %.*s\n", name->len, name->name); | ||
475 | |||
476 | /* verify name is valid and doesn't contain any dlm reserved | ||
477 | * characters */ | ||
478 | if (name->len >= USER_DLM_LOCK_ID_MAX_LEN || | ||
479 | name->name[0] == '$') { | ||
480 | status = -EINVAL; | ||
481 | mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len, | ||
482 | name->name); | ||
483 | goto bail; | ||
484 | } | ||
485 | |||
486 | inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG); | ||
487 | if (!inode) { | ||
488 | status = -ENOMEM; | ||
489 | mlog_errno(status); | ||
490 | goto bail; | ||
491 | } | ||
492 | |||
493 | d_instantiate(dentry, inode); | ||
494 | dget(dentry); /* Extra count - pin the dentry in core */ | ||
495 | bail: | ||
496 | return status; | ||
497 | } | ||
498 | |||
499 | static int dlmfs_unlink(struct inode *dir, | ||
500 | struct dentry *dentry) | ||
501 | { | ||
502 | int status; | ||
503 | struct inode *inode = dentry->d_inode; | ||
504 | |||
505 | mlog(0, "unlink inode %lu\n", inode->i_ino); | ||
506 | |||
507 | /* if there are no current holders and none waiting | ||
508 | * to acquire a lock, this basically destroys our lockres. */ | ||
509 | status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres); | ||
510 | if (status < 0) { | ||
511 | mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n", | ||
512 | dentry->d_name.len, dentry->d_name.name, status); | ||
513 | goto bail; | ||
514 | } | ||
515 | status = simple_unlink(dir, dentry); | ||
516 | bail: | ||
517 | return status; | ||
518 | } | ||
519 | |||
520 | static int dlmfs_fill_super(struct super_block * sb, | ||
521 | void * data, | ||
522 | int silent) | ||
523 | { | ||
524 | struct inode * inode; | ||
525 | struct dentry * root; | ||
526 | |||
527 | sb->s_maxbytes = MAX_LFS_FILESIZE; | ||
528 | sb->s_blocksize = PAGE_CACHE_SIZE; | ||
529 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; | ||
530 | sb->s_magic = DLMFS_MAGIC; | ||
531 | sb->s_op = &dlmfs_ops; | ||
532 | inode = dlmfs_get_root_inode(sb); | ||
533 | if (!inode) | ||
534 | return -ENOMEM; | ||
535 | |||
536 | root = d_alloc_root(inode); | ||
537 | if (!root) { | ||
538 | iput(inode); | ||
539 | return -ENOMEM; | ||
540 | } | ||
541 | sb->s_root = root; | ||
542 | return 0; | ||
543 | } | ||
544 | |||
545 | static struct file_operations dlmfs_file_operations = { | ||
546 | .open = dlmfs_file_open, | ||
547 | .release = dlmfs_file_release, | ||
548 | .read = dlmfs_file_read, | ||
549 | .write = dlmfs_file_write, | ||
550 | }; | ||
551 | |||
552 | static struct inode_operations dlmfs_dir_inode_operations = { | ||
553 | .create = dlmfs_create, | ||
554 | .lookup = simple_lookup, | ||
555 | .unlink = dlmfs_unlink, | ||
556 | }; | ||
557 | |||
558 | /* this way we can restrict mkdir to only the toplevel of the fs. */ | ||
559 | static struct inode_operations dlmfs_root_inode_operations = { | ||
560 | .lookup = simple_lookup, | ||
561 | .mkdir = dlmfs_mkdir, | ||
562 | .rmdir = simple_rmdir, | ||
563 | }; | ||
564 | |||
565 | static struct super_operations dlmfs_ops = { | ||
566 | .statfs = simple_statfs, | ||
567 | .alloc_inode = dlmfs_alloc_inode, | ||
568 | .destroy_inode = dlmfs_destroy_inode, | ||
569 | .clear_inode = dlmfs_clear_inode, | ||
570 | .drop_inode = generic_delete_inode, | ||
571 | }; | ||
572 | |||
573 | static struct inode_operations dlmfs_file_inode_operations = { | ||
574 | .getattr = simple_getattr, | ||
575 | }; | ||
576 | |||
577 | static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type, | ||
578 | int flags, const char *dev_name, void *data) | ||
579 | { | ||
580 | return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super); | ||
581 | } | ||
582 | |||
583 | static struct file_system_type dlmfs_fs_type = { | ||
584 | .owner = THIS_MODULE, | ||
585 | .name = "ocfs2_dlmfs", | ||
586 | .get_sb = dlmfs_get_sb, | ||
587 | .kill_sb = kill_litter_super, | ||
588 | }; | ||
589 | |||
590 | static int __init init_dlmfs_fs(void) | ||
591 | { | ||
592 | int status; | ||
593 | int cleanup_inode = 0, cleanup_worker = 0; | ||
594 | |||
595 | dlmfs_print_version(); | ||
596 | |||
597 | dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", | ||
598 | sizeof(struct dlmfs_inode_private), | ||
599 | 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, | ||
600 | dlmfs_init_once, NULL); | ||
601 | if (!dlmfs_inode_cache) | ||
602 | return -ENOMEM; | ||
603 | cleanup_inode = 1; | ||
604 | |||
605 | user_dlm_worker = create_singlethread_workqueue("user_dlm"); | ||
606 | if (!user_dlm_worker) { | ||
607 | status = -ENOMEM; | ||
608 | goto bail; | ||
609 | } | ||
610 | cleanup_worker = 1; | ||
611 | |||
612 | status = register_filesystem(&dlmfs_fs_type); | ||
613 | bail: | ||
614 | if (status) { | ||
615 | if (cleanup_inode) | ||
616 | kmem_cache_destroy(dlmfs_inode_cache); | ||
617 | if (cleanup_worker) | ||
618 | destroy_workqueue(user_dlm_worker); | ||
619 | } else | ||
620 | printk(KERN_INFO "OCFS2 User DLM kernel interface loaded\n"); | ||
621 | return status; | ||
622 | } | ||
623 | |||
624 | static void __exit exit_dlmfs_fs(void) | ||
625 | { | ||
626 | unregister_filesystem(&dlmfs_fs_type); | ||
627 | |||
628 | flush_workqueue(user_dlm_worker); | ||
629 | destroy_workqueue(user_dlm_worker); | ||
630 | |||
631 | if (kmem_cache_destroy(dlmfs_inode_cache)) | ||
632 | printk(KERN_INFO "dlmfs_inode_cache: not all structures " | ||
633 | "were freed\n"); | ||
634 | } | ||
635 | |||
636 | MODULE_AUTHOR("Oracle"); | ||
637 | MODULE_LICENSE("GPL"); | ||
638 | |||
639 | module_init(init_dlmfs_fs) | ||
640 | module_exit(exit_dlmfs_fs) | ||
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c new file mode 100644 index 000000000000..d2be3ad841f9 --- /dev/null +++ b/fs/ocfs2/dlm/dlmfsver.c | |||
@@ -0,0 +1,42 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmfsver.c | ||
5 | * | ||
6 | * version string | ||
7 | * | ||
8 | * Copyright (C) 2002, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/kernel.h> | ||
28 | |||
29 | #include "dlmfsver.h" | ||
30 | |||
31 | #define DLM_BUILD_VERSION "1.3.3" | ||
32 | |||
33 | #define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION | ||
34 | |||
35 | void dlmfs_print_version(void) | ||
36 | { | ||
37 | printk(KERN_INFO "%s\n", VERSION_STR); | ||
38 | } | ||
39 | |||
40 | MODULE_DESCRIPTION(VERSION_STR); | ||
41 | |||
42 | MODULE_VERSION(DLM_BUILD_VERSION); | ||
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlm/dlmfsver.h new file mode 100644 index 000000000000..f35eadbed25c --- /dev/null +++ b/fs/ocfs2/dlm/dlmfsver.h | |||
@@ -0,0 +1,31 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmfsver.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef DLMFS_VER_H | ||
27 | #define DLMFS_VER_H | ||
28 | |||
29 | void dlmfs_print_version(void); | ||
30 | |||
31 | #endif /* DLMFS_VER_H */ | ||
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c new file mode 100644 index 000000000000..d1a0038557a3 --- /dev/null +++ b/fs/ocfs2/dlm/dlmlock.c | |||
@@ -0,0 +1,676 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmlock.c | ||
5 | * | ||
6 | * underlying calls for lock creation | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | |||
28 | #include <linux/module.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/types.h> | ||
31 | #include <linux/slab.h> | ||
32 | #include <linux/highmem.h> | ||
33 | #include <linux/utsname.h> | ||
34 | #include <linux/init.h> | ||
35 | #include <linux/sysctl.h> | ||
36 | #include <linux/random.h> | ||
37 | #include <linux/blkdev.h> | ||
38 | #include <linux/socket.h> | ||
39 | #include <linux/inet.h> | ||
40 | #include <linux/spinlock.h> | ||
41 | #include <linux/delay.h> | ||
42 | |||
43 | |||
44 | #include "cluster/heartbeat.h" | ||
45 | #include "cluster/nodemanager.h" | ||
46 | #include "cluster/tcp.h" | ||
47 | |||
48 | #include "dlmapi.h" | ||
49 | #include "dlmcommon.h" | ||
50 | |||
51 | #include "dlmconvert.h" | ||
52 | |||
53 | #define MLOG_MASK_PREFIX ML_DLM | ||
54 | #include "cluster/masklog.h" | ||
55 | |||
56 | static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED; | ||
57 | static u64 dlm_next_cookie = 1; | ||
58 | |||
59 | static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, | ||
60 | struct dlm_lock_resource *res, | ||
61 | struct dlm_lock *lock, int flags); | ||
62 | static void dlm_init_lock(struct dlm_lock *newlock, int type, | ||
63 | u8 node, u64 cookie); | ||
64 | static void dlm_lock_release(struct kref *kref); | ||
65 | static void dlm_lock_detach_lockres(struct dlm_lock *lock); | ||
66 | |||
67 | /* Tell us whether we can grant a new lock request. | ||
68 | * locking: | ||
69 | * caller needs: res->spinlock | ||
70 | * taken: none | ||
71 | * held on exit: none | ||
72 | * returns: 1 if the lock can be granted, 0 otherwise. | ||
73 | */ | ||
74 | static int dlm_can_grant_new_lock(struct dlm_lock_resource *res, | ||
75 | struct dlm_lock *lock) | ||
76 | { | ||
77 | struct list_head *iter; | ||
78 | struct dlm_lock *tmplock; | ||
79 | |||
80 | list_for_each(iter, &res->granted) { | ||
81 | tmplock = list_entry(iter, struct dlm_lock, list); | ||
82 | |||
83 | if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) | ||
84 | return 0; | ||
85 | } | ||
86 | |||
87 | list_for_each(iter, &res->converting) { | ||
88 | tmplock = list_entry(iter, struct dlm_lock, list); | ||
89 | |||
90 | if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) | ||
91 | return 0; | ||
92 | } | ||
93 | |||
94 | return 1; | ||
95 | } | ||
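
dlm_lock_compatible() itself is defined elsewhere in the DLM headers, not in this hunk. A minimal sketch of the NL/PR/EX mode-compatibility rule the two loops above rely on (placement and exact form assumed, not taken from this patch) would be:

static inline int dlm_lock_compatible(int existing, int request)
{
	/* NO_LOCK is compatible with everything */
	if (request == LKM_NLMODE || existing == LKM_NLMODE)
		return 1;

	/* an EX request conflicts with any real lock */
	if (request == LKM_EXMODE)
		return 0;

	/* request is PR: compatible only with an existing PR */
	if (existing == LKM_PRMODE)
		return 1;

	return 0;
}
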
96 | |||
97 | /* performs lock creation at the lockres master site | ||
98 | * locking: | ||
99 | * caller needs: none | ||
100 | * taken: takes and drops res->spinlock | ||
101 | * held on exit: none | ||
102 | * returns: DLM_NORMAL, DLM_NOTQUEUED | ||
103 | */ | ||
104 | static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, | ||
105 | struct dlm_lock_resource *res, | ||
106 | struct dlm_lock *lock, int flags) | ||
107 | { | ||
108 | int call_ast = 0, kick_thread = 0; | ||
109 | enum dlm_status status = DLM_NORMAL; | ||
110 | |||
111 | mlog_entry("type=%d\n", lock->ml.type); | ||
112 | |||
113 | spin_lock(&res->spinlock); | ||
114 | /* if called from dlm_create_lock_handler, need to | ||
115 | * ensure it will not sleep in dlm_wait_on_lockres */ | ||
116 | status = __dlm_lockres_state_to_status(res); | ||
117 | if (status != DLM_NORMAL && | ||
118 | lock->ml.node != dlm->node_num) { | ||
119 | /* erf. state changed after lock was dropped. */ | ||
120 | spin_unlock(&res->spinlock); | ||
121 | dlm_error(status); | ||
122 | return status; | ||
123 | } | ||
124 | __dlm_wait_on_lockres(res); | ||
125 | __dlm_lockres_reserve_ast(res); | ||
126 | |||
127 | if (dlm_can_grant_new_lock(res, lock)) { | ||
128 | mlog(0, "I can grant this lock right away\n"); | ||
129 | /* got it right away */ | ||
130 | lock->lksb->status = DLM_NORMAL; | ||
131 | status = DLM_NORMAL; | ||
132 | dlm_lock_get(lock); | ||
133 | list_add_tail(&lock->list, &res->granted); | ||
134 | |||
135 | /* for the recovery lock, we can't allow the ast | ||
136 | * to be queued since the dlmthread is already | ||
137 | * frozen. but the recovery lock is always locked | ||
138 | * with LKM_NOQUEUE so we do not need the ast in | ||
139 | * this special case */ | ||
140 | if (!dlm_is_recovery_lock(res->lockname.name, | ||
141 | res->lockname.len)) { | ||
142 | kick_thread = 1; | ||
143 | call_ast = 1; | ||
144 | } | ||
145 | } else { | ||
146 | /* for NOQUEUE request, unless we get the | ||
147 | * lock right away, return DLM_NOTQUEUED */ | ||
148 | if (flags & LKM_NOQUEUE) | ||
149 | status = DLM_NOTQUEUED; | ||
150 | else { | ||
151 | dlm_lock_get(lock); | ||
152 | list_add_tail(&lock->list, &res->blocked); | ||
153 | kick_thread = 1; | ||
154 | } | ||
155 | } | ||
156 | |||
157 | spin_unlock(&res->spinlock); | ||
158 | wake_up(&res->wq); | ||
159 | |||
160 | /* either queue the ast or release it */ | ||
161 | if (call_ast) | ||
162 | dlm_queue_ast(dlm, lock); | ||
163 | else | ||
164 | dlm_lockres_release_ast(dlm, res); | ||
165 | |||
166 | dlm_lockres_calc_usage(dlm, res); | ||
167 | if (kick_thread) | ||
168 | dlm_kick_thread(dlm, res); | ||
169 | |||
170 | return status; | ||
171 | } | ||
172 | |||
173 | void dlm_revert_pending_lock(struct dlm_lock_resource *res, | ||
174 | struct dlm_lock *lock) | ||
175 | { | ||
176 | /* remove from local queue if it failed */ | ||
177 | list_del_init(&lock->list); | ||
178 | lock->lksb->flags &= ~DLM_LKSB_GET_LVB; | ||
179 | } | ||
180 | |||
181 | |||
182 | /* | ||
183 | * locking: | ||
184 | * caller needs: none | ||
185 | * taken: takes and drops res->spinlock | ||
186 | * held on exit: none | ||
187 | * returns: DLM_DENIED, DLM_RECOVERING, or net status | ||
188 | */ | ||
189 | static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, | ||
190 | struct dlm_lock_resource *res, | ||
191 | struct dlm_lock *lock, int flags) | ||
192 | { | ||
193 | enum dlm_status status = DLM_DENIED; | ||
194 | |||
195 | mlog_entry("type=%d\n", lock->ml.type); | ||
196 | mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len, | ||
197 | res->lockname.name, flags); | ||
198 | |||
199 | spin_lock(&res->spinlock); | ||
200 | |||
201 | /* will exit this call with spinlock held */ | ||
202 | __dlm_wait_on_lockres(res); | ||
203 | res->state |= DLM_LOCK_RES_IN_PROGRESS; | ||
204 | |||
205 | /* add lock to local (secondary) queue */ | ||
206 | dlm_lock_get(lock); | ||
207 | list_add_tail(&lock->list, &res->blocked); | ||
208 | lock->lock_pending = 1; | ||
209 | spin_unlock(&res->spinlock); | ||
210 | |||
211 | /* spec seems to say that you will get DLM_NORMAL when the lock | ||
212 | * has been queued, meaning we need to wait for a reply here. */ | ||
213 | status = dlm_send_remote_lock_request(dlm, res, lock, flags); | ||
214 | |||
215 | spin_lock(&res->spinlock); | ||
216 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; | ||
217 | lock->lock_pending = 0; | ||
218 | if (status != DLM_NORMAL) { | ||
219 | if (status != DLM_NOTQUEUED) | ||
220 | dlm_error(status); | ||
221 | dlm_revert_pending_lock(res, lock); | ||
222 | dlm_lock_put(lock); | ||
223 | } | ||
224 | spin_unlock(&res->spinlock); | ||
225 | |||
226 | dlm_lockres_calc_usage(dlm, res); | ||
227 | |||
228 | wake_up(&res->wq); | ||
229 | return status; | ||
230 | } | ||
231 | |||
232 | |||
233 | /* for remote lock creation. | ||
234 | * locking: | ||
235 | * caller needs: none, but need res->state & DLM_LOCK_RES_IN_PROGRESS | ||
236 | * taken: none | ||
237 | * held on exit: none | ||
238 | * returns: DLM_NOLOCKMGR, or net status | ||
239 | */ | ||
240 | static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, | ||
241 | struct dlm_lock_resource *res, | ||
242 | struct dlm_lock *lock, int flags) | ||
243 | { | ||
244 | struct dlm_create_lock create; | ||
245 | int tmpret, status = 0; | ||
246 | enum dlm_status ret; | ||
247 | |||
248 | mlog_entry_void(); | ||
249 | |||
250 | memset(&create, 0, sizeof(create)); | ||
251 | create.node_idx = dlm->node_num; | ||
252 | create.requested_type = lock->ml.type; | ||
253 | create.cookie = lock->ml.cookie; | ||
254 | create.namelen = res->lockname.len; | ||
255 | create.flags = cpu_to_be32(flags); | ||
256 | memcpy(create.name, res->lockname.name, create.namelen); | ||
257 | |||
258 | tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, | ||
259 | sizeof(create), res->owner, &status); | ||
260 | if (tmpret >= 0) { | ||
261 | /* successfully sent and received */ | ||
262 | ret = status; /* this is already a dlm_status */ | ||
263 | } else { | ||
264 | mlog_errno(tmpret); | ||
265 | if (dlm_is_host_down(tmpret)) { | ||
266 | ret = DLM_RECOVERING; | ||
267 | mlog(0, "node %u died so returning DLM_RECOVERING " | ||
268 | "from lock message!\n", res->owner); | ||
269 | } else { | ||
270 | ret = dlm_err_to_dlm_status(tmpret); | ||
271 | } | ||
272 | } | ||
273 | |||
274 | return ret; | ||
275 | } | ||
276 | |||
277 | void dlm_lock_get(struct dlm_lock *lock) | ||
278 | { | ||
279 | kref_get(&lock->lock_refs); | ||
280 | } | ||
281 | |||
282 | void dlm_lock_put(struct dlm_lock *lock) | ||
283 | { | ||
284 | kref_put(&lock->lock_refs, dlm_lock_release); | ||
285 | } | ||
286 | |||
287 | static void dlm_lock_release(struct kref *kref) | ||
288 | { | ||
289 | struct dlm_lock *lock; | ||
290 | |||
291 | lock = container_of(kref, struct dlm_lock, lock_refs); | ||
292 | |||
293 | BUG_ON(!list_empty(&lock->list)); | ||
294 | BUG_ON(!list_empty(&lock->ast_list)); | ||
295 | BUG_ON(!list_empty(&lock->bast_list)); | ||
296 | BUG_ON(lock->ast_pending); | ||
297 | BUG_ON(lock->bast_pending); | ||
298 | |||
299 | dlm_lock_detach_lockres(lock); | ||
300 | |||
301 | if (lock->lksb_kernel_allocated) { | ||
302 | mlog(0, "freeing kernel-allocated lksb\n"); | ||
303 | kfree(lock->lksb); | ||
304 | } | ||
305 | kfree(lock); | ||
306 | } | ||
307 | |||
308 | /* associate a lock with its lockres, getting a ref on the lockres */ | ||
309 | void dlm_lock_attach_lockres(struct dlm_lock *lock, | ||
310 | struct dlm_lock_resource *res) | ||
311 | { | ||
312 | dlm_lockres_get(res); | ||
313 | lock->lockres = res; | ||
314 | } | ||
315 | |||
316 | /* drop ref on lockres, if there is still one associated with lock */ | ||
317 | static void dlm_lock_detach_lockres(struct dlm_lock *lock) | ||
318 | { | ||
319 | struct dlm_lock_resource *res; | ||
320 | |||
321 | res = lock->lockres; | ||
322 | if (res) { | ||
323 | lock->lockres = NULL; | ||
324 | mlog(0, "removing lock's lockres reference\n"); | ||
325 | dlm_lockres_put(res); | ||
326 | } | ||
327 | } | ||
328 | |||
329 | static void dlm_init_lock(struct dlm_lock *newlock, int type, | ||
330 | u8 node, u64 cookie) | ||
331 | { | ||
332 | INIT_LIST_HEAD(&newlock->list); | ||
333 | INIT_LIST_HEAD(&newlock->ast_list); | ||
334 | INIT_LIST_HEAD(&newlock->bast_list); | ||
335 | spin_lock_init(&newlock->spinlock); | ||
336 | newlock->ml.type = type; | ||
337 | newlock->ml.convert_type = LKM_IVMODE; | ||
338 | newlock->ml.highest_blocked = LKM_IVMODE; | ||
339 | newlock->ml.node = node; | ||
340 | newlock->ml.pad1 = 0; | ||
341 | newlock->ml.list = 0; | ||
342 | newlock->ml.flags = 0; | ||
343 | newlock->ast = NULL; | ||
344 | newlock->bast = NULL; | ||
345 | newlock->astdata = NULL; | ||
346 | newlock->ml.cookie = cpu_to_be64(cookie); | ||
347 | newlock->ast_pending = 0; | ||
348 | newlock->bast_pending = 0; | ||
349 | newlock->convert_pending = 0; | ||
350 | newlock->lock_pending = 0; | ||
351 | newlock->unlock_pending = 0; | ||
352 | newlock->cancel_pending = 0; | ||
353 | newlock->lksb_kernel_allocated = 0; | ||
354 | |||
355 | kref_init(&newlock->lock_refs); | ||
356 | } | ||
357 | |||
358 | struct dlm_lock *dlm_new_lock(int type, u8 node, u64 cookie, | ||
359 | struct dlm_lockstatus *lksb) | ||
360 | { | ||
361 | struct dlm_lock *lock; | ||
362 | int kernel_allocated = 0; | ||
363 | |||
364 | lock = kcalloc(1, sizeof(*lock), GFP_KERNEL); | ||
365 | if (!lock) | ||
366 | return NULL; | ||
367 | |||
368 | if (!lksb) { | ||
369 | /* zero memory only if kernel-allocated */ | ||
370 | lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL); | ||
371 | if (!lksb) { | ||
372 | kfree(lock); | ||
373 | return NULL; | ||
374 | } | ||
375 | kernel_allocated = 1; | ||
376 | } | ||
377 | |||
378 | dlm_init_lock(lock, type, node, cookie); | ||
379 | if (kernel_allocated) | ||
380 | lock->lksb_kernel_allocated = 1; | ||
381 | lock->lksb = lksb; | ||
382 | lksb->lockid = lock; | ||
383 | return lock; | ||
384 | } | ||
385 | |||
386 | /* handler for lock creation net message | ||
387 | * locking: | ||
388 | * caller needs: none | ||
389 | * taken: takes and drops res->spinlock | ||
390 | * held on exit: none | ||
391 | * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED | ||
392 | */ | ||
393 | int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data) | ||
394 | { | ||
395 | struct dlm_ctxt *dlm = data; | ||
396 | struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf; | ||
397 | struct dlm_lock_resource *res = NULL; | ||
398 | struct dlm_lock *newlock = NULL; | ||
399 | struct dlm_lockstatus *lksb = NULL; | ||
400 | enum dlm_status status = DLM_NORMAL; | ||
401 | char *name; | ||
402 | unsigned int namelen; | ||
403 | |||
404 | BUG_ON(!dlm); | ||
405 | |||
406 | mlog_entry_void(); | ||
407 | |||
408 | if (!dlm_grab(dlm)) | ||
409 | return DLM_REJECTED; | ||
410 | |||
411 | mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), | ||
412 | "Domain %s not fully joined!\n", dlm->name); | ||
413 | |||
414 | name = create->name; | ||
415 | namelen = create->namelen; | ||
416 | |||
417 | status = DLM_IVBUFLEN; | ||
418 | if (namelen > DLM_LOCKID_NAME_MAX) { | ||
419 | dlm_error(status); | ||
420 | goto leave; | ||
421 | } | ||
422 | |||
423 | status = DLM_SYSERR; | ||
424 | newlock = dlm_new_lock(create->requested_type, | ||
425 | create->node_idx, | ||
426 | be64_to_cpu(create->cookie), NULL); | ||
427 | if (!newlock) { | ||
428 | dlm_error(status); | ||
429 | goto leave; | ||
430 | } | ||
431 | |||
432 | lksb = newlock->lksb; | ||
433 | |||
434 | if (be32_to_cpu(create->flags) & LKM_GET_LVB) { | ||
435 | lksb->flags |= DLM_LKSB_GET_LVB; | ||
436 | mlog(0, "set DLM_LKSB_GET_LVB flag\n"); | ||
437 | } | ||
438 | |||
439 | status = DLM_IVLOCKID; | ||
440 | res = dlm_lookup_lockres(dlm, name, namelen); | ||
441 | if (!res) { | ||
442 | dlm_error(status); | ||
443 | goto leave; | ||
444 | } | ||
445 | |||
446 | spin_lock(&res->spinlock); | ||
447 | status = __dlm_lockres_state_to_status(res); | ||
448 | spin_unlock(&res->spinlock); | ||
449 | |||
450 | if (status != DLM_NORMAL) { | ||
451 | mlog(0, "lockres recovering/migrating/in-progress\n"); | ||
452 | goto leave; | ||
453 | } | ||
454 | |||
455 | dlm_lock_attach_lockres(newlock, res); | ||
456 | |||
457 | status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags)); | ||
458 | leave: | ||
459 | if (status != DLM_NORMAL) | ||
460 | if (newlock) | ||
461 | dlm_lock_put(newlock); | ||
462 | |||
463 | if (res) | ||
464 | dlm_lockres_put(res); | ||
465 | |||
466 | dlm_put(dlm); | ||
467 | |||
468 | return status; | ||
469 | } | ||
470 | |||
471 | |||
472 | /* fetch next node-local (u8 nodenum + u56 cookie) into u64 */ | ||
473 | static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie) | ||
474 | { | ||
475 | u64 tmpnode = node_num; | ||
476 | |||
477 | /* shift single byte of node num into top 8 bits */ | ||
478 | tmpnode <<= 56; | ||
479 | |||
480 | spin_lock(&dlm_cookie_lock); | ||
481 | *cookie = (dlm_next_cookie | tmpnode); | ||
482 | if (++dlm_next_cookie & 0xff00000000000000ull) { | ||
483 | mlog(0, "This node's cookie will now wrap!\n"); | ||
484 | dlm_next_cookie = 1; | ||
485 | } | ||
486 | spin_unlock(&dlm_cookie_lock); | ||
487 | } | ||
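
The cookie layout this produces is the node number in the top 8 bits and a per-node sequence number in the low 56 bits. A sketch of the inverse operation, for illustration only (dlm_split_cookie is a hypothetical name, not part of this patch):

static inline void dlm_split_cookie(u64 cookie, u8 *node, u64 *seq)
{
	/* top byte carries the node number */
	*node = (u8)(cookie >> 56);
	/* low 56 bits carry the per-node sequence */
	*seq = cookie & ((1ull << 56) - 1);
}
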
488 | |||
489 | enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode, | ||
490 | struct dlm_lockstatus *lksb, int flags, | ||
491 | const char *name, dlm_astlockfunc_t *ast, void *data, | ||
492 | dlm_bastlockfunc_t *bast) | ||
493 | { | ||
494 | enum dlm_status status; | ||
495 | struct dlm_lock_resource *res = NULL; | ||
496 | struct dlm_lock *lock = NULL; | ||
497 | int convert = 0, recovery = 0; | ||
498 | |||
499 | /* yes this function is a mess. | ||
500 | * TODO: clean this up. lots of common code in the | ||
501 | * lock and convert paths, especially in the retry blocks */ | ||
502 | if (!lksb) { | ||
503 | dlm_error(DLM_BADARGS); | ||
504 | return DLM_BADARGS; | ||
505 | } | ||
506 | |||
507 | status = DLM_BADPARAM; | ||
508 | if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) { | ||
509 | dlm_error(status); | ||
510 | goto error; | ||
511 | } | ||
512 | |||
513 | if (flags & ~LKM_VALID_FLAGS) { | ||
514 | dlm_error(status); | ||
515 | goto error; | ||
516 | } | ||
517 | |||
518 | convert = (flags & LKM_CONVERT); | ||
519 | recovery = (flags & LKM_RECOVERY); | ||
520 | |||
521 | if (recovery && | ||
522 | (!dlm_is_recovery_lock(name, strlen(name)) || convert)) { | ||
523 | dlm_error(status); | ||
524 | goto error; | ||
525 | } | ||
526 | if (convert && (flags & LKM_LOCAL)) { | ||
527 | mlog(ML_ERROR, "strange LOCAL convert request!\n"); | ||
528 | goto error; | ||
529 | } | ||
530 | |||
531 | if (convert) { | ||
532 | /* CONVERT request */ | ||
533 | |||
534 | /* if converting, must pass in a valid dlm_lock */ | ||
535 | lock = lksb->lockid; | ||
536 | if (!lock) { | ||
537 | mlog(ML_ERROR, "NULL lock pointer in convert " | ||
538 | "request\n"); | ||
539 | goto error; | ||
540 | } | ||
541 | |||
542 | res = lock->lockres; | ||
543 | if (!res) { | ||
544 | mlog(ML_ERROR, "NULL lockres pointer in convert " | ||
545 | "request\n"); | ||
546 | goto error; | ||
547 | } | ||
548 | dlm_lockres_get(res); | ||
549 | |||
550 | /* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are | ||
551 | * static after the original lock call. convert requests will | ||
552 | * ensure that everything is the same, or return DLM_BADARGS. | ||
553 | * this means that DLM_DENIED_NOASTS will never be returned. | ||
554 | */ | ||
555 | if (lock->lksb != lksb || lock->ast != ast || | ||
556 | lock->bast != bast || lock->astdata != data) { | ||
557 | status = DLM_BADARGS; | ||
558 | mlog(ML_ERROR, "new args: lksb=%p, ast=%p, bast=%p, " | ||
559 | "astdata=%p\n", lksb, ast, bast, data); | ||
560 | mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, " | ||
561 | "astdata=%p\n", lock->lksb, lock->ast, | ||
562 | lock->bast, lock->astdata); | ||
563 | goto error; | ||
564 | } | ||
565 | retry_convert: | ||
566 | dlm_wait_for_recovery(dlm); | ||
567 | |||
568 | if (res->owner == dlm->node_num) | ||
569 | status = dlmconvert_master(dlm, res, lock, flags, mode); | ||
570 | else | ||
571 | status = dlmconvert_remote(dlm, res, lock, flags, mode); | ||
572 | if (status == DLM_RECOVERING || status == DLM_MIGRATING || | ||
573 | status == DLM_FORWARD) { | ||
574 | /* for now, see how this works without sleeping | ||
575 | * and just retry right away. I suspect the reco | ||
576 | * or migration will complete fast enough that | ||
577 | * no waiting will be necessary */ | ||
578 | mlog(0, "retrying convert with migration/recovery/" | ||
579 | "in-progress\n"); | ||
580 | msleep(100); | ||
581 | goto retry_convert; | ||
582 | } | ||
583 | } else { | ||
584 | u64 tmpcookie; | ||
585 | |||
586 | /* LOCK request */ | ||
587 | status = DLM_BADARGS; | ||
588 | if (!name) { | ||
589 | dlm_error(status); | ||
590 | goto error; | ||
591 | } | ||
592 | |||
593 | status = DLM_IVBUFLEN; | ||
594 | if (strlen(name) > DLM_LOCKID_NAME_MAX || strlen(name) < 1) { | ||
595 | dlm_error(status); | ||
596 | goto error; | ||
597 | } | ||
598 | |||
599 | dlm_get_next_cookie(dlm->node_num, &tmpcookie); | ||
600 | lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb); | ||
601 | if (!lock) { | ||
602 | dlm_error(status); | ||
603 | goto error; | ||
604 | } | ||
605 | |||
606 | if (!recovery) | ||
607 | dlm_wait_for_recovery(dlm); | ||
608 | |||
609 | /* find or create the lock resource */ | ||
610 | res = dlm_get_lock_resource(dlm, name, flags); | ||
611 | if (!res) { | ||
612 | status = DLM_IVLOCKID; | ||
613 | dlm_error(status); | ||
614 | goto error; | ||
615 | } | ||
616 | |||
617 | mlog(0, "type=%d, flags = 0x%x\n", mode, flags); | ||
618 | mlog(0, "creating lock: lock=%p res=%p\n", lock, res); | ||
619 | |||
620 | dlm_lock_attach_lockres(lock, res); | ||
621 | lock->ast = ast; | ||
622 | lock->bast = bast; | ||
623 | lock->astdata = data; | ||
624 | |||
625 | retry_lock: | ||
626 | if (flags & LKM_VALBLK) { | ||
627 | mlog(0, "LKM_VALBLK passed by caller\n"); | ||
628 | |||
629 | /* LVB requests for non PR, PW or EX locks are | ||
630 | * ignored. */ | ||
631 | if (mode < LKM_PRMODE) | ||
632 | flags &= ~LKM_VALBLK; | ||
633 | else { | ||
634 | flags |= LKM_GET_LVB; | ||
635 | lock->lksb->flags |= DLM_LKSB_GET_LVB; | ||
636 | } | ||
637 | } | ||
638 | |||
639 | if (res->owner == dlm->node_num) | ||
640 | status = dlmlock_master(dlm, res, lock, flags); | ||
641 | else | ||
642 | status = dlmlock_remote(dlm, res, lock, flags); | ||
643 | |||
644 | if (status == DLM_RECOVERING || status == DLM_MIGRATING || | ||
645 | status == DLM_FORWARD) { | ||
646 | mlog(0, "retrying lock with migration/" | ||
647 | "recovery/in progress\n"); | ||
648 | msleep(100); | ||
649 | dlm_wait_for_recovery(dlm); | ||
650 | goto retry_lock; | ||
651 | } | ||
652 | |||
653 | if (status != DLM_NORMAL) { | ||
654 | lock->lksb->flags &= ~DLM_LKSB_GET_LVB; | ||
655 | if (status != DLM_NOTQUEUED) | ||
656 | dlm_error(status); | ||
657 | goto error; | ||
658 | } | ||
659 | } | ||
660 | |||
661 | error: | ||
662 | if (status != DLM_NORMAL) { | ||
663 | if (lock && !convert) | ||
664 | dlm_lock_put(lock); | ||
665 | /* this is kind of unnecessary */ | ||
666 | lksb->status = status; | ||
667 | } | ||
668 | |||
669 | /* put lockres ref from the convert path | ||
670 | * or from dlm_get_lock_resource */ | ||
671 | if (res) | ||
672 | dlm_lockres_put(res); | ||
673 | |||
674 | return status; | ||
675 | } | ||
676 | EXPORT_SYMBOL_GPL(dlmlock); | ||
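
dlmlock() is the entry point the rest of the stack calls into. A minimal caller sketch, assuming the dlm_astlockfunc_t (void *) and dlm_bastlockfunc_t (void *, int) signatures from dlmapi.h; the names and callback bodies below are illustrative, not part of this patch:

static void example_ast(void *astdata)
{
	/* runs when the lock is granted; lksb->status has the result */
}

static void example_bast(void *astdata, int blocked_type)
{
	/* another node wants an incompatible mode; drop the lock soon */
}

static enum dlm_status example_trylock(struct dlm_ctxt *dlm,
				       struct dlm_lockstatus *lksb,
				       void *priv)
{
	/* LKM_NOQUEUE: return DLM_NOTQUEUED instead of blocking */
	return dlmlock(dlm, LKM_EXMODE, lksb, LKM_NOQUEUE,
		       "example_lock", example_ast, priv, example_bast);
}
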
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c new file mode 100644 index 000000000000..27e984f7e4cd --- /dev/null +++ b/fs/ocfs2/dlm/dlmmaster.c | |||
@@ -0,0 +1,2664 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmmaster.c | ||
5 | * | ||
6 | * lock resource mastering and master list entry handling | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | |||
28 | #include <linux/module.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/types.h> | ||
31 | #include <linux/slab.h> | ||
32 | #include <linux/highmem.h> | ||
33 | #include <linux/utsname.h> | ||
34 | #include <linux/init.h> | ||
35 | #include <linux/sysctl.h> | ||
36 | #include <linux/random.h> | ||
37 | #include <linux/blkdev.h> | ||
38 | #include <linux/socket.h> | ||
39 | #include <linux/inet.h> | ||
40 | #include <linux/spinlock.h> | ||
41 | #include <linux/delay.h> | ||
42 | |||
43 | |||
44 | #include "cluster/heartbeat.h" | ||
45 | #include "cluster/nodemanager.h" | ||
46 | #include "cluster/tcp.h" | ||
47 | |||
48 | #include "dlmapi.h" | ||
49 | #include "dlmcommon.h" | ||
50 | #include "dlmdebug.h" | ||
51 | #include "dlmdomain.h" | ||
52 | |||
53 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) | ||
54 | #include "cluster/masklog.h" | ||
55 | |||
56 | enum dlm_mle_type { | ||
57 | DLM_MLE_BLOCK, | ||
58 | DLM_MLE_MASTER, | ||
59 | DLM_MLE_MIGRATION | ||
60 | }; | ||
61 | |||
62 | struct dlm_lock_name | ||
63 | { | ||
64 | u8 len; | ||
65 | u8 name[DLM_LOCKID_NAME_MAX]; | ||
66 | }; | ||
67 | |||
68 | struct dlm_master_list_entry | ||
69 | { | ||
70 | struct list_head list; | ||
71 | struct list_head hb_events; | ||
72 | struct dlm_ctxt *dlm; | ||
73 | spinlock_t spinlock; | ||
74 | wait_queue_head_t wq; | ||
75 | atomic_t woken; | ||
76 | struct kref mle_refs; | ||
77 | unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; /* nodes that may be trying to master this lock */ | ||
78 | unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; /* nodes we expect a master response from */ | ||
79 | unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; /* nodes that have responded so far */ | ||
80 | unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; /* current view of live domain nodes */ | ||
81 | u8 master; | ||
82 | u8 new_master; | ||
83 | enum dlm_mle_type type; | ||
84 | struct o2hb_callback_func mle_hb_up; | ||
85 | struct o2hb_callback_func mle_hb_down; | ||
86 | union { | ||
87 | struct dlm_lock_resource *res; | ||
88 | struct dlm_lock_name name; | ||
89 | } u; | ||
90 | }; | ||
91 | |||
92 | static void dlm_mle_node_down(struct dlm_ctxt *dlm, | ||
93 | struct dlm_master_list_entry *mle, | ||
94 | struct o2nm_node *node, | ||
95 | int idx); | ||
96 | static void dlm_mle_node_up(struct dlm_ctxt *dlm, | ||
97 | struct dlm_master_list_entry *mle, | ||
98 | struct o2nm_node *node, | ||
99 | int idx); | ||
100 | |||
101 | static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); | ||
102 | static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, | ||
103 | unsigned int namelen, void *nodemap, | ||
104 | u32 flags); | ||
105 | |||
106 | static inline int dlm_mle_equal(struct dlm_ctxt *dlm, | ||
107 | struct dlm_master_list_entry *mle, | ||
108 | const char *name, | ||
109 | unsigned int namelen) | ||
110 | { | ||
111 | struct dlm_lock_resource *res; | ||
112 | |||
113 | if (dlm != mle->dlm) | ||
114 | return 0; | ||
115 | |||
116 | if (mle->type == DLM_MLE_BLOCK || | ||
117 | mle->type == DLM_MLE_MIGRATION) { | ||
118 | if (namelen != mle->u.name.len || | ||
119 | memcmp(name, mle->u.name.name, namelen)!=0) | ||
120 | return 0; | ||
121 | } else { | ||
122 | res = mle->u.res; | ||
123 | if (namelen != res->lockname.len || | ||
124 | memcmp(res->lockname.name, name, namelen) != 0) | ||
125 | return 0; | ||
126 | } | ||
127 | return 1; | ||
128 | } | ||
129 | |||
130 | #if 0 | ||
131 | /* Code here is included but compiled out as it aids debugging */ | ||
132 | |||
133 | void dlm_print_one_mle(struct dlm_master_list_entry *mle) | ||
134 | { | ||
135 | int i = 0, refs; | ||
136 | char *type; | ||
137 | char attached; | ||
138 | u8 master; | ||
139 | unsigned int namelen; | ||
140 | const char *name; | ||
141 | struct kref *k; | ||
142 | |||
143 | k = &mle->mle_refs; | ||
144 | if (mle->type == DLM_MLE_BLOCK) | ||
145 | type = "BLK"; | ||
146 | else if (mle->type == DLM_MLE_MASTER) | ||
147 | type = "MAS"; | ||
148 | else | ||
149 | type = "MIG"; | ||
150 | refs = atomic_read(&k->refcount); | ||
151 | master = mle->master; | ||
152 | attached = (list_empty(&mle->hb_events) ? 'N' : 'Y'); | ||
153 | |||
154 | if (mle->type != DLM_MLE_MASTER) { | ||
155 | namelen = mle->u.name.len; | ||
156 | name = mle->u.name.name; | ||
157 | } else { | ||
158 | namelen = mle->u.res->lockname.len; | ||
159 | name = mle->u.res->lockname.name; | ||
160 | } | ||
161 | |||
162 | mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n", | ||
163 | i, type, refs, master, mle->new_master, attached, | ||
164 | namelen, namelen, name); | ||
165 | } | ||
166 | |||
167 | static void dlm_dump_mles(struct dlm_ctxt *dlm) | ||
168 | { | ||
169 | struct dlm_master_list_entry *mle; | ||
170 | struct list_head *iter; | ||
171 | |||
172 | mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); | ||
173 | mlog(ML_NOTICE, " ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n"); | ||
174 | spin_lock(&dlm->master_lock); | ||
175 | list_for_each(iter, &dlm->master_list) { | ||
176 | mle = list_entry(iter, struct dlm_master_list_entry, list); | ||
177 | dlm_print_one_mle(mle); | ||
178 | } | ||
179 | spin_unlock(&dlm->master_lock); | ||
180 | } | ||
181 | |||
182 | int dlm_dump_all_mles(const char __user *data, unsigned int len) | ||
183 | { | ||
184 | struct list_head *iter; | ||
185 | struct dlm_ctxt *dlm; | ||
186 | |||
187 | spin_lock(&dlm_domain_lock); | ||
188 | list_for_each(iter, &dlm_domains) { | ||
189 | dlm = list_entry (iter, struct dlm_ctxt, list); | ||
190 | mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name); | ||
191 | dlm_dump_mles(dlm); | ||
192 | } | ||
193 | spin_unlock(&dlm_domain_lock); | ||
194 | return len; | ||
195 | } | ||
196 | EXPORT_SYMBOL_GPL(dlm_dump_all_mles); | ||
197 | |||
198 | #endif /* 0 */ | ||
199 | |||
200 | |||
201 | static kmem_cache_t *dlm_mle_cache = NULL; | ||
202 | |||
203 | |||
204 | static void dlm_mle_release(struct kref *kref); | ||
205 | static void dlm_init_mle(struct dlm_master_list_entry *mle, | ||
206 | enum dlm_mle_type type, | ||
207 | struct dlm_ctxt *dlm, | ||
208 | struct dlm_lock_resource *res, | ||
209 | const char *name, | ||
210 | unsigned int namelen); | ||
211 | static void dlm_put_mle(struct dlm_master_list_entry *mle); | ||
212 | static void __dlm_put_mle(struct dlm_master_list_entry *mle); | ||
213 | static int dlm_find_mle(struct dlm_ctxt *dlm, | ||
214 | struct dlm_master_list_entry **mle, | ||
215 | char *name, unsigned int namelen); | ||
216 | |||
217 | static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to); | ||
218 | |||
219 | |||
220 | static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, | ||
221 | struct dlm_lock_resource *res, | ||
222 | struct dlm_master_list_entry *mle, | ||
223 | int *blocked); | ||
224 | static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, | ||
225 | struct dlm_lock_resource *res, | ||
226 | struct dlm_master_list_entry *mle, | ||
227 | int blocked); | ||
228 | static int dlm_add_migration_mle(struct dlm_ctxt *dlm, | ||
229 | struct dlm_lock_resource *res, | ||
230 | struct dlm_master_list_entry *mle, | ||
231 | struct dlm_master_list_entry **oldmle, | ||
232 | const char *name, unsigned int namelen, | ||
233 | u8 new_master, u8 master); | ||
234 | |||
235 | static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, | ||
236 | struct dlm_lock_resource *res); | ||
237 | static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, | ||
238 | struct dlm_lock_resource *res); | ||
239 | static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, | ||
240 | struct dlm_lock_resource *res, | ||
241 | u8 target); | ||
242 | |||
243 | |||
244 | int dlm_is_host_down(int errno) | ||
245 | { | ||
246 | switch (errno) { | ||
247 | case -EBADF: | ||
248 | case -ECONNREFUSED: | ||
249 | case -ENOTCONN: | ||
250 | case -ECONNRESET: | ||
251 | case -EPIPE: | ||
252 | case -EHOSTDOWN: | ||
253 | case -EHOSTUNREACH: | ||
254 | case -ETIMEDOUT: | ||
255 | case -ECONNABORTED: | ||
256 | case -ENETDOWN: | ||
257 | case -ENETUNREACH: | ||
258 | case -ENETRESET: | ||
259 | case -ESHUTDOWN: | ||
260 | case -ENOPROTOOPT: | ||
261 | case -EINVAL: /* if returned from our tcp code, | ||
262 | this means there is no socket */ | ||
263 | return 1; | ||
264 | } | ||
265 | return 0; | ||
266 | } | ||
267 | |||
268 | |||
269 | /* | ||
270 | * MASTER LIST FUNCTIONS | ||
271 | */ | ||
272 | |||
273 | |||
274 | /* | ||
275 | * regarding master list entries and heartbeat callbacks: | ||
276 | * | ||
277 | * in order to avoid sleeping and allocation that occurs in | ||
278 | * heartbeat, master list entries are simply attached to the | ||
279 | * dlm's established heartbeat callbacks. the mle is attached | ||
280 | * when it is created, and since the dlm->spinlock is held at | ||
281 | * that time, any heartbeat event will be properly discovered | ||
282 | * by the mle. the mle needs to be detached from the | ||
283 | * dlm->mle_hb_events list as soon as heartbeat events are no | ||
284 | * longer useful to the mle, and before the mle is freed. | ||
285 | * | ||
286 | * as a general rule, heartbeat events are no longer needed by | ||
287 | * the mle once an "answer" regarding the lock master has been | ||
288 | * received. | ||
289 | */ | ||
290 | static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm, | ||
291 | struct dlm_master_list_entry *mle) | ||
292 | { | ||
293 | assert_spin_locked(&dlm->spinlock); | ||
294 | |||
295 | list_add_tail(&mle->hb_events, &dlm->mle_hb_events); | ||
296 | } | ||
297 | |||
298 | |||
299 | static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, | ||
300 | struct dlm_master_list_entry *mle) | ||
301 | { | ||
302 | if (!list_empty(&mle->hb_events)) | ||
303 | list_del_init(&mle->hb_events); | ||
304 | } | ||
305 | |||
306 | |||
307 | static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, | ||
308 | struct dlm_master_list_entry *mle) | ||
309 | { | ||
310 | spin_lock(&dlm->spinlock); | ||
311 | __dlm_mle_detach_hb_events(dlm, mle); | ||
312 | spin_unlock(&dlm->spinlock); | ||
313 | } | ||
314 | |||
315 | /* remove from list and free */ | ||
316 | static void __dlm_put_mle(struct dlm_master_list_entry *mle) | ||
317 | { | ||
318 | struct dlm_ctxt *dlm; | ||
319 | dlm = mle->dlm; | ||
320 | |||
321 | assert_spin_locked(&dlm->spinlock); | ||
322 | assert_spin_locked(&dlm->master_lock); | ||
323 | BUG_ON(!atomic_read(&mle->mle_refs.refcount)); | ||
324 | |||
325 | kref_put(&mle->mle_refs, dlm_mle_release); | ||
326 | } | ||
327 | |||
328 | |||
329 | /* must not have any spinlocks coming in */ | ||
330 | static void dlm_put_mle(struct dlm_master_list_entry *mle) | ||
331 | { | ||
332 | struct dlm_ctxt *dlm; | ||
333 | dlm = mle->dlm; | ||
334 | |||
335 | spin_lock(&dlm->spinlock); | ||
336 | spin_lock(&dlm->master_lock); | ||
337 | __dlm_put_mle(mle); | ||
338 | spin_unlock(&dlm->master_lock); | ||
339 | spin_unlock(&dlm->spinlock); | ||
340 | } | ||
341 | |||
342 | static inline void dlm_get_mle(struct dlm_master_list_entry *mle) | ||
343 | { | ||
344 | kref_get(&mle->mle_refs); | ||
345 | } | ||
346 | |||
347 | static void dlm_init_mle(struct dlm_master_list_entry *mle, | ||
348 | enum dlm_mle_type type, | ||
349 | struct dlm_ctxt *dlm, | ||
350 | struct dlm_lock_resource *res, | ||
351 | const char *name, | ||
352 | unsigned int namelen) | ||
353 | { | ||
354 | assert_spin_locked(&dlm->spinlock); | ||
355 | |||
356 | mle->dlm = dlm; | ||
357 | mle->type = type; | ||
358 | INIT_LIST_HEAD(&mle->list); | ||
359 | INIT_LIST_HEAD(&mle->hb_events); | ||
360 | memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); | ||
361 | spin_lock_init(&mle->spinlock); | ||
362 | init_waitqueue_head(&mle->wq); | ||
363 | atomic_set(&mle->woken, 0); | ||
364 | kref_init(&mle->mle_refs); | ||
365 | memset(mle->response_map, 0, sizeof(mle->response_map)); | ||
366 | mle->master = O2NM_MAX_NODES; | ||
367 | mle->new_master = O2NM_MAX_NODES; | ||
368 | |||
369 | if (mle->type == DLM_MLE_MASTER) { | ||
370 | BUG_ON(!res); | ||
371 | mle->u.res = res; | ||
372 | } else if (mle->type == DLM_MLE_BLOCK) { | ||
373 | BUG_ON(!name); | ||
374 | memcpy(mle->u.name.name, name, namelen); | ||
375 | mle->u.name.len = namelen; | ||
376 | } else /* DLM_MLE_MIGRATION */ { | ||
377 | BUG_ON(!name); | ||
378 | memcpy(mle->u.name.name, name, namelen); | ||
379 | mle->u.name.len = namelen; | ||
380 | } | ||
381 | |||
382 | /* copy off the node_map and register hb callbacks on our copy */ | ||
383 | memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); | ||
384 | memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); | ||
385 | clear_bit(dlm->node_num, mle->vote_map); | ||
386 | clear_bit(dlm->node_num, mle->node_map); | ||
387 | |||
388 | /* attach the mle to the domain node up/down events */ | ||
389 | __dlm_mle_attach_hb_events(dlm, mle); | ||
390 | } | ||
391 | |||
392 | |||
393 | /* returns 1 if found, 0 if not */ | ||
394 | static int dlm_find_mle(struct dlm_ctxt *dlm, | ||
395 | struct dlm_master_list_entry **mle, | ||
396 | char *name, unsigned int namelen) | ||
397 | { | ||
398 | struct dlm_master_list_entry *tmpmle; | ||
399 | struct list_head *iter; | ||
400 | |||
401 | assert_spin_locked(&dlm->master_lock); | ||
402 | |||
403 | list_for_each(iter, &dlm->master_list) { | ||
404 | tmpmle = list_entry(iter, struct dlm_master_list_entry, list); | ||
405 | if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) | ||
406 | continue; | ||
407 | dlm_get_mle(tmpmle); | ||
408 | *mle = tmpmle; | ||
409 | return 1; | ||
410 | } | ||
411 | return 0; | ||
412 | } | ||
413 | |||
414 | void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) | ||
415 | { | ||
416 | struct dlm_master_list_entry *mle; | ||
417 | struct list_head *iter; | ||
418 | |||
419 | assert_spin_locked(&dlm->spinlock); | ||
420 | |||
421 | list_for_each(iter, &dlm->mle_hb_events) { | ||
422 | mle = list_entry(iter, struct dlm_master_list_entry, | ||
423 | hb_events); | ||
424 | if (node_up) | ||
425 | dlm_mle_node_up(dlm, mle, NULL, idx); | ||
426 | else | ||
427 | dlm_mle_node_down(dlm, mle, NULL, idx); | ||
428 | } | ||
429 | } | ||
430 | |||
431 | static void dlm_mle_node_down(struct dlm_ctxt *dlm, | ||
432 | struct dlm_master_list_entry *mle, | ||
433 | struct o2nm_node *node, int idx) | ||
434 | { | ||
435 | spin_lock(&mle->spinlock); | ||
436 | |||
437 | if (!test_bit(idx, mle->node_map)) | ||
438 | mlog(0, "node %u already removed from nodemap!\n", idx); | ||
439 | else | ||
440 | clear_bit(idx, mle->node_map); | ||
441 | |||
442 | spin_unlock(&mle->spinlock); | ||
443 | } | ||
444 | |||
445 | static void dlm_mle_node_up(struct dlm_ctxt *dlm, | ||
446 | struct dlm_master_list_entry *mle, | ||
447 | struct o2nm_node *node, int idx) | ||
448 | { | ||
449 | spin_lock(&mle->spinlock); | ||
450 | |||
451 | if (test_bit(idx, mle->node_map)) | ||
452 | mlog(0, "node %u already in node map!\n", idx); | ||
453 | else | ||
454 | set_bit(idx, mle->node_map); | ||
455 | |||
456 | spin_unlock(&mle->spinlock); | ||
457 | } | ||
458 | |||
459 | |||
460 | int dlm_init_mle_cache(void) | ||
461 | { | ||
462 | dlm_mle_cache = kmem_cache_create("dlm_mle_cache", | ||
463 | sizeof(struct dlm_master_list_entry), | ||
464 | 0, SLAB_HWCACHE_ALIGN, | ||
465 | NULL, NULL); | ||
466 | if (dlm_mle_cache == NULL) | ||
467 | return -ENOMEM; | ||
468 | return 0; | ||
469 | } | ||
470 | |||
471 | void dlm_destroy_mle_cache(void) | ||
472 | { | ||
473 | if (dlm_mle_cache) | ||
474 | kmem_cache_destroy(dlm_mle_cache); | ||
475 | } | ||
476 | |||
477 | static void dlm_mle_release(struct kref *kref) | ||
478 | { | ||
479 | struct dlm_master_list_entry *mle; | ||
480 | struct dlm_ctxt *dlm; | ||
481 | |||
482 | mlog_entry_void(); | ||
483 | |||
484 | mle = container_of(kref, struct dlm_master_list_entry, mle_refs); | ||
485 | dlm = mle->dlm; | ||
486 | |||
487 | if (mle->type != DLM_MLE_MASTER) { | ||
488 | mlog(0, "calling mle_release for %.*s, type %d\n", | ||
489 | mle->u.name.len, mle->u.name.name, mle->type); | ||
490 | } else { | ||
491 | mlog(0, "calling mle_release for %.*s, type %d\n", | ||
492 | mle->u.res->lockname.len, | ||
493 | mle->u.res->lockname.name, mle->type); | ||
494 | } | ||
495 | assert_spin_locked(&dlm->spinlock); | ||
496 | assert_spin_locked(&dlm->master_lock); | ||
497 | |||
498 | /* remove from list if not already */ | ||
499 | if (!list_empty(&mle->list)) | ||
500 | list_del_init(&mle->list); | ||
501 | |||
502 | /* detach the mle from the domain node up/down events */ | ||
503 | __dlm_mle_detach_hb_events(dlm, mle); | ||
504 | |||
505 | /* NOTE: kfree under spinlock here. | ||
506 | * if this is bad, we can move this to a freelist. */ | ||
507 | kmem_cache_free(dlm_mle_cache, mle); | ||
508 | } | ||
509 | |||
510 | |||
511 | /* | ||
512 | * LOCK RESOURCE FUNCTIONS | ||
513 | */ | ||
514 | |||
515 | static void dlm_set_lockres_owner(struct dlm_ctxt *dlm, | ||
516 | struct dlm_lock_resource *res, | ||
517 | u8 owner) | ||
518 | { | ||
519 | assert_spin_locked(&res->spinlock); | ||
520 | |||
521 | mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner); | ||
522 | |||
523 | if (owner == dlm->node_num) | ||
524 | atomic_inc(&dlm->local_resources); | ||
525 | else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN) | ||
526 | atomic_inc(&dlm->unknown_resources); | ||
527 | else | ||
528 | atomic_inc(&dlm->remote_resources); | ||
529 | |||
530 | res->owner = owner; | ||
531 | } | ||
532 | |||
533 | void dlm_change_lockres_owner(struct dlm_ctxt *dlm, | ||
534 | struct dlm_lock_resource *res, u8 owner) | ||
535 | { | ||
536 | assert_spin_locked(&res->spinlock); | ||
537 | |||
538 | if (owner == res->owner) | ||
539 | return; | ||
540 | |||
541 | if (res->owner == dlm->node_num) | ||
542 | atomic_dec(&dlm->local_resources); | ||
543 | else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) | ||
544 | atomic_dec(&dlm->unknown_resources); | ||
545 | else | ||
546 | atomic_dec(&dlm->remote_resources); | ||
547 | |||
548 | dlm_set_lockres_owner(dlm, res, owner); | ||
549 | } | ||
550 | |||
551 | |||
552 | static void dlm_lockres_release(struct kref *kref) | ||
553 | { | ||
554 | struct dlm_lock_resource *res; | ||
555 | |||
556 | res = container_of(kref, struct dlm_lock_resource, refs); | ||
557 | |||
558 | /* This should not happen -- all lockres' have a name | ||
559 | * associated with them at init time. */ | ||
560 | BUG_ON(!res->lockname.name); | ||
561 | |||
562 | mlog(0, "destroying lockres %.*s\n", res->lockname.len, | ||
563 | res->lockname.name); | ||
564 | |||
565 | /* By the time we're ready to blow this guy away, we shouldn't | ||
566 | * be on any lists. */ | ||
567 | BUG_ON(!list_empty(&res->list)); | ||
568 | BUG_ON(!list_empty(&res->granted)); | ||
569 | BUG_ON(!list_empty(&res->converting)); | ||
570 | BUG_ON(!list_empty(&res->blocked)); | ||
571 | BUG_ON(!list_empty(&res->dirty)); | ||
572 | BUG_ON(!list_empty(&res->recovering)); | ||
573 | BUG_ON(!list_empty(&res->purge)); | ||
574 | |||
575 | kfree(res->lockname.name); | ||
576 | |||
577 | kfree(res); | ||
578 | } | ||
579 | |||
580 | void dlm_lockres_get(struct dlm_lock_resource *res) | ||
581 | { | ||
582 | kref_get(&res->refs); | ||
583 | } | ||
584 | |||
585 | void dlm_lockres_put(struct dlm_lock_resource *res) | ||
586 | { | ||
587 | kref_put(&res->refs, dlm_lockres_release); | ||
588 | } | ||
589 | |||
590 | static void dlm_init_lockres(struct dlm_ctxt *dlm, | ||
591 | struct dlm_lock_resource *res, | ||
592 | const char *name, unsigned int namelen) | ||
593 | { | ||
594 | char *qname; | ||
595 | |||
596 | /* If we memset here, we lose our reference to the kmalloc'd | ||
597 | * res->lockname.name, so be sure to init every field | ||
598 | * correctly! */ | ||
599 | |||
600 | qname = (char *) res->lockname.name; | ||
601 | memcpy(qname, name, namelen); | ||
602 | |||
603 | res->lockname.len = namelen; | ||
604 | res->lockname.hash = full_name_hash(name, namelen); | ||
605 | |||
606 | init_waitqueue_head(&res->wq); | ||
607 | spin_lock_init(&res->spinlock); | ||
608 | INIT_LIST_HEAD(&res->list); | ||
609 | INIT_LIST_HEAD(&res->granted); | ||
610 | INIT_LIST_HEAD(&res->converting); | ||
611 | INIT_LIST_HEAD(&res->blocked); | ||
612 | INIT_LIST_HEAD(&res->dirty); | ||
613 | INIT_LIST_HEAD(&res->recovering); | ||
614 | INIT_LIST_HEAD(&res->purge); | ||
615 | atomic_set(&res->asts_reserved, 0); | ||
616 | res->migration_pending = 0; | ||
617 | |||
618 | kref_init(&res->refs); | ||
619 | |||
620 | /* just for consistency */ | ||
621 | spin_lock(&res->spinlock); | ||
622 | dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); | ||
623 | spin_unlock(&res->spinlock); | ||
624 | |||
625 | res->state = DLM_LOCK_RES_IN_PROGRESS; | ||
626 | |||
627 | res->last_used = 0; | ||
628 | |||
629 | memset(res->lvb, 0, DLM_LVB_LEN); | ||
630 | } | ||
631 | |||
632 | struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, | ||
633 | const char *name, | ||
634 | unsigned int namelen) | ||
635 | { | ||
636 | struct dlm_lock_resource *res; | ||
637 | |||
638 | res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); | ||
639 | if (!res) | ||
640 | return NULL; | ||
641 | |||
642 | res->lockname.name = kmalloc(namelen, GFP_KERNEL); | ||
643 | if (!res->lockname.name) { | ||
644 | kfree(res); | ||
645 | return NULL; | ||
646 | } | ||
647 | |||
648 | dlm_init_lockres(dlm, res, name, namelen); | ||
649 | return res; | ||
650 | } | ||
651 | |||
652 | /* | ||
653 | * lookup a lock resource by name. | ||
654 | * may already exist in the hashtable. | ||
655 | * lockid is null terminated | ||
656 | * | ||
657 | * if not, allocate enough for the lockres and for | ||
658 | * the temporary structure used in doing the mastering. | ||
659 | * | ||
660 | * also, do a lookup in the dlm->master_list to see | ||
661 | * if another node has begun mastering the same lock. | ||
662 | * if so, there should be a block entry in there | ||
663 | * for this name, and we should *not* attempt to master | ||
664 | * the lock here. need to wait around for that node | ||
665 | * to assert_master (or die). | ||
666 | * | ||
667 | */ | ||
668 | struct dlm_lock_resource *dlm_get_lock_resource(struct dlm_ctxt *dlm, | ||
669 | const char *lockid, | ||
670 | int flags) | ||
671 | { | ||
672 | struct dlm_lock_resource *tmpres=NULL, *res=NULL; | ||
673 | struct dlm_master_list_entry *mle = NULL; | ||
674 | struct dlm_master_list_entry *alloc_mle = NULL; | ||
675 | int blocked = 0; | ||
676 | int ret, nodenum; | ||
677 | struct dlm_node_iter iter; | ||
678 | unsigned int namelen; | ||
679 | int tries = 0; | ||
680 | |||
681 | BUG_ON(!lockid); | ||
682 | |||
683 | namelen = strlen(lockid); | ||
684 | |||
685 | mlog(0, "get lockres %s (len %d)\n", lockid, namelen); | ||
686 | |||
687 | lookup: | ||
688 | spin_lock(&dlm->spinlock); | ||
689 | tmpres = __dlm_lookup_lockres(dlm, lockid, namelen); | ||
690 | if (tmpres) { | ||
691 | spin_unlock(&dlm->spinlock); | ||
692 | mlog(0, "found in hash!\n"); | ||
693 | if (res) | ||
694 | dlm_lockres_put(res); | ||
695 | res = tmpres; | ||
696 | goto leave; | ||
697 | } | ||
698 | |||
699 | if (!res) { | ||
700 | spin_unlock(&dlm->spinlock); | ||
701 | mlog(0, "allocating a new resource\n"); | ||
702 | /* nothing found and we need to allocate one. */ | ||
703 | alloc_mle = (struct dlm_master_list_entry *) | ||
704 | kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); | ||
705 | if (!alloc_mle) | ||
706 | goto leave; | ||
707 | res = dlm_new_lockres(dlm, lockid, namelen); | ||
708 | if (!res) | ||
709 | goto leave; | ||
710 | goto lookup; | ||
711 | } | ||
712 | |||
713 | mlog(0, "no lockres found, allocated our own: %p\n", res); | ||
714 | |||
715 | if (flags & LKM_LOCAL) { | ||
716 | /* caller knows it's safe to assume it's not mastered elsewhere | ||
717 | * DONE! return right away */ | ||
718 | spin_lock(&res->spinlock); | ||
719 | dlm_change_lockres_owner(dlm, res, dlm->node_num); | ||
720 | __dlm_insert_lockres(dlm, res); | ||
721 | spin_unlock(&res->spinlock); | ||
722 | spin_unlock(&dlm->spinlock); | ||
723 | /* lockres still marked IN_PROGRESS */ | ||
724 | goto wake_waiters; | ||
725 | } | ||
726 | |||
727 | /* check master list to see if another node has started mastering it */ | ||
728 | spin_lock(&dlm->master_lock); | ||
729 | |||
730 | /* if we found a block, wait for lock to be mastered by another node */ | ||
731 | blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); | ||
732 | if (blocked) { | ||
733 | if (mle->type == DLM_MLE_MASTER) { | ||
734 | mlog(ML_ERROR, "master entry for nonexistent lock!\n"); | ||
735 | BUG(); | ||
736 | } else if (mle->type == DLM_MLE_MIGRATION) { | ||
737 | /* migration is in progress! */ | ||
738 | /* the good news is that we now know the | ||
739 | * "current" master (mle->master). */ | ||
740 | |||
741 | spin_unlock(&dlm->master_lock); | ||
742 | assert_spin_locked(&dlm->spinlock); | ||
743 | |||
744 | /* set the lockres owner and hash it */ | ||
745 | spin_lock(&res->spinlock); | ||
746 | dlm_set_lockres_owner(dlm, res, mle->master); | ||
747 | __dlm_insert_lockres(dlm, res); | ||
748 | spin_unlock(&res->spinlock); | ||
749 | spin_unlock(&dlm->spinlock); | ||
750 | |||
751 | /* master is known, detach */ | ||
752 | dlm_mle_detach_hb_events(dlm, mle); | ||
753 | dlm_put_mle(mle); | ||
754 | mle = NULL; | ||
755 | goto wake_waiters; | ||
756 | } | ||
757 | } else { | ||
758 | /* go ahead and try to master lock on this node */ | ||
759 | mle = alloc_mle; | ||
760 | /* make sure this does not get freed below */ | ||
761 | alloc_mle = NULL; | ||
762 | dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); | ||
763 | set_bit(dlm->node_num, mle->maybe_map); | ||
764 | list_add(&mle->list, &dlm->master_list); | ||
765 | } | ||
766 | |||
767 | /* at this point there is either a DLM_MLE_BLOCK or a | ||
768 | * DLM_MLE_MASTER on the master list, so it's safe to add the | ||
769 | * lockres to the hashtable. anyone who finds the lock will | ||
770 | * still have to wait on the IN_PROGRESS. */ | ||
771 | |||
772 | /* finally add the lockres to its hash bucket */ | ||
773 | __dlm_insert_lockres(dlm, res); | ||
774 | /* get an extra ref on the mle in case this is a BLOCK | ||
775 | * if so, the creator of the BLOCK may try to put the last | ||
776 | * ref at this time in the assert master handler, so we | ||
777 | * need an extra one to keep from a bad ptr deref. */ | ||
778 | dlm_get_mle(mle); | ||
779 | spin_unlock(&dlm->master_lock); | ||
780 | spin_unlock(&dlm->spinlock); | ||
781 | |||
782 | /* must wait for lock to be mastered elsewhere */ | ||
783 | if (blocked) | ||
784 | goto wait; | ||
785 | |||
786 | redo_request: | ||
787 | ret = -EINVAL; | ||
788 | dlm_node_iter_init(mle->vote_map, &iter); | ||
789 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | ||
790 | ret = dlm_do_master_request(mle, nodenum); | ||
791 | if (ret < 0) | ||
792 | mlog_errno(ret); | ||
793 | if (mle->master != O2NM_MAX_NODES) { | ||
794 | /* found a master ! */ | ||
795 | break; | ||
796 | } | ||
797 | } | ||
798 | |||
799 | wait: | ||
800 | /* keep going until the response map includes all nodes */ | ||
801 | ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); | ||
802 | if (ret < 0) { | ||
803 | mlog(0, "%s:%.*s: node map changed, redo the " | ||
804 | "master request now, blocked=%d\n", | ||
805 | dlm->name, res->lockname.len, | ||
806 | res->lockname.name, blocked); | ||
807 | if (++tries > 20) { | ||
808 | mlog(ML_ERROR, "%s:%.*s: spinning on " | ||
809 | "dlm_wait_for_lock_mastery, blocked=%d\n", | ||
810 | dlm->name, res->lockname.len, | ||
811 | res->lockname.name, blocked); | ||
812 | dlm_print_one_lock_resource(res); | ||
813 | /* dlm_print_one_mle(mle); */ | ||
814 | tries = 0; | ||
815 | } | ||
816 | goto redo_request; | ||
817 | } | ||
818 | |||
819 | mlog(0, "lockres mastered by %u\n", res->owner); | ||
820 | /* make sure we never continue without this */ | ||
821 | BUG_ON(res->owner == O2NM_MAX_NODES); | ||
822 | |||
823 | /* master is known, detach if not already detached */ | ||
824 | dlm_mle_detach_hb_events(dlm, mle); | ||
825 | dlm_put_mle(mle); | ||
826 | /* put the extra ref */ | ||
827 | dlm_put_mle(mle); | ||
828 | |||
829 | wake_waiters: | ||
830 | spin_lock(&res->spinlock); | ||
831 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; | ||
832 | spin_unlock(&res->spinlock); | ||
833 | wake_up(&res->wq); | ||
834 | |||
835 | leave: | ||
836 | /* need to free the unused mle */ | ||
837 | if (alloc_mle) | ||
838 | kmem_cache_free(dlm_mle_cache, alloc_mle); | ||
839 | |||
840 | return res; | ||
841 | } | ||
842 | |||
843 | |||
844 | #define DLM_MASTERY_TIMEOUT_MS 5000 | ||
845 | |||
846 | static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, | ||
847 | struct dlm_lock_resource *res, | ||
848 | struct dlm_master_list_entry *mle, | ||
849 | int *blocked) | ||
850 | { | ||
851 | u8 m; | ||
852 | int ret, bit; | ||
853 | int map_changed, voting_done; | ||
854 | int assert, sleep; | ||
855 | |||
856 | recheck: | ||
857 | ret = 0; | ||
858 | assert = 0; | ||
859 | |||
860 | /* check if another node has already become the owner */ | ||
861 | spin_lock(&res->spinlock); | ||
862 | if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
863 | spin_unlock(&res->spinlock); | ||
864 | goto leave; | ||
865 | } | ||
866 | spin_unlock(&res->spinlock); | ||
867 | |||
868 | spin_lock(&mle->spinlock); | ||
869 | m = mle->master; | ||
870 | map_changed = (memcmp(mle->vote_map, mle->node_map, | ||
871 | sizeof(mle->vote_map)) != 0); | ||
872 | voting_done = (memcmp(mle->vote_map, mle->response_map, | ||
873 | sizeof(mle->vote_map)) == 0); | ||
874 | |||
875 | /* restart if we hit any errors */ | ||
876 | if (map_changed) { | ||
877 | int b; | ||
878 | mlog(0, "%s: %.*s: node map changed, restarting\n", | ||
879 | dlm->name, res->lockname.len, res->lockname.name); | ||
880 | ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); | ||
881 | b = (mle->type == DLM_MLE_BLOCK); | ||
882 | if ((*blocked && !b) || (!*blocked && b)) { | ||
883 | mlog(0, "%s:%.*s: status change: old=%d new=%d\n", | ||
884 | dlm->name, res->lockname.len, res->lockname.name, | ||
885 | *blocked, b); | ||
886 | *blocked = b; | ||
887 | } | ||
888 | spin_unlock(&mle->spinlock); | ||
889 | if (ret < 0) { | ||
890 | mlog_errno(ret); | ||
891 | goto leave; | ||
892 | } | ||
893 | mlog(0, "%s:%.*s: restart lock mastery succeeded, " | ||
894 | "rechecking now\n", dlm->name, res->lockname.len, | ||
895 | res->lockname.name); | ||
896 | goto recheck; | ||
897 | } | ||
898 | |||
899 | if (m != O2NM_MAX_NODES) { | ||
900 | /* another node has done an assert! | ||
901 | * all done! */ | ||
902 | sleep = 0; | ||
903 | } else { | ||
904 | sleep = 1; | ||
905 | /* have all nodes responded? */ | ||
906 | if (voting_done && !*blocked) { | ||
907 | bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); | ||
908 | if (dlm->node_num <= bit) { | ||
909 | /* my node number is lowest. | ||
910 | * now tell other nodes that I am | ||
911 | * mastering this. */ | ||
912 | mle->master = dlm->node_num; | ||
913 | assert = 1; | ||
914 | sleep = 0; | ||
915 | } | ||
916 | /* if voting is done, but we have not received | ||
917 | * an assert master yet, we must sleep */ | ||
918 | } | ||
919 | } | ||
920 | |||
921 | spin_unlock(&mle->spinlock); | ||
922 | |||
923 | /* sleep if we haven't finished voting yet */ | ||
924 | if (sleep) { | ||
925 | unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); | ||
926 | |||
927 | /* | ||
928 | if (atomic_read(&mle->mle_refs.refcount) < 2) | ||
929 | mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, | ||
930 | atomic_read(&mle->mle_refs.refcount), | ||
931 | res->lockname.len, res->lockname.name); | ||
932 | */ | ||
933 | atomic_set(&mle->woken, 0); | ||
934 | (void)wait_event_timeout(mle->wq, | ||
935 | (atomic_read(&mle->woken) == 1), | ||
936 | timeo); | ||
937 | if (res->owner == O2NM_MAX_NODES) { | ||
938 | mlog(0, "waiting again\n"); | ||
939 | goto recheck; | ||
940 | } | ||
941 | mlog(0, "done waiting, master is %u\n", res->owner); | ||
942 | ret = 0; | ||
943 | goto leave; | ||
944 | } | ||
945 | |||
946 | ret = 0; /* done */ | ||
947 | if (assert) { | ||
948 | m = dlm->node_num; | ||
949 | mlog(0, "about to master %.*s here, this=%u\n", | ||
950 | res->lockname.len, res->lockname.name, m); | ||
951 | ret = dlm_do_assert_master(dlm, res->lockname.name, | ||
952 | res->lockname.len, mle->vote_map, 0); | ||
953 | if (ret) { | ||
954 | /* This is a failure in the network path, | ||
955 | * not in the response to the assert_master | ||
956 | * (any nonzero response is a BUG on this node). | ||
957 | * Most likely a socket just got disconnected | ||
958 | * due to node death. */ | ||
959 | mlog_errno(ret); | ||
960 | } | ||
961 | /* no longer need to restart lock mastery. | ||
962 | * all living nodes have been contacted. */ | ||
963 | ret = 0; | ||
964 | } | ||
965 | |||
966 | /* set the lockres owner */ | ||
967 | spin_lock(&res->spinlock); | ||
968 | dlm_change_lockres_owner(dlm, res, m); | ||
969 | spin_unlock(&res->spinlock); | ||
970 | |||
971 | leave: | ||
972 | return ret; | ||
973 | } | ||
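
One rule in the voting branch above is easy to miss: once every node in vote_map has responded and no assert_master has arrived, the lowest-numbered node still set in maybe_map becomes the master and sends the assert. A standalone restatement of that tie-break (the helper name is illustrative, not part of this patch):

/* Mirrors the "my node number is lowest" test in
 * dlm_wait_for_lock_mastery() above. */
static int dlm_node_should_assert(const unsigned long *maybe_map, u8 my_node)
{
	int bit = find_next_bit(maybe_map, O2NM_MAX_NODES, 0);

	/* true only if no lower-numbered candidate exists */
	return my_node <= bit;
}
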
974 | |||
975 | struct dlm_bitmap_diff_iter | ||
976 | { | ||
977 | int curnode; | ||
978 | unsigned long *orig_bm; | ||
979 | unsigned long *cur_bm; | ||
980 | unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
981 | }; | ||
982 | |||
983 | enum dlm_node_state_change | ||
984 | { | ||
985 | NODE_DOWN = -1, | ||
986 | NODE_NO_CHANGE = 0, | ||
987 | NODE_UP | ||
988 | }; | ||
989 | |||
990 | static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter, | ||
991 | unsigned long *orig_bm, | ||
992 | unsigned long *cur_bm) | ||
993 | { | ||
994 | unsigned long p1, p2; | ||
995 | int i; | ||
996 | |||
997 | iter->curnode = -1; | ||
998 | iter->orig_bm = orig_bm; | ||
999 | iter->cur_bm = cur_bm; | ||
1000 | |||
1001 | for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) { | ||
1002 | p1 = *(iter->orig_bm + i); | ||
1003 | p2 = *(iter->cur_bm + i); | ||
1004 | iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1); | ||
1005 | } | ||
1006 | } | ||
1007 | |||
1008 | static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter, | ||
1009 | enum dlm_node_state_change *state) | ||
1010 | { | ||
1011 | int bit; | ||
1012 | |||
1013 | if (iter->curnode >= O2NM_MAX_NODES) | ||
1014 | return -ENOENT; | ||
1015 | |||
1016 | bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES, | ||
1017 | iter->curnode+1); | ||
1018 | if (bit >= O2NM_MAX_NODES) { | ||
1019 | iter->curnode = O2NM_MAX_NODES; | ||
1020 | return -ENOENT; | ||
1021 | } | ||
1022 | |||
1023 | /* if it was there in the original then this node died */ | ||
1024 | if (test_bit(bit, iter->orig_bm)) | ||
1025 | *state = NODE_DOWN; | ||
1026 | else | ||
1027 | *state = NODE_UP; | ||
1028 | |||
1029 | iter->curnode = bit; | ||
1030 | return bit; | ||
1031 | } | ||
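
The iterator above rests on one bitwise identity: (p1 & ~p2) | (p2 & ~p1) is p1 XOR p2, the symmetric difference of the two maps, so every set bit in diff_bm is a node whose state changed, and the original map disambiguates down from up. A stand-alone sketch over single-word maps (values invented):

	#include <stdio.h>

	enum change { NODE_DOWN = -1, NODE_UP = 1 };

	static void walk_diff(unsigned long orig, unsigned long cur)
	{
		unsigned long diff = orig ^ cur; /* == (orig & ~cur) | (cur & ~orig) */

		for (int bit = 0; bit < (int)(8 * sizeof(unsigned long)); bit++) {
			if (!(diff & (1UL << bit)))
				continue;
			/* set in the original map means the node went away */
			enum change c = (orig & (1UL << bit)) ? NODE_DOWN : NODE_UP;
			printf("node %d went %s\n", bit,
			       c == NODE_DOWN ? "down" : "up");
		}
	}

	int main(void)
	{
		/* nodes 0,1,3 were voting; node 1 died and node 2 joined */
		walk_diff(0x0b, 0x0d);
		return 0;
	}
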
1032 | |||
1033 | |||
1034 | static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, | ||
1035 | struct dlm_lock_resource *res, | ||
1036 | struct dlm_master_list_entry *mle, | ||
1037 | int blocked) | ||
1038 | { | ||
1039 | struct dlm_bitmap_diff_iter bdi; | ||
1040 | enum dlm_node_state_change sc; | ||
1041 | int node; | ||
1042 | int ret = 0; | ||
1043 | |||
1044 | mlog(0, "something happened such that the " | ||
1045 | "master process may need to be restarted!\n"); | ||
1046 | |||
1047 | assert_spin_locked(&mle->spinlock); | ||
1048 | |||
1049 | dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map); | ||
1050 | node = dlm_bitmap_diff_iter_next(&bdi, &sc); | ||
1051 | while (node >= 0) { | ||
1052 | if (sc == NODE_UP) { | ||
1053 | /* a node came up. easy. might not even need | ||
1054 | * to talk to it if its node number is higher | ||
1055 | * or if we are already blocked. */ | ||
1056 | mlog(0, "node up! %d\n", node); | ||
1057 | if (blocked) | ||
1058 | goto next; | ||
1059 | |||
1060 | if (node > dlm->node_num) { | ||
1061 | mlog(0, "node > this node. skipping.\n"); | ||
1062 | goto next; | ||
1063 | } | ||
1064 | |||
1065 | /* redo the master request, but only for the new node */ | ||
1066 | mlog(0, "sending request to new node\n"); | ||
1067 | clear_bit(node, mle->response_map); | ||
1068 | set_bit(node, mle->vote_map); | ||
1069 | } else { | ||
1070 | mlog(ML_ERROR, "node down! %d\n", node); | ||
1071 | |||
1072 | /* if the node wasn't involved in mastery skip it, | ||
1073 | * but clear it out from the maps so that it will | ||
1074 | * not affect mastery of this lockres */ | ||
1075 | clear_bit(node, mle->response_map); | ||
1076 | clear_bit(node, mle->vote_map); | ||
1077 | if (!test_bit(node, mle->maybe_map)) | ||
1078 | goto next; | ||
1079 | |||
1080 | /* if we're already blocked on lock mastery, and the | ||
1081 | * dead node wasn't the expected master, or there is | ||
1082 | * another node in the maybe_map, keep waiting */ | ||
1083 | if (blocked) { | ||
1084 | int lowest = find_next_bit(mle->maybe_map, | ||
1085 | O2NM_MAX_NODES, 0); | ||
1086 | |||
1087 | /* act like it was never there */ | ||
1088 | clear_bit(node, mle->maybe_map); | ||
1089 | |||
1090 | if (node != lowest) | ||
1091 | goto next; | ||
1092 | |||
1093 | mlog(ML_ERROR, "expected master %u died while " | ||
1094 | "this node was blocked waiting on it!\n", | ||
1095 | node); | ||
1096 | lowest = find_next_bit(mle->maybe_map, | ||
1097 | O2NM_MAX_NODES, | ||
1098 | lowest+1); | ||
1099 | if (lowest < O2NM_MAX_NODES) { | ||
1100 | mlog(0, "still blocked. waiting " | ||
1101 | "on %u now\n", lowest); | ||
1102 | goto next; | ||
1103 | } | ||
1104 | |||
1105 | /* mle is an MLE_BLOCK, but there is now | ||
1106 | * nothing left to block on. we need to return | ||
1107 | * all the way back out and try again with | ||
1108 | * an MLE_MASTER. dlm_do_local_recovery_cleanup | ||
1109 | * has already run, so the mle refcount is ok */ | ||
1110 | mlog(0, "no longer blocking. we can " | ||
1111 | "try to master this here\n"); | ||
1112 | mle->type = DLM_MLE_MASTER; | ||
1113 | memset(mle->maybe_map, 0, | ||
1114 | sizeof(mle->maybe_map)); | ||
1115 | memset(mle->response_map, 0, | ||
1116 | 					sizeof(mle->response_map)); | ||
1117 | memcpy(mle->vote_map, mle->node_map, | ||
1118 | sizeof(mle->node_map)); | ||
1119 | mle->u.res = res; | ||
1120 | set_bit(dlm->node_num, mle->maybe_map); | ||
1121 | |||
1122 | ret = -EAGAIN; | ||
1123 | goto next; | ||
1124 | } | ||
1125 | |||
1126 | clear_bit(node, mle->maybe_map); | ||
1127 | if (node > dlm->node_num) | ||
1128 | goto next; | ||
1129 | |||
1130 | mlog(0, "dead node in map!\n"); | ||
1131 | /* yuck. go back and re-contact all nodes | ||
1132 | * in the vote_map, removing this node. */ | ||
1133 | memset(mle->response_map, 0, | ||
1134 | sizeof(mle->response_map)); | ||
1135 | } | ||
1136 | ret = -EAGAIN; | ||
1137 | next: | ||
1138 | node = dlm_bitmap_diff_iter_next(&bdi, &sc); | ||
1139 | } | ||
1140 | return ret; | ||
1141 | } | ||
1142 | |||
1143 | |||
1144 | /* | ||
1145 | * DLM_MASTER_REQUEST_MSG | ||
1146 | * | ||
1147 | * returns: 0 on success, | ||
1148 | * -errno on a network error | ||
1149 | * | ||
1150 | * on error, the caller should assume the target node is "dead" | ||
1151 | * | ||
1152 | */ | ||
1153 | |||
1154 | static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to) | ||
1155 | { | ||
1156 | struct dlm_ctxt *dlm = mle->dlm; | ||
1157 | struct dlm_master_request request; | ||
1158 | int ret, response=0, resend; | ||
1159 | |||
1160 | memset(&request, 0, sizeof(request)); | ||
1161 | request.node_idx = dlm->node_num; | ||
1162 | |||
1163 | BUG_ON(mle->type == DLM_MLE_MIGRATION); | ||
1164 | |||
1165 | if (mle->type != DLM_MLE_MASTER) { | ||
1166 | request.namelen = mle->u.name.len; | ||
1167 | memcpy(request.name, mle->u.name.name, request.namelen); | ||
1168 | } else { | ||
1169 | request.namelen = mle->u.res->lockname.len; | ||
1170 | memcpy(request.name, mle->u.res->lockname.name, | ||
1171 | request.namelen); | ||
1172 | } | ||
1173 | |||
1174 | again: | ||
1175 | ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, | ||
1176 | sizeof(request), to, &response); | ||
1177 | if (ret < 0) { | ||
1178 | if (ret == -ESRCH) { | ||
1179 | /* should never happen */ | ||
1180 | mlog(ML_ERROR, "TCP stack not ready!\n"); | ||
1181 | BUG(); | ||
1182 | } else if (ret == -EINVAL) { | ||
1183 | mlog(ML_ERROR, "bad args passed to o2net!\n"); | ||
1184 | BUG(); | ||
1185 | } else if (ret == -ENOMEM) { | ||
1186 | mlog(ML_ERROR, "out of memory while trying to send " | ||
1187 | "network message! retrying\n"); | ||
1188 | /* this is totally crude */ | ||
1189 | msleep(50); | ||
1190 | goto again; | ||
1191 | } else if (!dlm_is_host_down(ret)) { | ||
1192 | /* not a network error. bad. */ | ||
1193 | mlog_errno(ret); | ||
1194 | 			mlog(ML_ERROR, "unhandled error!\n"); | ||
1195 | BUG(); | ||
1196 | } | ||
1197 | /* all other errors should be network errors, | ||
1198 | * and likely indicate node death */ | ||
1199 | mlog(ML_ERROR, "link to %d went down!\n", to); | ||
1200 | goto out; | ||
1201 | } | ||
1202 | |||
1203 | ret = 0; | ||
1204 | resend = 0; | ||
1205 | spin_lock(&mle->spinlock); | ||
1206 | switch (response) { | ||
1207 | case DLM_MASTER_RESP_YES: | ||
1208 | set_bit(to, mle->response_map); | ||
1209 | mlog(0, "node %u is the master, response=YES\n", to); | ||
1210 | mle->master = to; | ||
1211 | break; | ||
1212 | case DLM_MASTER_RESP_NO: | ||
1213 | mlog(0, "node %u not master, response=NO\n", to); | ||
1214 | set_bit(to, mle->response_map); | ||
1215 | break; | ||
1216 | case DLM_MASTER_RESP_MAYBE: | ||
1217 | mlog(0, "node %u not master, response=MAYBE\n", to); | ||
1218 | set_bit(to, mle->response_map); | ||
1219 | set_bit(to, mle->maybe_map); | ||
1220 | break; | ||
1221 | case DLM_MASTER_RESP_ERROR: | ||
1222 | mlog(0, "node %u hit an error, resending\n", to); | ||
1223 | resend = 1; | ||
1224 | response = 0; | ||
1225 | break; | ||
1226 | default: | ||
1227 | mlog(ML_ERROR, "bad response! %u\n", response); | ||
1228 | BUG(); | ||
1229 | } | ||
1230 | spin_unlock(&mle->spinlock); | ||
1231 | if (resend) { | ||
1232 | /* this is also totally crude */ | ||
1233 | msleep(50); | ||
1234 | goto again; | ||
1235 | } | ||
1236 | |||
1237 | out: | ||
1238 | return ret; | ||
1239 | } | ||
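
The error handling above enforces the contract stated in the comment before the function: local and protocol errors BUG, a transient allocation failure is retried in place, and only a transport error reaches the caller, who must then treat the target as dead. The same policy as a stand-alone classifier; the positive errno values and host_down flag here are stand-ins for the kernel's negative returns and dlm_is_host_down():

	#include <errno.h>
	#include <stdio.h>

	enum action { ACT_BUG, ACT_RETRY, ACT_NODE_DEAD };

	static enum action classify_send_error(int err, int host_down)
	{
		switch (err) {
		case ESRCH:             /* transport not ready: impossible here */
		case EINVAL:            /* malformed send: caller bug */
			return ACT_BUG;
		case ENOMEM:            /* transient: back off, resend */
			return ACT_RETRY;
		default:                /* a network error iff the host is down */
			return host_down ? ACT_NODE_DEAD : ACT_BUG;
		}
	}

	int main(void)
	{
		printf("ENOMEM -> retry? %d\n",
		       classify_send_error(ENOMEM, 0) == ACT_RETRY);
		printf("ETIMEDOUT, host down -> dead? %d\n",
		       classify_send_error(ETIMEDOUT, 1) == ACT_NODE_DEAD);
		return 0;
	}
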
1240 | |||
1241 | /* | ||
1242 | * locks that can be taken here: | ||
1243 | * dlm->spinlock | ||
1244 | * res->spinlock | ||
1245 | * mle->spinlock | ||
1246 | * dlm->master_list | ||
1247 | * | ||
1248 | * if possible, TRIM THIS DOWN!!! | ||
1249 | */ | ||
1250 | int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) | ||
1251 | { | ||
1252 | u8 response = DLM_MASTER_RESP_MAYBE; | ||
1253 | struct dlm_ctxt *dlm = data; | ||
1254 | struct dlm_lock_resource *res; | ||
1255 | struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; | ||
1256 | struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; | ||
1257 | char *name; | ||
1258 | unsigned int namelen; | ||
1259 | int found, ret; | ||
1260 | int set_maybe; | ||
1261 | |||
1262 | if (!dlm_grab(dlm)) | ||
1263 | return DLM_MASTER_RESP_NO; | ||
1264 | |||
1265 | if (!dlm_domain_fully_joined(dlm)) { | ||
1266 | response = DLM_MASTER_RESP_NO; | ||
1267 | goto send_response; | ||
1268 | } | ||
1269 | |||
1270 | name = request->name; | ||
1271 | namelen = request->namelen; | ||
1272 | |||
1273 | if (namelen > DLM_LOCKID_NAME_MAX) { | ||
1274 | response = DLM_IVBUFLEN; | ||
1275 | goto send_response; | ||
1276 | } | ||
1277 | |||
1278 | way_up_top: | ||
1279 | spin_lock(&dlm->spinlock); | ||
1280 | res = __dlm_lookup_lockres(dlm, name, namelen); | ||
1281 | if (res) { | ||
1282 | spin_unlock(&dlm->spinlock); | ||
1283 | |||
1284 | /* take care of the easy cases up front */ | ||
1285 | spin_lock(&res->spinlock); | ||
1286 | if (res->state & DLM_LOCK_RES_RECOVERING) { | ||
1287 | spin_unlock(&res->spinlock); | ||
1288 | mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " | ||
1289 | "being recovered\n"); | ||
1290 | response = DLM_MASTER_RESP_ERROR; | ||
1291 | if (mle) | ||
1292 | kmem_cache_free(dlm_mle_cache, mle); | ||
1293 | goto send_response; | ||
1294 | } | ||
1295 | |||
1296 | if (res->owner == dlm->node_num) { | ||
1297 | u32 flags = DLM_ASSERT_MASTER_MLE_CLEANUP; | ||
1298 | spin_unlock(&res->spinlock); | ||
1299 | // mlog(0, "this node is the master\n"); | ||
1300 | response = DLM_MASTER_RESP_YES; | ||
1301 | if (mle) | ||
1302 | kmem_cache_free(dlm_mle_cache, mle); | ||
1303 | |||
1304 | /* this node is the owner. | ||
1305 | * there is some extra work that needs to | ||
1306 | * happen now. the requesting node has | ||
1307 | * caused all nodes up to this one to | ||
1308 | * create mles. this node now needs to | ||
1309 | * go back and clean those up. */ | ||
1310 | mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", | ||
1311 | dlm->node_num, res->lockname.len, res->lockname.name); | ||
1312 | ret = dlm_dispatch_assert_master(dlm, res, 1, | ||
1313 | request->node_idx, | ||
1314 | flags); | ||
1315 | if (ret < 0) { | ||
1316 | mlog(ML_ERROR, "failed to dispatch assert " | ||
1317 | "master work\n"); | ||
1318 | response = DLM_MASTER_RESP_ERROR; | ||
1319 | } | ||
1320 | goto send_response; | ||
1321 | } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
1322 | spin_unlock(&res->spinlock); | ||
1323 | // mlog(0, "node %u is the master\n", res->owner); | ||
1324 | response = DLM_MASTER_RESP_NO; | ||
1325 | if (mle) | ||
1326 | kmem_cache_free(dlm_mle_cache, mle); | ||
1327 | goto send_response; | ||
1328 | } | ||
1329 | |||
1330 | /* ok, there is no owner. either this node is | ||
1331 | * being blocked, or it is actively trying to | ||
1332 | * master this lock. */ | ||
1333 | if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { | ||
1334 | mlog(ML_ERROR, "lock with no owner should be " | ||
1335 | "in-progress!\n"); | ||
1336 | BUG(); | ||
1337 | } | ||
1338 | |||
1339 | // mlog(0, "lockres is in progress...\n"); | ||
1340 | spin_lock(&dlm->master_lock); | ||
1341 | found = dlm_find_mle(dlm, &tmpmle, name, namelen); | ||
1342 | if (!found) { | ||
1343 | mlog(ML_ERROR, "no mle found for this lock!\n"); | ||
1344 | BUG(); | ||
1345 | } | ||
1346 | set_maybe = 1; | ||
1347 | spin_lock(&tmpmle->spinlock); | ||
1348 | if (tmpmle->type == DLM_MLE_BLOCK) { | ||
1349 | // mlog(0, "this node is waiting for " | ||
1350 | // "lockres to be mastered\n"); | ||
1351 | response = DLM_MASTER_RESP_NO; | ||
1352 | } else if (tmpmle->type == DLM_MLE_MIGRATION) { | ||
1353 | mlog(0, "node %u is master, but trying to migrate to " | ||
1354 | "node %u.\n", tmpmle->master, tmpmle->new_master); | ||
1355 | if (tmpmle->master == dlm->node_num) { | ||
1356 | response = DLM_MASTER_RESP_YES; | ||
1357 | mlog(ML_ERROR, "no owner on lockres, but this " | ||
1358 | "node is trying to migrate it to %u?!\n", | ||
1359 | tmpmle->new_master); | ||
1360 | BUG(); | ||
1361 | } else { | ||
1362 | /* the real master can respond on its own */ | ||
1363 | response = DLM_MASTER_RESP_NO; | ||
1364 | } | ||
1365 | } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
1366 | set_maybe = 0; | ||
1367 | if (tmpmle->master == dlm->node_num) | ||
1368 | response = DLM_MASTER_RESP_YES; | ||
1369 | else | ||
1370 | response = DLM_MASTER_RESP_NO; | ||
1371 | } else { | ||
1372 | // mlog(0, "this node is attempting to " | ||
1373 | // "master lockres\n"); | ||
1374 | response = DLM_MASTER_RESP_MAYBE; | ||
1375 | } | ||
1376 | if (set_maybe) | ||
1377 | set_bit(request->node_idx, tmpmle->maybe_map); | ||
1378 | spin_unlock(&tmpmle->spinlock); | ||
1379 | |||
1380 | spin_unlock(&dlm->master_lock); | ||
1381 | spin_unlock(&res->spinlock); | ||
1382 | |||
1383 | /* keep the mle attached to heartbeat events */ | ||
1384 | dlm_put_mle(tmpmle); | ||
1385 | if (mle) | ||
1386 | kmem_cache_free(dlm_mle_cache, mle); | ||
1387 | goto send_response; | ||
1388 | } | ||
1389 | |||
1390 | /* | ||
1391 | * lockres doesn't exist on this node | ||
1392 | * if there is an MLE_BLOCK, return NO | ||
1393 | * if there is an MLE_MASTER, return MAYBE | ||
1394 | * otherwise, add an MLE_BLOCK, return NO | ||
1395 | */ | ||
1396 | spin_lock(&dlm->master_lock); | ||
1397 | found = dlm_find_mle(dlm, &tmpmle, name, namelen); | ||
1398 | if (!found) { | ||
1399 | /* this lockid has never been seen on this node yet */ | ||
1400 | // mlog(0, "no mle found\n"); | ||
1401 | if (!mle) { | ||
1402 | spin_unlock(&dlm->master_lock); | ||
1403 | spin_unlock(&dlm->spinlock); | ||
1404 | |||
1405 | mle = (struct dlm_master_list_entry *) | ||
1406 | kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); | ||
1407 | if (!mle) { | ||
1408 | // bad bad bad... this sucks. | ||
1409 | response = DLM_MASTER_RESP_ERROR; | ||
1410 | goto send_response; | ||
1411 | } | ||
1412 | spin_lock(&dlm->spinlock); | ||
1413 | dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, | ||
1414 | name, namelen); | ||
1415 | spin_unlock(&dlm->spinlock); | ||
1416 | goto way_up_top; | ||
1417 | } | ||
1418 | |||
1419 | // mlog(0, "this is second time thru, already allocated, " | ||
1420 | // "add the block.\n"); | ||
1421 | set_bit(request->node_idx, mle->maybe_map); | ||
1422 | list_add(&mle->list, &dlm->master_list); | ||
1423 | response = DLM_MASTER_RESP_NO; | ||
1424 | } else { | ||
1425 | // mlog(0, "mle was found\n"); | ||
1426 | set_maybe = 1; | ||
1427 | spin_lock(&tmpmle->spinlock); | ||
1428 | if (tmpmle->type == DLM_MLE_BLOCK) | ||
1429 | response = DLM_MASTER_RESP_NO; | ||
1430 | else if (tmpmle->type == DLM_MLE_MIGRATION) { | ||
1431 | mlog(0, "migration mle was found (%u->%u)\n", | ||
1432 | tmpmle->master, tmpmle->new_master); | ||
1433 | if (tmpmle->master == dlm->node_num) { | ||
1434 | mlog(ML_ERROR, "no lockres, but migration mle " | ||
1435 | "says that this node is master!\n"); | ||
1436 | BUG(); | ||
1437 | } | ||
1438 | /* real master can respond on its own */ | ||
1439 | response = DLM_MASTER_RESP_NO; | ||
1440 | } else { | ||
1441 | if (tmpmle->master == dlm->node_num) { | ||
1442 | response = DLM_MASTER_RESP_YES; | ||
1443 | set_maybe = 0; | ||
1444 | } else | ||
1445 | response = DLM_MASTER_RESP_MAYBE; | ||
1446 | } | ||
1447 | if (set_maybe) | ||
1448 | set_bit(request->node_idx, tmpmle->maybe_map); | ||
1449 | spin_unlock(&tmpmle->spinlock); | ||
1450 | } | ||
1451 | spin_unlock(&dlm->master_lock); | ||
1452 | spin_unlock(&dlm->spinlock); | ||
1453 | |||
1454 | if (found) { | ||
1455 | /* keep the mle attached to heartbeat events */ | ||
1456 | dlm_put_mle(tmpmle); | ||
1457 | } | ||
1458 | send_response: | ||
1459 | dlm_put(dlm); | ||
1460 | return response; | ||
1461 | } | ||
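
Stripped of locking and mle bookkeeping, the handler above answers a master request from a small decision table. A sketch of that table (the flag names are invented, and the real handler additionally special-cases migration mles and allocates a BLOCK mle when the lockres is unknown):

	#include <stdio.h>

	enum resp { RESP_NO, RESP_YES, RESP_MAYBE, RESP_ERROR };

	static enum resp pick_response(int recovering, int i_own_it,
				       int other_owns_it, int i_am_blocked,
				       int i_am_mastering)
	{
		if (recovering)
			return RESP_ERROR;      /* sender will retry */
		if (i_own_it)
			return RESP_YES;
		if (other_owns_it)
			return RESP_NO;         /* real owner answers for itself */
		if (i_am_blocked)
			return RESP_NO;
		return i_am_mastering ? RESP_MAYBE : RESP_NO;
	}

	int main(void)
	{
		/* no owner yet, and we are mid-mastery ourselves -> MAYBE */
		printf("%d\n", pick_response(0, 0, 0, 0, 1) == RESP_MAYBE);
		return 0;
	}
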
1462 | |||
1463 | /* | ||
1464 | * DLM_ASSERT_MASTER_MSG | ||
1465 | */ | ||
1466 | |||
1467 | |||
1468 | /* | ||
1469 | * NOTE: this can be used for debugging | ||
1470 | * can periodically run all locks owned by this node | ||
1471 | * and re-assert across the cluster... | ||
1472 | */ | ||
1473 | static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, | ||
1474 | unsigned int namelen, void *nodemap, | ||
1475 | u32 flags) | ||
1476 | { | ||
1477 | struct dlm_assert_master assert; | ||
1478 | int to, tmpret; | ||
1479 | struct dlm_node_iter iter; | ||
1480 | int ret = 0; | ||
1481 | |||
1482 | BUG_ON(namelen > O2NM_MAX_NAME_LEN); | ||
1483 | |||
1484 | /* note that if this nodemap is empty, it returns 0 */ | ||
1485 | dlm_node_iter_init(nodemap, &iter); | ||
1486 | while ((to = dlm_node_iter_next(&iter)) >= 0) { | ||
1487 | int r = 0; | ||
1488 | mlog(0, "sending assert master to %d (%.*s)\n", to, | ||
1489 | namelen, lockname); | ||
1490 | memset(&assert, 0, sizeof(assert)); | ||
1491 | assert.node_idx = dlm->node_num; | ||
1492 | assert.namelen = namelen; | ||
1493 | memcpy(assert.name, lockname, namelen); | ||
1494 | assert.flags = cpu_to_be32(flags); | ||
1495 | |||
1496 | tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, | ||
1497 | &assert, sizeof(assert), to, &r); | ||
1498 | if (tmpret < 0) { | ||
1499 | mlog(ML_ERROR, "assert_master returned %d!\n", tmpret); | ||
1500 | if (!dlm_is_host_down(tmpret)) { | ||
1501 | mlog(ML_ERROR, "unhandled error!\n"); | ||
1502 | BUG(); | ||
1503 | } | ||
1504 | /* a node died. finish out the rest of the nodes. */ | ||
1505 | mlog(ML_ERROR, "link to %d went down!\n", to); | ||
1506 | /* any nonzero status return will do */ | ||
1507 | ret = tmpret; | ||
1508 | } else if (r < 0) { | ||
1509 | 			/* ok, something is horribly messed up. kill thyself. */ | ||
1510 | mlog(ML_ERROR,"during assert master of %.*s to %u, " | ||
1511 | "got %d.\n", namelen, lockname, to, r); | ||
1512 | dlm_dump_lock_resources(dlm); | ||
1513 | BUG(); | ||
1514 | } | ||
1515 | } | ||
1516 | |||
1517 | return ret; | ||
1518 | } | ||
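
Note the asymmetry in the failure handling above: a transport error to one node is recorded but the broadcast continues, since the surviving nodes still need to hear the assert, while a negative response means another node disputes this node's mastery and is fatal. A sketch of the keep-going half, with a toy transport invented for the example:

	#include <stdio.h>

	/* toy transport: odd-numbered nodes are "down" */
	static int send_assert(int node) { return (node & 1) ? -1 : 0; }

	static int broadcast_assert(const int nodes[], int n)
	{
		int ret = 0;

		for (int i = 0; i < n; i++) {
			int err = send_assert(nodes[i]);
			if (err < 0)
				ret = err;      /* note the failure, keep going */
		}
		return ret;                     /* any nonzero status will do */
	}

	int main(void)
	{
		int nodes[] = { 0, 1, 2 };
		printf("broadcast returned %d\n", broadcast_assert(nodes, 3));
		return 0;
	}
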
1519 | |||
1520 | /* | ||
1521 | * locks that can be taken here: | ||
1522 | * dlm->spinlock | ||
1523 | * res->spinlock | ||
1524 | * mle->spinlock | ||
1525 | * dlm->master_list | ||
1526 | * | ||
1527 | * if possible, TRIM THIS DOWN!!! | ||
1528 | */ | ||
1529 | int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) | ||
1530 | { | ||
1531 | struct dlm_ctxt *dlm = data; | ||
1532 | struct dlm_master_list_entry *mle = NULL; | ||
1533 | struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; | ||
1534 | struct dlm_lock_resource *res = NULL; | ||
1535 | char *name; | ||
1536 | unsigned int namelen; | ||
1537 | u32 flags; | ||
1538 | |||
1539 | if (!dlm_grab(dlm)) | ||
1540 | return 0; | ||
1541 | |||
1542 | name = assert->name; | ||
1543 | namelen = assert->namelen; | ||
1544 | flags = be32_to_cpu(assert->flags); | ||
1545 | |||
1546 | if (namelen > DLM_LOCKID_NAME_MAX) { | ||
1547 | 		mlog(ML_ERROR, "Invalid name length!\n"); | ||
1548 | goto done; | ||
1549 | } | ||
1550 | |||
1551 | spin_lock(&dlm->spinlock); | ||
1552 | |||
1553 | if (flags) | ||
1554 | mlog(0, "assert_master with flags: %u\n", flags); | ||
1555 | |||
1556 | /* find the MLE */ | ||
1557 | spin_lock(&dlm->master_lock); | ||
1558 | if (!dlm_find_mle(dlm, &mle, name, namelen)) { | ||
1559 | /* not an error, could be master just re-asserting */ | ||
1560 | mlog(0, "just got an assert_master from %u, but no " | ||
1561 | "MLE for it! (%.*s)\n", assert->node_idx, | ||
1562 | namelen, name); | ||
1563 | } else { | ||
1564 | int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); | ||
1565 | if (bit >= O2NM_MAX_NODES) { | ||
1566 | /* not necessarily an error, though less likely. | ||
1567 | * could be master just re-asserting. */ | ||
1568 | mlog(ML_ERROR, "no bits set in the maybe_map, but %u " | ||
1569 | "is asserting! (%.*s)\n", assert->node_idx, | ||
1570 | namelen, name); | ||
1571 | } else if (bit != assert->node_idx) { | ||
1572 | if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { | ||
1573 | mlog(0, "master %u was found, %u should " | ||
1574 | "back off\n", assert->node_idx, bit); | ||
1575 | } else { | ||
1576 | /* with the fix for bug 569, a higher node | ||
1577 | * number winning the mastery will respond | ||
1578 | * YES to mastery requests, but this node | ||
1579 | * had no way of knowing. let it pass. */ | ||
1580 | mlog(ML_ERROR, "%u is the lowest node, " | ||
1581 | "%u is asserting. (%.*s) %u must " | ||
1582 | "have begun after %u won.\n", bit, | ||
1583 | assert->node_idx, namelen, name, bit, | ||
1584 | assert->node_idx); | ||
1585 | } | ||
1586 | } | ||
1587 | } | ||
1588 | spin_unlock(&dlm->master_lock); | ||
1589 | |||
1590 | /* ok everything checks out with the MLE | ||
1591 | * now check to see if there is a lockres */ | ||
1592 | res = __dlm_lookup_lockres(dlm, name, namelen); | ||
1593 | if (res) { | ||
1594 | spin_lock(&res->spinlock); | ||
1595 | if (res->state & DLM_LOCK_RES_RECOVERING) { | ||
1596 | mlog(ML_ERROR, "%u asserting but %.*s is " | ||
1597 | "RECOVERING!\n", assert->node_idx, namelen, name); | ||
1598 | goto kill; | ||
1599 | } | ||
1600 | if (!mle) { | ||
1601 | if (res->owner != assert->node_idx) { | ||
1602 | mlog(ML_ERROR, "assert_master from " | ||
1603 | "%u, but current owner is " | ||
1604 | "%u! (%.*s)\n", | ||
1605 | assert->node_idx, res->owner, | ||
1606 | namelen, name); | ||
1607 | goto kill; | ||
1608 | } | ||
1609 | } else if (mle->type != DLM_MLE_MIGRATION) { | ||
1610 | if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
1611 | /* owner is just re-asserting */ | ||
1612 | if (res->owner == assert->node_idx) { | ||
1613 | mlog(0, "owner %u re-asserting on " | ||
1614 | "lock %.*s\n", assert->node_idx, | ||
1615 | namelen, name); | ||
1616 | goto ok; | ||
1617 | } | ||
1618 | mlog(ML_ERROR, "got assert_master from " | ||
1619 | "node %u, but %u is the owner! " | ||
1620 | "(%.*s)\n", assert->node_idx, | ||
1621 | res->owner, namelen, name); | ||
1622 | goto kill; | ||
1623 | } | ||
1624 | if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { | ||
1625 | mlog(ML_ERROR, "got assert from %u, but lock " | ||
1626 | "with no owner should be " | ||
1627 | "in-progress! (%.*s)\n", | ||
1628 | assert->node_idx, | ||
1629 | namelen, name); | ||
1630 | goto kill; | ||
1631 | } | ||
1632 | } else /* mle->type == DLM_MLE_MIGRATION */ { | ||
1633 | /* should only be getting an assert from new master */ | ||
1634 | if (assert->node_idx != mle->new_master) { | ||
1635 | mlog(ML_ERROR, "got assert from %u, but " | ||
1636 | "new master is %u, and old master " | ||
1637 | "was %u (%.*s)\n", | ||
1638 | assert->node_idx, mle->new_master, | ||
1639 | mle->master, namelen, name); | ||
1640 | goto kill; | ||
1641 | } | ||
1642 | |||
1643 | } | ||
1644 | ok: | ||
1645 | spin_unlock(&res->spinlock); | ||
1646 | } | ||
1647 | spin_unlock(&dlm->spinlock); | ||
1648 | |||
1649 | // mlog(0, "woo! got an assert_master from node %u!\n", | ||
1650 | // assert->node_idx); | ||
1651 | if (mle) { | ||
1652 | int extra_ref; | ||
1653 | |||
1654 | spin_lock(&mle->spinlock); | ||
1655 | extra_ref = !!(mle->type == DLM_MLE_BLOCK | ||
1656 | || mle->type == DLM_MLE_MIGRATION); | ||
1657 | mle->master = assert->node_idx; | ||
1658 | atomic_set(&mle->woken, 1); | ||
1659 | wake_up(&mle->wq); | ||
1660 | spin_unlock(&mle->spinlock); | ||
1661 | |||
1662 | if (mle->type == DLM_MLE_MIGRATION && res) { | ||
1663 | mlog(0, "finishing off migration of lockres %.*s, " | ||
1664 | "from %u to %u\n", | ||
1665 | res->lockname.len, res->lockname.name, | ||
1666 | dlm->node_num, mle->new_master); | ||
1667 | spin_lock(&res->spinlock); | ||
1668 | res->state &= ~DLM_LOCK_RES_MIGRATING; | ||
1669 | dlm_change_lockres_owner(dlm, res, mle->new_master); | ||
1670 | BUG_ON(res->state & DLM_LOCK_RES_DIRTY); | ||
1671 | spin_unlock(&res->spinlock); | ||
1672 | } | ||
1673 | /* master is known, detach if not already detached */ | ||
1674 | dlm_mle_detach_hb_events(dlm, mle); | ||
1675 | dlm_put_mle(mle); | ||
1676 | |||
1677 | if (extra_ref) { | ||
1678 | /* the assert master message now balances the extra | ||
1679 | * ref given by the master / migration request message. | ||
1680 | * if this is the last put, it will be removed | ||
1681 | * from the list. */ | ||
1682 | dlm_put_mle(mle); | ||
1683 | } | ||
1684 | } | ||
1685 | |||
1686 | done: | ||
1687 | if (res) | ||
1688 | dlm_lockres_put(res); | ||
1689 | dlm_put(dlm); | ||
1690 | return 0; | ||
1691 | |||
1692 | kill: | ||
1693 | /* kill the caller! */ | ||
1694 | spin_unlock(&res->spinlock); | ||
1695 | spin_unlock(&dlm->spinlock); | ||
1696 | dlm_lockres_put(res); | ||
1697 | mlog(ML_ERROR, "Bad message received from another node. Dumping state " | ||
1698 | "and killing the other node now! This node is OK and can continue.\n"); | ||
1699 | dlm_dump_lock_resources(dlm); | ||
1700 | dlm_put(dlm); | ||
1701 | return -EINVAL; | ||
1702 | } | ||
1703 | |||
1704 | int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, | ||
1705 | struct dlm_lock_resource *res, | ||
1706 | int ignore_higher, u8 request_from, u32 flags) | ||
1707 | { | ||
1708 | struct dlm_work_item *item; | ||
1709 | item = kcalloc(1, sizeof(*item), GFP_KERNEL); | ||
1710 | if (!item) | ||
1711 | return -ENOMEM; | ||
1712 | |||
1713 | |||
1714 | /* queue up work for dlm_assert_master_worker */ | ||
1715 | dlm_grab(dlm); /* get an extra ref for the work item */ | ||
1716 | dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); | ||
1717 | item->u.am.lockres = res; /* already have a ref */ | ||
1718 | /* can optionally ignore node numbers higher than this node */ | ||
1719 | item->u.am.ignore_higher = ignore_higher; | ||
1720 | item->u.am.request_from = request_from; | ||
1721 | item->u.am.flags = flags; | ||
1722 | |||
1723 | spin_lock(&dlm->work_lock); | ||
1724 | list_add_tail(&item->list, &dlm->work_list); | ||
1725 | spin_unlock(&dlm->work_lock); | ||
1726 | |||
1727 | schedule_work(&dlm->dispatched_work); | ||
1728 | return 0; | ||
1729 | } | ||
1730 | |||
1731 | static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) | ||
1732 | { | ||
1733 | struct dlm_ctxt *dlm = data; | ||
1734 | int ret = 0; | ||
1735 | struct dlm_lock_resource *res; | ||
1736 | unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
1737 | int ignore_higher; | ||
1738 | int bit; | ||
1739 | u8 request_from; | ||
1740 | u32 flags; | ||
1741 | |||
1742 | dlm = item->dlm; | ||
1743 | res = item->u.am.lockres; | ||
1744 | ignore_higher = item->u.am.ignore_higher; | ||
1745 | request_from = item->u.am.request_from; | ||
1746 | flags = item->u.am.flags; | ||
1747 | |||
1748 | spin_lock(&dlm->spinlock); | ||
1749 | memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); | ||
1750 | spin_unlock(&dlm->spinlock); | ||
1751 | |||
1752 | clear_bit(dlm->node_num, nodemap); | ||
1753 | if (ignore_higher) { | ||
1754 | 		/* if this is just to clear up mles for nodes below | ||
1755 | * this node, do not send the message to the original | ||
1756 | * caller or any node number higher than this */ | ||
1757 | clear_bit(request_from, nodemap); | ||
1758 | bit = dlm->node_num; | ||
1759 | while (1) { | ||
1760 | bit = find_next_bit(nodemap, O2NM_MAX_NODES, | ||
1761 | bit+1); | ||
1762 | if (bit >= O2NM_MAX_NODES) | ||
1763 | break; | ||
1764 | clear_bit(bit, nodemap); | ||
1765 | } | ||
1766 | } | ||
1767 | |||
1768 | /* this call now finishes out the nodemap | ||
1769 | * even if one or more nodes die */ | ||
1770 | mlog(0, "worker about to master %.*s here, this=%u\n", | ||
1771 | res->lockname.len, res->lockname.name, dlm->node_num); | ||
1772 | ret = dlm_do_assert_master(dlm, res->lockname.name, | ||
1773 | res->lockname.len, | ||
1774 | nodemap, flags); | ||
1775 | if (ret < 0) { | ||
1776 | /* no need to restart, we are done */ | ||
1777 | mlog_errno(ret); | ||
1778 | } | ||
1779 | |||
1780 | dlm_lockres_put(res); | ||
1781 | |||
1782 | mlog(0, "finished with dlm_assert_master_worker\n"); | ||
1783 | } | ||
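
When ignore_higher is set, the loop above trims the broadcast map down to live nodes that are strictly lower-numbered than this one, minus the original requester. The same trimming over a toy array map (names invented):

	#include <stdio.h>

	#define MAX_NODES 8

	static void trim_map(int map[], int self, int requester)
	{
		map[self] = 0;
		map[requester] = 0;
		for (int n = self + 1; n < MAX_NODES; n++)
			map[n] = 0;             /* drop everything above us */
	}

	int main(void)
	{
		int map[MAX_NODES] = { 1, 1, 1, 1, 1, 1, 1, 1 };

		trim_map(map, 3, 0);
		for (int n = 0; n < MAX_NODES; n++)
			printf("%d", map[n]);   /* 01100000: only nodes 1 and 2 */
		printf("\n");
		return 0;
	}
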
1784 | |||
1785 | |||
1786 | /* | ||
1787 | * DLM_MIGRATE_LOCKRES | ||
1788 | */ | ||
1789 | |||
1790 | |||
1791 | int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | ||
1792 | u8 target) | ||
1793 | { | ||
1794 | struct dlm_master_list_entry *mle = NULL; | ||
1795 | struct dlm_master_list_entry *oldmle = NULL; | ||
1796 | struct dlm_migratable_lockres *mres = NULL; | ||
1797 | int ret = -EINVAL; | ||
1798 | const char *name; | ||
1799 | unsigned int namelen; | ||
1800 | int mle_added = 0; | ||
1801 | struct list_head *queue, *iter; | ||
1802 | int i; | ||
1803 | struct dlm_lock *lock; | ||
1804 | int empty = 1; | ||
1805 | |||
1806 | if (!dlm_grab(dlm)) | ||
1807 | return -EINVAL; | ||
1808 | |||
1809 | name = res->lockname.name; | ||
1810 | namelen = res->lockname.len; | ||
1811 | |||
1812 | mlog(0, "migrating %.*s to %u\n", namelen, name, target); | ||
1813 | |||
1814 | /* | ||
1815 | * ensure this lockres is a proper candidate for migration | ||
1816 | */ | ||
1817 | spin_lock(&res->spinlock); | ||
1818 | if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
1819 | mlog(0, "cannot migrate lockres with unknown owner!\n"); | ||
1820 | spin_unlock(&res->spinlock); | ||
1821 | goto leave; | ||
1822 | } | ||
1823 | if (res->owner != dlm->node_num) { | ||
1824 | mlog(0, "cannot migrate lockres this node doesn't own!\n"); | ||
1825 | spin_unlock(&res->spinlock); | ||
1826 | goto leave; | ||
1827 | } | ||
1828 | mlog(0, "checking queues...\n"); | ||
1829 | queue = &res->granted; | ||
1830 | for (i=0; i<3; i++) { | ||
1831 | list_for_each(iter, queue) { | ||
1832 | lock = list_entry (iter, struct dlm_lock, list); | ||
1833 | empty = 0; | ||
1834 | if (lock->ml.node == dlm->node_num) { | ||
1835 | mlog(0, "found a lock owned by this node " | ||
1836 | "still on the %s queue! will not " | ||
1837 | "migrate this lockres\n", | ||
1838 | i==0 ? "granted" : | ||
1839 | (i==1 ? "converting" : "blocked")); | ||
1840 | spin_unlock(&res->spinlock); | ||
1841 | ret = -ENOTEMPTY; | ||
1842 | goto leave; | ||
1843 | } | ||
1844 | } | ||
1845 | queue++; | ||
1846 | } | ||
1847 | mlog(0, "all locks on this lockres are nonlocal. continuing\n"); | ||
1848 | spin_unlock(&res->spinlock); | ||
1849 | |||
1850 | /* no work to do */ | ||
1851 | if (empty) { | ||
1852 | mlog(0, "no locks were found on this lockres! done!\n"); | ||
1853 | ret = 0; | ||
1854 | goto leave; | ||
1855 | } | ||
1856 | |||
1857 | /* | ||
1858 | * preallocate up front | ||
1859 | * if this fails, abort | ||
1860 | */ | ||
1861 | |||
1862 | ret = -ENOMEM; | ||
1863 | mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL); | ||
1864 | if (!mres) { | ||
1865 | mlog_errno(ret); | ||
1866 | goto leave; | ||
1867 | } | ||
1868 | |||
1869 | mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, | ||
1870 | GFP_KERNEL); | ||
1871 | if (!mle) { | ||
1872 | mlog_errno(ret); | ||
1873 | goto leave; | ||
1874 | } | ||
1875 | ret = 0; | ||
1876 | |||
1877 | /* | ||
1878 | * find a node to migrate the lockres to | ||
1879 | */ | ||
1880 | |||
1881 | mlog(0, "picking a migration node\n"); | ||
1882 | spin_lock(&dlm->spinlock); | ||
1883 | /* pick a new node */ | ||
1884 | 	if (target >= O2NM_MAX_NODES || | ||
1885 | 	    !test_bit(target, dlm->domain_map)) { | ||
1886 | target = dlm_pick_migration_target(dlm, res); | ||
1887 | } | ||
1888 | mlog(0, "node %u chosen for migration\n", target); | ||
1889 | |||
1890 | if (target >= O2NM_MAX_NODES || | ||
1891 | !test_bit(target, dlm->domain_map)) { | ||
1892 | /* target chosen is not alive */ | ||
1893 | ret = -EINVAL; | ||
1894 | } | ||
1895 | |||
1896 | if (ret) { | ||
1897 | spin_unlock(&dlm->spinlock); | ||
1898 | goto fail; | ||
1899 | } | ||
1900 | |||
1901 | mlog(0, "continuing with target = %u\n", target); | ||
1902 | |||
1903 | /* | ||
1904 | * clear any existing master requests and | ||
1905 | * add the migration mle to the list | ||
1906 | */ | ||
1907 | spin_lock(&dlm->master_lock); | ||
1908 | ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, | ||
1909 | namelen, target, dlm->node_num); | ||
1910 | spin_unlock(&dlm->master_lock); | ||
1911 | spin_unlock(&dlm->spinlock); | ||
1912 | |||
1913 | if (ret == -EEXIST) { | ||
1914 | mlog(0, "another process is already migrating it\n"); | ||
1915 | goto fail; | ||
1916 | } | ||
1917 | mle_added = 1; | ||
1918 | |||
1919 | /* | ||
1920 | * set the MIGRATING flag and flush asts | ||
1921 | * if we fail after this we need to re-dirty the lockres | ||
1922 | */ | ||
1923 | if (dlm_mark_lockres_migrating(dlm, res, target) < 0) { | ||
1924 | mlog(ML_ERROR, "tried to migrate %.*s to %u, but " | ||
1925 | "the target went down.\n", res->lockname.len, | ||
1926 | res->lockname.name, target); | ||
1927 | spin_lock(&res->spinlock); | ||
1928 | res->state &= ~DLM_LOCK_RES_MIGRATING; | ||
1929 | spin_unlock(&res->spinlock); | ||
1930 | ret = -EINVAL; | ||
1931 | } | ||
1932 | |||
1933 | fail: | ||
1934 | if (oldmle) { | ||
1935 | /* master is known, detach if not already detached */ | ||
1936 | dlm_mle_detach_hb_events(dlm, oldmle); | ||
1937 | dlm_put_mle(oldmle); | ||
1938 | } | ||
1939 | |||
1940 | if (ret < 0) { | ||
1941 | if (mle_added) { | ||
1942 | dlm_mle_detach_hb_events(dlm, mle); | ||
1943 | dlm_put_mle(mle); | ||
1944 | } else if (mle) { | ||
1945 | kmem_cache_free(dlm_mle_cache, mle); | ||
1946 | } | ||
1947 | goto leave; | ||
1948 | } | ||
1949 | |||
1950 | /* | ||
1951 | * at this point, we have a migration target, an mle | ||
1952 | * in the master list, and the MIGRATING flag set on | ||
1953 | * the lockres | ||
1954 | */ | ||
1955 | |||
1956 | |||
1957 | /* get an extra reference on the mle. | ||
1958 | * otherwise the assert_master from the new | ||
1959 | * master will destroy this. | ||
1960 | * also, make sure that all callers of dlm_get_mle | ||
1961 | * take both dlm->spinlock and dlm->master_lock */ | ||
1962 | spin_lock(&dlm->spinlock); | ||
1963 | spin_lock(&dlm->master_lock); | ||
1964 | dlm_get_mle(mle); | ||
1965 | spin_unlock(&dlm->master_lock); | ||
1966 | spin_unlock(&dlm->spinlock); | ||
1967 | |||
1968 | /* notify new node and send all lock state */ | ||
1969 | /* call send_one_lockres with migration flag. | ||
1970 | * this serves as notice to the target node that a | ||
1971 | * migration is starting. */ | ||
1972 | ret = dlm_send_one_lockres(dlm, res, mres, target, | ||
1973 | DLM_MRES_MIGRATION); | ||
1974 | |||
1975 | if (ret < 0) { | ||
1976 | mlog(0, "migration to node %u failed with %d\n", | ||
1977 | target, ret); | ||
1978 | /* migration failed, detach and clean up mle */ | ||
1979 | dlm_mle_detach_hb_events(dlm, mle); | ||
1980 | 		dlm_put_mle(mle); | ||
1981 | 		dlm_put_mle(mle);	/* and drop the extra ref taken above */ | ||
1982 | goto leave; | ||
1983 | } | ||
1984 | |||
1985 | /* at this point, the target sends a message to all nodes, | ||
1986 | * (using dlm_do_migrate_request). this node is skipped since | ||
1987 | * we had to put an mle in the list to begin the process. this | ||
1988 | * node now waits for target to do an assert master. this node | ||
1989 | * will be the last one notified, ensuring that the migration | ||
1990 | * is complete everywhere. if the target dies while this is | ||
1991 | * going on, some nodes could potentially see the target as the | ||
1992 | * master, so it is important that my recovery finds the migration | ||
1993 | 	 * mle and sets the master to UNKNOWN. */ | ||
1994 | |||
1995 | |||
1996 | /* wait for new node to assert master */ | ||
1997 | while (1) { | ||
1998 | ret = wait_event_interruptible_timeout(mle->wq, | ||
1999 | (atomic_read(&mle->woken) == 1), | ||
2000 | msecs_to_jiffies(5000)); | ||
2001 | |||
2002 | if (ret >= 0) { | ||
2003 | if (atomic_read(&mle->woken) == 1 || | ||
2004 | res->owner == target) | ||
2005 | break; | ||
2006 | |||
2007 | mlog(0, "timed out during migration\n"); | ||
2008 | } | ||
2009 | if (ret == -ERESTARTSYS) { | ||
2010 | /* migration failed, detach and clean up mle */ | ||
2011 | dlm_mle_detach_hb_events(dlm, mle); | ||
2012 | dlm_put_mle(mle); | ||
2013 | dlm_put_mle(mle); | ||
2014 | goto leave; | ||
2015 | } | ||
2016 | /* TODO: if node died: stop, clean up, return error */ | ||
2017 | } | ||
2018 | |||
2019 | /* all done, set the owner, clear the flag */ | ||
2020 | spin_lock(&res->spinlock); | ||
2021 | dlm_set_lockres_owner(dlm, res, target); | ||
2022 | res->state &= ~DLM_LOCK_RES_MIGRATING; | ||
2023 | dlm_remove_nonlocal_locks(dlm, res); | ||
2024 | spin_unlock(&res->spinlock); | ||
2025 | wake_up(&res->wq); | ||
2026 | |||
2027 | /* master is known, detach if not already detached */ | ||
2028 | dlm_mle_detach_hb_events(dlm, mle); | ||
2029 | dlm_put_mle(mle); | ||
2030 | ret = 0; | ||
2031 | |||
2032 | dlm_lockres_calc_usage(dlm, res); | ||
2033 | |||
2034 | leave: | ||
2035 | /* re-dirty the lockres if we failed */ | ||
2036 | if (ret < 0) | ||
2037 | dlm_kick_thread(dlm, res); | ||
2038 | |||
2039 | /* TODO: cleanup */ | ||
2040 | if (mres) | ||
2041 | free_page((unsigned long)mres); | ||
2042 | |||
2043 | dlm_put(dlm); | ||
2044 | |||
2045 | mlog(0, "returning %d\n", ret); | ||
2046 | return ret; | ||
2047 | } | ||
2048 | EXPORT_SYMBOL_GPL(dlm_migrate_lockres); | ||
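
dlm_migrate_lockres() is long but straight-line; each enumerator below names one of its blocks, in order. This is a reading aid only, with invented names; note that any failure before the lock state is sent can still back out by clearing the MIGRATING flag and re-dirtying the lockres:

	enum migrate_step {
		MIG_CHECK_LOCKS,        /* refuse if this node still holds locks */
		MIG_PREALLOCATE,        /* mres page and migration mle, up front */
		MIG_PICK_TARGET,        /* caller's hint, else a current lock holder */
		MIG_ADD_MLE,            /* -EEXIST: someone else is already migrating */
		MIG_MARK_MIGRATING,     /* flush asts, set DLM_LOCK_RES_MIGRATING */
		MIG_SEND_STATE,         /* dlm_send_one_lockres(.., DLM_MRES_MIGRATION) */
		MIG_WAIT_ASSERT,        /* 5s recheck loop until the target asserts */
		MIG_FINALIZE            /* set owner, clear flag, drop nonlocal locks */
	};
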
2049 | |||
2050 | int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) | ||
2051 | { | ||
2052 | int ret; | ||
2053 | spin_lock(&dlm->ast_lock); | ||
2054 | spin_lock(&lock->spinlock); | ||
2055 | ret = (list_empty(&lock->bast_list) && !lock->bast_pending); | ||
2056 | spin_unlock(&lock->spinlock); | ||
2057 | spin_unlock(&dlm->ast_lock); | ||
2058 | return ret; | ||
2059 | } | ||
2060 | |||
2061 | static int dlm_migration_can_proceed(struct dlm_ctxt *dlm, | ||
2062 | struct dlm_lock_resource *res, | ||
2063 | u8 mig_target) | ||
2064 | { | ||
2065 | int can_proceed; | ||
2066 | spin_lock(&res->spinlock); | ||
2067 | can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); | ||
2068 | spin_unlock(&res->spinlock); | ||
2069 | |||
2070 | /* target has died, so make the caller break out of the | ||
2071 | * wait_event, but caller must recheck the domain_map */ | ||
2072 | spin_lock(&dlm->spinlock); | ||
2073 | if (!test_bit(mig_target, dlm->domain_map)) | ||
2074 | can_proceed = 1; | ||
2075 | spin_unlock(&dlm->spinlock); | ||
2076 | return can_proceed; | ||
2077 | } | ||
2078 | |||
2079 | int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) | ||
2080 | { | ||
2081 | int ret; | ||
2082 | spin_lock(&res->spinlock); | ||
2083 | ret = !!(res->state & DLM_LOCK_RES_DIRTY); | ||
2084 | spin_unlock(&res->spinlock); | ||
2085 | return ret; | ||
2086 | } | ||
2087 | |||
2088 | |||
2089 | static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, | ||
2090 | struct dlm_lock_resource *res, | ||
2091 | u8 target) | ||
2092 | { | ||
2093 | int ret = 0; | ||
2094 | |||
2095 | mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n", | ||
2096 | res->lockname.len, res->lockname.name, dlm->node_num, | ||
2097 | target); | ||
2098 | /* need to set MIGRATING flag on lockres. this is done by | ||
2099 | * ensuring that all asts have been flushed for this lockres. */ | ||
2100 | spin_lock(&res->spinlock); | ||
2101 | BUG_ON(res->migration_pending); | ||
2102 | res->migration_pending = 1; | ||
2103 | /* strategy is to reserve an extra ast then release | ||
2104 | * it below, letting the release do all of the work */ | ||
2105 | __dlm_lockres_reserve_ast(res); | ||
2106 | spin_unlock(&res->spinlock); | ||
2107 | |||
2108 | /* now flush all the pending asts.. hang out for a bit */ | ||
2109 | dlm_kick_thread(dlm, res); | ||
2110 | wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); | ||
2111 | dlm_lockres_release_ast(dlm, res); | ||
2112 | |||
2113 | mlog(0, "about to wait on migration_wq, dirty=%s\n", | ||
2114 | res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); | ||
2115 | /* if the extra ref we just put was the final one, this | ||
2116 | * will pass thru immediately. otherwise, we need to wait | ||
2117 | * for the last ast to finish. */ | ||
2118 | again: | ||
2119 | ret = wait_event_interruptible_timeout(dlm->migration_wq, | ||
2120 | dlm_migration_can_proceed(dlm, res, target), | ||
2121 | msecs_to_jiffies(1000)); | ||
2122 | if (ret < 0) { | ||
2123 | mlog(0, "woken again: migrating? %s, dead? %s\n", | ||
2124 | res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", | ||
2125 | test_bit(target, dlm->domain_map) ? "no":"yes"); | ||
2126 | } else { | ||
2127 | mlog(0, "all is well: migrating? %s, dead? %s\n", | ||
2128 | res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", | ||
2129 | test_bit(target, dlm->domain_map) ? "no":"yes"); | ||
2130 | } | ||
2131 | if (!dlm_migration_can_proceed(dlm, res, target)) { | ||
2132 | mlog(0, "trying again...\n"); | ||
2133 | goto again; | ||
2134 | } | ||
2135 | |||
2136 | /* did the target go down or die? */ | ||
2137 | spin_lock(&dlm->spinlock); | ||
2138 | if (!test_bit(target, dlm->domain_map)) { | ||
2139 | mlog(ML_ERROR, "aha. migration target %u just went down\n", | ||
2140 | target); | ||
2141 | ret = -EHOSTDOWN; | ||
2142 | } | ||
2143 | spin_unlock(&dlm->spinlock); | ||
2144 | |||
2145 | /* | ||
2146 | * at this point: | ||
2147 | * | ||
2148 | * o the DLM_LOCK_RES_MIGRATING flag is set | ||
2149 | * o there are no pending asts on this lockres | ||
2150 | * o all processes trying to reserve an ast on this | ||
2151 | * lockres must wait for the MIGRATING flag to clear | ||
2152 | */ | ||
2153 | return ret; | ||
2154 | } | ||
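
The reserve-then-release strategy above is a reference-counting trick: take an extra ast reservation so the count cannot reach zero behind this thread's back while the dirty lockres is flushed, then drop that reservation so the final-release path runs exactly when the flush completes. In miniature, using C11 atomics and invented names:

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int reserved_asts;

	static void on_last_release(void) { puts("no asts left: safe to migrate"); }

	static void release_ast(void)
	{
		if (atomic_fetch_sub(&reserved_asts, 1) == 1)
			on_last_release();  /* we dropped the final reservation */
	}

	int main(void)
	{
		atomic_fetch_add(&reserved_asts, 1);  /* "__dlm_lockres_reserve_ast()" */
		/* ... pending asts are queued, flushed, and released here ... */
		release_ast();                        /* "dlm_lockres_release_ast()" */
		return 0;
	}
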
2155 | |||
2156 | /* last step in the migration process. | ||
2157 | * original master calls this to free all of the dlm_lock | ||
2158 | * structures that used to be for other nodes. */ | ||
2159 | static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, | ||
2160 | struct dlm_lock_resource *res) | ||
2161 | { | ||
2162 | struct list_head *iter, *iter2; | ||
2163 | struct list_head *queue = &res->granted; | ||
2164 | int i; | ||
2165 | struct dlm_lock *lock; | ||
2166 | |||
2167 | assert_spin_locked(&res->spinlock); | ||
2168 | |||
2169 | BUG_ON(res->owner == dlm->node_num); | ||
2170 | |||
2171 | for (i=0; i<3; i++) { | ||
2172 | list_for_each_safe(iter, iter2, queue) { | ||
2173 | lock = list_entry (iter, struct dlm_lock, list); | ||
2174 | if (lock->ml.node != dlm->node_num) { | ||
2175 | mlog(0, "putting lock for node %u\n", | ||
2176 | lock->ml.node); | ||
2177 | /* be extra careful */ | ||
2178 | BUG_ON(!list_empty(&lock->ast_list)); | ||
2179 | BUG_ON(!list_empty(&lock->bast_list)); | ||
2180 | BUG_ON(lock->ast_pending); | ||
2181 | BUG_ON(lock->bast_pending); | ||
2182 | list_del_init(&lock->list); | ||
2183 | dlm_lock_put(lock); | ||
2184 | } | ||
2185 | } | ||
2186 | queue++; | ||
2187 | } | ||
2188 | } | ||
2189 | |||
2190 | /* for now this is not too intelligent. we will | ||
2191 | * need stats to make this do the right thing. | ||
2192 | * this just finds the first lock on one of the | ||
2193 | * queues and uses that node as the target. */ | ||
2194 | static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, | ||
2195 | struct dlm_lock_resource *res) | ||
2196 | { | ||
2197 | int i; | ||
2198 | struct list_head *queue = &res->granted; | ||
2199 | struct list_head *iter; | ||
2200 | struct dlm_lock *lock; | ||
2201 | int nodenum; | ||
2202 | |||
2203 | assert_spin_locked(&dlm->spinlock); | ||
2204 | |||
2205 | spin_lock(&res->spinlock); | ||
2206 | for (i=0; i<3; i++) { | ||
2207 | list_for_each(iter, queue) { | ||
2208 | /* up to the caller to make sure this node | ||
2209 | * is alive */ | ||
2210 | lock = list_entry (iter, struct dlm_lock, list); | ||
2211 | if (lock->ml.node != dlm->node_num) { | ||
2212 | spin_unlock(&res->spinlock); | ||
2213 | return lock->ml.node; | ||
2214 | } | ||
2215 | } | ||
2216 | queue++; | ||
2217 | } | ||
2218 | spin_unlock(&res->spinlock); | ||
2219 | mlog(0, "have not found a suitable target yet! checking domain map\n"); | ||
2220 | |||
2221 | /* ok now we're getting desperate. pick anyone alive. */ | ||
2222 | nodenum = -1; | ||
2223 | while (1) { | ||
2224 | nodenum = find_next_bit(dlm->domain_map, | ||
2225 | O2NM_MAX_NODES, nodenum+1); | ||
2226 | mlog(0, "found %d in domain map\n", nodenum); | ||
2227 | if (nodenum >= O2NM_MAX_NODES) | ||
2228 | break; | ||
2229 | if (nodenum != dlm->node_num) { | ||
2230 | mlog(0, "picking %d\n", nodenum); | ||
2231 | return nodenum; | ||
2232 | } | ||
2233 | } | ||
2234 | |||
2235 | mlog(0, "giving up. no master to migrate to\n"); | ||
2236 | return DLM_LOCK_RES_OWNER_UNKNOWN; | ||
2237 | } | ||
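
The two loops above amount to a two-pass choice: prefer any remote node that already holds a lock on the resource, and only then fall back to any live node that is not us. The same choice as a stand-alone function over toy maps (names invented):

	#include <stdio.h>

	#define MAX_NODES 8

	static int pick_target(const int holds_lock[], const int alive[], int self)
	{
		for (int n = 0; n < MAX_NODES; n++)
			if (holds_lock[n] && n != self)
				return n;       /* first remote lock holder wins */
		for (int n = 0; n < MAX_NODES; n++)
			if (alive[n] && n != self)
				return n;       /* desperate: anyone alive */
		return -1;                      /* i.e. DLM_LOCK_RES_OWNER_UNKNOWN */
	}

	int main(void)
	{
		int holds[MAX_NODES] = { 0 };
		int alive[MAX_NODES] = { 1, 0, 1, 0, 0, 1, 0, 0 };

		printf("target=%d\n", pick_target(holds, alive, 0));   /* -> 2 */
		return 0;
	}
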
2238 | |||
2239 | |||
2240 | |||
2241 | /* this is called by the new master once all lockres | ||
2242 | * data has been received */ | ||
2243 | static int dlm_do_migrate_request(struct dlm_ctxt *dlm, | ||
2244 | struct dlm_lock_resource *res, | ||
2245 | u8 master, u8 new_master, | ||
2246 | struct dlm_node_iter *iter) | ||
2247 | { | ||
2248 | struct dlm_migrate_request migrate; | ||
2249 | int ret, status = 0; | ||
2250 | int nodenum; | ||
2251 | |||
2252 | memset(&migrate, 0, sizeof(migrate)); | ||
2253 | migrate.namelen = res->lockname.len; | ||
2254 | memcpy(migrate.name, res->lockname.name, migrate.namelen); | ||
2255 | migrate.new_master = new_master; | ||
2256 | migrate.master = master; | ||
2257 | |||
2258 | ret = 0; | ||
2259 | |||
2260 | /* send message to all nodes, except the master and myself */ | ||
2261 | while ((nodenum = dlm_node_iter_next(iter)) >= 0) { | ||
2262 | if (nodenum == master || | ||
2263 | nodenum == new_master) | ||
2264 | continue; | ||
2265 | |||
2266 | ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, | ||
2267 | &migrate, sizeof(migrate), nodenum, | ||
2268 | &status); | ||
2269 | if (ret < 0) | ||
2270 | mlog_errno(ret); | ||
2271 | else if (status < 0) { | ||
2272 | mlog(0, "migrate request (node %u) returned %d!\n", | ||
2273 | nodenum, status); | ||
2274 | ret = status; | ||
2275 | } | ||
2276 | } | ||
2277 | |||
2278 | if (ret < 0) | ||
2279 | mlog_errno(ret); | ||
2280 | |||
2281 | mlog(0, "returning ret=%d\n", ret); | ||
2282 | return ret; | ||
2283 | } | ||
2284 | |||
2285 | |||
2286 | /* if there is an existing mle for this lockres, we now know who the master is. | ||
2287 | * (the one who sent us *this* message) we can clear it up right away. | ||
2288 | * since the process that put the mle on the list still has a reference to it, | ||
2289 | * we can unhash it now, set the master and wake the process. as a result, | ||
2290 | * we will have no mle in the list to start with. now we can add an mle for | ||
2291 | * the migration and this should be the only one found for those scanning the | ||
2292 | * list. */ | ||
2293 | int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) | ||
2294 | { | ||
2295 | struct dlm_ctxt *dlm = data; | ||
2296 | struct dlm_lock_resource *res = NULL; | ||
2297 | struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; | ||
2298 | struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; | ||
2299 | const char *name; | ||
2300 | unsigned int namelen; | ||
2301 | int ret = 0; | ||
2302 | |||
2303 | if (!dlm_grab(dlm)) | ||
2304 | return -EINVAL; | ||
2305 | |||
2306 | name = migrate->name; | ||
2307 | namelen = migrate->namelen; | ||
2308 | |||
2309 | /* preallocate.. if this fails, abort */ | ||
2310 | mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, | ||
2311 | GFP_KERNEL); | ||
2312 | |||
2313 | if (!mle) { | ||
2314 | ret = -ENOMEM; | ||
2315 | goto leave; | ||
2316 | } | ||
2317 | |||
2318 | /* check for pre-existing lock */ | ||
2319 | spin_lock(&dlm->spinlock); | ||
2320 | res = __dlm_lookup_lockres(dlm, name, namelen); | ||
2321 | spin_lock(&dlm->master_lock); | ||
2322 | |||
2323 | if (res) { | ||
2324 | spin_lock(&res->spinlock); | ||
2325 | if (res->state & DLM_LOCK_RES_RECOVERING) { | ||
2326 | /* if all is working ok, this can only mean that we got | ||
2327 | * a migrate request from a node that we now see as | ||
2328 | * dead. what can we do here? drop it to the floor? */ | ||
2329 | spin_unlock(&res->spinlock); | ||
2330 | mlog(ML_ERROR, "Got a migrate request, but the " | ||
2331 | 			     "lockres is marked as recovering!\n"); | ||
2332 | kmem_cache_free(dlm_mle_cache, mle); | ||
2333 | ret = -EINVAL; /* need a better solution */ | ||
2334 | goto unlock; | ||
2335 | } | ||
2336 | res->state |= DLM_LOCK_RES_MIGRATING; | ||
2337 | spin_unlock(&res->spinlock); | ||
2338 | } | ||
2339 | |||
2340 | /* ignore status. only nonzero status would BUG. */ | ||
2341 | ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, | ||
2342 | name, namelen, | ||
2343 | migrate->new_master, | ||
2344 | migrate->master); | ||
2345 | |||
2346 | unlock: | ||
2347 | spin_unlock(&dlm->master_lock); | ||
2348 | spin_unlock(&dlm->spinlock); | ||
2349 | |||
2350 | if (oldmle) { | ||
2351 | /* master is known, detach if not already detached */ | ||
2352 | dlm_mle_detach_hb_events(dlm, oldmle); | ||
2353 | dlm_put_mle(oldmle); | ||
2354 | } | ||
2355 | |||
2356 | if (res) | ||
2357 | dlm_lockres_put(res); | ||
2358 | leave: | ||
2359 | dlm_put(dlm); | ||
2360 | return ret; | ||
2361 | } | ||
2362 | |||
2363 | /* must be holding dlm->spinlock and dlm->master_lock | ||
2364 | * when adding a migration mle, we can clear any other mles | ||
2365 | * in the master list because we know with certainty that | ||
2366 | * the master is "master". so we remove any old mle from | ||
2367 | * the list after setting its master field, and then add | ||
2368 | * the new migration mle. this way we can hold with the rule | ||
2369 | * of having only one mle for a given lock name at all times. */ | ||
2370 | static int dlm_add_migration_mle(struct dlm_ctxt *dlm, | ||
2371 | struct dlm_lock_resource *res, | ||
2372 | struct dlm_master_list_entry *mle, | ||
2373 | struct dlm_master_list_entry **oldmle, | ||
2374 | const char *name, unsigned int namelen, | ||
2375 | u8 new_master, u8 master) | ||
2376 | { | ||
2377 | int found; | ||
2378 | int ret = 0; | ||
2379 | |||
2380 | *oldmle = NULL; | ||
2381 | |||
2382 | mlog_entry_void(); | ||
2383 | |||
2384 | assert_spin_locked(&dlm->spinlock); | ||
2385 | assert_spin_locked(&dlm->master_lock); | ||
2386 | |||
2387 | /* caller is responsible for any ref taken here on oldmle */ | ||
2388 | found = dlm_find_mle(dlm, oldmle, (char *)name, namelen); | ||
2389 | if (found) { | ||
2390 | struct dlm_master_list_entry *tmp = *oldmle; | ||
2391 | spin_lock(&tmp->spinlock); | ||
2392 | if (tmp->type == DLM_MLE_MIGRATION) { | ||
2393 | if (master == dlm->node_num) { | ||
2394 | /* ah another process raced me to it */ | ||
2395 | mlog(0, "tried to migrate %.*s, but some " | ||
2396 | "process beat me to it\n", | ||
2397 | namelen, name); | ||
2398 | ret = -EEXIST; | ||
2399 | } else { | ||
2400 | /* bad. 2 NODES are trying to migrate! */ | ||
2401 | mlog(ML_ERROR, "migration error mle: " | ||
2402 | "master=%u new_master=%u // request: " | ||
2403 | "master=%u new_master=%u // " | ||
2404 | "lockres=%.*s\n", | ||
2405 | tmp->master, tmp->new_master, | ||
2406 | master, new_master, | ||
2407 | namelen, name); | ||
2408 | BUG(); | ||
2409 | } | ||
2410 | } else { | ||
2411 | /* this is essentially what assert_master does */ | ||
2412 | tmp->master = master; | ||
2413 | atomic_set(&tmp->woken, 1); | ||
2414 | wake_up(&tmp->wq); | ||
2415 | /* remove it from the list so that only one | ||
2416 | * mle will be found */ | ||
2417 | list_del_init(&tmp->list); | ||
2418 | } | ||
2419 | spin_unlock(&tmp->spinlock); | ||
2420 | } | ||
2421 | |||
2422 | /* now add a migration mle to the tail of the list */ | ||
2423 | dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); | ||
2424 | mle->new_master = new_master; | ||
2425 | mle->master = master; | ||
2426 | /* do this for consistency with other mle types */ | ||
2427 | set_bit(new_master, mle->maybe_map); | ||
2428 | list_add(&mle->list, &dlm->master_list); | ||
2429 | |||
2430 | return ret; | ||
2431 | } | ||
2432 | |||
2433 | |||
2434 | void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) | ||
2435 | { | ||
2436 | struct list_head *iter, *iter2; | ||
2437 | struct dlm_master_list_entry *mle; | ||
2438 | struct dlm_lock_resource *res; | ||
2439 | |||
2440 | mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); | ||
2441 | top: | ||
2442 | assert_spin_locked(&dlm->spinlock); | ||
2443 | |||
2444 | /* clean the master list */ | ||
2445 | spin_lock(&dlm->master_lock); | ||
2446 | list_for_each_safe(iter, iter2, &dlm->master_list) { | ||
2447 | mle = list_entry(iter, struct dlm_master_list_entry, list); | ||
2448 | |||
2449 | BUG_ON(mle->type != DLM_MLE_BLOCK && | ||
2450 | mle->type != DLM_MLE_MASTER && | ||
2451 | mle->type != DLM_MLE_MIGRATION); | ||
2452 | |||
2453 | /* MASTER mles are initiated locally. the waiting | ||
2454 | * process will notice the node map change | ||
2455 | * shortly. let that happen as normal. */ | ||
2456 | if (mle->type == DLM_MLE_MASTER) | ||
2457 | continue; | ||
2458 | |||
2459 | |||
2460 | /* BLOCK mles are initiated by other nodes. | ||
2461 | * need to clean up if the dead node would have | ||
2462 | * been the master. */ | ||
2463 | if (mle->type == DLM_MLE_BLOCK) { | ||
2464 | int bit; | ||
2465 | |||
2466 | spin_lock(&mle->spinlock); | ||
2467 | bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); | ||
2468 | if (bit != dead_node) { | ||
2469 | mlog(0, "mle found, but dead node %u would " | ||
2470 | "not have been master\n", dead_node); | ||
2471 | spin_unlock(&mle->spinlock); | ||
2472 | } else { | ||
2473 | /* must drop the refcount by one since the | ||
2474 | * assert_master will never arrive. this | ||
2475 | * may result in the mle being unlinked and | ||
2476 | * freed, but there may still be a process | ||
2477 | * waiting in the dlmlock path which is fine. */ | ||
2478 | mlog(ML_ERROR, "node %u was expected master\n", | ||
2479 | dead_node); | ||
2480 | atomic_set(&mle->woken, 1); | ||
2481 | spin_unlock(&mle->spinlock); | ||
2482 | wake_up(&mle->wq); | ||
2483 | /* final put will take care of list removal */ | ||
2484 | __dlm_put_mle(mle); | ||
2485 | } | ||
2486 | continue; | ||
2487 | } | ||
2488 | |||
2489 | /* everything else is a MIGRATION mle */ | ||
2490 | |||
2491 | /* the rule for MIGRATION mles is that the master | ||
2492 | * becomes UNKNOWN if *either* the original or | ||
2493 | * the new master dies. all UNKNOWN lockreses | ||
2494 | * are sent to whichever node becomes the recovery | ||
2495 | * master. the new master is responsible for | ||
2496 | * determining if there is still a master for | ||
2497 | 	 * this lockres, or if it needs to take over | ||
2498 | * mastery. either way, this node should expect | ||
2499 | * another message to resolve this. */ | ||
2500 | if (mle->master != dead_node && | ||
2501 | mle->new_master != dead_node) | ||
2502 | continue; | ||
2503 | |||
2504 | /* if we have reached this point, this mle needs to | ||
2505 | * be removed from the list and freed. */ | ||
2506 | |||
2507 | /* remove from the list early. NOTE: unlinking | ||
2508 | * list_head while in list_for_each_safe */ | ||
2509 | spin_lock(&mle->spinlock); | ||
2510 | list_del_init(&mle->list); | ||
2511 | atomic_set(&mle->woken, 1); | ||
2512 | spin_unlock(&mle->spinlock); | ||
2513 | wake_up(&mle->wq); | ||
2514 | |||
2515 | mlog(0, "node %u died during migration from " | ||
2516 | "%u to %u!\n", dead_node, | ||
2517 | mle->master, mle->new_master); | ||
2518 | /* if there is a lockres associated with this | ||
2519 | * mle, find it and set its owner to UNKNOWN */ | ||
2520 | res = __dlm_lookup_lockres(dlm, mle->u.name.name, | ||
2521 | mle->u.name.len); | ||
2522 | if (res) { | ||
2523 | /* unfortunately if we hit this rare case, our | ||
2524 | * lock ordering is messed up. we need to drop | ||
2525 | * the master lock so that we can take the | ||
2526 | * lockres lock, meaning that we will have to | ||
2527 | * restart from the head of list. */ | ||
2528 | spin_unlock(&dlm->master_lock); | ||
2529 | |||
2530 | /* move lockres onto recovery list */ | ||
2531 | spin_lock(&res->spinlock); | ||
2532 | dlm_set_lockres_owner(dlm, res, | ||
2533 | DLM_LOCK_RES_OWNER_UNKNOWN); | ||
2534 | dlm_move_lockres_to_recovery_list(dlm, res); | ||
2535 | spin_unlock(&res->spinlock); | ||
2536 | dlm_lockres_put(res); | ||
2537 | |||
2538 | /* dump the mle */ | ||
2539 | spin_lock(&dlm->master_lock); | ||
2540 | __dlm_put_mle(mle); | ||
2541 | spin_unlock(&dlm->master_lock); | ||
2542 | |||
2543 | /* restart */ | ||
2544 | goto top; | ||
2545 | } | ||
2546 | |||
2547 | /* this may be the last reference */ | ||
2548 | __dlm_put_mle(mle); | ||
2549 | } | ||
2550 | spin_unlock(&dlm->master_lock); | ||
2551 | } | ||
2552 | |||
2553 | |||
2554 | int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | ||
2555 | u8 old_master) | ||
2556 | { | ||
2557 | struct dlm_node_iter iter; | ||
2558 | int ret = 0; | ||
2559 | |||
2560 | spin_lock(&dlm->spinlock); | ||
2561 | dlm_node_iter_init(dlm->domain_map, &iter); | ||
2562 | clear_bit(old_master, iter.node_map); | ||
2563 | clear_bit(dlm->node_num, iter.node_map); | ||
2564 | spin_unlock(&dlm->spinlock); | ||
2565 | |||
2566 | mlog(0, "now time to do a migrate request to other nodes\n"); | ||
2567 | ret = dlm_do_migrate_request(dlm, res, old_master, | ||
2568 | dlm->node_num, &iter); | ||
2569 | if (ret < 0) { | ||
2570 | mlog_errno(ret); | ||
2571 | goto leave; | ||
2572 | } | ||
2573 | |||
2574 | mlog(0, "doing assert master of %.*s to all except the original node\n", | ||
2575 | res->lockname.len, res->lockname.name); | ||
2576 | /* this call now finishes out the nodemap | ||
2577 | * even if one or more nodes die */ | ||
2578 | ret = dlm_do_assert_master(dlm, res->lockname.name, | ||
2579 | res->lockname.len, iter.node_map, | ||
2580 | DLM_ASSERT_MASTER_FINISH_MIGRATION); | ||
2581 | if (ret < 0) { | ||
2582 | /* no longer need to retry. all living nodes contacted. */ | ||
2583 | mlog_errno(ret); | ||
2584 | ret = 0; | ||
2585 | } | ||
2586 | |||
2587 | memset(iter.node_map, 0, sizeof(iter.node_map)); | ||
2588 | set_bit(old_master, iter.node_map); | ||
2589 | mlog(0, "doing assert master of %.*s back to %u\n", | ||
2590 | res->lockname.len, res->lockname.name, old_master); | ||
2591 | ret = dlm_do_assert_master(dlm, res->lockname.name, | ||
2592 | res->lockname.len, iter.node_map, | ||
2593 | DLM_ASSERT_MASTER_FINISH_MIGRATION); | ||
2594 | if (ret < 0) { | ||
2595 | mlog(0, "assert master to original master failed " | ||
2596 | "with %d.\n", ret); | ||
2597 | /* the only nonzero status here would be because of | ||
2598 | * a dead original node. we're done. */ | ||
2599 | ret = 0; | ||
2600 | } | ||
2601 | |||
2602 | /* all done, set the owner, clear the flag */ | ||
2603 | spin_lock(&res->spinlock); | ||
2604 | dlm_set_lockres_owner(dlm, res, dlm->node_num); | ||
2605 | res->state &= ~DLM_LOCK_RES_MIGRATING; | ||
2606 | spin_unlock(&res->spinlock); | ||
2607 | /* re-dirty it on the new master */ | ||
2608 | dlm_kick_thread(dlm, res); | ||
2609 | wake_up(&res->wq); | ||
2610 | leave: | ||
2611 | return ret; | ||
2612 | } | ||
2613 | |||
2614 | /* | ||
2615 | * LOCKRES AST REFCOUNT | ||
2616 | * this is integral to migration | ||
2617 | */ | ||
2618 | |||
2619 | /* for future intent to call an ast, reserve one ahead of time. | ||
2620 | * this should be called only after waiting on the lockres | ||
2621 | * with dlm_wait_on_lockres, and while still holding the | ||
2622 | * spinlock after the call. */ | ||
2623 | void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res) | ||
2624 | { | ||
2625 | assert_spin_locked(&res->spinlock); | ||
2626 | if (res->state & DLM_LOCK_RES_MIGRATING) { | ||
2627 | __dlm_print_one_lock_resource(res); | ||
2628 | } | ||
2629 | BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); | ||
2630 | |||
2631 | atomic_inc(&res->asts_reserved); | ||
2632 | } | ||
2633 | |||
2634 | /* | ||
2635 | * used to drop the reserved ast, either because it went unused, | ||
2636 | * or because the ast/bast was actually called. | ||
2637 | * | ||
2638 | * also, if there is a pending migration on this lockres, | ||
2639 | * and this was the last pending ast on the lockres, | ||
2640 | * atomically set the MIGRATING flag before we drop the lock. | ||
2641 | * this is how we ensure that migration can proceed with no | ||
2642 | * asts in progress. note that it is ok if the state of the | ||
2643 | * queues is such that a lock should be granted in the future | ||
2644 | * or that a bast should be fired, because the new master will | ||
2645 | * shuffle the lists on this lockres as soon as it is migrated. | ||
2646 | */ | ||
2647 | void dlm_lockres_release_ast(struct dlm_ctxt *dlm, | ||
2648 | struct dlm_lock_resource *res) | ||
2649 | { | ||
2650 | if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock)) | ||
2651 | return; | ||
2652 | |||
2653 | if (!res->migration_pending) { | ||
2654 | spin_unlock(&res->spinlock); | ||
2655 | return; | ||
2656 | } | ||
2657 | |||
2658 | BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); | ||
2659 | res->migration_pending = 0; | ||
2660 | res->state |= DLM_LOCK_RES_MIGRATING; | ||
2661 | spin_unlock(&res->spinlock); | ||
2662 | wake_up(&res->wq); | ||
2663 | wake_up(&dlm->migration_wq); | ||
2664 | } | ||
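
The reserve/release pair above is easiest to see outside the kernel. Below is a minimal userspace sketch of the same "last reference takes the lock" idiom, with a pthread mutex standing in for the spinlock and C11 atomics for atomic_t; all names are illustrative, not from the OCFS2 sources.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    struct fake_lockres {
        atomic_int      asts_reserved;
        pthread_mutex_t lock;              /* stands in for res->spinlock */
        bool            migration_pending;
        bool            migrating;         /* stands in for DLM_LOCK_RES_MIGRATING */
    };

    /* Mimics atomic_dec_and_lock(): returns true with the mutex held only
     * for the caller that dropped the count to zero. */
    static bool dec_and_lock(atomic_int *cnt, pthread_mutex_t *mu)
    {
        int old = atomic_load(cnt);
        while (old > 1)                    /* fast path: not the last ref */
            if (atomic_compare_exchange_weak(cnt, &old, old - 1))
                return false;
        pthread_mutex_lock(mu);            /* slow path: may be the last */
        if (atomic_fetch_sub(cnt, 1) == 1)
            return true;                   /* hit zero; lock stays held */
        pthread_mutex_unlock(mu);
        return false;
    }

    static void release_ast(struct fake_lockres *res)
    {
        if (!dec_and_lock(&res->asts_reserved, &res->lock))
            return;
        if (res->migration_pending) {
            res->migration_pending = false;
            res->migrating = true;         /* flipped atomically w.r.t. new reservations */
        }
        pthread_mutex_unlock(&res->lock);
        /* the real code also wakes res->wq and dlm->migration_wq here */
    }

Because the flag can only be set by whoever takes the count to zero while holding the lock, migration is guaranteed to start with no ASTs in flight, which is exactly what the comment above promises.
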
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c new file mode 100644 index 000000000000..0c8eb1093f00 --- /dev/null +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -0,0 +1,2132 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmrecovery.c | ||
5 | * | ||
6 | * recovery stuff | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | |||
28 | #include <linux/module.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/types.h> | ||
31 | #include <linux/slab.h> | ||
32 | #include <linux/highmem.h> | ||
33 | #include <linux/utsname.h> | ||
34 | #include <linux/init.h> | ||
35 | #include <linux/sysctl.h> | ||
36 | #include <linux/random.h> | ||
37 | #include <linux/blkdev.h> | ||
38 | #include <linux/socket.h> | ||
39 | #include <linux/inet.h> | ||
40 | #include <linux/timer.h> | ||
41 | #include <linux/kthread.h> | ||
42 | |||
43 | |||
44 | #include "cluster/heartbeat.h" | ||
45 | #include "cluster/nodemanager.h" | ||
46 | #include "cluster/tcp.h" | ||
47 | |||
48 | #include "dlmapi.h" | ||
49 | #include "dlmcommon.h" | ||
50 | #include "dlmdomain.h" | ||
51 | |||
52 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY) | ||
53 | #include "cluster/masklog.h" | ||
54 | |||
55 | static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); | ||
56 | |||
57 | static int dlm_recovery_thread(void *data); | ||
58 | void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); | ||
59 | int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); | ||
60 | static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); | ||
61 | static int dlm_do_recovery(struct dlm_ctxt *dlm); | ||
62 | |||
63 | static int dlm_pick_recovery_master(struct dlm_ctxt *dlm); | ||
64 | static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node); | ||
65 | static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); | ||
66 | static int dlm_request_all_locks(struct dlm_ctxt *dlm, | ||
67 | u8 request_from, u8 dead_node); | ||
68 | static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); | ||
69 | |||
70 | static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res); | ||
71 | static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, | ||
72 | const char *lockname, int namelen, | ||
73 | int total_locks, u64 cookie, | ||
74 | u8 flags, u8 master); | ||
75 | static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, | ||
76 | struct dlm_migratable_lockres *mres, | ||
77 | u8 send_to, | ||
78 | struct dlm_lock_resource *res, | ||
79 | int total_locks); | ||
80 | static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, | ||
81 | struct dlm_lock_resource *res, | ||
82 | u8 *real_master); | ||
83 | static int dlm_process_recovery_data(struct dlm_ctxt *dlm, | ||
84 | struct dlm_lock_resource *res, | ||
85 | struct dlm_migratable_lockres *mres); | ||
86 | static int dlm_do_master_requery(struct dlm_ctxt *dlm, | ||
87 | struct dlm_lock_resource *res, | ||
88 | u8 nodenum, u8 *real_master); | ||
89 | static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm); | ||
90 | static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, | ||
91 | u8 dead_node, u8 send_to); | ||
92 | static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node); | ||
93 | static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, | ||
94 | struct list_head *list, u8 dead_node); | ||
95 | static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, | ||
96 | u8 dead_node, u8 new_master); | ||
97 | static void dlm_reco_ast(void *astdata); | ||
98 | static void dlm_reco_bast(void *astdata, int blocked_type); | ||
99 | static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st); | ||
100 | static void dlm_request_all_locks_worker(struct dlm_work_item *item, | ||
101 | void *data); | ||
102 | static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data); | ||
103 | |||
104 | static u64 dlm_get_next_mig_cookie(void); | ||
105 | |||
106 | static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED; | ||
107 | static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED; | ||
108 | static u64 dlm_mig_cookie = 1; | ||
109 | |||
110 | static u64 dlm_get_next_mig_cookie(void) | ||
111 | { | ||
112 | u64 c; | ||
113 | spin_lock(&dlm_mig_cookie_lock); | ||
114 | c = dlm_mig_cookie; | ||
115 | if (dlm_mig_cookie == (~0ULL)) | ||
116 | dlm_mig_cookie = 1; | ||
117 | else | ||
118 | dlm_mig_cookie++; | ||
119 | spin_unlock(&dlm_mig_cookie_lock); | ||
120 | return c; | ||
121 | } | ||
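
Worth noting about the generator above: it skips zero on wraparound, so a cookie of 0 stays available to mean "this lockres fits in a single message", which is how dlm_send_one_lockres() later in this file uses it. A quick userspace check of that property (illustrative, not kernel code):

    #include <assert.h>
    #include <stdint.h>

    static uint64_t cookie = UINT64_MAX;   /* start at the wrap point */

    static uint64_t next_cookie(void)
    {
        uint64_t c = cookie;
        cookie = (cookie == UINT64_MAX) ? 1 : cookie + 1;
        return c;
    }

    int main(void)
    {
        assert(next_cookie() == UINT64_MAX);   /* last value before the wrap */
        assert(next_cookie() == 1);            /* wraps to 1, never hands out 0 */
        return 0;
    }
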
122 | |||
123 | static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) | ||
124 | { | ||
125 | spin_lock(&dlm->spinlock); | ||
126 | clear_bit(dlm->reco.dead_node, dlm->recovery_map); | ||
127 | dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; | ||
128 | dlm->reco.new_master = O2NM_INVALID_NODE_NUM; | ||
129 | spin_unlock(&dlm->spinlock); | ||
130 | } | ||
131 | |||
132 | /* Worker function used during recovery. */ | ||
133 | void dlm_dispatch_work(void *data) | ||
134 | { | ||
135 | struct dlm_ctxt *dlm = (struct dlm_ctxt *)data; | ||
136 | LIST_HEAD(tmp_list); | ||
137 | struct list_head *iter, *iter2; | ||
138 | struct dlm_work_item *item; | ||
139 | dlm_workfunc_t *workfunc; | ||
140 | |||
141 | spin_lock(&dlm->work_lock); | ||
142 | list_splice_init(&dlm->work_list, &tmp_list); | ||
143 | spin_unlock(&dlm->work_lock); | ||
144 | |||
145 | list_for_each_safe(iter, iter2, &tmp_list) { | ||
146 | item = list_entry(iter, struct dlm_work_item, list); | ||
147 | workfunc = item->func; | ||
148 | list_del_init(&item->list); | ||
149 | |||
150 | /* already have ref on dlm to avoid having | ||
151 | * it disappear. just double-check. */ | ||
152 | BUG_ON(item->dlm != dlm); | ||
153 | |||
154 | /* this is allowed to sleep and | ||
155 | * call network stuff */ | ||
156 | workfunc(item, item->data); | ||
157 | |||
158 | dlm_put(dlm); | ||
159 | kfree(item); | ||
160 | } | ||
161 | } | ||
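
The splice-then-process shape of dlm_dispatch_work() is a common kernel idiom: hold the lock only long enough to steal the whole list, then run the (possibly sleeping) work functions with no lock held. A compact userspace rendering, with a singly linked list standing in for list_head (names illustrative):

    #include <pthread.h>
    #include <stdlib.h>

    struct work {
        struct work *next;
        void (*func)(struct work *);
    };

    static pthread_mutex_t work_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct work *work_list;         /* shared, protected by work_lock */

    static void dispatch_work(void)
    {
        struct work *stolen, *w;

        pthread_mutex_lock(&work_lock);    /* cheap: just pointer swaps */
        stolen = work_list;
        work_list = NULL;
        pthread_mutex_unlock(&work_lock);

        while ((w = stolen) != NULL) {     /* no lock held: funcs may sleep */
            stolen = w->next;
            w->func(w);
            free(w);
        }
    }
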
162 | |||
163 | /* | ||
164 | * RECOVERY THREAD | ||
165 | */ | ||
166 | |||
167 | static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm) | ||
168 | { | ||
169 | /* wake the recovery thread | ||
170 | * this will wake the reco thread in one of three places | ||
171 | * 1) sleeping with no recovery happening | ||
172 | * 2) sleeping with recovery mastered elsewhere | ||
173 | * 3) recovery mastered here, waiting on reco data */ | ||
174 | |||
175 | wake_up(&dlm->dlm_reco_thread_wq); | ||
176 | } | ||
177 | |||
178 | /* Launch the recovery thread */ | ||
179 | int dlm_launch_recovery_thread(struct dlm_ctxt *dlm) | ||
180 | { | ||
181 | mlog(0, "starting dlm recovery thread...\n"); | ||
182 | |||
183 | dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm, | ||
184 | "dlm_reco_thread"); | ||
185 | if (IS_ERR(dlm->dlm_reco_thread_task)) { | ||
186 | mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task)); | ||
187 | dlm->dlm_reco_thread_task = NULL; | ||
188 | return -EINVAL; | ||
189 | } | ||
190 | |||
191 | return 0; | ||
192 | } | ||
193 | |||
194 | void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) | ||
195 | { | ||
196 | if (dlm->dlm_reco_thread_task) { | ||
197 | mlog(0, "waiting for dlm recovery thread to exit\n"); | ||
198 | kthread_stop(dlm->dlm_reco_thread_task); | ||
199 | dlm->dlm_reco_thread_task = NULL; | ||
200 | } | ||
201 | } | ||
202 | |||
203 | |||
204 | |||
205 | /* | ||
206 | * this is lame, but here's how recovery works... | ||
207 | * 1) all recovery threads cluster wide will work on recovering | ||
208 | * ONE node at a time | ||
209 | * 2) negotiate who will take over all the locks for the dead node. | ||
210 | * that's right... ALL the locks. | ||
211 | * 3) once a new master is chosen, everyone scans all locks | ||
212 | * and moves aside those mastered by the dead guy | ||
213 | * 4) each of these locks should be locked until recovery is done | ||
214 | * 5) the new master collects up all of secondary lock queue info | ||
215 | * one lock at a time, forcing each node to communicate back | ||
216 | * before continuing | ||
217 | * 6) each secondary lock queue responds with the full known lock info | ||
218 | * 7) once the new master has run all its locks, it sends an ALLDONE! | ||
219 | * message to everyone | ||
220 | * 8) upon receiving this message, the secondary queue node unlocks | ||
221 | * and responds to the ALLDONE | ||
222 | * 9) once the new master gets responses from everyone, he unlocks | ||
223 | * everything and recovery for this dead node is done | ||
224 | *10) go back to 2) while there are still dead nodes | ||
225 | * | ||
226 | */ | ||
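
Steps 5 and 6 above are tracked per node with the DLM_RECO_NODE_DATA_* states used throughout this file, and the check that gates step 7 reduces to the sketch below (state names shortened; the DEAD case is handled separately by restarting recovery from scratch):

    #include <stdbool.h>

    enum reco_node_state {
        NODE_INIT, NODE_REQUESTING, NODE_REQUESTED,
        NODE_RECEIVING, NODE_DONE, NODE_FINALIZE_SENT, NODE_DEAD,
    };

    /* true once no node still owes us data: mirrors the poll loop in
     * dlm_remaster_locks() below */
    static bool all_nodes_done(const enum reco_node_state *s, int n)
    {
        for (int i = 0; i < n; i++)
            if (s[i] == NODE_REQUESTED || s[i] == NODE_RECEIVING)
                return false;      /* still waiting on this node's data */
        return true;
    }
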
227 | |||
228 | |||
229 | #define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) | ||
230 | |||
231 | static int dlm_recovery_thread(void *data) | ||
232 | { | ||
233 | int status; | ||
234 | struct dlm_ctxt *dlm = data; | ||
235 | unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS); | ||
236 | |||
237 | mlog(0, "dlm recovery thread running for %s...\n", dlm->name); | ||
238 | |||
239 | while (!kthread_should_stop()) { | ||
240 | if (dlm_joined(dlm)) { | ||
241 | status = dlm_do_recovery(dlm); | ||
242 | if (status == -EAGAIN) { | ||
243 | /* do not sleep, recheck immediately. */ | ||
244 | continue; | ||
245 | } | ||
246 | if (status < 0) | ||
247 | mlog_errno(status); | ||
248 | } | ||
249 | |||
250 | wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq, | ||
251 | kthread_should_stop(), | ||
252 | timeout); | ||
253 | } | ||
254 | |||
255 | mlog(0, "quitting DLM recovery thread\n"); | ||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | /* callers of the top-level api calls (dlmlock/dlmunlock) should | ||
260 | * block on the dlm->reco.event when recovery is in progress. | ||
261 | * the dlm recovery thread will set this state when it begins | ||
262 | * recovering a dead node (as the new master or not) and clear | ||
263 | * the state and wake as soon as all affected lock resources have | ||
264 | * been marked with the RECOVERY flag */ | ||
265 | static int dlm_in_recovery(struct dlm_ctxt *dlm) | ||
266 | { | ||
267 | int in_recovery; | ||
268 | spin_lock(&dlm->spinlock); | ||
269 | in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE); | ||
270 | spin_unlock(&dlm->spinlock); | ||
271 | return in_recovery; | ||
272 | } | ||
273 | |||
274 | |||
275 | void dlm_wait_for_recovery(struct dlm_ctxt *dlm) | ||
276 | { | ||
277 | wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); | ||
278 | } | ||
279 | |||
280 | static void dlm_begin_recovery(struct dlm_ctxt *dlm) | ||
281 | { | ||
282 | spin_lock(&dlm->spinlock); | ||
283 | BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); | ||
284 | dlm->reco.state |= DLM_RECO_STATE_ACTIVE; | ||
285 | spin_unlock(&dlm->spinlock); | ||
286 | } | ||
287 | |||
288 | static void dlm_end_recovery(struct dlm_ctxt *dlm) | ||
289 | { | ||
290 | spin_lock(&dlm->spinlock); | ||
291 | BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); | ||
292 | dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; | ||
293 | spin_unlock(&dlm->spinlock); | ||
294 | wake_up(&dlm->reco.event); | ||
295 | } | ||
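
The three functions above form a simple gate: recovery raises a flag, and API callers sleep until it drops. The same shape in userspace terms, with a condition variable standing in for the wait queue (illustrative only):

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cv = PTHREAD_COND_INITIALIZER;
    static bool reco_active;               /* DLM_RECO_STATE_ACTIVE */

    static void begin_recovery(void)
    {
        pthread_mutex_lock(&lk);
        reco_active = true;
        pthread_mutex_unlock(&lk);
    }

    static void end_recovery(void)
    {
        pthread_mutex_lock(&lk);
        reco_active = false;
        pthread_mutex_unlock(&lk);
        pthread_cond_broadcast(&cv);       /* wake_up(&dlm->reco.event) */
    }

    static void wait_for_recovery(void)    /* dlmlock/dlmunlock callers */
    {
        pthread_mutex_lock(&lk);
        while (reco_active)
            pthread_cond_wait(&cv, &lk);
        pthread_mutex_unlock(&lk);
    }
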
296 | |||
297 | static int dlm_do_recovery(struct dlm_ctxt *dlm) | ||
298 | { | ||
299 | int status = 0; | ||
300 | |||
301 | spin_lock(&dlm->spinlock); | ||
302 | |||
303 | /* check to see if the new master has died */ | ||
304 | if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM && | ||
305 | test_bit(dlm->reco.new_master, dlm->recovery_map)) { | ||
306 | mlog(0, "new master %u died while recovering %u!\n", | ||
307 | dlm->reco.new_master, dlm->reco.dead_node); | ||
308 | /* unset the new_master, leave dead_node */ | ||
309 | dlm->reco.new_master = O2NM_INVALID_NODE_NUM; | ||
310 | } | ||
311 | |||
312 | /* select a target to recover */ | ||
313 | if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { | ||
314 | int bit; | ||
315 | |||
316 | bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0); | ||
317 | if (bit >= O2NM_MAX_NODES || bit < 0) | ||
318 | dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; | ||
319 | else | ||
320 | dlm->reco.dead_node = bit; | ||
321 | } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) { | ||
322 | /* BUG? */ | ||
323 | mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", | ||
324 | dlm->reco.dead_node); | ||
325 | dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; | ||
326 | } | ||
327 | |||
328 | if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { | ||
329 | // mlog(0, "nothing to recover! sleeping now!\n"); | ||
330 | spin_unlock(&dlm->spinlock); | ||
331 | /* return to main thread loop and sleep. */ | ||
332 | return 0; | ||
333 | } | ||
334 | mlog(0, "recovery thread found node %u in the recovery map!\n", | ||
335 | dlm->reco.dead_node); | ||
336 | spin_unlock(&dlm->spinlock); | ||
337 | |||
338 | /* take write barrier */ | ||
339 | /* (stops the list reshuffling thread, proxy ast handling) */ | ||
340 | dlm_begin_recovery(dlm); | ||
341 | |||
342 | if (dlm->reco.new_master == dlm->node_num) | ||
343 | goto master_here; | ||
344 | |||
345 | if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { | ||
346 | /* choose a new master */ | ||
347 | if (!dlm_pick_recovery_master(dlm)) { | ||
348 | /* already notified everyone. go. */ | ||
349 | dlm->reco.new_master = dlm->node_num; | ||
350 | goto master_here; | ||
351 | } | ||
352 | mlog(0, "another node will master this recovery session.\n"); | ||
353 | } | ||
354 | mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n", | ||
355 | dlm->name, dlm->reco.new_master, | ||
356 | dlm->node_num, dlm->reco.dead_node); | ||
357 | |||
358 | /* it is safe to start everything back up here | ||
359 | * because all of the dead node's lock resources | ||
360 | * have been marked as in-recovery */ | ||
361 | dlm_end_recovery(dlm); | ||
362 | |||
363 | /* sleep out in main dlm_recovery_thread loop. */ | ||
364 | return 0; | ||
365 | |||
366 | master_here: | ||
367 | mlog(0, "mastering recovery of %s:%u here(this=%u)!\n", | ||
368 | dlm->name, dlm->reco.dead_node, dlm->node_num); | ||
369 | |||
370 | status = dlm_remaster_locks(dlm, dlm->reco.dead_node); | ||
371 | if (status < 0) { | ||
372 | mlog(ML_ERROR, "error %d remastering locks for node %u, " | ||
373 | "retrying.\n", status, dlm->reco.dead_node); | ||
374 | } else { | ||
375 | /* success! see if any other nodes need recovery */ | ||
376 | dlm_reset_recovery(dlm); | ||
377 | } | ||
378 | dlm_end_recovery(dlm); | ||
379 | |||
380 | /* continue and look for another dead node */ | ||
381 | return -EAGAIN; | ||
382 | } | ||
383 | |||
384 | static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | ||
385 | { | ||
386 | int status = 0; | ||
387 | struct dlm_reco_node_data *ndata; | ||
388 | struct list_head *iter; | ||
389 | int all_nodes_done; | ||
390 | int destroy = 0; | ||
391 | int pass = 0; | ||
392 | |||
393 | status = dlm_init_recovery_area(dlm, dead_node); | ||
394 | if (status < 0) | ||
395 | goto leave; | ||
396 | |||
397 | /* safe to access the node data list without a lock, since this | ||
398 | * process is the only one to change the list */ | ||
399 | list_for_each(iter, &dlm->reco.node_data) { | ||
400 | ndata = list_entry (iter, struct dlm_reco_node_data, list); | ||
401 | BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); | ||
402 | ndata->state = DLM_RECO_NODE_DATA_REQUESTING; | ||
403 | |||
404 | mlog(0, "requesting lock info from node %u\n", | ||
405 | ndata->node_num); | ||
406 | |||
407 | if (ndata->node_num == dlm->node_num) { | ||
408 | ndata->state = DLM_RECO_NODE_DATA_DONE; | ||
409 | continue; | ||
410 | } | ||
411 | |||
412 | status = dlm_request_all_locks(dlm, ndata->node_num, dead_node); | ||
413 | if (status < 0) { | ||
414 | mlog_errno(status); | ||
415 | if (dlm_is_host_down(status)) | ||
416 | ndata->state = DLM_RECO_NODE_DATA_DEAD; | ||
417 | else { | ||
418 | destroy = 1; | ||
419 | goto leave; | ||
420 | } | ||
421 | } | ||
422 | |||
423 | switch (ndata->state) { | ||
424 | case DLM_RECO_NODE_DATA_INIT: | ||
425 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: | ||
426 | case DLM_RECO_NODE_DATA_REQUESTED: | ||
427 | BUG(); | ||
428 | break; | ||
429 | case DLM_RECO_NODE_DATA_DEAD: | ||
430 | mlog(0, "node %u died after requesting " | ||
431 | "recovery info for node %u\n", | ||
432 | ndata->node_num, dead_node); | ||
433 | // start all over | ||
434 | destroy = 1; | ||
435 | status = -EAGAIN; | ||
436 | goto leave; | ||
437 | case DLM_RECO_NODE_DATA_REQUESTING: | ||
438 | ndata->state = DLM_RECO_NODE_DATA_REQUESTED; | ||
439 | mlog(0, "now receiving recovery data from " | ||
440 | "node %u for dead node %u\n", | ||
441 | ndata->node_num, dead_node); | ||
442 | break; | ||
443 | case DLM_RECO_NODE_DATA_RECEIVING: | ||
444 | mlog(0, "already receiving recovery data from " | ||
445 | "node %u for dead node %u\n", | ||
446 | ndata->node_num, dead_node); | ||
447 | break; | ||
448 | case DLM_RECO_NODE_DATA_DONE: | ||
449 | mlog(0, "already DONE receiving recovery data " | ||
450 | "from node %u for dead node %u\n", | ||
451 | ndata->node_num, dead_node); | ||
452 | break; | ||
453 | } | ||
454 | } | ||
455 | |||
456 | mlog(0, "done requesting all lock info\n"); | ||
457 | |||
458 | /* nodes should be sending reco data now | ||
459 | * just need to wait */ | ||
460 | |||
461 | while (1) { | ||
462 | /* check all the nodes now to see if we are | ||
463 | * done, or if anyone died */ | ||
464 | all_nodes_done = 1; | ||
465 | spin_lock(&dlm_reco_state_lock); | ||
466 | list_for_each(iter, &dlm->reco.node_data) { | ||
467 | ndata = list_entry (iter, struct dlm_reco_node_data, list); | ||
468 | |||
469 | mlog(0, "checking recovery state of node %u\n", | ||
470 | ndata->node_num); | ||
471 | switch (ndata->state) { | ||
472 | case DLM_RECO_NODE_DATA_INIT: | ||
473 | case DLM_RECO_NODE_DATA_REQUESTING: | ||
474 | mlog(ML_ERROR, "bad ndata state for " | ||
475 | "node %u: state=%d\n", | ||
476 | ndata->node_num, ndata->state); | ||
477 | BUG(); | ||
478 | break; | ||
479 | case DLM_RECO_NODE_DATA_DEAD: | ||
480 | mlog(0, "node %u died after " | ||
481 | "requesting recovery info for " | ||
482 | "node %u\n", ndata->node_num, | ||
483 | dead_node); | ||
484 | spin_unlock(&dlm_reco_state_lock); | ||
485 | // start all over | ||
486 | destroy = 1; | ||
487 | status = -EAGAIN; | ||
488 | goto leave; | ||
489 | case DLM_RECO_NODE_DATA_RECEIVING: | ||
490 | case DLM_RECO_NODE_DATA_REQUESTED: | ||
491 | all_nodes_done = 0; | ||
492 | break; | ||
493 | case DLM_RECO_NODE_DATA_DONE: | ||
494 | break; | ||
495 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: | ||
496 | break; | ||
497 | } | ||
498 | } | ||
499 | spin_unlock(&dlm_reco_state_lock); | ||
500 | |||
501 | mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass, | ||
502 | all_nodes_done ? "yes" : "no"); | ||
503 | if (all_nodes_done) { | ||
504 | int ret; | ||
505 | |||
506 | /* all nodes are now in DLM_RECO_NODE_DATA_DONE state | ||
507 | * just send a finalize message to everyone and | ||
508 | * clean up */ | ||
509 | mlog(0, "all nodes are done! send finalize\n"); | ||
510 | ret = dlm_send_finalize_reco_message(dlm); | ||
511 | if (ret < 0) | ||
512 | mlog_errno(ret); | ||
513 | |||
514 | spin_lock(&dlm->spinlock); | ||
515 | dlm_finish_local_lockres_recovery(dlm, dead_node, | ||
516 | dlm->node_num); | ||
517 | spin_unlock(&dlm->spinlock); | ||
518 | mlog(0, "should be done with recovery!\n"); | ||
519 | |||
520 | mlog(0, "finishing recovery of %s at %lu, " | ||
521 | "dead=%u, this=%u, new=%u\n", dlm->name, | ||
522 | jiffies, dlm->reco.dead_node, | ||
523 | dlm->node_num, dlm->reco.new_master); | ||
524 | destroy = 1; | ||
525 | status = ret; | ||
526 | /* rescan everything marked dirty along the way */ | ||
527 | dlm_kick_thread(dlm, NULL); | ||
528 | break; | ||
529 | } | ||
530 | /* wait to be signalled, with periodic timeout | ||
531 | * to check for node death */ | ||
532 | wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq, | ||
533 | kthread_should_stop(), | ||
534 | msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS)); | ||
535 | |||
536 | } | ||
537 | |||
538 | leave: | ||
539 | if (destroy) | ||
540 | dlm_destroy_recovery_area(dlm, dead_node); | ||
541 | |||
542 | mlog_exit(status); | ||
543 | return status; | ||
544 | } | ||
545 | |||
546 | static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) | ||
547 | { | ||
548 | int num=0; | ||
549 | struct dlm_reco_node_data *ndata; | ||
550 | |||
551 | spin_lock(&dlm->spinlock); | ||
552 | memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map)); | ||
553 | /* nodes can only be removed (by dying) after dropping | ||
554 | * this lock, and death will be trapped later, so this should do */ | ||
555 | spin_unlock(&dlm->spinlock); | ||
556 | |||
557 | while (1) { | ||
558 | num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num); | ||
559 | if (num >= O2NM_MAX_NODES) { | ||
560 | break; | ||
561 | } | ||
562 | BUG_ON(num == dead_node); | ||
563 | |||
564 | ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL); | ||
565 | if (!ndata) { | ||
566 | dlm_destroy_recovery_area(dlm, dead_node); | ||
567 | return -ENOMEM; | ||
568 | } | ||
569 | ndata->node_num = num; | ||
570 | ndata->state = DLM_RECO_NODE_DATA_INIT; | ||
571 | spin_lock(&dlm_reco_state_lock); | ||
572 | list_add_tail(&ndata->list, &dlm->reco.node_data); | ||
573 | spin_unlock(&dlm_reco_state_lock); | ||
574 | num++; | ||
575 | } | ||
576 | |||
577 | return 0; | ||
578 | } | ||
579 | |||
580 | static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) | ||
581 | { | ||
582 | struct list_head *iter, *iter2; | ||
583 | struct dlm_reco_node_data *ndata; | ||
584 | LIST_HEAD(tmplist); | ||
585 | |||
586 | spin_lock(&dlm_reco_state_lock); | ||
587 | list_splice_init(&dlm->reco.node_data, &tmplist); | ||
588 | spin_unlock(&dlm_reco_state_lock); | ||
589 | |||
590 | list_for_each_safe(iter, iter2, &tmplist) { | ||
591 | ndata = list_entry (iter, struct dlm_reco_node_data, list); | ||
592 | list_del_init(&ndata->list); | ||
593 | kfree(ndata); | ||
594 | } | ||
595 | } | ||
596 | |||
597 | static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, | ||
598 | u8 dead_node) | ||
599 | { | ||
600 | struct dlm_lock_request lr; | ||
601 | enum dlm_status ret; | ||
602 | |||
603 | mlog(0, "\n"); | ||
604 | |||
605 | |||
606 | mlog(0, "dlm_request_all_locks: dead node is %u, sending request " | ||
607 | "to %u\n", dead_node, request_from); | ||
608 | |||
609 | memset(&lr, 0, sizeof(lr)); | ||
610 | lr.node_idx = dlm->node_num; | ||
611 | lr.dead_node = dead_node; | ||
612 | |||
613 | // send message | ||
614 | ret = DLM_NOLOCKMGR; | ||
615 | ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, | ||
616 | &lr, sizeof(lr), request_from, NULL); | ||
617 | |||
618 | /* negative status is handled by caller */ | ||
619 | if (ret < 0) | ||
620 | mlog_errno(ret); | ||
621 | |||
622 | // return from here, then | ||
623 | // sleep until all received or error | ||
624 | return ret; | ||
625 | |||
626 | } | ||
627 | |||
628 | int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) | ||
629 | { | ||
630 | struct dlm_ctxt *dlm = data; | ||
631 | struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf; | ||
632 | char *buf = NULL; | ||
633 | struct dlm_work_item *item = NULL; | ||
634 | |||
635 | if (!dlm_grab(dlm)) | ||
636 | return -EINVAL; | ||
637 | |||
638 | BUG_ON(lr->dead_node != dlm->reco.dead_node); | ||
639 | |||
640 | item = kcalloc(1, sizeof(*item), GFP_KERNEL); | ||
641 | if (!item) { | ||
642 | dlm_put(dlm); | ||
643 | return -ENOMEM; | ||
644 | } | ||
645 | |||
646 | /* this will get freed by dlm_request_all_locks_worker */ | ||
647 | buf = (char *) __get_free_page(GFP_KERNEL); | ||
648 | if (!buf) { | ||
649 | kfree(item); | ||
650 | dlm_put(dlm); | ||
651 | return -ENOMEM; | ||
652 | } | ||
653 | |||
654 | /* queue up work for dlm_request_all_locks_worker */ | ||
655 | dlm_grab(dlm); /* get an extra ref for the work item */ | ||
656 | dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf); | ||
657 | item->u.ral.reco_master = lr->node_idx; | ||
658 | item->u.ral.dead_node = lr->dead_node; | ||
659 | spin_lock(&dlm->work_lock); | ||
660 | list_add_tail(&item->list, &dlm->work_list); | ||
661 | spin_unlock(&dlm->work_lock); | ||
662 | schedule_work(&dlm->dispatched_work); | ||
663 | |||
664 | dlm_put(dlm); | ||
665 | return 0; | ||
666 | } | ||
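
Note the dlm_grab()/dlm_put() choreography in the handler above: the handler's own reference is dropped on return, while a second reference is taken on behalf of the queued work item and released by dlm_dispatch_work() after the work function runs. Reduced to the refcounting alone (illustrative sketch, not the kernel kref API):

    #include <stdatomic.h>

    struct ctx { atomic_int refs; };

    static void ctx_get(struct ctx *c) { atomic_fetch_add(&c->refs, 1); }

    static void ctx_put(struct ctx *c)
    {
        if (atomic_fetch_sub(&c->refs, 1) == 1) {
            /* last reference dropped: real code tears the object down */
        }
    }

    struct work_item { struct ctx *owner; };

    static void handler(struct ctx *c, struct work_item *w)
    {
        ctx_get(c);            /* ref that travels with the work item */
        w->owner = c;
        /* ... add w to the work list, kick the workqueue ... */
        /* the handler's own ref (taken at entry) is dropped by its caller */
    }

    static void run_work(struct work_item *w)
    {
        /* ... w's work function runs here, and may sleep ... */
        ctx_put(w->owner);     /* drop the enqueue-time ref */
    }
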
667 | |||
668 | static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | ||
669 | { | ||
670 | struct dlm_migratable_lockres *mres; | ||
671 | struct dlm_lock_resource *res; | ||
672 | struct dlm_ctxt *dlm; | ||
673 | LIST_HEAD(resources); | ||
674 | struct list_head *iter; | ||
675 | int ret; | ||
676 | u8 dead_node, reco_master; | ||
677 | |||
678 | dlm = item->dlm; | ||
679 | dead_node = item->u.ral.dead_node; | ||
680 | reco_master = item->u.ral.reco_master; | ||
681 | BUG_ON(dead_node != dlm->reco.dead_node); | ||
682 | BUG_ON(reco_master != dlm->reco.new_master); | ||
683 | |||
684 | mres = (struct dlm_migratable_lockres *)data; | ||
685 | |||
686 | /* lock resources should have already been moved to the | ||
687 | * dlm->reco.resources list. now move items from that list | ||
688 | * to a temp list if the dead owner matches. note that the | ||
689 | * whole cluster recovers only one node at a time, so we | ||
690 | * can safely move UNKNOWN lock resources for each recovery | ||
691 | * session. */ | ||
692 | dlm_move_reco_locks_to_list(dlm, &resources, dead_node); | ||
693 | |||
694 | /* now we can begin blasting lockreses without the dlm lock */ | ||
695 | list_for_each(iter, &resources) { | ||
696 | res = list_entry (iter, struct dlm_lock_resource, recovering); | ||
697 | ret = dlm_send_one_lockres(dlm, res, mres, reco_master, | ||
698 | DLM_MRES_RECOVERY); | ||
699 | if (ret < 0) | ||
700 | mlog_errno(ret); | ||
701 | } | ||
702 | |||
703 | /* move the resources back to the list */ | ||
704 | spin_lock(&dlm->spinlock); | ||
705 | list_splice_init(&resources, &dlm->reco.resources); | ||
706 | spin_unlock(&dlm->spinlock); | ||
707 | |||
708 | ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); | ||
709 | if (ret < 0) | ||
710 | mlog_errno(ret); | ||
711 | |||
712 | free_page((unsigned long)data); | ||
713 | } | ||
714 | |||
715 | |||
716 | static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) | ||
717 | { | ||
718 | int ret, tmpret; | ||
719 | struct dlm_reco_data_done done_msg; | ||
720 | |||
721 | memset(&done_msg, 0, sizeof(done_msg)); | ||
722 | done_msg.node_idx = dlm->node_num; | ||
723 | done_msg.dead_node = dead_node; | ||
724 | mlog(0, "sending DATA DONE message to %u, " | ||
725 | "my node=%u, dead node=%u\n", send_to, done_msg.node_idx, | ||
726 | done_msg.dead_node); | ||
727 | |||
728 | ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, | ||
729 | sizeof(done_msg), send_to, &tmpret); | ||
730 | /* negative status is ignored by the caller */ | ||
731 | if (ret >= 0) | ||
732 | ret = tmpret; | ||
733 | return ret; | ||
734 | } | ||
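
The ret/tmpret dance above is the status convention for all the o2net sends in this file: a negative transport error wins; otherwise the remote handler's status, returned through the out parameter, becomes the result the caller sees. In outline (illustrative):

    static int send_with_status(int (*xmit)(int *handler_status))
    {
        int handler_status = 0;
        int ret = xmit(&handler_status);   /* transport-level result */
        if (ret >= 0)
            ret = handler_status;          /* remote handler's verdict */
        return ret;
    }
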
735 | |||
736 | |||
737 | int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) | ||
738 | { | ||
739 | struct dlm_ctxt *dlm = data; | ||
740 | struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; | ||
741 | struct list_head *iter; | ||
742 | struct dlm_reco_node_data *ndata = NULL; | ||
743 | int ret = -EINVAL; | ||
744 | |||
745 | if (!dlm_grab(dlm)) | ||
746 | return -EINVAL; | ||
747 | |||
748 | mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " | ||
749 | "node_idx=%u, this node=%u\n", done->dead_node, | ||
750 | dlm->reco.dead_node, done->node_idx, dlm->node_num); | ||
751 | BUG_ON(done->dead_node != dlm->reco.dead_node); | ||
752 | |||
753 | spin_lock(&dlm_reco_state_lock); | ||
754 | list_for_each(iter, &dlm->reco.node_data) { | ||
755 | ndata = list_entry (iter, struct dlm_reco_node_data, list); | ||
756 | if (ndata->node_num != done->node_idx) | ||
757 | continue; | ||
758 | |||
759 | switch (ndata->state) { | ||
760 | case DLM_RECO_NODE_DATA_INIT: | ||
761 | case DLM_RECO_NODE_DATA_DEAD: | ||
762 | case DLM_RECO_NODE_DATA_DONE: | ||
763 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: | ||
764 | mlog(ML_ERROR, "bad ndata state for node %u:" | ||
765 | " state=%d\n", ndata->node_num, | ||
766 | ndata->state); | ||
767 | BUG(); | ||
768 | break; | ||
769 | case DLM_RECO_NODE_DATA_RECEIVING: | ||
770 | case DLM_RECO_NODE_DATA_REQUESTED: | ||
771 | case DLM_RECO_NODE_DATA_REQUESTING: | ||
772 | mlog(0, "node %u is DONE sending " | ||
773 | "recovery data!\n", | ||
774 | ndata->node_num); | ||
775 | |||
776 | ndata->state = DLM_RECO_NODE_DATA_DONE; | ||
777 | ret = 0; | ||
778 | break; | ||
779 | } | ||
780 | } | ||
781 | spin_unlock(&dlm_reco_state_lock); | ||
782 | |||
783 | /* wake the recovery thread, some node is done */ | ||
784 | if (!ret) | ||
785 | dlm_kick_recovery_thread(dlm); | ||
786 | |||
787 | if (ret < 0) | ||
788 | mlog(ML_ERROR, "failed to find recovery node data for node " | ||
789 | "%u\n", done->node_idx); | ||
790 | dlm_put(dlm); | ||
791 | |||
792 | mlog(0, "leaving reco data done handler, ret=%d\n", ret); | ||
793 | return ret; | ||
794 | } | ||
795 | |||
796 | static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, | ||
797 | struct list_head *list, | ||
798 | u8 dead_node) | ||
799 | { | ||
800 | struct dlm_lock_resource *res; | ||
801 | struct list_head *iter, *iter2; | ||
802 | |||
803 | spin_lock(&dlm->spinlock); | ||
804 | list_for_each_safe(iter, iter2, &dlm->reco.resources) { | ||
805 | res = list_entry (iter, struct dlm_lock_resource, recovering); | ||
806 | if (dlm_is_recovery_lock(res->lockname.name, | ||
807 | res->lockname.len)) | ||
808 | continue; | ||
809 | if (res->owner == dead_node) { | ||
810 | mlog(0, "found lockres owned by dead node while " | ||
811 | "doing recovery for node %u. sending it.\n", | ||
812 | dead_node); | ||
813 | list_del_init(&res->recovering); | ||
814 | list_add_tail(&res->recovering, list); | ||
815 | } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
816 | mlog(0, "found UNKNOWN owner while doing recovery " | ||
817 | "for node %u. sending it.\n", dead_node); | ||
818 | list_del_init(&res->recovering); | ||
819 | list_add_tail(&res->recovering, list); | ||
820 | } | ||
821 | } | ||
822 | spin_unlock(&dlm->spinlock); | ||
823 | } | ||
824 | |||
825 | static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res) | ||
826 | { | ||
827 | int total_locks = 0; | ||
828 | struct list_head *iter, *queue = &res->granted; | ||
829 | int i; | ||
830 | |||
831 | for (i=0; i<3; i++) { | ||
832 | list_for_each(iter, queue) | ||
833 | total_locks++; | ||
834 | queue++; | ||
835 | } | ||
836 | return total_locks; | ||
837 | } | ||
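
The queue++ walk above only works because granted, converting and blocked are laid out as three consecutive struct list_head members of the lock resource, so plain pointer arithmetic steps through the queues like an array; dlm_list_num_to_pointer() later in this file leans on the same layout. The assumption, spelled out (struct reduced to the essentials, field names as used in this file):

    struct list_head { struct list_head *next, *prev; };

    struct example_res {
        struct list_head granted;          /* queue index 0 */
        struct list_head converting;       /* queue index 1 */
        struct list_head blocked;          /* queue index 2 */
    };

    static struct list_head *queue_by_index(struct example_res *res, int i)
    {
        return &res->granted + i;          /* valid only for i in 0..2 */
    }
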
838 | |||
839 | |||
840 | static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, | ||
841 | struct dlm_migratable_lockres *mres, | ||
842 | u8 send_to, | ||
843 | struct dlm_lock_resource *res, | ||
844 | int total_locks) | ||
845 | { | ||
846 | u64 mig_cookie = be64_to_cpu(mres->mig_cookie); | ||
847 | int mres_total_locks = be32_to_cpu(mres->total_locks); | ||
848 | int sz, ret = 0, status = 0; | ||
849 | u8 orig_flags = mres->flags, | ||
850 | orig_master = mres->master; | ||
851 | |||
852 | BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS); | ||
853 | if (!mres->num_locks) | ||
854 | return 0; | ||
855 | |||
856 | sz = sizeof(struct dlm_migratable_lockres) + | ||
857 | (mres->num_locks * sizeof(struct dlm_migratable_lock)); | ||
858 | |||
859 | /* add an all-done flag if we reached the last lock */ | ||
860 | orig_flags = mres->flags; | ||
861 | BUG_ON(total_locks > mres_total_locks); | ||
862 | if (total_locks == mres_total_locks) | ||
863 | mres->flags |= DLM_MRES_ALL_DONE; | ||
864 | |||
865 | /* send it */ | ||
866 | ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, | ||
867 | sz, send_to, &status); | ||
868 | if (ret < 0) { | ||
869 | /* XXX: negative status is not handled. | ||
870 | * this will end up killing this node. */ | ||
871 | mlog_errno(ret); | ||
872 | } else { | ||
873 | /* might get an -ENOMEM back here */ | ||
874 | ret = status; | ||
875 | if (ret < 0) { | ||
876 | mlog_errno(ret); | ||
877 | |||
878 | if (ret == -EFAULT) { | ||
879 | mlog(ML_ERROR, "node %u told me to kill " | ||
880 | "myself!\n", send_to); | ||
881 | BUG(); | ||
882 | } | ||
883 | } | ||
884 | } | ||
885 | |||
886 | /* zero and reinit the message buffer */ | ||
887 | dlm_init_migratable_lockres(mres, res->lockname.name, | ||
888 | res->lockname.len, mres_total_locks, | ||
889 | mig_cookie, orig_flags, orig_master); | ||
890 | return ret; | ||
891 | } | ||
892 | |||
893 | static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, | ||
894 | const char *lockname, int namelen, | ||
895 | int total_locks, u64 cookie, | ||
896 | u8 flags, u8 master) | ||
897 | { | ||
898 | /* mres here is one full page */ | ||
899 | memset(mres, 0, PAGE_SIZE); | ||
900 | mres->lockname_len = namelen; | ||
901 | memcpy(mres->lockname, lockname, namelen); | ||
902 | mres->num_locks = 0; | ||
903 | mres->total_locks = cpu_to_be32(total_locks); | ||
904 | mres->mig_cookie = cpu_to_be64(cookie); | ||
905 | mres->flags = flags; | ||
906 | mres->master = master; | ||
907 | } | ||
908 | |||
909 | |||
910 | /* returns 1 if this lock fills the network structure, | ||
911 | * 0 otherwise */ | ||
912 | static int dlm_add_lock_to_array(struct dlm_lock *lock, | ||
913 | struct dlm_migratable_lockres *mres, int queue) | ||
914 | { | ||
915 | struct dlm_migratable_lock *ml; | ||
916 | int lock_num = mres->num_locks; | ||
917 | |||
918 | ml = &(mres->ml[lock_num]); | ||
919 | ml->cookie = lock->ml.cookie; | ||
920 | ml->type = lock->ml.type; | ||
921 | ml->convert_type = lock->ml.convert_type; | ||
922 | ml->highest_blocked = lock->ml.highest_blocked; | ||
923 | ml->list = queue; | ||
924 | if (lock->lksb) { | ||
925 | ml->flags = lock->lksb->flags; | ||
926 | /* send our current lvb */ | ||
927 | if (ml->type == LKM_EXMODE || | ||
928 | ml->type == LKM_PRMODE) { | ||
929 | /* if it is already set, this had better be a PR | ||
930 | * and it has to match */ | ||
931 | if (mres->lvb[0] && (ml->type == LKM_EXMODE || | ||
932 | memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { | ||
933 | mlog(ML_ERROR, "mismatched lvbs!\n"); | ||
934 | __dlm_print_one_lock_resource(lock->lockres); | ||
935 | BUG(); | ||
936 | } | ||
937 | memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN); | ||
938 | } | ||
939 | } | ||
940 | ml->node = lock->ml.node; | ||
941 | mres->num_locks++; | ||
942 | /* we reached the max, send this network message */ | ||
943 | if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS) | ||
944 | return 1; | ||
945 | return 0; | ||
946 | } | ||
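
dlm_add_lock_to_array()'s return value drives a fill-and-flush batching loop in dlm_send_one_lockres() below: append until the buffer is full, send, reset, then flush whatever remains at the end. The bare shape of it (illustrative; BATCH_MAX stands in for DLM_MAX_MIGRATABLE_LOCKS, and flush() is expected to no-op on an empty batch, just as dlm_send_mig_lockres_msg() returns early when num_locks is zero):

    #define BATCH_MAX 8

    struct batch { int items[BATCH_MAX]; int n; };

    static int batch_add(struct batch *b, int item)
    {
        b->items[b->n++] = item;
        return b->n == BATCH_MAX;          /* 1: caller must flush now */
    }

    static void send_all(struct batch *b, const int *items, int count,
                         void (*flush)(struct batch *))
    {
        for (int i = 0; i < count; i++)
            if (batch_add(b, items[i]))
                flush(b);                  /* sends and resets b->n to 0 */
        flush(b);                          /* remainder (may be empty) */
    }
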
947 | |||
948 | |||
949 | int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | ||
950 | struct dlm_migratable_lockres *mres, | ||
951 | u8 send_to, u8 flags) | ||
952 | { | ||
953 | struct list_head *queue, *iter; | ||
954 | int total_locks, i; | ||
955 | u64 mig_cookie = 0; | ||
956 | struct dlm_lock *lock; | ||
957 | int ret = 0; | ||
958 | |||
959 | BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); | ||
960 | |||
961 | mlog(0, "sending to %u\n", send_to); | ||
962 | |||
963 | total_locks = dlm_num_locks_in_lockres(res); | ||
964 | if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) { | ||
965 | /* rare, but possible */ | ||
966 | mlog(0, "argh. lockres has %d locks. this will " | ||
967 | "require more than one network packet to " | ||
968 | "migrate\n", total_locks); | ||
969 | mig_cookie = dlm_get_next_mig_cookie(); | ||
970 | } | ||
971 | |||
972 | dlm_init_migratable_lockres(mres, res->lockname.name, | ||
973 | res->lockname.len, total_locks, | ||
974 | mig_cookie, flags, res->owner); | ||
975 | |||
976 | total_locks = 0; | ||
977 | for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { | ||
978 | queue = dlm_list_idx_to_ptr(res, i); | ||
979 | list_for_each(iter, queue) { | ||
980 | lock = list_entry (iter, struct dlm_lock, list); | ||
981 | |||
982 | /* add another lock. */ | ||
983 | total_locks++; | ||
984 | if (!dlm_add_lock_to_array(lock, mres, i)) | ||
985 | continue; | ||
986 | |||
987 | /* this filled the lock message, | ||
988 | * we must send it immediately. */ | ||
989 | ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, | ||
990 | res, total_locks); | ||
991 | if (ret < 0) { | ||
992 | // TODO | ||
993 | mlog(ML_ERROR, "dlm_send_mig_lockres_msg " | ||
994 | "returned %d, TODO\n", ret); | ||
995 | BUG(); | ||
996 | } | ||
997 | } | ||
998 | } | ||
999 | /* flush any remaining locks */ | ||
1000 | ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); | ||
1001 | if (ret < 0) { | ||
1002 | // TODO | ||
1003 | mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, " | ||
1004 | "TODO\n", ret); | ||
1005 | BUG(); | ||
1006 | } | ||
1007 | return ret; | ||
1008 | } | ||
1009 | |||
1010 | |||
1011 | |||
1012 | /* | ||
1013 | * this message will contain no more than one page worth of | ||
1014 | * recovery data, and it will work on only one lockres. | ||
1015 | * there may be many locks in this page, and we may need to wait | ||
1016 | * for additional packets to complete all the locks (rare, but | ||
1017 | * possible). | ||
1018 | */ | ||
1019 | /* | ||
1020 | * NOTE: the allocation error cases here are scary | ||
1021 | * we really cannot afford to fail an alloc in recovery | ||
1022 | * do we spin? returning an error only delays the problem really | ||
1023 | */ | ||
1024 | |||
1025 | int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) | ||
1026 | { | ||
1027 | struct dlm_ctxt *dlm = data; | ||
1028 | struct dlm_migratable_lockres *mres = | ||
1029 | (struct dlm_migratable_lockres *)msg->buf; | ||
1030 | int ret = 0; | ||
1031 | u8 real_master; | ||
1032 | char *buf = NULL; | ||
1033 | struct dlm_work_item *item = NULL; | ||
1034 | struct dlm_lock_resource *res = NULL; | ||
1035 | |||
1036 | if (!dlm_grab(dlm)) | ||
1037 | return -EINVAL; | ||
1038 | |||
1039 | BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); | ||
1040 | |||
1041 | real_master = mres->master; | ||
1042 | if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
1043 | /* cannot migrate a lockres with no master */ | ||
1044 | BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); | ||
1045 | } | ||
1046 | |||
1047 | mlog(0, "%s message received from node %u\n", | ||
1048 | (mres->flags & DLM_MRES_RECOVERY) ? | ||
1049 | "recovery" : "migration", mres->master); | ||
1050 | if (mres->flags & DLM_MRES_ALL_DONE) | ||
1051 | mlog(0, "all done flag. all lockres data received!\n"); | ||
1052 | |||
1053 | ret = -ENOMEM; | ||
1054 | buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL); | ||
1055 | item = kcalloc(1, sizeof(*item), GFP_KERNEL); | ||
1056 | if (!buf || !item) | ||
1057 | goto leave; | ||
1058 | |||
1059 | /* lookup the lock to see if we have a secondary queue for this | ||
1060 | * already... just add the locks in and this will have its owner | ||
1061 | * and RECOVERY flag changed when it completes. */ | ||
1062 | res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len); | ||
1063 | if (res) { | ||
1064 | /* this will get a ref on res */ | ||
1065 | /* mark it as recovering/migrating and hash it */ | ||
1066 | spin_lock(&res->spinlock); | ||
1067 | if (mres->flags & DLM_MRES_RECOVERY) { | ||
1068 | res->state |= DLM_LOCK_RES_RECOVERING; | ||
1069 | } else { | ||
1070 | if (res->state & DLM_LOCK_RES_MIGRATING) { | ||
1071 | /* this is at least the second | ||
1072 | * lockres message */ | ||
1073 | mlog(0, "lock %.*s is already migrating\n", | ||
1074 | mres->lockname_len, | ||
1075 | mres->lockname); | ||
1076 | } else if (res->state & DLM_LOCK_RES_RECOVERING) { | ||
1077 | /* caller should BUG */ | ||
1078 | mlog(ML_ERROR, "node is attempting to migrate " | ||
1079 | "lock %.*s, but marked as recovering!\n", | ||
1080 | mres->lockname_len, mres->lockname); | ||
1081 | ret = -EFAULT; | ||
1082 | spin_unlock(&res->spinlock); | ||
1083 | goto leave; | ||
1084 | } | ||
1085 | res->state |= DLM_LOCK_RES_MIGRATING; | ||
1086 | } | ||
1087 | spin_unlock(&res->spinlock); | ||
1088 | } else { | ||
1089 | /* need to allocate, just like if it was | ||
1090 | * mastered here normally */ | ||
1091 | res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len); | ||
1092 | if (!res) | ||
1093 | goto leave; | ||
1094 | |||
1095 | /* to match the ref that we would have gotten if | ||
1096 | * dlm_lookup_lockres had succeeded */ | ||
1097 | dlm_lockres_get(res); | ||
1098 | |||
1099 | /* mark it as recovering/migrating and hash it */ | ||
1100 | if (mres->flags & DLM_MRES_RECOVERY) | ||
1101 | res->state |= DLM_LOCK_RES_RECOVERING; | ||
1102 | else | ||
1103 | res->state |= DLM_LOCK_RES_MIGRATING; | ||
1104 | |||
1105 | spin_lock(&dlm->spinlock); | ||
1106 | __dlm_insert_lockres(dlm, res); | ||
1107 | spin_unlock(&dlm->spinlock); | ||
1108 | |||
1109 | /* now that the new lockres is inserted, | ||
1110 | * make it usable by other processes */ | ||
1111 | spin_lock(&res->spinlock); | ||
1112 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; | ||
1113 | spin_unlock(&res->spinlock); | ||
1114 | |||
1115 | /* add an extra ref for just-allocated lockres | ||
1116 | * otherwise the lockres will be purged immediately */ | ||
1117 | dlm_lockres_get(res); | ||
1118 | |||
1119 | } | ||
1120 | |||
1121 | /* at this point we have allocated everything we need, | ||
1122 | * and we have a hashed lockres with an extra ref and | ||
1123 | * the proper res->state flags. */ | ||
1124 | ret = 0; | ||
1125 | if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
1126 | /* migration cannot have an unknown master */ | ||
1127 | BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); | ||
1128 | mlog(0, "recovery has passed me a lockres with an " | ||
1129 | "unknown owner.. will need to requery: " | ||
1130 | "%.*s\n", mres->lockname_len, mres->lockname); | ||
1131 | } else { | ||
1132 | spin_lock(&res->spinlock); | ||
1133 | dlm_change_lockres_owner(dlm, res, dlm->node_num); | ||
1134 | spin_unlock(&res->spinlock); | ||
1135 | } | ||
1136 | |||
1137 | /* queue up work for dlm_mig_lockres_worker */ | ||
1138 | dlm_grab(dlm); /* get an extra ref for the work item */ | ||
1139 | memcpy(buf, msg->buf, be16_to_cpu(msg->data_len)); /* copy the whole message */ | ||
1140 | dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf); | ||
1141 | item->u.ml.lockres = res; /* already have a ref */ | ||
1142 | item->u.ml.real_master = real_master; | ||
1143 | spin_lock(&dlm->work_lock); | ||
1144 | list_add_tail(&item->list, &dlm->work_list); | ||
1145 | spin_unlock(&dlm->work_lock); | ||
1146 | schedule_work(&dlm->dispatched_work); | ||
1147 | |||
1148 | leave: | ||
1149 | dlm_put(dlm); | ||
1150 | if (ret < 0) { | ||
1151 | if (buf) | ||
1152 | kfree(buf); | ||
1153 | if (item) | ||
1154 | kfree(item); | ||
1155 | } | ||
1156 | |||
1157 | mlog_exit(ret); | ||
1158 | return ret; | ||
1159 | } | ||
1160 | |||
1161 | |||
1162 | static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data) | ||
1163 | { | ||
1164 | struct dlm_ctxt *dlm; | ||
1165 | struct dlm_migratable_lockres *mres; | ||
1166 | int ret = 0; | ||
1167 | struct dlm_lock_resource *res; | ||
1168 | u8 real_master; | ||
1169 | |||
1170 | dlm = item->dlm; | ||
1171 | mres = (struct dlm_migratable_lockres *)data; | ||
1172 | |||
1173 | res = item->u.ml.lockres; | ||
1174 | real_master = item->u.ml.real_master; | ||
1175 | |||
1176 | if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
1177 | /* this case is super-rare. only occurs if | ||
1178 | * node death happens during migration. */ | ||
1179 | again: | ||
1180 | ret = dlm_lockres_master_requery(dlm, res, &real_master); | ||
1181 | if (ret < 0) { | ||
1182 | mlog(0, "dlm_lockres_master_requery failure: %d\n", | ||
1183 | ret); | ||
1184 | goto again; | ||
1185 | } | ||
1186 | if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
1187 | mlog(0, "lockres %.*s not claimed. " | ||
1188 | "this node will take it.\n", | ||
1189 | res->lockname.len, res->lockname.name); | ||
1190 | } else { | ||
1191 | mlog(0, "master needs to respond to sender " | ||
1192 | "that node %u still owns %.*s\n", | ||
1193 | real_master, res->lockname.len, | ||
1194 | res->lockname.name); | ||
1195 | /* cannot touch this lockres */ | ||
1196 | goto leave; | ||
1197 | } | ||
1198 | } | ||
1199 | |||
1200 | ret = dlm_process_recovery_data(dlm, res, mres); | ||
1201 | if (ret < 0) | ||
1202 | mlog(0, "dlm_process_recovery_data returned %d\n", ret); | ||
1203 | else | ||
1204 | mlog(0, "dlm_process_recovery_data succeeded\n"); | ||
1205 | |||
1206 | if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) == | ||
1207 | (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) { | ||
1208 | ret = dlm_finish_migration(dlm, res, mres->master); | ||
1209 | if (ret < 0) | ||
1210 | mlog_errno(ret); | ||
1211 | } | ||
1212 | |||
1213 | leave: | ||
1214 | kfree(data); | ||
1215 | mlog_exit(ret); | ||
1216 | } | ||
1217 | |||
1218 | |||
1219 | |||
1220 | static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, | ||
1221 | struct dlm_lock_resource *res, | ||
1222 | u8 *real_master) | ||
1223 | { | ||
1224 | struct dlm_node_iter iter; | ||
1225 | int nodenum; | ||
1226 | int ret = 0; | ||
1227 | |||
1228 | *real_master = DLM_LOCK_RES_OWNER_UNKNOWN; | ||
1229 | |||
1230 | /* we only reach here if one of the two nodes in a | ||
1231 | * migration died while the migration was in progress. | ||
1232 | * at this point we need to requery the master. we | ||
1233 | * know that the new_master got as far as creating | ||
1234 | * an mle on at least one node, but we do not know | ||
1235 | * if any nodes had actually cleared the mle and set | ||
1236 | * the master to the new_master. the old master | ||
1237 | * is supposed to set the owner to UNKNOWN in the | ||
1238 | * event of a new_master death, so the only possible | ||
1239 | * responses that we can get from nodes here are | ||
1240 | * that the master is new_master, or that the master | ||
1241 | * is UNKNOWN. | ||
1242 | * if all nodes come back with UNKNOWN then we know | ||
1243 | * the lock needs remastering here. | ||
1244 | * if any node comes back with a valid master, check | ||
1245 | * to see if that master is the one that we are | ||
1246 | * recovering. if so, then the new_master died and | ||
1247 | * we need to remaster this lock. if not, then the | ||
1248 | * new_master survived and that node will respond to | ||
1249 | * other nodes about the owner. | ||
1250 | * if there is an owner, this node needs to dump this | ||
1251 | * lockres and alert the sender that this lockres | ||
1252 | * was rejected. */ | ||
1253 | spin_lock(&dlm->spinlock); | ||
1254 | dlm_node_iter_init(dlm->domain_map, &iter); | ||
1255 | spin_unlock(&dlm->spinlock); | ||
1256 | |||
1257 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | ||
1258 | /* do not send to self */ | ||
1259 | if (nodenum == dlm->node_num) | ||
1260 | continue; | ||
1261 | ret = dlm_do_master_requery(dlm, res, nodenum, real_master); | ||
1262 | if (ret < 0) { | ||
1263 | mlog_errno(ret); | ||
1264 | BUG(); | ||
1265 | /* TODO: need to figure a way to restart this */ | ||
1266 | } | ||
1267 | if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
1268 | mlog(0, "lock master is %u\n", *real_master); | ||
1269 | break; | ||
1270 | } | ||
1271 | } | ||
1272 | return ret; | ||
1273 | } | ||
1274 | |||
1275 | |||
1276 | static int dlm_do_master_requery(struct dlm_ctxt *dlm, | ||
1277 | struct dlm_lock_resource *res, | ||
1278 | u8 nodenum, u8 *real_master) | ||
1279 | { | ||
1280 | int ret = -EINVAL; | ||
1281 | struct dlm_master_requery req; | ||
1282 | int status = DLM_LOCK_RES_OWNER_UNKNOWN; | ||
1283 | |||
1284 | memset(&req, 0, sizeof(req)); | ||
1285 | req.node_idx = dlm->node_num; | ||
1286 | req.namelen = res->lockname.len; | ||
1287 | memcpy(req.name, res->lockname.name, res->lockname.len); | ||
1288 | |||
1289 | ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key, | ||
1290 | &req, sizeof(req), nodenum, &status); | ||
1291 | /* XXX: negative status not handled properly here. */ | ||
1292 | if (ret < 0) | ||
1293 | mlog_errno(ret); | ||
1294 | else { | ||
1295 | BUG_ON(status < 0); | ||
1296 | BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); | ||
1297 | *real_master = (u8) (status & 0xff); | ||
1298 | mlog(0, "node %u responded to master requery with %u\n", | ||
1299 | nodenum, *real_master); | ||
1300 | ret = 0; | ||
1301 | } | ||
1302 | return ret; | ||
1303 | } | ||
1304 | |||
1305 | |||
1306 | /* this function cannot error, so unless the sending | ||
1307 | * or receiving of the message failed, the owner can | ||
1308 | * be trusted */ | ||
1309 | int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) | ||
1310 | { | ||
1311 | struct dlm_ctxt *dlm = data; | ||
1312 | struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; | ||
1313 | struct dlm_lock_resource *res = NULL; | ||
1314 | int master = DLM_LOCK_RES_OWNER_UNKNOWN; | ||
1315 | u32 flags = DLM_ASSERT_MASTER_REQUERY; | ||
1316 | |||
1317 | if (!dlm_grab(dlm)) { | ||
1318 | /* since the domain has gone away on this | ||
1319 | * node, the proper response is UNKNOWN */ | ||
1320 | return master; | ||
1321 | } | ||
1322 | |||
1323 | spin_lock(&dlm->spinlock); | ||
1324 | res = __dlm_lookup_lockres(dlm, req->name, req->namelen); | ||
1325 | if (res) { | ||
1326 | spin_lock(&res->spinlock); | ||
1327 | master = res->owner; | ||
1328 | if (master == dlm->node_num) { | ||
1329 | int ret = dlm_dispatch_assert_master(dlm, res, | ||
1330 | 0, 0, flags); | ||
1331 | if (ret < 0) { | ||
1332 | mlog_errno(ret); | ||
1333 | /* retry!? */ | ||
1334 | BUG(); | ||
1335 | } | ||
1336 | } | ||
1337 | spin_unlock(&res->spinlock); | ||
1338 | } | ||
1339 | spin_unlock(&dlm->spinlock); | ||
1340 | |||
1341 | dlm_put(dlm); | ||
1342 | return master; | ||
1343 | } | ||
1344 | |||
1345 | static inline struct list_head * | ||
1346 | dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num) | ||
1347 | { | ||
1348 | struct list_head *ret; | ||
1349 | BUG_ON(list_num < 0); | ||
1350 | BUG_ON(list_num > 2); | ||
1351 | ret = &(res->granted); | ||
1352 | ret += list_num; | ||
1353 | return ret; | ||
1354 | } | ||
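
The pointer arithmetic in dlm_list_num_to_pointer() only works because the three queue heads are declared back to back in struct dlm_lock_resource, in the same order as the list numbers. A minimal layout sketch of that assumption (the real definition lives in dlmcommon.h and carries many more fields):

	struct dlm_lock_resource {
		/* ... */
		struct list_head granted;	/* list_num 0 */
		struct list_head converting;	/* list_num 1 */
		struct list_head blocked;	/* list_num 2 */
		/* ... */
	};

If the declaration order ever changed, the BUG_ON() range checks above would not catch it; the helper would silently return the wrong queue.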
1355 | /* TODO: do ast flush business | ||
1356 | * TODO: do MIGRATING and RECOVERING spinning | ||
1357 | */ | ||
1358 | |||
1359 | /* | ||
1360 | * NOTE about in-flight requests during migration: | ||
1361 | * | ||
1362 | * Before attempting the migrate, the master has marked the lockres as | ||
1363 | * MIGRATING and then flushed all of its pending ASTS. So any in-flight | ||
1364 | * requests either got queued before the MIGRATING flag got set, in which | ||
1365 | * case the lock data will reflect the change and a return message is on | ||
1366 | * the way, or the request failed to get in before MIGRATING got set. In | ||
1367 | * this case, the caller will be told to spin and wait for the MIGRATING | ||
1368 | * flag to be dropped, then recheck the master. | ||
1369 | * This holds true for the convert, cancel and unlock cases, and since lvb | ||
1370 | * updates are tied to these same messages, it applies to lvb updates as | ||
1371 | * well. For the lock case, there is no way a lock can be on the master | ||
1372 | * queue and not be on the secondary queue since the lock is always added | ||
1373 | * locally first. This means that the new target node will never be sent | ||
1374 | * a lock that he doesn't already have on the list. | ||
1375 | * In total, this means that the local lock is correct and should not be | ||
1376 | * updated to match the one sent by the master. Any messages sent back | ||
1377 | * from the master before the MIGRATING flag will bring the lock properly | ||
1378 | * up-to-date, and the change will be ordered properly for the waiter. | ||
1379 | * We will *not* attempt to modify the lock underneath the waiter. | ||
1380 | */ | ||
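
As a rough timeline of the note above (illustrative only; the bar marks the point where MIGRATING is set and pending ASTs are flushed):

	master:    ...normal traffic...  |  MIGRATING set, ASTs flushed, locks sent
	client A:  convert sent before the bar -> reply is already ordered; the
	           local lock is brought up to date by the master's response
	client B:  convert sent after the bar  -> told to spin until MIGRATING
	           drops, then re-query the master and retry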
1381 | |||
1382 | static int dlm_process_recovery_data(struct dlm_ctxt *dlm, | ||
1383 | struct dlm_lock_resource *res, | ||
1384 | struct dlm_migratable_lockres *mres) | ||
1385 | { | ||
1386 | struct dlm_migratable_lock *ml; | ||
1387 | struct list_head *queue; | ||
1388 | struct dlm_lock *newlock = NULL; | ||
1389 | struct dlm_lockstatus *lksb = NULL; | ||
1390 | int ret = 0; | ||
1391 | int i; | ||
1392 | struct list_head *iter; | ||
1393 | struct dlm_lock *lock = NULL; | ||
1394 | |||
1395 | mlog(0, "running %d locks for this lockres\n", mres->num_locks); | ||
1396 | for (i=0; i<mres->num_locks; i++) { | ||
1397 | ml = &(mres->ml[i]); | ||
1398 | BUG_ON(ml->highest_blocked != LKM_IVMODE); | ||
1399 | newlock = NULL; | ||
1400 | lksb = NULL; | ||
1401 | |||
1402 | queue = dlm_list_num_to_pointer(res, ml->list); | ||
1403 | |||
1404 | /* if the lock is for the local node it needs to | ||
1405 | * be moved to the proper location within the queue. | ||
1406 | * do not allocate a new lock structure. */ | ||
1407 | if (ml->node == dlm->node_num) { | ||
1408 | /* MIGRATION ONLY! */ | ||
1409 | BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); | ||
1410 | |||
1411 | spin_lock(&res->spinlock); | ||
1412 | list_for_each(iter, queue) { | ||
1413 | lock = list_entry (iter, struct dlm_lock, list); | ||
1414 | if (lock->ml.cookie != ml->cookie) | ||
1415 | lock = NULL; | ||
1416 | else | ||
1417 | break; | ||
1418 | } | ||
1419 | |||
1420 | /* lock is always created locally first, and | ||
1421 | * destroyed locally last. it must be on the list */ | ||
1422 | if (!lock) { | ||
1423 | mlog(ML_ERROR, "could not find local lock " | ||
1424 | "with cookie %"MLFu64"!\n", | ||
1425 | ml->cookie); | ||
1426 | BUG(); | ||
1427 | } | ||
1428 | BUG_ON(lock->ml.node != ml->node); | ||
1429 | |||
1430 | /* see NOTE above about why we do not update | ||
1431 | * to match the master here */ | ||
1432 | |||
1433 | /* move the lock to its proper place */ | ||
1434 | /* do not alter lock refcount. switching lists. */ | ||
1435 | list_del_init(&lock->list); | ||
1436 | list_add_tail(&lock->list, queue); | ||
1437 | spin_unlock(&res->spinlock); | ||
1438 | |||
1439 | mlog(0, "just reordered a local lock!\n"); | ||
1440 | continue; | ||
1441 | } | ||
1442 | |||
1443 | /* lock is for another node. */ | ||
1444 | newlock = dlm_new_lock(ml->type, ml->node, | ||
1445 | be64_to_cpu(ml->cookie), NULL); | ||
1446 | if (!newlock) { | ||
1447 | ret = -ENOMEM; | ||
1448 | goto leave; | ||
1449 | } | ||
1450 | lksb = newlock->lksb; | ||
1451 | dlm_lock_attach_lockres(newlock, res); | ||
1452 | |||
1453 | if (ml->convert_type != LKM_IVMODE) { | ||
1454 | BUG_ON(queue != &res->converting); | ||
1455 | newlock->ml.convert_type = ml->convert_type; | ||
1456 | } | ||
1457 | lksb->flags |= (ml->flags & | ||
1458 | (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); | ||
1459 | |||
1460 | if (mres->lvb[0]) { | ||
1461 | if (lksb->flags & DLM_LKSB_PUT_LVB) { | ||
1462 | /* other node was trying to update | ||
1463 | * lvb when node died. recreate the | ||
1464 | * lksb with the updated lvb. */ | ||
1465 | memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); | ||
1466 | } else { | ||
1467 | /* otherwise, the node is sending its | ||
1468 | * most recent valid lvb info */ | ||
1469 | BUG_ON(ml->type != LKM_EXMODE && | ||
1470 | ml->type != LKM_PRMODE); | ||
1471 | if (res->lvb[0] && (ml->type == LKM_EXMODE || | ||
1472 | memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { | ||
1473 | mlog(ML_ERROR, "received bad lvb!\n"); | ||
1474 | __dlm_print_one_lock_resource(res); | ||
1475 | BUG(); | ||
1476 | } | ||
1477 | memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); | ||
1478 | } | ||
1479 | } | ||
1480 | |||
1481 | |||
1482 | /* NOTE: | ||
1483 | * wrt lock queue ordering and recovery: | ||
1484 | * 1. order of locks on granted queue is | ||
1485 | * meaningless. | ||
1486 | * 2. order of locks on converting queue is | ||
1487 | * LOST with the node death. sorry charlie. | ||
1488 | * 3. order of locks on the blocked queue is | ||
1489 | * also LOST. | ||
1490 | * order of locks does not affect integrity, it | ||
1491 | * just means that a lock request may get pushed | ||
1492 | * back in line as a result of the node death. | ||
1493 | * also note that for a given node the lock order | ||
1494 | * for its secondary queue locks is preserved | ||
1495 | * relative to each other, but clearly *not* | ||
1496 | * preserved relative to locks from other nodes. | ||
1497 | */ | ||
1498 | spin_lock(&res->spinlock); | ||
1499 | dlm_lock_get(newlock); | ||
1500 | list_add_tail(&newlock->list, queue); | ||
1501 | spin_unlock(&res->spinlock); | ||
1502 | } | ||
1503 | mlog(0, "done running all the locks\n"); | ||
1504 | |||
1505 | leave: | ||
1506 | if (ret < 0) { | ||
1507 | mlog_errno(ret); | ||
1508 | if (newlock) | ||
1509 | dlm_lock_put(newlock); | ||
1510 | } | ||
1511 | |||
1512 | mlog_exit(ret); | ||
1513 | return ret; | ||
1514 | } | ||
1515 | |||
1516 | void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, | ||
1517 | struct dlm_lock_resource *res) | ||
1518 | { | ||
1519 | int i; | ||
1520 | struct list_head *queue, *iter, *iter2; | ||
1521 | struct dlm_lock *lock; | ||
1522 | |||
1523 | res->state |= DLM_LOCK_RES_RECOVERING; | ||
1524 | if (!list_empty(&res->recovering)) | ||
1525 | list_del_init(&res->recovering); | ||
1526 | list_add_tail(&res->recovering, &dlm->reco.resources); | ||
1527 | |||
1528 | /* find any pending locks and put them back on proper list */ | ||
1529 | for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { | ||
1530 | queue = dlm_list_idx_to_ptr(res, i); | ||
1531 | list_for_each_safe(iter, iter2, queue) { | ||
1532 | lock = list_entry (iter, struct dlm_lock, list); | ||
1533 | dlm_lock_get(lock); | ||
1534 | if (lock->convert_pending) { | ||
1535 | /* move converting lock back to granted */ | ||
1536 | BUG_ON(i != DLM_CONVERTING_LIST); | ||
1537 | mlog(0, "node died with convert pending " | ||
1538 | "on %.*s. move back to granted list.\n", | ||
1539 | res->lockname.len, res->lockname.name); | ||
1540 | dlm_revert_pending_convert(res, lock); | ||
1541 | lock->convert_pending = 0; | ||
1542 | } else if (lock->lock_pending) { | ||
1543 | /* remove pending lock requests completely */ | ||
1544 | BUG_ON(i != DLM_BLOCKED_LIST); | ||
1545 | mlog(0, "node died with lock pending " | ||
1546 | "on %.*s. remove from blocked list and skip.\n", | ||
1547 | res->lockname.len, res->lockname.name); | ||
1548 | /* lock will be floating until ref in | ||
1549 | * dlmlock_remote is freed after the network | ||
1550 | * call returns. ok for it to not be on any | ||
1551 | * list since no ast can be called | ||
1552 | * (the master is dead). */ | ||
1553 | dlm_revert_pending_lock(res, lock); | ||
1554 | lock->lock_pending = 0; | ||
1555 | } else if (lock->unlock_pending) { | ||
1556 | /* if an unlock was in progress, treat as | ||
1557 | * if this had completed successfully | ||
1558 | * before sending this lock state to the | ||
1559 | * new master. note that the dlm_unlock | ||
1560 | * call is still responsible for calling | ||
1561 | * the unlockast. that will happen after | ||
1562 | * the network call times out. for now, | ||
1563 | * just move lists to prepare the new | ||
1564 | * recovery master. */ | ||
1565 | BUG_ON(i != DLM_GRANTED_LIST); | ||
1566 | mlog(0, "node died with unlock pending " | ||
1567 | "on %.*s. remove from blocked list and skip.\n", | ||
1568 | res->lockname.len, res->lockname.name); | ||
1569 | dlm_commit_pending_unlock(res, lock); | ||
1570 | lock->unlock_pending = 0; | ||
1571 | } else if (lock->cancel_pending) { | ||
1572 | /* if a cancel was in progress, treat as | ||
1573 | * if this had completed successfully | ||
1574 | * before sending this lock state to the | ||
1575 | * new master */ | ||
1576 | BUG_ON(i != DLM_CONVERTING_LIST); | ||
1577 | mlog(0, "node died with cancel pending " | ||
1578 | "on %.*s. move back to granted list.\n", | ||
1579 | res->lockname.len, res->lockname.name); | ||
1580 | dlm_commit_pending_cancel(res, lock); | ||
1581 | lock->cancel_pending = 0; | ||
1582 | } | ||
1583 | dlm_lock_put(lock); | ||
1584 | } | ||
1585 | } | ||
1586 | } | ||
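
The four pending cases above reduce to the following dispositions (a summary of the code, not new behavior):

	pending op   queue at node death   disposition before sending to new master
	convert      converting            revert to granted at the old mode
	lock         blocked               remove entirely (no AST can fire)
	unlock       granted               treat the unlock as having completed
	cancel       converting            treat the cancel as completed; the lock
	                                   moves back to the granted queue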
1587 | |||
1588 | |||
1589 | |||
1590 | /* removes all recovered locks from the recovery list. | ||
1591 | * sets the res->owner to the new master. | ||
1592 | * unsets the RECOVERY flag and wakes waiters. */ | ||
1593 | static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, | ||
1594 | u8 dead_node, u8 new_master) | ||
1595 | { | ||
1596 | int i; | ||
1597 | struct list_head *iter, *iter2, *bucket; | ||
1598 | struct dlm_lock_resource *res; | ||
1599 | |||
1600 | mlog_entry_void(); | ||
1601 | |||
1602 | assert_spin_locked(&dlm->spinlock); | ||
1603 | |||
1604 | list_for_each_safe(iter, iter2, &dlm->reco.resources) { | ||
1605 | res = list_entry (iter, struct dlm_lock_resource, recovering); | ||
1606 | if (res->owner == dead_node) { | ||
1607 | list_del_init(&res->recovering); | ||
1608 | spin_lock(&res->spinlock); | ||
1609 | dlm_change_lockres_owner(dlm, res, new_master); | ||
1610 | res->state &= ~DLM_LOCK_RES_RECOVERING; | ||
1611 | __dlm_dirty_lockres(dlm, res); | ||
1612 | spin_unlock(&res->spinlock); | ||
1613 | wake_up(&res->wq); | ||
1614 | } | ||
1615 | } | ||
1616 | |||
1617 | /* this will become unnecessary eventually, but | ||
1618 | * for now we need to run the whole hash, clear | ||
1619 | * the RECOVERING state and set the owner | ||
1620 | * if necessary */ | ||
1621 | for (i=0; i<DLM_HASH_SIZE; i++) { | ||
1622 | bucket = &(dlm->resources[i]); | ||
1623 | list_for_each(iter, bucket) { | ||
1624 | res = list_entry (iter, struct dlm_lock_resource, list); | ||
1625 | if (res->state & DLM_LOCK_RES_RECOVERING) { | ||
1626 | if (res->owner == dead_node) { | ||
1627 | mlog(0, "(this=%u) res %.*s was not on " | ||
1628 | "recovering list, but clearing state " | ||
1629 | "anyway (new master %u)\n", | ||
1630 | dlm->node_num, res->lockname.len, | ||
1631 | res->lockname.name, new_master); | ||
1632 | } else if (res->owner == dlm->node_num) { | ||
1633 | mlog(0, "(this=%u) res %.*s was not on " | ||
1634 | "recovering list, owner is THIS node, " | ||
1635 | "clearing (new master %u)\n", | ||
1636 | dlm->node_num, res->lockname.len, | ||
1637 | res->lockname.name, new_master); | ||
1638 | } else | ||
1639 | continue; | ||
1640 | |||
1641 | spin_lock(&res->spinlock); | ||
1642 | dlm_change_lockres_owner(dlm, res, new_master); | ||
1643 | res->state &= ~DLM_LOCK_RES_RECOVERING; | ||
1644 | __dlm_dirty_lockres(dlm, res); | ||
1645 | spin_unlock(&res->spinlock); | ||
1646 | wake_up(&res->wq); | ||
1647 | } | ||
1648 | } | ||
1649 | } | ||
1650 | } | ||
1651 | |||
1652 | static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local) | ||
1653 | { | ||
1654 | if (local) { | ||
1655 | if (lock->ml.type != LKM_EXMODE && | ||
1656 | lock->ml.type != LKM_PRMODE) | ||
1657 | return 1; | ||
1658 | } else if (lock->ml.type == LKM_EXMODE) | ||
1659 | return 1; | ||
1660 | return 0; | ||
1661 | } | ||
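
Worked examples of the rule above, matching how dlm_revalidate_lvb() below chooses "local":

	/* master here (local == 0), scanning the dead node's locks:
	 *   dead node held EX -> invalidate (it may have written a newer lvb)
	 *   dead node held PR -> keep (PR holders cannot write the lvb)
	 * not the master (local == 1), scanning this node's own locks:
	 *   we hold EX or PR  -> keep (our cached lvb is known to be valid)
	 *   we hold only NL   -> invalidate (nothing guarantees our copy)
	 */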
1662 | |||
1663 | static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, | ||
1664 | struct dlm_lock_resource *res, u8 dead_node) | ||
1665 | { | ||
1666 | struct list_head *iter, *queue; | ||
1667 | struct dlm_lock *lock; | ||
1668 | int blank_lvb = 0, local = 0; | ||
1669 | int i; | ||
1670 | u8 search_node; | ||
1671 | |||
1672 | assert_spin_locked(&dlm->spinlock); | ||
1673 | assert_spin_locked(&res->spinlock); | ||
1674 | |||
1675 | if (res->owner == dlm->node_num) | ||
1676 | /* if this node owned the lockres, and if the dead node | ||
1677 | * had an EX when he died, blank out the lvb */ | ||
1678 | search_node = dead_node; | ||
1679 | else { | ||
1680 | /* if this is a secondary lockres, and we had no EX or PR | ||
1681 | * locks granted, we can no longer trust the lvb */ | ||
1682 | search_node = dlm->node_num; | ||
1683 | local = 1; /* check local state for valid lvb */ | ||
1684 | } | ||
1685 | |||
1686 | for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { | ||
1687 | queue = dlm_list_idx_to_ptr(res, i); | ||
1688 | list_for_each(iter, queue) { | ||
1689 | lock = list_entry (iter, struct dlm_lock, list); | ||
1690 | if (lock->ml.node == search_node) { | ||
1691 | if (dlm_lvb_needs_invalidation(lock, local)) { | ||
1692 | /* zero the lksb lvb and lockres lvb */ | ||
1693 | blank_lvb = 1; | ||
1694 | memset(lock->lksb->lvb, 0, DLM_LVB_LEN); | ||
1695 | } | ||
1696 | } | ||
1697 | } | ||
1698 | } | ||
1699 | |||
1700 | if (blank_lvb) { | ||
1701 | mlog(0, "clearing %.*s lvb, dead node %u had EX\n", | ||
1702 | res->lockname.len, res->lockname.name, dead_node); | ||
1703 | memset(res->lvb, 0, DLM_LVB_LEN); | ||
1704 | } | ||
1705 | } | ||
1706 | |||
1707 | static void dlm_free_dead_locks(struct dlm_ctxt *dlm, | ||
1708 | struct dlm_lock_resource *res, u8 dead_node) | ||
1709 | { | ||
1710 | struct list_head *iter, *tmpiter; | ||
1711 | struct dlm_lock *lock; | ||
1712 | |||
1713 | /* this node is the lockres master: | ||
1714 | * 1) remove any stale locks for the dead node | ||
1715 | * 2) if the dead node had an EX when he died, blank out the lvb | ||
1716 | */ | ||
1717 | assert_spin_locked(&dlm->spinlock); | ||
1718 | assert_spin_locked(&res->spinlock); | ||
1719 | |||
1720 | /* TODO: check pending_asts, pending_basts here */ | ||
1721 | list_for_each_safe(iter, tmpiter, &res->granted) { | ||
1722 | lock = list_entry (iter, struct dlm_lock, list); | ||
1723 | if (lock->ml.node == dead_node) { | ||
1724 | list_del_init(&lock->list); | ||
1725 | dlm_lock_put(lock); | ||
1726 | } | ||
1727 | } | ||
1728 | list_for_each_safe(iter, tmpiter, &res->converting) { | ||
1729 | lock = list_entry (iter, struct dlm_lock, list); | ||
1730 | if (lock->ml.node == dead_node) { | ||
1731 | list_del_init(&lock->list); | ||
1732 | dlm_lock_put(lock); | ||
1733 | } | ||
1734 | } | ||
1735 | list_for_each_safe(iter, tmpiter, &res->blocked) { | ||
1736 | lock = list_entry (iter, struct dlm_lock, list); | ||
1737 | if (lock->ml.node == dead_node) { | ||
1738 | list_del_init(&lock->list); | ||
1739 | dlm_lock_put(lock); | ||
1740 | } | ||
1741 | } | ||
1742 | |||
1743 | /* do not kick thread yet */ | ||
1744 | __dlm_dirty_lockres(dlm, res); | ||
1745 | } | ||
1746 | |||
1747 | /* if this node is the recovery master, and there are no | ||
1748 | * locks for a given lockres owned by this node that are in | ||
1749 | * either PR or EX mode, zero out the lvb before requesting. | ||
1750 | * | ||
1751 | */ | ||
1752 | |||
1753 | |||
1754 | static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) | ||
1755 | { | ||
1756 | struct list_head *iter; | ||
1757 | struct dlm_lock_resource *res; | ||
1758 | int i; | ||
1759 | struct list_head *bucket; | ||
1760 | |||
1761 | |||
1762 | /* purge any stale mles */ | ||
1763 | dlm_clean_master_list(dlm, dead_node); | ||
1764 | |||
1765 | /* | ||
1766 | * now clean up all lock resources. there are two rules: | ||
1767 | * | ||
1768 | * 1) if the dead node was the master, move the lockres | ||
1769 | * to the recovering list. set the RECOVERING flag. | ||
1770 | * this lockres needs to be cleaned up before it can | ||
1771 | * be used further. | ||
1772 | * | ||
1773 | * 2) if this node was the master, remove all locks from | ||
1774 | * each of the lockres queues that were owned by the | ||
1775 | * dead node. once recovery finishes, the dlm thread | ||
1776 | * can be kicked again to see if any ASTs or BASTs | ||
1777 | * need to be fired as a result. | ||
1778 | */ | ||
1779 | for (i=0; i<DLM_HASH_SIZE; i++) { | ||
1780 | bucket = &(dlm->resources[i]); | ||
1781 | list_for_each(iter, bucket) { | ||
1782 | res = list_entry (iter, struct dlm_lock_resource, list); | ||
1783 | if (dlm_is_recovery_lock(res->lockname.name, | ||
1784 | res->lockname.len)) | ||
1785 | continue; | ||
1786 | |||
1787 | spin_lock(&res->spinlock); | ||
1788 | /* zero the lvb if necessary */ | ||
1789 | dlm_revalidate_lvb(dlm, res, dead_node); | ||
1790 | if (res->owner == dead_node) | ||
1791 | dlm_move_lockres_to_recovery_list(dlm, res); | ||
1792 | else if (res->owner == dlm->node_num) { | ||
1793 | dlm_free_dead_locks(dlm, res, dead_node); | ||
1794 | __dlm_lockres_calc_usage(dlm, res); | ||
1795 | } | ||
1796 | spin_unlock(&res->spinlock); | ||
1797 | } | ||
1798 | } | ||
1799 | |||
1800 | } | ||
1801 | |||
1802 | static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) | ||
1803 | { | ||
1804 | assert_spin_locked(&dlm->spinlock); | ||
1805 | |||
1806 | /* check to see if the node is already considered dead */ | ||
1807 | if (!test_bit(idx, dlm->live_nodes_map)) { | ||
1808 | mlog(0, "for domain %s, node %d is already dead. " | ||
1809 | "another node likely did recovery already.\n", | ||
1810 | dlm->name, idx); | ||
1811 | return; | ||
1812 | } | ||
1813 | |||
1814 | /* check to see if we do not care about this node */ | ||
1815 | if (!test_bit(idx, dlm->domain_map)) { | ||
1816 | /* This also catches the case that we get a node down | ||
1817 | * but haven't joined the domain yet. */ | ||
1818 | mlog(0, "node %u already removed from domain!\n", idx); | ||
1819 | return; | ||
1820 | } | ||
1821 | |||
1822 | clear_bit(idx, dlm->live_nodes_map); | ||
1823 | |||
1824 | /* Clean up join state on node death. */ | ||
1825 | if (dlm->joining_node == idx) { | ||
1826 | mlog(0, "Clearing join state for node %u\n", idx); | ||
1827 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | ||
1828 | } | ||
1829 | |||
1830 | /* make sure local cleanup occurs before the heartbeat events */ | ||
1831 | if (!test_bit(idx, dlm->recovery_map)) | ||
1832 | dlm_do_local_recovery_cleanup(dlm, idx); | ||
1833 | |||
1834 | /* notify anything attached to the heartbeat events */ | ||
1835 | dlm_hb_event_notify_attached(dlm, idx, 0); | ||
1836 | |||
1837 | mlog(0, "node %u being removed from domain map!\n", idx); | ||
1838 | clear_bit(idx, dlm->domain_map); | ||
1839 | /* wake up migration waiters if a node goes down. | ||
1840 | * perhaps later we can genericize this for other waiters. */ | ||
1841 | wake_up(&dlm->migration_wq); | ||
1842 | |||
1843 | if (test_bit(idx, dlm->recovery_map)) | ||
1844 | mlog(0, "domain %s, node %u already added " | ||
1845 | "to recovery map!\n", dlm->name, idx); | ||
1846 | else | ||
1847 | set_bit(idx, dlm->recovery_map); | ||
1848 | } | ||
1849 | |||
1850 | void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) | ||
1851 | { | ||
1852 | struct dlm_ctxt *dlm = data; | ||
1853 | |||
1854 | if (!dlm_grab(dlm)) | ||
1855 | return; | ||
1856 | |||
1857 | spin_lock(&dlm->spinlock); | ||
1858 | __dlm_hb_node_down(dlm, idx); | ||
1859 | spin_unlock(&dlm->spinlock); | ||
1860 | |||
1861 | dlm_put(dlm); | ||
1862 | } | ||
1863 | |||
1864 | void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data) | ||
1865 | { | ||
1866 | struct dlm_ctxt *dlm = data; | ||
1867 | |||
1868 | if (!dlm_grab(dlm)) | ||
1869 | return; | ||
1870 | |||
1871 | spin_lock(&dlm->spinlock); | ||
1872 | |||
1873 | set_bit(idx, dlm->live_nodes_map); | ||
1874 | |||
1875 | /* notify any mles attached to the heartbeat events */ | ||
1876 | dlm_hb_event_notify_attached(dlm, idx, 1); | ||
1877 | |||
1878 | spin_unlock(&dlm->spinlock); | ||
1879 | |||
1880 | dlm_put(dlm); | ||
1881 | } | ||
1882 | |||
1883 | static void dlm_reco_ast(void *astdata) | ||
1884 | { | ||
1885 | struct dlm_ctxt *dlm = astdata; | ||
1886 | mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n", | ||
1887 | dlm->node_num, dlm->name); | ||
1888 | } | ||
1889 | static void dlm_reco_bast(void *astdata, int blocked_type) | ||
1890 | { | ||
1891 | struct dlm_ctxt *dlm = astdata; | ||
1892 | mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n", | ||
1893 | dlm->node_num, dlm->name); | ||
1894 | } | ||
1895 | static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st) | ||
1896 | { | ||
1897 | mlog(0, "unlockast for recovery lock fired!\n"); | ||
1898 | } | ||
1899 | |||
1900 | |||
1901 | static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) | ||
1902 | { | ||
1903 | enum dlm_status ret; | ||
1904 | struct dlm_lockstatus lksb; | ||
1905 | int status = -EINVAL; | ||
1906 | |||
1907 | mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", | ||
1908 | dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); | ||
1909 | retry: | ||
1910 | memset(&lksb, 0, sizeof(lksb)); | ||
1911 | |||
1912 | ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, | ||
1913 | DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast); | ||
1914 | |||
1915 | if (ret == DLM_NORMAL) { | ||
1916 | mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", | ||
1917 | dlm->name, dlm->node_num); | ||
1918 | /* I am master, send message to all nodes saying | ||
1919 | * that I am beginning a recovery session */ | ||
1920 | status = dlm_send_begin_reco_message(dlm, | ||
1921 | dlm->reco.dead_node); | ||
1922 | |||
1923 | /* recovery lock is a special case. ast will not get fired, | ||
1924 | * so just go ahead and unlock it. */ | ||
1925 | ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); | ||
1926 | if (ret != DLM_NORMAL) { | ||
1927 | /* this would really suck. this could only happen | ||
1928 | * if there was a network error during the unlock | ||
1929 | * because of node death. this means the unlock | ||
1930 | * is actually "done" and the lock structure is | ||
1931 | * even freed. we can continue, but only | ||
1932 | * because this specific lock name is special. */ | ||
1933 | mlog(0, "dlmunlock returned %d\n", ret); | ||
1934 | } | ||
1935 | |||
1936 | if (status < 0) { | ||
1937 | mlog(0, "failed to send recovery message. " | ||
1938 | "must retry with new node map.\n"); | ||
1939 | goto retry; | ||
1940 | } | ||
1941 | } else if (ret == DLM_NOTQUEUED) { | ||
1942 | mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", | ||
1943 | dlm->name, dlm->node_num); | ||
1944 | /* another node is master. wait on | ||
1945 | * reco.new_master != O2NM_INVALID_NODE_NUM */ | ||
1946 | status = -EEXIST; | ||
1947 | } | ||
1948 | |||
1949 | return status; | ||
1950 | } | ||
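
A hypothetical caller sketch for the $RECOVERY race above. The two outcomes come straight from the function (DLM_NORMAL wins mastery, DLM_NOTQUEUED maps to -EEXIST); the waitqueue name and the remastering step are assumptions for illustration:

	ret = dlm_pick_recovery_master(dlm);
	if (ret == 0) {
		/* won the race: begin-reco message was already sent */
		dlm_remaster_locks(dlm);	/* assumed next step */
	} else if (ret == -EEXIST) {
		/* lost the race: wait for the winner to announce itself */
		wait_event(dlm->dlm_reco_thread_wq,	/* assumed waitqueue */
			   dlm->reco.new_master != O2NM_INVALID_NODE_NUM);
	}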
1951 | |||
1952 | static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) | ||
1953 | { | ||
1954 | struct dlm_begin_reco br; | ||
1955 | int ret = 0; | ||
1956 | struct dlm_node_iter iter; | ||
1957 | int nodenum; | ||
1958 | int status; | ||
1959 | |||
1960 | mlog_entry("%u\n", dead_node); | ||
1961 | |||
1962 | mlog(0, "dead node is %u\n", dead_node); | ||
1963 | |||
1964 | spin_lock(&dlm->spinlock); | ||
1965 | dlm_node_iter_init(dlm->domain_map, &iter); | ||
1966 | spin_unlock(&dlm->spinlock); | ||
1967 | |||
1968 | clear_bit(dead_node, iter.node_map); | ||
1969 | |||
1970 | memset(&br, 0, sizeof(br)); | ||
1971 | br.node_idx = dlm->node_num; | ||
1972 | br.dead_node = dead_node; | ||
1973 | |||
1974 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | ||
1975 | ret = 0; | ||
1976 | if (nodenum == dead_node) { | ||
1977 | mlog(0, "not sending begin reco to dead node " | ||
1978 | "%u\n", dead_node); | ||
1979 | continue; | ||
1980 | } | ||
1981 | if (nodenum == dlm->node_num) { | ||
1982 | mlog(0, "not sending begin reco to self\n"); | ||
1983 | continue; | ||
1984 | } | ||
1985 | |||
1986 | ret = -EINVAL; | ||
1987 | mlog(0, "attempting to send begin reco msg to %d\n", | ||
1988 | nodenum); | ||
1989 | ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key, | ||
1990 | &br, sizeof(br), nodenum, &status); | ||
1991 | /* negative status is handled ok by caller here */ | ||
1992 | if (ret >= 0) | ||
1993 | ret = status; | ||
1994 | if (ret < 0) { | ||
1995 | struct dlm_lock_resource *res; | ||
1996 | mlog_errno(ret); | ||
1997 | mlog(ML_ERROR, "begin reco of dlm %s to node %u " | ||
1998 | " returned %d\n", dlm->name, nodenum, ret); | ||
1999 | res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, | ||
2000 | DLM_RECOVERY_LOCK_NAME_LEN); | ||
2001 | if (res) { | ||
2002 | dlm_print_one_lock_resource(res); | ||
2003 | dlm_lockres_put(res); | ||
2004 | } else { | ||
2005 | mlog(ML_ERROR, "recovery lock not found\n"); | ||
2006 | } | ||
2007 | break; | ||
2008 | } | ||
2009 | } | ||
2010 | |||
2011 | return ret; | ||
2012 | } | ||
2013 | |||
2014 | int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) | ||
2015 | { | ||
2016 | struct dlm_ctxt *dlm = data; | ||
2017 | struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf; | ||
2018 | |||
2019 | /* ok to return 0, domain has gone away */ | ||
2020 | if (!dlm_grab(dlm)) | ||
2021 | return 0; | ||
2022 | |||
2023 | mlog(0, "node %u wants to recover node %u\n", | ||
2024 | br->node_idx, br->dead_node); | ||
2025 | |||
2026 | dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); | ||
2027 | |||
2028 | spin_lock(&dlm->spinlock); | ||
2029 | if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { | ||
2030 | mlog(0, "new_master already set to %u!\n", | ||
2031 | dlm->reco.new_master); | ||
2032 | } | ||
2033 | if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { | ||
2034 | mlog(0, "dead_node already set to %u!\n", | ||
2035 | dlm->reco.dead_node); | ||
2036 | } | ||
2037 | dlm->reco.new_master = br->node_idx; | ||
2038 | dlm->reco.dead_node = br->dead_node; | ||
2039 | if (!test_bit(br->dead_node, dlm->recovery_map)) { | ||
2040 | mlog(ML_ERROR, "recovery master %u sees %u as dead, but this " | ||
2041 | "node has not yet. marking %u as dead\n", | ||
2042 | br->node_idx, br->dead_node, br->dead_node); | ||
2043 | __dlm_hb_node_down(dlm, br->dead_node); | ||
2044 | } | ||
2045 | spin_unlock(&dlm->spinlock); | ||
2046 | |||
2047 | dlm_kick_recovery_thread(dlm); | ||
2048 | dlm_put(dlm); | ||
2049 | return 0; | ||
2050 | } | ||
2051 | |||
2052 | static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) | ||
2053 | { | ||
2054 | int ret = 0; | ||
2055 | struct dlm_finalize_reco fr; | ||
2056 | struct dlm_node_iter iter; | ||
2057 | int nodenum; | ||
2058 | int status; | ||
2059 | |||
2060 | mlog(0, "finishing recovery for node %s:%u\n", | ||
2061 | dlm->name, dlm->reco.dead_node); | ||
2062 | |||
2063 | spin_lock(&dlm->spinlock); | ||
2064 | dlm_node_iter_init(dlm->domain_map, &iter); | ||
2065 | spin_unlock(&dlm->spinlock); | ||
2066 | |||
2067 | memset(&fr, 0, sizeof(fr)); | ||
2068 | fr.node_idx = dlm->node_num; | ||
2069 | fr.dead_node = dlm->reco.dead_node; | ||
2070 | |||
2071 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | ||
2072 | if (nodenum == dlm->node_num) | ||
2073 | continue; | ||
2074 | ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, | ||
2075 | &fr, sizeof(fr), nodenum, &status); | ||
2076 | if (ret >= 0) { | ||
2077 | ret = status; | ||
2078 | if (dlm_is_host_down(ret)) { | ||
2079 | /* this has no effect on this recovery | ||
2080 | * session, so set the status to zero to | ||
2081 | * finish out the last recovery */ | ||
2082 | mlog(ML_ERROR, "node %u went down after this " | ||
2083 | "node finished recovery.\n", nodenum); | ||
2084 | ret = 0; | ||
2085 | } | ||
2086 | } | ||
2087 | if (ret < 0) { | ||
2088 | mlog_errno(ret); | ||
2089 | break; | ||
2090 | } | ||
2091 | } | ||
2092 | |||
2093 | return ret; | ||
2094 | } | ||
2095 | |||
2096 | int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) | ||
2097 | { | ||
2098 | struct dlm_ctxt *dlm = data; | ||
2099 | struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; | ||
2100 | |||
2101 | /* ok to return 0, domain has gone away */ | ||
2102 | if (!dlm_grab(dlm)) | ||
2103 | return 0; | ||
2104 | |||
2105 | mlog(0, "node %u finalizing recovery of node %u\n", | ||
2106 | fr->node_idx, fr->dead_node); | ||
2107 | |||
2108 | spin_lock(&dlm->spinlock); | ||
2109 | |||
2110 | if (dlm->reco.new_master != fr->node_idx) { | ||
2111 | mlog(ML_ERROR, "node %u sent recovery finalize msg, but node " | ||
2112 | "%u is supposed to be the new master, dead=%u\n", | ||
2113 | fr->node_idx, dlm->reco.new_master, fr->dead_node); | ||
2114 | BUG(); | ||
2115 | } | ||
2116 | if (dlm->reco.dead_node != fr->dead_node) { | ||
2117 | mlog(ML_ERROR, "node %u sent recovery finalize msg for dead " | ||
2118 | "node %u, but node %u is supposed to be dead\n", | ||
2119 | fr->node_idx, fr->dead_node, dlm->reco.dead_node); | ||
2120 | BUG(); | ||
2121 | } | ||
2122 | |||
2123 | dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); | ||
2124 | |||
2125 | spin_unlock(&dlm->spinlock); | ||
2126 | |||
2127 | dlm_reset_recovery(dlm); | ||
2128 | |||
2129 | dlm_kick_recovery_thread(dlm); | ||
2130 | dlm_put(dlm); | ||
2131 | return 0; | ||
2132 | } | ||
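
Putting the handlers in this file together, the recovery message flow sketches out as follows (a summary of the code above, not a protocol spec):

	/* recovery master                    other nodes
	 * ---------------                    -----------
	 * dlmlock($RECOVERY, EX|NOQUEUE)     (losers wait on reco.new_master)
	 * DLM_BEGIN_RECO_MSG     ------->    dlm_begin_reco_handler():
	 *                                      record new_master/dead_node,
	 *                                      mark the dead node down if needed
	 * ...remaster the dead node's locks...
	 * DLM_FINALIZE_RECO_MSG  ------->    dlm_finalize_reco_handler():
	 *                                      dlm_finish_local_lockres_recovery(),
	 *                                      reset recovery state, kick thread
	 */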
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c new file mode 100644 index 000000000000..5be9d14f12cb --- /dev/null +++ b/fs/ocfs2/dlm/dlmthread.c | |||
@@ -0,0 +1,692 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmthread.c | ||
5 | * | ||
6 | * standalone DLM module | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | |||
28 | #include <linux/module.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/types.h> | ||
31 | #include <linux/slab.h> | ||
32 | #include <linux/highmem.h> | ||
33 | #include <linux/utsname.h> | ||
34 | #include <linux/init.h> | ||
35 | #include <linux/sysctl.h> | ||
36 | #include <linux/random.h> | ||
37 | #include <linux/blkdev.h> | ||
38 | #include <linux/socket.h> | ||
39 | #include <linux/inet.h> | ||
40 | #include <linux/timer.h> | ||
41 | #include <linux/kthread.h> | ||
42 | |||
43 | |||
44 | #include "cluster/heartbeat.h" | ||
45 | #include "cluster/nodemanager.h" | ||
46 | #include "cluster/tcp.h" | ||
47 | |||
48 | #include "dlmapi.h" | ||
49 | #include "dlmcommon.h" | ||
50 | #include "dlmdomain.h" | ||
51 | |||
52 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD) | ||
53 | #include "cluster/masklog.h" | ||
54 | |||
55 | static int dlm_thread(void *data); | ||
56 | |||
57 | static void dlm_flush_asts(struct dlm_ctxt *dlm); | ||
58 | |||
59 | #define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num) | ||
60 | |||
61 | /* will exit holding res->spinlock, but may drop in function */ | ||
62 | /* waits until flags are cleared on res->state */ | ||
63 | void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags) | ||
64 | { | ||
65 | DECLARE_WAITQUEUE(wait, current); | ||
66 | |||
67 | assert_spin_locked(&res->spinlock); | ||
68 | |||
69 | add_wait_queue(&res->wq, &wait); | ||
70 | repeat: | ||
71 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
72 | if (res->state & flags) { | ||
73 | spin_unlock(&res->spinlock); | ||
74 | schedule(); | ||
75 | spin_lock(&res->spinlock); | ||
76 | goto repeat; | ||
77 | } | ||
78 | remove_wait_queue(&res->wq, &wait); | ||
79 | current->state = TASK_RUNNING; | ||
80 | } | ||
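
A caller-side sketch (hypothetical) of the pattern this helper supports, e.g. the "spin until MIGRATING drops, then recheck the master" case described in dlmrecovery.c:

	spin_lock(&res->spinlock);
	/* sleeps with res->spinlock dropped; returns with it re-held */
	__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_MIGRATING);
	owner = res->owner;	/* safe to recheck now */
	spin_unlock(&res->spinlock);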
81 | |||
82 | |||
83 | static int __dlm_lockres_unused(struct dlm_lock_resource *res) | ||
84 | { | ||
85 | if (list_empty(&res->granted) && | ||
86 | list_empty(&res->converting) && | ||
87 | list_empty(&res->blocked) && | ||
88 | list_empty(&res->dirty)) | ||
89 | return 1; | ||
90 | return 0; | ||
91 | } | ||
92 | |||
93 | |||
94 | /* Call whenever you may have added or deleted something from one of | ||
95 | * the lockres queues. This figures out whether the resource belongs | ||
96 | * on the unused list or not and does the appropriate thing. */ | ||
97 | void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, | ||
98 | struct dlm_lock_resource *res) | ||
99 | { | ||
100 | mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); | ||
101 | |||
102 | assert_spin_locked(&dlm->spinlock); | ||
103 | assert_spin_locked(&res->spinlock); | ||
104 | |||
105 | if (__dlm_lockres_unused(res)) { | ||
106 | if (list_empty(&res->purge)) { | ||
107 | mlog(0, "putting lockres %.*s on purge list\n", | ||
108 | res->lockname.len, res->lockname.name); | ||
109 | |||
110 | res->last_used = jiffies; | ||
111 | list_add_tail(&res->purge, &dlm->purge_list); | ||
112 | dlm->purge_count++; | ||
113 | } | ||
114 | } else if (!list_empty(&res->purge)) { | ||
115 | mlog(0, "removing lockres %.*s from purge list\n", | ||
116 | res->lockname.len, res->lockname.name); | ||
117 | |||
118 | list_del_init(&res->purge); | ||
119 | dlm->purge_count--; | ||
120 | } | ||
121 | } | ||
122 | |||
123 | void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, | ||
124 | struct dlm_lock_resource *res) | ||
125 | { | ||
126 | mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); | ||
127 | spin_lock(&dlm->spinlock); | ||
128 | spin_lock(&res->spinlock); | ||
129 | |||
130 | __dlm_lockres_calc_usage(dlm, res); | ||
131 | |||
132 | spin_unlock(&res->spinlock); | ||
133 | spin_unlock(&dlm->spinlock); | ||
134 | } | ||
135 | |||
136 | /* TODO: Eventual API: Called with the dlm spinlock held, may drop it | ||
137 | * to do migration, but will re-acquire before exit. */ | ||
138 | void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres) | ||
139 | { | ||
140 | int master; | ||
141 | int ret; | ||
142 | |||
143 | spin_lock(&lockres->spinlock); | ||
144 | master = lockres->owner == dlm->node_num; | ||
145 | spin_unlock(&lockres->spinlock); | ||
146 | |||
147 | mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len, | ||
148 | lockres->lockname.name, master); | ||
149 | |||
150 | /* Non-master is the easy case -- no migration required, just | ||
151 | * quit. */ | ||
152 | if (!master) | ||
153 | goto finish; | ||
154 | |||
155 | /* Wheee! Migrate lockres here! */ | ||
156 | spin_unlock(&dlm->spinlock); | ||
157 | again: | ||
158 | |||
159 | ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES); | ||
160 | if (ret == -ENOTEMPTY) { | ||
161 | mlog(ML_ERROR, "lockres %.*s still has local locks!\n", | ||
162 | lockres->lockname.len, lockres->lockname.name); | ||
163 | |||
164 | BUG(); | ||
165 | } else if (ret < 0) { | ||
166 | mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n", | ||
167 | lockres->lockname.len, lockres->lockname.name); | ||
168 | goto again; | ||
169 | } | ||
170 | |||
171 | spin_lock(&dlm->spinlock); | ||
172 | |||
173 | finish: | ||
174 | if (!list_empty(&lockres->purge)) { | ||
175 | list_del_init(&lockres->purge); | ||
176 | dlm->purge_count--; | ||
177 | } | ||
178 | __dlm_unhash_lockres(lockres); | ||
179 | } | ||
180 | |||
181 | static void dlm_run_purge_list(struct dlm_ctxt *dlm, | ||
182 | int purge_now) | ||
183 | { | ||
184 | unsigned int run_max, unused; | ||
185 | unsigned long purge_jiffies; | ||
186 | struct dlm_lock_resource *lockres; | ||
187 | |||
188 | spin_lock(&dlm->spinlock); | ||
189 | run_max = dlm->purge_count; | ||
190 | |||
191 | while(run_max && !list_empty(&dlm->purge_list)) { | ||
192 | run_max--; | ||
193 | |||
194 | lockres = list_entry(dlm->purge_list.next, | ||
195 | struct dlm_lock_resource, purge); | ||
196 | |||
197 | /* Status of the lockres *might* change so double | ||
198 | * check. If the lockres is unused, holding the dlm | ||
199 | * spinlock will prevent people from getting any more | ||
200 | * refs on it -- there's no need to keep the lockres | ||
201 | * spinlock. */ | ||
202 | spin_lock(&lockres->spinlock); | ||
203 | unused = __dlm_lockres_unused(lockres); | ||
204 | spin_unlock(&lockres->spinlock); | ||
205 | |||
206 | if (!unused) | ||
207 | continue; | ||
208 | |||
209 | purge_jiffies = lockres->last_used + | ||
210 | msecs_to_jiffies(DLM_PURGE_INTERVAL_MS); | ||
211 | |||
212 | /* Make sure that we want to be processing this guy at | ||
213 | * this time. */ | ||
214 | if (!purge_now && time_after(purge_jiffies, jiffies)) { | ||
215 | /* Since resources are added to the purge list | ||
216 | * in tail order, we can stop at the first | ||
217 | * unpurgable resource -- anyone added after | ||
218 | * him will have a greater last_used value */ | ||
219 | break; | ||
220 | } | ||
221 | |||
222 | list_del_init(&lockres->purge); | ||
223 | dlm->purge_count--; | ||
224 | |||
225 | /* This may drop and reacquire the dlm spinlock if it | ||
226 | * has to do migration. */ | ||
227 | mlog(0, "calling dlm_purge_lockres!\n"); | ||
228 | dlm_purge_lockres(dlm, lockres); | ||
229 | mlog(0, "DONE calling dlm_purge_lockres!\n"); | ||
230 | |||
231 | /* Avoid adding any scheduling latencies */ | ||
232 | cond_resched_lock(&dlm->spinlock); | ||
233 | } | ||
234 | |||
235 | spin_unlock(&dlm->spinlock); | ||
236 | } | ||
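
A timing sketch of the early exit above. Because resources are appended in last_used order, the first entry that is still too young proves every later entry is too young as well:

	/* purge_list: [A: t0] -> [B: t1] -> [C: t2],  with t0 <= t1 <= t2
	 *
	 * jiffies >= t0 + DLM_PURGE_INTERVAL_MS -> purge A
	 * jiffies <  t1 + DLM_PURGE_INTERVAL_MS -> stop: C was used after B,
	 *                                          so it cannot be expired either
	 */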
237 | |||
238 | static void dlm_shuffle_lists(struct dlm_ctxt *dlm, | ||
239 | struct dlm_lock_resource *res) | ||
240 | { | ||
241 | struct dlm_lock *lock, *target; | ||
242 | struct list_head *iter; | ||
243 | struct list_head *head; | ||
244 | int can_grant = 1; | ||
245 | |||
246 | //mlog(0, "res->lockname.len=%d\n", res->lockname.len); | ||
247 | //mlog(0, "res->lockname.name=%p\n", res->lockname.name); | ||
248 | //mlog(0, "shuffle res %.*s\n", res->lockname.len, | ||
249 | // res->lockname.name); | ||
250 | |||
251 | /* because this function is called with the lockres | ||
252 | * spinlock, and because we know that it is not migrating/ | ||
253 | * recovering/in-progress, it is fine to reserve asts and | ||
254 | * basts right before queueing them all throughout */ | ||
255 | assert_spin_locked(&res->spinlock); | ||
256 | BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| | ||
257 | DLM_LOCK_RES_RECOVERING| | ||
258 | DLM_LOCK_RES_IN_PROGRESS))); | ||
259 | |||
260 | converting: | ||
261 | if (list_empty(&res->converting)) | ||
262 | goto blocked; | ||
263 | mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len, | ||
264 | res->lockname.name); | ||
265 | |||
266 | target = list_entry(res->converting.next, struct dlm_lock, list); | ||
267 | if (target->ml.convert_type == LKM_IVMODE) { | ||
268 | mlog(ML_ERROR, "%.*s: converting a lock with no " | ||
269 | "convert_type!\n", res->lockname.len, res->lockname.name); | ||
270 | BUG(); | ||
271 | } | ||
272 | head = &res->granted; | ||
273 | list_for_each(iter, head) { | ||
274 | lock = list_entry(iter, struct dlm_lock, list); | ||
275 | if (lock==target) | ||
276 | continue; | ||
277 | if (!dlm_lock_compatible(lock->ml.type, | ||
278 | target->ml.convert_type)) { | ||
279 | can_grant = 0; | ||
280 | /* queue the BAST if not already */ | ||
281 | if (lock->ml.highest_blocked == LKM_IVMODE) { | ||
282 | __dlm_lockres_reserve_ast(res); | ||
283 | dlm_queue_bast(dlm, lock); | ||
284 | } | ||
285 | /* update the highest_blocked if needed */ | ||
286 | if (lock->ml.highest_blocked < target->ml.convert_type) | ||
287 | lock->ml.highest_blocked = | ||
288 | target->ml.convert_type; | ||
289 | } | ||
290 | } | ||
291 | head = &res->converting; | ||
292 | list_for_each(iter, head) { | ||
293 | lock = list_entry(iter, struct dlm_lock, list); | ||
294 | if (lock==target) | ||
295 | continue; | ||
296 | if (!dlm_lock_compatible(lock->ml.type, | ||
297 | target->ml.convert_type)) { | ||
298 | can_grant = 0; | ||
299 | if (lock->ml.highest_blocked == LKM_IVMODE) { | ||
300 | __dlm_lockres_reserve_ast(res); | ||
301 | dlm_queue_bast(dlm, lock); | ||
302 | } | ||
303 | if (lock->ml.highest_blocked < target->ml.convert_type) | ||
304 | lock->ml.highest_blocked = | ||
305 | target->ml.convert_type; | ||
306 | } | ||
307 | } | ||
308 | |||
309 | /* we can convert the lock */ | ||
310 | if (can_grant) { | ||
311 | spin_lock(&target->spinlock); | ||
312 | BUG_ON(target->ml.highest_blocked != LKM_IVMODE); | ||
313 | |||
314 | mlog(0, "calling ast for converting lock: %.*s, have: %d, " | ||
315 | "granting: %d, node: %u\n", res->lockname.len, | ||
316 | res->lockname.name, target->ml.type, | ||
317 | target->ml.convert_type, target->ml.node); | ||
318 | |||
319 | target->ml.type = target->ml.convert_type; | ||
320 | target->ml.convert_type = LKM_IVMODE; | ||
321 | list_del_init(&target->list); | ||
322 | list_add_tail(&target->list, &res->granted); | ||
323 | |||
324 | BUG_ON(!target->lksb); | ||
325 | target->lksb->status = DLM_NORMAL; | ||
326 | |||
327 | spin_unlock(&target->spinlock); | ||
328 | |||
329 | __dlm_lockres_reserve_ast(res); | ||
330 | dlm_queue_ast(dlm, target); | ||
331 | /* go back and check for more */ | ||
332 | goto converting; | ||
333 | } | ||
334 | |||
335 | blocked: | ||
336 | if (list_empty(&res->blocked)) | ||
337 | goto leave; | ||
338 | target = list_entry(res->blocked.next, struct dlm_lock, list); | ||
339 | |||
340 | head = &res->granted; | ||
341 | list_for_each(iter, head) { | ||
342 | lock = list_entry(iter, struct dlm_lock, list); | ||
343 | if (lock==target) | ||
344 | continue; | ||
345 | if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { | ||
346 | can_grant = 0; | ||
347 | if (lock->ml.highest_blocked == LKM_IVMODE) { | ||
348 | __dlm_lockres_reserve_ast(res); | ||
349 | dlm_queue_bast(dlm, lock); | ||
350 | } | ||
351 | if (lock->ml.highest_blocked < target->ml.type) | ||
352 | lock->ml.highest_blocked = target->ml.type; | ||
353 | } | ||
354 | } | ||
355 | |||
356 | head = &res->converting; | ||
357 | list_for_each(iter, head) { | ||
358 | lock = list_entry(iter, struct dlm_lock, list); | ||
359 | if (lock==target) | ||
360 | continue; | ||
361 | if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { | ||
362 | can_grant = 0; | ||
363 | if (lock->ml.highest_blocked == LKM_IVMODE) { | ||
364 | __dlm_lockres_reserve_ast(res); | ||
365 | dlm_queue_bast(dlm, lock); | ||
366 | } | ||
367 | if (lock->ml.highest_blocked < target->ml.type) | ||
368 | lock->ml.highest_blocked = target->ml.type; | ||
369 | } | ||
370 | } | ||
371 | |||
372 | /* we can grant the blocked lock (only | ||
373 | * possible if converting list empty) */ | ||
374 | if (can_grant) { | ||
375 | spin_lock(&target->spinlock); | ||
376 | BUG_ON(target->ml.highest_blocked != LKM_IVMODE); | ||
377 | |||
378 | mlog(0, "calling ast for blocked lock: %.*s, granting: %d, " | ||
379 | "node: %u\n", res->lockname.len, res->lockname.name, | ||
380 | target->ml.type, target->ml.node); | ||
381 | |||
382 | // target->ml.type is already correct | ||
383 | list_del_init(&target->list); | ||
384 | list_add_tail(&target->list, &res->granted); | ||
385 | |||
386 | BUG_ON(!target->lksb); | ||
387 | target->lksb->status = DLM_NORMAL; | ||
388 | |||
389 | spin_unlock(&target->spinlock); | ||
390 | |||
391 | __dlm_lockres_reserve_ast(res); | ||
392 | dlm_queue_ast(dlm, target); | ||
393 | /* go back and check for more */ | ||
394 | goto converting; | ||
395 | } | ||
396 | |||
397 | leave: | ||
398 | return; | ||
399 | } | ||
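
The grant decisions above all hinge on dlm_lock_compatible(). For the three modes this DLM uses (NL, PR, EX), the classic compatibility matrix looks like this; the sketch below is illustrative and the real helper in dlmcommon.h may be written differently:

	/*	held \ requested    NL    PR    EX
	 *	NL                  yes   yes   yes
	 *	PR                  yes   yes   no
	 *	EX                  yes   no    no
	 */
	static inline int dlm_lock_compatible_sketch(int existing, int request)
	{
		/* NL is compatible with everything */
		if (existing == LKM_NLMODE || request == LKM_NLMODE)
			return 1;
		/* PR is compatible with PR (NL handled above) */
		if (existing == LKM_PRMODE && request == LKM_PRMODE)
			return 1;
		/* anything else involves EX and conflicts */
		return 0;
	}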
400 | |||
401 | /* must have NO locks when calling this with res != NULL */ | ||
402 | void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) | ||
403 | { | ||
404 | mlog_entry("dlm=%p, res=%p\n", dlm, res); | ||
405 | if (res) { | ||
406 | spin_lock(&dlm->spinlock); | ||
407 | spin_lock(&res->spinlock); | ||
408 | __dlm_dirty_lockres(dlm, res); | ||
409 | spin_unlock(&res->spinlock); | ||
410 | spin_unlock(&dlm->spinlock); | ||
411 | } | ||
412 | wake_up(&dlm->dlm_thread_wq); | ||
413 | } | ||
414 | |||
415 | void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) | ||
416 | { | ||
417 | mlog_entry("dlm=%p, res=%p\n", dlm, res); | ||
418 | |||
419 | assert_spin_locked(&dlm->spinlock); | ||
420 | assert_spin_locked(&res->spinlock); | ||
421 | |||
422 | /* don't shuffle secondary queues */ | ||
423 | if ((res->owner == dlm->node_num) && | ||
424 | !(res->state & DLM_LOCK_RES_DIRTY)) { | ||
425 | list_add_tail(&res->dirty, &dlm->dirty_list); | ||
426 | res->state |= DLM_LOCK_RES_DIRTY; | ||
427 | } | ||
428 | } | ||
429 | |||
430 | |||
431 | /* Launch the dlm thread for this domain */ | ||
432 | int dlm_launch_thread(struct dlm_ctxt *dlm) | ||
433 | { | ||
434 | mlog(0, "starting dlm thread...\n"); | ||
435 | |||
436 | dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); | ||
437 | if (IS_ERR(dlm->dlm_thread_task)) { | ||
438 | mlog_errno(PTR_ERR(dlm->dlm_thread_task)); | ||
439 | dlm->dlm_thread_task = NULL; | ||
440 | return -EINVAL; | ||
441 | } | ||
442 | |||
443 | return 0; | ||
444 | } | ||
445 | |||
446 | void dlm_complete_thread(struct dlm_ctxt *dlm) | ||
447 | { | ||
448 | if (dlm->dlm_thread_task) { | ||
449 | mlog(ML_KTHREAD, "waiting for dlm thread to exit\n"); | ||
450 | kthread_stop(dlm->dlm_thread_task); | ||
451 | dlm->dlm_thread_task = NULL; | ||
452 | } | ||
453 | } | ||
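
A lifecycle sketch for the pair above, as a hypothetical domain join/leave caller might use it (error handling elided):

	/* at domain join */
	status = dlm_launch_thread(dlm);
	if (status < 0)
		goto bail;	/* hypothetical error path */

	/* ...domain runs; dlm_kick_thread() wakes the loop as needed... */

	/* at domain teardown */
	dlm_complete_thread(dlm);	/* kthread_stop() and wait for exit */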
454 | |||
455 | static int dlm_dirty_list_empty(struct dlm_ctxt *dlm) | ||
456 | { | ||
457 | int empty; | ||
458 | |||
459 | spin_lock(&dlm->spinlock); | ||
460 | empty = list_empty(&dlm->dirty_list); | ||
461 | spin_unlock(&dlm->spinlock); | ||
462 | |||
463 | return empty; | ||
464 | } | ||
465 | |||
466 | static void dlm_flush_asts(struct dlm_ctxt *dlm) | ||
467 | { | ||
468 | int ret; | ||
469 | struct dlm_lock *lock; | ||
470 | struct dlm_lock_resource *res; | ||
471 | u8 hi; | ||
472 | |||
473 | spin_lock(&dlm->ast_lock); | ||
474 | while (!list_empty(&dlm->pending_asts)) { | ||
475 | lock = list_entry(dlm->pending_asts.next, | ||
476 | struct dlm_lock, ast_list); | ||
477 | /* get an extra ref on lock */ | ||
478 | dlm_lock_get(lock); | ||
479 | res = lock->lockres; | ||
480 | mlog(0, "delivering an ast for this lockres\n"); | ||
481 | |||
482 | BUG_ON(!lock->ast_pending); | ||
483 | |||
484 | /* remove from list (including ref) */ | ||
485 | list_del_init(&lock->ast_list); | ||
486 | dlm_lock_put(lock); | ||
487 | spin_unlock(&dlm->ast_lock); | ||
488 | |||
489 | if (lock->ml.node != dlm->node_num) { | ||
490 | ret = dlm_do_remote_ast(dlm, res, lock); | ||
491 | if (ret < 0) | ||
492 | mlog_errno(ret); | ||
493 | } else | ||
494 | dlm_do_local_ast(dlm, res, lock); | ||
495 | |||
496 | spin_lock(&dlm->ast_lock); | ||
497 | |||
498 | /* possible that another ast was queued while | ||
499 | * we were delivering the last one */ | ||
500 | if (!list_empty(&lock->ast_list)) { | ||
501 | mlog(0, "aha another ast got queued while " | ||
502 | "we were finishing the last one. will " | ||
503 | "keep the ast_pending flag set.\n"); | ||
504 | } else | ||
505 | lock->ast_pending = 0; | ||
506 | |||
507 | /* drop the extra ref. | ||
508 | * this may drop it completely. */ | ||
509 | dlm_lock_put(lock); | ||
510 | dlm_lockres_release_ast(dlm, res); | ||
511 | } | ||
512 | |||
513 | while (!list_empty(&dlm->pending_basts)) { | ||
514 | lock = list_entry(dlm->pending_basts.next, | ||
515 | struct dlm_lock, bast_list); | ||
516 | /* get an extra ref on lock */ | ||
517 | dlm_lock_get(lock); | ||
518 | res = lock->lockres; | ||
519 | |||
520 | BUG_ON(!lock->bast_pending); | ||
521 | |||
522 | /* get the highest blocked lock, and reset */ | ||
523 | spin_lock(&lock->spinlock); | ||
524 | BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE); | ||
525 | hi = lock->ml.highest_blocked; | ||
526 | lock->ml.highest_blocked = LKM_IVMODE; | ||
527 | spin_unlock(&lock->spinlock); | ||
528 | |||
529 | /* remove from list (including ref) */ | ||
530 | list_del_init(&lock->bast_list); | ||
531 | dlm_lock_put(lock); | ||
532 | spin_unlock(&dlm->ast_lock); | ||
533 | |||
534 | mlog(0, "delivering a bast for this lockres " | ||
535 | "(blocked = %d\n", hi); | ||
536 | |||
537 | if (lock->ml.node != dlm->node_num) { | ||
538 | ret = dlm_send_proxy_bast(dlm, res, lock, hi); | ||
539 | if (ret < 0) | ||
540 | mlog_errno(ret); | ||
541 | } else | ||
542 | dlm_do_local_bast(dlm, res, lock, hi); | ||
543 | |||
544 | spin_lock(&dlm->ast_lock); | ||
545 | |||
546 | /* possible that another bast was queued while | ||
547 | * we were delivering the last one */ | ||
548 | if (!list_empty(&lock->bast_list)) { | ||
549 | mlog(0, "aha another bast got queued while " | ||
550 | "we were finishing the last one. will " | ||
551 | "keep the bast_pending flag set.\n"); | ||
552 | } else | ||
553 | lock->bast_pending = 0; | ||
554 | |||
555 | /* drop the extra ref. | ||
556 | * this may drop it completely. */ | ||
557 | dlm_lock_put(lock); | ||
558 | dlm_lockres_release_ast(dlm, res); | ||
559 | } | ||
560 | wake_up(&dlm->ast_wq); | ||
561 | spin_unlock(&dlm->ast_lock); | ||
562 | } | ||
563 | |||
564 | |||
565 | #define DLM_THREAD_TIMEOUT_MS (4 * 1000) | ||
566 | #define DLM_THREAD_MAX_DIRTY 100 | ||
567 | #define DLM_THREAD_MAX_ASTS 10 | ||
568 | |||
569 | static int dlm_thread(void *data) | ||
570 | { | ||
571 | struct dlm_lock_resource *res; | ||
572 | struct dlm_ctxt *dlm = data; | ||
573 | unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS); | ||
574 | |||
575 | mlog(0, "dlm thread running for %s...\n", dlm->name); | ||
576 | |||
577 | while (!kthread_should_stop()) { | ||
578 | int n = DLM_THREAD_MAX_DIRTY; | ||
579 | |||
580 | /* dlm_shutting_down is a point-in-time check, but that | ||
581 | * doesn't matter: if it reads false on the leading edge | ||
582 | * of a state transition we just loop back around and | ||
583 | * catch it on the next pass. */ | ||
584 | dlm_run_purge_list(dlm, dlm_shutting_down(dlm)); | ||
585 | |||
586 | /* We really don't want to hold dlm->spinlock while | ||
587 | * calling dlm_shuffle_lists on each lockres that | ||
588 | * needs to have its queues adjusted and AST/BASTs | ||
589 | * run. So let's pull each entry off the dirty_list | ||
590 | * and drop dlm->spinlock ASAP. Once off the list, | ||
591 | * res->spinlock needs to be taken again to protect | ||
592 | * the queues while calling dlm_shuffle_lists. */ | ||
593 | spin_lock(&dlm->spinlock); | ||
594 | while (!list_empty(&dlm->dirty_list)) { | ||
595 | int delay = 0; | ||
596 | res = list_entry(dlm->dirty_list.next, | ||
597 | struct dlm_lock_resource, dirty); | ||
598 | |||
599 | /* peel a lockres off, remove it from the list, | ||
600 | * unset the dirty flag and drop the dlm lock */ | ||
601 | BUG_ON(!res); | ||
602 | dlm_lockres_get(res); | ||
603 | |||
604 | spin_lock(&res->spinlock); | ||
605 | res->state &= ~DLM_LOCK_RES_DIRTY; | ||
606 | list_del_init(&res->dirty); | ||
607 | spin_unlock(&res->spinlock); | ||
608 | spin_unlock(&dlm->spinlock); | ||
609 | |||
610 | /* lockres can be re-dirtied/re-added to the | ||
611 | * dirty_list in this gap, but that is ok */ | ||
612 | |||
613 | spin_lock(&res->spinlock); | ||
614 | if (res->owner != dlm->node_num) { | ||
615 | __dlm_print_one_lock_resource(res); | ||
616 | mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n", | ||
617 | res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no", | ||
618 | res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no", | ||
619 | res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no", | ||
620 | res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); | ||
621 | } | ||
622 | BUG_ON(res->owner != dlm->node_num); | ||
623 | |||
624 | /* it is now ok to move lockreses in these states | ||
625 | * to the dirty list, assuming that they will only be | ||
626 | * dirty for a short while. */ | ||
627 | if (res->state & (DLM_LOCK_RES_IN_PROGRESS | | ||
628 | DLM_LOCK_RES_MIGRATING | | ||
629 | DLM_LOCK_RES_RECOVERING)) { | ||
630 | /* move it to the tail and keep going */ | ||
631 | spin_unlock(&res->spinlock); | ||
632 | mlog(0, "delaying list shuffling for in-" | ||
633 | "progress lockres %.*s, state=%d\n", | ||
634 | res->lockname.len, res->lockname.name, | ||
635 | res->state); | ||
636 | delay = 1; | ||
637 | goto in_progress; | ||
638 | } | ||
639 | |||
640 | /* at this point the lockres is not migrating/ | ||
641 | * recovering/in-progress. we have the lockres | ||
642 | * spinlock and do NOT have the dlm lock. | ||
643 | * safe to reserve/queue asts and run the lists. */ | ||
644 | |||
645 | mlog(0, "calling dlm_shuffle_lists with dlm=%p, " | ||
646 | "res=%p\n", dlm, res); | ||
647 | |||
648 | /* called while holding lockres lock */ | ||
649 | dlm_shuffle_lists(dlm, res); | ||
650 | spin_unlock(&res->spinlock); | ||
651 | |||
652 | dlm_lockres_calc_usage(dlm, res); | ||
653 | |||
654 | in_progress: | ||
655 | |||
656 | spin_lock(&dlm->spinlock); | ||
657 | /* if the lock was in-progress, stick | ||
658 | * it on the back of the list */ | ||
659 | if (delay) { | ||
660 | spin_lock(&res->spinlock); | ||
661 | list_add_tail(&res->dirty, &dlm->dirty_list); | ||
662 | res->state |= DLM_LOCK_RES_DIRTY; | ||
663 | spin_unlock(&res->spinlock); | ||
664 | } | ||
665 | dlm_lockres_put(res); | ||
666 | |||
667 | /* unlikely, but we may need to give time to | ||
668 | * other tasks */ | ||
669 | if (!--n) { | ||
670 | mlog(0, "throttling dlm_thread\n"); | ||
671 | break; | ||
672 | } | ||
673 | } | ||
674 | |||
675 | spin_unlock(&dlm->spinlock); | ||
676 | dlm_flush_asts(dlm); | ||
677 | |||
678 | /* yield and continue right away if there is more work to do */ | ||
679 | if (!n) { | ||
680 | yield(); | ||
681 | continue; | ||
682 | } | ||
683 | |||
684 | wait_event_interruptible_timeout(dlm->dlm_thread_wq, | ||
685 | !dlm_dirty_list_empty(dlm) || | ||
686 | kthread_should_stop(), | ||
687 | timeout); | ||
688 | } | ||
689 | |||
690 | mlog(0, "quitting DLM thread\n"); | ||
691 | return 0; | ||
692 | } | ||
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c new file mode 100644 index 000000000000..cec2ce1cd318 --- /dev/null +++ b/fs/ocfs2/dlm/dlmunlock.c | |||
@@ -0,0 +1,672 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmunlock.c | ||
5 | * | ||
6 | * underlying calls for unlocking locks | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | |||
28 | #include <linux/module.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/types.h> | ||
31 | #include <linux/slab.h> | ||
32 | #include <linux/highmem.h> | ||
33 | #include <linux/utsname.h> | ||
34 | #include <linux/init.h> | ||
35 | #include <linux/sysctl.h> | ||
36 | #include <linux/random.h> | ||
37 | #include <linux/blkdev.h> | ||
38 | #include <linux/socket.h> | ||
39 | #include <linux/inet.h> | ||
40 | #include <linux/spinlock.h> | ||
41 | #include <linux/delay.h> | ||
42 | |||
43 | #include "cluster/heartbeat.h" | ||
44 | #include "cluster/nodemanager.h" | ||
45 | #include "cluster/tcp.h" | ||
46 | |||
47 | #include "dlmapi.h" | ||
48 | #include "dlmcommon.h" | ||
49 | |||
50 | #define MLOG_MASK_PREFIX ML_DLM | ||
51 | #include "cluster/masklog.h" | ||
52 | |||
53 | #define DLM_UNLOCK_FREE_LOCK 0x00000001 | ||
54 | #define DLM_UNLOCK_CALL_AST 0x00000002 | ||
55 | #define DLM_UNLOCK_REMOVE_LOCK 0x00000004 | ||
56 | #define DLM_UNLOCK_REGRANT_LOCK 0x00000008 | ||
57 | #define DLM_UNLOCK_CLEAR_CONVERT_TYPE 0x00000010 | ||
58 | |||
59 | |||
60 | static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm, | ||
61 | struct dlm_lock_resource *res, | ||
62 | struct dlm_lock *lock, | ||
63 | struct dlm_lockstatus *lksb, | ||
64 | int *actions); | ||
65 | static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm, | ||
66 | struct dlm_lock_resource *res, | ||
67 | struct dlm_lock *lock, | ||
68 | struct dlm_lockstatus *lksb, | ||
69 | int *actions); | ||
70 | |||
71 | static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, | ||
72 | struct dlm_lock_resource *res, | ||
73 | struct dlm_lock *lock, | ||
74 | struct dlm_lockstatus *lksb, | ||
75 | int flags, | ||
76 | u8 owner); | ||
77 | |||
78 | |||
79 | /* | ||
80 | * according to the spec: | ||
81 | * http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf | ||
82 | * | ||
83 | * flags & LKM_CANCEL != 0: must be converting or blocked | ||
84 | * flags & LKM_CANCEL == 0: must be granted | ||
85 | * | ||
86 | * So to unlock a converting lock, you must first cancel the | ||
87 | * convert (passing LKM_CANCEL in flags), then call the unlock | ||
88 | * again (with no LKM_CANCEL in flags). | ||
89 | */ | ||
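
To make the two-step rule concrete, here is a minimal, hypothetical sketch (not part of the patch) of how a caller could tear down a lock that is mid-convert, using the dlmunlock() entry point defined at the bottom of this file; DLM_CANCELGRANT handling and retries are elided:

	static enum dlm_status example_unlock_converting(struct dlm_ctxt *dlm,
							 struct dlm_lockstatus *lksb,
							 dlm_astunlockfunc_t *unlockast,
							 void *data)
	{
		enum dlm_status status;

		/* step 1: the convert must be cancelled first (LKM_CANCEL) */
		status = dlmunlock(dlm, lksb, LKM_CANCEL, unlockast, data);
		if (status != DLM_NORMAL)
			return status;

		/* step 2: the lock is back on the granted queue; now unlock
		 * it outright (no LKM_CANCEL in flags) */
		return dlmunlock(dlm, lksb, 0, unlockast, data);
	}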
90 | |||
91 | |||
92 | /* | ||
93 | * locking: | ||
94 | * caller needs: none | ||
95 | * taken: res->spinlock and lock->spinlock taken and dropped | ||
96 | * held on exit: none | ||
97 | * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network | ||
98 | * all callers should have taken an extra ref on lock coming in | ||
99 | */ | ||
100 | static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, | ||
101 | struct dlm_lock_resource *res, | ||
102 | struct dlm_lock *lock, | ||
103 | struct dlm_lockstatus *lksb, | ||
104 | int flags, int *call_ast, | ||
105 | int master_node) | ||
106 | { | ||
107 | enum dlm_status status; | ||
108 | int actions = 0; | ||
109 | int in_use; | ||
110 | u8 owner; | ||
111 | |||
112 | mlog(0, "master_node = %d, valblk = %d\n", master_node, | ||
113 | flags & LKM_VALBLK); | ||
114 | |||
115 | if (master_node) | ||
116 | BUG_ON(res->owner != dlm->node_num); | ||
117 | else | ||
118 | BUG_ON(res->owner == dlm->node_num); | ||
119 | |||
120 | spin_lock(&dlm->spinlock); | ||
121 | /* We want to be sure that we're not freeing a lock | ||
122 | * that still has ASTs pending... */ | ||
123 | in_use = !list_empty(&lock->ast_list); | ||
124 | spin_unlock(&dlm->spinlock); | ||
125 | if (in_use) { | ||
126 | mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " | ||
127 | "while waiting for an ast!", res->lockname.len, | ||
128 | res->lockname.name); | ||
129 | return DLM_BADPARAM; | ||
130 | } | ||
131 | |||
132 | spin_lock(&res->spinlock); | ||
133 | if (res->state & DLM_LOCK_RES_IN_PROGRESS) { | ||
134 | if (master_node) { | ||
135 | mlog(ML_ERROR, "lockres in progress!\n"); | ||
136 | spin_unlock(&res->spinlock); | ||
137 | return DLM_FORWARD; | ||
138 | } | ||
139 | /* ok for this to sleep if not in a network handler */ | ||
140 | __dlm_wait_on_lockres(res); | ||
141 | res->state |= DLM_LOCK_RES_IN_PROGRESS; | ||
142 | } | ||
143 | spin_lock(&lock->spinlock); | ||
144 | |||
145 | if (res->state & DLM_LOCK_RES_RECOVERING) { | ||
146 | status = DLM_RECOVERING; | ||
147 | goto leave; | ||
148 | } | ||
149 | |||
150 | |||
151 | /* see above for what the spec says about | ||
152 | * LKM_CANCEL and the lock queue state */ | ||
153 | if (flags & LKM_CANCEL) | ||
154 | status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions); | ||
155 | else | ||
156 | status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions); | ||
157 | |||
158 | if (status != DLM_NORMAL) | ||
159 | goto leave; | ||
160 | |||
161 | /* By now this has been masked out of cancel requests. */ | ||
162 | if (flags & LKM_VALBLK) { | ||
163 | /* make the final update to the lvb */ | ||
164 | if (master_node) | ||
165 | memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); | ||
166 | else | ||
167 | flags |= LKM_PUT_LVB; /* let the send function | ||
168 | * handle it. */ | ||
169 | } | ||
170 | |||
171 | if (!master_node) { | ||
172 | owner = res->owner; | ||
173 | /* drop locks and send message */ | ||
174 | if (flags & LKM_CANCEL) | ||
175 | lock->cancel_pending = 1; | ||
176 | else | ||
177 | lock->unlock_pending = 1; | ||
178 | spin_unlock(&lock->spinlock); | ||
179 | spin_unlock(&res->spinlock); | ||
180 | status = dlm_send_remote_unlock_request(dlm, res, lock, lksb, | ||
181 | flags, owner); | ||
182 | spin_lock(&res->spinlock); | ||
183 | spin_lock(&lock->spinlock); | ||
184 | /* if the master told us the lock was already granted, | ||
185 | * let the ast handle all of these actions */ | ||
186 | if (status == DLM_NORMAL && | ||
187 | lksb->status == DLM_CANCELGRANT) { | ||
188 | actions &= ~(DLM_UNLOCK_REMOVE_LOCK| | ||
189 | DLM_UNLOCK_REGRANT_LOCK| | ||
190 | DLM_UNLOCK_CLEAR_CONVERT_TYPE); | ||
191 | } | ||
192 | if (flags & LKM_CANCEL) | ||
193 | lock->cancel_pending = 0; | ||
194 | else | ||
195 | lock->unlock_pending = 0; | ||
196 | |||
197 | } | ||
198 | |||
199 | /* get an extra ref on lock. if we are just switching | ||
200 | * lists here, we don't want the lock to go away. */ | ||
201 | dlm_lock_get(lock); | ||
202 | |||
203 | if (actions & DLM_UNLOCK_REMOVE_LOCK) { | ||
204 | list_del_init(&lock->list); | ||
205 | dlm_lock_put(lock); | ||
206 | } | ||
207 | if (actions & DLM_UNLOCK_REGRANT_LOCK) { | ||
208 | dlm_lock_get(lock); | ||
209 | list_add_tail(&lock->list, &res->granted); | ||
210 | } | ||
211 | if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) { | ||
212 | mlog(0, "clearing convert_type at %smaster node\n", | ||
213 | master_node ? "" : "non-"); | ||
214 | lock->ml.convert_type = LKM_IVMODE; | ||
215 | } | ||
216 | |||
217 | /* remove the extra ref on lock */ | ||
218 | dlm_lock_put(lock); | ||
219 | |||
220 | leave: | ||
221 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; | ||
222 | if (!dlm_lock_on_list(&res->converting, lock)) | ||
223 | BUG_ON(lock->ml.convert_type != LKM_IVMODE); | ||
224 | else | ||
225 | BUG_ON(lock->ml.convert_type == LKM_IVMODE); | ||
226 | spin_unlock(&lock->spinlock); | ||
227 | spin_unlock(&res->spinlock); | ||
228 | wake_up(&res->wq); | ||
229 | |||
230 | /* let the caller's final dlm_lock_put handle the actual kfree */ | ||
231 | if (actions & DLM_UNLOCK_FREE_LOCK) { | ||
232 | /* this should always be coupled with list removal */ | ||
233 | BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK)); | ||
234 | mlog(0, "lock %"MLFu64" should be gone now! refs=%d\n", | ||
235 | lock->ml.cookie, atomic_read(&lock->lock_refs.refcount)-1); | ||
236 | dlm_lock_put(lock); | ||
237 | } | ||
238 | if (actions & DLM_UNLOCK_CALL_AST) | ||
239 | *call_ast = 1; | ||
240 | |||
241 | /* if cancel or unlock succeeded, lvb work is done */ | ||
242 | if (status == DLM_NORMAL) | ||
243 | lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); | ||
244 | |||
245 | return status; | ||
246 | } | ||
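
A hypothetical sketch of the refcount contract stated in the comment above dlmunlock_common() ("all callers should have taken an extra ref on lock coming in"); dlmunlock() at the bottom of this file follows the same pattern, so a DLM_UNLOCK_FREE_LOCK action cannot drop the last reference while the caller is still using the lock:

	static enum dlm_status example_call_unlock_common(struct dlm_ctxt *dlm,
							  struct dlm_lock_resource *res,
							  struct dlm_lock *lock,
							  struct dlm_lockstatus *lksb,
							  int flags, int *call_ast)
	{
		enum dlm_status status;

		dlm_lock_get(lock);	/* extra ref, per the contract */
		/* the final '1' assumes this node masters the resource */
		status = dlmunlock_common(dlm, res, lock, lksb, flags,
					  call_ast, 1);
		dlm_lock_put(lock);	/* may be the final put after FREE_LOCK */
		return status;
	}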
247 | |||
248 | void dlm_commit_pending_unlock(struct dlm_lock_resource *res, | ||
249 | struct dlm_lock *lock) | ||
250 | { | ||
251 | /* leave DLM_LKSB_PUT_LVB on the lksb so any final | ||
252 | * update of the lvb will be sent to the new master */ | ||
253 | list_del_init(&lock->list); | ||
254 | } | ||
255 | |||
256 | void dlm_commit_pending_cancel(struct dlm_lock_resource *res, | ||
257 | struct dlm_lock *lock) | ||
258 | { | ||
259 | list_del_init(&lock->list); | ||
260 | list_add_tail(&lock->list, &res->granted); | ||
261 | lock->ml.convert_type = LKM_IVMODE; | ||
262 | } | ||
263 | |||
264 | |||
265 | static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm, | ||
266 | struct dlm_lock_resource *res, | ||
267 | struct dlm_lock *lock, | ||
268 | struct dlm_lockstatus *lksb, | ||
269 | int flags, | ||
270 | int *call_ast) | ||
271 | { | ||
272 | return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1); | ||
273 | } | ||
274 | |||
275 | static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm, | ||
276 | struct dlm_lock_resource *res, | ||
277 | struct dlm_lock *lock, | ||
278 | struct dlm_lockstatus *lksb, | ||
279 | int flags, int *call_ast) | ||
280 | { | ||
281 | return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0); | ||
282 | } | ||
283 | |||
284 | /* | ||
285 | * locking: | ||
286 | * caller needs: none | ||
287 | * taken: none | ||
288 | * held on exit: none | ||
289 | * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network | ||
290 | */ | ||
291 | static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, | ||
292 | struct dlm_lock_resource *res, | ||
293 | struct dlm_lock *lock, | ||
294 | struct dlm_lockstatus *lksb, | ||
295 | int flags, | ||
296 | u8 owner) | ||
297 | { | ||
298 | struct dlm_unlock_lock unlock; | ||
299 | int tmpret; | ||
300 | enum dlm_status ret; | ||
301 | int status = 0; | ||
302 | struct kvec vec[2]; | ||
303 | size_t veclen = 1; | ||
304 | |||
305 | mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); | ||
306 | |||
307 | memset(&unlock, 0, sizeof(unlock)); | ||
308 | unlock.node_idx = dlm->node_num; | ||
309 | unlock.flags = cpu_to_be32(flags); | ||
310 | unlock.cookie = lock->ml.cookie; | ||
311 | unlock.namelen = res->lockname.len; | ||
312 | memcpy(unlock.name, res->lockname.name, unlock.namelen); | ||
313 | |||
314 | vec[0].iov_len = sizeof(struct dlm_unlock_lock); | ||
315 | vec[0].iov_base = &unlock; | ||
316 | |||
317 | if (flags & LKM_PUT_LVB) { | ||
318 | /* extra data to send if we are updating lvb */ | ||
319 | vec[1].iov_len = DLM_LVB_LEN; | ||
320 | vec[1].iov_base = lock->lksb->lvb; | ||
321 | veclen++; | ||
322 | } | ||
323 | |||
324 | tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key, | ||
325 | vec, veclen, owner, &status); | ||
326 | if (tmpret >= 0) { | ||
327 | // successfully sent and received | ||
328 | if (status == DLM_CANCELGRANT) | ||
329 | ret = DLM_NORMAL; | ||
330 | else if (status == DLM_FORWARD) { | ||
331 | mlog(0, "master was in-progress. retry\n"); | ||
332 | ret = DLM_FORWARD; | ||
333 | } else | ||
334 | ret = status; | ||
335 | lksb->status = status; | ||
336 | } else { | ||
337 | mlog_errno(tmpret); | ||
338 | if (dlm_is_host_down(tmpret)) { | ||
339 | /* NOTE: this seems strange, but it is what we want. | ||
340 | * when the master goes down during a cancel or | ||
341 | * unlock, the recovery code completes the operation | ||
342 | * as if the master had not died, then passes the | ||
343 | * updated state to the recovery master. this thread | ||
344 | * just needs to finish out the operation and call | ||
345 | * the unlockast. */ | ||
346 | ret = DLM_NORMAL; | ||
347 | } else { | ||
348 | /* something bad. this will BUG in ocfs2 */ | ||
349 | ret = dlm_err_to_dlm_status(tmpret); | ||
350 | } | ||
351 | lksb->status = ret; | ||
352 | } | ||
353 | |||
354 | return ret; | ||
355 | } | ||
356 | |||
357 | /* | ||
358 | * locking: | ||
359 | * caller needs: none | ||
360 | * taken: takes and drops res->spinlock | ||
361 | * held on exit: none | ||
362 | * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID, | ||
363 | * return value from dlmunlock_master | ||
364 | */ | ||
365 | int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data) | ||
366 | { | ||
367 | struct dlm_ctxt *dlm = data; | ||
368 | struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; | ||
369 | struct dlm_lock_resource *res = NULL; | ||
370 | struct list_head *iter; | ||
371 | struct dlm_lock *lock = NULL; | ||
372 | enum dlm_status status = DLM_NORMAL; | ||
373 | int found = 0, i; | ||
374 | struct dlm_lockstatus *lksb = NULL; | ||
375 | int ignore; | ||
376 | u32 flags; | ||
377 | struct list_head *queue; | ||
378 | |||
379 | flags = be32_to_cpu(unlock->flags); | ||
380 | |||
381 | if (flags & LKM_GET_LVB) { | ||
382 | mlog(ML_ERROR, "bad args! GET_LVB specified on unlock!\n"); | ||
383 | return DLM_BADARGS; | ||
384 | } | ||
385 | |||
386 | if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) { | ||
387 | mlog(ML_ERROR, "bad args! cannot modify lvb on a CANCEL " | ||
388 | "request!\n"); | ||
389 | return DLM_BADARGS; | ||
390 | } | ||
391 | |||
392 | if (unlock->namelen > DLM_LOCKID_NAME_MAX) { | ||
393 | mlog(ML_ERROR, "Invalid name length in unlock handler!\n"); | ||
394 | return DLM_IVBUFLEN; | ||
395 | } | ||
396 | |||
397 | if (!dlm_grab(dlm)) | ||
398 | return DLM_REJECTED; | ||
399 | |||
400 | mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), | ||
401 | "Domain %s not fully joined!\n", dlm->name); | ||
402 | |||
403 | mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none"); | ||
404 | |||
405 | res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen); | ||
406 | if (!res) { | ||
407 | /* We assume here that a missing lock resource simply means | ||
408 | * it was migrated away and destroyed before the other | ||
409 | * node could detect it. */ | ||
410 | mlog(0, "returning DLM_FORWARD -- res no longer exists\n"); | ||
411 | status = DLM_FORWARD; | ||
412 | goto not_found; | ||
413 | } | ||
414 | |||
415 | queue = &res->granted; | ||
416 | found = 0; | ||
417 | spin_lock(&res->spinlock); | ||
418 | if (res->state & DLM_LOCK_RES_RECOVERING) { | ||
419 | spin_unlock(&res->spinlock); | ||
420 | mlog(0, "returning DLM_RECOVERING\n"); | ||
421 | status = DLM_RECOVERING; | ||
422 | goto leave; | ||
423 | } | ||
424 | |||
425 | if (res->state & DLM_LOCK_RES_MIGRATING) { | ||
426 | spin_unlock(&res->spinlock); | ||
427 | mlog(0, "returning DLM_MIGRATING\n"); | ||
428 | status = DLM_MIGRATING; | ||
429 | goto leave; | ||
430 | } | ||
431 | |||
432 | if (res->owner != dlm->node_num) { | ||
433 | spin_unlock(&res->spinlock); | ||
434 | mlog(0, "returning DLM_FORWARD -- not master\n"); | ||
435 | status = DLM_FORWARD; | ||
436 | goto leave; | ||
437 | } | ||
438 | |||
439 | for (i = 0; i < 3; i++) { | ||
440 | list_for_each(iter, queue) { | ||
441 | lock = list_entry(iter, struct dlm_lock, list); | ||
442 | if (lock->ml.cookie == unlock->cookie && | ||
443 | lock->ml.node == unlock->node_idx) { | ||
444 | dlm_lock_get(lock); | ||
445 | found = 1; | ||
446 | break; | ||
447 | } | ||
448 | } | ||
449 | if (found) | ||
450 | break; | ||
451 | /* scan granted -> converting -> blocked queues */ | ||
452 | queue++; | ||
453 | } | ||
454 | spin_unlock(&res->spinlock); | ||
455 | if (!found) { | ||
456 | status = DLM_IVLOCKID; | ||
457 | goto not_found; | ||
458 | } | ||
459 | |||
460 | /* lock was found on queue */ | ||
461 | lksb = lock->lksb; | ||
462 | /* unlockast only called on originating node */ | ||
463 | if (flags & LKM_PUT_LVB) { | ||
464 | lksb->flags |= DLM_LKSB_PUT_LVB; | ||
465 | memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN); | ||
466 | } | ||
467 | |||
468 | /* if this is in-progress, propagate the DLM_FORWARD | ||
469 | * all the way back out */ | ||
470 | status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore); | ||
471 | if (status == DLM_FORWARD) | ||
472 | mlog(0, "lockres is in progress\n"); | ||
473 | |||
474 | if (flags & LKM_PUT_LVB) | ||
475 | lksb->flags &= ~DLM_LKSB_PUT_LVB; | ||
476 | |||
477 | dlm_lockres_calc_usage(dlm, res); | ||
478 | dlm_kick_thread(dlm, res); | ||
479 | |||
480 | not_found: | ||
481 | if (!found) | ||
482 | mlog(ML_ERROR, "failed to find lock to unlock! " | ||
483 | "cookie=%"MLFu64"\n", | ||
484 | unlock->cookie); | ||
485 | else { | ||
486 | /* send the lksb->status back to the other node */ | ||
487 | status = lksb->status; | ||
488 | dlm_lock_put(lock); | ||
489 | } | ||
490 | |||
491 | leave: | ||
492 | if (res) | ||
493 | dlm_lockres_put(res); | ||
494 | |||
495 | dlm_put(dlm); | ||
496 | |||
497 | return status; | ||
498 | } | ||
499 | |||
500 | |||
501 | static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm, | ||
502 | struct dlm_lock_resource *res, | ||
503 | struct dlm_lock *lock, | ||
504 | struct dlm_lockstatus *lksb, | ||
505 | int *actions) | ||
506 | { | ||
507 | enum dlm_status status; | ||
508 | |||
509 | if (dlm_lock_on_list(&res->blocked, lock)) { | ||
510 | /* cancel this outright */ | ||
511 | lksb->status = DLM_NORMAL; | ||
512 | status = DLM_NORMAL; | ||
513 | *actions = (DLM_UNLOCK_CALL_AST | | ||
514 | DLM_UNLOCK_REMOVE_LOCK); | ||
515 | } else if (dlm_lock_on_list(&res->converting, lock)) { | ||
516 | /* cancel the request, put back on granted */ | ||
517 | lksb->status = DLM_NORMAL; | ||
518 | status = DLM_NORMAL; | ||
519 | *actions = (DLM_UNLOCK_CALL_AST | | ||
520 | DLM_UNLOCK_REMOVE_LOCK | | ||
521 | DLM_UNLOCK_REGRANT_LOCK | | ||
522 | DLM_UNLOCK_CLEAR_CONVERT_TYPE); | ||
523 | } else if (dlm_lock_on_list(&res->granted, lock)) { | ||
524 | /* too late, already granted. DLM_CANCELGRANT */ | ||
525 | lksb->status = DLM_CANCELGRANT; | ||
526 | status = DLM_NORMAL; | ||
527 | *actions = DLM_UNLOCK_CALL_AST; | ||
528 | } else { | ||
529 | mlog(ML_ERROR, "lock to cancel is not on any list!\n"); | ||
530 | lksb->status = DLM_IVLOCKID; | ||
531 | status = DLM_IVLOCKID; | ||
532 | *actions = 0; | ||
533 | } | ||
534 | return status; | ||
535 | } | ||
536 | |||
537 | static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm, | ||
538 | struct dlm_lock_resource *res, | ||
539 | struct dlm_lock *lock, | ||
540 | struct dlm_lockstatus *lksb, | ||
541 | int *actions) | ||
542 | { | ||
543 | enum dlm_status status; | ||
544 | |||
545 | /* unlock request */ | ||
546 | if (!dlm_lock_on_list(&res->granted, lock)) { | ||
547 | lksb->status = DLM_DENIED; | ||
548 | status = DLM_DENIED; | ||
549 | dlm_error(status); | ||
550 | *actions = 0; | ||
551 | } else { | ||
552 | /* unlock granted lock */ | ||
553 | lksb->status = DLM_NORMAL; | ||
554 | status = DLM_NORMAL; | ||
555 | *actions = (DLM_UNLOCK_FREE_LOCK | | ||
556 | DLM_UNLOCK_CALL_AST | | ||
557 | DLM_UNLOCK_REMOVE_LOCK); | ||
558 | } | ||
559 | return status; | ||
560 | } | ||
561 | |||
562 | /* there seems to be no point in doing this async | ||
563 | * since (even for the remote case) there is really | ||
564 | * no work to queue up... so just do it and fire the | ||
565 | * unlockast by hand when done... */ | ||
566 | enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb, | ||
567 | int flags, dlm_astunlockfunc_t *unlockast, void *data) | ||
568 | { | ||
569 | enum dlm_status status; | ||
570 | struct dlm_lock_resource *res; | ||
571 | struct dlm_lock *lock = NULL; | ||
572 | int call_ast, is_master; | ||
573 | |||
574 | mlog_entry_void(); | ||
575 | |||
576 | if (!lksb) { | ||
577 | dlm_error(DLM_BADARGS); | ||
578 | return DLM_BADARGS; | ||
579 | } | ||
580 | |||
581 | if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) { | ||
582 | dlm_error(DLM_BADPARAM); | ||
583 | return DLM_BADPARAM; | ||
584 | } | ||
585 | |||
586 | if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) { | ||
587 | mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n"); | ||
588 | flags &= ~LKM_VALBLK; | ||
589 | } | ||
590 | |||
591 | if (!lksb->lockid || !lksb->lockid->lockres) { | ||
592 | dlm_error(DLM_BADPARAM); | ||
593 | return DLM_BADPARAM; | ||
594 | } | ||
595 | |||
596 | lock = lksb->lockid; | ||
597 | BUG_ON(!lock); | ||
598 | dlm_lock_get(lock); | ||
599 | |||
600 | res = lock->lockres; | ||
601 | BUG_ON(!res); | ||
602 | dlm_lockres_get(res); | ||
603 | retry: | ||
604 | call_ast = 0; | ||
605 | /* need to retry up here because owner may have changed */ | ||
606 | mlog(0, "lock=%p res=%p\n", lock, res); | ||
607 | |||
608 | spin_lock(&res->spinlock); | ||
609 | is_master = (res->owner == dlm->node_num); | ||
610 | spin_unlock(&res->spinlock); | ||
611 | |||
612 | if (is_master) { | ||
613 | status = dlmunlock_master(dlm, res, lock, lksb, flags, | ||
614 | &call_ast); | ||
615 | mlog(0, "done calling dlmunlock_master: returned %d, " | ||
616 | "call_ast is %d\n", status, call_ast); | ||
617 | } else { | ||
618 | status = dlmunlock_remote(dlm, res, lock, lksb, flags, | ||
619 | &call_ast); | ||
620 | mlog(0, "done calling dlmunlock_remote: returned %d, " | ||
621 | "call_ast is %d\n", status, call_ast); | ||
622 | } | ||
623 | |||
624 | if (status == DLM_RECOVERING || | ||
625 | status == DLM_MIGRATING || | ||
626 | status == DLM_FORWARD) { | ||
627 | /* We want to go away for a tiny bit to allow recovery | ||
628 | * / migration to complete on this resource. I don't | ||
629 | * know of any wait queue we could sleep on as this | ||
630 | * may be happening on another node. Perhaps the | ||
631 | * proper solution is to queue up requests on the | ||
632 | * other end? */ | ||
633 | |||
634 | /* do we want to yield(); ?? */ | ||
635 | msleep(50); | ||
636 | |||
637 | mlog(0, "retrying unlock due to pending recovery/" | ||
638 | "migration/in-progress\n"); | ||
639 | goto retry; | ||
640 | } | ||
641 | |||
642 | if (call_ast) { | ||
643 | mlog(0, "calling unlockast(%p, %d)\n", data, lksb->status); | ||
644 | if (is_master) { | ||
645 | /* it is possible that there is one last bast | ||
646 | * pending. make sure it is flushed, then | ||
647 | * call the unlockast. | ||
648 | * not an issue if this lock is mastered remotely, | ||
649 | * since this lock has been removed from the | ||
650 | * lockres queues and cannot be found. */ | ||
651 | dlm_kick_thread(dlm, NULL); | ||
652 | wait_event(dlm->ast_wq, | ||
653 | dlm_lock_basts_flushed(dlm, lock)); | ||
654 | } | ||
655 | (*unlockast)(data, lksb->status); | ||
656 | } | ||
657 | |||
658 | if (status == DLM_NORMAL) { | ||
659 | mlog(0, "kicking the thread\n"); | ||
660 | dlm_kick_thread(dlm, res); | ||
661 | } else | ||
662 | dlm_error(status); | ||
663 | |||
664 | dlm_lockres_calc_usage(dlm, res); | ||
665 | dlm_lockres_put(res); | ||
666 | dlm_lock_put(lock); | ||
667 | |||
668 | mlog(0, "returning status=%d!\n", status); | ||
669 | return status; | ||
670 | } | ||
671 | EXPORT_SYMBOL_GPL(dlmunlock); | ||
672 | |||
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c new file mode 100644 index 000000000000..7ef2653f8f41 --- /dev/null +++ b/fs/ocfs2/dlm/dlmver.c | |||
@@ -0,0 +1,42 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmver.c | ||
5 | * | ||
6 | * version string | ||
7 | * | ||
8 | * Copyright (C) 2002, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/kernel.h> | ||
28 | |||
29 | #include "dlmver.h" | ||
30 | |||
31 | #define DLM_BUILD_VERSION "1.3.3" | ||
32 | |||
33 | #define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION | ||
34 | |||
35 | void dlm_print_version(void) | ||
36 | { | ||
37 | printk(KERN_INFO "%s\n", VERSION_STR); | ||
38 | } | ||
39 | |||
40 | MODULE_DESCRIPTION(VERSION_STR); | ||
41 | |||
42 | MODULE_VERSION(DLM_BUILD_VERSION); | ||
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h new file mode 100644 index 000000000000..f674aee77a16 --- /dev/null +++ b/fs/ocfs2/dlm/dlmver.h | |||
@@ -0,0 +1,31 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmver.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef DLM_VER_H | ||
27 | #define DLM_VER_H | ||
28 | |||
29 | void dlm_print_version(void); | ||
30 | |||
31 | #endif /* DLM_VER_H */ | ||
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c new file mode 100644 index 000000000000..e1fdd288796e --- /dev/null +++ b/fs/ocfs2/dlm/userdlm.c | |||
@@ -0,0 +1,658 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * userdlm.c | ||
5 | * | ||
6 | * Code which implements the kernel side of a minimal userspace | ||
7 | * interface to our DLM. | ||
8 | * | ||
9 | * Many of the functions here are pared down versions of dlmglue.c | ||
10 | * functions. | ||
11 | * | ||
12 | * Copyright (C) 2003, 2004 Oracle. All rights reserved. | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or | ||
15 | * modify it under the terms of the GNU General Public | ||
16 | * License as published by the Free Software Foundation; either | ||
17 | * version 2 of the License, or (at your option) any later version. | ||
18 | * | ||
19 | * This program is distributed in the hope that it will be useful, | ||
20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
22 | * General Public License for more details. | ||
23 | * | ||
24 | * You should have received a copy of the GNU General Public | ||
25 | * License along with this program; if not, write to the | ||
26 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
27 | * Boston, MA 021110-1307, USA. | ||
28 | */ | ||
29 | |||
30 | #include <asm/signal.h> | ||
31 | |||
32 | #include <linux/module.h> | ||
33 | #include <linux/fs.h> | ||
34 | #include <linux/types.h> | ||
35 | #include <linux/crc32.h> | ||
36 | |||
37 | |||
38 | #include "cluster/nodemanager.h" | ||
39 | #include "cluster/heartbeat.h" | ||
40 | #include "cluster/tcp.h" | ||
41 | |||
42 | #include "dlmapi.h" | ||
43 | |||
44 | #include "userdlm.h" | ||
45 | |||
46 | #define MLOG_MASK_PREFIX ML_DLMFS | ||
47 | #include "cluster/masklog.h" | ||
48 | |||
49 | static inline int user_check_wait_flag(struct user_lock_res *lockres, | ||
50 | int flag) | ||
51 | { | ||
52 | int ret; | ||
53 | |||
54 | spin_lock(&lockres->l_lock); | ||
55 | ret = lockres->l_flags & flag; | ||
56 | spin_unlock(&lockres->l_lock); | ||
57 | |||
58 | return ret; | ||
59 | } | ||
60 | |||
61 | static inline void user_wait_on_busy_lock(struct user_lock_res *lockres) | ||
62 | |||
63 | { | ||
64 | wait_event(lockres->l_event, | ||
65 | !user_check_wait_flag(lockres, USER_LOCK_BUSY)); | ||
66 | } | ||
67 | |||
68 | static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres) | ||
69 | |||
70 | { | ||
71 | wait_event(lockres->l_event, | ||
72 | !user_check_wait_flag(lockres, USER_LOCK_BLOCKED)); | ||
73 | } | ||
74 | |||
75 | /* I heart container_of... */ | ||
76 | static inline struct dlm_ctxt * | ||
77 | dlm_ctxt_from_user_lockres(struct user_lock_res *lockres) | ||
78 | { | ||
79 | struct dlmfs_inode_private *ip; | ||
80 | |||
81 | ip = container_of(lockres, | ||
82 | struct dlmfs_inode_private, | ||
83 | ip_lockres); | ||
84 | return ip->ip_dlm; | ||
85 | } | ||
86 | |||
87 | static struct inode * | ||
88 | user_dlm_inode_from_user_lockres(struct user_lock_res *lockres) | ||
89 | { | ||
90 | struct dlmfs_inode_private *ip; | ||
91 | |||
92 | ip = container_of(lockres, | ||
93 | struct dlmfs_inode_private, | ||
94 | ip_lockres); | ||
95 | return &ip->ip_vfs_inode; | ||
96 | } | ||
97 | |||
98 | static inline void user_recover_from_dlm_error(struct user_lock_res *lockres) | ||
99 | { | ||
100 | spin_lock(&lockres->l_lock); | ||
101 | lockres->l_flags &= ~USER_LOCK_BUSY; | ||
102 | spin_unlock(&lockres->l_lock); | ||
103 | } | ||
104 | |||
105 | #define user_log_dlm_error(_func, _stat, _lockres) do { \ | ||
106 | mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ | ||
107 | "resource %s: %s\n", dlm_errname(_stat), _func, \ | ||
108 | _lockres->l_name, dlm_errmsg(_stat)); \ | ||
109 | } while (0) | ||
110 | |||
111 | /* WARNING: This function lives in a world where the only three lock | ||
112 | * levels are EX, PR, and NL. It *will* have to be adjusted when more | ||
113 | * lock types are added. */ | ||
114 | static inline int user_highest_compat_lock_level(int level) | ||
115 | { | ||
116 | int new_level = LKM_EXMODE; | ||
117 | |||
118 | if (level == LKM_EXMODE) | ||
119 | new_level = LKM_NLMODE; | ||
120 | else if (level == LKM_PRMODE) | ||
121 | new_level = LKM_PRMODE; | ||
122 | return new_level; | ||
123 | } | ||
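
As a quick self-check of the mapping above (derived from the code, not from the original patch): a blocking EX request forces us down to NL, a blocking PR request lets us keep PR, and NL is compatible with EX:

	static void example_check_compat_levels(void)
	{
		BUG_ON(user_highest_compat_lock_level(LKM_EXMODE) != LKM_NLMODE);
		BUG_ON(user_highest_compat_lock_level(LKM_PRMODE) != LKM_PRMODE);
		BUG_ON(user_highest_compat_lock_level(LKM_NLMODE) != LKM_EXMODE);
	}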
124 | |||
125 | static void user_ast(void *opaque) | ||
126 | { | ||
127 | struct user_lock_res *lockres = opaque; | ||
128 | struct dlm_lockstatus *lksb; | ||
129 | |||
130 | mlog(0, "AST fired for lockres %s\n", lockres->l_name); | ||
131 | |||
132 | spin_lock(&lockres->l_lock); | ||
133 | |||
134 | lksb = &(lockres->l_lksb); | ||
135 | if (lksb->status != DLM_NORMAL) { | ||
136 | mlog(ML_ERROR, "lksb status value of %u on lockres %s\n", | ||
137 | lksb->status, lockres->l_name); | ||
138 | spin_unlock(&lockres->l_lock); | ||
139 | return; | ||
140 | } | ||
141 | |||
142 | /* we're downconverting. */ | ||
143 | if (lockres->l_requested < lockres->l_level) { | ||
144 | if (lockres->l_requested <= | ||
145 | user_highest_compat_lock_level(lockres->l_blocking)) { | ||
146 | lockres->l_blocking = LKM_NLMODE; | ||
147 | lockres->l_flags &= ~USER_LOCK_BLOCKED; | ||
148 | } | ||
149 | } | ||
150 | |||
151 | lockres->l_level = lockres->l_requested; | ||
152 | lockres->l_requested = LKM_IVMODE; | ||
153 | lockres->l_flags |= USER_LOCK_ATTACHED; | ||
154 | lockres->l_flags &= ~USER_LOCK_BUSY; | ||
155 | |||
156 | spin_unlock(&lockres->l_lock); | ||
157 | |||
158 | wake_up(&lockres->l_event); | ||
159 | } | ||
160 | |||
161 | static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres) | ||
162 | { | ||
163 | struct inode *inode; | ||
164 | inode = user_dlm_inode_from_user_lockres(lockres); | ||
165 | if (!igrab(inode)) | ||
166 | BUG(); | ||
167 | } | ||
168 | |||
169 | static void user_dlm_unblock_lock(void *opaque); | ||
170 | |||
171 | static void __user_dlm_queue_lockres(struct user_lock_res *lockres) | ||
172 | { | ||
173 | if (!(lockres->l_flags & USER_LOCK_QUEUED)) { | ||
174 | user_dlm_grab_inode_ref(lockres); | ||
175 | |||
176 | INIT_WORK(&lockres->l_work, user_dlm_unblock_lock, | ||
177 | lockres); | ||
178 | |||
179 | queue_work(user_dlm_worker, &lockres->l_work); | ||
180 | lockres->l_flags |= USER_LOCK_QUEUED; | ||
181 | } | ||
182 | } | ||
183 | |||
184 | static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres) | ||
185 | { | ||
186 | int queue = 0; | ||
187 | |||
188 | if (!(lockres->l_flags & USER_LOCK_BLOCKED)) | ||
189 | return; | ||
190 | |||
191 | switch (lockres->l_blocking) { | ||
192 | case LKM_EXMODE: | ||
193 | if (!lockres->l_ex_holders && !lockres->l_ro_holders) | ||
194 | queue = 1; | ||
195 | break; | ||
196 | case LKM_PRMODE: | ||
197 | if (!lockres->l_ex_holders) | ||
198 | queue = 1; | ||
199 | break; | ||
200 | default: | ||
201 | BUG(); | ||
202 | } | ||
203 | |||
204 | if (queue) | ||
205 | __user_dlm_queue_lockres(lockres); | ||
206 | } | ||
207 | |||
208 | static void user_bast(void *opaque, int level) | ||
209 | { | ||
210 | struct user_lock_res *lockres = opaque; | ||
211 | |||
212 | mlog(0, "Blocking AST fired for lockres %s. Blocking level %d\n", | ||
213 | lockres->l_name, level); | ||
214 | |||
215 | spin_lock(&lockres->l_lock); | ||
216 | lockres->l_flags |= USER_LOCK_BLOCKED; | ||
217 | if (level > lockres->l_blocking) | ||
218 | lockres->l_blocking = level; | ||
219 | |||
220 | __user_dlm_queue_lockres(lockres); | ||
221 | spin_unlock(&lockres->l_lock); | ||
222 | |||
223 | wake_up(&lockres->l_event); | ||
224 | } | ||
225 | |||
226 | static void user_unlock_ast(void *opaque, enum dlm_status status) | ||
227 | { | ||
228 | struct user_lock_res *lockres = opaque; | ||
229 | |||
230 | mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name); | ||
231 | |||
232 | if (status != DLM_NORMAL) | ||
233 | mlog(ML_ERROR, "Dlm returns status %d\n", status); | ||
234 | |||
235 | spin_lock(&lockres->l_lock); | ||
236 | if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) | ||
237 | lockres->l_level = LKM_IVMODE; | ||
238 | else { | ||
239 | lockres->l_requested = LKM_IVMODE; /* cancel an | ||
240 | * upconvert | ||
241 | * request. */ | ||
242 | lockres->l_flags &= ~USER_LOCK_IN_CANCEL; | ||
243 | /* we want the unblock thread to look at it again | ||
244 | * now. */ | ||
245 | __user_dlm_queue_lockres(lockres); | ||
246 | } | ||
247 | |||
248 | lockres->l_flags &= ~USER_LOCK_BUSY; | ||
249 | spin_unlock(&lockres->l_lock); | ||
250 | |||
251 | wake_up(&lockres->l_event); | ||
252 | } | ||
253 | |||
254 | static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) | ||
255 | { | ||
256 | struct inode *inode; | ||
257 | inode = user_dlm_inode_from_user_lockres(lockres); | ||
258 | iput(inode); | ||
259 | } | ||
260 | |||
261 | static void user_dlm_unblock_lock(void *opaque) | ||
262 | { | ||
263 | int new_level, status; | ||
264 | struct user_lock_res *lockres = (struct user_lock_res *) opaque; | ||
265 | struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); | ||
266 | |||
267 | mlog(0, "processing lockres %s\n", lockres->l_name); | ||
268 | |||
269 | spin_lock(&lockres->l_lock); | ||
270 | |||
271 | BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); | ||
272 | BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED)); | ||
273 | |||
274 | /* notice that we don't clear USER_LOCK_BLOCKED here. That's | ||
275 | * for user_ast to do. */ | ||
276 | lockres->l_flags &= ~USER_LOCK_QUEUED; | ||
277 | |||
278 | if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { | ||
279 | mlog(0, "lock is in teardown so we do nothing\n"); | ||
280 | spin_unlock(&lockres->l_lock); | ||
281 | goto drop_ref; | ||
282 | } | ||
283 | |||
284 | if (lockres->l_flags & USER_LOCK_BUSY) { | ||
285 | mlog(0, "BUSY flag detected...\n"); | ||
286 | if (lockres->l_flags & USER_LOCK_IN_CANCEL) { | ||
287 | spin_unlock(&lockres->l_lock); | ||
288 | goto drop_ref; | ||
289 | } | ||
290 | |||
291 | lockres->l_flags |= USER_LOCK_IN_CANCEL; | ||
292 | spin_unlock(&lockres->l_lock); | ||
293 | |||
294 | status = dlmunlock(dlm, | ||
295 | &lockres->l_lksb, | ||
296 | LKM_CANCEL, | ||
297 | user_unlock_ast, | ||
298 | lockres); | ||
299 | if (status == DLM_CANCELGRANT) { | ||
300 | /* If we got this, then the ast was fired | ||
301 | * before we could cancel. We clean up our | ||
302 | * state, and restart the function. */ | ||
303 | spin_lock(&lockres->l_lock); | ||
304 | lockres->l_flags &= ~USER_LOCK_IN_CANCEL; | ||
305 | spin_unlock(&lockres->l_lock); | ||
306 | } else if (status != DLM_NORMAL) | ||
307 | user_log_dlm_error("dlmunlock", status, lockres); | ||
308 | goto drop_ref; | ||
309 | } | ||
310 | |||
311 | /* If there are still incompat holders, we can exit safely | ||
312 | * without worrying about re-queueing this lock as that will | ||
313 | * happen on the last call to user_cluster_unlock. */ | ||
314 | if ((lockres->l_blocking == LKM_EXMODE) | ||
315 | && (lockres->l_ex_holders || lockres->l_ro_holders)) { | ||
316 | spin_unlock(&lockres->l_lock); | ||
317 | mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n", | ||
318 | lockres->l_ro_holders, lockres->l_ex_holders); | ||
319 | goto drop_ref; | ||
320 | } | ||
321 | |||
322 | if ((lockres->l_blocking == LKM_PRMODE) | ||
323 | && lockres->l_ex_holders) { | ||
324 | spin_unlock(&lockres->l_lock); | ||
325 | mlog(0, "can't downconvert for pr: ex = %u\n", | ||
326 | lockres->l_ex_holders); | ||
327 | goto drop_ref; | ||
328 | } | ||
329 | |||
330 | /* yay, we can downconvert now. */ | ||
331 | new_level = user_highest_compat_lock_level(lockres->l_blocking); | ||
332 | lockres->l_requested = new_level; | ||
333 | lockres->l_flags |= USER_LOCK_BUSY; | ||
334 | mlog(0, "Downconvert lock from %d to %d\n", | ||
335 | lockres->l_level, new_level); | ||
336 | spin_unlock(&lockres->l_lock); | ||
337 | |||
338 | /* issue the downconvert request now... */ | ||
339 | status = dlmlock(dlm, | ||
340 | new_level, | ||
341 | &lockres->l_lksb, | ||
342 | LKM_CONVERT|LKM_VALBLK, | ||
343 | lockres->l_name, | ||
344 | user_ast, | ||
345 | lockres, | ||
346 | user_bast); | ||
347 | if (status != DLM_NORMAL) { | ||
348 | user_log_dlm_error("dlmlock", status, lockres); | ||
349 | user_recover_from_dlm_error(lockres); | ||
350 | } | ||
351 | |||
352 | drop_ref: | ||
353 | user_dlm_drop_inode_ref(lockres); | ||
354 | } | ||
355 | |||
356 | static inline void user_dlm_inc_holders(struct user_lock_res *lockres, | ||
357 | int level) | ||
358 | { | ||
359 | switch(level) { | ||
360 | case LKM_EXMODE: | ||
361 | lockres->l_ex_holders++; | ||
362 | break; | ||
363 | case LKM_PRMODE: | ||
364 | lockres->l_ro_holders++; | ||
365 | break; | ||
366 | default: | ||
367 | BUG(); | ||
368 | } | ||
369 | } | ||
370 | |||
371 | /* predict what lock level we'll be dropping down to on behalf | ||
372 | * of another node, and return true if the currently wanted | ||
373 | * level will be compatible with it. */ | ||
374 | static inline int | ||
375 | user_may_continue_on_blocked_lock(struct user_lock_res *lockres, | ||
376 | int wanted) | ||
377 | { | ||
378 | BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); | ||
379 | |||
380 | return wanted <= user_highest_compat_lock_level(lockres->l_blocking); | ||
381 | } | ||
382 | |||
383 | int user_dlm_cluster_lock(struct user_lock_res *lockres, | ||
384 | int level, | ||
385 | int lkm_flags) | ||
386 | { | ||
387 | int status, local_flags; | ||
388 | struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); | ||
389 | |||
390 | if (level != LKM_EXMODE && | ||
391 | level != LKM_PRMODE) { | ||
392 | mlog(ML_ERROR, "lockres %s: invalid request!\n", | ||
393 | lockres->l_name); | ||
394 | status = -EINVAL; | ||
395 | goto bail; | ||
396 | } | ||
397 | |||
398 | mlog(0, "lockres %s: asking for %s lock, passed flags = 0x%x\n", | ||
399 | lockres->l_name, | ||
400 | (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE", | ||
401 | lkm_flags); | ||
402 | |||
403 | again: | ||
404 | if (signal_pending(current)) { | ||
405 | status = -ERESTARTSYS; | ||
406 | goto bail; | ||
407 | } | ||
408 | |||
409 | spin_lock(&lockres->l_lock); | ||
410 | |||
411 | /* We only compare against the currently granted level | ||
412 | * here. If the lock is blocked waiting on a downconvert, | ||
413 | * we'll get caught below. */ | ||
414 | if ((lockres->l_flags & USER_LOCK_BUSY) && | ||
415 | (level > lockres->l_level)) { | ||
416 | /* is someone sitting in dlm_lock? If so, wait on | ||
417 | * them. */ | ||
418 | spin_unlock(&lockres->l_lock); | ||
419 | |||
420 | user_wait_on_busy_lock(lockres); | ||
421 | goto again; | ||
422 | } | ||
423 | |||
424 | if ((lockres->l_flags & USER_LOCK_BLOCKED) && | ||
425 | (!user_may_continue_on_blocked_lock(lockres, level))) { | ||
426 | /* the lock is currently blocked on behalf of | ||
427 | * another node */ | ||
428 | spin_unlock(&lockres->l_lock); | ||
429 | |||
430 | user_wait_on_blocked_lock(lockres); | ||
431 | goto again; | ||
432 | } | ||
433 | |||
434 | if (level > lockres->l_level) { | ||
435 | local_flags = lkm_flags | LKM_VALBLK; | ||
436 | if (lockres->l_level != LKM_IVMODE) | ||
437 | local_flags |= LKM_CONVERT; | ||
438 | |||
439 | lockres->l_requested = level; | ||
440 | lockres->l_flags |= USER_LOCK_BUSY; | ||
441 | spin_unlock(&lockres->l_lock); | ||
442 | |||
443 | BUG_ON(level == LKM_IVMODE); | ||
444 | BUG_ON(level == LKM_NLMODE); | ||
445 | |||
446 | mlog(0, "lock %s, get lock from %d to level = %d\n", | ||
447 | lockres->l_name, lockres->l_level, level); | ||
448 | |||
449 | /* call dlm_lock to upgrade lock now */ | ||
450 | status = dlmlock(dlm, | ||
451 | level, | ||
452 | &lockres->l_lksb, | ||
453 | local_flags, | ||
454 | lockres->l_name, | ||
455 | user_ast, | ||
456 | lockres, | ||
457 | user_bast); | ||
458 | if (status != DLM_NORMAL) { | ||
459 | if ((lkm_flags & LKM_NOQUEUE) && | ||
460 | (status == DLM_NOTQUEUED)) | ||
461 | status = -EAGAIN; | ||
462 | else { | ||
463 | user_log_dlm_error("dlmlock", status, lockres); | ||
464 | status = -EINVAL; | ||
465 | } | ||
466 | user_recover_from_dlm_error(lockres); | ||
467 | goto bail; | ||
468 | } | ||
469 | |||
470 | mlog(0, "lock %s, successfull return from dlmlock\n", | ||
471 | lockres->l_name); | ||
472 | |||
473 | user_wait_on_busy_lock(lockres); | ||
474 | goto again; | ||
475 | } | ||
476 | |||
477 | user_dlm_inc_holders(lockres, level); | ||
478 | spin_unlock(&lockres->l_lock); | ||
479 | |||
480 | mlog(0, "lockres %s: Got %s lock!\n", lockres->l_name, | ||
481 | (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE"); | ||
482 | |||
483 | status = 0; | ||
484 | bail: | ||
485 | return status; | ||
486 | } | ||
487 | |||
488 | static inline void user_dlm_dec_holders(struct user_lock_res *lockres, | ||
489 | int level) | ||
490 | { | ||
491 | switch(level) { | ||
492 | case LKM_EXMODE: | ||
493 | BUG_ON(!lockres->l_ex_holders); | ||
494 | lockres->l_ex_holders--; | ||
495 | break; | ||
496 | case LKM_PRMODE: | ||
497 | BUG_ON(!lockres->l_ro_holders); | ||
498 | lockres->l_ro_holders--; | ||
499 | break; | ||
500 | default: | ||
501 | BUG(); | ||
502 | } | ||
503 | } | ||
504 | |||
505 | void user_dlm_cluster_unlock(struct user_lock_res *lockres, | ||
506 | int level) | ||
507 | { | ||
508 | if (level != LKM_EXMODE && | ||
509 | level != LKM_PRMODE) { | ||
510 | mlog(ML_ERROR, "lockres %s: invalid request!\n", lockres->l_name); | ||
511 | return; | ||
512 | } | ||
513 | |||
514 | mlog(0, "lockres %s: dropping %s lock\n", lockres->l_name, | ||
515 | (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE"); | ||
516 | |||
517 | spin_lock(&lockres->l_lock); | ||
518 | user_dlm_dec_holders(lockres, level); | ||
519 | __user_dlm_cond_queue_lockres(lockres); | ||
520 | spin_unlock(&lockres->l_lock); | ||
521 | } | ||
522 | |||
523 | void user_dlm_write_lvb(struct inode *inode, | ||
524 | const char *val, | ||
525 | unsigned int len) | ||
526 | { | ||
527 | struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; | ||
528 | char *lvb = lockres->l_lksb.lvb; | ||
529 | |||
530 | BUG_ON(len > DLM_LVB_LEN); | ||
531 | |||
532 | spin_lock(&lockres->l_lock); | ||
533 | |||
534 | BUG_ON(lockres->l_level < LKM_EXMODE); | ||
535 | memcpy(lvb, val, len); | ||
536 | |||
537 | spin_unlock(&lockres->l_lock); | ||
538 | } | ||
539 | |||
540 | void user_dlm_read_lvb(struct inode *inode, | ||
541 | char *val, | ||
542 | unsigned int len) | ||
543 | { | ||
544 | struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; | ||
545 | char *lvb = lockres->l_lksb.lvb; | ||
546 | |||
547 | BUG_ON(len > DLM_LVB_LEN); | ||
548 | |||
549 | spin_lock(&lockres->l_lock); | ||
550 | |||
551 | BUG_ON(lockres->l_level < LKM_PRMODE); | ||
552 | memcpy(val, lvb, len); | ||
553 | |||
554 | spin_unlock(&lockres->l_lock); | ||
555 | } | ||
556 | |||
557 | void user_dlm_lock_res_init(struct user_lock_res *lockres, | ||
558 | struct dentry *dentry) | ||
559 | { | ||
560 | memset(lockres, 0, sizeof(*lockres)); | ||
561 | |||
562 | spin_lock_init(&lockres->l_lock); | ||
563 | init_waitqueue_head(&lockres->l_event); | ||
564 | lockres->l_level = LKM_IVMODE; | ||
565 | lockres->l_requested = LKM_IVMODE; | ||
566 | lockres->l_blocking = LKM_IVMODE; | ||
567 | |||
568 | /* should have been checked before getting here. */ | ||
569 | BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); | ||
570 | |||
571 | memcpy(lockres->l_name, | ||
572 | dentry->d_name.name, | ||
573 | dentry->d_name.len); | ||
574 | } | ||
575 | |||
576 | int user_dlm_destroy_lock(struct user_lock_res *lockres) | ||
577 | { | ||
578 | int status = -EBUSY; | ||
579 | struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); | ||
580 | |||
581 | mlog(0, "asked to destroy %s\n", lockres->l_name); | ||
582 | |||
583 | spin_lock(&lockres->l_lock); | ||
584 | while (lockres->l_flags & USER_LOCK_BUSY) { | ||
585 | spin_unlock(&lockres->l_lock); | ||
586 | |||
587 | mlog(0, "lock %s is busy\n", lockres->l_name); | ||
588 | |||
589 | user_wait_on_busy_lock(lockres); | ||
590 | |||
591 | spin_lock(&lockres->l_lock); | ||
592 | } | ||
593 | |||
594 | if (lockres->l_ro_holders || lockres->l_ex_holders) { | ||
595 | spin_unlock(&lockres->l_lock); | ||
596 | mlog(0, "lock %s has holders\n", lockres->l_name); | ||
597 | goto bail; | ||
598 | } | ||
599 | |||
600 | status = 0; | ||
601 | if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { | ||
602 | spin_unlock(&lockres->l_lock); | ||
603 | mlog(0, "lock %s is not attached\n", lockres->l_name); | ||
604 | goto bail; | ||
605 | } | ||
606 | |||
607 | lockres->l_flags &= ~USER_LOCK_ATTACHED; | ||
608 | lockres->l_flags |= USER_LOCK_BUSY; | ||
609 | lockres->l_flags |= USER_LOCK_IN_TEARDOWN; | ||
610 | spin_unlock(&lockres->l_lock); | ||
611 | |||
612 | mlog(0, "unlocking lockres %s\n", lockres->l_name); | ||
613 | status = dlmunlock(dlm, | ||
614 | &lockres->l_lksb, | ||
615 | LKM_VALBLK, | ||
616 | user_unlock_ast, | ||
617 | lockres); | ||
618 | if (status != DLM_NORMAL) { | ||
619 | user_log_dlm_error("dlmunlock", status, lockres); | ||
620 | status = -EINVAL; | ||
621 | goto bail; | ||
622 | } | ||
623 | |||
624 | user_wait_on_busy_lock(lockres); | ||
625 | |||
626 | status = 0; | ||
627 | bail: | ||
628 | return status; | ||
629 | } | ||
630 | |||
631 | struct dlm_ctxt *user_dlm_register_context(struct qstr *name) | ||
632 | { | ||
633 | struct dlm_ctxt *dlm; | ||
634 | u32 dlm_key; | ||
635 | char *domain; | ||
636 | |||
637 | domain = kmalloc(name->len + 1, GFP_KERNEL); | ||
638 | if (!domain) { | ||
639 | mlog_errno(-ENOMEM); | ||
640 | return ERR_PTR(-ENOMEM); | ||
641 | } | ||
642 | |||
643 | dlm_key = crc32_le(0, name->name, name->len); | ||
644 | |||
645 | snprintf(domain, name->len + 1, "%.*s", name->len, name->name); | ||
646 | |||
647 | dlm = dlm_register_domain(domain, dlm_key); | ||
648 | if (IS_ERR(dlm)) | ||
649 | mlog_errno(PTR_ERR(dlm)); | ||
650 | |||
651 | kfree(domain); | ||
652 | return dlm; | ||
653 | } | ||
654 | |||
655 | void user_dlm_unregister_context(struct dlm_ctxt *dlm) | ||
656 | { | ||
657 | dlm_unregister_domain(dlm); | ||
658 | } | ||
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlm/userdlm.h new file mode 100644 index 000000000000..04178bc40b76 --- /dev/null +++ b/fs/ocfs2/dlm/userdlm.h | |||
@@ -0,0 +1,111 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * userdlm.h | ||
5 | * | ||
6 | * Userspace dlm defines | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | |||
27 | #ifndef USERDLM_H | ||
28 | #define USERDLM_H | ||
29 | |||
30 | #include <linux/module.h> | ||
31 | #include <linux/fs.h> | ||
32 | #include <linux/types.h> | ||
33 | #include <linux/workqueue.h> | ||
34 | |||
35 | /* user_lock_res->l_flags flags. */ | ||
36 | #define USER_LOCK_ATTACHED (0x00000001) /* have we initialized | ||
37 | * the lvb */ | ||
38 | #define USER_LOCK_BUSY (0x00000002) /* we are currently in | ||
39 | * dlm_lock */ | ||
40 | #define USER_LOCK_BLOCKED (0x00000004) /* blocked waiting to | ||
41 | * downconvert */ | ||
42 | #define USER_LOCK_IN_TEARDOWN (0x00000008) /* we're currently | ||
43 | * destroying this | ||
44 | * lock. */ | ||
45 | #define USER_LOCK_QUEUED (0x00000010) /* lock is on the | ||
46 | * workqueue */ | ||
47 | #define USER_LOCK_IN_CANCEL (0x00000020) | ||
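
These flags form a small state machine. A sketch of the typical lifecycle, reconstructed from userdlm.c (not part of the original patch):

	/* user_dlm_cluster_lock():  sets BUSY, then calls dlmlock()
	 * user_ast():               sets ATTACHED, clears BUSY
	 * user_bast():              sets BLOCKED and, via the work
	 *                           queue, QUEUED
	 * user_dlm_unblock_lock():  clears QUEUED; may set BUSY (and
	 *                           IN_CANCEL when cancelling a convert)
	 * user_dlm_destroy_lock():  clears ATTACHED, sets BUSY and
	 *                           IN_TEARDOWN before the final unlock */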
48 | |||
49 | struct user_lock_res { | ||
50 | spinlock_t l_lock; | ||
51 | |||
52 | int l_flags; | ||
53 | |||
54 | #define USER_DLM_LOCK_ID_MAX_LEN 32 | ||
55 | char l_name[USER_DLM_LOCK_ID_MAX_LEN]; | ||
56 | int l_level; | ||
57 | unsigned int l_ro_holders; | ||
58 | unsigned int l_ex_holders; | ||
59 | struct dlm_lockstatus l_lksb; | ||
60 | |||
61 | int l_requested; | ||
62 | int l_blocking; | ||
63 | |||
64 | wait_queue_head_t l_event; | ||
65 | |||
66 | struct work_struct l_work; | ||
67 | }; | ||
68 | |||
69 | extern struct workqueue_struct *user_dlm_worker; | ||
70 | |||
71 | void user_dlm_lock_res_init(struct user_lock_res *lockres, | ||
72 | struct dentry *dentry); | ||
73 | int user_dlm_destroy_lock(struct user_lock_res *lockres); | ||
74 | int user_dlm_cluster_lock(struct user_lock_res *lockres, | ||
75 | int level, | ||
76 | int lkm_flags); | ||
77 | void user_dlm_cluster_unlock(struct user_lock_res *lockres, | ||
78 | int level); | ||
79 | void user_dlm_write_lvb(struct inode *inode, | ||
80 | const char *val, | ||
81 | unsigned int len); | ||
82 | void user_dlm_read_lvb(struct inode *inode, | ||
83 | char *val, | ||
84 | unsigned int len); | ||
85 | struct dlm_ctxt *user_dlm_register_context(struct qstr *name); | ||
86 | void user_dlm_unregister_context(struct dlm_ctxt *dlm); | ||
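
A hypothetical usage sketch (not from the patch) showing how a dlmfs-style caller might bracket an LVB read with a PR-mode cluster lock, using the prototypes above and the DLMFS_I() helper defined below:

	static int example_read_protected(struct inode *inode, char *buf,
					  unsigned int len)
	{
		struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
		int status;

		status = user_dlm_cluster_lock(lockres, LKM_PRMODE, 0);
		if (status < 0)
			return status;

		user_dlm_read_lvb(inode, buf, len);
		user_dlm_cluster_unlock(lockres, LKM_PRMODE);
		return 0;
	}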
87 | |||
88 | struct dlmfs_inode_private { | ||
89 | struct dlm_ctxt *ip_dlm; | ||
90 | |||
91 | struct user_lock_res ip_lockres; /* unused for directories. */ | ||
92 | struct inode *ip_parent; | ||
93 | |||
94 | struct inode ip_vfs_inode; | ||
95 | }; | ||
96 | |||
97 | static inline struct dlmfs_inode_private * | ||
98 | DLMFS_I(struct inode *inode) | ||
99 | { | ||
100 | return container_of(inode, | ||
101 | struct dlmfs_inode_private, | ||
102 | ip_vfs_inode); | ||
103 | } | ||
104 | |||
105 | struct dlmfs_filp_private { | ||
106 | int fp_lock_level; | ||
107 | }; | ||
108 | |||
109 | #define DLMFS_MAGIC 0x76a9f425 | ||
110 | |||
111 | #endif /* USERDLM_H */ | ||
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c new file mode 100644 index 000000000000..e971ec2f8407 --- /dev/null +++ b/fs/ocfs2/dlmglue.c | |||
@@ -0,0 +1,2904 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmglue.c | ||
5 | * | ||
6 | * Code which implements an OCFS2 specific interface to our DLM. | ||
7 | * | ||
8 | * Copyright (C) 2003, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/types.h> | ||
27 | #include <linux/slab.h> | ||
28 | #include <linux/highmem.h> | ||
29 | #include <linux/mm.h> | ||
30 | #include <linux/smp_lock.h> | ||
31 | #include <linux/crc32.h> | ||
32 | #include <linux/kthread.h> | ||
33 | #include <linux/pagemap.h> | ||
34 | #include <linux/debugfs.h> | ||
35 | #include <linux/seq_file.h> | ||
36 | |||
37 | #include <cluster/heartbeat.h> | ||
38 | #include <cluster/nodemanager.h> | ||
39 | #include <cluster/tcp.h> | ||
40 | |||
41 | #include <dlm/dlmapi.h> | ||
42 | |||
43 | #define MLOG_MASK_PREFIX ML_DLM_GLUE | ||
44 | #include <cluster/masklog.h> | ||
45 | |||
46 | #include "ocfs2.h" | ||
47 | |||
48 | #include "alloc.h" | ||
49 | #include "dlmglue.h" | ||
50 | #include "extent_map.h" | ||
51 | #include "heartbeat.h" | ||
52 | #include "inode.h" | ||
53 | #include "journal.h" | ||
54 | #include "slot_map.h" | ||
55 | #include "super.h" | ||
56 | #include "uptodate.h" | ||
57 | #include "vote.h" | ||
58 | |||
59 | #include "buffer_head_io.h" | ||
60 | |||
61 | struct ocfs2_mask_waiter { | ||
62 | struct list_head mw_item; | ||
63 | int mw_status; | ||
64 | struct completion mw_complete; | ||
65 | unsigned long mw_mask; | ||
66 | unsigned long mw_goal; | ||
67 | }; | ||
68 | |||
69 | static void ocfs2_inode_ast_func(void *opaque); | ||
70 | static void ocfs2_inode_bast_func(void *opaque, | ||
71 | int level); | ||
72 | static void ocfs2_super_ast_func(void *opaque); | ||
73 | static void ocfs2_super_bast_func(void *opaque, | ||
74 | int level); | ||
75 | static void ocfs2_rename_ast_func(void *opaque); | ||
76 | static void ocfs2_rename_bast_func(void *opaque, | ||
77 | int level); | ||
78 | |||
79 | /* so far, all locks have gotten along with the same unlock ast */ | ||
80 | static void ocfs2_unlock_ast_func(void *opaque, | ||
81 | enum dlm_status status); | ||
82 | static int ocfs2_do_unblock_meta(struct inode *inode, | ||
83 | int *requeue); | ||
84 | static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres, | ||
85 | int *requeue); | ||
86 | static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres, | ||
87 | int *requeue); | ||
88 | static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres, | ||
89 | int *requeue); | ||
90 | static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres, | ||
91 | int *requeue); | ||
92 | typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int); | ||
93 | static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb, | ||
94 | struct ocfs2_lock_res *lockres, | ||
95 | int *requeue, | ||
96 | ocfs2_convert_worker_t *worker); | ||
97 | |||
98 | struct ocfs2_lock_res_ops { | ||
99 | void (*ast)(void *); | ||
100 | void (*bast)(void *, int); | ||
101 | void (*unlock_ast)(void *, enum dlm_status); | ||
102 | int (*unblock)(struct ocfs2_lock_res *, int *); | ||
103 | }; | ||
104 | |||
105 | static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { | ||
106 | .ast = ocfs2_inode_ast_func, | ||
107 | .bast = ocfs2_inode_bast_func, | ||
108 | .unlock_ast = ocfs2_unlock_ast_func, | ||
109 | .unblock = ocfs2_unblock_inode_lock, | ||
110 | }; | ||
111 | |||
112 | static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { | ||
113 | .ast = ocfs2_inode_ast_func, | ||
114 | .bast = ocfs2_inode_bast_func, | ||
115 | .unlock_ast = ocfs2_unlock_ast_func, | ||
116 | .unblock = ocfs2_unblock_meta, | ||
117 | }; | ||
118 | |||
119 | static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, | ||
120 | int blocking); | ||
121 | |||
122 | static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = { | ||
123 | .ast = ocfs2_inode_ast_func, | ||
124 | .bast = ocfs2_inode_bast_func, | ||
125 | .unlock_ast = ocfs2_unlock_ast_func, | ||
126 | .unblock = ocfs2_unblock_data, | ||
127 | }; | ||
128 | |||
129 | static struct ocfs2_lock_res_ops ocfs2_super_lops = { | ||
130 | .ast = ocfs2_super_ast_func, | ||
131 | .bast = ocfs2_super_bast_func, | ||
132 | .unlock_ast = ocfs2_unlock_ast_func, | ||
133 | .unblock = ocfs2_unblock_osb_lock, | ||
134 | }; | ||
135 | |||
136 | static struct ocfs2_lock_res_ops ocfs2_rename_lops = { | ||
137 | .ast = ocfs2_rename_ast_func, | ||
138 | .bast = ocfs2_rename_bast_func, | ||
139 | .unlock_ast = ocfs2_unlock_ast_func, | ||
140 | .unblock = ocfs2_unblock_osb_lock, | ||
141 | }; | ||
142 | |||
143 | static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) | ||
144 | { | ||
145 | return lockres->l_type == OCFS2_LOCK_TYPE_META || | ||
146 | lockres->l_type == OCFS2_LOCK_TYPE_DATA || | ||
147 | lockres->l_type == OCFS2_LOCK_TYPE_RW; | ||
148 | } | ||
149 | |||
150 | static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres) | ||
151 | { | ||
152 | return lockres->l_type == OCFS2_LOCK_TYPE_SUPER; | ||
153 | } | ||
154 | |||
155 | static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres) | ||
156 | { | ||
157 | return lockres->l_type == OCFS2_LOCK_TYPE_RENAME; | ||
158 | } | ||
159 | |||
160 | static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres) | ||
161 | { | ||
162 | BUG_ON(!ocfs2_is_super_lock(lockres) | ||
163 | && !ocfs2_is_rename_lock(lockres)); | ||
164 | |||
165 | return (struct ocfs2_super *) lockres->l_priv; | ||
166 | } | ||
167 | |||
168 | static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) | ||
169 | { | ||
170 | BUG_ON(!ocfs2_is_inode_lock(lockres)); | ||
171 | |||
172 | return (struct inode *) lockres->l_priv; | ||
173 | } | ||
174 | |||
175 | static int ocfs2_lock_create(struct ocfs2_super *osb, | ||
176 | struct ocfs2_lock_res *lockres, | ||
177 | int level, | ||
178 | int dlm_flags); | ||
179 | static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, | ||
180 | int wanted); | ||
181 | static void ocfs2_cluster_unlock(struct ocfs2_super *osb, | ||
182 | struct ocfs2_lock_res *lockres, | ||
183 | int level); | ||
184 | static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres); | ||
185 | static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres); | ||
186 | static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres); | ||
187 | static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level); | ||
188 | static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, | ||
189 | struct ocfs2_lock_res *lockres); | ||
190 | static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, | ||
191 | int convert); | ||
192 | #define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \ | ||
193 | mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ | ||
194 | "resource %s: %s\n", dlm_errname(_stat), _func, \ | ||
195 | _lockres->l_name, dlm_errmsg(_stat)); \ | ||
196 | } while (0) | ||
197 | static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, | ||
198 | struct ocfs2_lock_res *lockres); | ||
199 | static int ocfs2_meta_lock_update(struct inode *inode, | ||
200 | struct buffer_head **bh); | ||
201 | static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); | ||
202 | static inline int ocfs2_highest_compat_lock_level(int level); | ||
203 | static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode, | ||
204 | struct ocfs2_lock_res *lockres, | ||
205 | int new_level); | ||
206 | |||
207 | static char *ocfs2_lock_type_strings[] = { | ||
208 | [OCFS2_LOCK_TYPE_META] = "Meta", | ||
209 | [OCFS2_LOCK_TYPE_DATA] = "Data", | ||
210 | [OCFS2_LOCK_TYPE_SUPER] = "Super", | ||
211 | [OCFS2_LOCK_TYPE_RENAME] = "Rename", | ||
212 | /* Need to differentiate from [R]ename.. serializing writes is the | ||
213 | * important job it does, anyway. */ | ||
214 | [OCFS2_LOCK_TYPE_RW] = "Write/Read", | ||
215 | }; | ||
216 | |||
217 | static char *ocfs2_lock_type_string(enum ocfs2_lock_type type) | ||
218 | { | ||
219 | mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type); | ||
220 | return ocfs2_lock_type_strings[type]; | ||
221 | } | ||
222 | |||
223 | static void ocfs2_build_lock_name(enum ocfs2_lock_type type, | ||
224 | u64 blkno, | ||
225 | u32 generation, | ||
226 | char *name) | ||
227 | { | ||
228 | int len; | ||
229 | |||
230 | mlog_entry_void(); | ||
231 | |||
232 | BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); | ||
233 | |||
234 | len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016"MLFx64"%08x", | ||
235 | ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, blkno, | ||
236 | generation); | ||
237 | |||
238 | BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1)); | ||
239 | |||
240 | mlog(0, "built lock resource with name: %s\n", name); | ||
241 | |||
242 | mlog_exit_void(); | ||
243 | } | ||
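
The name built here is fixed-width: one lock-type character, the OCFS2_LOCK_ID_PAD string, sixteen hex digits of block number, and eight of generation, which is why the length check against OCFS2_LOCK_ID_MAX_LEN - 1 can be exact. A minimal userspace sketch of that layout, assuming a 32-byte buffer and a six-zero pad (stand-ins for the real constants in ocfs2_lockid.h):

	#include <stdio.h>
	#include <stdint.h>

	#define LOCK_ID_MAX_LEN 32	/* assumed: 1 + 6 + 16 + 8 chars + NUL */
	#define LOCK_ID_PAD "000000"	/* assumed pad string */

	static void build_lock_name(char type, uint64_t blkno, uint32_t gen,
				    char *name)
	{
		/* always exactly LOCK_ID_MAX_LEN - 1 characters */
		snprintf(name, LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
			 type, LOCK_ID_PAD, (unsigned long long)blkno, gen);
	}

	int main(void)
	{
		char name[LOCK_ID_MAX_LEN];

		build_lock_name('M', 0x1234, 0xdeadbeef, name);
		printf("%s\n", name);	/* 'M', 18 zeros, "1234", "deadbeef" */
		return 0;
	}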
244 | |||
245 | static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED; | ||
246 | |||
247 | static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res, | ||
248 | struct ocfs2_dlm_debug *dlm_debug) | ||
249 | { | ||
250 | mlog(0, "Add tracking for lockres %s\n", res->l_name); | ||
251 | |||
252 | spin_lock(&ocfs2_dlm_tracking_lock); | ||
253 | list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking); | ||
254 | spin_unlock(&ocfs2_dlm_tracking_lock); | ||
255 | } | ||
256 | |||
257 | static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) | ||
258 | { | ||
259 | spin_lock(&ocfs2_dlm_tracking_lock); | ||
260 | if (!list_empty(&res->l_debug_list)) | ||
261 | list_del_init(&res->l_debug_list); | ||
262 | spin_unlock(&ocfs2_dlm_tracking_lock); | ||
263 | } | ||
264 | |||
265 | static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, | ||
266 | struct ocfs2_lock_res *res, | ||
267 | enum ocfs2_lock_type type, | ||
268 | u64 blkno, | ||
269 | u32 generation, | ||
270 | struct ocfs2_lock_res_ops *ops, | ||
271 | void *priv) | ||
272 | { | ||
273 | ocfs2_build_lock_name(type, blkno, generation, res->l_name); | ||
274 | |||
275 | res->l_type = type; | ||
276 | res->l_ops = ops; | ||
277 | res->l_priv = priv; | ||
278 | |||
279 | res->l_level = LKM_IVMODE; | ||
280 | res->l_requested = LKM_IVMODE; | ||
281 | res->l_blocking = LKM_IVMODE; | ||
282 | res->l_action = OCFS2_AST_INVALID; | ||
283 | res->l_unlock_action = OCFS2_UNLOCK_INVALID; | ||
284 | |||
285 | res->l_flags = OCFS2_LOCK_INITIALIZED; | ||
286 | |||
287 | ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); | ||
288 | } | ||
289 | |||
290 | void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) | ||
291 | { | ||
292 | /* This also clears out the lock status block */ | ||
293 | memset(res, 0, sizeof(struct ocfs2_lock_res)); | ||
294 | spin_lock_init(&res->l_lock); | ||
295 | init_waitqueue_head(&res->l_event); | ||
296 | INIT_LIST_HEAD(&res->l_blocked_list); | ||
297 | INIT_LIST_HEAD(&res->l_mask_waiters); | ||
298 | } | ||
299 | |||
300 | void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, | ||
301 | enum ocfs2_lock_type type, | ||
302 | struct inode *inode) | ||
303 | { | ||
304 | struct ocfs2_lock_res_ops *ops; | ||
305 | |||
306 | switch(type) { | ||
307 | case OCFS2_LOCK_TYPE_RW: | ||
308 | ops = &ocfs2_inode_rw_lops; | ||
309 | break; | ||
310 | case OCFS2_LOCK_TYPE_META: | ||
311 | ops = &ocfs2_inode_meta_lops; | ||
312 | break; | ||
313 | case OCFS2_LOCK_TYPE_DATA: | ||
314 | ops = &ocfs2_inode_data_lops; | ||
315 | break; | ||
316 | default: | ||
317 | mlog_bug_on_msg(1, "type: %d\n", type); | ||
318 | ops = NULL; /* thanks, gcc */ | ||
319 | break; | ||
320 | } | ||
321 | |||
322 | ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, | ||
323 | OCFS2_I(inode)->ip_blkno, | ||
324 | inode->i_generation, ops, inode); | ||
325 | } | ||
326 | |||
327 | static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res, | ||
328 | struct ocfs2_super *osb) | ||
329 | { | ||
330 | /* Superblock lockres doesn't come from a slab so we call init | ||
331 | * once on it manually. */ | ||
332 | ocfs2_lock_res_init_once(res); | ||
333 | ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER, | ||
334 | OCFS2_SUPER_BLOCK_BLKNO, 0, | ||
335 | &ocfs2_super_lops, osb); | ||
336 | } | ||
337 | |||
338 | static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, | ||
339 | struct ocfs2_super *osb) | ||
340 | { | ||
341 | /* Rename lockres doesn't come from a slab so we call init | ||
342 | * once on it manually. */ | ||
343 | ocfs2_lock_res_init_once(res); | ||
344 | ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0, | ||
345 | &ocfs2_rename_lops, osb); | ||
346 | } | ||
347 | |||
348 | void ocfs2_lock_res_free(struct ocfs2_lock_res *res) | ||
349 | { | ||
350 | mlog_entry_void(); | ||
351 | |||
352 | if (!(res->l_flags & OCFS2_LOCK_INITIALIZED)) | ||
353 | return; | ||
354 | |||
355 | ocfs2_remove_lockres_tracking(res); | ||
356 | |||
357 | mlog_bug_on_msg(!list_empty(&res->l_blocked_list), | ||
358 | "Lockres %s is on the blocked list\n", | ||
359 | res->l_name); | ||
360 | mlog_bug_on_msg(!list_empty(&res->l_mask_waiters), | ||
361 | "Lockres %s has mask waiters pending\n", | ||
362 | res->l_name); | ||
363 | mlog_bug_on_msg(spin_is_locked(&res->l_lock), | ||
364 | "Lockres %s is locked\n", | ||
365 | res->l_name); | ||
366 | mlog_bug_on_msg(res->l_ro_holders, | ||
367 | "Lockres %s has %u ro holders\n", | ||
368 | res->l_name, res->l_ro_holders); | ||
369 | mlog_bug_on_msg(res->l_ex_holders, | ||
370 | "Lockres %s has %u ex holders\n", | ||
371 | res->l_name, res->l_ex_holders); | ||
372 | |||
373 | /* Need to clear out the lock status block for the dlm */ | ||
374 | memset(&res->l_lksb, 0, sizeof(res->l_lksb)); | ||
375 | |||
376 | res->l_flags = 0UL; | ||
377 | mlog_exit_void(); | ||
378 | } | ||
379 | |||
380 | static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, | ||
381 | int level) | ||
382 | { | ||
383 | mlog_entry_void(); | ||
384 | |||
385 | BUG_ON(!lockres); | ||
386 | |||
387 | switch(level) { | ||
388 | case LKM_EXMODE: | ||
389 | lockres->l_ex_holders++; | ||
390 | break; | ||
391 | case LKM_PRMODE: | ||
392 | lockres->l_ro_holders++; | ||
393 | break; | ||
394 | default: | ||
395 | BUG(); | ||
396 | } | ||
397 | |||
398 | mlog_exit_void(); | ||
399 | } | ||
400 | |||
401 | static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, | ||
402 | int level) | ||
403 | { | ||
404 | mlog_entry_void(); | ||
405 | |||
406 | BUG_ON(!lockres); | ||
407 | |||
408 | switch(level) { | ||
409 | case LKM_EXMODE: | ||
410 | BUG_ON(!lockres->l_ex_holders); | ||
411 | lockres->l_ex_holders--; | ||
412 | break; | ||
413 | case LKM_PRMODE: | ||
414 | BUG_ON(!lockres->l_ro_holders); | ||
415 | lockres->l_ro_holders--; | ||
416 | break; | ||
417 | default: | ||
418 | BUG(); | ||
419 | } | ||
420 | mlog_exit_void(); | ||
421 | } | ||
422 | |||
423 | /* WARNING: This function lives in a world where the only three lock | ||
424 | * levels are EX, PR, and NL. It *will* have to be adjusted when more | ||
425 | * lock types are added. */ | ||
426 | static inline int ocfs2_highest_compat_lock_level(int level) | ||
427 | { | ||
428 | int new_level = LKM_EXMODE; | ||
429 | |||
430 | if (level == LKM_EXMODE) | ||
431 | new_level = LKM_NLMODE; | ||
432 | else if (level == LKM_PRMODE) | ||
433 | new_level = LKM_PRMODE; | ||
434 | return new_level; | ||
435 | } | ||
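
In other words, a holder at EX is compatible only with NL, PR shares with PR, and NL blocks nothing. A standalone check of that mapping (the mode values are assumed stand-ins for the dlm's LKM_* constants, chosen only to preserve their ordering):

	#include <assert.h>

	enum { NLMODE = 0, PRMODE = 3, EXMODE = 5 };	/* assumed ordering */

	static int highest_compat_lock_level(int level)
	{
		if (level == EXMODE)
			return NLMODE;	/* only NL coexists with EX */
		if (level == PRMODE)
			return PRMODE;	/* readers share with readers */
		return EXMODE;		/* NL blocks nothing */
	}

	int main(void)
	{
		assert(highest_compat_lock_level(EXMODE) == NLMODE);
		assert(highest_compat_lock_level(PRMODE) == PRMODE);
		assert(highest_compat_lock_level(NLMODE) == EXMODE);
		return 0;
	}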
436 | |||
437 | static void lockres_set_flags(struct ocfs2_lock_res *lockres, | ||
438 | unsigned long newflags) | ||
439 | { | ||
440 | struct list_head *pos, *tmp; | ||
441 | struct ocfs2_mask_waiter *mw; | ||
442 | |||
443 | assert_spin_locked(&lockres->l_lock); | ||
444 | |||
445 | lockres->l_flags = newflags; | ||
446 | |||
447 | list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) { | ||
448 | mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item); | ||
449 | if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) | ||
450 | continue; | ||
451 | |||
452 | list_del_init(&mw->mw_item); | ||
453 | mw->mw_status = 0; | ||
454 | complete(&mw->mw_complete); | ||
455 | } | ||
456 | } | ||
457 | static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or) | ||
458 | { | ||
459 | lockres_set_flags(lockres, lockres->l_flags | or); | ||
460 | } | ||
461 | static void lockres_clear_flags(struct ocfs2_lock_res *lockres, | ||
462 | unsigned long clear) | ||
463 | { | ||
464 | lockres_set_flags(lockres, lockres->l_flags & ~clear); | ||
465 | } | ||
466 | |||
467 | static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres) | ||
468 | { | ||
469 | mlog_entry_void(); | ||
470 | |||
471 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); | ||
472 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); | ||
473 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); | ||
474 | BUG_ON(lockres->l_blocking <= LKM_NLMODE); | ||
475 | |||
476 | lockres->l_level = lockres->l_requested; | ||
477 | if (lockres->l_level <= | ||
478 | ocfs2_highest_compat_lock_level(lockres->l_blocking)) { | ||
479 | lockres->l_blocking = LKM_NLMODE; | ||
480 | lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); | ||
481 | } | ||
482 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | ||
483 | |||
484 | mlog_exit_void(); | ||
485 | } | ||
486 | |||
487 | static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres) | ||
488 | { | ||
489 | mlog_entry_void(); | ||
490 | |||
491 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); | ||
492 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); | ||
493 | |||
494 | /* Convert from RO to EX doesn't really need anything as our | ||
495 | * information is already up to date. Convert from NL to | ||
496 | * *anything* however should mark ourselves as needing an | ||
497 | * update */ | ||
498 | if (lockres->l_level == LKM_NLMODE) | ||
499 | lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); | ||
500 | |||
501 | lockres->l_level = lockres->l_requested; | ||
502 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | ||
503 | |||
504 | mlog_exit_void(); | ||
505 | } | ||
506 | |||
507 | static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres) | ||
508 | { | ||
509 | mlog_entry_void(); | ||
510 | |||
511 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); | ||
512 | BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); | ||
513 | |||
514 | if (lockres->l_requested > LKM_NLMODE && | ||
515 | !(lockres->l_flags & OCFS2_LOCK_LOCAL)) | ||
516 | lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); | ||
517 | |||
518 | lockres->l_level = lockres->l_requested; | ||
519 | lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED); | ||
520 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | ||
521 | |||
522 | mlog_exit_void(); | ||
523 | } | ||
524 | |||
525 | static void ocfs2_inode_ast_func(void *opaque) | ||
526 | { | ||
527 | struct ocfs2_lock_res *lockres = opaque; | ||
528 | struct inode *inode; | ||
529 | struct dlm_lockstatus *lksb; | ||
530 | unsigned long flags; | ||
531 | |||
532 | mlog_entry_void(); | ||
533 | |||
534 | inode = ocfs2_lock_res_inode(lockres); | ||
535 | |||
536 | mlog(0, "AST fired for inode %"MLFu64", l_action = %u, type = %s\n", | ||
537 | OCFS2_I(inode)->ip_blkno, lockres->l_action, | ||
538 | ocfs2_lock_type_string(lockres->l_type)); | ||
539 | |||
540 | BUG_ON(!ocfs2_is_inode_lock(lockres)); | ||
541 | |||
542 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
543 | |||
544 | lksb = &(lockres->l_lksb); | ||
545 | if (lksb->status != DLM_NORMAL) { | ||
546 | mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u " | ||
547 | "on inode %"MLFu64"\n", lksb->status, | ||
548 | OCFS2_I(inode)->ip_blkno); | ||
549 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
550 | mlog_exit_void(); | ||
551 | return; | ||
552 | } | ||
553 | |||
554 | switch(lockres->l_action) { | ||
555 | case OCFS2_AST_ATTACH: | ||
556 | ocfs2_generic_handle_attach_action(lockres); | ||
557 | lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL); | ||
558 | break; | ||
559 | case OCFS2_AST_CONVERT: | ||
560 | ocfs2_generic_handle_convert_action(lockres); | ||
561 | break; | ||
562 | case OCFS2_AST_DOWNCONVERT: | ||
563 | ocfs2_generic_handle_downconvert_action(lockres); | ||
564 | break; | ||
565 | default: | ||
566 | mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " | ||
567 | "lockres flags = 0x%lx, unlock action: %u\n", | ||
568 | lockres->l_name, lockres->l_action, lockres->l_flags, | ||
569 | lockres->l_unlock_action); | ||
570 | |||
571 | BUG(); | ||
572 | } | ||
573 | |||
574 | /* data and rw locking ignores refresh flag for now. */ | ||
575 | if (lockres->l_type != OCFS2_LOCK_TYPE_META) | ||
576 | lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); | ||
577 | |||
578 | /* set it to something invalid so if we get called again we | ||
579 | * can catch it. */ | ||
580 | lockres->l_action = OCFS2_AST_INVALID; | ||
581 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
582 | wake_up(&lockres->l_event); | ||
583 | |||
584 | mlog_exit_void(); | ||
585 | } | ||
586 | |||
587 | static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, | ||
588 | int level) | ||
589 | { | ||
590 | int needs_downconvert = 0; | ||
591 | mlog_entry_void(); | ||
592 | |||
593 | assert_spin_locked(&lockres->l_lock); | ||
594 | |||
595 | lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); | ||
596 | |||
597 | if (level > lockres->l_blocking) { | ||
598 | /* only schedule a downconvert if we haven't already scheduled | ||
599 | * one that goes low enough to satisfy the level we're | ||
600 | * blocking. this also catches the case where we get | ||
601 | * duplicate BASTs */ | ||
602 | if (ocfs2_highest_compat_lock_level(level) < | ||
603 | ocfs2_highest_compat_lock_level(lockres->l_blocking)) | ||
604 | needs_downconvert = 1; | ||
605 | |||
606 | lockres->l_blocking = level; | ||
607 | } | ||
608 | |||
609 | mlog_exit(needs_downconvert); | ||
610 | return needs_downconvert; | ||
611 | } | ||
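
The level > l_blocking test plus the compatibility comparison is what filters duplicate BASTs: a second BAST only forces another downconvert when it demands a strictly lower compatible level than what is already recorded. A sketch of just that decision, with the same assumed mode ordering as above:

	#include <assert.h>

	enum { NLMODE = 0, PRMODE = 3, EXMODE = 5 };	/* assumed ordering */

	static int highest_compat(int level)
	{
		return level == EXMODE ? NLMODE :
		       level == PRMODE ? PRMODE : EXMODE;
	}

	static int handle_bast(int *blocking, int level)
	{
		int needs_downconvert = 0;

		if (level > *blocking) {
			if (highest_compat(level) < highest_compat(*blocking))
				needs_downconvert = 1;
			*blocking = level;
		}
		return needs_downconvert;
	}

	int main(void)
	{
		int blocking = NLMODE;

		assert(handle_bast(&blocking, PRMODE) == 1);	/* first BAST acts */
		assert(handle_bast(&blocking, PRMODE) == 0);	/* duplicate ignored */
		assert(handle_bast(&blocking, EXMODE) == 1);	/* stricter acts again */
		return 0;
	}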
612 | |||
613 | static void ocfs2_generic_bast_func(struct ocfs2_super *osb, | ||
614 | struct ocfs2_lock_res *lockres, | ||
615 | int level) | ||
616 | { | ||
617 | int needs_downconvert; | ||
618 | unsigned long flags; | ||
619 | |||
620 | mlog_entry_void(); | ||
621 | |||
622 | BUG_ON(level <= LKM_NLMODE); | ||
623 | |||
624 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
625 | needs_downconvert = ocfs2_generic_handle_bast(lockres, level); | ||
626 | if (needs_downconvert) | ||
627 | ocfs2_schedule_blocked_lock(osb, lockres); | ||
628 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
629 | |||
630 | ocfs2_kick_vote_thread(osb); | ||
631 | |||
632 | wake_up(&lockres->l_event); | ||
633 | mlog_exit_void(); | ||
634 | } | ||
635 | |||
636 | static void ocfs2_inode_bast_func(void *opaque, int level) | ||
637 | { | ||
638 | struct ocfs2_lock_res *lockres = opaque; | ||
639 | struct inode *inode; | ||
640 | struct ocfs2_super *osb; | ||
641 | |||
642 | mlog_entry_void(); | ||
643 | |||
644 | BUG_ON(!ocfs2_is_inode_lock(lockres)); | ||
645 | |||
646 | inode = ocfs2_lock_res_inode(lockres); | ||
647 | osb = OCFS2_SB(inode->i_sb); | ||
648 | |||
649 | mlog(0, "BAST fired for inode %"MLFu64", blocking = %d, level = %d " | ||
650 | "type = %s\n", OCFS2_I(inode)->ip_blkno, level, | ||
651 | lockres->l_level, | ||
652 | ocfs2_lock_type_string(lockres->l_type)); | ||
653 | |||
654 | ocfs2_generic_bast_func(osb, lockres, level); | ||
655 | |||
656 | mlog_exit_void(); | ||
657 | } | ||
658 | |||
659 | static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres, | ||
660 | int ignore_refresh) | ||
661 | { | ||
662 | struct dlm_lockstatus *lksb = &lockres->l_lksb; | ||
663 | unsigned long flags; | ||
664 | |||
665 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
666 | |||
667 | if (lksb->status != DLM_NORMAL) { | ||
668 | mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n", | ||
669 | lockres->l_name, lksb->status); | ||
670 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
671 | return; | ||
672 | } | ||
673 | |||
674 | switch(lockres->l_action) { | ||
675 | case OCFS2_AST_ATTACH: | ||
676 | ocfs2_generic_handle_attach_action(lockres); | ||
677 | break; | ||
678 | case OCFS2_AST_CONVERT: | ||
679 | ocfs2_generic_handle_convert_action(lockres); | ||
680 | break; | ||
681 | case OCFS2_AST_DOWNCONVERT: | ||
682 | ocfs2_generic_handle_downconvert_action(lockres); | ||
683 | break; | ||
684 | default: | ||
685 | BUG(); | ||
686 | } | ||
687 | |||
688 | if (ignore_refresh) | ||
689 | lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); | ||
690 | |||
691 | /* set it to something invalid so if we get called again we | ||
692 | * can catch it. */ | ||
693 | lockres->l_action = OCFS2_AST_INVALID; | ||
694 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
695 | |||
696 | wake_up(&lockres->l_event); | ||
697 | } | ||
698 | |||
699 | static void ocfs2_super_ast_func(void *opaque) | ||
700 | { | ||
701 | struct ocfs2_lock_res *lockres = opaque; | ||
702 | |||
703 | mlog_entry_void(); | ||
704 | mlog(0, "Superblock AST fired\n"); | ||
705 | |||
706 | BUG_ON(!ocfs2_is_super_lock(lockres)); | ||
707 | ocfs2_generic_ast_func(lockres, 0); | ||
708 | |||
709 | mlog_exit_void(); | ||
710 | } | ||
711 | |||
712 | static void ocfs2_super_bast_func(void *opaque, | ||
713 | int level) | ||
714 | { | ||
715 | struct ocfs2_lock_res *lockres = opaque; | ||
716 | struct ocfs2_super *osb; | ||
717 | |||
718 | mlog_entry_void(); | ||
719 | mlog(0, "Superblock BAST fired\n"); | ||
720 | |||
721 | BUG_ON(!ocfs2_is_super_lock(lockres)); | ||
722 | osb = ocfs2_lock_res_super(lockres); | ||
723 | ocfs2_generic_bast_func(osb, lockres, level); | ||
724 | |||
725 | mlog_exit_void(); | ||
726 | } | ||
727 | |||
728 | static void ocfs2_rename_ast_func(void *opaque) | ||
729 | { | ||
730 | struct ocfs2_lock_res *lockres = opaque; | ||
731 | |||
732 | mlog_entry_void(); | ||
733 | |||
734 | mlog(0, "Rename AST fired\n"); | ||
735 | |||
736 | BUG_ON(!ocfs2_is_rename_lock(lockres)); | ||
737 | |||
738 | ocfs2_generic_ast_func(lockres, 1); | ||
739 | |||
740 | mlog_exit_void(); | ||
741 | } | ||
742 | |||
743 | static void ocfs2_rename_bast_func(void *opaque, | ||
744 | int level) | ||
745 | { | ||
746 | struct ocfs2_lock_res *lockres = opaque; | ||
747 | struct ocfs2_super *osb; | ||
748 | |||
749 | mlog_entry_void(); | ||
750 | |||
751 | mlog(0, "Rename BAST fired\n"); | ||
752 | |||
753 | BUG_ON(!ocfs2_is_rename_lock(lockres)); | ||
754 | |||
755 | osb = ocfs2_lock_res_super(lockres); | ||
756 | ocfs2_generic_bast_func(osb, lockres, level); | ||
757 | |||
758 | mlog_exit_void(); | ||
759 | } | ||
760 | |||
761 | static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, | ||
762 | int convert) | ||
763 | { | ||
764 | unsigned long flags; | ||
765 | |||
766 | mlog_entry_void(); | ||
767 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
768 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | ||
769 | if (convert) | ||
770 | lockres->l_action = OCFS2_AST_INVALID; | ||
771 | else | ||
772 | lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; | ||
773 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
774 | |||
775 | wake_up(&lockres->l_event); | ||
776 | mlog_exit_void(); | ||
777 | } | ||
778 | |||
779 | /* Note: If we detect another process working on the lock (i.e., | ||
780 | * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller | ||
781 | * to do the right thing in that case. | ||
782 | */ | ||
783 | static int ocfs2_lock_create(struct ocfs2_super *osb, | ||
784 | struct ocfs2_lock_res *lockres, | ||
785 | int level, | ||
786 | int dlm_flags) | ||
787 | { | ||
788 | int ret = 0; | ||
789 | enum dlm_status status; | ||
790 | unsigned long flags; | ||
791 | |||
792 | mlog_entry_void(); | ||
793 | |||
794 | mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level, | ||
795 | dlm_flags); | ||
796 | |||
797 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
798 | if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) || | ||
799 | (lockres->l_flags & OCFS2_LOCK_BUSY)) { | ||
800 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
801 | goto bail; | ||
802 | } | ||
803 | |||
804 | lockres->l_action = OCFS2_AST_ATTACH; | ||
805 | lockres->l_requested = level; | ||
806 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); | ||
807 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
808 | |||
809 | status = dlmlock(osb->dlm, | ||
810 | level, | ||
811 | &lockres->l_lksb, | ||
812 | dlm_flags, | ||
813 | lockres->l_name, | ||
814 | lockres->l_ops->ast, | ||
815 | lockres, | ||
816 | lockres->l_ops->bast); | ||
817 | if (status != DLM_NORMAL) { | ||
818 | ocfs2_log_dlm_error("dlmlock", status, lockres); | ||
819 | ret = -EINVAL; | ||
820 | ocfs2_recover_from_dlm_error(lockres, 1); | ||
821 | } | ||
822 | |||
823 | mlog(0, "lock %s, successful return from dlmlock\n", lockres->l_name); | ||
824 | |||
825 | bail: | ||
826 | mlog_exit(ret); | ||
827 | return ret; | ||
828 | } | ||
829 | |||
830 | static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres, | ||
831 | int flag) | ||
832 | { | ||
833 | unsigned long flags; | ||
834 | int ret; | ||
835 | |||
836 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
837 | ret = lockres->l_flags & flag; | ||
838 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
839 | |||
840 | return ret; | ||
841 | } | ||
842 | |||
843 | static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres) | ||
844 | |||
845 | { | ||
846 | wait_event(lockres->l_event, | ||
847 | !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY)); | ||
848 | } | ||
849 | |||
850 | static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres) | ||
851 | |||
852 | { | ||
853 | wait_event(lockres->l_event, | ||
854 | !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING)); | ||
855 | } | ||
856 | |||
857 | /* predict what lock level we'll be dropping down to on behalf | ||
858 | * of another node, and return true if the currently wanted | ||
859 | * level will be compatible with it. */ | ||
860 | static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, | ||
861 | int wanted) | ||
862 | { | ||
863 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); | ||
864 | |||
865 | return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking); | ||
866 | } | ||
867 | |||
868 | static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw) | ||
869 | { | ||
870 | INIT_LIST_HEAD(&mw->mw_item); | ||
871 | init_completion(&mw->mw_complete); | ||
872 | } | ||
873 | |||
874 | static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) | ||
875 | { | ||
876 | wait_for_completion(&mw->mw_complete); | ||
877 | /* Re-arm the completion in case we want to wait on it again */ | ||
878 | INIT_COMPLETION(mw->mw_complete); | ||
879 | return mw->mw_status; | ||
880 | } | ||
881 | |||
882 | static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, | ||
883 | struct ocfs2_mask_waiter *mw, | ||
884 | unsigned long mask, | ||
885 | unsigned long goal) | ||
886 | { | ||
887 | BUG_ON(!list_empty(&mw->mw_item)); | ||
888 | |||
889 | assert_spin_locked(&lockres->l_lock); | ||
890 | |||
891 | list_add_tail(&mw->mw_item, &lockres->l_mask_waiters); | ||
892 | mw->mw_mask = mask; | ||
893 | mw->mw_goal = goal; | ||
894 | } | ||
895 | |||
896 | /* returns 0 if the mw that was removed was already satisfied, -EBUSY | ||
897 | * if the mask still hadn't reached its goal */ | ||
898 | static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, | ||
899 | struct ocfs2_mask_waiter *mw) | ||
900 | { | ||
901 | unsigned long flags; | ||
902 | int ret = 0; | ||
903 | |||
904 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
905 | if (!list_empty(&mw->mw_item)) { | ||
906 | if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) | ||
907 | ret = -EBUSY; | ||
908 | |||
909 | list_del_init(&mw->mw_item); | ||
910 | init_completion(&mw->mw_complete); | ||
911 | } | ||
912 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
913 | |||
914 | return ret; | ||
915 | |||
916 | } | ||
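
A waiter fires when (l_flags & mw_mask) == mw_goal, so the pattern used below in ocfs2_cluster_lock() -- mask OCFS2_LOCK_BUSY, goal 0 -- means "complete me once BUSY clears". A lock-free sketch of just the predicate (the flag bit values are assumed for illustration):

	#include <assert.h>

	#define LOCK_BUSY	(1UL << 0)	/* assumed bit values */
	#define LOCK_BLOCKED	(1UL << 1)

	struct mask_waiter { unsigned long mask, goal; };

	static int waiter_satisfied(unsigned long flags,
				    const struct mask_waiter *mw)
	{
		return (flags & mw->mask) == mw->goal;
	}

	int main(void)
	{
		/* "wake me when BUSY clears": mask = BUSY, goal = 0 */
		struct mask_waiter mw = { .mask = LOCK_BUSY, .goal = 0 };

		assert(!waiter_satisfied(LOCK_BUSY | LOCK_BLOCKED, &mw));
		assert(waiter_satisfied(LOCK_BLOCKED, &mw));	/* BUSY gone */
		return 0;
	}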
917 | |||
918 | static int ocfs2_cluster_lock(struct ocfs2_super *osb, | ||
919 | struct ocfs2_lock_res *lockres, | ||
920 | int level, | ||
921 | int lkm_flags, | ||
922 | int arg_flags) | ||
923 | { | ||
924 | struct ocfs2_mask_waiter mw; | ||
925 | enum dlm_status status; | ||
926 | int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); | ||
927 | int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ | ||
928 | unsigned long flags; | ||
929 | |||
930 | mlog_entry_void(); | ||
931 | |||
932 | ocfs2_init_mask_waiter(&mw); | ||
933 | |||
934 | again: | ||
935 | wait = 0; | ||
936 | |||
937 | if (catch_signals && signal_pending(current)) { | ||
938 | ret = -ERESTARTSYS; | ||
939 | goto out; | ||
940 | } | ||
941 | |||
942 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
943 | |||
944 | mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, | ||
945 | "Cluster lock called on freeing lockres %s! flags " | ||
946 | "0x%lx\n", lockres->l_name, lockres->l_flags); | ||
947 | |||
948 | /* We only compare against the currently granted level | ||
949 | * here. If the lock is blocked waiting on a downconvert, | ||
950 | * we'll get caught below. */ | ||
951 | if (lockres->l_flags & OCFS2_LOCK_BUSY && | ||
952 | level > lockres->l_level) { | ||
953 | /* is someone sitting in dlm_lock? If so, wait on | ||
954 | * them. */ | ||
955 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); | ||
956 | wait = 1; | ||
957 | goto unlock; | ||
958 | } | ||
959 | |||
960 | if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { | ||
961 | /* lock has not been created yet. */ | ||
962 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
963 | |||
964 | ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0); | ||
965 | if (ret < 0) { | ||
966 | mlog_errno(ret); | ||
967 | goto out; | ||
968 | } | ||
969 | goto again; | ||
970 | } | ||
971 | |||
972 | if (lockres->l_flags & OCFS2_LOCK_BLOCKED && | ||
973 | !ocfs2_may_continue_on_blocked_lock(lockres, level)) { | ||
974 | /* the lock is currently blocked on behalf of | ||
975 | * another node */ | ||
976 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0); | ||
977 | wait = 1; | ||
978 | goto unlock; | ||
979 | } | ||
980 | |||
981 | if (level > lockres->l_level) { | ||
982 | if (lockres->l_action != OCFS2_AST_INVALID) | ||
983 | mlog(ML_ERROR, "lockres %s has action %u pending\n", | ||
984 | lockres->l_name, lockres->l_action); | ||
985 | |||
986 | lockres->l_action = OCFS2_AST_CONVERT; | ||
987 | lockres->l_requested = level; | ||
988 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); | ||
989 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
990 | |||
991 | BUG_ON(level == LKM_IVMODE); | ||
992 | BUG_ON(level == LKM_NLMODE); | ||
993 | |||
994 | mlog(0, "lock %s, convert from %d to level = %d\n", | ||
995 | lockres->l_name, lockres->l_level, level); | ||
996 | |||
997 | /* call dlm_lock to upgrade lock now */ | ||
998 | status = dlmlock(osb->dlm, | ||
999 | level, | ||
1000 | &lockres->l_lksb, | ||
1001 | lkm_flags|LKM_CONVERT|LKM_VALBLK, | ||
1002 | lockres->l_name, | ||
1003 | lockres->l_ops->ast, | ||
1004 | lockres, | ||
1005 | lockres->l_ops->bast); | ||
1006 | if (status != DLM_NORMAL) { | ||
1007 | if ((lkm_flags & LKM_NOQUEUE) && | ||
1008 | (status == DLM_NOTQUEUED)) | ||
1009 | ret = -EAGAIN; | ||
1010 | else { | ||
1011 | ocfs2_log_dlm_error("dlmlock", status, | ||
1012 | lockres); | ||
1013 | ret = -EINVAL; | ||
1014 | } | ||
1015 | ocfs2_recover_from_dlm_error(lockres, 1); | ||
1016 | goto out; | ||
1017 | } | ||
1018 | |||
1019 | mlog(0, "lock %s, successful return from dlmlock\n", | ||
1020 | lockres->l_name); | ||
1021 | |||
1022 | /* At this point we've gone inside the dlm and need to | ||
1023 | * complete our work regardless. */ | ||
1024 | catch_signals = 0; | ||
1025 | |||
1026 | /* wait for busy to clear and carry on */ | ||
1027 | goto again; | ||
1028 | } | ||
1029 | |||
1030 | /* Ok, if we get here then we're good to go. */ | ||
1031 | ocfs2_inc_holders(lockres, level); | ||
1032 | |||
1033 | ret = 0; | ||
1034 | unlock: | ||
1035 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1036 | out: | ||
1037 | /* | ||
1038 | * This is helping work around a lock inversion between the page lock | ||
1039 | * and dlm locks. One path holds the page lock while calling aops | ||
1040 | * which block acquiring dlm locks. The voting thread holds dlm | ||
1041 | * locks while acquiring page locks while down converting data locks. | ||
1042 | * This block is helping an aop path notice the inversion and back | ||
1043 | * off to unlock its page lock before trying the dlm lock again. | ||
1044 | */ | ||
1045 | if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && | ||
1046 | mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { | ||
1047 | wait = 0; | ||
1048 | if (lockres_remove_mask_waiter(lockres, &mw)) | ||
1049 | ret = -EAGAIN; | ||
1050 | else | ||
1051 | goto again; | ||
1052 | } | ||
1053 | if (wait) { | ||
1054 | ret = ocfs2_wait_for_mask(&mw); | ||
1055 | if (ret == 0) | ||
1056 | goto again; | ||
1057 | mlog_errno(ret); | ||
1058 | } | ||
1059 | |||
1060 | mlog_exit(ret); | ||
1061 | return ret; | ||
1062 | } | ||
1063 | |||
1064 | static void ocfs2_cluster_unlock(struct ocfs2_super *osb, | ||
1065 | struct ocfs2_lock_res *lockres, | ||
1066 | int level) | ||
1067 | { | ||
1068 | unsigned long flags; | ||
1069 | |||
1070 | mlog_entry_void(); | ||
1071 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
1072 | ocfs2_dec_holders(lockres, level); | ||
1073 | ocfs2_vote_on_unlock(osb, lockres); | ||
1074 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1075 | mlog_exit_void(); | ||
1076 | } | ||
1077 | |||
1078 | static int ocfs2_create_new_inode_lock(struct inode *inode, | ||
1079 | struct ocfs2_lock_res *lockres) | ||
1080 | { | ||
1081 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1082 | unsigned long flags; | ||
1083 | |||
1084 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
1085 | BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); | ||
1086 | lockres_or_flags(lockres, OCFS2_LOCK_LOCAL); | ||
1087 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1088 | |||
1089 | return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL); | ||
1090 | } | ||
1091 | |||
1092 | /* Grants us an EX lock on the data and metadata resources, skipping | ||
1093 | * the normal cluster directory lookup. Use this ONLY on newly created | ||
1094 | * inodes which other nodes can't possibly see, and which haven't been | ||
1095 | * hashed in the inode hash yet. This can give us a good performance | ||
1096 | * increase as it'll skip the network broadcast normally associated | ||
1097 | * with creating a new lock resource. */ | ||
1098 | int ocfs2_create_new_inode_locks(struct inode *inode) | ||
1099 | { | ||
1100 | int ret; | ||
1101 | |||
1102 | BUG_ON(!inode); | ||
1103 | BUG_ON(!ocfs2_inode_is_new(inode)); | ||
1104 | |||
1105 | mlog_entry_void(); | ||
1106 | |||
1107 | mlog(0, "Inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); | ||
1108 | |||
1109 | /* Note that we don't increment any of the holder counts, nor | ||
1110 | * do we add anything to a journal handle. Since this is | ||
1111 | * supposed to be a new inode which the cluster doesn't know | ||
1112 | * about yet, there is no need to. As far as the LVB handling | ||
1113 | * is concerned, this is basically like acquiring an EX lock | ||
1114 | * on a resource which has an invalid one -- we'll set it | ||
1115 | * valid when we release the EX. */ | ||
1116 | |||
1117 | ret = ocfs2_create_new_inode_lock(inode, | ||
1118 | &OCFS2_I(inode)->ip_rw_lockres); | ||
1119 | if (ret) { | ||
1120 | mlog_errno(ret); | ||
1121 | goto bail; | ||
1122 | } | ||
1123 | |||
1124 | ret = ocfs2_create_new_inode_lock(inode, | ||
1125 | &OCFS2_I(inode)->ip_meta_lockres); | ||
1126 | if (ret) { | ||
1127 | mlog_errno(ret); | ||
1128 | goto bail; | ||
1129 | } | ||
1130 | |||
1131 | ret = ocfs2_create_new_inode_lock(inode, | ||
1132 | &OCFS2_I(inode)->ip_data_lockres); | ||
1133 | if (ret) { | ||
1134 | mlog_errno(ret); | ||
1135 | goto bail; | ||
1136 | } | ||
1137 | |||
1138 | bail: | ||
1139 | mlog_exit(ret); | ||
1140 | return ret; | ||
1141 | } | ||
1142 | |||
1143 | int ocfs2_rw_lock(struct inode *inode, int write) | ||
1144 | { | ||
1145 | int status, level; | ||
1146 | struct ocfs2_lock_res *lockres; | ||
1147 | |||
1148 | BUG_ON(!inode); | ||
1149 | |||
1150 | mlog_entry_void(); | ||
1151 | |||
1152 | mlog(0, "inode %"MLFu64" take %s RW lock\n", | ||
1153 | OCFS2_I(inode)->ip_blkno, | ||
1154 | write ? "EXMODE" : "PRMODE"); | ||
1155 | |||
1156 | lockres = &OCFS2_I(inode)->ip_rw_lockres; | ||
1157 | |||
1158 | level = write ? LKM_EXMODE : LKM_PRMODE; | ||
1159 | |||
1160 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, | ||
1161 | 0); | ||
1162 | if (status < 0) | ||
1163 | mlog_errno(status); | ||
1164 | |||
1165 | mlog_exit(status); | ||
1166 | return status; | ||
1167 | } | ||
1168 | |||
1169 | void ocfs2_rw_unlock(struct inode *inode, int write) | ||
1170 | { | ||
1171 | int level = write ? LKM_EXMODE : LKM_PRMODE; | ||
1172 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; | ||
1173 | |||
1174 | mlog_entry_void(); | ||
1175 | |||
1176 | mlog(0, "inode %"MLFu64" drop %s RW lock\n", | ||
1177 | OCFS2_I(inode)->ip_blkno, | ||
1178 | write ? "EXMODE" : "PRMODE"); | ||
1179 | |||
1180 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); | ||
1181 | |||
1182 | mlog_exit_void(); | ||
1183 | } | ||
1184 | |||
1185 | int ocfs2_data_lock_full(struct inode *inode, | ||
1186 | int write, | ||
1187 | int arg_flags) | ||
1188 | { | ||
1189 | int status = 0, level; | ||
1190 | struct ocfs2_lock_res *lockres; | ||
1191 | |||
1192 | BUG_ON(!inode); | ||
1193 | |||
1194 | mlog_entry_void(); | ||
1195 | |||
1196 | mlog(0, "inode %"MLFu64" take %s DATA lock\n", | ||
1197 | OCFS2_I(inode)->ip_blkno, | ||
1198 | write ? "EXMODE" : "PRMODE"); | ||
1199 | |||
1200 | /* We'll allow faking a readonly data lock for | ||
1201 | * read-only devices. */ | ||
1202 | if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) { | ||
1203 | if (write) { | ||
1204 | status = -EROFS; | ||
1205 | mlog_errno(status); | ||
1206 | } | ||
1207 | goto out; | ||
1208 | } | ||
1209 | |||
1210 | lockres = &OCFS2_I(inode)->ip_data_lockres; | ||
1211 | |||
1212 | level = write ? LKM_EXMODE : LKM_PRMODE; | ||
1213 | |||
1214 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, | ||
1215 | 0, arg_flags); | ||
1216 | if (status < 0 && status != -EAGAIN) | ||
1217 | mlog_errno(status); | ||
1218 | |||
1219 | out: | ||
1220 | mlog_exit(status); | ||
1221 | return status; | ||
1222 | } | ||
1223 | |||
1224 | /* see ocfs2_meta_lock_with_page() */ | ||
1225 | int ocfs2_data_lock_with_page(struct inode *inode, | ||
1226 | int write, | ||
1227 | struct page *page) | ||
1228 | { | ||
1229 | int ret; | ||
1230 | |||
1231 | ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK); | ||
1232 | if (ret == -EAGAIN) { | ||
1233 | unlock_page(page); | ||
1234 | if (ocfs2_data_lock(inode, write) == 0) | ||
1235 | ocfs2_data_unlock(inode, write); | ||
1236 | ret = AOP_TRUNCATED_PAGE; | ||
1237 | } | ||
1238 | |||
1239 | return ret; | ||
1240 | } | ||
1241 | |||
1242 | static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, | ||
1243 | struct ocfs2_lock_res *lockres) | ||
1244 | { | ||
1245 | int kick = 0; | ||
1246 | |||
1247 | mlog_entry_void(); | ||
1248 | |||
1249 | /* If we know that another node is waiting on our lock, kick | ||
1250 | * the vote thread pre-emptively when we reach a release | ||
1251 | * condition. */ | ||
1252 | if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { | ||
1253 | switch(lockres->l_blocking) { | ||
1254 | case LKM_EXMODE: | ||
1255 | if (!lockres->l_ex_holders && !lockres->l_ro_holders) | ||
1256 | kick = 1; | ||
1257 | break; | ||
1258 | case LKM_PRMODE: | ||
1259 | if (!lockres->l_ex_holders) | ||
1260 | kick = 1; | ||
1261 | break; | ||
1262 | default: | ||
1263 | BUG(); | ||
1264 | } | ||
1265 | } | ||
1266 | |||
1267 | if (kick) | ||
1268 | ocfs2_kick_vote_thread(osb); | ||
1269 | |||
1270 | mlog_exit_void(); | ||
1271 | } | ||
1272 | |||
1273 | void ocfs2_data_unlock(struct inode *inode, | ||
1274 | int write) | ||
1275 | { | ||
1276 | int level = write ? LKM_EXMODE : LKM_PRMODE; | ||
1277 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres; | ||
1278 | |||
1279 | mlog_entry_void(); | ||
1280 | |||
1281 | mlog(0, "inode %"MLFu64" drop %s DATA lock\n", | ||
1282 | OCFS2_I(inode)->ip_blkno, | ||
1283 | write ? "EXMODE" : "PRMODE"); | ||
1284 | |||
1285 | if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) | ||
1286 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); | ||
1287 | |||
1288 | mlog_exit_void(); | ||
1289 | } | ||
1290 | |||
1291 | #define OCFS2_SEC_BITS 34 | ||
1292 | #define OCFS2_SEC_SHIFT (64 - 34) | ||
1293 | #define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1) | ||
1294 | |||
1295 | /* LVB only has room for 64 bits of time here so we pack it for | ||
1296 | * now. */ | ||
1297 | static u64 ocfs2_pack_timespec(struct timespec *spec) | ||
1298 | { | ||
1299 | u64 res; | ||
1300 | u64 sec = spec->tv_sec; | ||
1301 | u32 nsec = spec->tv_nsec; | ||
1302 | |||
1303 | res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK); | ||
1304 | |||
1305 | return res; | ||
1306 | } | ||
1307 | |||
1308 | /* Call this with the lockres locked. I am reasonably sure we don't | ||
1309 | * need ip_lock in this function as anyone who would be changing those | ||
1310 | * values is supposed to be blocked in ocfs2_meta_lock right now. */ | ||
1311 | static void __ocfs2_stuff_meta_lvb(struct inode *inode) | ||
1312 | { | ||
1313 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
1314 | struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; | ||
1315 | struct ocfs2_meta_lvb *lvb; | ||
1316 | |||
1317 | mlog_entry_void(); | ||
1318 | |||
1319 | lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | ||
1320 | |||
1321 | lvb->lvb_version = cpu_to_be32(OCFS2_LVB_VERSION); | ||
1322 | lvb->lvb_isize = cpu_to_be64(i_size_read(inode)); | ||
1323 | lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); | ||
1324 | lvb->lvb_iuid = cpu_to_be32(inode->i_uid); | ||
1325 | lvb->lvb_igid = cpu_to_be32(inode->i_gid); | ||
1326 | lvb->lvb_imode = cpu_to_be16(inode->i_mode); | ||
1327 | lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); | ||
1328 | lvb->lvb_iatime_packed = | ||
1329 | cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); | ||
1330 | lvb->lvb_ictime_packed = | ||
1331 | cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); | ||
1332 | lvb->lvb_imtime_packed = | ||
1333 | cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); | ||
1334 | |||
1335 | mlog_meta_lvb(0, lockres); | ||
1336 | |||
1337 | mlog_exit_void(); | ||
1338 | } | ||
1339 | |||
1340 | static void ocfs2_unpack_timespec(struct timespec *spec, | ||
1341 | u64 packed_time) | ||
1342 | { | ||
1343 | spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT; | ||
1344 | spec->tv_nsec = packed_time & OCFS2_NSEC_MASK; | ||
1345 | } | ||
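
The split gives 34 bits of seconds (good for several centuries past 1970) and 30 bits for nanoseconds, which always fit since 10^9 < 2^30. A round-trip sketch of the packing:

	#include <assert.h>
	#include <stdint.h>

	#define SEC_SHIFT	(64 - 34)		/* 30 bits left for nsec */
	#define NSEC_MASK	((1ULL << SEC_SHIFT) - 1)

	static uint64_t pack_time(uint64_t sec, uint32_t nsec)
	{
		return (sec << SEC_SHIFT) | (nsec & NSEC_MASK);
	}

	int main(void)
	{
		uint64_t packed = pack_time(1134000000ULL, 999999999U);

		/* both fields survive the round trip unmodified */
		assert((packed >> SEC_SHIFT) == 1134000000ULL);
		assert((packed & NSEC_MASK) == 999999999U);
		return 0;
	}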
1346 | |||
1347 | static void ocfs2_refresh_inode_from_lvb(struct inode *inode) | ||
1348 | { | ||
1349 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
1350 | struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; | ||
1351 | struct ocfs2_meta_lvb *lvb; | ||
1352 | |||
1353 | mlog_entry_void(); | ||
1354 | |||
1355 | mlog_meta_lvb(0, lockres); | ||
1356 | |||
1357 | lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | ||
1358 | |||
1359 | /* We're safe here without the lockres lock... */ | ||
1360 | spin_lock(&oi->ip_lock); | ||
1361 | oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters); | ||
1362 | i_size_write(inode, be64_to_cpu(lvb->lvb_isize)); | ||
1363 | |||
1364 | /* fast-symlinks are a special case */ | ||
1365 | if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) | ||
1366 | inode->i_blocks = 0; | ||
1367 | else | ||
1368 | inode->i_blocks = | ||
1369 | ocfs2_align_bytes_to_sectors(i_size_read(inode)); | ||
1370 | |||
1371 | inode->i_uid = be32_to_cpu(lvb->lvb_iuid); | ||
1372 | inode->i_gid = be32_to_cpu(lvb->lvb_igid); | ||
1373 | inode->i_mode = be16_to_cpu(lvb->lvb_imode); | ||
1374 | inode->i_nlink = be16_to_cpu(lvb->lvb_inlink); | ||
1375 | ocfs2_unpack_timespec(&inode->i_atime, | ||
1376 | be64_to_cpu(lvb->lvb_iatime_packed)); | ||
1377 | ocfs2_unpack_timespec(&inode->i_mtime, | ||
1378 | be64_to_cpu(lvb->lvb_imtime_packed)); | ||
1379 | ocfs2_unpack_timespec(&inode->i_ctime, | ||
1380 | be64_to_cpu(lvb->lvb_ictime_packed)); | ||
1381 | spin_unlock(&oi->ip_lock); | ||
1382 | |||
1383 | mlog_exit_void(); | ||
1384 | } | ||
1385 | |||
1386 | static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres) | ||
1387 | { | ||
1388 | struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | ||
1389 | |||
1390 | if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION) | ||
1391 | return 1; | ||
1392 | return 0; | ||
1393 | } | ||
1394 | |||
1395 | /* Determine whether a lock resource needs to be refreshed, and | ||
1396 | * arbitrate who gets to refresh it. | ||
1397 | * | ||
1398 | * 0 means no refresh needed. | ||
1399 | * | ||
1400 | * > 0 means you need to refresh this and you MUST call | ||
1401 | * ocfs2_complete_lock_res_refresh afterwards. */ | ||
1402 | static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres) | ||
1403 | { | ||
1404 | unsigned long flags; | ||
1405 | int status = 0; | ||
1406 | |||
1407 | mlog_entry_void(); | ||
1408 | |||
1409 | refresh_check: | ||
1410 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
1411 | if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) { | ||
1412 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1413 | goto bail; | ||
1414 | } | ||
1415 | |||
1416 | if (lockres->l_flags & OCFS2_LOCK_REFRESHING) { | ||
1417 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1418 | |||
1419 | ocfs2_wait_on_refreshing_lock(lockres); | ||
1420 | goto refresh_check; | ||
1421 | } | ||
1422 | |||
1423 | /* Ok, I'll be the one to refresh this lock. */ | ||
1424 | lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING); | ||
1425 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1426 | |||
1427 | status = 1; | ||
1428 | bail: | ||
1429 | mlog_exit(status); | ||
1430 | return status; | ||
1431 | } | ||
1432 | |||
1433 | /* If status is nonzero, I'll mark it as not being in refresh | ||
1434 | * anymore, but I won't clear the needs refresh flag. */ | ||
1435 | static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres, | ||
1436 | int status) | ||
1437 | { | ||
1438 | unsigned long flags; | ||
1439 | mlog_entry_void(); | ||
1440 | |||
1441 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
1442 | lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING); | ||
1443 | if (!status) | ||
1444 | lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); | ||
1445 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
1446 | |||
1447 | wake_up(&lockres->l_event); | ||
1448 | |||
1449 | mlog_exit_void(); | ||
1450 | } | ||
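
Callers such as ocfs2_meta_lock_update() and ocfs2_super_lock() below always use the two calls as a pair: a positive return from the arbitration obliges the caller to do the refresh and then report the outcome, and only a success report clears NEEDS_REFRESH, so a failed refresh gets retried by someone else. A single-task userspace sketch of that contract (struct and helpers are hypothetical stand-ins):

	#include <stdio.h>

	struct lockres { int needs_refresh, refreshing; };

	static int should_refresh(struct lockres *res)
	{
		if (!res->needs_refresh)
			return 0;		/* nothing to do */
		/* the real code also waits here while another task
		 * holds the REFRESHING flag */
		res->refreshing = 1;
		return 1;			/* caller now owns the refresh */
	}

	static void complete_refresh(struct lockres *res, int status)
	{
		res->refreshing = 0;
		if (!status)
			res->needs_refresh = 0;	/* clear only on success */
	}

	int main(void)
	{
		struct lockres res = { .needs_refresh = 1 };

		if (should_refresh(&res) > 0)
			complete_refresh(&res, 0);	/* pretend the read worked */

		printf("needs_refresh=%d\n", res.needs_refresh);	/* prints 0 */
		return 0;
	}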
1451 | |||
1452 | /* may or may not return a bh if it went to disk. */ | ||
1453 | static int ocfs2_meta_lock_update(struct inode *inode, | ||
1454 | struct buffer_head **bh) | ||
1455 | { | ||
1456 | int status = 0; | ||
1457 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
1458 | struct ocfs2_lock_res *lockres; | ||
1459 | struct ocfs2_dinode *fe; | ||
1460 | |||
1461 | mlog_entry_void(); | ||
1462 | |||
1463 | spin_lock(&oi->ip_lock); | ||
1464 | if (oi->ip_flags & OCFS2_INODE_DELETED) { | ||
1465 | mlog(0, "Orphaned inode %"MLFu64" was deleted while we " | ||
1466 | "were waiting on a lock. ip_flags = 0x%x\n", | ||
1467 | oi->ip_blkno, oi->ip_flags); | ||
1468 | spin_unlock(&oi->ip_lock); | ||
1469 | status = -ENOENT; | ||
1470 | goto bail; | ||
1471 | } | ||
1472 | spin_unlock(&oi->ip_lock); | ||
1473 | |||
1474 | lockres = &oi->ip_meta_lockres; | ||
1475 | |||
1476 | if (!ocfs2_should_refresh_lock_res(lockres)) | ||
1477 | goto bail; | ||
1478 | |||
1479 | /* This will discard any caching information we might have had | ||
1480 | * for the inode metadata. */ | ||
1481 | ocfs2_metadata_cache_purge(inode); | ||
1482 | |||
1483 | /* will do nothing for inode types that don't use the extent | ||
1484 | * map (directories, bitmap files, etc) */ | ||
1485 | ocfs2_extent_map_trunc(inode, 0); | ||
1486 | |||
1487 | if (ocfs2_meta_lvb_is_trustable(lockres)) { | ||
1488 | mlog(0, "Trusting LVB on inode %"MLFu64"\n", | ||
1489 | oi->ip_blkno); | ||
1490 | ocfs2_refresh_inode_from_lvb(inode); | ||
1491 | } else { | ||
1492 | /* Boo, we have to go to disk. */ | ||
1493 | /* read bh, cast, ocfs2_refresh_inode */ | ||
1494 | status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno, | ||
1495 | bh, OCFS2_BH_CACHED, inode); | ||
1496 | if (status < 0) { | ||
1497 | mlog_errno(status); | ||
1498 | goto bail_refresh; | ||
1499 | } | ||
1500 | fe = (struct ocfs2_dinode *) (*bh)->b_data; | ||
1501 | |||
1502 | /* This is a good chance to make sure we're not | ||
1503 | * locking an invalid object. | ||
1504 | * | ||
1505 | * We bug on a stale inode here because we checked | ||
1506 | * above whether it was wiped from disk. The wiping | ||
1507 | * node provides a guarantee that we receive that | ||
1508 | * message and can mark the inode before dropping any | ||
1509 | * locks associated with it. */ | ||
1510 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
1511 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | ||
1512 | status = -EIO; | ||
1513 | goto bail_refresh; | ||
1514 | } | ||
1515 | mlog_bug_on_msg(inode->i_generation != | ||
1516 | le32_to_cpu(fe->i_generation), | ||
1517 | "Invalid dinode %"MLFu64" disk generation: %u " | ||
1518 | "inode->i_generation: %u\n", | ||
1519 | oi->ip_blkno, le32_to_cpu(fe->i_generation), | ||
1520 | inode->i_generation); | ||
1521 | mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) || | ||
1522 | !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)), | ||
1523 | "Stale dinode %"MLFu64" dtime: %"MLFu64" " | ||
1524 | "flags: 0x%x\n", oi->ip_blkno, | ||
1525 | le64_to_cpu(fe->i_dtime), | ||
1526 | le32_to_cpu(fe->i_flags)); | ||
1527 | |||
1528 | ocfs2_refresh_inode(inode, fe); | ||
1529 | } | ||
1530 | |||
1531 | status = 0; | ||
1532 | bail_refresh: | ||
1533 | ocfs2_complete_lock_res_refresh(lockres, status); | ||
1534 | bail: | ||
1535 | mlog_exit(status); | ||
1536 | return status; | ||
1537 | } | ||
1538 | |||
1539 | static int ocfs2_assign_bh(struct inode *inode, | ||
1540 | struct buffer_head **ret_bh, | ||
1541 | struct buffer_head *passed_bh) | ||
1542 | { | ||
1543 | int status; | ||
1544 | |||
1545 | if (passed_bh) { | ||
1546 | /* Ok, the update went to disk for us, use the | ||
1547 | * returned bh. */ | ||
1548 | *ret_bh = passed_bh; | ||
1549 | get_bh(*ret_bh); | ||
1550 | |||
1551 | return 0; | ||
1552 | } | ||
1553 | |||
1554 | status = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
1555 | OCFS2_I(inode)->ip_blkno, | ||
1556 | ret_bh, | ||
1557 | OCFS2_BH_CACHED, | ||
1558 | inode); | ||
1559 | if (status < 0) | ||
1560 | mlog_errno(status); | ||
1561 | |||
1562 | return status; | ||
1563 | } | ||
1564 | |||
1565 | /* | ||
1566 | * returns < 0 error if the callback will never be called, otherwise | ||
1567 | * the result of the lock will be communicated via the callback. | ||
1568 | */ | ||
1569 | int ocfs2_meta_lock_full(struct inode *inode, | ||
1570 | struct ocfs2_journal_handle *handle, | ||
1571 | struct buffer_head **ret_bh, | ||
1572 | int ex, | ||
1573 | int arg_flags) | ||
1574 | { | ||
1575 | int status, level, dlm_flags, acquired; | ||
1576 | struct ocfs2_lock_res *lockres; | ||
1577 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1578 | struct buffer_head *local_bh = NULL; | ||
1579 | |||
1580 | BUG_ON(!inode); | ||
1581 | |||
1582 | mlog_entry_void(); | ||
1583 | |||
1584 | mlog(0, "inode %"MLFu64", take %s META lock\n", | ||
1585 | OCFS2_I(inode)->ip_blkno, | ||
1586 | ex ? "EXMODE" : "PRMODE"); | ||
1587 | |||
1588 | status = 0; | ||
1589 | acquired = 0; | ||
1590 | /* We'll allow faking a readonly metadata lock for | ||
1591 | * read-only devices. */ | ||
1592 | if (ocfs2_is_hard_readonly(osb)) { | ||
1593 | if (ex) | ||
1594 | status = -EROFS; | ||
1595 | goto bail; | ||
1596 | } | ||
1597 | |||
1598 | if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) | ||
1599 | wait_event(osb->recovery_event, | ||
1600 | ocfs2_node_map_is_empty(osb, &osb->recovery_map)); | ||
1601 | |||
1602 | acquired = 0; | ||
1603 | lockres = &OCFS2_I(inode)->ip_meta_lockres; | ||
1604 | level = ex ? LKM_EXMODE : LKM_PRMODE; | ||
1605 | dlm_flags = 0; | ||
1606 | if (arg_flags & OCFS2_META_LOCK_NOQUEUE) | ||
1607 | dlm_flags |= LKM_NOQUEUE; | ||
1608 | |||
1609 | status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags); | ||
1610 | if (status < 0) { | ||
1611 | if (status != -EAGAIN && status != -EIOCBRETRY) | ||
1612 | mlog_errno(status); | ||
1613 | goto bail; | ||
1614 | } | ||
1615 | |||
1616 | /* Notify the error cleanup path to drop the cluster lock. */ | ||
1617 | acquired = 1; | ||
1618 | |||
1619 | /* We wait twice because a node may have died while we were in | ||
1620 | * the lower dlm layers. The second time though, we've | ||
1621 | * committed to owning this lock so we don't allow signals to | ||
1622 | * abort the operation. */ | ||
1623 | if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) | ||
1624 | wait_event(osb->recovery_event, | ||
1625 | ocfs2_node_map_is_empty(osb, &osb->recovery_map)); | ||
1626 | |||
1627 | /* This is fun. The caller may want a bh back, or it may | ||
1628 | * not. ocfs2_meta_lock_update definitely wants one in, but | ||
1629 | * may or may not read one, depending on what's in the | ||
1630 | * LVB. The result of all of this is that we've *only* gone to | ||
1631 | * disk if we have to, so the complexity is worthwhile. */ | ||
1632 | status = ocfs2_meta_lock_update(inode, &local_bh); | ||
1633 | if (status < 0) { | ||
1634 | if (status != -ENOENT) | ||
1635 | mlog_errno(status); | ||
1636 | goto bail; | ||
1637 | } | ||
1638 | |||
1639 | if (ret_bh) { | ||
1640 | status = ocfs2_assign_bh(inode, ret_bh, local_bh); | ||
1641 | if (status < 0) { | ||
1642 | mlog_errno(status); | ||
1643 | goto bail; | ||
1644 | } | ||
1645 | } | ||
1646 | |||
1647 | if (handle) { | ||
1648 | status = ocfs2_handle_add_lock(handle, inode); | ||
1649 | if (status < 0) | ||
1650 | mlog_errno(status); | ||
1651 | } | ||
1652 | |||
1653 | bail: | ||
1654 | if (status < 0) { | ||
1655 | if (ret_bh && (*ret_bh)) { | ||
1656 | brelse(*ret_bh); | ||
1657 | *ret_bh = NULL; | ||
1658 | } | ||
1659 | if (acquired) | ||
1660 | ocfs2_meta_unlock(inode, ex); | ||
1661 | } | ||
1662 | |||
1663 | if (local_bh) | ||
1664 | brelse(local_bh); | ||
1665 | |||
1666 | mlog_exit(status); | ||
1667 | return status; | ||
1668 | } | ||
1669 | |||
1670 | /* | ||
1671 | * This is working around a lock inversion between tasks acquiring DLM locks | ||
1672 | * while holding a page lock and the vote thread, which blocks dlm lock | ||
1673 | * acquisition while acquiring page locks. | ||
1674 | * | ||
1675 | * ** These _with_page variants are only intended to be called from aop | ||
1676 | * methods that hold page locks and return a very specific *positive* error | ||
1677 | * code that aop methods pass up to the VFS -- test for errors with != 0. ** | ||
1678 | * | ||
1679 | * The DLM is called such that it returns -EAGAIN if it would have blocked | ||
1680 | * waiting for the vote thread. In that case we unlock our page so the vote | ||
1681 | * thread can make progress. Once we've done this we have to return | ||
1682 | * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up | ||
1683 | * into the VFS who will then immediately retry the aop call. | ||
1684 | * | ||
1685 | * We do a blocking lock and immediate unlock before returning, though, so that | ||
1686 | * the lock has a great chance of being cached on this node by the time the VFS | ||
1687 | * calls back to retry the aop. This has the potential to livelock as | ||
1688 | * nodes ping locks back and forth, but that's a risk we're willing to | ||
1689 | * take in exchange for a simple way around the lock inversion. | ||
1690 | */ | ||
1691 | int ocfs2_meta_lock_with_page(struct inode *inode, | ||
1692 | struct ocfs2_journal_handle *handle, | ||
1693 | struct buffer_head **ret_bh, | ||
1694 | int ex, | ||
1695 | struct page *page) | ||
1696 | { | ||
1697 | int ret; | ||
1698 | |||
1699 | ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex, | ||
1700 | OCFS2_LOCK_NONBLOCK); | ||
1701 | if (ret == -EAGAIN) { | ||
1702 | unlock_page(page); | ||
1703 | if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0) | ||
1704 | ocfs2_meta_unlock(inode, ex); | ||
1705 | ret = AOP_TRUNCATED_PAGE; | ||
1706 | } | ||
1707 | |||
1708 | return ret; | ||
1709 | } | ||
1710 | |||
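To make the retry protocol above concrete, here is a minimal sketch of an aop method built on the _with_page variant (ocfs2_readpage in aops.c has this shape; the function name and the elided page-filling step are illustrative only):

        static int my_readpage(struct file *file, struct page *page)
        {
                struct inode *inode = page->mapping->host;
                int ret;

                /* Nonblocking cluster lock.  On -EAGAIN the helper has
                 * already unlocked the page and converted the result to
                 * AOP_TRUNCATED_PAGE for us. */
                ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
                if (ret != 0) {
                        /* AOP_TRUNCATED_PAGE arrives with the page already
                         * unlocked; a real (negative) error does not. */
                        if (ret != AOP_TRUNCATED_PAGE)
                                unlock_page(page);
                        return ret;
                }

                /* ... fill the page under the PR meta lock ... */

                ocfs2_meta_unlock(inode, 0);
                unlock_page(page);
                return 0;
        }

Note the asymmetry: the positive AOP_TRUNCATED_PAGE is passed straight up so the VFS re-locks the page and retries, which is why the comment above insists on testing the return with != 0 rather than < 0.
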
1711 | void ocfs2_meta_unlock(struct inode *inode, | ||
1712 | int ex) | ||
1713 | { | ||
1714 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | ||
1715 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; | ||
1716 | |||
1717 | mlog_entry_void(); | ||
1718 | |||
1719 | mlog(0, "inode %"MLFu64" drop %s META lock\n", | ||
1720 | OCFS2_I(inode)->ip_blkno, | ||
1721 | ex ? "EXMODE" : "PRMODE"); | ||
1722 | |||
1723 | if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) | ||
1724 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); | ||
1725 | |||
1726 | mlog_exit_void(); | ||
1727 | } | ||
1728 | |||
1729 | int ocfs2_super_lock(struct ocfs2_super *osb, | ||
1730 | int ex) | ||
1731 | { | ||
1732 | int status; | ||
1733 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | ||
1734 | struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; | ||
1735 | struct buffer_head *bh; | ||
1736 | struct ocfs2_slot_info *si = osb->slot_info; | ||
1737 | |||
1738 | mlog_entry_void(); | ||
1739 | |||
1740 | if (ocfs2_is_hard_readonly(osb)) | ||
1741 | return -EROFS; | ||
1742 | |||
1743 | status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); | ||
1744 | if (status < 0) { | ||
1745 | mlog_errno(status); | ||
1746 | goto bail; | ||
1747 | } | ||
1748 | |||
1749 | /* The super block lock path is really in the best position to | ||
1750 | * know when resources covered by the lock need to be | ||
1751 | * refreshed, so we do it here. Of course, making sense of | ||
1752 | * everything is up to the caller :) */ | ||
1753 | status = ocfs2_should_refresh_lock_res(lockres); | ||
1754 | if (status < 0) { | ||
1755 | mlog_errno(status); | ||
1756 | goto bail; | ||
1757 | } | ||
1758 | if (status) { | ||
1759 | bh = si->si_bh; | ||
1760 | status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, | ||
1761 | si->si_inode); | ||
1762 | if (status == 0) | ||
1763 | ocfs2_update_slot_info(si); | ||
1764 | |||
1765 | ocfs2_complete_lock_res_refresh(lockres, status); | ||
1766 | |||
1767 | if (status < 0) | ||
1768 | mlog_errno(status); | ||
1769 | } | ||
1770 | bail: | ||
1771 | mlog_exit(status); | ||
1772 | return status; | ||
1773 | } | ||
1774 | |||
1775 | void ocfs2_super_unlock(struct ocfs2_super *osb, | ||
1776 | int ex) | ||
1777 | { | ||
1778 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | ||
1779 | struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; | ||
1780 | |||
1781 | ocfs2_cluster_unlock(osb, lockres, level); | ||
1782 | } | ||
1783 | |||
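The refresh handshake inside ocfs2_super_lock() is a reusable idiom: ocfs2_should_refresh_lock_res() hands nonzero to exactly one caller after the lock is newly acquired, that caller re-reads the shared on-disk state, and ocfs2_complete_lock_res_refresh() publishes the outcome (success or failure) to everyone waiting behind it. A hedged sketch of the idiom, where my_reread_state() stands in for whatever the resource needs re-read:

        status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
        if (status < 0)
                goto out;

        status = ocfs2_should_refresh_lock_res(lockres);
        if (status < 0)
                goto out;
        if (status) {
                /* We won the refresh race: re-read shared state.
                 * my_reread_state() is illustrative, not a real helper. */
                status = my_reread_state(osb);

                /* Wake the waiters; a negative status marks the refresh
                 * as failed so the next locker retries it. */
                ocfs2_complete_lock_res_refresh(lockres, status);
        }

ocfs2_super_lock() instantiates this with the slot map read shown above.
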
1784 | int ocfs2_rename_lock(struct ocfs2_super *osb) | ||
1785 | { | ||
1786 | int status; | ||
1787 | struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; | ||
1788 | |||
1789 | if (ocfs2_is_hard_readonly(osb)) | ||
1790 | return -EROFS; | ||
1791 | |||
1792 | status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0); | ||
1793 | if (status < 0) | ||
1794 | mlog_errno(status); | ||
1795 | |||
1796 | return status; | ||
1797 | } | ||
1798 | |||
1799 | void ocfs2_rename_unlock(struct ocfs2_super *osb) | ||
1800 | { | ||
1801 | struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; | ||
1802 | |||
1803 | ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); | ||
1804 | } | ||
1805 | |||
1806 | /* Reference counting of the dlm debug structure. We want this because | ||
1807 | * open references on the debug inodes can live on after a mount, so | ||
1808 | * we can't rely on the ocfs2_super to always exist. */ | ||
1809 | static void ocfs2_dlm_debug_free(struct kref *kref) | ||
1810 | { | ||
1811 | struct ocfs2_dlm_debug *dlm_debug; | ||
1812 | |||
1813 | dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt); | ||
1814 | |||
1815 | kfree(dlm_debug); | ||
1816 | } | ||
1817 | |||
1818 | void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug) | ||
1819 | { | ||
1820 | if (dlm_debug) | ||
1821 | kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free); | ||
1822 | } | ||
1823 | |||
1824 | static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug) | ||
1825 | { | ||
1826 | kref_get(&debug->d_refcnt); | ||
1827 | } | ||
1828 | |||
1829 | struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) | ||
1830 | { | ||
1831 | struct ocfs2_dlm_debug *dlm_debug; | ||
1832 | |||
1833 | dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL); | ||
1834 | if (!dlm_debug) { | ||
1835 | mlog_errno(-ENOMEM); | ||
1836 | goto out; | ||
1837 | } | ||
1838 | |||
1839 | kref_init(&dlm_debug->d_refcnt); | ||
1840 | INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); | ||
1841 | dlm_debug->d_locking_state = NULL; | ||
1842 | out: | ||
1843 | return dlm_debug; | ||
1844 | } | ||
1845 | |||
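The kref wiring above gives the debug structure exactly the lifetime described in the comment: it is born with one reference, every debugfs open takes another, and whichever put drops the count to zero (possibly long after unmount) triggers ocfs2_dlm_debug_free(). A sketch of the count over a typical life, assuming one open file outlives the mount:

        dd = ocfs2_new_dlm_debug();     /* kref_init: count = 1 */
        ocfs2_get_dlm_debug(dd);        /* debugfs open: count = 2 */
        ocfs2_put_dlm_debug(dd);        /* unmount: count = 1, dd survives */
        ocfs2_put_dlm_debug(dd);        /* file release: count = 0, freed */
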
1846 | /* Access to this is arbitrated for us via seq_file->sem. */ | ||
1847 | struct ocfs2_dlm_seq_priv { | ||
1848 | struct ocfs2_dlm_debug *p_dlm_debug; | ||
1849 | struct ocfs2_lock_res p_iter_res; | ||
1850 | struct ocfs2_lock_res p_tmp_res; | ||
1851 | }; | ||
1852 | |||
1853 | static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start, | ||
1854 | struct ocfs2_dlm_seq_priv *priv) | ||
1855 | { | ||
1856 | struct ocfs2_lock_res *iter, *ret = NULL; | ||
1857 | struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug; | ||
1858 | |||
1859 | assert_spin_locked(&ocfs2_dlm_tracking_lock); | ||
1860 | |||
1861 | list_for_each_entry(iter, &start->l_debug_list, l_debug_list) { | ||
1862 | /* discover the head of the list */ | ||
1863 | if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) { | ||
1864 | mlog(0, "End of list found, %p\n", ret); | ||
1865 | break; | ||
1866 | } | ||
1867 | |||
1868 | /* We track our "dummy" iteration lockres' by a NULL | ||
1869 | * l_ops field. */ | ||
1870 | if (iter->l_ops != NULL) { | ||
1871 | ret = iter; | ||
1872 | break; | ||
1873 | } | ||
1874 | } | ||
1875 | |||
1876 | return ret; | ||
1877 | } | ||
1878 | |||
1879 | static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos) | ||
1880 | { | ||
1881 | struct ocfs2_dlm_seq_priv *priv = m->private; | ||
1882 | struct ocfs2_lock_res *iter; | ||
1883 | |||
1884 | spin_lock(&ocfs2_dlm_tracking_lock); | ||
1885 | iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv); | ||
1886 | if (iter) { | ||
1887 | /* Since lockres' have the lifetime of their container | ||
1888 | * (which can be inodes, ocfs2_supers, etc) we want to | ||
1889 | * copy this out to a temporary lockres while still | ||
1890 | * under the spinlock. Obviously after this we can't | ||
1891 | * trust any pointers on the copy returned, but that's | ||
1892 | * ok as the information we want isn't typically held | ||
1893 | * in them. */ | ||
1894 | priv->p_tmp_res = *iter; | ||
1895 | iter = &priv->p_tmp_res; | ||
1896 | } | ||
1897 | spin_unlock(&ocfs2_dlm_tracking_lock); | ||
1898 | |||
1899 | return iter; | ||
1900 | } | ||
1901 | |||
1902 | static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v) | ||
1903 | { | ||
1904 | } | ||
1905 | |||
1906 | static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) | ||
1907 | { | ||
1908 | struct ocfs2_dlm_seq_priv *priv = m->private; | ||
1909 | struct ocfs2_lock_res *iter = v; | ||
1910 | struct ocfs2_lock_res *dummy = &priv->p_iter_res; | ||
1911 | |||
1912 | spin_lock(&ocfs2_dlm_tracking_lock); | ||
1913 | iter = ocfs2_dlm_next_res(iter, priv); | ||
1914 | list_del_init(&dummy->l_debug_list); | ||
1915 | if (iter) { | ||
1916 | list_add(&dummy->l_debug_list, &iter->l_debug_list); | ||
1917 | priv->p_tmp_res = *iter; | ||
1918 | iter = &priv->p_tmp_res; | ||
1919 | } | ||
1920 | spin_unlock(&ocfs2_dlm_tracking_lock); | ||
1921 | |||
1922 | return iter; | ||
1923 | } | ||
1924 | |||
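Both iterator callbacks above use the same two tricks: a sentinel lockres (recognizable by its NULL l_ops, per ocfs2_dlm_next_res()) is re-parked in the global list to remember the cursor position, and the found entry is copied into p_tmp_res while the spinlock pins it so ->show can run unlocked. Stripped to its skeleton, with generic names:

        spin_lock(&tracking_lock);
        next = find_next_real_entry(cursor);    /* skip sentinels */
        list_del_init(&cursor->list);           /* unhook the sentinel */
        if (next) {
                list_add(&cursor->list, &next->list);  /* re-park after it */
                snapshot = *next;               /* copy while pinned */
                next = &snapshot;               /* caller sees the copy */
        }
        spin_unlock(&tracking_lock);

The pointers inside the snapshot are unsafe to chase after the unlock, which is exactly the caveat the comment in ocfs2_dlm_seq_start() spells out.
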
1925 | /* So that debugfs.ocfs2 can determine which format is being used */ | ||
1926 | #define OCFS2_DLM_DEBUG_STR_VERSION 1 | ||
1927 | static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) | ||
1928 | { | ||
1929 | int i; | ||
1930 | char *lvb; | ||
1931 | struct ocfs2_lock_res *lockres = v; | ||
1932 | |||
1933 | if (!lockres) | ||
1934 | return -EINVAL; | ||
1935 | |||
1936 | seq_printf(m, "0x%x\t" | ||
1937 | "%.*s\t" | ||
1938 | "%d\t" | ||
1939 | "0x%lx\t" | ||
1940 | "0x%x\t" | ||
1941 | "0x%x\t" | ||
1942 | "%u\t" | ||
1943 | "%u\t" | ||
1944 | "%d\t" | ||
1945 | "%d\t", | ||
1946 | OCFS2_DLM_DEBUG_STR_VERSION, | ||
1947 | OCFS2_LOCK_ID_MAX_LEN, lockres->l_name, | ||
1948 | lockres->l_level, | ||
1949 | lockres->l_flags, | ||
1950 | lockres->l_action, | ||
1951 | lockres->l_unlock_action, | ||
1952 | lockres->l_ro_holders, | ||
1953 | lockres->l_ex_holders, | ||
1954 | lockres->l_requested, | ||
1955 | lockres->l_blocking); | ||
1956 | |||
1957 | /* Dump the raw LVB */ | ||
1958 | lvb = lockres->l_lksb.lvb; | ||
1959 | for(i = 0; i < DLM_LVB_LEN; i++) | ||
1960 | seq_printf(m, "0x%x\t", lvb[i]); | ||
1961 | |||
1962 | /* End the line */ | ||
1963 | seq_printf(m, "\n"); | ||
1964 | return 0; | ||
1965 | } | ||
1966 | |||
1967 | static struct seq_operations ocfs2_dlm_seq_ops = { | ||
1968 | .start = ocfs2_dlm_seq_start, | ||
1969 | .stop = ocfs2_dlm_seq_stop, | ||
1970 | .next = ocfs2_dlm_seq_next, | ||
1971 | .show = ocfs2_dlm_seq_show, | ||
1972 | }; | ||
1973 | |||
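Each lockres therefore becomes one tab-separated record: the format version, the fixed-width name, eight numeric fields, then the raw LVB bytes. A userspace consumer (debugfs.ocfs2 does something along these lines; this sketch is not its actual source) could pull the fixed head apart as below, assuming the lock name contains no tab:

        #include <stdio.h>

        static int parse_lockres_line(const char *line)
        {
                unsigned int version, action, unlock_action, ro, ex;
                unsigned long flags;
                int level, requested, blocking;
                char name[64];

                if (sscanf(line,
                           "0x%x\t%63[^\t]\t%d\t0x%lx\t0x%x\t0x%x\t%u\t%u\t%d\t%d",
                           &version, name, &level, &flags, &action,
                           &unlock_action, &ro, &ex, &requested, &blocking) != 10)
                        return -1;      /* malformed or newer format */

                printf("%s: level %d, ro %u, ex %u\n", name, level, ro, ex);
                return 0;
        }

The DLM_LVB_LEN hex bytes that follow can be split on the same tabs once the head matches OCFS2_DLM_DEBUG_STR_VERSION.
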
1974 | static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) | ||
1975 | { | ||
1976 | struct seq_file *seq = (struct seq_file *) file->private_data; | ||
1977 | struct ocfs2_dlm_seq_priv *priv = seq->private; | ||
1978 | struct ocfs2_lock_res *res = &priv->p_iter_res; | ||
1979 | |||
1980 | ocfs2_remove_lockres_tracking(res); | ||
1981 | ocfs2_put_dlm_debug(priv->p_dlm_debug); | ||
1982 | return seq_release_private(inode, file); | ||
1983 | } | ||
1984 | |||
1985 | static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) | ||
1986 | { | ||
1987 | int ret; | ||
1988 | struct ocfs2_dlm_seq_priv *priv; | ||
1989 | struct seq_file *seq; | ||
1990 | struct ocfs2_super *osb; | ||
1991 | |||
1992 | priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL); | ||
1993 | if (!priv) { | ||
1994 | ret = -ENOMEM; | ||
1995 | mlog_errno(ret); | ||
1996 | goto out; | ||
1997 | } | ||
1998 | osb = (struct ocfs2_super *) inode->u.generic_ip; | ||
1999 | ocfs2_get_dlm_debug(osb->osb_dlm_debug); | ||
2000 | priv->p_dlm_debug = osb->osb_dlm_debug; | ||
2001 | INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); | ||
2002 | |||
2003 | ret = seq_open(file, &ocfs2_dlm_seq_ops); | ||
2004 | if (ret) { | ||
2005 | kfree(priv); | ||
2006 | mlog_errno(ret); | ||
2007 | goto out; | ||
2008 | } | ||
2009 | |||
2010 | seq = (struct seq_file *) file->private_data; | ||
2011 | seq->private = priv; | ||
2012 | |||
2013 | ocfs2_add_lockres_tracking(&priv->p_iter_res, | ||
2014 | priv->p_dlm_debug); | ||
2015 | |||
2016 | out: | ||
2017 | return ret; | ||
2018 | } | ||
2019 | |||
2020 | static struct file_operations ocfs2_dlm_debug_fops = { | ||
2021 | .open = ocfs2_dlm_debug_open, | ||
2022 | .release = ocfs2_dlm_debug_release, | ||
2023 | .read = seq_read, | ||
2024 | .llseek = seq_lseek, | ||
2025 | }; | ||
2026 | |||
2027 | static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) | ||
2028 | { | ||
2029 | int ret = 0; | ||
2030 | struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; | ||
2031 | |||
2032 | dlm_debug->d_locking_state = debugfs_create_file("locking_state", | ||
2033 | S_IFREG|S_IRUSR, | ||
2034 | osb->osb_debug_root, | ||
2035 | osb, | ||
2036 | &ocfs2_dlm_debug_fops); | ||
2037 | if (!dlm_debug->d_locking_state) { | ||
2038 | ret = -EINVAL; | ||
2039 | mlog(ML_ERROR, | ||
2040 | "Unable to create locking state debugfs file.\n"); | ||
2041 | goto out; | ||
2042 | } | ||
2043 | |||
2044 | ocfs2_get_dlm_debug(dlm_debug); | ||
2045 | out: | ||
2046 | return ret; | ||
2047 | } | ||
2048 | |||
2049 | static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) | ||
2050 | { | ||
2051 | struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; | ||
2052 | |||
2053 | if (dlm_debug) { | ||
2054 | debugfs_remove(dlm_debug->d_locking_state); | ||
2055 | ocfs2_put_dlm_debug(dlm_debug); | ||
2056 | } | ||
2057 | } | ||
2058 | |||
2059 | int ocfs2_dlm_init(struct ocfs2_super *osb) | ||
2060 | { | ||
2061 | int status; | ||
2062 | u32 dlm_key; | ||
2063 | struct dlm_ctxt *dlm; | ||
2064 | |||
2065 | mlog_entry_void(); | ||
2066 | |||
2067 | status = ocfs2_dlm_init_debug(osb); | ||
2068 | if (status < 0) { | ||
2069 | mlog_errno(status); | ||
2070 | goto bail; | ||
2071 | } | ||
2072 | |||
2073 | /* launch vote thread */ | ||
2074 | osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d", | ||
2075 | osb->osb_id); | ||
2076 | if (IS_ERR(osb->vote_task)) { | ||
2077 | status = PTR_ERR(osb->vote_task); | ||
2078 | osb->vote_task = NULL; | ||
2079 | mlog_errno(status); | ||
2080 | goto bail; | ||
2081 | } | ||
2082 | |||
2083 | /* used by the dlm code to make message headers unique; each | ||
2084 | * node in this domain must agree on this. */ | ||
2085 | dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str)); | ||
2086 | |||
2087 | /* for now, uuid == domain */ | ||
2088 | dlm = dlm_register_domain(osb->uuid_str, dlm_key); | ||
2089 | if (IS_ERR(dlm)) { | ||
2090 | status = PTR_ERR(dlm); | ||
2091 | mlog_errno(status); | ||
2092 | goto bail; | ||
2093 | } | ||
2094 | |||
2095 | ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); | ||
2096 | ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); | ||
2097 | |||
2098 | dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb); | ||
2099 | |||
2100 | osb->dlm = dlm; | ||
2101 | |||
2102 | status = 0; | ||
2103 | bail: | ||
2104 | if (status < 0) { | ||
2105 | ocfs2_dlm_shutdown_debug(osb); | ||
2106 | if (osb->vote_task) | ||
2107 | kthread_stop(osb->vote_task); | ||
2108 | } | ||
2109 | |||
2110 | mlog_exit(status); | ||
2111 | return status; | ||
2112 | } | ||
2113 | |||
2114 | void ocfs2_dlm_shutdown(struct ocfs2_super *osb) | ||
2115 | { | ||
2116 | mlog_entry_void(); | ||
2117 | |||
2118 | dlm_unregister_eviction_cb(&osb->osb_eviction_cb); | ||
2119 | |||
2120 | ocfs2_drop_osb_locks(osb); | ||
2121 | |||
2122 | if (osb->vote_task) { | ||
2123 | kthread_stop(osb->vote_task); | ||
2124 | osb->vote_task = NULL; | ||
2125 | } | ||
2126 | |||
2127 | ocfs2_lock_res_free(&osb->osb_super_lockres); | ||
2128 | ocfs2_lock_res_free(&osb->osb_rename_lockres); | ||
2129 | |||
2130 | dlm_unregister_domain(osb->dlm); | ||
2131 | osb->dlm = NULL; | ||
2132 | |||
2133 | ocfs2_dlm_shutdown_debug(osb); | ||
2134 | |||
2135 | mlog_exit_void(); | ||
2136 | } | ||
2137 | |||
2138 | static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status) | ||
2139 | { | ||
2140 | struct ocfs2_lock_res *lockres = opaque; | ||
2141 | unsigned long flags; | ||
2142 | |||
2143 | mlog_entry_void(); | ||
2144 | |||
2145 | mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name, | ||
2146 | lockres->l_unlock_action); | ||
2147 | |||
2148 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2149 | /* We tried to cancel a convert request, but it was already | ||
2150 | * granted. All we want to do here is clear our unlock | ||
2151 | * state. The wake_up call done at the bottom is redundant | ||
2152 | * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't | ||
2153 | * hurt anything anyway */ | ||
2154 | if (status == DLM_CANCELGRANT && | ||
2155 | lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { | ||
2156 | mlog(0, "Got cancelgrant for %s\n", lockres->l_name); | ||
2157 | |||
2158 | /* We don't clear the busy flag in this case as it | ||
2159 | * should have been cleared by the ast which the dlm | ||
2160 | * has called. */ | ||
2161 | goto complete_unlock; | ||
2162 | } | ||
2163 | |||
2164 | if (status != DLM_NORMAL) { | ||
2165 | mlog(ML_ERROR, "Dlm passes status %d for lock %s, " | ||
2166 | "unlock_action %d\n", status, lockres->l_name, | ||
2167 | lockres->l_unlock_action); | ||
2168 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2169 | return; | ||
2170 | } | ||
2171 | |||
2172 | switch(lockres->l_unlock_action) { | ||
2173 | case OCFS2_UNLOCK_CANCEL_CONVERT: | ||
2174 | mlog(0, "Cancel convert success for %s\n", lockres->l_name); | ||
2175 | lockres->l_action = OCFS2_AST_INVALID; | ||
2176 | break; | ||
2177 | case OCFS2_UNLOCK_DROP_LOCK: | ||
2178 | lockres->l_level = LKM_IVMODE; | ||
2179 | break; | ||
2180 | default: | ||
2181 | BUG(); | ||
2182 | } | ||
2183 | |||
2184 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | ||
2185 | complete_unlock: | ||
2186 | lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; | ||
2187 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2188 | |||
2189 | wake_up(&lockres->l_event); | ||
2190 | |||
2191 | mlog_exit_void(); | ||
2192 | } | ||
2193 | |||
2194 | typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *); | ||
2195 | |||
2196 | struct drop_lock_cb { | ||
2197 | ocfs2_pre_drop_cb_t *drop_func; | ||
2198 | void *drop_data; | ||
2199 | }; | ||
2200 | |||
2201 | static int ocfs2_drop_lock(struct ocfs2_super *osb, | ||
2202 | struct ocfs2_lock_res *lockres, | ||
2203 | struct drop_lock_cb *dcb) | ||
2204 | { | ||
2205 | enum dlm_status status; | ||
2206 | unsigned long flags; | ||
2207 | |||
2208 | /* We didn't get anywhere near actually using this lockres. */ | ||
2209 | if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) | ||
2210 | goto out; | ||
2211 | |||
2212 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2213 | |||
2214 | mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING), | ||
2215 | "lockres %s, flags 0x%lx\n", | ||
2216 | lockres->l_name, lockres->l_flags); | ||
2217 | |||
2218 | while (lockres->l_flags & OCFS2_LOCK_BUSY) { | ||
2219 | mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = " | ||
2220 | "%u, unlock_action = %u\n", | ||
2221 | lockres->l_name, lockres->l_flags, lockres->l_action, | ||
2222 | lockres->l_unlock_action); | ||
2223 | |||
2224 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2225 | |||
2226 | /* XXX: Today we just wait on any busy | ||
2227 | * locks... Perhaps we need to cancel converts in the | ||
2228 | * future? */ | ||
2229 | ocfs2_wait_on_busy_lock(lockres); | ||
2230 | |||
2231 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2232 | } | ||
2233 | |||
2234 | if (dcb) | ||
2235 | dcb->drop_func(lockres, dcb->drop_data); | ||
2236 | |||
2237 | if (lockres->l_flags & OCFS2_LOCK_BUSY) | ||
2238 | mlog(ML_ERROR, "destroying busy lock: \"%s\"\n", | ||
2239 | lockres->l_name); | ||
2240 | if (lockres->l_flags & OCFS2_LOCK_BLOCKED) | ||
2241 | mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name); | ||
2242 | |||
2243 | if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { | ||
2244 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2245 | goto out; | ||
2246 | } | ||
2247 | |||
2248 | lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED); | ||
2249 | |||
2250 | /* make sure we never get here while waiting for an ast to | ||
2251 | * fire. */ | ||
2252 | BUG_ON(lockres->l_action != OCFS2_AST_INVALID); | ||
2253 | |||
2254 | /* is this necessary? */ | ||
2255 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); | ||
2256 | lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK; | ||
2257 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2258 | |||
2259 | mlog(0, "lock %s\n", lockres->l_name); | ||
2260 | |||
2261 | status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK, | ||
2262 | lockres->l_ops->unlock_ast, lockres); | ||
2263 | if (status != DLM_NORMAL) { | ||
2264 | ocfs2_log_dlm_error("dlmunlock", status, lockres); | ||
2265 | mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); | ||
2266 | dlm_print_one_lock(lockres->l_lksb.lockid); | ||
2267 | BUG(); | ||
2268 | } | ||
2269 | mlog(0, "lock %s, successfull return from dlmunlock\n", | ||
2270 | lockres->l_name); | ||
2271 | |||
2272 | ocfs2_wait_on_busy_lock(lockres); | ||
2273 | out: | ||
2274 | mlog_exit(0); | ||
2275 | return 0; | ||
2276 | } | ||
2277 | |||
2278 | /* Mark the lockres as being dropped. It will no longer be | ||
2279 | * queued if blocking, but we still may have to wait on it | ||
2280 | * being dequeued from the vote thread before we can consider | ||
2281 | * it safe to drop. | ||
2282 | * | ||
2283 | * You can *not* attempt to call cluster_lock on this lockres anymore. */ | ||
2284 | void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) | ||
2285 | { | ||
2286 | int status; | ||
2287 | struct ocfs2_mask_waiter mw; | ||
2288 | unsigned long flags; | ||
2289 | |||
2290 | ocfs2_init_mask_waiter(&mw); | ||
2291 | |||
2292 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2293 | lockres->l_flags |= OCFS2_LOCK_FREEING; | ||
2294 | while (lockres->l_flags & OCFS2_LOCK_QUEUED) { | ||
2295 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0); | ||
2296 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2297 | |||
2298 | mlog(0, "Waiting on lockres %s\n", lockres->l_name); | ||
2299 | |||
2300 | status = ocfs2_wait_for_mask(&mw); | ||
2301 | if (status) | ||
2302 | mlog_errno(status); | ||
2303 | |||
2304 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2305 | } | ||
2306 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2307 | } | ||
2308 | |||
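Taken together with ocfs2_drop_lock() above, lockres teardown is a strict three-step sequence, which the shutdown paths (ocfs2_drop_osb_locks() just below, and ocfs2_dlm_shutdown() with its ocfs2_lock_res_free() calls, both shown earlier) follow:

        /* Order matters; annotated sketch of the calls, not new code. */
        ocfs2_mark_lockres_freeing(lockres);    /* 1: stop downconvert queueing,
                                                 *    wait it off the vote thread */
        ocfs2_drop_lock(osb, lockres, NULL);    /* 2: wait !BUSY, then dlmunlock */
        ocfs2_lock_res_free(lockres);           /* 3: untrack and free */
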
2309 | static void ocfs2_drop_osb_locks(struct ocfs2_super *osb) | ||
2310 | { | ||
2311 | int status; | ||
2312 | |||
2313 | mlog_entry_void(); | ||
2314 | |||
2315 | ocfs2_mark_lockres_freeing(&osb->osb_super_lockres); | ||
2316 | |||
2317 | status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL); | ||
2318 | if (status < 0) | ||
2319 | mlog_errno(status); | ||
2320 | |||
2321 | ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres); | ||
2322 | |||
2323 | status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL); | ||
2324 | if (status < 0) | ||
2325 | mlog_errno(status); | ||
2326 | |||
2327 | mlog_exit(status); | ||
2328 | } | ||
2329 | |||
2330 | static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data) | ||
2331 | { | ||
2332 | struct inode *inode = data; | ||
2333 | |||
2334 | /* the metadata lock requires a bit more work as we have an | ||
2335 | * LVB to worry about. */ | ||
2336 | if (lockres->l_flags & OCFS2_LOCK_ATTACHED && | ||
2337 | lockres->l_level == LKM_EXMODE && | ||
2338 | !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) | ||
2339 | __ocfs2_stuff_meta_lvb(inode); | ||
2340 | } | ||
2341 | |||
2342 | int ocfs2_drop_inode_locks(struct inode *inode) | ||
2343 | { | ||
2344 | int status, err; | ||
2345 | struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, }; | ||
2346 | |||
2347 | mlog_entry_void(); | ||
2348 | |||
2349 | /* No need to call ocfs2_mark_lockres_freeing here - | ||
2350 | * ocfs2_clear_inode has done it for us. */ | ||
2351 | |||
2352 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), | ||
2353 | &OCFS2_I(inode)->ip_data_lockres, | ||
2354 | NULL); | ||
2355 | if (err < 0) | ||
2356 | mlog_errno(err); | ||
2357 | |||
2358 | status = err; | ||
2359 | |||
2360 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), | ||
2361 | &OCFS2_I(inode)->ip_meta_lockres, | ||
2362 | &meta_dcb); | ||
2363 | if (err < 0) | ||
2364 | mlog_errno(err); | ||
2365 | if (err < 0 && !status) | ||
2366 | status = err; | ||
2367 | |||
2368 | err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), | ||
2369 | &OCFS2_I(inode)->ip_rw_lockres, | ||
2370 | NULL); | ||
2371 | if (err < 0) | ||
2372 | mlog_errno(err); | ||
2373 | if (err < 0 && !status) | ||
2374 | status = err; | ||
2375 | |||
2376 | mlog_exit(status); | ||
2377 | return status; | ||
2378 | } | ||
2379 | |||
2380 | static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, | ||
2381 | int new_level) | ||
2382 | { | ||
2383 | assert_spin_locked(&lockres->l_lock); | ||
2384 | |||
2385 | BUG_ON(lockres->l_blocking <= LKM_NLMODE); | ||
2386 | |||
2387 | if (lockres->l_level <= new_level) { | ||
2388 | mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n", | ||
2389 | lockres->l_level, new_level); | ||
2390 | BUG(); | ||
2391 | } | ||
2392 | |||
2393 | mlog(0, "lock %s, new_level = %d, l_blocking = %d\n", | ||
2394 | lockres->l_name, new_level, lockres->l_blocking); | ||
2395 | |||
2396 | lockres->l_action = OCFS2_AST_DOWNCONVERT; | ||
2397 | lockres->l_requested = new_level; | ||
2398 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); | ||
2399 | } | ||
2400 | |||
2401 | static int ocfs2_downconvert_lock(struct ocfs2_super *osb, | ||
2402 | struct ocfs2_lock_res *lockres, | ||
2403 | int new_level, | ||
2404 | int lvb) | ||
2405 | { | ||
2406 | int ret, dlm_flags = LKM_CONVERT; | ||
2407 | enum dlm_status status; | ||
2408 | |||
2409 | mlog_entry_void(); | ||
2410 | |||
2411 | if (lvb) | ||
2412 | dlm_flags |= LKM_VALBLK; | ||
2413 | |||
2414 | status = dlmlock(osb->dlm, | ||
2415 | new_level, | ||
2416 | &lockres->l_lksb, | ||
2417 | dlm_flags, | ||
2418 | lockres->l_name, | ||
2419 | lockres->l_ops->ast, | ||
2420 | lockres, | ||
2421 | lockres->l_ops->bast); | ||
2422 | if (status != DLM_NORMAL) { | ||
2423 | ocfs2_log_dlm_error("dlmlock", status, lockres); | ||
2424 | ret = -EINVAL; | ||
2425 | ocfs2_recover_from_dlm_error(lockres, 1); | ||
2426 | goto bail; | ||
2427 | } | ||
2428 | |||
2429 | ret = 0; | ||
2430 | bail: | ||
2431 | mlog_exit(ret); | ||
2432 | return ret; | ||
2433 | } | ||
2434 | |||
2435 | /* returns 1 when the caller should unlock and call dlmunlock */ | ||
2436 | static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, | ||
2437 | struct ocfs2_lock_res *lockres) | ||
2438 | { | ||
2439 | assert_spin_locked(&lockres->l_lock); | ||
2440 | |||
2441 | mlog_entry_void(); | ||
2442 | mlog(0, "lock %s\n", lockres->l_name); | ||
2443 | |||
2444 | if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { | ||
2445 | /* If we're already trying to cancel a lock conversion | ||
2446 | * then just drop the spinlock and allow the caller to | ||
2447 | * requeue this lock. */ | ||
2448 | |||
2449 | mlog(0, "Lockres %s, skip convert\n", lockres->l_name); | ||
2450 | return 0; | ||
2451 | } | ||
2452 | |||
2453 | /* were we in a convert when we got the bast fire? */ | ||
2454 | BUG_ON(lockres->l_action != OCFS2_AST_CONVERT && | ||
2455 | lockres->l_action != OCFS2_AST_DOWNCONVERT); | ||
2456 | /* set things up for the unlockast to know to just | ||
2457 | * clear out the ast_action and unset busy, etc. */ | ||
2458 | lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT; | ||
2459 | |||
2460 | mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY), | ||
2461 | "lock %s, invalid flags: 0x%lx\n", | ||
2462 | lockres->l_name, lockres->l_flags); | ||
2463 | |||
2464 | return 1; | ||
2465 | } | ||
2466 | |||
2467 | static int ocfs2_cancel_convert(struct ocfs2_super *osb, | ||
2468 | struct ocfs2_lock_res *lockres) | ||
2469 | { | ||
2470 | int ret; | ||
2471 | enum dlm_status status; | ||
2472 | |||
2473 | mlog_entry_void(); | ||
2474 | mlog(0, "lock %s\n", lockres->l_name); | ||
2475 | |||
2476 | ret = 0; | ||
2477 | status = dlmunlock(osb->dlm, | ||
2478 | &lockres->l_lksb, | ||
2479 | LKM_CANCEL, | ||
2480 | lockres->l_ops->unlock_ast, | ||
2481 | lockres); | ||
2482 | if (status != DLM_NORMAL) { | ||
2483 | ocfs2_log_dlm_error("dlmunlock", status, lockres); | ||
2484 | ret = -EINVAL; | ||
2485 | ocfs2_recover_from_dlm_error(lockres, 0); | ||
2486 | } | ||
2487 | |||
2488 | mlog(0, "lock %s return from dlmunlock\n", lockres->l_name); | ||
2489 | |||
2490 | mlog_exit(ret); | ||
2491 | return ret; | ||
2492 | } | ||
2493 | |||
2494 | static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode, | ||
2495 | struct ocfs2_lock_res *lockres, | ||
2496 | int new_level) | ||
2497 | { | ||
2498 | int ret; | ||
2499 | |||
2500 | mlog_entry_void(); | ||
2501 | |||
2502 | BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE); | ||
2503 | |||
2504 | if (lockres->l_flags & OCFS2_LOCK_REFRESHING) { | ||
2505 | ret = 0; | ||
2506 | mlog(0, "lockres %s currently being refreshed -- backing " | ||
2507 | "off!\n", lockres->l_name); | ||
2508 | } else if (new_level == LKM_PRMODE) | ||
2509 | ret = !lockres->l_ex_holders && | ||
2510 | ocfs2_inode_fully_checkpointed(inode); | ||
2511 | else /* Must be NLMODE we're converting to. */ | ||
2512 | ret = !lockres->l_ro_holders && !lockres->l_ex_holders && | ||
2513 | ocfs2_inode_fully_checkpointed(inode); | ||
2514 | |||
2515 | mlog_exit(ret); | ||
2516 | return ret; | ||
2517 | } | ||
2518 | |||
2519 | static int ocfs2_do_unblock_meta(struct inode *inode, | ||
2520 | int *requeue) | ||
2521 | { | ||
2522 | int new_level; | ||
2523 | int set_lvb = 0; | ||
2524 | int ret = 0; | ||
2525 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; | ||
2526 | unsigned long flags; | ||
2527 | |||
2528 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
2529 | |||
2530 | mlog_entry_void(); | ||
2531 | |||
2532 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2533 | |||
2534 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); | ||
2535 | |||
2536 | mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level, | ||
2537 | lockres->l_blocking); | ||
2538 | |||
2539 | BUG_ON(lockres->l_level != LKM_EXMODE && | ||
2540 | lockres->l_level != LKM_PRMODE); | ||
2541 | |||
2542 | if (lockres->l_flags & OCFS2_LOCK_BUSY) { | ||
2543 | *requeue = 1; | ||
2544 | ret = ocfs2_prepare_cancel_convert(osb, lockres); | ||
2545 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2546 | if (ret) { | ||
2547 | ret = ocfs2_cancel_convert(osb, lockres); | ||
2548 | if (ret < 0) | ||
2549 | mlog_errno(ret); | ||
2550 | } | ||
2551 | goto leave; | ||
2552 | } | ||
2553 | |||
2554 | new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); | ||
2555 | |||
2556 | mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n", | ||
2557 | lockres->l_level, lockres->l_blocking, new_level); | ||
2558 | |||
2559 | if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) { | ||
2560 | if (lockres->l_level == LKM_EXMODE) | ||
2561 | set_lvb = 1; | ||
2562 | |||
2563 | /* If the lock hasn't been refreshed yet (rare), then | ||
2564 | * our in-memory inode values are old and we skip | ||
2565 | * stuffing the lvb. There's no need to actually clear | ||
2566 | * out the lvb here as its value is still valid. | ||
2567 | if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) { | ||
2568 | if (set_lvb) | ||
2569 | __ocfs2_stuff_meta_lvb(inode); | ||
2570 | } else | ||
2571 | mlog(0, "lockres %s: downconverting stale lock!\n", | ||
2572 | lockres->l_name); | ||
2573 | |||
2574 | mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, " | ||
2575 | "l_blocking=%d, new_level=%d\n", | ||
2576 | lockres->l_level, lockres->l_blocking, new_level); | ||
2577 | |||
2578 | ocfs2_prepare_downconvert(lockres, new_level); | ||
2579 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2580 | ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb); | ||
2581 | goto leave; | ||
2582 | } | ||
2583 | if (!ocfs2_inode_fully_checkpointed(inode)) | ||
2584 | ocfs2_start_checkpoint(osb); | ||
2585 | |||
2586 | *requeue = 1; | ||
2587 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2588 | ret = 0; | ||
2589 | leave: | ||
2590 | mlog_exit(ret); | ||
2591 | return ret; | ||
2592 | } | ||
2593 | |||
2594 | static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb, | ||
2595 | struct ocfs2_lock_res *lockres, | ||
2596 | int *requeue, | ||
2597 | ocfs2_convert_worker_t *worker) | ||
2598 | { | ||
2599 | unsigned long flags; | ||
2600 | int blocking; | ||
2601 | int new_level; | ||
2602 | int ret = 0; | ||
2603 | |||
2604 | mlog_entry_void(); | ||
2605 | |||
2606 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2607 | |||
2608 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); | ||
2609 | |||
2610 | recheck: | ||
2611 | if (lockres->l_flags & OCFS2_LOCK_BUSY) { | ||
2612 | *requeue = 1; | ||
2613 | ret = ocfs2_prepare_cancel_convert(osb, lockres); | ||
2614 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2615 | if (ret) { | ||
2616 | ret = ocfs2_cancel_convert(osb, lockres); | ||
2617 | if (ret < 0) | ||
2618 | mlog_errno(ret); | ||
2619 | } | ||
2620 | goto leave; | ||
2621 | } | ||
2622 | |||
2623 | /* if we're blocking an exclusive and we have *any* holders, | ||
2624 | * then requeue. */ | ||
2625 | if ((lockres->l_blocking == LKM_EXMODE) | ||
2626 | && (lockres->l_ex_holders || lockres->l_ro_holders)) { | ||
2627 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2628 | *requeue = 1; | ||
2629 | ret = 0; | ||
2630 | goto leave; | ||
2631 | } | ||
2632 | |||
2633 | /* If it's a PR we're blocking, then only | ||
2634 | * requeue if we've got any EX holders */ | ||
2635 | if (lockres->l_blocking == LKM_PRMODE && | ||
2636 | lockres->l_ex_holders) { | ||
2637 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2638 | *requeue = 1; | ||
2639 | ret = 0; | ||
2640 | goto leave; | ||
2641 | } | ||
2642 | |||
2643 | /* If we get here, then we know that there are no more | ||
2644 | * incompatible holders (and anyone asking for an incompatible | ||
2645 | * lock is blocked). We can now downconvert the lock */ | ||
2646 | if (!worker) | ||
2647 | goto downconvert; | ||
2648 | |||
2649 | /* Some lockres types want to do a bit of work before | ||
2650 | * downconverting a lock. Allow that here. The worker function | ||
2651 | * may sleep, so we save off a copy of what we're blocking as | ||
2652 | * it may change while we're not holding the spin lock. */ | ||
2653 | blocking = lockres->l_blocking; | ||
2654 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2655 | |||
2656 | worker(lockres, blocking); | ||
2657 | |||
2658 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2659 | if (blocking != lockres->l_blocking) { | ||
2660 | /* If this changed underneath us, then we can't drop | ||
2661 | * it just yet. */ | ||
2662 | goto recheck; | ||
2663 | } | ||
2664 | |||
2665 | downconvert: | ||
2666 | *requeue = 0; | ||
2667 | new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); | ||
2668 | |||
2669 | ocfs2_prepare_downconvert(lockres, new_level); | ||
2670 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2671 | ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0); | ||
2672 | leave: | ||
2673 | mlog_exit(ret); | ||
2674 | return ret; | ||
2675 | } | ||
2676 | |||
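The downconvert target in both unblock paths comes from ocfs2_highest_compat_lock_level(), defined earlier in this file: the most permissive mode we can keep that still lets the blocked request through. For o2dlm's three modes the mapping is tiny; an illustrative rendering of the semantics (not the verbatim helper):

        /* Illustrative: NL is compatible with everything, PR is shared
         * among readers, EX is compatible with nothing. */
        static int highest_compat_level(int blocking)
        {
                if (blocking == LKM_EXMODE)
                        return LKM_NLMODE;      /* writer waiting: drop all */
                if (blocking == LKM_PRMODE)
                        return LKM_PRMODE;      /* reader waiting: keep PR */
                return LKM_EXMODE;              /* nothing blocking */
        }

So an EX holder blocked by a PR request downconverts to PR (and, for data locks, the worker below only writes back without truncating pages), while any lock blocked by an EX request falls all the way to NL.
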
2677 | static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, | ||
2678 | int blocking) | ||
2679 | { | ||
2680 | struct inode *inode; | ||
2681 | struct address_space *mapping; | ||
2682 | |||
2683 | mlog_entry_void(); | ||
2684 | |||
2685 | inode = ocfs2_lock_res_inode(lockres); | ||
2686 | mapping = inode->i_mapping; | ||
2687 | |||
2688 | if (filemap_fdatawrite(mapping)) { | ||
2689 | mlog(ML_ERROR, "Could not sync inode %"MLFu64" for downconvert!", | ||
2690 | OCFS2_I(inode)->ip_blkno); | ||
2691 | } | ||
2692 | sync_mapping_buffers(mapping); | ||
2693 | if (blocking == LKM_EXMODE) { | ||
2694 | truncate_inode_pages(mapping, 0); | ||
2695 | unmap_mapping_range(mapping, 0, 0, 0); | ||
2696 | } else { | ||
2697 | /* We only need to wait on the I/O if we're not also | ||
2698 | * truncating pages because truncate_inode_pages waits | ||
2699 | * for us above. We don't truncate pages if we're | ||
2700 | * blocking anything < EXMODE because we want to keep | ||
2701 | * them around in that case. */ | ||
2702 | filemap_fdatawait(mapping); | ||
2703 | } | ||
2704 | |||
2705 | mlog_exit_void(); | ||
2706 | } | ||
2707 | |||
2708 | int ocfs2_unblock_data(struct ocfs2_lock_res *lockres, | ||
2709 | int *requeue) | ||
2710 | { | ||
2711 | int status; | ||
2712 | struct inode *inode; | ||
2713 | struct ocfs2_super *osb; | ||
2714 | |||
2715 | mlog_entry_void(); | ||
2716 | |||
2717 | inode = ocfs2_lock_res_inode(lockres); | ||
2718 | osb = OCFS2_SB(inode->i_sb); | ||
2719 | |||
2720 | mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); | ||
2721 | |||
2722 | status = ocfs2_generic_unblock_lock(osb, | ||
2723 | lockres, | ||
2724 | requeue, | ||
2725 | ocfs2_data_convert_worker); | ||
2726 | if (status < 0) | ||
2727 | mlog_errno(status); | ||
2728 | |||
2729 | mlog(0, "inode %"MLFu64", requeue = %d\n", | ||
2730 | OCFS2_I(inode)->ip_blkno, *requeue); | ||
2731 | |||
2732 | mlog_exit(status); | ||
2733 | return status; | ||
2734 | } | ||
2735 | |||
2736 | static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres, | ||
2737 | int *requeue) | ||
2738 | { | ||
2739 | int status; | ||
2740 | struct inode *inode; | ||
2741 | |||
2742 | mlog_entry_void(); | ||
2743 | |||
2744 | mlog(0, "Unblock lockres %s\n", lockres->l_name); | ||
2745 | |||
2746 | inode = ocfs2_lock_res_inode(lockres); | ||
2747 | |||
2748 | status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb), | ||
2749 | lockres, | ||
2750 | requeue, | ||
2751 | NULL); | ||
2752 | if (status < 0) | ||
2753 | mlog_errno(status); | ||
2754 | |||
2755 | mlog_exit(status); | ||
2756 | return status; | ||
2757 | } | ||
2758 | |||
2759 | |||
2760 | int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres, | ||
2761 | int *requeue) | ||
2762 | { | ||
2763 | int status; | ||
2764 | struct inode *inode; | ||
2765 | |||
2766 | mlog_entry_void(); | ||
2767 | |||
2768 | inode = ocfs2_lock_res_inode(lockres); | ||
2769 | |||
2770 | mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); | ||
2771 | |||
2772 | status = ocfs2_do_unblock_meta(inode, requeue); | ||
2773 | if (status < 0) | ||
2774 | mlog_errno(status); | ||
2775 | |||
2776 | mlog(0, "inode %"MLFu64", requeue = %d\n", | ||
2777 | OCFS2_I(inode)->ip_blkno, *requeue); | ||
2778 | |||
2779 | mlog_exit(status); | ||
2780 | return status; | ||
2781 | } | ||
2782 | |||
2783 | /* Generic unblock function for any lockres whose private data is an | ||
2784 | * ocfs2_super pointer. */ | ||
2785 | static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres, | ||
2786 | int *requeue) | ||
2787 | { | ||
2788 | int status; | ||
2789 | struct ocfs2_super *osb; | ||
2790 | |||
2791 | mlog_entry_void(); | ||
2792 | |||
2793 | mlog(0, "Unblock lockres %s\n", lockres->l_name); | ||
2794 | |||
2795 | osb = ocfs2_lock_res_super(lockres); | ||
2796 | |||
2797 | status = ocfs2_generic_unblock_lock(osb, | ||
2798 | lockres, | ||
2799 | requeue, | ||
2800 | NULL); | ||
2801 | if (status < 0) | ||
2802 | mlog_errno(status); | ||
2803 | |||
2804 | mlog_exit(status); | ||
2805 | return status; | ||
2806 | } | ||
2807 | |||
2808 | void ocfs2_process_blocked_lock(struct ocfs2_super *osb, | ||
2809 | struct ocfs2_lock_res *lockres) | ||
2810 | { | ||
2811 | int status; | ||
2812 | int requeue = 0; | ||
2813 | unsigned long flags; | ||
2814 | |||
2815 | /* Our reference to the lockres in this function can be | ||
2816 | * considered valid until we remove the OCFS2_LOCK_QUEUED | ||
2817 | * flag. */ | ||
2818 | |||
2819 | mlog_entry_void(); | ||
2820 | |||
2821 | BUG_ON(!lockres); | ||
2822 | BUG_ON(!lockres->l_ops); | ||
2823 | BUG_ON(!lockres->l_ops->unblock); | ||
2824 | |||
2825 | mlog(0, "lockres %s blocked.\n", lockres->l_name); | ||
2826 | |||
2827 | /* Detect whether a lock has been marked as going away while | ||
2828 | * the vote thread was processing other things. A lock can | ||
2829 | * still be marked with OCFS2_LOCK_FREEING after this check, | ||
2830 | * but short circuiting here will still save us some | ||
2831 | * performance. */ | ||
2832 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2833 | if (lockres->l_flags & OCFS2_LOCK_FREEING) | ||
2834 | goto unqueue; | ||
2835 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2836 | |||
2837 | status = lockres->l_ops->unblock(lockres, &requeue); | ||
2838 | if (status < 0) | ||
2839 | mlog_errno(status); | ||
2840 | |||
2841 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
2842 | unqueue: | ||
2843 | if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) { | ||
2844 | lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED); | ||
2845 | } else | ||
2846 | ocfs2_schedule_blocked_lock(osb, lockres); | ||
2847 | |||
2848 | mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name, | ||
2849 | requeue ? "yes" : "no"); | ||
2850 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
2851 | |||
2852 | mlog_exit_void(); | ||
2853 | } | ||
2854 | |||
2855 | static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, | ||
2856 | struct ocfs2_lock_res *lockres) | ||
2857 | { | ||
2858 | mlog_entry_void(); | ||
2859 | |||
2860 | assert_spin_locked(&lockres->l_lock); | ||
2861 | |||
2862 | if (lockres->l_flags & OCFS2_LOCK_FREEING) { | ||
2863 | /* Do not schedule a lock for downconvert when it's on | ||
2864 | * the way to destruction - any nodes wanting access | ||
2865 | * to the resource will get it soon. */ | ||
2866 | mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n", | ||
2867 | lockres->l_name, lockres->l_flags); | ||
2868 | return; | ||
2869 | } | ||
2870 | |||
2871 | lockres_or_flags(lockres, OCFS2_LOCK_QUEUED); | ||
2872 | |||
2873 | spin_lock(&osb->vote_task_lock); | ||
2874 | if (list_empty(&lockres->l_blocked_list)) { | ||
2875 | list_add_tail(&lockres->l_blocked_list, | ||
2876 | &osb->blocked_lock_list); | ||
2877 | osb->blocked_lock_count++; | ||
2878 | } | ||
2879 | spin_unlock(&osb->vote_task_lock); | ||
2880 | |||
2881 | mlog_exit_void(); | ||
2882 | } | ||
2883 | |||
2884 | /* This aids in debugging situations where a bad LVB might be involved. */ | ||
2885 | void ocfs2_dump_meta_lvb_info(u64 level, | ||
2886 | const char *function, | ||
2887 | unsigned int line, | ||
2888 | struct ocfs2_lock_res *lockres) | ||
2889 | { | ||
2890 | struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | ||
2891 | |||
2892 | mlog(level, "LVB information for %s (called from %s:%u):\n", | ||
2893 | lockres->l_name, function, line); | ||
2894 | mlog(level, "version: %u, clusters: %u\n", | ||
2895 | be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters)); | ||
2896 | mlog(level, "size: %"MLFu64", uid %u, gid %u, mode 0x%x\n", | ||
2897 | be64_to_cpu(lvb->lvb_isize), be32_to_cpu(lvb->lvb_iuid), | ||
2898 | be32_to_cpu(lvb->lvb_igid), be16_to_cpu(lvb->lvb_imode)); | ||
2899 | mlog(level, "nlink %u, atime_packed 0x%"MLFx64", " | ||
2900 | "ctime_packed 0x%"MLFx64", mtime_packed 0x%"MLFx64"\n", | ||
2901 | be16_to_cpu(lvb->lvb_inlink), be64_to_cpu(lvb->lvb_iatime_packed), | ||
2902 | be64_to_cpu(lvb->lvb_ictime_packed), | ||
2903 | be64_to_cpu(lvb->lvb_imtime_packed)); | ||
2904 | } | ||
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h new file mode 100644 index 000000000000..8f2d1db2d9ea --- /dev/null +++ b/fs/ocfs2/dlmglue.h | |||
@@ -0,0 +1,111 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmglue.h | ||
5 | * | ||
6 | * Interface to the OCFS2 cluster locking (DLM glue) layer | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | |||
27 | #ifndef DLMGLUE_H | ||
28 | #define DLMGLUE_H | ||
29 | |||
30 | #define OCFS2_LVB_VERSION 2 | ||
31 | |||
32 | struct ocfs2_meta_lvb { | ||
33 | __be32 lvb_version; | ||
34 | __be32 lvb_iclusters; | ||
35 | __be32 lvb_iuid; | ||
36 | __be32 lvb_igid; | ||
37 | __be64 lvb_iatime_packed; | ||
38 | __be64 lvb_ictime_packed; | ||
39 | __be64 lvb_imtime_packed; | ||
40 | __be64 lvb_isize; | ||
41 | __be16 lvb_imode; | ||
42 | __be16 lvb_inlink; | ||
43 | __be32 lvb_reserved[3]; | ||
44 | }; | ||
45 | |||
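Every field in the LVB travels in big-endian form, so readers convert field by field on the way out, as ocfs2_dump_meta_lvb_info() in dlmglue.c does. A hedged sketch of consuming a few fields (my_show_lvb is illustrative):

        static void my_show_lvb(struct ocfs2_meta_lvb *lvb)
        {
                /* Stale garbage unless the version matches what we write. */
                if (be32_to_cpu(lvb->lvb_version) != OCFS2_LVB_VERSION)
                        return;

                printk(KERN_INFO "clusters %u size %llu mode 0%o\n",
                       be32_to_cpu(lvb->lvb_iclusters),
                       (unsigned long long)be64_to_cpu(lvb->lvb_isize),
                       be16_to_cpu(lvb->lvb_imode));
        }
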
46 | /* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */ | ||
47 | /* don't wait on recovery. */ | ||
48 | #define OCFS2_META_LOCK_RECOVERY (0x01) | ||
49 | /* Instruct the dlm not to queue ourselves on the other node. */ | ||
50 | #define OCFS2_META_LOCK_NOQUEUE (0x02) | ||
51 | /* don't block waiting for the vote thread, instead return -EAGAIN */ | ||
52 | #define OCFS2_LOCK_NONBLOCK (0x04) | ||
53 | |||
54 | int ocfs2_dlm_init(struct ocfs2_super *osb); | ||
55 | void ocfs2_dlm_shutdown(struct ocfs2_super *osb); | ||
56 | void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); | ||
57 | void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, | ||
58 | enum ocfs2_lock_type type, | ||
59 | struct inode *inode); | ||
60 | void ocfs2_lock_res_free(struct ocfs2_lock_res *res); | ||
61 | int ocfs2_create_new_inode_locks(struct inode *inode); | ||
62 | int ocfs2_drop_inode_locks(struct inode *inode); | ||
63 | int ocfs2_data_lock_full(struct inode *inode, | ||
64 | int write, | ||
65 | int arg_flags); | ||
66 | #define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0) | ||
67 | int ocfs2_data_lock_with_page(struct inode *inode, | ||
68 | int write, | ||
69 | struct page *page); | ||
70 | void ocfs2_data_unlock(struct inode *inode, | ||
71 | int write); | ||
72 | int ocfs2_rw_lock(struct inode *inode, int write); | ||
73 | void ocfs2_rw_unlock(struct inode *inode, int write); | ||
74 | int ocfs2_meta_lock_full(struct inode *inode, | ||
75 | struct ocfs2_journal_handle *handle, | ||
76 | struct buffer_head **ret_bh, | ||
77 | int ex, | ||
78 | int arg_flags); | ||
79 | int ocfs2_meta_lock_with_page(struct inode *inode, | ||
80 | struct ocfs2_journal_handle *handle, | ||
81 | struct buffer_head **ret_bh, | ||
82 | int ex, | ||
83 | struct page *page); | ||
84 | /* 99% of the time we don't want to supply any additional flags -- | ||
85 | * those are for very specific cases only. */ | ||
86 | #define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0) | ||
87 | void ocfs2_meta_unlock(struct inode *inode, | ||
88 | int ex); | ||
89 | int ocfs2_super_lock(struct ocfs2_super *osb, | ||
90 | int ex); | ||
91 | void ocfs2_super_unlock(struct ocfs2_super *osb, | ||
92 | int ex); | ||
93 | int ocfs2_rename_lock(struct ocfs2_super *osb); | ||
94 | void ocfs2_rename_unlock(struct ocfs2_super *osb); | ||
95 | void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); | ||
96 | |||
97 | /* for the vote thread */ | ||
98 | void ocfs2_process_blocked_lock(struct ocfs2_super *osb, | ||
99 | struct ocfs2_lock_res *lockres); | ||
100 | |||
101 | struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); | ||
102 | void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); | ||
103 | |||
104 | /* aids in debugging and tracking lvbs */ | ||
105 | void ocfs2_dump_meta_lvb_info(u64 level, | ||
106 | const char *function, | ||
107 | unsigned int line, | ||
108 | struct ocfs2_lock_res *lockres); | ||
109 | #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) | ||
110 | |||
111 | #endif /* DLMGLUE_H */ | ||
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h new file mode 100644 index 000000000000..f226b2207628 --- /dev/null +++ b/fs/ocfs2/endian.h | |||
@@ -0,0 +1,45 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * Copyright (C) 2005 Oracle. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 021110-1307, USA. | ||
20 | */ | ||
21 | |||
22 | #ifndef OCFS2_ENDIAN_H | ||
23 | #define OCFS2_ENDIAN_H | ||
24 | |||
25 | static inline void le16_add_cpu(__le16 *var, u16 val) | ||
26 | { | ||
27 | *var = cpu_to_le16(le16_to_cpu(*var) + val); | ||
28 | } | ||
29 | |||
30 | static inline void le32_add_cpu(__le32 *var, u32 val) | ||
31 | { | ||
32 | *var = cpu_to_le32(le32_to_cpu(*var) + val); | ||
33 | } | ||
34 | |||
35 | static inline void le32_and_cpu(__le32 *var, u32 val) | ||
36 | { | ||
37 | *var = cpu_to_le32(le32_to_cpu(*var) & val); | ||
38 | } | ||
39 | |||
40 | static inline void be32_add_cpu(__be32 *var, u32 val) | ||
41 | { | ||
42 | *var = cpu_to_be32(be32_to_cpu(*var) + val); | ||
43 | } | ||
44 | |||
45 | #endif /* OCFS2_ENDIAN_H */ | ||
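These helpers exist to hide the read-modify-write boilerplate when adjusting a field that lives in disk endianness. A usage sketch against a hypothetical on-disk record:

        struct my_disk_rec {            /* hypothetical layout */
                __le16 r_links;
                __le32 r_clusters;
        };

        static void my_grow_rec(struct my_disk_rec *rec, u32 added)
        {
                /* Same as: rec->r_clusters =
                 *   cpu_to_le32(le32_to_cpu(rec->r_clusters) + added); */
                le32_add_cpu(&rec->r_clusters, added);
                le16_add_cpu(&rec->r_links, 1);
        }
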
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c new file mode 100644 index 000000000000..5810160d92a8 --- /dev/null +++ b/fs/ocfs2/export.c | |||
@@ -0,0 +1,248 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * export.c | ||
5 | * | ||
6 | * Functions to facilitate NFS exporting | ||
7 | * | ||
8 | * Copyright (C) 2002, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | |||
29 | #define MLOG_MASK_PREFIX ML_EXPORT | ||
30 | #include <cluster/masklog.h> | ||
31 | |||
32 | #include "ocfs2.h" | ||
33 | |||
34 | #include "dir.h" | ||
35 | #include "dlmglue.h" | ||
36 | #include "export.h" | ||
37 | #include "inode.h" | ||
38 | |||
39 | #include "buffer_head_io.h" | ||
40 | |||
41 | struct ocfs2_inode_handle | ||
42 | { | ||
43 | u64 ih_blkno; | ||
44 | u32 ih_generation; | ||
45 | }; | ||
46 | |||
47 | static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp) | ||
48 | { | ||
49 | struct ocfs2_inode_handle *handle = vobjp; | ||
50 | struct inode *inode; | ||
51 | struct dentry *result; | ||
52 | |||
53 | mlog_entry("(0x%p, 0x%p)\n", sb, handle); | ||
54 | |||
55 | if (handle->ih_blkno == 0) { | ||
56 | mlog_errno(-ESTALE); | ||
57 | return ERR_PTR(-ESTALE); | ||
58 | } | ||
59 | |||
60 | inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno); | ||
61 | |||
62 | if (IS_ERR(inode)) { | ||
63 | mlog_errno(PTR_ERR(inode)); | ||
64 | return (void *)inode; | ||
65 | } | ||
66 | |||
67 | if (handle->ih_generation != inode->i_generation) { | ||
68 | iput(inode); | ||
69 | mlog_errno(-ESTALE); | ||
70 | return ERR_PTR(-ESTALE); | ||
71 | } | ||
72 | |||
73 | result = d_alloc_anon(inode); | ||
74 | |||
75 | if (!result) { | ||
76 | iput(inode); | ||
77 | mlog_errno(-ENOMEM); | ||
78 | return ERR_PTR(-ENOMEM); | ||
79 | } | ||
80 | |||
81 | mlog_exit_ptr(result); | ||
82 | return result; | ||
83 | } | ||
84 | |||
85 | static struct dentry *ocfs2_get_parent(struct dentry *child) | ||
86 | { | ||
87 | int status; | ||
88 | u64 blkno; | ||
89 | struct dentry *parent; | ||
90 | struct inode *inode; | ||
91 | struct inode *dir = child->d_inode; | ||
92 | struct buffer_head *dirent_bh = NULL; | ||
93 | struct ocfs2_dir_entry *dirent; | ||
94 | |||
95 | mlog_entry("(0x%p, '%.*s')\n", child, | ||
96 | child->d_name.len, child->d_name.name); | ||
97 | |||
98 | mlog(0, "find parent of directory %"MLFu64"\n", | ||
99 | OCFS2_I(dir)->ip_blkno); | ||
100 | |||
101 | status = ocfs2_meta_lock(dir, NULL, NULL, 0); | ||
102 | if (status < 0) { | ||
103 | if (status != -ENOENT) | ||
104 | mlog_errno(status); | ||
105 | parent = ERR_PTR(status); | ||
106 | goto bail; | ||
107 | } | ||
108 | |||
109 | status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh, | ||
110 | &dirent); | ||
111 | if (status < 0) { | ||
112 | parent = ERR_PTR(-ENOENT); | ||
113 | goto bail_unlock; | ||
114 | } | ||
115 | |||
116 | inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno); | ||
117 | if (IS_ERR(inode)) { | ||
118 | mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno); | ||
119 | parent = ERR_PTR(-EACCES); | ||
120 | goto bail_unlock; | ||
121 | } | ||
122 | |||
123 | parent = d_alloc_anon(inode); | ||
124 | if (!parent) { | ||
125 | iput(inode); | ||
126 | parent = ERR_PTR(-ENOMEM); | ||
127 | } | ||
128 | |||
129 | bail_unlock: | ||
130 | ocfs2_meta_unlock(dir, 0); | ||
131 | |||
132 | if (dirent_bh) | ||
133 | brelse(dirent_bh); | ||
134 | |||
135 | bail: | ||
136 | mlog_exit_ptr(parent); | ||
137 | |||
138 | return parent; | ||
139 | } | ||
140 | |||
141 | static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len, | ||
142 | int connectable) | ||
143 | { | ||
144 | struct inode *inode = dentry->d_inode; | ||
145 | int len = *max_len; | ||
146 | int type = 1; | ||
147 | u64 blkno; | ||
148 | u32 generation; | ||
149 | |||
150 | mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry, | ||
151 | dentry->d_name.len, dentry->d_name.name, | ||
152 | fh, len, connectable); | ||
153 | |||
154 | if (len < 3 || (connectable && len < 6)) { | ||
155 | mlog(ML_ERROR, "fh buffer is too small for encoding\n"); | ||
156 | type = 255; | ||
157 | goto bail; | ||
158 | } | ||
159 | |||
160 | blkno = OCFS2_I(inode)->ip_blkno; | ||
161 | generation = inode->i_generation; | ||
162 | |||
163 | mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n", | ||
164 | blkno, generation); | ||
165 | |||
166 | len = 3; | ||
167 | fh[0] = cpu_to_le32((u32)(blkno >> 32)); | ||
168 | fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff)); | ||
169 | fh[2] = cpu_to_le32(generation); | ||
170 | |||
171 | if (connectable && !S_ISDIR(inode->i_mode)) { | ||
172 | struct inode *parent; | ||
173 | |||
174 | spin_lock(&dentry->d_lock); | ||
175 | |||
176 | parent = dentry->d_parent->d_inode; | ||
177 | blkno = OCFS2_I(parent)->ip_blkno; | ||
178 | generation = parent->i_generation; | ||
179 | |||
180 | fh[3] = cpu_to_le32((u32)(blkno >> 32)); | ||
181 | fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff)); | ||
182 | fh[5] = cpu_to_le32(generation); | ||
183 | |||
184 | spin_unlock(&dentry->d_lock); | ||
185 | |||
186 | len = 6; | ||
187 | type = 2; | ||
188 | |||
189 | mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n", | ||
190 | blkno, generation); | ||
191 | } | ||
192 | |||
193 | *max_len = len; | ||
194 | |||
195 | bail: | ||
196 | mlog_exit(type); | ||
197 | return type; | ||
198 | } | ||
199 | |||
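The file handle layout is nothing more than the 64-bit block number split across two 32-bit words plus the generation (each word additionally byte-swapped with cpu_to_le32 above, undone with le32_to_cpu below). A self-contained userspace round-trip of the packing, mirroring the shifts in the encode above and the decode below:

        #include <assert.h>
        #include <stdint.h>

        int main(void)
        {
                uint64_t blkno = 0x0123456789abcdefULL;
                uint32_t fh[3];

                /* encode, as ocfs2_encode_fh does (minus the swab) */
                fh[0] = (uint32_t)(blkno >> 32);
                fh[1] = (uint32_t)(blkno & 0xffffffff);
                fh[2] = 42;                     /* generation */

                /* decode, as ocfs2_decode_fh does */
                uint64_t back = ((uint64_t)fh[0] << 32) | (uint64_t)fh[1];

                assert(back == blkno);
                return 0;
        }
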
200 | static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh, | ||
201 | int fh_len, int fileid_type, | ||
202 | int (*acceptable)(void *context, | ||
203 | struct dentry *de), | ||
204 | void *context) | ||
205 | { | ||
206 | struct ocfs2_inode_handle handle, parent; | ||
207 | struct dentry *ret = NULL; | ||
208 | |||
209 | mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n", | ||
210 | sb, fh, fh_len, fileid_type, acceptable, context); | ||
211 | |||
212 | if (fh_len < 3 || fileid_type > 2) | ||
213 | goto bail; | ||
214 | |||
215 | if (fileid_type == 2) { | ||
216 | if (fh_len < 6) | ||
217 | goto bail; | ||
218 | |||
219 | parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32; | ||
220 | parent.ih_blkno |= (u64)le32_to_cpu(fh[4]); | ||
221 | parent.ih_generation = le32_to_cpu(fh[5]); | ||
222 | |||
223 | mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n", | ||
224 | parent.ih_blkno, parent.ih_generation); | ||
225 | } | ||
226 | |||
227 | handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32; | ||
228 | handle.ih_blkno |= (u64)le32_to_cpu(fh[1]); | ||
229 | handle.ih_generation = le32_to_cpu(fh[2]); | ||
230 | |||
231 | mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n", | ||
232 | handle.ih_blkno, handle.ih_generation); | ||
233 | |||
234 | ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent, | ||
235 | acceptable, context); | ||
236 | |||
237 | bail: | ||
238 | mlog_exit_ptr(ret); | ||
239 | return ret; | ||
240 | } | ||
241 | |||
242 | struct export_operations ocfs2_export_ops = { | ||
243 | .decode_fh = ocfs2_decode_fh, | ||
244 | .encode_fh = ocfs2_encode_fh, | ||
245 | |||
246 | .get_parent = ocfs2_get_parent, | ||
247 | .get_dentry = ocfs2_get_dentry, | ||
248 | }; | ||
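
For illustration, the encode/decode pair above packs a 64-bit block number and a 32-bit generation into three 32-bit words, with a second triple for the parent when a connectable handle is requested. A minimal userspace sketch of that round trip -- the helper names here are hypothetical, and the byte-order conversion (cpu_to_le32 in the kernel code) is left aside:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Pack blkno high word, blkno low word, then generation. */
static void pack_handle(uint32_t fh[3], uint64_t blkno, uint32_t generation)
{
        fh[0] = (uint32_t)(blkno >> 32);
        fh[1] = (uint32_t)(blkno & 0xffffffff);
        fh[2] = generation;
}

static void unpack_handle(const uint32_t fh[3], uint64_t *blkno,
                          uint32_t *generation)
{
        *blkno = ((uint64_t)fh[0] << 32) | (uint64_t)fh[1];
        *generation = fh[2];
}

int main(void)
{
        uint32_t fh[3];
        uint64_t blkno;
        uint32_t gen;

        pack_handle(fh, 0x123456789abcULL, 42);
        unpack_handle(fh, &blkno, &gen);
        assert(blkno == 0x123456789abcULL && gen == 42);
        printf("blkno 0x%llx gen %u round-trips\n",
               (unsigned long long)blkno, gen);
        return 0;
}
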
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h new file mode 100644 index 000000000000..5b77ee7866ef --- /dev/null +++ b/fs/ocfs2/export.h | |||
@@ -0,0 +1,31 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * export.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_EXPORT_H | ||
27 | #define OCFS2_EXPORT_H | ||
28 | |||
29 | extern struct export_operations ocfs2_export_ops; | ||
30 | |||
31 | #endif /* OCFS2_EXPORT_H */ | ||
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c new file mode 100644 index 000000000000..f2fb40cd296a --- /dev/null +++ b/fs/ocfs2/extent_map.c | |||
@@ -0,0 +1,994 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * extent_map.c | ||
5 | * | ||
6 | * In-memory extent map for OCFS2. Man, this code was prettier in | ||
7 | * the library. | ||
8 | * | ||
9 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public | ||
13 | * License, version 2, as published by the Free Software Foundation. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/init.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/rbtree.h> | ||
31 | |||
32 | #define MLOG_MASK_PREFIX ML_EXTENT_MAP | ||
33 | #include <cluster/masklog.h> | ||
34 | |||
35 | #include "ocfs2.h" | ||
36 | |||
37 | #include "extent_map.h" | ||
38 | #include "inode.h" | ||
39 | #include "super.h" | ||
40 | |||
41 | #include "buffer_head_io.h" | ||
42 | |||
43 | |||
44 | /* | ||
45 | * SUCK SUCK SUCK | ||
46 | * Our headers are so bad that struct ocfs2_extent_map is in ocfs2.h | ||
47 | */ | ||
48 | |||
49 | struct ocfs2_extent_map_entry { | ||
50 | struct rb_node e_node; | ||
51 | int e_tree_depth; | ||
52 | struct ocfs2_extent_rec e_rec; | ||
53 | }; | ||
54 | |||
55 | struct ocfs2_em_insert_context { | ||
56 | int need_left; | ||
57 | int need_right; | ||
58 | struct ocfs2_extent_map_entry *new_ent; | ||
59 | struct ocfs2_extent_map_entry *old_ent; | ||
60 | struct ocfs2_extent_map_entry *left_ent; | ||
61 | struct ocfs2_extent_map_entry *right_ent; | ||
62 | }; | ||
63 | |||
64 | static kmem_cache_t *ocfs2_em_ent_cachep = NULL; | ||
65 | |||
66 | |||
67 | static struct ocfs2_extent_map_entry * | ||
68 | ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, | ||
69 | u32 cpos, u32 clusters, | ||
70 | struct rb_node ***ret_p, | ||
71 | struct rb_node **ret_parent); | ||
72 | static int ocfs2_extent_map_insert(struct inode *inode, | ||
73 | struct ocfs2_extent_rec *rec, | ||
74 | int tree_depth); | ||
75 | static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, | ||
76 | struct ocfs2_extent_map_entry *ent); | ||
77 | static int ocfs2_extent_map_find_leaf(struct inode *inode, | ||
78 | u32 cpos, u32 clusters, | ||
79 | struct ocfs2_extent_list *el); | ||
80 | static int ocfs2_extent_map_lookup_read(struct inode *inode, | ||
81 | u32 cpos, u32 clusters, | ||
82 | struct ocfs2_extent_map_entry **ret_ent); | ||
83 | static int ocfs2_extent_map_try_insert(struct inode *inode, | ||
84 | struct ocfs2_extent_rec *rec, | ||
85 | int tree_depth, | ||
86 | struct ocfs2_em_insert_context *ctxt); | ||
87 | |||
88 | /* returns 1 only if the rec contains all the given clusters -- that is, | ||
89 | * rec's cpos is <= the cluster cpos and the rec endpoint (cpos + | ||
90 | * clusters) is >= the argument's endpoint */ | ||
91 | static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec, | ||
92 | u32 cpos, u32 clusters) | ||
93 | { | ||
94 | if (le32_to_cpu(rec->e_cpos) > cpos) | ||
95 | return 0; | ||
96 | if (cpos + clusters > le32_to_cpu(rec->e_cpos) + | ||
97 | le32_to_cpu(rec->e_clusters)) | ||
98 | return 0; | ||
99 | return 1; | ||
100 | } | ||
101 | |||
102 | |||
103 | /* | ||
104 | * Find an entry in the tree that intersects the region passed in. | ||
105 | * Note that this will find straddled intervals, it is up to the | ||
106 | * callers to enforce any boundary conditions. | ||
107 | * | ||
108 | * Callers must hold ip_lock. This lookup is not guaranteed to return | ||
109 | * a tree_depth 0 match, and as such can race inserts if the lock | ||
110 | * were not held. | ||
111 | * | ||
112 | * The rb_node out-parameters let insertion share the search. Trivial | ||
113 | * callers pass NULL. | ||
114 | */ | ||
115 | static struct ocfs2_extent_map_entry * | ||
116 | ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, | ||
117 | u32 cpos, u32 clusters, | ||
118 | struct rb_node ***ret_p, | ||
119 | struct rb_node **ret_parent) | ||
120 | { | ||
121 | struct rb_node **p = &em->em_extents.rb_node; | ||
122 | struct rb_node *parent = NULL; | ||
123 | struct ocfs2_extent_map_entry *ent = NULL; | ||
124 | |||
125 | while (*p) { | ||
126 | |||
127 | parent = *p; | ||
128 | ent = rb_entry(parent, struct ocfs2_extent_map_entry, | ||
129 | e_node); | ||
130 | if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) { | ||
131 | p = &(*p)->rb_left; | ||
132 | ent = NULL; | ||
133 | } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) + | ||
134 | le32_to_cpu(ent->e_rec.e_clusters))) { | ||
135 | p = &(*p)->rb_right; | ||
136 | ent = NULL; | ||
137 | } else | ||
138 | break; | ||
139 | } | ||
140 | |||
141 | if (ret_p != NULL) | ||
142 | *ret_p = p; | ||
143 | if (ret_parent != NULL) | ||
144 | *ret_parent = parent; | ||
145 | return ent; | ||
146 | } | ||
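
The walk above is a standard rbtree descent driven by a three-way interval comparison: go left when the query ends at or before the entry, right when it starts at or after the entry's end, and stop on any overlap. A self-contained sketch of just that predicate (names hypothetical):

#include <stdint.h>
#include <stdio.h>

/* -1: query lies entirely left, 1: entirely right, 0: intervals intersect. */
static int interval_cmp(uint32_t q_cpos, uint32_t q_clusters,
                        uint32_t e_cpos, uint32_t e_clusters)
{
        if (q_cpos + q_clusters <= e_cpos)
                return -1;
        if (q_cpos >= e_cpos + e_clusters)
                return 1;
        return 0;
}

int main(void)
{
        /* Entry covers clusters [10, 20). */
        printf("%d\n", interval_cmp(0, 10, 10, 10));   /* -1: [0,10) is left   */
        printf("%d\n", interval_cmp(20, 5, 10, 10));   /*  1: [20,25) is right */
        printf("%d\n", interval_cmp(15, 10, 10, 10));  /*  0: [15,25) overlaps */
        return 0;
}
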
147 | |||
148 | /* | ||
149 | * Find the leaf containing the interval we want. While we're on our | ||
150 | * way down the tree, fill in every record we see at any depth, because | ||
151 | * we might want it later. | ||
152 | * | ||
153 | * Note that this code is run without ip_lock. That's because it | ||
154 | * sleeps while reading. If someone is also filling the extent list at | ||
155 | * the same time we are, we might have to restart. | ||
156 | */ | ||
157 | static int ocfs2_extent_map_find_leaf(struct inode *inode, | ||
158 | u32 cpos, u32 clusters, | ||
159 | struct ocfs2_extent_list *el) | ||
160 | { | ||
161 | int i, ret; | ||
162 | struct buffer_head *eb_bh = NULL; | ||
163 | u64 blkno; | ||
164 | u32 rec_end; | ||
165 | struct ocfs2_extent_block *eb; | ||
166 | struct ocfs2_extent_rec *rec; | ||
167 | |||
168 | /* | ||
169 | * The bh data containing the el cannot change here, because | ||
170 | * we hold alloc_sem. So we can do this without other | ||
171 | * locks. | ||
172 | */ | ||
173 | while (el->l_tree_depth) { | ||
174 | |||
175 | blkno = 0; | ||
176 | for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | ||
177 | rec = &el->l_recs[i]; | ||
178 | rec_end = (le32_to_cpu(rec->e_cpos) + | ||
179 | le32_to_cpu(rec->e_clusters)); | ||
180 | |||
181 | ret = -EBADR; | ||
182 | if (rec_end > OCFS2_I(inode)->ip_clusters) { | ||
183 | mlog_errno(ret); | ||
184 | goto out_free; | ||
185 | } | ||
186 | |||
187 | if (rec_end <= cpos) { | ||
188 | ret = ocfs2_extent_map_insert(inode, rec, | ||
189 | le16_to_cpu(el->l_tree_depth)); | ||
190 | if (ret && (ret != -EEXIST)) { | ||
191 | mlog_errno(ret); | ||
192 | goto out_free; | ||
193 | } | ||
194 | continue; | ||
195 | } | ||
196 | if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) { | ||
197 | ret = ocfs2_extent_map_insert(inode, rec, | ||
198 | le16_to_cpu(el->l_tree_depth)); | ||
199 | if (ret && (ret != -EEXIST)) { | ||
200 | mlog_errno(ret); | ||
201 | goto out_free; | ||
202 | } | ||
203 | continue; | ||
204 | } | ||
205 | |||
206 | /* | ||
207 | * We've found a record that matches our | ||
208 | * interval. We don't insert it because we're | ||
209 | * about to traverse it. | ||
210 | */ | ||
211 | |||
212 | /* Check to see if we're straddling */ | ||
213 | ret = -ESRCH; | ||
214 | if (!ocfs2_extent_rec_contains_clusters(rec, | ||
215 | cpos, | ||
216 | clusters)) { | ||
217 | mlog_errno(ret); | ||
218 | goto out_free; | ||
219 | } | ||
220 | |||
221 | /* | ||
222 | * If we've already found a record, the el has | ||
223 | * two records covering the same interval. | ||
224 | * EEEK! | ||
225 | */ | ||
226 | ret = -EBADR; | ||
227 | if (blkno) { | ||
228 | mlog_errno(ret); | ||
229 | goto out_free; | ||
230 | } | ||
231 | |||
232 | blkno = le64_to_cpu(rec->e_blkno); | ||
233 | } | ||
234 | |||
235 | /* | ||
236 | * We don't support holes, and we're still up | ||
237 | * in the branches, so we'd better have found someone | ||
238 | */ | ||
239 | ret = -EBADR; | ||
240 | if (!blkno) { | ||
241 | mlog_errno(ret); | ||
242 | goto out_free; | ||
243 | } | ||
244 | |||
245 | if (eb_bh) { | ||
246 | brelse(eb_bh); | ||
247 | eb_bh = NULL; | ||
248 | } | ||
249 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
250 | blkno, &eb_bh, OCFS2_BH_CACHED, | ||
251 | inode); | ||
252 | if (ret) { | ||
253 | mlog_errno(ret); | ||
254 | goto out_free; | ||
255 | } | ||
256 | eb = (struct ocfs2_extent_block *)eb_bh->b_data; | ||
257 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
258 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
259 | ret = -EIO; | ||
260 | goto out_free; | ||
261 | } | ||
262 | el = &eb->h_list; | ||
263 | } | ||
264 | |||
265 | if (el->l_tree_depth) | ||
266 | BUG(); | ||
267 | |||
268 | for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | ||
269 | rec = &el->l_recs[i]; | ||
270 | ret = ocfs2_extent_map_insert(inode, rec, | ||
271 | le16_to_cpu(el->l_tree_depth)); | ||
272 | if (ret) { | ||
273 | mlog_errno(ret); | ||
274 | goto out_free; | ||
275 | } | ||
276 | } | ||
277 | |||
278 | ret = 0; | ||
279 | |||
280 | out_free: | ||
281 | if (eb_bh) | ||
282 | brelse(eb_bh); | ||
283 | |||
284 | return ret; | ||
285 | } | ||
286 | |||
287 | /* | ||
288 | * This lookup actually will read from disk. It has one invariant: | ||
289 | * It will never re-traverse blocks. This means that all inserts should | ||
290 | * be new regions or more granular regions (both allowed by insert). | ||
291 | */ | ||
292 | static int ocfs2_extent_map_lookup_read(struct inode *inode, | ||
293 | u32 cpos, | ||
294 | u32 clusters, | ||
295 | struct ocfs2_extent_map_entry **ret_ent) | ||
296 | { | ||
297 | int ret; | ||
298 | u64 blkno; | ||
299 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
300 | struct ocfs2_extent_map_entry *ent; | ||
301 | struct buffer_head *bh = NULL; | ||
302 | struct ocfs2_extent_block *eb; | ||
303 | struct ocfs2_dinode *di; | ||
304 | struct ocfs2_extent_list *el; | ||
305 | |||
306 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
307 | ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); | ||
308 | if (ent) { | ||
309 | if (!ent->e_tree_depth) { | ||
310 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
311 | *ret_ent = ent; | ||
312 | return 0; | ||
313 | } | ||
314 | blkno = le64_to_cpu(ent->e_rec.e_blkno); | ||
315 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
316 | |||
317 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, | ||
318 | OCFS2_BH_CACHED, inode); | ||
319 | if (ret) { | ||
320 | mlog_errno(ret); | ||
321 | if (bh) | ||
322 | brelse(bh); | ||
323 | return ret; | ||
324 | } | ||
325 | eb = (struct ocfs2_extent_block *)bh->b_data; | ||
326 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
327 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
328 | brelse(bh); | ||
329 | return -EIO; | ||
330 | } | ||
331 | el = &eb->h_list; | ||
332 | } else { | ||
333 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
334 | |||
335 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
336 | OCFS2_I(inode)->ip_blkno, &bh, | ||
337 | OCFS2_BH_CACHED, inode); | ||
338 | if (ret) { | ||
339 | mlog_errno(ret); | ||
340 | if (bh) | ||
341 | brelse(bh); | ||
342 | return ret; | ||
343 | } | ||
344 | di = (struct ocfs2_dinode *)bh->b_data; | ||
345 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
346 | brelse(bh); | ||
347 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di); | ||
348 | return -EIO; | ||
349 | } | ||
350 | el = &di->id2.i_list; | ||
351 | } | ||
352 | |||
353 | ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); | ||
354 | brelse(bh); | ||
355 | if (ret) { | ||
356 | mlog_errno(ret); | ||
357 | return ret; | ||
358 | } | ||
359 | |||
360 | ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); | ||
361 | if (!ent) { | ||
362 | ret = -ESRCH; | ||
363 | mlog_errno(ret); | ||
364 | return ret; | ||
365 | } | ||
366 | |||
367 | if (ent->e_tree_depth) | ||
368 | BUG(); /* FIXME: Make sure this isn't a corruption */ | ||
369 | |||
370 | *ret_ent = ent; | ||
371 | |||
372 | return 0; | ||
373 | } | ||
374 | |||
375 | /* | ||
376 | * Callers must hold ip_lock. This can insert pieces of the tree, | ||
377 | * thus racing lookup if the lock weren't held. | ||
378 | */ | ||
379 | static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, | ||
380 | struct ocfs2_extent_map_entry *ent) | ||
381 | { | ||
382 | struct rb_node **p, *parent; | ||
383 | struct ocfs2_extent_map_entry *old_ent; | ||
384 | |||
385 | old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), | ||
386 | le32_to_cpu(ent->e_rec.e_clusters), | ||
387 | &p, &parent); | ||
388 | if (old_ent) | ||
389 | return -EEXIST; | ||
390 | |||
391 | rb_link_node(&ent->e_node, parent, p); | ||
392 | rb_insert_color(&ent->e_node, &em->em_extents); | ||
393 | |||
394 | return 0; | ||
395 | } | ||
396 | |||
397 | |||
398 | /* | ||
399 | * Simple rule: on any return code other than -EAGAIN, anything left | ||
400 | * in the insert_context will be freed. | ||
401 | */ | ||
402 | static int ocfs2_extent_map_try_insert(struct inode *inode, | ||
403 | struct ocfs2_extent_rec *rec, | ||
404 | int tree_depth, | ||
405 | struct ocfs2_em_insert_context *ctxt) | ||
406 | { | ||
407 | int ret; | ||
408 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
409 | struct ocfs2_extent_map_entry *old_ent; | ||
410 | |||
411 | ctxt->need_left = 0; | ||
412 | ctxt->need_right = 0; | ||
413 | ctxt->old_ent = NULL; | ||
414 | |||
415 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
416 | ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); | ||
417 | if (!ret) { | ||
418 | ctxt->new_ent = NULL; | ||
419 | goto out_unlock; | ||
420 | } | ||
421 | |||
422 | old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), | ||
423 | le32_to_cpu(rec->e_clusters), NULL, | ||
424 | NULL); | ||
425 | |||
426 | if (!old_ent) | ||
427 | BUG(); | ||
428 | |||
429 | ret = -EEXIST; | ||
430 | if (old_ent->e_tree_depth < tree_depth) | ||
431 | goto out_unlock; | ||
432 | |||
433 | if (old_ent->e_tree_depth == tree_depth) { | ||
434 | if (!memcmp(rec, &old_ent->e_rec, | ||
435 | sizeof(struct ocfs2_extent_rec))) | ||
436 | ret = 0; | ||
437 | |||
438 | /* FIXME: Should this be ESRCH/EBADR??? */ | ||
439 | goto out_unlock; | ||
440 | } | ||
441 | |||
442 | /* | ||
443 | * We do it in this order specifically so that no actual tree | ||
444 | * changes occur until we have all the pieces we need. We | ||
445 | * don't want malloc failures to leave an inconsistent tree. | ||
446 | * Whenever we drop the lock, another process could be | ||
447 | * inserting. Also note that, if another process just beat us | ||
448 | * to an insert, we might not need the same pieces we needed | ||
449 | * the first go round. In the end, the pieces we need will | ||
450 | * be used, and the pieces we don't will be freed. | ||
451 | */ | ||
452 | ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > | ||
453 | le32_to_cpu(old_ent->e_rec.e_cpos)); | ||
454 | ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + | ||
455 | le32_to_cpu(old_ent->e_rec.e_clusters)) > | ||
456 | (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); | ||
457 | ret = -EAGAIN; | ||
458 | if (ctxt->need_left) { | ||
459 | if (!ctxt->left_ent) | ||
460 | goto out_unlock; | ||
461 | *(ctxt->left_ent) = *old_ent; | ||
462 | ctxt->left_ent->e_rec.e_clusters = | ||
463 | cpu_to_le32(le32_to_cpu(rec->e_cpos) - | ||
464 | le32_to_cpu(ctxt->left_ent->e_rec.e_cpos)); | ||
465 | } | ||
466 | if (ctxt->need_right) { | ||
467 | if (!ctxt->right_ent) | ||
468 | goto out_unlock; | ||
469 | *(ctxt->right_ent) = *old_ent; | ||
470 | ctxt->right_ent->e_rec.e_cpos = | ||
471 | cpu_to_le32(le32_to_cpu(rec->e_cpos) + | ||
472 | le32_to_cpu(rec->e_clusters)); | ||
473 | ctxt->right_ent->e_rec.e_clusters = | ||
474 | cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + | ||
475 | le32_to_cpu(old_ent->e_rec.e_clusters)) - | ||
476 | le32_to_cpu(ctxt->right_ent->e_rec.e_cpos)); | ||
477 | } | ||
478 | |||
479 | rb_erase(&old_ent->e_node, &em->em_extents); | ||
480 | /* Now that he's erased, set him up for deletion */ | ||
481 | ctxt->old_ent = old_ent; | ||
482 | |||
483 | if (ctxt->need_left) { | ||
484 | ret = ocfs2_extent_map_insert_entry(em, | ||
485 | ctxt->left_ent); | ||
486 | if (ret) | ||
487 | goto out_unlock; | ||
488 | ctxt->left_ent = NULL; | ||
489 | } | ||
490 | |||
491 | if (ctxt->need_right) { | ||
492 | ret = ocfs2_extent_map_insert_entry(em, | ||
493 | ctxt->right_ent); | ||
494 | if (ret) | ||
495 | goto out_unlock; | ||
496 | ctxt->right_ent = NULL; | ||
497 | } | ||
498 | |||
499 | ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); | ||
500 | |||
501 | if (!ret) | ||
502 | ctxt->new_ent = NULL; | ||
503 | |||
504 | out_unlock: | ||
505 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
506 | |||
507 | return ret; | ||
508 | } | ||
509 | |||
510 | |||
511 | static int ocfs2_extent_map_insert(struct inode *inode, | ||
512 | struct ocfs2_extent_rec *rec, | ||
513 | int tree_depth) | ||
514 | { | ||
515 | int ret; | ||
516 | struct ocfs2_em_insert_context ctxt = {0, }; | ||
517 | |||
518 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > | ||
519 | OCFS2_I(inode)->ip_map.em_clusters) { | ||
520 | ret = -EBADR; | ||
521 | mlog_errno(ret); | ||
522 | return ret; | ||
523 | } | ||
524 | |||
525 | /* Zero e_clusters means a truncated tail record. It better be EOF */ | ||
526 | if (!rec->e_clusters) { | ||
527 | if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != | ||
528 | OCFS2_I(inode)->ip_map.em_clusters) { | ||
529 | ret = -EBADR; | ||
530 | mlog_errno(ret); | ||
531 | return ret; | ||
532 | } | ||
533 | |||
534 | /* Ignore the truncated tail */ | ||
535 | return 0; | ||
536 | } | ||
537 | |||
538 | ret = -ENOMEM; | ||
539 | ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
540 | GFP_KERNEL); | ||
541 | if (!ctxt.new_ent) { | ||
542 | mlog_errno(ret); | ||
543 | return ret; | ||
544 | } | ||
545 | |||
546 | ctxt.new_ent->e_rec = *rec; | ||
547 | ctxt.new_ent->e_tree_depth = tree_depth; | ||
548 | |||
549 | do { | ||
550 | ret = -ENOMEM; | ||
551 | if (ctxt.need_left && !ctxt.left_ent) { | ||
552 | ctxt.left_ent = | ||
553 | kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
554 | GFP_KERNEL); | ||
555 | if (!ctxt.left_ent) | ||
556 | break; | ||
557 | } | ||
558 | if (ctxt.need_right && !ctxt.right_ent) { | ||
559 | ctxt.right_ent = | ||
560 | kmem_cache_alloc(ocfs2_em_ent_cachep, | ||
561 | GFP_KERNEL); | ||
562 | if (!ctxt.right_ent) | ||
563 | break; | ||
564 | } | ||
565 | |||
566 | ret = ocfs2_extent_map_try_insert(inode, rec, | ||
567 | tree_depth, &ctxt); | ||
568 | } while (ret == -EAGAIN); | ||
569 | |||
570 | if (ret < 0) | ||
571 | mlog_errno(ret); | ||
572 | |||
573 | if (ctxt.left_ent) | ||
574 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent); | ||
575 | if (ctxt.right_ent) | ||
576 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent); | ||
577 | if (ctxt.old_ent) | ||
578 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent); | ||
579 | if (ctxt.new_ent) | ||
580 | kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent); | ||
581 | |||
582 | return ret; | ||
583 | } | ||
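
Taken together, ocfs2_extent_map_insert() and ocfs2_extent_map_try_insert() implement a familiar pattern: discover under the spinlock which helper entries a split will need, drop the lock to allocate them, and retry until an attempt finds every piece already in hand. A rough userspace model of that -EAGAIN loop, with the locking and the tree elided and all names hypothetical:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct ctx { int need_left, need_right; void *left, *right; };

/* Stands in for the locked attempt: the first call "discovers" that the
 * existing entry must be split, later calls succeed once both split
 * pieces have been allocated. */
static int try_insert(struct ctx *c)
{
        if (!c->need_left) {
                c->need_left = c->need_right = 1;
                return -EAGAIN;
        }
        if (!c->left || !c->right)
                return -EAGAIN;
        return 0;       /* the real code would link the pieces into the tree */
}

int main(void)
{
        struct ctx c = { 0, 0, NULL, NULL };
        int ret;

        do {
                /* Allocate any missing pieces with no lock held... */
                ret = -ENOMEM;
                if (c.need_left && !c.left && !(c.left = malloc(16)))
                        break;
                if (c.need_right && !c.right && !(c.right = malloc(16)))
                        break;
                /* ...then retake the lock and try again. */
                ret = try_insert(&c);
        } while (ret == -EAGAIN);

        /* In the kernel, only the unused leftovers would be freed here. */
        free(c.left);
        free(c.right);
        printf("insert returned %d\n", ret);
        return 0;
}
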
584 | |||
585 | /* | ||
586 | * Append this record to the tail of the extent map. It must be | ||
587 | * tree_depth 0. The record might be an extension of an existing | ||
588 | * record, and that case needs to be handled. For example: | ||
589 | * | ||
590 | * Existing record in the extent map: | ||
591 | * | ||
592 | * cpos = 10, len = 10 | ||
593 | * |---------| | ||
594 | * | ||
595 | * New Record: | ||
596 | * | ||
597 | * cpos = 10, len = 20 | ||
598 | * |------------------| | ||
599 | * | ||
600 | * The passed record is the new on-disk record. The new_clusters value | ||
601 | * is how many clusters were added to the file. If the append is | ||
602 | * contiguous, new_clusters has been added to | ||
603 | * rec->e_clusters. If the append is an entirely new extent, then | ||
604 | * rec->e_clusters equals new_clusters. | ||
605 | */ | ||
606 | int ocfs2_extent_map_append(struct inode *inode, | ||
607 | struct ocfs2_extent_rec *rec, | ||
608 | u32 new_clusters) | ||
609 | { | ||
610 | int ret; | ||
611 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
612 | struct ocfs2_extent_map_entry *ent; | ||
613 | struct ocfs2_extent_rec *old; | ||
614 | |||
615 | BUG_ON(!new_clusters); | ||
616 | BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); | ||
617 | |||
618 | if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { | ||
619 | /* | ||
620 | * Size changed underneath us on disk. Drop any | ||
621 | * straddling records and update our idea of | ||
622 | * i_clusters | ||
623 | */ | ||
624 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
625 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
626 | } | ||
627 | |||
628 | mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + | ||
629 | le32_to_cpu(rec->e_clusters)) != | ||
630 | (em->em_clusters + new_clusters), | ||
631 | "Inode %"MLFu64":\n" | ||
632 | "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" | ||
633 | "em->em_clusters = %u + new_clusters = %u = %u\n", | ||
634 | OCFS2_I(inode)->ip_blkno, | ||
635 | le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), | ||
636 | le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), | ||
637 | em->em_clusters, new_clusters, | ||
638 | em->em_clusters + new_clusters); | ||
639 | |||
640 | em->em_clusters += new_clusters; | ||
641 | |||
642 | ret = -ENOENT; | ||
643 | if (le32_to_cpu(rec->e_clusters) > new_clusters) { | ||
644 | /* This is a contiguous append */ | ||
645 | ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, | ||
646 | NULL, NULL); | ||
647 | if (ent) { | ||
648 | old = &ent->e_rec; | ||
649 | BUG_ON((le32_to_cpu(rec->e_cpos) + | ||
650 | le32_to_cpu(rec->e_clusters)) != | ||
651 | (le32_to_cpu(old->e_cpos) + | ||
652 | le32_to_cpu(old->e_clusters) + | ||
653 | new_clusters)); | ||
654 | if (ent->e_tree_depth == 0) { | ||
655 | BUG_ON(le32_to_cpu(old->e_cpos) != | ||
656 | le32_to_cpu(rec->e_cpos)); | ||
657 | BUG_ON(le64_to_cpu(old->e_blkno) != | ||
658 | le64_to_cpu(rec->e_blkno)); | ||
659 | ret = 0; | ||
660 | } | ||
661 | /* | ||
662 | * Let non-leafs fall through as -ENOENT to | ||
663 | * force insertion of the new leaf. | ||
664 | */ | ||
665 | le32_add_cpu(&old->e_clusters, new_clusters); | ||
666 | } | ||
667 | } | ||
668 | |||
669 | if (ret == -ENOENT) | ||
670 | ret = ocfs2_extent_map_insert(inode, rec, 0); | ||
671 | if (ret < 0) | ||
672 | mlog_errno(ret); | ||
673 | return ret; | ||
674 | } | ||
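
With the comment above in mind, the two append cases are told apart by a single comparison: rec->e_clusters greater than new_clusters means the record grew an existing extent, equality means a brand-new extent. A tiny check using the numbers from the example (helper hypothetical):

#include <stdint.h>
#include <stdio.h>

/* Mirrors the test in ocfs2_extent_map_append(): contiguous iff the
 * record covers more than just the newly added clusters. */
static const char *append_kind(uint32_t e_clusters, uint32_t new_clusters)
{
        return e_clusters > new_clusters ? "contiguous append" : "new extent";
}

int main(void)
{
        /* Existing record of length 10 grew to 20: 10 clusters were added. */
        printf("%s\n", append_kind(20, 10));    /* contiguous append */
        /* Entirely new record: e_clusters == new_clusters. */
        printf("%s\n", append_kind(10, 10));    /* new extent */
        return 0;
}
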
675 | |||
676 | #if 0 | ||
677 | /* Code here is included but compiled out, as it completes the extent | ||
678 | * map API and may be used in the future. */ | ||
679 | |||
680 | /* | ||
681 | * Look up the record containing this cluster offset. This record is | ||
682 | * part of the extent map. Do not free it. Any changes you make to | ||
683 | * it will reflect in the extent map. So, if your last extent | ||
684 | * is (cpos = 10, clusters = 10) and you truncate the file by 5 | ||
685 | * clusters, you can do: | ||
686 | * | ||
687 | * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); | ||
688 | * rec->e_clusters -= 5; | ||
689 | * | ||
690 | * The lookup does not read from disk. If the map isn't filled in for | ||
691 | * an entry, you won't find it. | ||
692 | * | ||
693 | * Also note that the returned record is valid until alloc_sem is | ||
694 | * dropped. After that, truncate and extend can happen. Caveat Emptor. | ||
695 | */ | ||
696 | int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, | ||
697 | struct ocfs2_extent_rec **rec, | ||
698 | int *tree_depth) | ||
699 | { | ||
700 | int ret = -ENOENT; | ||
701 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
702 | struct ocfs2_extent_map_entry *ent; | ||
703 | |||
704 | *rec = NULL; | ||
705 | |||
706 | if (cpos >= OCFS2_I(inode)->ip_clusters) | ||
707 | return -EINVAL; | ||
708 | |||
709 | if (cpos >= em->em_clusters) { | ||
710 | /* | ||
711 | * Size changed underneath us on disk. Drop any | ||
712 | * straddling records and update our idea of | ||
713 | * i_clusters | ||
714 | */ | ||
715 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
716 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
717 | } | ||
718 | |||
719 | ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, | ||
720 | NULL, NULL); | ||
721 | |||
722 | if (ent) { | ||
723 | *rec = &ent->e_rec; | ||
724 | if (tree_depth) | ||
725 | *tree_depth = ent->e_tree_depth; | ||
726 | ret = 0; | ||
727 | } | ||
728 | |||
729 | return ret; | ||
730 | } | ||
731 | |||
732 | int ocfs2_extent_map_get_clusters(struct inode *inode, | ||
733 | u32 v_cpos, int count, | ||
734 | u32 *p_cpos, int *ret_count) | ||
735 | { | ||
736 | int ret; | ||
737 | u32 coff, ccount; | ||
738 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
739 | struct ocfs2_extent_map_entry *ent = NULL; | ||
740 | |||
741 | *p_cpos = ccount = 0; | ||
742 | |||
743 | if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) | ||
744 | return -EINVAL; | ||
745 | |||
746 | if ((v_cpos + count) > em->em_clusters) { | ||
747 | /* | ||
748 | * Size changed underneath us on disk. Drop any | ||
749 | * straddling records and update our idea of | ||
750 | * i_clusters | ||
751 | */ | ||
752 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
753 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
754 | } | ||
755 | |||
756 | |||
757 | ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); | ||
758 | if (ret) | ||
759 | return ret; | ||
760 | |||
761 | if (ent) { | ||
762 | /* We should never find ourselves straddling an interval */ | ||
763 | if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, | ||
764 | v_cpos, | ||
765 | count)) | ||
766 | return -ESRCH; | ||
767 | |||
768 | coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); | ||
769 | *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, | ||
770 | le64_to_cpu(ent->e_rec.e_blkno)) + | ||
771 | coff; | ||
772 | |||
773 | if (ret_count) | ||
774 | *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; | ||
775 | |||
776 | return 0; | ||
777 | } | ||
778 | |||
779 | |||
780 | return -ENOENT; | ||
781 | } | ||
782 | |||
783 | #endif /* 0 */ | ||
784 | |||
785 | int ocfs2_extent_map_get_blocks(struct inode *inode, | ||
786 | u64 v_blkno, int count, | ||
787 | u64 *p_blkno, int *ret_count) | ||
788 | { | ||
789 | int ret; | ||
790 | u64 boff; | ||
791 | u32 cpos, clusters; | ||
792 | int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); | ||
793 | struct ocfs2_extent_map_entry *ent = NULL; | ||
794 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
795 | struct ocfs2_extent_rec *rec; | ||
796 | |||
797 | *p_blkno = 0; | ||
798 | |||
799 | cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); | ||
800 | clusters = ocfs2_blocks_to_clusters(inode->i_sb, | ||
801 | (u64)count + bpc - 1); | ||
802 | if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) { | ||
803 | ret = -EINVAL; | ||
804 | mlog_errno(ret); | ||
805 | return ret; | ||
806 | } | ||
807 | |||
808 | if ((cpos + clusters) > em->em_clusters) { | ||
809 | /* | ||
810 | * Size changed underneath us on disk. Drop any | ||
811 | * straddling records and update our idea of | ||
812 | * i_clusters | ||
813 | */ | ||
814 | ocfs2_extent_map_drop(inode, em->em_clusters - 1); | ||
815 | em->em_clusters = OCFS2_I(inode)->ip_clusters; | ||
816 | } | ||
817 | |||
818 | ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); | ||
819 | if (ret) { | ||
820 | mlog_errno(ret); | ||
821 | return ret; | ||
822 | } | ||
823 | |||
824 | if (ent) { | ||
825 | |||
826 | rec = &ent->e_rec; | ||
827 | |||
828 | /* We should never find ourselves straddling an interval */ | ||
829 | if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { | ||
830 | ret = -ESRCH; | ||
831 | mlog_errno(ret); | ||
832 | return ret; | ||
833 | } | ||
834 | |||
835 | boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - | ||
836 | le32_to_cpu(rec->e_cpos)); | ||
837 | boff += (v_blkno & (u64)(bpc - 1)); | ||
838 | *p_blkno = le64_to_cpu(rec->e_blkno) + boff; | ||
839 | |||
840 | if (ret_count) { | ||
841 | *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, | ||
842 | le32_to_cpu(rec->e_clusters)) - boff; | ||
843 | } | ||
844 | |||
845 | return 0; | ||
846 | } | ||
847 | |||
848 | return -ENOENT; | ||
849 | } | ||
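
The translation just performed is plain cluster arithmetic: convert the virtual block to a cluster position, find the record covering it, then add the whole-cluster distance into the extent plus the sub-cluster block remainder. A standalone sketch assuming eight filesystem blocks per cluster (struct and names hypothetical):

#include <stdint.h>
#include <stdio.h>

#define BPC 8   /* assumed blocks per cluster (a power of two) */

struct rec { uint32_t e_cpos, e_clusters; uint64_t e_blkno; };

/* Map a virtual block to a physical block through one extent record. */
static uint64_t map_block(const struct rec *r, uint64_t v_blkno)
{
        uint32_t cpos = (uint32_t)(v_blkno / BPC);          /* blocks -> clusters  */
        uint64_t boff = (uint64_t)(cpos - r->e_cpos) * BPC; /* whole clusters in   */

        boff += v_blkno & (BPC - 1);                        /* sub-cluster offset  */
        return r->e_blkno + boff;
}

int main(void)
{
        /* Extent: clusters [4, 14) live at physical block 1000. */
        struct rec r = { 4, 10, 1000 };

        /* Virtual block 37 = cluster 4 plus 5 blocks -> physical block 1005. */
        printf("v_blkno 37 -> p_blkno %llu\n",
               (unsigned long long)map_block(&r, 37));
        return 0;
}
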
850 | |||
851 | int ocfs2_extent_map_init(struct inode *inode) | ||
852 | { | ||
853 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
854 | |||
855 | em->em_extents = RB_ROOT; | ||
856 | em->em_clusters = 0; | ||
857 | |||
858 | return 0; | ||
859 | } | ||
860 | |||
861 | /* Needs the lock */ | ||
862 | static void __ocfs2_extent_map_drop(struct inode *inode, | ||
863 | u32 new_clusters, | ||
864 | struct rb_node **free_head, | ||
865 | struct ocfs2_extent_map_entry **tail_ent) | ||
866 | { | ||
867 | struct rb_node *node, *next; | ||
868 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
869 | struct ocfs2_extent_map_entry *ent; | ||
870 | |||
871 | *free_head = NULL; | ||
872 | |||
873 | ent = NULL; | ||
874 | node = rb_last(&em->em_extents); | ||
875 | while (node) { | ||
876 | |||
877 | next = rb_prev(node); | ||
878 | |||
879 | ent = rb_entry(node, struct ocfs2_extent_map_entry, | ||
880 | e_node); | ||
881 | if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) | ||
882 | break; | ||
883 | |||
884 | rb_erase(&ent->e_node, &em->em_extents); | ||
885 | |||
886 | node->rb_right = *free_head; | ||
887 | *free_head = node; | ||
888 | |||
889 | ent = NULL; | ||
890 | node = next; | ||
891 | } | ||
892 | |||
893 | /* Do we have an entry straddling new_clusters? */ | ||
894 | if (tail_ent) { | ||
895 | if (ent && | ||
896 | ((le32_to_cpu(ent->e_rec.e_cpos) + | ||
897 | le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters)) | ||
898 | *tail_ent = ent; | ||
899 | else | ||
900 | *tail_ent = NULL; | ||
901 | } | ||
902 | } | ||
903 | |||
904 | static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) | ||
905 | { | ||
906 | struct rb_node *node; | ||
907 | struct ocfs2_extent_map_entry *ent; | ||
908 | |||
909 | while (free_head) { | ||
910 | node = free_head; | ||
911 | free_head = node->rb_right; | ||
912 | |||
913 | ent = rb_entry(node, struct ocfs2_extent_map_entry, | ||
914 | e_node); | ||
915 | kmem_cache_free(ocfs2_em_ent_cachep, ent); | ||
916 | } | ||
917 | } | ||
918 | |||
919 | /* | ||
920 | * Remove all entries past new_clusters, inclusive of an entry that | ||
921 | * contains new_clusters. This is effectively a cache forget. | ||
922 | * | ||
923 | * If you want to also clip the last extent by some number of clusters, | ||
924 | * you need to call ocfs2_extent_map_trunc(). | ||
925 | * This code does not check or modify ip_clusters. | ||
926 | */ | ||
927 | int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) | ||
928 | { | ||
929 | struct rb_node *free_head = NULL; | ||
930 | struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; | ||
931 | struct ocfs2_extent_map_entry *ent; | ||
932 | |||
933 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
934 | |||
935 | __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); | ||
936 | |||
937 | if (ent) { | ||
938 | rb_erase(&ent->e_node, &em->em_extents); | ||
939 | ent->e_node.rb_right = free_head; | ||
940 | free_head = &ent->e_node; | ||
941 | } | ||
942 | |||
943 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
944 | |||
945 | if (free_head) | ||
946 | __ocfs2_extent_map_drop_cleanup(free_head); | ||
947 | |||
948 | return 0; | ||
949 | } | ||
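
One detail worth calling out in __ocfs2_extent_map_drop(): once a node has been erased from the rbtree, its rb_right pointer is dead storage, so the code threads a singly linked free list through it and defers every kmem_cache_free() until after the spinlock is dropped. A userspace model of reusing a dead link field that way (structure hypothetical):

#include <stdio.h>
#include <stdlib.h>

struct node { struct node *left, *right; int key; };

int main(void)
{
        struct node *free_head = NULL, *n;
        int i;

        /* Pretend these were just erased from a tree while a lock was
         * held: chain them through the now-unused right pointer. */
        for (i = 0; i < 3; i++) {
                n = malloc(sizeof(*n));
                if (!n)
                        return 1;
                n->key = i;
                n->right = free_head;
                free_head = n;
        }

        /* "Lock dropped": now walk the chain and free at leisure. */
        while (free_head) {
                n = free_head;
                free_head = n->right;
                printf("freeing node %d\n", n->key);
                free(n);
        }
        return 0;
}
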
950 | |||
951 | /* | ||
952 | * Remove all entries past new_clusters and also clip any extent | ||
953 | * straddling new_clusters, if there is one. This does not check | ||
954 | * or modify ip_clusters | ||
955 | */ | ||
956 | int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) | ||
957 | { | ||
958 | struct rb_node *free_head = NULL; | ||
959 | struct ocfs2_extent_map_entry *ent = NULL; | ||
960 | |||
961 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
962 | |||
963 | __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); | ||
964 | |||
965 | if (ent) | ||
966 | ent->e_rec.e_clusters = cpu_to_le32(new_clusters - | ||
967 | le32_to_cpu(ent->e_rec.e_cpos)); | ||
968 | |||
969 | OCFS2_I(inode)->ip_map.em_clusters = new_clusters; | ||
970 | |||
971 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
972 | |||
973 | if (free_head) | ||
974 | __ocfs2_extent_map_drop_cleanup(free_head); | ||
975 | |||
976 | return 0; | ||
977 | } | ||
978 | |||
979 | int __init init_ocfs2_extent_maps(void) | ||
980 | { | ||
981 | ocfs2_em_ent_cachep = | ||
982 | kmem_cache_create("ocfs2_em_ent", | ||
983 | sizeof(struct ocfs2_extent_map_entry), | ||
984 | 0, SLAB_HWCACHE_ALIGN, NULL, NULL); | ||
985 | if (!ocfs2_em_ent_cachep) | ||
986 | return -ENOMEM; | ||
987 | |||
988 | return 0; | ||
989 | } | ||
990 | |||
991 | void __exit exit_ocfs2_extent_maps(void) | ||
992 | { | ||
993 | kmem_cache_destroy(ocfs2_em_ent_cachep); | ||
994 | } | ||
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h new file mode 100644 index 000000000000..fa3745efa886 --- /dev/null +++ b/fs/ocfs2/extent_map.h | |||
@@ -0,0 +1,46 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * extent_map.h | ||
5 | * | ||
6 | * In-memory file extent mappings for OCFS2. | ||
7 | * | ||
8 | * Copyright (C) 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License, version 2, as published by the Free Software Foundation. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
17 | * General Public License for more details. | ||
18 | * | ||
19 | * You should have received a copy of the GNU General Public | ||
20 | * License along with this program; if not, write to the | ||
21 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
22 | * Boston, MA 021110-1307, USA. | ||
23 | */ | ||
24 | |||
25 | #ifndef _EXTENT_MAP_H | ||
26 | #define _EXTENT_MAP_H | ||
27 | |||
28 | int init_ocfs2_extent_maps(void); | ||
29 | void exit_ocfs2_extent_maps(void); | ||
30 | |||
31 | /* | ||
32 | * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem | ||
33 | * to be held. The allocation cannot change at all while the map is | ||
34 | * in the process of being updated. | ||
35 | */ | ||
36 | int ocfs2_extent_map_init(struct inode *inode); | ||
37 | int ocfs2_extent_map_append(struct inode *inode, | ||
38 | struct ocfs2_extent_rec *rec, | ||
39 | u32 new_clusters); | ||
40 | int ocfs2_extent_map_get_blocks(struct inode *inode, | ||
41 | u64 v_blkno, int count, | ||
42 | u64 *p_blkno, int *ret_count); | ||
43 | int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); | ||
44 | int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); | ||
45 | |||
46 | #endif /* _EXTENT_MAP_H */ | ||
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c new file mode 100644 index 000000000000..72ae9e3306f4 --- /dev/null +++ b/fs/ocfs2/file.c | |||
@@ -0,0 +1,1237 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * file.c | ||
5 | * | ||
6 | * File open, close, extend, truncate | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/pagemap.h> | ||
31 | #include <linux/uio.h> | ||
32 | |||
33 | #define MLOG_MASK_PREFIX ML_INODE | ||
34 | #include <cluster/masklog.h> | ||
35 | |||
36 | #include "ocfs2.h" | ||
37 | |||
38 | #include "alloc.h" | ||
39 | #include "aops.h" | ||
40 | #include "dir.h" | ||
41 | #include "dlmglue.h" | ||
42 | #include "extent_map.h" | ||
43 | #include "file.h" | ||
44 | #include "sysfile.h" | ||
45 | #include "inode.h" | ||
46 | #include "journal.h" | ||
47 | #include "mmap.h" | ||
48 | #include "suballoc.h" | ||
49 | #include "super.h" | ||
50 | |||
51 | #include "buffer_head_io.h" | ||
52 | |||
53 | static int ocfs2_sync_inode(struct inode *inode) | ||
54 | { | ||
55 | filemap_fdatawrite(inode->i_mapping); | ||
56 | return sync_mapping_buffers(inode->i_mapping); | ||
57 | } | ||
58 | |||
59 | static int ocfs2_file_open(struct inode *inode, struct file *file) | ||
60 | { | ||
61 | int status; | ||
62 | int mode = file->f_flags; | ||
63 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
64 | |||
65 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, | ||
66 | file->f_dentry->d_name.len, file->f_dentry->d_name.name); | ||
67 | |||
68 | spin_lock(&oi->ip_lock); | ||
69 | |||
70 | /* Check that the inode hasn't been wiped from disk by another | ||
71 | * node. If it hasn't then we're safe as long as we hold the | ||
72 | * spin lock until our increment of open count. */ | ||
73 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { | ||
74 | spin_unlock(&oi->ip_lock); | ||
75 | |||
76 | status = -ENOENT; | ||
77 | goto leave; | ||
78 | } | ||
79 | |||
80 | if (mode & O_DIRECT) | ||
81 | oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; | ||
82 | |||
83 | oi->ip_open_count++; | ||
84 | spin_unlock(&oi->ip_lock); | ||
85 | status = 0; | ||
86 | leave: | ||
87 | mlog_exit(status); | ||
88 | return status; | ||
89 | } | ||
90 | |||
91 | static int ocfs2_file_release(struct inode *inode, struct file *file) | ||
92 | { | ||
93 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
94 | |||
95 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, | ||
96 | file->f_dentry->d_name.len, | ||
97 | file->f_dentry->d_name.name); | ||
98 | |||
99 | spin_lock(&oi->ip_lock); | ||
100 | if (!--oi->ip_open_count) | ||
101 | oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; | ||
102 | spin_unlock(&oi->ip_lock); | ||
103 | |||
104 | mlog_exit(0); | ||
105 | |||
106 | return 0; | ||
107 | } | ||
108 | |||
109 | static int ocfs2_sync_file(struct file *file, | ||
110 | struct dentry *dentry, | ||
111 | int datasync) | ||
112 | { | ||
113 | int err = 0; | ||
114 | journal_t *journal; | ||
115 | struct inode *inode = dentry->d_inode; | ||
116 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
117 | |||
118 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, | ||
119 | dentry->d_name.len, dentry->d_name.name); | ||
120 | |||
121 | err = ocfs2_sync_inode(dentry->d_inode); | ||
122 | if (err) | ||
123 | goto bail; | ||
124 | |||
125 | journal = osb->journal->j_journal; | ||
126 | err = journal_force_commit(journal); | ||
127 | |||
128 | bail: | ||
129 | mlog_exit(err); | ||
130 | |||
131 | return (err < 0) ? -EIO : 0; | ||
132 | } | ||
133 | |||
134 | int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, | ||
135 | struct inode *inode, | ||
136 | struct buffer_head *fe_bh, | ||
137 | u64 new_i_size) | ||
138 | { | ||
139 | int status; | ||
140 | |||
141 | mlog_entry_void(); | ||
142 | i_size_write(inode, new_i_size); | ||
143 | inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); | ||
144 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | ||
145 | |||
146 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | ||
147 | if (status < 0) { | ||
148 | mlog_errno(status); | ||
149 | goto bail; | ||
150 | } | ||
151 | |||
152 | bail: | ||
153 | mlog_exit(status); | ||
154 | return status; | ||
155 | } | ||
156 | |||
157 | static int ocfs2_simple_size_update(struct inode *inode, | ||
158 | struct buffer_head *di_bh, | ||
159 | u64 new_i_size) | ||
160 | { | ||
161 | int ret; | ||
162 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
163 | struct ocfs2_journal_handle *handle = NULL; | ||
164 | |||
165 | handle = ocfs2_start_trans(osb, NULL, | ||
166 | OCFS2_INODE_UPDATE_CREDITS); | ||
167 | if (handle == NULL) { | ||
168 | ret = -ENOMEM; | ||
169 | mlog_errno(ret); | ||
170 | goto out; | ||
171 | } | ||
172 | |||
173 | ret = ocfs2_set_inode_size(handle, inode, di_bh, | ||
174 | new_i_size); | ||
175 | if (ret < 0) | ||
176 | mlog_errno(ret); | ||
177 | |||
178 | ocfs2_commit_trans(handle); | ||
179 | out: | ||
180 | return ret; | ||
181 | } | ||
182 | |||
183 | static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, | ||
184 | struct inode *inode, | ||
185 | struct buffer_head *fe_bh, | ||
186 | u64 new_i_size) | ||
187 | { | ||
188 | int status; | ||
189 | struct ocfs2_journal_handle *handle; | ||
190 | |||
191 | mlog_entry_void(); | ||
192 | |||
193 | /* TODO: This needs to actually orphan the inode in this | ||
194 | * transaction. */ | ||
195 | |||
196 | handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); | ||
197 | if (IS_ERR(handle)) { | ||
198 | status = PTR_ERR(handle); | ||
199 | mlog_errno(status); | ||
200 | goto out; | ||
201 | } | ||
202 | |||
203 | status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); | ||
204 | if (status < 0) | ||
205 | mlog_errno(status); | ||
206 | |||
207 | ocfs2_commit_trans(handle); | ||
208 | out: | ||
209 | mlog_exit(status); | ||
210 | return status; | ||
211 | } | ||
212 | |||
213 | static int ocfs2_truncate_file(struct inode *inode, | ||
214 | struct buffer_head *di_bh, | ||
215 | u64 new_i_size) | ||
216 | { | ||
217 | int status = 0; | ||
218 | struct ocfs2_dinode *fe = NULL; | ||
219 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
220 | struct ocfs2_truncate_context *tc = NULL; | ||
221 | |||
222 | mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n", | ||
223 | OCFS2_I(inode)->ip_blkno, new_i_size); | ||
224 | |||
225 | truncate_inode_pages(inode->i_mapping, new_i_size); | ||
226 | |||
227 | fe = (struct ocfs2_dinode *) di_bh->b_data; | ||
228 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
229 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | ||
230 | status = -EIO; | ||
231 | goto bail; | ||
232 | } | ||
233 | |||
234 | mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), | ||
235 | "Inode %"MLFu64", inode i_size = %lld != di " | ||
236 | "i_size = %"MLFu64", i_flags = 0x%x\n", | ||
237 | OCFS2_I(inode)->ip_blkno, | ||
238 | i_size_read(inode), | ||
239 | le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags)); | ||
240 | |||
241 | if (new_i_size > le64_to_cpu(fe->i_size)) { | ||
242 | mlog(0, "asked to truncate file with size (%"MLFu64") " | ||
243 | "to size (%"MLFu64")!\n", | ||
244 | le64_to_cpu(fe->i_size), new_i_size); | ||
245 | status = -EINVAL; | ||
246 | mlog_errno(status); | ||
247 | goto bail; | ||
248 | } | ||
249 | |||
250 | mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n", | ||
251 | le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size); | ||
252 | |||
253 | /* let's handle the simple truncate cases before doing any more | ||
254 | * cluster locking. */ | ||
255 | if (new_i_size == le64_to_cpu(fe->i_size)) | ||
256 | goto bail; | ||
257 | |||
258 | if (le32_to_cpu(fe->i_clusters) == | ||
259 | ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { | ||
260 | mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", | ||
261 | fe->i_clusters); | ||
262 | /* No allocation change is required, so let's fast-path | ||
263 | * this truncate. */ | ||
264 | status = ocfs2_simple_size_update(inode, di_bh, new_i_size); | ||
265 | if (status < 0) | ||
266 | mlog_errno(status); | ||
267 | goto bail; | ||
268 | } | ||
269 | |||
270 | /* This forces other nodes to sync and drop their pages */ | ||
271 | status = ocfs2_data_lock(inode, 1); | ||
272 | if (status < 0) { | ||
273 | mlog_errno(status); | ||
274 | goto bail; | ||
275 | } | ||
276 | ocfs2_data_unlock(inode, 1); | ||
277 | |||
278 | /* alright, we're going to need to do a full blown alloc size | ||
279 | * change. Orphan the inode so that recovery can complete the | ||
280 | * truncate if necessary. This does the task of marking | ||
281 | * i_size. */ | ||
282 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); | ||
283 | if (status < 0) { | ||
284 | mlog_errno(status); | ||
285 | goto bail; | ||
286 | } | ||
287 | |||
288 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); | ||
289 | if (status < 0) { | ||
290 | mlog_errno(status); | ||
291 | goto bail; | ||
292 | } | ||
293 | |||
294 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); | ||
295 | if (status < 0) { | ||
296 | mlog_errno(status); | ||
297 | goto bail; | ||
298 | } | ||
299 | |||
300 | /* TODO: orphan dir cleanup here. */ | ||
301 | bail: | ||
302 | |||
303 | mlog_exit(status); | ||
304 | return status; | ||
305 | } | ||
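
ocfs2_truncate_file() above picks one of three paths: bail out when the size is unchanged, do a fast in-place i_size update when no clusters are released, and otherwise run the full orphan/prepare/commit sequence. A condensed sketch of that decision, assuming a 4KB cluster (all names hypothetical):

#include <stdint.h>
#include <stdio.h>

#define CLUSTER_SIZE 4096u      /* assumed cluster size */

static uint64_t clusters_for_bytes(uint64_t bytes)
{
        return (bytes + CLUSTER_SIZE - 1) / CLUSTER_SIZE;
}

/* Pick the truncate strategy the function above would take. */
static const char *truncate_path(uint64_t i_size, uint64_t i_clusters,
                                 uint64_t new_size)
{
        if (new_size > i_size)
                return "error: grow is not a truncate";
        if (new_size == i_size)
                return "nothing to do";
        if (i_clusters == clusters_for_bytes(new_size))
                return "simple size update";     /* allocation unchanged */
        return "orphan + full truncate";         /* clusters must be freed */
}

int main(void)
{
        printf("%s\n", truncate_path(10000, 3, 9000));  /* simple size update */
        printf("%s\n", truncate_path(10000, 3, 4000));  /* orphan + full truncate */
        return 0;
}
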
306 | |||
307 | /* | ||
308 | * extend allocation only here. | ||
309 | * we'll update all the disk stuff, and oip->alloc_size | ||
310 | * | ||
311 | * expect stuff to be locked, a transaction started and enough data / | ||
312 | * metadata reservations in the contexts. | ||
313 | * | ||
314 | * Will return -EAGAIN, and a reason if a restart is needed. | ||
315 | * If passed in, *reason will always be set, even in error. | ||
316 | */ | ||
317 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | ||
318 | struct inode *inode, | ||
319 | u32 clusters_to_add, | ||
320 | struct buffer_head *fe_bh, | ||
321 | struct ocfs2_journal_handle *handle, | ||
322 | struct ocfs2_alloc_context *data_ac, | ||
323 | struct ocfs2_alloc_context *meta_ac, | ||
324 | enum ocfs2_alloc_restarted *reason_ret) | ||
325 | { | ||
326 | int status = 0; | ||
327 | int free_extents; | ||
328 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
329 | enum ocfs2_alloc_restarted reason = RESTART_NONE; | ||
330 | u32 bit_off, num_bits; | ||
331 | u64 block; | ||
332 | |||
333 | BUG_ON(!clusters_to_add); | ||
334 | |||
335 | free_extents = ocfs2_num_free_extents(osb, inode, fe); | ||
336 | if (free_extents < 0) { | ||
337 | status = free_extents; | ||
338 | mlog_errno(status); | ||
339 | goto leave; | ||
340 | } | ||
341 | |||
342 | /* there are two cases which could cause us to EAGAIN in the | ||
343 | * we-need-more-metadata case: | ||
344 | * 1) we haven't reserved *any* | ||
345 | * 2) we are so fragmented, we've needed to add metadata too | ||
346 | * many times. */ | ||
347 | if (!free_extents && !meta_ac) { | ||
348 | mlog(0, "we haven't reserved any metadata!\n"); | ||
349 | status = -EAGAIN; | ||
350 | reason = RESTART_META; | ||
351 | goto leave; | ||
352 | } else if ((!free_extents) | ||
353 | && (ocfs2_alloc_context_bits_left(meta_ac) | ||
354 | < ocfs2_extend_meta_needed(fe))) { | ||
355 | mlog(0, "filesystem is really fragmented...\n"); | ||
356 | status = -EAGAIN; | ||
357 | reason = RESTART_META; | ||
358 | goto leave; | ||
359 | } | ||
360 | |||
361 | status = ocfs2_claim_clusters(osb, handle, data_ac, 1, | ||
362 | &bit_off, &num_bits); | ||
363 | if (status < 0) { | ||
364 | if (status != -ENOSPC) | ||
365 | mlog_errno(status); | ||
366 | goto leave; | ||
367 | } | ||
368 | |||
369 | BUG_ON(num_bits > clusters_to_add); | ||
370 | |||
371 | /* reserve our write early -- insert_extent may update the inode */ | ||
372 | status = ocfs2_journal_access(handle, inode, fe_bh, | ||
373 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
374 | if (status < 0) { | ||
375 | mlog_errno(status); | ||
376 | goto leave; | ||
377 | } | ||
378 | |||
379 | block = ocfs2_clusters_to_blocks(osb->sb, bit_off); | ||
380 | mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n", | ||
381 | num_bits, bit_off, OCFS2_I(inode)->ip_blkno); | ||
382 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, | ||
383 | num_bits, meta_ac); | ||
384 | if (status < 0) { | ||
385 | mlog_errno(status); | ||
386 | goto leave; | ||
387 | } | ||
388 | |||
389 | le32_add_cpu(&fe->i_clusters, num_bits); | ||
390 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
391 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
392 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
393 | |||
394 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
395 | if (status < 0) { | ||
396 | mlog_errno(status); | ||
397 | goto leave; | ||
398 | } | ||
399 | |||
400 | clusters_to_add -= num_bits; | ||
401 | |||
402 | if (clusters_to_add) { | ||
403 | mlog(0, "need to alloc once more, clusters = %u, wanted = " | ||
404 | "%u\n", fe->i_clusters, clusters_to_add); | ||
405 | status = -EAGAIN; | ||
406 | reason = RESTART_TRANS; | ||
407 | } | ||
408 | |||
409 | leave: | ||
410 | mlog_exit(status); | ||
411 | if (reason_ret) | ||
412 | *reason_ret = reason; | ||
413 | return status; | ||
414 | } | ||
415 | |||
416 | static int ocfs2_extend_allocation(struct inode *inode, | ||
417 | u32 clusters_to_add) | ||
418 | { | ||
419 | int status = 0; | ||
420 | int restart_func = 0; | ||
421 | int drop_alloc_sem = 0; | ||
422 | int credits, num_free_extents; | ||
423 | u32 prev_clusters; | ||
424 | struct buffer_head *bh = NULL; | ||
425 | struct ocfs2_dinode *fe = NULL; | ||
426 | struct ocfs2_journal_handle *handle = NULL; | ||
427 | struct ocfs2_alloc_context *data_ac = NULL; | ||
428 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
429 | enum ocfs2_alloc_restarted why; | ||
430 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
431 | |||
432 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); | ||
433 | |||
434 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, | ||
435 | OCFS2_BH_CACHED, inode); | ||
436 | if (status < 0) { | ||
437 | mlog_errno(status); | ||
438 | goto leave; | ||
439 | } | ||
440 | |||
441 | fe = (struct ocfs2_dinode *) bh->b_data; | ||
442 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
443 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | ||
444 | status = -EIO; | ||
445 | goto leave; | ||
446 | } | ||
447 | |||
448 | restart_all: | ||
449 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); | ||
450 | |||
451 | mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, " | ||
452 | "clusters_to_add = %u\n", | ||
453 | OCFS2_I(inode)->ip_blkno, i_size_read(inode), | ||
454 | fe->i_clusters, clusters_to_add); | ||
455 | |||
456 | handle = ocfs2_alloc_handle(osb); | ||
457 | if (handle == NULL) { | ||
458 | status = -ENOMEM; | ||
459 | mlog_errno(status); | ||
460 | goto leave; | ||
461 | } | ||
462 | |||
463 | num_free_extents = ocfs2_num_free_extents(osb, | ||
464 | inode, | ||
465 | fe); | ||
466 | if (num_free_extents < 0) { | ||
467 | status = num_free_extents; | ||
468 | mlog_errno(status); | ||
469 | goto leave; | ||
470 | } | ||
471 | |||
472 | if (!num_free_extents) { | ||
473 | status = ocfs2_reserve_new_metadata(osb, | ||
474 | handle, | ||
475 | fe, | ||
476 | &meta_ac); | ||
477 | if (status < 0) { | ||
478 | if (status != -ENOSPC) | ||
479 | mlog_errno(status); | ||
480 | goto leave; | ||
481 | } | ||
482 | } | ||
483 | |||
484 | status = ocfs2_reserve_clusters(osb, | ||
485 | handle, | ||
486 | clusters_to_add, | ||
487 | &data_ac); | ||
488 | if (status < 0) { | ||
489 | if (status != -ENOSPC) | ||
490 | mlog_errno(status); | ||
491 | goto leave; | ||
492 | } | ||
493 | |||
494 | /* blocks people in read/write from reading our allocation | ||
495 | * until we're done changing it. We depend on i_sem to block | ||
496 | * other extend/truncate calls while we're here. Ordering wrt | ||
497 | * start_trans is important here -- always do it before! */ | ||
498 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
499 | drop_alloc_sem = 1; | ||
500 | |||
501 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); | ||
502 | handle = ocfs2_start_trans(osb, handle, credits); | ||
503 | if (IS_ERR(handle)) { | ||
504 | status = PTR_ERR(handle); | ||
505 | handle = NULL; | ||
506 | mlog_errno(status); | ||
507 | goto leave; | ||
508 | } | ||
509 | |||
510 | restarted_transaction: | ||
511 | /* reserve a write to the file entry early on - that way if we | ||
512 | * run out of credits in the allocation path, we can still | ||
513 | * update i_size. */ | ||
514 | status = ocfs2_journal_access(handle, inode, bh, | ||
515 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
516 | if (status < 0) { | ||
517 | mlog_errno(status); | ||
518 | goto leave; | ||
519 | } | ||
520 | |||
521 | prev_clusters = OCFS2_I(inode)->ip_clusters; | ||
522 | |||
523 | status = ocfs2_do_extend_allocation(osb, | ||
524 | inode, | ||
525 | clusters_to_add, | ||
526 | bh, | ||
527 | handle, | ||
528 | data_ac, | ||
529 | meta_ac, | ||
530 | &why); | ||
531 | if ((status < 0) && (status != -EAGAIN)) { | ||
532 | if (status != -ENOSPC) | ||
533 | mlog_errno(status); | ||
534 | goto leave; | ||
535 | } | ||
536 | |||
537 | status = ocfs2_journal_dirty(handle, bh); | ||
538 | if (status < 0) { | ||
539 | mlog_errno(status); | ||
540 | goto leave; | ||
541 | } | ||
542 | |||
543 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
544 | clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); | ||
545 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
546 | |||
547 | if (why != RESTART_NONE && clusters_to_add) { | ||
548 | if (why == RESTART_META) { | ||
549 | mlog(0, "restarting function.\n"); | ||
550 | restart_func = 1; | ||
551 | } else { | ||
552 | BUG_ON(why != RESTART_TRANS); | ||
553 | |||
554 | mlog(0, "restarting transaction.\n"); | ||
555 | /* TODO: This can be more intelligent. */ | ||
556 | credits = ocfs2_calc_extend_credits(osb->sb, | ||
557 | fe, | ||
558 | clusters_to_add); | ||
559 | status = ocfs2_extend_trans(handle, credits); | ||
560 | if (status < 0) { | ||
561 | /* handle still has to be committed at | ||
562 | * this point. */ | ||
563 | status = -ENOMEM; | ||
564 | mlog_errno(status); | ||
565 | goto leave; | ||
566 | } | ||
567 | goto restarted_transaction; | ||
568 | } | ||
569 | } | ||
570 | |||
571 | mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n", | ||
572 | fe->i_clusters, fe->i_size); | ||
573 | mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", | ||
574 | OCFS2_I(inode)->ip_clusters, i_size_read(inode)); | ||
575 | |||
576 | leave: | ||
577 | if (drop_alloc_sem) { | ||
578 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
579 | drop_alloc_sem = 0; | ||
580 | } | ||
581 | if (handle) { | ||
582 | ocfs2_commit_trans(handle); | ||
583 | handle = NULL; | ||
584 | } | ||
585 | if (data_ac) { | ||
586 | ocfs2_free_alloc_context(data_ac); | ||
587 | data_ac = NULL; | ||
588 | } | ||
589 | if (meta_ac) { | ||
590 | ocfs2_free_alloc_context(meta_ac); | ||
591 | meta_ac = NULL; | ||
592 | } | ||
593 | if ((!status) && restart_func) { | ||
594 | restart_func = 0; | ||
595 | goto restart_all; | ||
596 | } | ||
597 | if (bh) { | ||
598 | brelse(bh); | ||
599 | bh = NULL; | ||
600 | } | ||
601 | |||
602 | mlog_exit(status); | ||
603 | return status; | ||
604 | } | ||
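
The function above uses a two-level restart scheme: RESTART_TRANS keeps all reservations and merely asks the journal for more credits, while RESTART_META drops everything and jumps back to re-reserve metadata from scratch. Below is a minimal userspace sketch of that control flow; do_some_allocation() is a stand-in for ocfs2_do_extend_allocation() and every name here is illustrative, not an OCFS2 symbol:

	#include <stdio.h>

	enum restart { RESTART_NONE, RESTART_TRANS, RESTART_META };

	/* Pretend allocator: consumes some work and reports how (or
	 * whether) the caller should restart. Purely illustrative. */
	static enum restart do_some_allocation(unsigned *todo)
	{
		if (*todo > 8) { *todo -= 4; return RESTART_META;  }
		if (*todo > 2) { *todo -= 2; return RESTART_TRANS; }
		*todo = 0;
		return RESTART_NONE;
	}

	int main(void)
	{
		unsigned clusters = 13;

	restart_all:
		printf("reserving resources for %u clusters\n", clusters);
	restarted_transaction:
		switch (do_some_allocation(&clusters)) {
		case RESTART_TRANS: /* cheap: keep reservations, extend credits */
			printf("restarting transaction, %u left\n", clusters);
			goto restarted_transaction;
		case RESTART_META: /* expensive: re-reserve from scratch */
			printf("restarting function, %u left\n", clusters);
			goto restart_all;
		case RESTART_NONE:
			break;
		}
		printf("done\n");
		return 0;
	}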
605 | |||
606 | /* Some parts of this taken from generic_cont_expand, which turned out | ||
607 | * to be too fragile to do exactly what we need without us having to | ||
608 | * worry about recursive locking in ->commit_write(). */ | ||
609 | static int ocfs2_write_zero_page(struct inode *inode, | ||
610 | u64 size) | ||
611 | { | ||
612 | struct address_space *mapping = inode->i_mapping; | ||
613 | struct page *page; | ||
614 | unsigned long index; | ||
615 | unsigned int offset; | ||
616 | struct ocfs2_journal_handle *handle = NULL; | ||
617 | int ret; | ||
618 | |||
619 | offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ | ||
620 | /* ugh. in prepare/commit_write, if from==to==start of block, we | ||
621 | ** skip the prepare. make sure we never send an offset for the start | ||
622 | ** of a block | ||
623 | */ | ||
624 | if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { | ||
625 | offset++; | ||
626 | } | ||
627 | index = size >> PAGE_CACHE_SHIFT; | ||
628 | |||
629 | page = grab_cache_page(mapping, index); | ||
630 | if (!page) { | ||
631 | ret = -ENOMEM; | ||
632 | mlog_errno(ret); | ||
633 | goto out; | ||
634 | } | ||
635 | |||
636 | ret = ocfs2_prepare_write(NULL, page, offset, offset); | ||
637 | if (ret < 0) { | ||
638 | mlog_errno(ret); | ||
639 | goto out_unlock; | ||
640 | } | ||
641 | |||
642 | if (ocfs2_should_order_data(inode)) { | ||
643 | handle = ocfs2_start_walk_page_trans(inode, page, offset, | ||
644 | offset); | ||
645 | if (IS_ERR(handle)) { | ||
646 | ret = PTR_ERR(handle); | ||
647 | handle = NULL; | ||
648 | goto out_unlock; | ||
649 | } | ||
650 | } | ||
651 | |||
652 | /* must not update i_size! */ | ||
653 | ret = block_commit_write(page, offset, offset); | ||
654 | if (ret < 0) | ||
655 | mlog_errno(ret); | ||
656 | else | ||
657 | ret = 0; | ||
658 | |||
659 | if (handle) | ||
660 | ocfs2_commit_trans(handle); | ||
661 | out_unlock: | ||
662 | unlock_page(page); | ||
663 | page_cache_release(page); | ||
664 | out: | ||
665 | return ret; | ||
666 | } | ||
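
ocfs2_write_zero_page() nudges a block-aligned offset by one so that prepare/commit_write never see from == to at the start of a block. A standalone sketch of just that arithmetic; the page and block sizes are assumed example values:

	#include <stdio.h>

	#define PAGE_SZ		4096u	/* assumed PAGE_CACHE_SIZE */
	#define BLOCKSIZE	1024u	/* assumed s_blocksize */

	int main(void)
	{
		unsigned long long size = 8192; /* target lands on a block start */
		unsigned offset = size & (PAGE_SZ - 1);

		/* Never hand prepare/commit_write a block-start offset:
		 * with from == to there it would skip the prepare. */
		if ((offset & (BLOCKSIZE - 1)) == 0)
			offset++;
		printf("zero within page at offset %u\n", offset); /* 1 */
		return 0;
	}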
667 | |||
668 | static int ocfs2_zero_extend(struct inode *inode, | ||
669 | u64 zero_to_size) | ||
670 | { | ||
671 | int ret = 0; | ||
672 | u64 start_off; | ||
673 | struct super_block *sb = inode->i_sb; | ||
674 | |||
675 | start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); | ||
676 | while (start_off < zero_to_size) { | ||
677 | ret = ocfs2_write_zero_page(inode, start_off); | ||
678 | if (ret < 0) { | ||
679 | mlog_errno(ret); | ||
680 | goto out; | ||
681 | } | ||
682 | |||
683 | start_off += sb->s_blocksize; | ||
684 | } | ||
685 | |||
686 | out: | ||
687 | return ret; | ||
688 | } | ||
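
The zeroing walk starts at the old i_size rounded up to a block boundary and advances one filesystem block per iteration, calling the page-zeroing helper each time. A runnable sketch of the alignment and stepping, with an assumed block size:

	#include <stdint.h>
	#include <stdio.h>

	#define BLOCKSIZE 4096ULL	/* assumed s_blocksize */

	/* Round a byte count up to the next block boundary. */
	static uint64_t align_to_blocks(uint64_t bytes)
	{
		return (bytes + BLOCKSIZE - 1) & ~(BLOCKSIZE - 1);
	}

	int main(void)
	{
		uint64_t old_i_size = 10000, zero_to = 20000;

		for (uint64_t off = align_to_blocks(old_i_size);
		     off < zero_to; off += BLOCKSIZE)
			printf("zero the block containing offset %llu\n",
			       (unsigned long long)off);
		return 0;
	}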
689 | |||
690 | static int ocfs2_extend_file(struct inode *inode, | ||
691 | struct buffer_head *di_bh, | ||
692 | u64 new_i_size) | ||
693 | { | ||
694 | int ret = 0; | ||
695 | u32 clusters_to_add; | ||
696 | |||
697 | /* setattr sometimes calls us like this. */ | ||
698 | if (new_i_size == 0) | ||
699 | goto out; | ||
700 | |||
701 | if (i_size_read(inode) == new_i_size) | ||
702 | goto out; | ||
703 | BUG_ON(new_i_size < i_size_read(inode)); | ||
704 | |||
705 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - | ||
706 | OCFS2_I(inode)->ip_clusters; | ||
707 | |||
708 | if (clusters_to_add) { | ||
709 | ret = ocfs2_extend_allocation(inode, clusters_to_add); | ||
710 | if (ret < 0) { | ||
711 | mlog_errno(ret); | ||
712 | goto out; | ||
713 | } | ||
714 | |||
715 | ret = ocfs2_zero_extend(inode, new_i_size); | ||
716 | if (ret < 0) { | ||
717 | mlog_errno(ret); | ||
718 | goto out; | ||
719 | } | ||
720 | } | ||
721 | |||
722 | /* No allocation required; we just use this helper to | ||
723 | * do a trivial update of i_size. */ | ||
724 | ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); | ||
725 | if (ret < 0) { | ||
726 | mlog_errno(ret); | ||
727 | goto out; | ||
728 | } | ||
729 | |||
730 | out: | ||
731 | return ret; | ||
732 | } | ||
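
clusters_to_add is simply the target size rounded up to whole clusters minus what the inode already holds. A small sketch of that arithmetic with an assumed cluster size:

	#include <stdint.h>
	#include <stdio.h>

	#define CLUSTERSIZE (64 * 1024ULL)	/* assumed s_clustersize */

	/* How many whole clusters are needed to hold this many bytes. */
	static uint32_t clusters_for_bytes(uint64_t bytes)
	{
		return (uint32_t)((bytes + CLUSTERSIZE - 1) / CLUSTERSIZE);
	}

	int main(void)
	{
		uint64_t new_i_size = 1000000;	/* requested size in bytes */
		uint32_t ip_clusters = 10;	/* clusters already allocated */

		printf("need %u more clusters\n",
		       clusters_for_bytes(new_i_size) - ip_clusters); /* 6 */
		return 0;
	}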
733 | |||
734 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) | ||
735 | { | ||
736 | int status = 0, size_change; | ||
737 | struct inode *inode = dentry->d_inode; | ||
738 | struct super_block *sb = inode->i_sb; | ||
739 | struct ocfs2_super *osb = OCFS2_SB(sb); | ||
740 | struct buffer_head *bh = NULL; | ||
741 | struct ocfs2_journal_handle *handle = NULL; | ||
742 | |||
743 | mlog_entry("(0x%p, '%.*s')\n", dentry, | ||
744 | dentry->d_name.len, dentry->d_name.name); | ||
745 | |||
746 | if (attr->ia_valid & ATTR_MODE) | ||
747 | mlog(0, "mode change: %d\n", attr->ia_mode); | ||
748 | if (attr->ia_valid & ATTR_UID) | ||
749 | mlog(0, "uid change: %d\n", attr->ia_uid); | ||
750 | if (attr->ia_valid & ATTR_GID) | ||
751 | mlog(0, "gid change: %d\n", attr->ia_gid); | ||
752 | if (attr->ia_valid & ATTR_SIZE) | ||
753 | mlog(0, "size change...\n"); | ||
754 | if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) | ||
755 | mlog(0, "time change...\n"); | ||
756 | |||
757 | #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ | ||
758 | | ATTR_GID | ATTR_UID | ATTR_MODE) | ||
759 | if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { | ||
760 | mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); | ||
761 | return 0; | ||
762 | } | ||
763 | |||
764 | status = inode_change_ok(inode, attr); | ||
765 | if (status) | ||
766 | return status; | ||
767 | |||
768 | size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; | ||
769 | if (size_change) { | ||
770 | status = ocfs2_rw_lock(inode, 1); | ||
771 | if (status < 0) { | ||
772 | mlog_errno(status); | ||
773 | goto bail; | ||
774 | } | ||
775 | } | ||
776 | |||
777 | status = ocfs2_meta_lock(inode, NULL, &bh, 1); | ||
778 | if (status < 0) { | ||
779 | if (status != -ENOENT) | ||
780 | mlog_errno(status); | ||
781 | goto bail_unlock_rw; | ||
782 | } | ||
783 | |||
784 | if (size_change && attr->ia_size != i_size_read(inode)) { | ||
785 | if (i_size_read(inode) > attr->ia_size) | ||
786 | status = ocfs2_truncate_file(inode, bh, attr->ia_size); | ||
787 | else | ||
788 | status = ocfs2_extend_file(inode, bh, attr->ia_size); | ||
789 | if (status < 0) { | ||
790 | if (status != -ENOSPC) | ||
791 | mlog_errno(status); | ||
792 | status = -ENOSPC; | ||
793 | goto bail_unlock; | ||
794 | } | ||
795 | } | ||
796 | |||
797 | handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); | ||
798 | if (IS_ERR(handle)) { | ||
799 | status = PTR_ERR(handle); | ||
800 | mlog_errno(status); | ||
801 | goto bail_unlock; | ||
802 | } | ||
803 | |||
804 | status = inode_setattr(inode, attr); | ||
805 | if (status < 0) { | ||
806 | mlog_errno(status); | ||
807 | goto bail_commit; | ||
808 | } | ||
809 | |||
810 | status = ocfs2_mark_inode_dirty(handle, inode, bh); | ||
811 | if (status < 0) | ||
812 | mlog_errno(status); | ||
813 | |||
814 | bail_commit: | ||
815 | ocfs2_commit_trans(handle); | ||
816 | bail_unlock: | ||
817 | ocfs2_meta_unlock(inode, 1); | ||
818 | bail_unlock_rw: | ||
819 | if (size_change) | ||
820 | ocfs2_rw_unlock(inode, 1); | ||
821 | bail: | ||
822 | if (bh) | ||
823 | brelse(bh); | ||
824 | |||
825 | mlog_exit(status); | ||
826 | return status; | ||
827 | } | ||
828 | |||
829 | int ocfs2_getattr(struct vfsmount *mnt, | ||
830 | struct dentry *dentry, | ||
831 | struct kstat *stat) | ||
832 | { | ||
833 | struct inode *inode = dentry->d_inode; | ||
834 | struct super_block *sb = dentry->d_inode->i_sb; | ||
835 | struct ocfs2_super *osb = sb->s_fs_info; | ||
836 | int err; | ||
837 | |||
838 | mlog_entry_void(); | ||
839 | |||
840 | err = ocfs2_inode_revalidate(dentry); | ||
841 | if (err) { | ||
842 | if (err != -ENOENT) | ||
843 | mlog_errno(err); | ||
844 | goto bail; | ||
845 | } | ||
846 | |||
847 | generic_fillattr(inode, stat); | ||
848 | |||
849 | /* We set the blksize from the cluster size for performance */ | ||
850 | stat->blksize = osb->s_clustersize; | ||
851 | |||
852 | bail: | ||
853 | mlog_exit(err); | ||
854 | |||
855 | return err; | ||
856 | } | ||
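
From userspace, the blksize set here surfaces as st_blksize, so applications sizing their buffers from stat(2) naturally pick up the cluster size. A plain POSIX demonstration, runnable against any file:

	#include <stdio.h>
	#include <sys/stat.h>

	int main(int argc, char **argv)
	{
		struct stat st;

		if (argc < 2) {
			fprintf(stderr, "usage: %s <path>\n", argv[0]);
			return 1;
		}
		if (stat(argv[1], &st) != 0) {
			perror("stat");
			return 1;
		}
		/* On OCFS2 this reports the cluster size, not the device
		 * block size, steering applications toward bigger I/Os. */
		printf("preferred I/O size: %ld bytes\n", (long)st.st_blksize);
		return 0;
	}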
857 | |||
858 | static int ocfs2_write_remove_suid(struct inode *inode) | ||
859 | { | ||
860 | int ret; | ||
861 | struct buffer_head *bh = NULL; | ||
862 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
863 | struct ocfs2_journal_handle *handle; | ||
864 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
865 | struct ocfs2_dinode *di; | ||
866 | |||
867 | mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno, | ||
868 | inode->i_mode); | ||
869 | |||
870 | handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); | ||
871 | if (handle == NULL) { | ||
872 | ret = -ENOMEM; | ||
873 | mlog_errno(ret); | ||
874 | goto out; | ||
875 | } | ||
876 | |||
877 | ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); | ||
878 | if (ret < 0) { | ||
879 | mlog_errno(ret); | ||
880 | goto out_trans; | ||
881 | } | ||
882 | |||
883 | ret = ocfs2_journal_access(handle, inode, bh, | ||
884 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
885 | if (ret < 0) { | ||
886 | mlog_errno(ret); | ||
887 | goto out_bh; | ||
888 | } | ||
889 | |||
890 | inode->i_mode &= ~S_ISUID; | ||
891 | if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) | ||
892 | inode->i_mode &= ~S_ISGID; | ||
893 | |||
894 | di = (struct ocfs2_dinode *) bh->b_data; | ||
895 | di->i_mode = cpu_to_le16(inode->i_mode); | ||
896 | |||
897 | ret = ocfs2_journal_dirty(handle, bh); | ||
898 | if (ret < 0) | ||
899 | mlog_errno(ret); | ||
900 | out_bh: | ||
901 | brelse(bh); | ||
902 | out_trans: | ||
903 | ocfs2_commit_trans(handle); | ||
904 | out: | ||
905 | mlog_exit(ret); | ||
906 | return ret; | ||
907 | } | ||
908 | |||
909 | static inline int ocfs2_write_should_remove_suid(struct inode *inode) | ||
910 | { | ||
911 | mode_t mode = inode->i_mode; | ||
912 | |||
913 | if (!capable(CAP_FSETID)) { | ||
914 | if (unlikely(mode & S_ISUID)) | ||
915 | return 1; | ||
916 | |||
917 | if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) | ||
918 | return 1; | ||
919 | } | ||
920 | return 0; | ||
921 | } | ||
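
This mirrors the generic VFS rule: an unprivileged write always clears S_ISUID, but clears S_ISGID only when group-execute is also set, since setgid without group-exec marks mandatory locking rather than privilege. A standalone version of the mode test using the standard <sys/stat.h> bits:

	#include <stdio.h>
	#include <sys/stat.h>

	/* Same predicate as above, minus the CAP_FSETID check: writes
	 * drop S_ISUID always, S_ISGID only alongside group-execute. */
	static int should_remove_suid(mode_t mode)
	{
		if (mode & S_ISUID)
			return 1;
		if ((mode & S_ISGID) && (mode & S_IXGRP))
			return 1;
		return 0;
	}

	int main(void)
	{
		printf("%d\n", should_remove_suid(S_ISUID | 0644)); /* 1 */
		printf("%d\n", should_remove_suid(S_ISGID | 0644)); /* 0: mandatory locking */
		printf("%d\n", should_remove_suid(S_ISGID | S_IXGRP | 0755)); /* 1 */
		return 0;
	}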
922 | |||
923 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | ||
924 | const char __user *buf, | ||
925 | size_t count, | ||
926 | loff_t pos) | ||
927 | { | ||
928 | struct iovec local_iov = { .iov_base = (void __user *)buf, | ||
929 | .iov_len = count }; | ||
930 | int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0; | ||
931 | u32 clusters; | ||
932 | struct file *filp = iocb->ki_filp; | ||
933 | struct inode *inode = filp->f_dentry->d_inode; | ||
934 | loff_t newsize, saved_pos; | ||
935 | #ifdef OCFS2_ORACORE_WORKAROUNDS | ||
936 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
937 | #endif | ||
938 | |||
939 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, | ||
940 | (unsigned int)count, | ||
941 | filp->f_dentry->d_name.len, | ||
942 | filp->f_dentry->d_name.name); | ||
943 | |||
944 | /* happy write of zero bytes */ | ||
945 | if (count == 0) | ||
946 | return 0; | ||
947 | |||
948 | if (!inode) { | ||
949 | mlog(0, "bad inode\n"); | ||
950 | return -EIO; | ||
951 | } | ||
952 | |||
953 | #ifdef OCFS2_ORACORE_WORKAROUNDS | ||
954 | /* ugh, work around some applications which open everything O_DIRECT + | ||
955 | * O_APPEND and really don't mean to use O_DIRECT. */ | ||
956 | if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && | ||
957 | (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT)) | ||
958 | filp->f_flags &= ~O_DIRECT; | ||
959 | #endif | ||
960 | |||
961 | down(&inode->i_sem); | ||
962 | /* to match setattr's i_sem -> i_alloc_sem -> rw_lock ordering */ | ||
963 | if (filp->f_flags & O_DIRECT) { | ||
964 | have_alloc_sem = 1; | ||
965 | down_read(&inode->i_alloc_sem); | ||
966 | } | ||
967 | |||
968 | /* concurrent O_DIRECT writes are allowed */ | ||
969 | rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; | ||
970 | ret = ocfs2_rw_lock(inode, rw_level); | ||
971 | if (ret < 0) { | ||
972 | rw_level = -1; | ||
973 | mlog_errno(ret); | ||
974 | goto out; | ||
975 | } | ||
976 | |||
977 | /* | ||
978 | * We sample i_size under a read level meta lock to see if our write | ||
979 | * is extending the file, if it is we back off and get a write level | ||
980 | * meta lock. | ||
981 | */ | ||
982 | meta_level = (filp->f_flags & O_APPEND) ? 1 : 0; | ||
983 | for (;;) { | ||
984 | ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level); | ||
985 | if (ret < 0) { | ||
986 | meta_level = -1; | ||
987 | mlog_errno(ret); | ||
988 | goto out; | ||
989 | } | ||
990 | |||
991 | /* Clear suid / sgid if necessary. We do this here | ||
992 | * instead of later in the write path because | ||
993 | * remove_suid() calls ->setattr without any hint that | ||
994 | * we may have already done our cluster locking. Since | ||
995 | * ocfs2_setattr() *must* take cluster locks to | ||
996 | * proceed, this will lead us to recursively lock the | ||
997 | * inode. There's also the dinode i_size state which | ||
998 | * can be lost via setattr during extending writes (we | ||
999 | * set inode->i_size at the end of a write.) */ | ||
1000 | if (ocfs2_write_should_remove_suid(inode)) { | ||
1001 | if (meta_level == 0) { | ||
1002 | ocfs2_meta_unlock(inode, meta_level); | ||
1003 | meta_level = 1; | ||
1004 | continue; | ||
1005 | } | ||
1006 | |||
1007 | ret = ocfs2_write_remove_suid(inode); | ||
1008 | if (ret < 0) { | ||
1009 | mlog_errno(ret); | ||
1010 | goto out; | ||
1011 | } | ||
1012 | } | ||
1013 | |||
1014 | /* work on a copy of ppos until we're sure that we won't have | ||
1015 | * to recalculate it due to relocking. */ | ||
1016 | if (filp->f_flags & O_APPEND) { | ||
1017 | saved_pos = i_size_read(inode); | ||
1018 | mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); | ||
1019 | } else { | ||
1020 | saved_pos = iocb->ki_pos; | ||
1021 | } | ||
1022 | newsize = count + saved_pos; | ||
1023 | |||
1024 | mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n", | ||
1025 | saved_pos, newsize, i_size_read(inode)); | ||
1026 | |||
1027 | /* No need for a higher level metadata lock if we're | ||
1028 | * never going past i_size. */ | ||
1029 | if (newsize <= i_size_read(inode)) | ||
1030 | break; | ||
1031 | |||
1032 | if (meta_level == 0) { | ||
1033 | ocfs2_meta_unlock(inode, meta_level); | ||
1034 | meta_level = 1; | ||
1035 | continue; | ||
1036 | } | ||
1037 | |||
1038 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
1039 | clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - | ||
1040 | OCFS2_I(inode)->ip_clusters; | ||
1041 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1042 | |||
1043 | mlog(0, "Writing at EOF, may need more allocation: " | ||
1044 | "i_size = %lld, newsize = %"MLFu64", need %u clusters\n", | ||
1045 | i_size_read(inode), newsize, clusters); | ||
1046 | |||
1047 | /* We only want to continue the rest of this loop if | ||
1048 | * our extend will actually require more | ||
1049 | * allocation. */ | ||
1050 | if (!clusters) | ||
1051 | break; | ||
1052 | |||
1053 | ret = ocfs2_extend_allocation(inode, clusters); | ||
1054 | if (ret < 0) { | ||
1055 | if (ret != -ENOSPC) | ||
1056 | mlog_errno(ret); | ||
1057 | goto out; | ||
1058 | } | ||
1059 | |||
1060 | /* Fill any holes which would've been created by this | ||
1061 | * write. If we're O_APPEND, this will wind up | ||
1062 | * (correctly) being a noop. */ | ||
1063 | ret = ocfs2_zero_extend(inode, (u64) newsize - count); | ||
1064 | if (ret < 0) { | ||
1065 | mlog_errno(ret); | ||
1066 | goto out; | ||
1067 | } | ||
1068 | break; | ||
1069 | } | ||
1070 | |||
1071 | /* ok, we're done with i_size and alloc work */ | ||
1072 | iocb->ki_pos = saved_pos; | ||
1073 | ocfs2_meta_unlock(inode, meta_level); | ||
1074 | meta_level = -1; | ||
1075 | |||
1076 | /* communicate with ocfs2_dio_end_io */ | ||
1077 | ocfs2_iocb_set_rw_locked(iocb); | ||
1078 | |||
1079 | #ifdef OCFS2_ORACORE_WORKAROUNDS | ||
1080 | if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && | ||
1081 | filp->f_flags & O_DIRECT) { | ||
1082 | unsigned int saved_flags = filp->f_flags; | ||
1083 | int sector_size = 1 << osb->s_sectsize_bits; | ||
1084 | |||
1085 | if ((saved_pos & (sector_size - 1)) || | ||
1086 | (count & (sector_size - 1)) || | ||
1087 | ((unsigned long)buf & (sector_size - 1))) { | ||
1088 | filp->f_flags |= O_SYNC; | ||
1089 | filp->f_flags &= ~O_DIRECT; | ||
1090 | } | ||
1091 | |||
1092 | ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, | ||
1093 | &iocb->ki_pos); | ||
1094 | |||
1095 | filp->f_flags = saved_flags; | ||
1096 | } else | ||
1097 | #endif | ||
1098 | ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, | ||
1099 | &iocb->ki_pos); | ||
1100 | |||
1101 | /* buffered aio wouldn't have proper lock coverage today */ | ||
1102 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); | ||
1103 | |||
1104 | /* | ||
1105 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io | ||
1106 | * function pointer which is called when o_direct io completes so that | ||
1107 | * it can unlock our rw lock. (it's the clustered equivalent of | ||
1108 | * i_alloc_sem; protects truncate from racing with pending ios). | ||
1109 | * Unfortunately there are error cases which call end_io and others | ||
1110 | * that don't. So we don't have to unlock the rw_lock if either an | ||
1111 | * async dio is going to do it in the future or an end_io after an | ||
1112 | * error has already done it. | ||
1113 | */ | ||
1114 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { | ||
1115 | rw_level = -1; | ||
1116 | have_alloc_sem = 0; | ||
1117 | } | ||
1118 | |||
1119 | out: | ||
1120 | if (meta_level != -1) | ||
1121 | ocfs2_meta_unlock(inode, meta_level); | ||
1122 | if (have_alloc_sem) | ||
1123 | up_read(&inode->i_alloc_sem); | ||
1124 | if (rw_level != -1) | ||
1125 | ocfs2_rw_unlock(inode, rw_level); | ||
1126 | up(&inode->i_sem); | ||
1127 | |||
1128 | mlog_exit(ret); | ||
1129 | return ret; | ||
1130 | } | ||
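
The ORACORE compat path above falls back from O_DIRECT to O_SYNC buffered I/O whenever the file position, byte count, or user buffer is not sector aligned. A sketch of that alignment predicate; the sector size used is an assumed example:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Direct I/O is attempted only when position, length, and buffer
	 * are all sector aligned; sector_size must be a power of two. */
	static int dio_aligned(uint64_t pos, size_t count, const void *buf,
			       unsigned sector_size)
	{
		unsigned mask = sector_size - 1;

		return !((pos & mask) || (count & mask) ||
			 ((uintptr_t)buf & mask));
	}

	int main(void)
	{
		void *buf = aligned_alloc(512, 4096);

		printf("%d\n", dio_aligned(512, 512, buf, 512)); /* 1 */
		printf("%d\n", dio_aligned(100, 512, buf, 512)); /* 0: unaligned pos */
		free(buf);
		return 0;
	}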
1131 | |||
1132 | static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, | ||
1133 | char __user *buf, | ||
1134 | size_t count, | ||
1135 | loff_t pos) | ||
1136 | { | ||
1137 | int ret = 0, rw_level = -1, have_alloc_sem = 0; | ||
1138 | struct file *filp = iocb->ki_filp; | ||
1139 | struct inode *inode = filp->f_dentry->d_inode; | ||
1140 | #ifdef OCFS2_ORACORE_WORKAROUNDS | ||
1141 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1142 | #endif | ||
1143 | |||
1144 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, | ||
1145 | (unsigned int)count, | ||
1146 | filp->f_dentry->d_name.len, | ||
1147 | filp->f_dentry->d_name.name); | ||
1148 | |||
1149 | if (!inode) { | ||
1150 | ret = -EINVAL; | ||
1151 | mlog_errno(ret); | ||
1152 | goto bail; | ||
1153 | } | ||
1154 | |||
1155 | #ifdef OCFS2_ORACORE_WORKAROUNDS | ||
1156 | if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) { | ||
1157 | if (filp->f_flags & O_DIRECT) { | ||
1158 | int sector_size = 1 << osb->s_sectsize_bits; | ||
1159 | |||
1160 | if ((pos & (sector_size - 1)) || | ||
1161 | (count & (sector_size - 1)) || | ||
1162 | ((unsigned long)buf & (sector_size - 1)) || | ||
1163 | (i_size_read(inode) & (sector_size -1))) { | ||
1164 | filp->f_flags &= ~O_DIRECT; | ||
1165 | } | ||
1166 | } | ||
1167 | } | ||
1168 | #endif | ||
1169 | |||
1170 | /* | ||
1171 | * buffered reads protect themselves in ->readpage(). O_DIRECT reads | ||
1172 | * need locks to protect pending reads from racing with truncate. | ||
1173 | */ | ||
1174 | if (filp->f_flags & O_DIRECT) { | ||
1175 | down_read(&inode->i_alloc_sem); | ||
1176 | have_alloc_sem = 1; | ||
1177 | |||
1178 | ret = ocfs2_rw_lock(inode, 0); | ||
1179 | if (ret < 0) { | ||
1180 | mlog_errno(ret); | ||
1181 | goto bail; | ||
1182 | } | ||
1183 | rw_level = 0; | ||
1184 | /* communicate with ocfs2_dio_end_io */ | ||
1185 | ocfs2_iocb_set_rw_locked(iocb); | ||
1186 | } | ||
1187 | |||
1188 | ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos); | ||
1189 | if (ret == -EINVAL) | ||
1190 | mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); | ||
1191 | |||
1192 | /* buffered aio wouldn't have proper lock coverage today */ | ||
1193 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); | ||
1194 | |||
1195 | /* see ocfs2_file_aio_write */ | ||
1196 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { | ||
1197 | rw_level = -1; | ||
1198 | have_alloc_sem = 0; | ||
1199 | } | ||
1200 | |||
1201 | bail: | ||
1202 | if (have_alloc_sem) | ||
1203 | up_read(&inode->i_alloc_sem); | ||
1204 | if (rw_level != -1) | ||
1205 | ocfs2_rw_unlock(inode, rw_level); | ||
1206 | mlog_exit(ret); | ||
1207 | |||
1208 | return ret; | ||
1209 | } | ||
1210 | |||
1211 | struct inode_operations ocfs2_file_iops = { | ||
1212 | .setattr = ocfs2_setattr, | ||
1213 | .getattr = ocfs2_getattr, | ||
1214 | }; | ||
1215 | |||
1216 | struct inode_operations ocfs2_special_file_iops = { | ||
1217 | .setattr = ocfs2_setattr, | ||
1218 | .getattr = ocfs2_getattr, | ||
1219 | }; | ||
1220 | |||
1221 | struct file_operations ocfs2_fops = { | ||
1222 | .read = do_sync_read, | ||
1223 | .write = do_sync_write, | ||
1224 | .sendfile = generic_file_sendfile, | ||
1225 | .mmap = ocfs2_mmap, | ||
1226 | .fsync = ocfs2_sync_file, | ||
1227 | .release = ocfs2_file_release, | ||
1228 | .open = ocfs2_file_open, | ||
1229 | .aio_read = ocfs2_file_aio_read, | ||
1230 | .aio_write = ocfs2_file_aio_write, | ||
1231 | }; | ||
1232 | |||
1233 | struct file_operations ocfs2_dops = { | ||
1234 | .read = generic_read_dir, | ||
1235 | .readdir = ocfs2_readdir, | ||
1236 | .fsync = ocfs2_sync_file, | ||
1237 | }; | ||
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h new file mode 100644 index 000000000000..a5ea33b24060 --- /dev/null +++ b/fs/ocfs2/file.h | |||
@@ -0,0 +1,57 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * file.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_FILE_H | ||
27 | #define OCFS2_FILE_H | ||
28 | |||
29 | extern struct file_operations ocfs2_fops; | ||
30 | extern struct file_operations ocfs2_dops; | ||
31 | extern struct inode_operations ocfs2_file_iops; | ||
32 | extern struct inode_operations ocfs2_special_file_iops; | ||
33 | struct ocfs2_alloc_context; | ||
34 | |||
35 | enum ocfs2_alloc_restarted { | ||
36 | RESTART_NONE = 0, | ||
37 | RESTART_TRANS, | ||
38 | RESTART_META | ||
39 | }; | ||
40 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | ||
41 | struct inode *inode, | ||
42 | u32 clusters_to_add, | ||
43 | struct buffer_head *fe_bh, | ||
44 | struct ocfs2_journal_handle *handle, | ||
45 | struct ocfs2_alloc_context *data_ac, | ||
46 | struct ocfs2_alloc_context *meta_ac, | ||
47 | enum ocfs2_alloc_restarted *reason); | ||
48 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); | ||
49 | int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, | ||
50 | struct kstat *stat); | ||
51 | |||
52 | int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, | ||
53 | struct inode *inode, | ||
54 | struct buffer_head *fe_bh, | ||
55 | u64 new_i_size); | ||
56 | |||
57 | #endif /* OCFS2_FILE_H */ | ||
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c new file mode 100644 index 000000000000..0bbd22f46c80 --- /dev/null +++ b/fs/ocfs2/heartbeat.c | |||
@@ -0,0 +1,378 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * heartbeat.c | ||
5 | * | ||
6 | * Register ourselves with the heartbeat service, keep our node maps | ||
7 | * up to date, and fire off recovery when needed. | ||
8 | * | ||
9 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public | ||
13 | * License as published by the Free Software Foundation; either | ||
14 | * version 2 of the License, or (at your option) any later version. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
19 | * General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public | ||
22 | * License along with this program; if not, write to the | ||
23 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
24 | * Boston, MA 021110-1307, USA. | ||
25 | */ | ||
26 | |||
27 | #include <linux/fs.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/highmem.h> | ||
31 | #include <linux/kmod.h> | ||
32 | |||
33 | #include <cluster/heartbeat.h> | ||
34 | #include <cluster/nodemanager.h> | ||
35 | |||
36 | #include <dlm/dlmapi.h> | ||
37 | |||
38 | #define MLOG_MASK_PREFIX ML_SUPER | ||
39 | #include <cluster/masklog.h> | ||
40 | |||
41 | #include "ocfs2.h" | ||
42 | |||
43 | #include "alloc.h" | ||
44 | #include "heartbeat.h" | ||
45 | #include "inode.h" | ||
46 | #include "journal.h" | ||
47 | #include "vote.h" | ||
48 | |||
49 | #include "buffer_head_io.h" | ||
50 | |||
51 | #define OCFS2_HB_NODE_DOWN_PRI (0x0000002) | ||
52 | #define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI | ||
53 | |||
54 | static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, | ||
55 | int bit); | ||
56 | static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, | ||
57 | int bit); | ||
58 | static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map); | ||
59 | static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, | ||
60 | struct ocfs2_node_map *from); | ||
61 | static void __ocfs2_node_map_set(struct ocfs2_node_map *target, | ||
62 | struct ocfs2_node_map *from); | ||
63 | |||
64 | void ocfs2_init_node_maps(struct ocfs2_super *osb) | ||
65 | { | ||
66 | spin_lock_init(&osb->node_map_lock); | ||
67 | ocfs2_node_map_init(&osb->mounted_map); | ||
68 | ocfs2_node_map_init(&osb->recovery_map); | ||
69 | ocfs2_node_map_init(&osb->umount_map); | ||
70 | } | ||
71 | |||
72 | static void ocfs2_do_node_down(int node_num, | ||
73 | struct ocfs2_super *osb) | ||
74 | { | ||
75 | BUG_ON(osb->node_num == node_num); | ||
76 | |||
77 | mlog(0, "ocfs2: node down event for %d\n", node_num); | ||
78 | |||
79 | if (!osb->dlm) { | ||
80 | /* | ||
81 | * No DLM means we're not even ready to participate yet. | ||
82 | * We check the slots after the DLM comes up, so we will | ||
83 | * notice the node death then. We can safely ignore it | ||
84 | * here. | ||
85 | */ | ||
86 | return; | ||
87 | } | ||
88 | |||
89 | if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) { | ||
90 | /* If a node is in the umount map, then we've been | ||
91 | * expecting it to go down and we know ahead of time | ||
92 | * that recovery is not necessary. */ | ||
93 | ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); | ||
94 | return; | ||
95 | } | ||
96 | |||
97 | ocfs2_recovery_thread(osb, node_num); | ||
98 | |||
99 | ocfs2_remove_node_from_vote_queues(osb, node_num); | ||
100 | } | ||
101 | |||
102 | static void ocfs2_hb_node_down_cb(struct o2nm_node *node, | ||
103 | int node_num, | ||
104 | void *data) | ||
105 | { | ||
106 | ocfs2_do_node_down(node_num, (struct ocfs2_super *) data); | ||
107 | } | ||
108 | |||
109 | /* Called from the dlm when it's about to evict a node. We may also | ||
110 | * get a heartbeat callback later. */ | ||
111 | static void ocfs2_dlm_eviction_cb(int node_num, | ||
112 | void *data) | ||
113 | { | ||
114 | struct ocfs2_super *osb = (struct ocfs2_super *) data; | ||
115 | struct super_block *sb = osb->sb; | ||
116 | |||
117 | mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n", | ||
118 | MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num); | ||
119 | |||
120 | ocfs2_do_node_down(node_num, osb); | ||
121 | } | ||
122 | |||
123 | static void ocfs2_hb_node_up_cb(struct o2nm_node *node, | ||
124 | int node_num, | ||
125 | void *data) | ||
126 | { | ||
127 | struct ocfs2_super *osb = data; | ||
128 | |||
129 | BUG_ON(osb->node_num == node_num); | ||
130 | |||
131 | mlog(0, "node up event for %d\n", node_num); | ||
132 | ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); | ||
133 | } | ||
134 | |||
135 | void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb) | ||
136 | { | ||
137 | o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB, | ||
138 | ocfs2_hb_node_down_cb, osb, | ||
139 | OCFS2_HB_NODE_DOWN_PRI); | ||
140 | |||
141 | o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB, | ||
142 | ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI); | ||
143 | |||
144 | /* Not exactly a heartbeat callback, but leads to essentially | ||
145 | * the same path so we set it up here. */ | ||
146 | dlm_setup_eviction_cb(&osb->osb_eviction_cb, | ||
147 | ocfs2_dlm_eviction_cb, | ||
148 | osb); | ||
149 | } | ||
150 | |||
151 | /* Most functions here are just stubs for now... */ | ||
152 | int ocfs2_register_hb_callbacks(struct ocfs2_super *osb) | ||
153 | { | ||
154 | int status; | ||
155 | |||
156 | status = o2hb_register_callback(&osb->osb_hb_down); | ||
157 | if (status < 0) { | ||
158 | mlog_errno(status); | ||
159 | goto bail; | ||
160 | } | ||
161 | |||
162 | status = o2hb_register_callback(&osb->osb_hb_up); | ||
163 | if (status < 0) | ||
164 | mlog_errno(status); | ||
165 | |||
166 | bail: | ||
167 | return status; | ||
168 | } | ||
169 | |||
170 | void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb) | ||
171 | { | ||
172 | int status; | ||
173 | |||
174 | status = o2hb_unregister_callback(&osb->osb_hb_down); | ||
175 | if (status < 0) | ||
176 | mlog_errno(status); | ||
177 | |||
178 | status = o2hb_unregister_callback(&osb->osb_hb_up); | ||
179 | if (status < 0) | ||
180 | mlog_errno(status); | ||
181 | } | ||
182 | |||
183 | void ocfs2_stop_heartbeat(struct ocfs2_super *osb) | ||
184 | { | ||
185 | int ret; | ||
186 | char *argv[5], *envp[3]; | ||
187 | |||
188 | if (!osb->uuid_str) { | ||
189 | /* This can happen if we don't get far enough in mount... */ | ||
190 | mlog(0, "No UUID with which to stop heartbeat!\n\n"); | ||
191 | return; | ||
192 | } | ||
193 | |||
194 | argv[0] = (char *)o2nm_get_hb_ctl_path(); | ||
195 | argv[1] = "-K"; | ||
196 | argv[2] = "-u"; | ||
197 | argv[3] = osb->uuid_str; | ||
198 | argv[4] = NULL; | ||
199 | |||
200 | mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]); | ||
201 | |||
202 | /* minimal command environment taken from cpu_run_sbin_hotplug */ | ||
203 | envp[0] = "HOME=/"; | ||
204 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | ||
205 | envp[2] = NULL; | ||
206 | |||
207 | ret = call_usermodehelper(argv[0], argv, envp, 1); | ||
208 | if (ret < 0) | ||
209 | mlog_errno(ret); | ||
210 | } | ||
211 | |||
212 | /* special case -1 for now | ||
213 | * TODO: should *really* make sure the calling func never passes -1!! */ | ||
214 | void ocfs2_node_map_init(struct ocfs2_node_map *map) | ||
215 | { | ||
216 | map->num_nodes = OCFS2_NODE_MAP_MAX_NODES; | ||
217 | memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) * | ||
218 | sizeof(unsigned long)); | ||
219 | } | ||
220 | |||
221 | static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, | ||
222 | int bit) | ||
223 | { | ||
224 | set_bit(bit, map->map); | ||
225 | } | ||
226 | |||
227 | void ocfs2_node_map_set_bit(struct ocfs2_super *osb, | ||
228 | struct ocfs2_node_map *map, | ||
229 | int bit) | ||
230 | { | ||
231 | if (bit == -1) | ||
232 | return; | ||
233 | BUG_ON(bit >= map->num_nodes); | ||
234 | spin_lock(&osb->node_map_lock); | ||
235 | __ocfs2_node_map_set_bit(map, bit); | ||
236 | spin_unlock(&osb->node_map_lock); | ||
237 | } | ||
238 | |||
239 | static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, | ||
240 | int bit) | ||
241 | { | ||
242 | clear_bit(bit, map->map); | ||
243 | } | ||
244 | |||
245 | void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, | ||
246 | struct ocfs2_node_map *map, | ||
247 | int bit) | ||
248 | { | ||
249 | if (bit == -1) | ||
250 | return; | ||
251 | BUG_ON(bit >= map->num_nodes); | ||
252 | spin_lock(&osb->node_map_lock); | ||
253 | __ocfs2_node_map_clear_bit(map, bit); | ||
254 | spin_unlock(&osb->node_map_lock); | ||
255 | } | ||
256 | |||
257 | int ocfs2_node_map_test_bit(struct ocfs2_super *osb, | ||
258 | struct ocfs2_node_map *map, | ||
259 | int bit) | ||
260 | { | ||
261 | int ret; | ||
262 | if (bit >= map->num_nodes) { | ||
263 | mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes); | ||
264 | BUG(); | ||
265 | } | ||
266 | spin_lock(&osb->node_map_lock); | ||
267 | ret = test_bit(bit, map->map); | ||
268 | spin_unlock(&osb->node_map_lock); | ||
269 | return ret; | ||
270 | } | ||
271 | |||
272 | static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map) | ||
273 | { | ||
274 | int bit; | ||
275 | bit = find_next_bit(map->map, map->num_nodes, 0); | ||
276 | if (bit < map->num_nodes) | ||
277 | return 0; | ||
278 | return 1; | ||
279 | } | ||
280 | |||
281 | int ocfs2_node_map_is_empty(struct ocfs2_super *osb, | ||
282 | struct ocfs2_node_map *map) | ||
283 | { | ||
284 | int ret; | ||
285 | BUG_ON(map->num_nodes == 0); | ||
286 | spin_lock(&osb->node_map_lock); | ||
287 | ret = __ocfs2_node_map_is_empty(map); | ||
288 | spin_unlock(&osb->node_map_lock); | ||
289 | return ret; | ||
290 | } | ||
291 | |||
292 | static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, | ||
293 | struct ocfs2_node_map *from) | ||
294 | { | ||
295 | BUG_ON(from->num_nodes == 0); | ||
296 | ocfs2_node_map_init(target); | ||
297 | __ocfs2_node_map_set(target, from); | ||
298 | } | ||
299 | |||
300 | /* returns 1 if bit is the only bit set in target, 0 otherwise */ | ||
301 | int ocfs2_node_map_is_only(struct ocfs2_super *osb, | ||
302 | struct ocfs2_node_map *target, | ||
303 | int bit) | ||
304 | { | ||
305 | struct ocfs2_node_map temp; | ||
306 | int ret; | ||
307 | |||
308 | spin_lock(&osb->node_map_lock); | ||
309 | __ocfs2_node_map_dup(&temp, target); | ||
310 | __ocfs2_node_map_clear_bit(&temp, bit); | ||
311 | ret = __ocfs2_node_map_is_empty(&temp); | ||
312 | spin_unlock(&osb->node_map_lock); | ||
313 | |||
314 | return ret; | ||
315 | } | ||
316 | |||
317 | static void __ocfs2_node_map_set(struct ocfs2_node_map *target, | ||
318 | struct ocfs2_node_map *from) | ||
319 | { | ||
320 | int num_longs, i; | ||
321 | |||
322 | BUG_ON(target->num_nodes != from->num_nodes); | ||
323 | BUG_ON(target->num_nodes == 0); | ||
324 | |||
325 | num_longs = BITS_TO_LONGS(target->num_nodes); | ||
326 | for (i = 0; i < num_longs; i++) | ||
327 | target->map[i] = from->map[i]; | ||
328 | } | ||
329 | |||
330 | /* Returns whether the recovery bit was actually set - it may not be | ||
331 | * if a node is still marked as needing recovery */ | ||
332 | int ocfs2_recovery_map_set(struct ocfs2_super *osb, | ||
333 | int num) | ||
334 | { | ||
335 | int set = 0; | ||
336 | |||
337 | spin_lock(&osb->node_map_lock); | ||
338 | |||
339 | __ocfs2_node_map_clear_bit(&osb->mounted_map, num); | ||
340 | |||
341 | if (!test_bit(num, osb->recovery_map.map)) { | ||
342 | __ocfs2_node_map_set_bit(&osb->recovery_map, num); | ||
343 | set = 1; | ||
344 | } | ||
345 | |||
346 | spin_unlock(&osb->node_map_lock); | ||
347 | |||
348 | return set; | ||
349 | } | ||
350 | |||
351 | void ocfs2_recovery_map_clear(struct ocfs2_super *osb, | ||
352 | int num) | ||
353 | { | ||
354 | ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num); | ||
355 | } | ||
356 | |||
357 | int ocfs2_node_map_iterate(struct ocfs2_super *osb, | ||
358 | struct ocfs2_node_map *map, | ||
359 | int idx) | ||
360 | { | ||
361 | int i = idx; | ||
362 | |||
363 | idx = O2NM_INVALID_NODE_NUM; | ||
364 | spin_lock(&osb->node_map_lock); | ||
365 | if ((i != O2NM_INVALID_NODE_NUM) && | ||
366 | (i >= 0) && | ||
367 | (i < map->num_nodes)) { | ||
368 | while (i < map->num_nodes) { | ||
369 | if (test_bit(i, map->map)) { | ||
370 | idx = i; | ||
371 | break; | ||
372 | } | ||
373 | i++; | ||
374 | } | ||
375 | } | ||
376 | spin_unlock(&osb->node_map_lock); | ||
377 | return idx; | ||
378 | } | ||
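
The node maps above are plain bitmaps guarded by node_map_lock. A single-threaded userspace sketch of the same set/clear/test/is_empty operations (sizes and names are assumptions, not o2nm constants):

	#include <stdio.h>
	#include <string.h>

	#define MAX_NODES	255	/* assumed maximum node count */
	#define LONG_BITS	(8 * (int)sizeof(unsigned long))
	#define MAP_LONGS	((MAX_NODES + LONG_BITS - 1) / LONG_BITS)

	struct node_map {
		unsigned long map[MAP_LONGS];
	};

	static void map_set(struct node_map *m, int bit)
	{
		m->map[bit / LONG_BITS] |= 1UL << (bit % LONG_BITS);
	}

	static void map_clear(struct node_map *m, int bit)
	{
		m->map[bit / LONG_BITS] &= ~(1UL << (bit % LONG_BITS));
	}

	static int map_test(const struct node_map *m, int bit)
	{
		return !!(m->map[bit / LONG_BITS] & (1UL << (bit % LONG_BITS)));
	}

	static int map_is_empty(const struct node_map *m)
	{
		int i;

		for (i = 0; i < MAP_LONGS; i++)
			if (m->map[i])
				return 0;
		return 1;
	}

	int main(void)
	{
		struct node_map recovery;

		memset(&recovery, 0, sizeof(recovery));
		map_set(&recovery, 3);
		printf("node 3 needs recovery: %d\n", map_test(&recovery, 3));
		map_clear(&recovery, 3);
		printf("map empty again: %d\n", map_is_empty(&recovery));
		return 0;
	}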
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h new file mode 100644 index 000000000000..e8fb079122e4 --- /dev/null +++ b/fs/ocfs2/heartbeat.h | |||
@@ -0,0 +1,67 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * heartbeat.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_HEARTBEAT_H | ||
27 | #define OCFS2_HEARTBEAT_H | ||
28 | |||
29 | void ocfs2_init_node_maps(struct ocfs2_super *osb); | ||
30 | |||
31 | void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); | ||
32 | int ocfs2_register_hb_callbacks(struct ocfs2_super *osb); | ||
33 | void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb); | ||
34 | void ocfs2_stop_heartbeat(struct ocfs2_super *osb); | ||
35 | |||
36 | /* node map functions - used to keep track of mounted and in-recovery | ||
37 | * nodes. */ | ||
38 | void ocfs2_node_map_init(struct ocfs2_node_map *map); | ||
39 | int ocfs2_node_map_is_empty(struct ocfs2_super *osb, | ||
40 | struct ocfs2_node_map *map); | ||
41 | void ocfs2_node_map_set_bit(struct ocfs2_super *osb, | ||
42 | struct ocfs2_node_map *map, | ||
43 | int bit); | ||
44 | void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, | ||
45 | struct ocfs2_node_map *map, | ||
46 | int bit); | ||
47 | int ocfs2_node_map_test_bit(struct ocfs2_super *osb, | ||
48 | struct ocfs2_node_map *map, | ||
49 | int bit); | ||
50 | int ocfs2_node_map_iterate(struct ocfs2_super *osb, | ||
51 | struct ocfs2_node_map *map, | ||
52 | int idx); | ||
53 | static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb, | ||
54 | struct ocfs2_node_map *map) | ||
55 | { | ||
56 | return ocfs2_node_map_iterate(osb, map, 0); | ||
57 | } | ||
58 | int ocfs2_recovery_map_set(struct ocfs2_super *osb, | ||
59 | int num); | ||
60 | void ocfs2_recovery_map_clear(struct ocfs2_super *osb, | ||
61 | int num); | ||
62 | /* returns 1 if bit is the only bit set in target, 0 otherwise */ | ||
63 | int ocfs2_node_map_is_only(struct ocfs2_super *osb, | ||
64 | struct ocfs2_node_map *target, | ||
65 | int bit); | ||
66 | |||
67 | #endif /* OCFS2_HEARTBEAT_H */ | ||
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c new file mode 100644 index 000000000000..a91ba4dec936 --- /dev/null +++ b/fs/ocfs2/inode.c | |||
@@ -0,0 +1,1140 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * inode.c | ||
5 | * | ||
6 | * vfs' aops, fops, dops and iops | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/pagemap.h> | ||
31 | #include <linux/smp_lock.h> | ||
32 | |||
33 | #include <asm/byteorder.h> | ||
34 | |||
35 | #define MLOG_MASK_PREFIX ML_INODE | ||
36 | #include <cluster/masklog.h> | ||
37 | |||
38 | #include "ocfs2.h" | ||
39 | |||
40 | #include "alloc.h" | ||
41 | #include "dlmglue.h" | ||
42 | #include "extent_map.h" | ||
43 | #include "file.h" | ||
44 | #include "inode.h" | ||
45 | #include "journal.h" | ||
46 | #include "namei.h" | ||
47 | #include "suballoc.h" | ||
48 | #include "super.h" | ||
49 | #include "symlink.h" | ||
50 | #include "sysfile.h" | ||
51 | #include "uptodate.h" | ||
52 | #include "vote.h" | ||
53 | |||
54 | #include "buffer_head_io.h" | ||
55 | |||
56 | #define OCFS2_FI_FLAG_NOWAIT 0x1 | ||
57 | #define OCFS2_FI_FLAG_DELETE 0x2 | ||
58 | struct ocfs2_find_inode_args | ||
59 | { | ||
60 | u64 fi_blkno; | ||
61 | unsigned long fi_ino; | ||
62 | unsigned int fi_flags; | ||
63 | }; | ||
64 | |||
65 | static int ocfs2_read_locked_inode(struct inode *inode, | ||
66 | struct ocfs2_find_inode_args *args); | ||
67 | static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); | ||
68 | static int ocfs2_find_actor(struct inode *inode, void *opaque); | ||
69 | static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, | ||
70 | struct inode *inode, | ||
71 | struct buffer_head *fe_bh); | ||
72 | |||
73 | struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, | ||
74 | u64 blkno, | ||
75 | int delete_vote) | ||
76 | { | ||
77 | struct ocfs2_find_inode_args args; | ||
78 | |||
79 | /* ocfs2_ilookup_for_vote should *only* be called from the | ||
80 | * vote thread */ | ||
81 | BUG_ON(current != osb->vote_task); | ||
82 | |||
83 | args.fi_blkno = blkno; | ||
84 | args.fi_flags = OCFS2_FI_FLAG_NOWAIT; | ||
85 | if (delete_vote) | ||
86 | args.fi_flags |= OCFS2_FI_FLAG_DELETE; | ||
87 | args.fi_ino = ino_from_blkno(osb->sb, blkno); | ||
88 | return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); | ||
89 | } | ||
90 | |||
91 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno) | ||
92 | { | ||
93 | struct inode *inode = NULL; | ||
94 | struct super_block *sb = osb->sb; | ||
95 | struct ocfs2_find_inode_args args; | ||
96 | |||
97 | mlog_entry("(blkno = %"MLFu64")\n", blkno); | ||
98 | |||
99 | /* Ok. By now we've either got the offsets passed to us by the | ||
100 | * caller, or we just pulled them off the bh. Let's do some | ||
101 | * sanity checks to make sure they're OK. */ | ||
102 | if (blkno == 0) { | ||
103 | inode = ERR_PTR(-EINVAL); | ||
104 | mlog_errno(PTR_ERR(inode)); | ||
105 | goto bail; | ||
106 | } | ||
107 | |||
108 | args.fi_blkno = blkno; | ||
109 | args.fi_flags = 0; | ||
110 | args.fi_ino = ino_from_blkno(sb, blkno); | ||
111 | |||
112 | inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, | ||
113 | ocfs2_init_locked_inode, &args); | ||
114 | /* inode was *not* in the inode cache. 2.6.x requires | ||
115 | * us to do our own read_inode call and unlock it | ||
116 | * afterwards. */ | ||
117 | if (inode && inode->i_state & I_NEW) { | ||
118 | mlog(0, "Inode was not in inode cache, reading it.\n"); | ||
119 | ocfs2_read_locked_inode(inode, &args); | ||
120 | unlock_new_inode(inode); | ||
121 | } | ||
122 | if (inode == NULL) { | ||
123 | inode = ERR_PTR(-ENOMEM); | ||
124 | mlog_errno(PTR_ERR(inode)); | ||
125 | goto bail; | ||
126 | } | ||
127 | if (is_bad_inode(inode)) { | ||
128 | iput(inode); | ||
129 | inode = ERR_PTR(-ESTALE); | ||
130 | mlog_errno(PTR_ERR(inode)); | ||
131 | goto bail; | ||
132 | } | ||
133 | |||
134 | bail: | ||
135 | if (!IS_ERR(inode)) { | ||
136 | mlog(0, "returning inode with number %"MLFu64"\n", | ||
137 | OCFS2_I(inode)->ip_blkno); | ||
138 | mlog_exit_ptr(inode); | ||
139 | } else | ||
140 | mlog_errno(PTR_ERR(inode)); | ||
141 | |||
142 | return inode; | ||
143 | } | ||
144 | |||
145 | |||
146 | /* | ||
147 | * here's how inodes get read from disk: | ||
148 | * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR | ||
149 | * found? : return the in-memory inode | ||
150 | * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE | ||
151 | */ | ||
152 | |||
153 | static int ocfs2_find_actor(struct inode *inode, void *opaque) | ||
154 | { | ||
155 | struct ocfs2_find_inode_args *args = NULL; | ||
156 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
157 | int ret = 0; | ||
158 | |||
159 | mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque); | ||
160 | |||
161 | args = opaque; | ||
162 | |||
163 | mlog_bug_on_msg(!inode, "No inode in find actor!\n"); | ||
164 | |||
165 | if (oi->ip_blkno != args->fi_blkno) | ||
166 | goto bail; | ||
167 | |||
168 | /* OCFS2_FI_FLAG_NOWAIT is *only* set from | ||
169 | * ocfs2_ilookup_for_vote which won't create an inode for one | ||
170 | * that isn't found. The vote thread doesn't want to get | ||
171 | * an inode which is in the process of going away - otherwise | ||
172 | * the call to __wait_on_freeing_inode in find_inode_fast will | ||
173 | * cause it to deadlock on an inode which may be waiting on a | ||
174 | * vote (or lock release) in delete_inode */ | ||
175 | if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) && | ||
176 | (inode->i_state & (I_FREEING|I_CLEAR))) { | ||
177 | /* As stated above, we're not going to return an | ||
178 | * inode. In the case of a delete vote, the voting | ||
179 | * code is going to signal the other node to go | ||
180 | * ahead. Mark that state here, so this freeing inode | ||
181 | * has the state when it gets to delete_inode. */ | ||
182 | if (args->fi_flags & OCFS2_FI_FLAG_DELETE) { | ||
183 | spin_lock(&oi->ip_lock); | ||
184 | ocfs2_mark_inode_remotely_deleted(inode); | ||
185 | spin_unlock(&oi->ip_lock); | ||
186 | } | ||
187 | goto bail; | ||
188 | } | ||
189 | |||
190 | ret = 1; | ||
191 | bail: | ||
192 | mlog_exit(ret); | ||
193 | return ret; | ||
194 | } | ||
195 | |||
196 | /* | ||
197 | * initialize the new inode, but don't do anything that would cause | ||
198 | * us to sleep. | ||
199 | * return 0 on success, 1 on failure | ||
200 | */ | ||
201 | static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) | ||
202 | { | ||
203 | struct ocfs2_find_inode_args *args = opaque; | ||
204 | |||
205 | mlog_entry("inode = %p, opaque = %p\n", inode, opaque); | ||
206 | |||
207 | inode->i_ino = args->fi_ino; | ||
208 | OCFS2_I(inode)->ip_blkno = args->fi_blkno; | ||
209 | |||
210 | mlog_exit(0); | ||
211 | return 0; | ||
212 | } | ||
213 | |||
214 | int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | ||
215 | int create_ino) | ||
216 | { | ||
217 | struct super_block *sb; | ||
218 | struct ocfs2_super *osb; | ||
219 | int status = -EINVAL; | ||
220 | |||
221 | mlog_entry("(0x%p, size:%"MLFu64")\n", inode, fe->i_size); | ||
222 | |||
223 | sb = inode->i_sb; | ||
224 | osb = OCFS2_SB(sb); | ||
225 | |||
226 | /* this means that read_inode cannot create a superblock inode | ||
227 | * today. change if needed. */ | ||
228 | if (!OCFS2_IS_VALID_DINODE(fe) || | ||
229 | !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { | ||
230 | mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%"MLFu64", " | ||
231 | "signature = %.*s, flags = 0x%x\n", | ||
232 | inode->i_ino, le64_to_cpu(fe->i_blkno), 7, | ||
233 | fe->i_signature, le32_to_cpu(fe->i_flags)); | ||
234 | goto bail; | ||
235 | } | ||
236 | |||
237 | if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) { | ||
238 | mlog(ML_ERROR, "file entry generation does not match " | ||
239 | "superblock! osb->fs_generation=%x, " | ||
240 | "fe->i_fs_generation=%x\n", | ||
241 | osb->fs_generation, le32_to_cpu(fe->i_fs_generation)); | ||
242 | goto bail; | ||
243 | } | ||
244 | |||
245 | inode->i_version = 1; | ||
246 | inode->i_generation = le32_to_cpu(fe->i_generation); | ||
247 | inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); | ||
248 | inode->i_mode = le16_to_cpu(fe->i_mode); | ||
249 | inode->i_uid = le32_to_cpu(fe->i_uid); | ||
250 | inode->i_gid = le32_to_cpu(fe->i_gid); | ||
251 | inode->i_blksize = (u32)osb->s_clustersize; | ||
252 | |||
253 | /* Fast symlinks will have i_size but no allocated clusters. */ | ||
254 | if (S_ISLNK(inode->i_mode) && !fe->i_clusters) | ||
255 | inode->i_blocks = 0; | ||
256 | else | ||
257 | inode->i_blocks = | ||
258 | ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size)); | ||
259 | inode->i_mapping->a_ops = &ocfs2_aops; | ||
260 | inode->i_flags |= S_NOATIME; | ||
261 | inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); | ||
262 | inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); | ||
263 | inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); | ||
264 | inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); | ||
265 | inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); | ||
266 | inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); | ||
267 | |||
268 | if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) | ||
269 | mlog(ML_ERROR, | ||
270 | "ip_blkno %"MLFu64" != i_blkno %"MLFu64"!\n", | ||
271 | OCFS2_I(inode)->ip_blkno, fe->i_blkno); | ||
272 | |||
273 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
274 | OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; | ||
275 | |||
276 | if (create_ino) | ||
277 | inode->i_ino = ino_from_blkno(inode->i_sb, | ||
278 | le64_to_cpu(fe->i_blkno)); | ||
279 | |||
280 | mlog(0, "blkno = %"MLFu64", ino = %lu, create_ino = %s\n", | ||
281 | fe->i_blkno, inode->i_ino, create_ino ? "true" : "false"); | ||
282 | |||
283 | inode->i_nlink = le16_to_cpu(fe->i_links_count); | ||
284 | |||
285 | if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { | ||
286 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; | ||
287 | mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); | ||
288 | } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { | ||
289 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; | ||
290 | } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { | ||
291 | mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino); | ||
292 | /* we can't actually hit this as read_inode can't | ||
293 | * handle superblocks today ;-) */ | ||
294 | BUG(); | ||
295 | } | ||
296 | |||
297 | switch (inode->i_mode & S_IFMT) { | ||
298 | case S_IFREG: | ||
299 | inode->i_fop = &ocfs2_fops; | ||
300 | inode->i_op = &ocfs2_file_iops; | ||
301 | i_size_write(inode, le64_to_cpu(fe->i_size)); | ||
302 | break; | ||
303 | case S_IFDIR: | ||
304 | inode->i_op = &ocfs2_dir_iops; | ||
305 | inode->i_fop = &ocfs2_dops; | ||
306 | i_size_write(inode, le64_to_cpu(fe->i_size)); | ||
307 | break; | ||
308 | case S_IFLNK: | ||
309 | if (ocfs2_inode_is_fast_symlink(inode)) | ||
310 | inode->i_op = &ocfs2_fast_symlink_inode_operations; | ||
311 | else | ||
312 | inode->i_op = &ocfs2_symlink_inode_operations; | ||
313 | i_size_write(inode, le64_to_cpu(fe->i_size)); | ||
314 | break; | ||
315 | default: | ||
316 | inode->i_op = &ocfs2_special_file_iops; | ||
317 | init_special_inode(inode, inode->i_mode, | ||
318 | inode->i_rdev); | ||
319 | break; | ||
320 | } | ||
321 | |||
322 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, | ||
323 | OCFS2_LOCK_TYPE_RW, inode); | ||
324 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, | ||
325 | OCFS2_LOCK_TYPE_META, inode); | ||
326 | ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres, | ||
327 | OCFS2_LOCK_TYPE_DATA, inode); | ||
328 | |||
329 | status = 0; | ||
330 | bail: | ||
331 | mlog_exit(status); | ||
332 | return status; | ||
333 | } | ||
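
The switch on i_mode & S_IFMT above is what hands each inode its operation tables. A standalone illustration of the same dispatch; the returned strings stand in for the kernel's iops/fops structures:

	#include <stdio.h>
	#include <sys/stat.h>

	/* Pick an "operations table" by file type; the strings stand in
	 * for the kernel's inode/file operations structures. */
	static const char *ops_for_mode(mode_t mode)
	{
		switch (mode & S_IFMT) {
		case S_IFREG: return "regular-file ops";
		case S_IFDIR: return "directory ops";
		case S_IFLNK: return "symlink ops";
		default:      return "special-file ops";
		}
	}

	int main(void)
	{
		printf("%s\n", ops_for_mode(S_IFREG | 0644));
		printf("%s\n", ops_for_mode(S_IFCHR | 0600));
		return 0;
	}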
334 | |||
335 | static int ocfs2_read_locked_inode(struct inode *inode, | ||
336 | struct ocfs2_find_inode_args *args) | ||
337 | { | ||
338 | struct super_block *sb; | ||
339 | struct ocfs2_super *osb; | ||
340 | struct ocfs2_dinode *fe; | ||
341 | struct buffer_head *bh = NULL; | ||
342 | int status; | ||
343 | int sysfile = 0; | ||
344 | |||
345 | mlog_entry("(0x%p, 0x%p)\n", inode, args); | ||
346 | |||
347 | status = -EINVAL; | ||
348 | if (inode == NULL || inode->i_sb == NULL) { | ||
349 | mlog(ML_ERROR, "bad inode\n"); | ||
350 | goto bail; | ||
351 | } | ||
352 | sb = inode->i_sb; | ||
353 | osb = OCFS2_SB(sb); | ||
354 | |||
355 | if (!args) { | ||
356 | mlog(ML_ERROR, "bad inode args\n"); | ||
357 | make_bad_inode(inode); | ||
358 | goto bail; | ||
359 | } | ||
360 | |||
361 | /* Read the FE off disk. This is safe because the kernel only | ||
362 | * does one read_inode2 for a new inode, and if it doesn't | ||
363 | * exist yet then nobody can be working on it! */ | ||
364 | status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL); | ||
365 | if (status < 0) { | ||
366 | mlog_errno(status); | ||
367 | make_bad_inode(inode); | ||
368 | goto bail; | ||
369 | } | ||
370 | |||
371 | fe = (struct ocfs2_dinode *) bh->b_data; | ||
372 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
373 | mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n", | ||
374 | fe->i_blkno, 7, fe->i_signature); | ||
375 | make_bad_inode(inode); | ||
376 | goto bail; | ||
377 | } | ||
378 | |||
379 | if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) | ||
380 | sysfile = 1; | ||
381 | |||
382 | if (S_ISCHR(le16_to_cpu(fe->i_mode)) || | ||
383 | S_ISBLK(le16_to_cpu(fe->i_mode))) | ||
384 | inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); | ||
385 | |||
386 | status = -EINVAL; | ||
387 | if (ocfs2_populate_inode(inode, fe, 0) < 0) { | ||
388 | mlog(ML_ERROR, "populate inode failed! i_blkno=%"MLFu64", " | ||
389 | "i_ino=%lu\n", fe->i_blkno, inode->i_ino); | ||
390 | make_bad_inode(inode); | ||
391 | goto bail; | ||
392 | } | ||
393 | |||
394 | BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); | ||
395 | |||
396 | if (sysfile) | ||
397 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; | ||
398 | |||
399 | status = 0; | ||
400 | |||
401 | bail: | ||
402 | if (args && bh) | ||
403 | brelse(bh); | ||
404 | |||
405 | mlog_exit(status); | ||
406 | return status; | ||
407 | } | ||
408 | |||
409 | void ocfs2_sync_blockdev(struct super_block *sb) | ||
410 | { | ||
411 | sync_blockdev(sb->s_bdev); | ||
412 | } | ||
413 | |||
414 | static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, | ||
415 | struct inode *inode, | ||
416 | struct buffer_head *fe_bh) | ||
417 | { | ||
418 | int status = 0; | ||
419 | struct ocfs2_journal_handle *handle = NULL; | ||
420 | struct ocfs2_truncate_context *tc = NULL; | ||
421 | struct ocfs2_dinode *fe; | ||
422 | |||
423 | mlog_entry_void(); | ||
424 | |||
425 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
426 | |||
427 | /* zero allocation, zero truncate :) */ | ||
428 | if (!fe->i_clusters) | ||
429 | goto bail; | ||
430 | |||
431 | handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS); | ||
432 | if (IS_ERR(handle)) { | ||
433 | status = PTR_ERR(handle); | ||
434 | handle = NULL; | ||
435 | mlog_errno(status); | ||
436 | goto bail; | ||
437 | } | ||
438 | |||
439 | status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL); | ||
440 | if (status < 0) { | ||
441 | mlog_errno(status); | ||
442 | goto bail; | ||
443 | } | ||
444 | |||
445 | ocfs2_commit_trans(handle); | ||
446 | handle = NULL; | ||
447 | |||
448 | status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); | ||
449 | if (status < 0) { | ||
450 | mlog_errno(status); | ||
451 | goto bail; | ||
452 | } | ||
453 | |||
454 | status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); | ||
455 | if (status < 0) { | ||
456 | mlog_errno(status); | ||
457 | goto bail; | ||
458 | } | ||
459 | bail: | ||
460 | if (handle) | ||
461 | ocfs2_commit_trans(handle); | ||
462 | |||
463 | mlog_exit(status); | ||
464 | return status; | ||
465 | } | ||
466 | |||
467 | static int ocfs2_remove_inode(struct inode *inode, | ||
468 | struct buffer_head *di_bh, | ||
469 | struct inode *orphan_dir_inode, | ||
470 | struct buffer_head *orphan_dir_bh) | ||
471 | { | ||
472 | int status; | ||
473 | struct inode *inode_alloc_inode = NULL; | ||
474 | struct buffer_head *inode_alloc_bh = NULL; | ||
475 | struct ocfs2_journal_handle *handle; | ||
476 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
477 | struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; | ||
478 | |||
479 | inode_alloc_inode = | ||
480 | ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, | ||
481 | le16_to_cpu(di->i_suballoc_slot)); | ||
482 | if (!inode_alloc_inode) { | ||
483 | status = -EEXIST; | ||
484 | mlog_errno(status); | ||
485 | goto bail; | ||
486 | } | ||
487 | |||
488 | down(&inode_alloc_inode->i_sem); | ||
489 | status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1); | ||
490 | if (status < 0) { | ||
491 | up(&inode_alloc_inode->i_sem); | ||
492 | |||
493 | mlog_errno(status); | ||
494 | goto bail; | ||
495 | } | ||
496 | |||
497 | handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS); | ||
498 | if (IS_ERR(handle)) { | ||
499 | status = PTR_ERR(handle); | ||
500 | mlog_errno(status); | ||
501 | goto bail_unlock; | ||
502 | } | ||
503 | |||
504 | status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, | ||
505 | orphan_dir_bh); | ||
506 | if (status < 0) { | ||
507 | mlog_errno(status); | ||
508 | goto bail_commit; | ||
509 | } | ||
510 | |||
511 | /* set the inode's dtime */ | ||
512 | status = ocfs2_journal_access(handle, inode, di_bh, | ||
513 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
514 | if (status < 0) { | ||
515 | mlog_errno(status); | ||
516 | goto bail_commit; | ||
517 | } | ||
518 | |||
519 | di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); | ||
520 | le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); | ||
521 | |||
522 | status = ocfs2_journal_dirty(handle, di_bh); | ||
523 | if (status < 0) { | ||
524 | mlog_errno(status); | ||
525 | goto bail_commit; | ||
526 | } | ||
527 | |||
528 | ocfs2_remove_from_cache(inode, di_bh); | ||
529 | |||
530 | status = ocfs2_free_dinode(handle, inode_alloc_inode, | ||
531 | inode_alloc_bh, di); | ||
532 | if (status < 0) | ||
533 | mlog_errno(status); | ||
534 | |||
535 | bail_commit: | ||
536 | ocfs2_commit_trans(handle); | ||
537 | bail_unlock: | ||
538 | ocfs2_meta_unlock(inode_alloc_inode, 1); | ||
539 | up(&inode_alloc_inode->i_sem); | ||
540 | brelse(inode_alloc_bh); | ||
541 | bail: | ||
542 | iput(inode_alloc_inode); | ||
543 | |||
544 | return status; | ||
545 | } | ||
546 | |||
547 | static int ocfs2_wipe_inode(struct inode *inode, | ||
548 | struct buffer_head *di_bh) | ||
549 | { | ||
550 | int status, orphaned_slot; | ||
551 | struct inode *orphan_dir_inode = NULL; | ||
552 | struct buffer_head *orphan_dir_bh = NULL; | ||
553 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
554 | |||
555 | /* We've already voted on this so it should be readonly - no | ||
556 | * spinlock needed. */ | ||
557 | orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; | ||
558 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, | ||
559 | ORPHAN_DIR_SYSTEM_INODE, | ||
560 | orphaned_slot); | ||
561 | if (!orphan_dir_inode) { | ||
562 | status = -EEXIST; | ||
563 | mlog_errno(status); | ||
564 | goto bail; | ||
565 | } | ||
566 | |||
567 | /* Lock the orphan dir. The lock will be held for the entire | ||
568 | * delete_inode operation. We do this now to avoid races with | ||
569 | * recovery completion on other nodes. */ | ||
570 | down(&orphan_dir_inode->i_sem); | ||
571 | status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1); | ||
572 | if (status < 0) { | ||
573 | up(&orphan_dir_inode->i_sem); | ||
574 | |||
575 | mlog_errno(status); | ||
576 | goto bail; | ||
577 | } | ||
578 | |||
579 | /* we do this while holding the orphan dir lock because we | ||
580 | * don't want recovery being run from another node to vote for | ||
581 | * an inode delete on us -- this will result in two nodes | ||
582 | * truncating the same file! */ | ||
583 | status = ocfs2_truncate_for_delete(osb, inode, di_bh); | ||
584 | if (status < 0) { | ||
585 | mlog_errno(status); | ||
586 | goto bail_unlock_dir; | ||
587 | } | ||
588 | |||
589 | status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, | ||
590 | orphan_dir_bh); | ||
591 | if (status < 0) | ||
592 | mlog_errno(status); | ||
593 | |||
594 | bail_unlock_dir: | ||
595 | ocfs2_meta_unlock(orphan_dir_inode, 1); | ||
596 | up(&orphan_dir_inode->i_sem); | ||
597 | brelse(orphan_dir_bh); | ||
598 | bail: | ||
599 | iput(orphan_dir_inode); | ||
600 | |||
601 | return status; | ||
602 | } | ||
603 | |||
604 | /* There is a series of simple checks that should be done before a | ||
605 | * vote is even considered. Encapsulate those in this function. */ | ||
606 | static int ocfs2_inode_is_valid_to_delete(struct inode *inode) | ||
607 | { | ||
608 | int ret = 0; | ||
609 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
610 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
611 | |||
612 | /* We shouldn't be getting here for the root directory | ||
613 | * inode.. */ | ||
614 | if (inode == osb->root_inode) { | ||
615 | mlog(ML_ERROR, "Skipping delete of root inode.\n"); | ||
616 | goto bail; | ||
617 | } | ||
618 | |||
619 | /* If we're coming from process_vote we can't go into our own | ||
620 | * voting [hello, deadlock city!], so unfortunately we just | ||
621 | * have to skip deleting this guy. That's OK though because | ||
622 | * the node who's doing the actual deleting should handle it | ||
623 | * anyway. */ | ||
624 | if (current == osb->vote_task) { | ||
625 | mlog(0, "Skipping delete of %lu because we're currently " | ||
626 | "in process_vote\n", inode->i_ino); | ||
627 | goto bail; | ||
628 | } | ||
629 | |||
630 | spin_lock(&oi->ip_lock); | ||
631 | /* OCFS2 *never* deletes system files. We should technically | ||
632 | * never get here, as system file inodes should always have a | ||
633 | * positive link count. */ | ||
634 | if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) { | ||
635 | mlog(ML_ERROR, "Skipping delete of system file %"MLFu64".\n", | ||
636 | oi->ip_blkno); | ||
637 | goto bail_unlock; | ||
638 | } | ||
639 | |||
640 | /* If we have voted "yes" on the wipe of this inode for | ||
641 | * another node, it will be marked here so we can safely skip | ||
642 | * it. Recovery will clean up any inodes we might inadvertently | ||
643 | * skip here. */ | ||
644 | if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) { | ||
645 | mlog(0, "Skipping delete of %lu because another node " | ||
646 | "has done this for us.\n", inode->i_ino); | ||
647 | goto bail_unlock; | ||
648 | } | ||
649 | |||
650 | ret = 1; | ||
651 | bail_unlock: | ||
652 | spin_unlock(&oi->ip_lock); | ||
653 | bail: | ||
654 | return ret; | ||
655 | } | ||
656 | |||
657 | /* Query the cluster to determine whether we should wipe an inode from | ||
658 | * disk or not. | ||
659 | * | ||
660 | * Requires the inode to have the cluster lock. */ | ||
661 | static int ocfs2_query_inode_wipe(struct inode *inode, | ||
662 | struct buffer_head *di_bh, | ||
663 | int *wipe) | ||
664 | { | ||
665 | int status = 0; | ||
666 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
667 | struct ocfs2_dinode *di; | ||
668 | |||
669 | *wipe = 0; | ||
670 | |||
671 | /* While we were waiting for the cluster lock in | ||
672 | * ocfs2_delete_inode, another node might have asked to delete | ||
673 | * the inode. Recheck our flags to catch this. */ | ||
674 | if (!ocfs2_inode_is_valid_to_delete(inode)) { | ||
675 | mlog(0, "Skipping delete of %"MLFu64" because flags changed\n", | ||
676 | oi->ip_blkno); | ||
677 | goto bail; | ||
678 | } | ||
679 | |||
680 | /* Now that we have an up to date inode, we can double check | ||
681 | * the link count. */ | ||
682 | if (inode->i_nlink) { | ||
683 | mlog(0, "Skipping delete of %"MLFu64" because nlink = %u\n", | ||
684 | oi->ip_blkno, inode->i_nlink); | ||
685 | goto bail; | ||
686 | } | ||
687 | |||
688 | /* Do some basic inode verification... */ | ||
689 | di = (struct ocfs2_dinode *) di_bh->b_data; | ||
690 | if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { | ||
691 | /* for lack of a better error? */ | ||
692 | status = -EEXIST; | ||
693 | mlog(ML_ERROR, | ||
694 | "Inode %"MLFu64" (on-disk %"MLFu64") not orphaned! " | ||
695 | "Disk flags 0x%x, inode flags 0x%x\n", | ||
696 | oi->ip_blkno, di->i_blkno, di->i_flags, oi->ip_flags); | ||
697 | goto bail; | ||
698 | } | ||
699 | |||
700 | /* has someone already deleted us?! baaad... */ | ||
701 | if (di->i_dtime) { | ||
702 | status = -EEXIST; | ||
703 | mlog_errno(status); | ||
704 | goto bail; | ||
705 | } | ||
706 | |||
707 | status = ocfs2_request_delete_vote(inode); | ||
708 | /* -EBUSY means that other nodes are still using the | ||
709 | * inode. We're done here though, so avoid doing anything on | ||
710 | * disk and let them worry about deleting it. */ | ||
711 | if (status == -EBUSY) { | ||
712 | status = 0; | ||
713 | mlog(0, "Skipping delete of %"MLFu64" because it is in use on" | ||
714 | "other nodes\n", oi->ip_blkno); | ||
715 | goto bail; | ||
716 | } | ||
717 | if (status < 0) { | ||
718 | mlog_errno(status); | ||
719 | goto bail; | ||
720 | } | ||
721 | |||
722 | spin_lock(&oi->ip_lock); | ||
723 | if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) { | ||
724 | /* Nobody knew which slot this inode was orphaned | ||
725 | * into. This may happen during node death, and | ||
726 | * recovery knows how to clean it up, so we can safely | ||
727 | * ignore this inode from now on. */ | ||
728 | mlog(0, "Nobody knew where inode %"MLFu64" was orphaned!\n", | ||
729 | oi->ip_blkno); | ||
730 | } else { | ||
731 | *wipe = 1; | ||
732 | |||
733 | mlog(0, "Inode %"MLFu64" is ok to wipe from orphan dir %d\n", | ||
734 | oi->ip_blkno, oi->ip_orphaned_slot); | ||
735 | } | ||
736 | spin_unlock(&oi->ip_lock); | ||
737 | |||
738 | bail: | ||
739 | return status; | ||
740 | } | ||
741 | |||
742 | /* Support function for ocfs2_delete_inode. Will help us keep the | ||
743 | * inode data in a consistent state for clear_inode. Always truncates | ||
744 | * pages, optionally syncs them first. */ | ||
745 | static void ocfs2_cleanup_delete_inode(struct inode *inode, | ||
746 | int sync_data) | ||
747 | { | ||
748 | mlog(0, "Cleanup inode %"MLFu64", sync = %d\n", | ||
749 | OCFS2_I(inode)->ip_blkno, sync_data); | ||
750 | if (sync_data) | ||
751 | write_inode_now(inode, 1); | ||
752 | truncate_inode_pages(&inode->i_data, 0); | ||
753 | } | ||
754 | |||
755 | void ocfs2_delete_inode(struct inode *inode) | ||
756 | { | ||
757 | int wipe, status; | ||
758 | sigset_t blocked, oldset; | ||
759 | struct buffer_head *di_bh = NULL; | ||
760 | |||
761 | mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); | ||
762 | |||
763 | if (is_bad_inode(inode)) { | ||
764 | mlog(0, "Skipping delete of bad inode\n"); | ||
765 | goto bail; | ||
766 | } | ||
767 | |||
768 | if (!ocfs2_inode_is_valid_to_delete(inode)) { | ||
769 | /* It's probably not necessary to truncate_inode_pages | ||
770 | * here but we do it for safety anyway (it will most | ||
771 | * likely be a no-op) */ | ||
772 | ocfs2_cleanup_delete_inode(inode, 0); | ||
773 | goto bail; | ||
774 | } | ||
775 | |||
776 | /* We want to block signals in delete_inode as the lock and | ||
777 | * messaging paths may return us -ERESTARTSYS. Which would | ||
778 | * cause us to exit early, resulting in inodes being orphaned | ||
779 | * forever. */ | ||
780 | sigfillset(&blocked); | ||
781 | status = sigprocmask(SIG_BLOCK, &blocked, &oldset); | ||
782 | if (status < 0) { | ||
783 | mlog_errno(status); | ||
784 | ocfs2_cleanup_delete_inode(inode, 1); | ||
785 | goto bail; | ||
786 | } | ||
787 | |||
788 | /* Lock down the inode. This gives us an up to date view of | ||
789 | * its metadata (for verification), and allows us to | ||
790 | * serialize delete_inode votes. | ||
791 | * | ||
792 | * Even though we might be doing a truncate, we don't take the | ||
793 | * allocation lock here as it won't be needed - nobody will | ||
794 | * have the file open. | ||
795 | */ | ||
796 | status = ocfs2_meta_lock(inode, NULL, &di_bh, 1); | ||
797 | if (status < 0) { | ||
798 | if (status != -ENOENT) | ||
799 | mlog_errno(status); | ||
800 | ocfs2_cleanup_delete_inode(inode, 0); | ||
801 | goto bail_unblock; | ||
802 | } | ||
803 | |||
804 | /* Query the cluster. This will be the final decision made | ||
805 | * before we go ahead and wipe the inode. */ | ||
806 | status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); | ||
807 | if (!wipe || status < 0) { | ||
808 | /* Error and inode busy vote both mean we won't be | ||
809 | * removing the inode, so they take almost the same | ||
810 | * path. */ | ||
811 | if (status < 0) | ||
812 | mlog_errno(status); | ||
813 | |||
814 | /* Someone in the cluster has voted to not wipe this | ||
815 | * inode, or it was never completely orphaned. Write | ||
816 | * out the pages and exit now. */ | ||
817 | ocfs2_cleanup_delete_inode(inode, 1); | ||
818 | goto bail_unlock_inode; | ||
819 | } | ||
820 | |||
821 | ocfs2_cleanup_delete_inode(inode, 0); | ||
822 | |||
823 | status = ocfs2_wipe_inode(inode, di_bh); | ||
824 | if (status < 0) { | ||
825 | mlog_errno(status); | ||
826 | goto bail_unlock_inode; | ||
827 | } | ||
828 | |||
829 | /* Mark the inode as successfully deleted. This is important | ||
830 | * for ocfs2_clear_inode as it will check this flag and skip | ||
831 | * any checkpointing work */ | ||
832 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; | ||
833 | |||
834 | bail_unlock_inode: | ||
835 | ocfs2_meta_unlock(inode, 1); | ||
836 | brelse(di_bh); | ||
837 | bail_unblock: | ||
838 | status = sigprocmask(SIG_SETMASK, &oldset, NULL); | ||
839 | if (status < 0) | ||
840 | mlog_errno(status); | ||
841 | bail: | ||
842 | clear_inode(inode); | ||
843 | mlog_exit_void(); | ||
844 | } | ||
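
The sigfillset()/sigprocmask() pair above is a general pattern: block every signal around a critical section so an interrupted lock or messaging call cannot abort it halfway, then restore the old mask on the way out. A self-contained userspace sketch of the same pattern (critical_section() is a stand-in for the real work):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

/* Stand-in for work that must not be cut short by a signal. */
static void critical_section(void)
{
	sleep(1);
}

int main(void)
{
	sigset_t blocked, oldset;

	/* Block every signal, remembering the previous mask. */
	sigfillset(&blocked);
	if (sigprocmask(SIG_BLOCK, &blocked, &oldset) < 0) {
		perror("sigprocmask");
		return 1;
	}

	critical_section();

	/* Restore the caller's signal mask. */
	if (sigprocmask(SIG_SETMASK, &oldset, NULL) < 0) {
		perror("sigprocmask");
		return 1;
	}
	puts("done");
	return 0;
}
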
845 | |||
846 | void ocfs2_clear_inode(struct inode *inode) | ||
847 | { | ||
848 | int status; | ||
849 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
850 | |||
851 | mlog_entry_void(); | ||
852 | |||
853 | if (!inode) | ||
854 | goto bail; | ||
855 | |||
856 | mlog(0, "Clearing inode: %"MLFu64", nlink = %u\n", | ||
857 | OCFS2_I(inode)->ip_blkno, inode->i_nlink); | ||
858 | |||
859 | mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, | ||
860 | "Inode=%lu\n", inode->i_ino); | ||
861 | |||
862 | /* Do these before all the other work so that we don't bounce | ||
863 | * the vote thread while waiting to destroy the locks. */ | ||
864 | ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); | ||
865 | ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); | ||
866 | ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); | ||
867 | |||
868 | /* We very well may get a clear_inode before all of an inode's | ||
869 | * metadata has hit disk. Of course, we can't drop any cluster | ||
870 | * locks until the journal has finished with it. The only | ||
871 | * exception here are successfully wiped inodes - their | ||
872 | * metadata can now be considered to be part of the system | ||
873 | * inode from which it came. */ | ||
874 | if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) | ||
875 | ocfs2_checkpoint_inode(inode); | ||
876 | |||
877 | mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), | ||
878 | "Clear inode of %"MLFu64", inode has io markers\n", | ||
879 | oi->ip_blkno); | ||
880 | |||
881 | ocfs2_extent_map_drop(inode, 0); | ||
882 | ocfs2_extent_map_init(inode); | ||
883 | |||
884 | status = ocfs2_drop_inode_locks(inode); | ||
885 | if (status < 0) | ||
886 | mlog_errno(status); | ||
887 | |||
888 | ocfs2_lock_res_free(&oi->ip_rw_lockres); | ||
889 | ocfs2_lock_res_free(&oi->ip_meta_lockres); | ||
890 | ocfs2_lock_res_free(&oi->ip_data_lockres); | ||
891 | |||
892 | ocfs2_metadata_cache_purge(inode); | ||
893 | |||
894 | mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached, | ||
895 | "Clear inode of %"MLFu64", inode has %u cache items\n", | ||
896 | oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached); | ||
897 | |||
898 | mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), | ||
899 | "Clear inode of %"MLFu64", inode has a bad flag\n", | ||
900 | oi->ip_blkno); | ||
901 | |||
902 | mlog_bug_on_msg(spin_is_locked(&oi->ip_lock), | ||
903 | "Clear inode of %"MLFu64", inode is locked\n", | ||
904 | oi->ip_blkno); | ||
905 | |||
906 | mlog_bug_on_msg(down_trylock(&oi->ip_io_sem), | ||
907 | "Clear inode of %"MLFu64", io_sem is locked\n", | ||
908 | oi->ip_blkno); | ||
909 | up(&oi->ip_io_sem); | ||
910 | |||
911 | /* | ||
912 | * Mind the inverted conventions: down_trylock() returns 0 on | ||
913 | * success, while down_write_trylock() returns 1 on success. | ||
914 | */ | ||
915 | mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem), | ||
916 | "Clear inode of %"MLFu64", alloc_sem is locked\n", | ||
917 | oi->ip_blkno); | ||
918 | up_write(&oi->ip_alloc_sem); | ||
919 | |||
920 | mlog_bug_on_msg(oi->ip_open_count, | ||
921 | "Clear inode of %"MLFu64" has open count %d\n", | ||
922 | oi->ip_blkno, oi->ip_open_count); | ||
923 | mlog_bug_on_msg(!list_empty(&oi->ip_handle_list), | ||
924 | "Clear inode of %"MLFu64" has non empty handle list\n", | ||
925 | oi->ip_blkno); | ||
926 | mlog_bug_on_msg(oi->ip_handle, | ||
927 | "Clear inode of %"MLFu64" has non empty handle pointer\n", | ||
928 | oi->ip_blkno); | ||
929 | |||
930 | /* Clear all other flags. */ | ||
931 | oi->ip_flags = OCFS2_INODE_CACHE_INLINE; | ||
932 | oi->ip_created_trans = 0; | ||
933 | oi->ip_last_trans = 0; | ||
934 | oi->ip_dir_start_lookup = 0; | ||
935 | oi->ip_blkno = 0ULL; | ||
936 | |||
937 | bail: | ||
938 | mlog_exit_void(); | ||
939 | } | ||
940 | |||
941 | /* Called under inode_lock, with no more references on the | ||
942 | * struct inode, so it's safe here to check the flags field | ||
943 | * and to manipulate i_nlink without any other locks. */ | ||
944 | void ocfs2_drop_inode(struct inode *inode) | ||
945 | { | ||
946 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
947 | |||
948 | mlog_entry_void(); | ||
949 | |||
950 | mlog(0, "Drop inode %"MLFu64", nlink = %u, ip_flags = 0x%x\n", | ||
951 | oi->ip_blkno, inode->i_nlink, oi->ip_flags); | ||
952 | |||
953 | /* Testing ip_orphaned_slot here wouldn't work because we may | ||
954 | * not have gotten a delete_inode vote from any other nodes | ||
955 | * yet. */ | ||
956 | if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) { | ||
957 | mlog(0, "Inode was orphaned on another node, clearing nlink.\n"); | ||
958 | inode->i_nlink = 0; | ||
959 | } | ||
960 | |||
961 | generic_drop_inode(inode); | ||
962 | |||
963 | mlog_exit_void(); | ||
964 | } | ||
965 | |||
966 | /* | ||
967 | * TODO: this should probably be merged into ocfs2_get_block | ||
968 | * | ||
969 | * However, you now need to pay attention to the cont_prepare_write() | ||
970 | * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much | ||
971 | * expects never to extend). | ||
972 | */ | ||
973 | struct buffer_head *ocfs2_bread(struct inode *inode, | ||
974 | int block, int *err, int reada) | ||
975 | { | ||
976 | struct buffer_head *bh = NULL; | ||
977 | int tmperr; | ||
978 | u64 p_blkno; | ||
979 | int readflags = OCFS2_BH_CACHED; | ||
980 | |||
981 | #if 0 | ||
982 | /* only turn this on if we know we can deal with read_block | ||
983 | * returning nothing */ | ||
984 | if (reada) | ||
985 | readflags |= OCFS2_BH_READAHEAD; | ||
986 | #endif | ||
987 | |||
988 | if (((u64)block << inode->i_sb->s_blocksize_bits) >= | ||
989 | i_size_read(inode)) { | ||
990 | BUG_ON(!reada); | ||
991 | return NULL; | ||
992 | } | ||
993 | |||
994 | tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, | ||
995 | &p_blkno, NULL); | ||
996 | if (tmperr < 0) { | ||
997 | mlog_errno(tmperr); | ||
998 | goto fail; | ||
999 | } | ||
1000 | |||
1001 | tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh, | ||
1002 | readflags, inode); | ||
1003 | if (tmperr < 0) | ||
1004 | goto fail; | ||
1005 | |||
1006 | tmperr = 0; | ||
1007 | |||
1008 | *err = 0; | ||
1009 | return bh; | ||
1010 | |||
1011 | fail: | ||
1012 | if (bh) { | ||
1013 | brelse(bh); | ||
1014 | bh = NULL; | ||
1015 | } | ||
1016 | *err = -EIO; | ||
1017 | return NULL; | ||
1018 | } | ||
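
The bounds check in ocfs2_bread() converts a logical block number to a byte offset by shifting by s_blocksize_bits; shifting is the usual substitute for multiplying by a power-of-two block size. A small sketch of that arithmetic, assuming a 4KB block size:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int blocksize_bits = 12;   /* 4096-byte blocks */
	uint64_t i_size = 10000;            /* file size in bytes */
	uint64_t block = 3;                 /* logical block number */

	/* Byte offset of the block: block * blocksize, done as a shift. */
	uint64_t offset = block << blocksize_bits;

	/* Same check as ocfs2_bread(): reading past i_size is refused. */
	if (offset >= i_size)
		printf("block %llu is past EOF\n", (unsigned long long)block);
	else
		printf("block %llu starts at byte %llu\n",
		       (unsigned long long)block, (unsigned long long)offset);
	return 0;
}
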
1019 | |||
1020 | /* | ||
1021 | * This is called from our getattr. | ||
1022 | */ | ||
1023 | int ocfs2_inode_revalidate(struct dentry *dentry) | ||
1024 | { | ||
1025 | struct inode *inode = dentry->d_inode; | ||
1026 | int status = 0; | ||
1027 | |||
1028 | mlog_entry("(inode = 0x%p, ino = %"MLFu64")\n", inode, | ||
1029 | inode ? OCFS2_I(inode)->ip_blkno : 0ULL); | ||
1030 | |||
1031 | if (!inode) { | ||
1032 | mlog(0, "eep, no inode!\n"); | ||
1033 | status = -ENOENT; | ||
1034 | goto bail; | ||
1035 | } | ||
1036 | |||
1037 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
1038 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { | ||
1039 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1040 | mlog(0, "inode deleted!\n"); | ||
1041 | status = -ENOENT; | ||
1042 | goto bail; | ||
1043 | } | ||
1044 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1045 | |||
1046 | /* Let ocfs2_meta_lock do the work of updating our struct | ||
1047 | * inode for us. */ | ||
1048 | status = ocfs2_meta_lock(inode, NULL, NULL, 0); | ||
1049 | if (status < 0) { | ||
1050 | if (status != -ENOENT) | ||
1051 | mlog_errno(status); | ||
1052 | goto bail; | ||
1053 | } | ||
1054 | ocfs2_meta_unlock(inode, 0); | ||
1055 | bail: | ||
1056 | mlog_exit(status); | ||
1057 | |||
1058 | return status; | ||
1059 | } | ||
1060 | |||
1061 | /* | ||
1062 | * Updates a disk inode from a | ||
1063 | * struct inode. | ||
1064 | * Only takes ip_lock. | ||
1065 | */ | ||
1066 | int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle, | ||
1067 | struct inode *inode, | ||
1068 | struct buffer_head *bh) | ||
1069 | { | ||
1070 | int status; | ||
1071 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; | ||
1072 | |||
1073 | mlog_entry("(inode %"MLFu64")\n", OCFS2_I(inode)->ip_blkno); | ||
1074 | |||
1075 | status = ocfs2_journal_access(handle, inode, bh, | ||
1076 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1077 | if (status < 0) { | ||
1078 | mlog_errno(status); | ||
1079 | goto leave; | ||
1080 | } | ||
1081 | |||
1082 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
1083 | fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); | ||
1084 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1085 | |||
1086 | fe->i_size = cpu_to_le64(i_size_read(inode)); | ||
1087 | fe->i_links_count = cpu_to_le16(inode->i_nlink); | ||
1088 | fe->i_uid = cpu_to_le32(inode->i_uid); | ||
1089 | fe->i_gid = cpu_to_le32(inode->i_gid); | ||
1090 | fe->i_mode = cpu_to_le16(inode->i_mode); | ||
1091 | fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); | ||
1092 | fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); | ||
1093 | fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); | ||
1094 | fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); | ||
1095 | fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); | ||
1096 | fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | ||
1097 | |||
1098 | status = ocfs2_journal_dirty(handle, bh); | ||
1099 | if (status < 0) | ||
1100 | mlog_errno(status); | ||
1101 | |||
1102 | status = 0; | ||
1103 | leave: | ||
1104 | |||
1105 | mlog_exit(status); | ||
1106 | return status; | ||
1107 | } | ||
1108 | |||
1109 | /* | ||
1110 | * | ||
1111 | * Updates a struct inode from a disk inode. | ||
1112 | * Does no I/O; only takes ip_lock. | ||
1113 | */ | ||
1114 | void ocfs2_refresh_inode(struct inode *inode, | ||
1115 | struct ocfs2_dinode *fe) | ||
1116 | { | ||
1117 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1118 | |||
1119 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
1120 | |||
1121 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
1122 | i_size_write(inode, le64_to_cpu(fe->i_size)); | ||
1123 | inode->i_nlink = le16_to_cpu(fe->i_links_count); | ||
1124 | inode->i_uid = le32_to_cpu(fe->i_uid); | ||
1125 | inode->i_gid = le32_to_cpu(fe->i_gid); | ||
1126 | inode->i_mode = le16_to_cpu(fe->i_mode); | ||
1127 | inode->i_blksize = (u32) osb->s_clustersize; | ||
1128 | if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) | ||
1129 | inode->i_blocks = 0; | ||
1130 | else | ||
1131 | inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode)); | ||
1132 | inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); | ||
1133 | inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); | ||
1134 | inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); | ||
1135 | inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); | ||
1136 | inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); | ||
1137 | inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); | ||
1138 | |||
1139 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
1140 | } | ||
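
ocfs2_refresh_inode() and ocfs2_mark_inode_dirty() convert every on-disk field with le{16,32,64}_to_cpu()/cpu_to_le*() because the dinode is stored little-endian regardless of host byte order. A userspace sketch of the idea, with a hand-rolled le32 decoder (the kernel helpers compile to no-ops on little-endian hosts):

#include <stdint.h>
#include <stdio.h>

/* Decode a little-endian 32-bit value byte by byte; correct on
 * both little- and big-endian hosts. */
static uint32_t le32_to_cpu_portable(const uint8_t *p)
{
	return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
	       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

int main(void)
{
	/* 0x00000bc5 (3013) as it would appear in an on-disk field. */
	uint8_t disk_field[4] = { 0xc5, 0x0b, 0x00, 0x00 };

	printf("i_clusters = %u\n", le32_to_cpu_portable(disk_field));
	return 0;
}
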
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h new file mode 100644 index 000000000000..9b0177433653 --- /dev/null +++ b/fs/ocfs2/inode.h | |||
@@ -0,0 +1,145 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * inode.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_INODE_H | ||
27 | #define OCFS2_INODE_H | ||
28 | |||
29 | /* OCFS2 Inode Private Data */ | ||
30 | struct ocfs2_inode_info | ||
31 | { | ||
32 | u64 ip_blkno; | ||
33 | |||
34 | struct ocfs2_lock_res ip_rw_lockres; | ||
35 | struct ocfs2_lock_res ip_meta_lockres; | ||
36 | struct ocfs2_lock_res ip_data_lockres; | ||
37 | |||
38 | /* protects allocation changes on this inode. */ | ||
39 | struct rw_semaphore ip_alloc_sem; | ||
40 | |||
41 | /* These fields are protected by ip_lock */ | ||
42 | spinlock_t ip_lock; | ||
43 | u32 ip_open_count; | ||
44 | u32 ip_clusters; | ||
45 | struct ocfs2_extent_map ip_map; | ||
46 | struct list_head ip_io_markers; | ||
47 | int ip_orphaned_slot; | ||
48 | |||
49 | struct semaphore ip_io_sem; | ||
50 | |||
51 | /* Used by the journalling code to attach an inode to a | ||
52 | * handle. These are protected by ip_io_sem in order to lock | ||
53 | * out other I/O to the inode until we either commit or | ||
54 | * abort. */ | ||
55 | struct list_head ip_handle_list; | ||
56 | struct ocfs2_journal_handle *ip_handle; | ||
57 | |||
58 | u32 ip_flags; /* see below */ | ||
59 | |||
60 | /* protected by recovery_lock. */ | ||
61 | struct inode *ip_next_orphan; | ||
62 | |||
63 | u32 ip_dir_start_lookup; | ||
64 | |||
65 | /* next two are protected by trans_inc_lock */ | ||
66 | /* which transaction were we created on? Zero if none. */ | ||
67 | unsigned long ip_created_trans; | ||
68 | /* last transaction we were a part of. */ | ||
69 | unsigned long ip_last_trans; | ||
70 | |||
71 | struct ocfs2_caching_info ip_metadata_cache; | ||
72 | |||
73 | struct inode vfs_inode; | ||
74 | }; | ||
75 | |||
76 | /* | ||
77 | * Flags for the ip_flags field | ||
78 | */ | ||
79 | /* System file inodes */ | ||
80 | #define OCFS2_INODE_SYSTEM_FILE 0x00000001 | ||
81 | #define OCFS2_INODE_JOURNAL 0x00000002 | ||
82 | #define OCFS2_INODE_BITMAP 0x00000004 | ||
83 | /* This inode has been wiped from disk */ | ||
84 | #define OCFS2_INODE_DELETED 0x00000008 | ||
85 | /* Another node is deleting, so our delete is a nop */ | ||
86 | #define OCFS2_INODE_SKIP_DELETE 0x00000010 | ||
87 | /* Has the inode been orphaned on another node? | ||
88 | * | ||
89 | * This hints to ocfs2_drop_inode that it should clear i_nlink before | ||
90 | * continuing. | ||
91 | * | ||
92 | * We *only* set this on unlink vote from another node. If the inode | ||
93 | * was locally orphaned, then we're sure of the state and don't need | ||
94 | * to twiddle i_nlink later - it's either zero or not depending on | ||
95 | * whether our unlink succeeded. Otherwise we got this from a node | ||
96 | * whose intention was to orphan the inode; however, that node may | ||
97 | * have crashed or failed, so we let ocfs2_drop_inode zero the value and | ||
98 | * rely on ocfs2_delete_inode to sort things out under the proper | ||
99 | * cluster locks. | ||
100 | */ | ||
101 | #define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 | ||
102 | /* Does someone have the file open O_DIRECT */ | ||
103 | #define OCFS2_INODE_OPEN_DIRECT 0x00000040 | ||
104 | /* Indicates that the metadata cache should be used as an array. */ | ||
105 | #define OCFS2_INODE_CACHE_INLINE 0x00000080 | ||
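
These ip_flags values are single bits, so they compose with the usual mask operations: |= to set, & to test, &= ~ to clear (always under ip_lock in the code above). A quick userspace sketch of the same bit-flag idiom, with hypothetical names mirroring the values here:

#include <stdint.h>
#include <stdio.h>

#define INODE_SYSTEM_FILE 0x00000001
#define INODE_DELETED     0x00000008

int main(void)
{
	uint32_t flags = 0;

	flags |= INODE_SYSTEM_FILE;        /* set a flag */
	if (flags & INODE_SYSTEM_FILE)     /* test a flag */
		puts("system file");
	flags &= ~INODE_SYSTEM_FILE;       /* clear a flag */

	printf("deleted? %s\n", (flags & INODE_DELETED) ? "yes" : "no");
	return 0;
}
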
106 | |||
107 | static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) | ||
108 | { | ||
109 | return container_of(inode, struct ocfs2_inode_info, vfs_inode); | ||
110 | } | ||
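
OCFS2_I() relies on container_of(): given a pointer to the embedded struct inode (vfs_inode), it recovers the enclosing ocfs2_inode_info by subtracting the member's offset. A self-contained userspace sketch with container_of spelled out via offsetof (struct names here are invented for illustration):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vfs_inode { unsigned long i_ino; };

struct my_inode_info {
	unsigned long long ip_blkno;
	struct vfs_inode vfs_inode;   /* embedded VFS object */
};

static struct my_inode_info *MY_I(struct vfs_inode *inode)
{
	return container_of(inode, struct my_inode_info, vfs_inode);
}

int main(void)
{
	struct my_inode_info info = { .ip_blkno = 42 };
	struct vfs_inode *inode = &info.vfs_inode;

	/* Walk back from the embedded member to its container. */
	printf("ip_blkno = %llu\n", MY_I(inode)->ip_blkno);
	return 0;
}
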
111 | |||
112 | #define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL) | ||
113 | #define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL) | ||
114 | |||
115 | extern kmem_cache_t *ocfs2_inode_cache; | ||
116 | |||
117 | extern struct address_space_operations ocfs2_aops; | ||
118 | |||
119 | struct buffer_head *ocfs2_bread(struct inode *inode, int block, | ||
120 | int *err, int reada); | ||
121 | void ocfs2_clear_inode(struct inode *inode); | ||
122 | void ocfs2_delete_inode(struct inode *inode); | ||
123 | void ocfs2_drop_inode(struct inode *inode); | ||
124 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff); | ||
125 | struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, | ||
126 | u64 blkno, | ||
127 | int delete_vote); | ||
128 | int ocfs2_inode_init_private(struct inode *inode); | ||
129 | int ocfs2_inode_revalidate(struct dentry *dentry); | ||
130 | int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | ||
131 | int create_ino); | ||
132 | void ocfs2_read_inode(struct inode *inode); | ||
133 | void ocfs2_read_inode2(struct inode *inode, void *opaque); | ||
134 | ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, | ||
135 | size_t size, loff_t *offp); | ||
136 | void ocfs2_sync_blockdev(struct super_block *sb); | ||
137 | void ocfs2_refresh_inode(struct inode *inode, | ||
138 | struct ocfs2_dinode *fe); | ||
139 | int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle, | ||
140 | struct inode *inode, | ||
141 | struct buffer_head *bh); | ||
142 | int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); | ||
143 | int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); | ||
144 | |||
145 | #endif /* OCFS2_INODE_H */ | ||
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c new file mode 100644 index 000000000000..04428042e5e5 --- /dev/null +++ b/fs/ocfs2/journal.c | |||
@@ -0,0 +1,1652 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * journal.c | ||
5 | * | ||
6 | * Defines functions of journalling api | ||
7 | * | ||
8 | * Copyright (C) 2003, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/kthread.h> | ||
31 | |||
32 | #define MLOG_MASK_PREFIX ML_JOURNAL | ||
33 | #include <cluster/masklog.h> | ||
34 | |||
35 | #include "ocfs2.h" | ||
36 | |||
37 | #include "alloc.h" | ||
38 | #include "dlmglue.h" | ||
39 | #include "extent_map.h" | ||
40 | #include "heartbeat.h" | ||
41 | #include "inode.h" | ||
42 | #include "journal.h" | ||
43 | #include "localalloc.h" | ||
44 | #include "namei.h" | ||
45 | #include "slot_map.h" | ||
46 | #include "super.h" | ||
47 | #include "vote.h" | ||
48 | #include "sysfile.h" | ||
49 | |||
50 | #include "buffer_head_io.h" | ||
51 | |||
52 | spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED; | ||
53 | |||
54 | static int ocfs2_force_read_journal(struct inode *inode); | ||
55 | static int ocfs2_recover_node(struct ocfs2_super *osb, | ||
56 | int node_num); | ||
57 | static int __ocfs2_recovery_thread(void *arg); | ||
58 | static int ocfs2_commit_cache(struct ocfs2_super *osb); | ||
59 | static int ocfs2_wait_on_mount(struct ocfs2_super *osb); | ||
60 | static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal, | ||
61 | struct ocfs2_journal_handle *handle); | ||
62 | static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle); | ||
63 | static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, | ||
64 | int dirty); | ||
65 | static int ocfs2_trylock_journal(struct ocfs2_super *osb, | ||
66 | int slot_num); | ||
67 | static int ocfs2_recover_orphans(struct ocfs2_super *osb, | ||
68 | int slot); | ||
69 | static int ocfs2_commit_thread(void *arg); | ||
70 | |||
71 | static int ocfs2_commit_cache(struct ocfs2_super *osb) | ||
72 | { | ||
73 | int status = 0; | ||
74 | unsigned int flushed; | ||
75 | unsigned long old_id; | ||
76 | struct ocfs2_journal *journal = NULL; | ||
77 | |||
78 | mlog_entry_void(); | ||
79 | |||
80 | journal = osb->journal; | ||
81 | |||
82 | /* Flush all pending commits and checkpoint the journal. */ | ||
83 | down_write(&journal->j_trans_barrier); | ||
84 | |||
85 | if (atomic_read(&journal->j_num_trans) == 0) { | ||
86 | up_write(&journal->j_trans_barrier); | ||
87 | mlog(0, "No transactions for me to flush!\n"); | ||
88 | goto finally; | ||
89 | } | ||
90 | |||
91 | journal_lock_updates(journal->j_journal); | ||
92 | status = journal_flush(journal->j_journal); | ||
93 | journal_unlock_updates(journal->j_journal); | ||
94 | if (status < 0) { | ||
95 | up_write(&journal->j_trans_barrier); | ||
96 | mlog_errno(status); | ||
97 | goto finally; | ||
98 | } | ||
99 | |||
100 | old_id = ocfs2_inc_trans_id(journal); | ||
101 | |||
102 | flushed = atomic_read(&journal->j_num_trans); | ||
103 | atomic_set(&journal->j_num_trans, 0); | ||
104 | up_write(&journal->j_trans_barrier); | ||
105 | |||
106 | mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n", | ||
107 | journal->j_trans_id, flushed); | ||
108 | |||
109 | ocfs2_kick_vote_thread(osb); | ||
110 | wake_up(&journal->j_checkpointed); | ||
111 | finally: | ||
112 | mlog_exit(status); | ||
113 | return status; | ||
114 | } | ||
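
j_trans_barrier is used inverted relative to the usual reader/writer split: every transaction start takes it for read (so many can run at once), while the flush above takes it for write, draining all running transactions before checkpointing. A userspace sketch of the same barrier shape with a pthread rwlock (compile with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t trans_barrier = PTHREAD_RWLOCK_INITIALIZER;

static void *run_transaction(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&trans_barrier);   /* many txns in parallel */
	/* ... journaled work ... */
	pthread_rwlock_unlock(&trans_barrier);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, run_transaction, NULL);
	pthread_join(t, NULL);

	/* Flusher: the write lock waits out all readers (transactions). */
	pthread_rwlock_wrlock(&trans_barrier);
	puts("all transactions drained; safe to flush");
	pthread_rwlock_unlock(&trans_barrier);
	return 0;
}
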
115 | |||
116 | struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb) | ||
117 | { | ||
118 | struct ocfs2_journal_handle *retval = NULL; | ||
119 | |||
120 | retval = kcalloc(1, sizeof(*retval), GFP_KERNEL); | ||
121 | if (!retval) { | ||
122 | mlog(ML_ERROR, "Failed to allocate memory for journal " | ||
123 | "handle!\n"); | ||
124 | return NULL; | ||
125 | } | ||
126 | |||
127 | retval->max_buffs = 0; | ||
128 | retval->num_locks = 0; | ||
129 | retval->k_handle = NULL; | ||
130 | |||
131 | INIT_LIST_HEAD(&retval->locks); | ||
132 | INIT_LIST_HEAD(&retval->inode_list); | ||
133 | retval->journal = osb->journal; | ||
134 | |||
135 | return retval; | ||
136 | } | ||
137 | |||
138 | /* pass it NULL and it will allocate a new handle object for you. If | ||
139 | * you pass it a handle, however, it may still return an error, in | ||
140 | * which case it has freed the passed handle for you. */ | ||
141 | struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb, | ||
142 | struct ocfs2_journal_handle *handle, | ||
143 | int max_buffs) | ||
144 | { | ||
145 | int ret; | ||
146 | journal_t *journal = osb->journal->j_journal; | ||
147 | |||
148 | mlog_entry("(max_buffs = %d)\n", max_buffs); | ||
149 | |||
150 | if (!osb || !osb->journal->j_journal) | ||
151 | BUG(); | ||
152 | |||
153 | if (ocfs2_is_hard_readonly(osb)) { | ||
154 | ret = -EROFS; | ||
155 | goto done_free; | ||
156 | } | ||
157 | |||
158 | BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); | ||
159 | BUG_ON(max_buffs <= 0); | ||
160 | |||
161 | /* JBD might support this, but our journalling code doesn't yet. */ | ||
162 | if (journal_current_handle()) { | ||
163 | mlog(ML_ERROR, "Recursive transaction attempted!\n"); | ||
164 | BUG(); | ||
165 | } | ||
166 | |||
167 | if (!handle) | ||
168 | handle = ocfs2_alloc_handle(osb); | ||
169 | if (!handle) { | ||
170 | ret = -ENOMEM; | ||
171 | mlog(ML_ERROR, "Failed to allocate memory for journal " | ||
172 | "handle!\n"); | ||
173 | goto done_free; | ||
174 | } | ||
175 | |||
176 | handle->max_buffs = max_buffs; | ||
177 | |||
178 | down_read(&osb->journal->j_trans_barrier); | ||
179 | |||
180 | /* actually start the transaction now */ | ||
181 | handle->k_handle = journal_start(journal, max_buffs); | ||
182 | if (IS_ERR(handle->k_handle)) { | ||
183 | up_read(&osb->journal->j_trans_barrier); | ||
184 | |||
185 | ret = PTR_ERR(handle->k_handle); | ||
186 | handle->k_handle = NULL; | ||
187 | mlog_errno(ret); | ||
188 | |||
189 | if (is_journal_aborted(journal)) { | ||
190 | ocfs2_abort(osb->sb, "Detected aborted journal"); | ||
191 | ret = -EROFS; | ||
192 | } | ||
193 | goto done_free; | ||
194 | } | ||
195 | |||
196 | atomic_inc(&(osb->journal->j_num_trans)); | ||
197 | handle->flags |= OCFS2_HANDLE_STARTED; | ||
198 | |||
199 | mlog_exit_ptr(handle); | ||
200 | return handle; | ||
201 | |||
202 | done_free: | ||
203 | if (handle) | ||
204 | ocfs2_commit_unstarted_handle(handle); /* will kfree handle */ | ||
205 | |||
206 | mlog_exit(ret); | ||
207 | return ERR_PTR(ret); | ||
208 | } | ||
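
ocfs2_start_trans() returns either a valid handle or an error encoded in the pointer itself via ERR_PTR(), which callers unpack with IS_ERR()/PTR_ERR() -- the kernel's way of returning a pointer and an errno through one value (the real macros live in <linux/err.h>). A userspace sketch of that encoding:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Errno values are small, so (void *)-errno lands in the top page
 * of the address space and can never collide with a real pointer. */
#define MAX_ERRNO 4095
#define ERR_PTR(err) ((void *)(long)(err))
#define PTR_ERR(ptr) ((long)(ptr))
#define IS_ERR(ptr)  ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static void *start_trans(int fail)
{
	if (fail)
		return ERR_PTR(-ENOMEM);   /* error travels in the pointer */
	return malloc(16);                 /* a "handle" */
}

int main(void)
{
	void *handle = start_trans(1);

	if (IS_ERR(handle))
		printf("start_trans failed: %ld\n", PTR_ERR(handle));
	else
		free(handle);
	return 0;
}
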
209 | |||
210 | void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle, | ||
211 | struct inode *inode) | ||
212 | { | ||
213 | BUG_ON(!handle); | ||
214 | BUG_ON(!inode); | ||
215 | |||
216 | atomic_inc(&inode->i_count); | ||
217 | |||
218 | /* we're obviously changing it... */ | ||
219 | down(&inode->i_sem); | ||
220 | |||
221 | /* sanity check */ | ||
222 | BUG_ON(OCFS2_I(inode)->ip_handle); | ||
223 | BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list)); | ||
224 | |||
225 | OCFS2_I(inode)->ip_handle = handle; | ||
226 | list_del(&(OCFS2_I(inode)->ip_handle_list)); | ||
227 | list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list)); | ||
228 | } | ||
229 | |||
230 | static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle) | ||
231 | { | ||
232 | struct list_head *p, *n; | ||
233 | struct inode *inode; | ||
234 | struct ocfs2_inode_info *oi; | ||
235 | |||
236 | list_for_each_safe(p, n, &handle->inode_list) { | ||
237 | oi = list_entry(p, struct ocfs2_inode_info, | ||
238 | ip_handle_list); | ||
239 | inode = &oi->vfs_inode; | ||
240 | |||
241 | OCFS2_I(inode)->ip_handle = NULL; | ||
242 | list_del_init(&OCFS2_I(inode)->ip_handle_list); | ||
243 | |||
244 | up(&inode->i_sem); | ||
245 | iput(inode); | ||
246 | } | ||
247 | } | ||
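
Both ocfs2_handle_unlock_inodes() above and ocfs2_handle_cleanup_locks() below walk their lists with list_for_each_safe(), which caches the next pointer so the current entry can be unlinked and freed mid-iteration. A userspace sketch of safe removal during traversal, using a plain singly linked list instead of the kernel's list_head:

#include <stdio.h>
#include <stdlib.h>

struct node {
	int val;
	struct node *next;
};

int main(void)
{
	struct node *head = NULL, *cur, *next;
	int i;

	/* Build a short list: 2 -> 1 -> 0. */
	for (i = 0; i < 3; i++) {
		cur = malloc(sizeof(*cur));
		cur->val = i;
		cur->next = head;
		head = cur;
	}

	/* "Safe" traversal: grab next before freeing cur -- exactly
	 * why list_for_each_safe() exists. */
	for (cur = head; cur; cur = next) {
		next = cur->next;
		printf("dropping %d\n", cur->val);
		free(cur);
	}
	return 0;
}
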
248 | |||
249 | /* This is trivial so we do it out of the main commit | ||
250 | * paths. Beware, it can be called from start_trans too! */ | ||
251 | static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle) | ||
252 | { | ||
253 | mlog_entry_void(); | ||
254 | |||
255 | BUG_ON(handle->flags & OCFS2_HANDLE_STARTED); | ||
256 | |||
257 | ocfs2_handle_unlock_inodes(handle); | ||
258 | /* You are allowed to add journal locks before the transaction | ||
259 | * has started. */ | ||
260 | ocfs2_handle_cleanup_locks(handle->journal, handle); | ||
261 | |||
262 | kfree(handle); | ||
263 | |||
264 | mlog_exit_void(); | ||
265 | } | ||
266 | |||
267 | void ocfs2_commit_trans(struct ocfs2_journal_handle *handle) | ||
268 | { | ||
269 | handle_t *jbd_handle; | ||
270 | int retval; | ||
271 | struct ocfs2_journal *journal = handle->journal; | ||
272 | |||
273 | mlog_entry_void(); | ||
274 | |||
275 | BUG_ON(!handle); | ||
276 | |||
277 | if (!(handle->flags & OCFS2_HANDLE_STARTED)) { | ||
278 | ocfs2_commit_unstarted_handle(handle); | ||
279 | mlog_exit_void(); | ||
280 | return; | ||
281 | } | ||
282 | |||
283 | /* release inode semaphores we took during this transaction */ | ||
284 | ocfs2_handle_unlock_inodes(handle); | ||
285 | |||
286 | /* ocfs2_extend_trans may have had to call journal_restart | ||
287 | * which will always commit the transaction, but may return | ||
288 | * error for any number of reasons. If this is the case, we | ||
289 | * clear k_handle as it's not valid any more. */ | ||
290 | if (handle->k_handle) { | ||
291 | jbd_handle = handle->k_handle; | ||
292 | |||
293 | if (handle->flags & OCFS2_HANDLE_SYNC) | ||
294 | jbd_handle->h_sync = 1; | ||
295 | else | ||
296 | jbd_handle->h_sync = 0; | ||
297 | |||
298 | /* actually stop the transaction. if we've set h_sync, | ||
299 | * it'll have been committed when we return */ | ||
300 | retval = journal_stop(jbd_handle); | ||
301 | if (retval < 0) { | ||
302 | mlog_errno(retval); | ||
303 | mlog(ML_ERROR, "Could not commit transaction\n"); | ||
304 | BUG(); | ||
305 | } | ||
306 | |||
307 | handle->k_handle = NULL; /* it's been freed in journal_stop */ | ||
308 | } | ||
309 | |||
310 | ocfs2_handle_cleanup_locks(journal, handle); | ||
311 | |||
312 | up_read(&journal->j_trans_barrier); | ||
313 | |||
314 | kfree(handle); | ||
315 | mlog_exit_void(); | ||
316 | } | ||
317 | |||
318 | /* | ||
319 | * 'nblocks' is what you want to add to the current | ||
320 | * transaction. extend_trans will either extend the current handle by | ||
321 | * nblocks, or commit it and start a new one with nblocks credits. | ||
322 | * | ||
323 | * WARNING: This will not release any semaphores or disk locks taken | ||
324 | * during the transaction, so make sure they were taken *before* | ||
325 | * start_trans or we'll have ordering deadlocks. | ||
326 | * | ||
327 | * WARNING2: Note that we do *not* drop j_trans_barrier here. This is | ||
328 | * good because transaction ids haven't yet been recorded on the | ||
329 | * cluster locks associated with this handle. | ||
330 | */ | ||
331 | int ocfs2_extend_trans(struct ocfs2_journal_handle *handle, | ||
332 | int nblocks) | ||
333 | { | ||
334 | int status; | ||
335 | |||
336 | BUG_ON(!handle); | ||
337 | BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); | ||
338 | BUG_ON(!nblocks); | ||
339 | |||
340 | mlog_entry_void(); | ||
341 | |||
342 | mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); | ||
343 | |||
344 | status = journal_extend(handle->k_handle, nblocks); | ||
345 | if (status < 0) { | ||
346 | mlog_errno(status); | ||
347 | goto bail; | ||
348 | } | ||
349 | |||
350 | if (status > 0) { | ||
351 | mlog(0, "journal_extend failed, trying journal_restart\n"); | ||
352 | status = journal_restart(handle->k_handle, nblocks); | ||
353 | if (status < 0) { | ||
354 | handle->k_handle = NULL; | ||
355 | mlog_errno(status); | ||
356 | goto bail; | ||
357 | } | ||
358 | handle->max_buffs = nblocks; | ||
359 | } else | ||
360 | handle->max_buffs += nblocks; | ||
361 | |||
362 | status = 0; | ||
363 | bail: | ||
364 | |||
365 | mlog_exit(status); | ||
366 | return status; | ||
367 | } | ||
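
The extend-or-restart logic above is a common JBD idiom: journal_extend() returns 0 when the running transaction can grow, a positive value when it cannot (fall back to journal_restart(), committing what is done so far), and a negative errno on real failure. A hedged sketch of that control flow with stubbed journal calls (extend_credits/restart_txn are stand-ins, not real APIs):

#include <stdio.h>

/* Stand-in: 0 = extended, >0 = no room (caller must restart),
 * <0 = hard error. Mirrors journal_extend()'s contract. */
static int extend_credits(int nblocks) { return nblocks > 8 ? 1 : 0; }
static int restart_txn(int nblocks)   { (void)nblocks; return 0; }

static int extend_trans(int *max_buffs, int nblocks)
{
	int status = extend_credits(nblocks);

	if (status < 0)
		return status;             /* hard failure */

	if (status > 0) {
		/* No headroom: commit and start over with fresh credits. */
		status = restart_txn(nblocks);
		if (status < 0)
			return status;
		*max_buffs = nblocks;
	} else {
		*max_buffs += nblocks;
	}
	return 0;
}

int main(void)
{
	int max_buffs = 4;

	extend_trans(&max_buffs, 4);    /* extends in place */
	extend_trans(&max_buffs, 16);   /* forces a restart */
	printf("max_buffs = %d\n", max_buffs);
	return 0;
}
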
368 | |||
369 | int ocfs2_journal_access(struct ocfs2_journal_handle *handle, | ||
370 | struct inode *inode, | ||
371 | struct buffer_head *bh, | ||
372 | int type) | ||
373 | { | ||
374 | int status; | ||
375 | |||
376 | BUG_ON(!inode); | ||
377 | BUG_ON(!handle); | ||
378 | BUG_ON(!bh); | ||
379 | BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); | ||
380 | |||
381 | mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %hu\n", | ||
382 | (unsigned long long)bh->b_blocknr, type, | ||
383 | (type == OCFS2_JOURNAL_ACCESS_CREATE) ? | ||
384 | "OCFS2_JOURNAL_ACCESS_CREATE" : | ||
385 | "OCFS2_JOURNAL_ACCESS_WRITE", | ||
386 | bh->b_size); | ||
387 | |||
388 | /* we can safely remove this assertion after testing. */ | ||
389 | if (!buffer_uptodate(bh)) { | ||
390 | mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); | ||
391 | mlog(ML_ERROR, "b_blocknr=%llu\n", | ||
392 | (unsigned long long)bh->b_blocknr); | ||
393 | BUG(); | ||
394 | } | ||
395 | |||
396 | /* Set the current transaction information on the inode so | ||
397 | * that the locking code knows whether it can drop its locks | ||
398 | * on this inode or not. We're protected from the commit | ||
399 | * thread updating the current transaction id until | ||
400 | * ocfs2_commit_trans() because ocfs2_start_trans() took | ||
401 | * j_trans_barrier for us. */ | ||
402 | ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); | ||
403 | |||
404 | down(&OCFS2_I(inode)->ip_io_sem); | ||
405 | switch (type) { | ||
406 | case OCFS2_JOURNAL_ACCESS_CREATE: | ||
407 | case OCFS2_JOURNAL_ACCESS_WRITE: | ||
408 | status = journal_get_write_access(handle->k_handle, bh); | ||
409 | break; | ||
410 | |||
411 | case OCFS2_JOURNAL_ACCESS_UNDO: | ||
412 | status = journal_get_undo_access(handle->k_handle, bh); | ||
413 | break; | ||
414 | |||
415 | default: | ||
416 | status = -EINVAL; | ||
417 | mlog(ML_ERROR, "Uknown access type!\n"); | ||
418 | } | ||
419 | up(&OCFS2_I(inode)->ip_io_sem); | ||
420 | |||
421 | if (status < 0) | ||
422 | mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", | ||
423 | status, type); | ||
424 | |||
425 | mlog_exit(status); | ||
426 | return status; | ||
427 | } | ||
428 | |||
429 | int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle, | ||
430 | struct buffer_head *bh) | ||
431 | { | ||
432 | int status; | ||
433 | |||
434 | BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); | ||
435 | |||
436 | mlog_entry("(bh->b_blocknr=%llu)\n", | ||
437 | (unsigned long long)bh->b_blocknr); | ||
438 | |||
439 | status = journal_dirty_metadata(handle->k_handle, bh); | ||
440 | if (status < 0) | ||
441 | mlog(ML_ERROR, "Could not dirty metadata buffer. " | ||
442 | "(bh->b_blocknr=%llu)\n", | ||
443 | (unsigned long long)bh->b_blocknr); | ||
444 | |||
445 | mlog_exit(status); | ||
446 | return status; | ||
447 | } | ||
448 | |||
449 | int ocfs2_journal_dirty_data(handle_t *handle, | ||
450 | struct buffer_head *bh) | ||
451 | { | ||
452 | int err = journal_dirty_data(handle, bh); | ||
453 | if (err) | ||
454 | mlog_errno(err); | ||
455 | /* TODO: When we can handle it, abort the handle and go RO on | ||
456 | * error here. */ | ||
457 | |||
458 | return err; | ||
459 | } | ||
460 | |||
461 | /* We always assume you're adding a metadata lock at level 'ex' */ | ||
462 | int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle, | ||
463 | struct inode *inode) | ||
464 | { | ||
465 | int status; | ||
466 | struct ocfs2_journal_lock *lock; | ||
467 | |||
468 | BUG_ON(!inode); | ||
469 | |||
470 | lock = kmem_cache_alloc(ocfs2_lock_cache, GFP_NOFS); | ||
471 | if (!lock) { | ||
472 | status = -ENOMEM; | ||
473 | mlog_errno(-ENOMEM); | ||
474 | goto bail; | ||
475 | } | ||
476 | |||
477 | if (!igrab(inode)) | ||
478 | BUG(); | ||
479 | lock->jl_inode = inode; | ||
480 | |||
481 | list_add_tail(&(lock->jl_lock_list), &(handle->locks)); | ||
482 | handle->num_locks++; | ||
483 | |||
484 | status = 0; | ||
485 | bail: | ||
486 | mlog_exit(status); | ||
487 | return status; | ||
488 | } | ||
489 | |||
490 | static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal, | ||
491 | struct ocfs2_journal_handle *handle) | ||
492 | { | ||
493 | struct list_head *p, *n; | ||
494 | struct ocfs2_journal_lock *lock; | ||
495 | struct inode *inode; | ||
496 | |||
497 | list_for_each_safe(p, n, &(handle->locks)) { | ||
498 | lock = list_entry(p, struct ocfs2_journal_lock, | ||
499 | jl_lock_list); | ||
500 | list_del(&lock->jl_lock_list); | ||
501 | handle->num_locks--; | ||
502 | |||
503 | inode = lock->jl_inode; | ||
504 | ocfs2_meta_unlock(inode, 1); | ||
505 | if (atomic_read(&inode->i_count) == 1) | ||
506 | mlog(ML_ERROR, | ||
507 | "Inode %"MLFu64", I'm doing a last iput for!", | ||
508 | OCFS2_I(inode)->ip_blkno); | ||
509 | iput(inode); | ||
510 | kmem_cache_free(ocfs2_lock_cache, lock); | ||
511 | } | ||
512 | } | ||
513 | |||
514 | #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5) | ||
515 | |||
516 | void ocfs2_set_journal_params(struct ocfs2_super *osb) | ||
517 | { | ||
518 | journal_t *journal = osb->journal->j_journal; | ||
519 | |||
520 | spin_lock(&journal->j_state_lock); | ||
521 | journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; | ||
522 | if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) | ||
523 | journal->j_flags |= JFS_BARRIER; | ||
524 | else | ||
525 | journal->j_flags &= ~JFS_BARRIER; | ||
526 | spin_unlock(&journal->j_state_lock); | ||
527 | } | ||
528 | |||
529 | int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) | ||
530 | { | ||
531 | int status = -1; | ||
532 | struct inode *inode = NULL; /* the journal inode */ | ||
533 | journal_t *j_journal = NULL; | ||
534 | struct ocfs2_dinode *di = NULL; | ||
535 | struct buffer_head *bh = NULL; | ||
536 | struct ocfs2_super *osb; | ||
537 | int meta_lock = 0; | ||
538 | |||
539 | mlog_entry_void(); | ||
540 | |||
541 | BUG_ON(!journal); | ||
542 | |||
543 | osb = journal->j_osb; | ||
544 | |||
545 | /* already have the inode for our journal */ | ||
546 | inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, | ||
547 | osb->slot_num); | ||
548 | if (inode == NULL) { | ||
549 | status = -EACCES; | ||
550 | mlog_errno(status); | ||
551 | goto done; | ||
552 | } | ||
553 | if (is_bad_inode(inode)) { | ||
554 | mlog(ML_ERROR, "access error (bad inode)\n"); | ||
555 | iput(inode); | ||
556 | inode = NULL; | ||
557 | status = -EACCES; | ||
558 | goto done; | ||
559 | } | ||
560 | |||
561 | SET_INODE_JOURNAL(inode); | ||
562 | OCFS2_I(inode)->ip_open_count++; | ||
563 | |||
564 | status = ocfs2_meta_lock(inode, NULL, &bh, 1); | ||
565 | if (status < 0) { | ||
566 | if (status != -ERESTARTSYS) | ||
567 | mlog(ML_ERROR, "Could not get lock on journal!\n"); | ||
568 | goto done; | ||
569 | } | ||
570 | |||
571 | meta_lock = 1; | ||
572 | di = (struct ocfs2_dinode *)bh->b_data; | ||
573 | |||
574 | if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { | ||
575 | mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", | ||
576 | inode->i_size); | ||
577 | status = -EINVAL; | ||
578 | goto done; | ||
579 | } | ||
580 | |||
581 | mlog(0, "inode->i_size = %lld\n", inode->i_size); | ||
582 | mlog(0, "inode->i_blocks = %lu\n", inode->i_blocks); | ||
583 | mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters); | ||
584 | |||
585 | /* call the kernel's journal init function now */ | ||
586 | j_journal = journal_init_inode(inode); | ||
587 | if (j_journal == NULL) { | ||
588 | mlog(ML_ERROR, "Linux journal layer error\n"); | ||
589 | status = -EINVAL; | ||
590 | goto done; | ||
591 | } | ||
592 | |||
593 | mlog(0, "Returned from journal_init_inode\n"); | ||
594 | mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen); | ||
595 | |||
596 | *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & | ||
597 | OCFS2_JOURNAL_DIRTY_FL); | ||
598 | |||
599 | journal->j_journal = j_journal; | ||
600 | journal->j_inode = inode; | ||
601 | journal->j_bh = bh; | ||
602 | |||
603 | ocfs2_set_journal_params(osb); | ||
604 | |||
605 | journal->j_state = OCFS2_JOURNAL_LOADED; | ||
606 | |||
607 | status = 0; | ||
608 | done: | ||
609 | if (status < 0) { | ||
610 | if (meta_lock) | ||
611 | ocfs2_meta_unlock(inode, 1); | ||
612 | if (bh != NULL) | ||
613 | brelse(bh); | ||
614 | if (inode) { | ||
615 | OCFS2_I(inode)->ip_open_count--; | ||
616 | iput(inode); | ||
617 | } | ||
618 | } | ||
619 | |||
620 | mlog_exit(status); | ||
621 | return status; | ||
622 | } | ||
623 | |||
624 | static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, | ||
625 | int dirty) | ||
626 | { | ||
627 | int status; | ||
628 | unsigned int flags; | ||
629 | struct ocfs2_journal *journal = osb->journal; | ||
630 | struct buffer_head *bh = journal->j_bh; | ||
631 | struct ocfs2_dinode *fe; | ||
632 | |||
633 | mlog_entry_void(); | ||
634 | |||
635 | fe = (struct ocfs2_dinode *)bh->b_data; | ||
636 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
637 | /* This is called from startup/shutdown which will | ||
638 | * handle the errors in a specific manner, so no need | ||
639 | * to call ocfs2_error() here. */ | ||
640 | mlog(ML_ERROR, "Journal dinode %"MLFu64" has invalid " | ||
641 | "signature: %.*s", fe->i_blkno, 7, fe->i_signature); | ||
642 | status = -EIO; | ||
643 | goto out; | ||
644 | } | ||
645 | |||
646 | flags = le32_to_cpu(fe->id1.journal1.ij_flags); | ||
647 | if (dirty) | ||
648 | flags |= OCFS2_JOURNAL_DIRTY_FL; | ||
649 | else | ||
650 | flags &= ~OCFS2_JOURNAL_DIRTY_FL; | ||
651 | fe->id1.journal1.ij_flags = cpu_to_le32(flags); | ||
652 | |||
653 | status = ocfs2_write_block(osb, bh, journal->j_inode); | ||
654 | if (status < 0) | ||
655 | mlog_errno(status); | ||
656 | |||
657 | out: | ||
658 | mlog_exit(status); | ||
659 | return status; | ||
660 | } | ||
661 | |||
662 | /* | ||
663 | * If the journal has been kmalloc'd it needs to be freed after this | ||
664 | * call. | ||
665 | */ | ||
666 | void ocfs2_journal_shutdown(struct ocfs2_super *osb) | ||
667 | { | ||
668 | struct ocfs2_journal *journal = NULL; | ||
669 | int status = 0; | ||
670 | struct inode *inode = NULL; | ||
671 | int num_running_trans = 0; | ||
672 | |||
673 | mlog_entry_void(); | ||
674 | |||
675 | if (!osb) | ||
676 | BUG(); | ||
677 | |||
678 | journal = osb->journal; | ||
679 | if (!journal) | ||
680 | goto done; | ||
681 | |||
682 | inode = journal->j_inode; | ||
683 | |||
684 | if (journal->j_state != OCFS2_JOURNAL_LOADED) | ||
685 | goto done; | ||
686 | |||
687 | /* need to inc inode use count as journal_destroy will iput. */ | ||
688 | if (!igrab(inode)) | ||
689 | BUG(); | ||
690 | |||
691 | num_running_trans = atomic_read(&(osb->journal->j_num_trans)); | ||
692 | if (num_running_trans > 0) | ||
693 | mlog(0, "Shutting down journal: must wait on %d " | ||
694 | "running transactions!\n", | ||
695 | num_running_trans); | ||
696 | |||
697 | /* Do a commit_cache here. It will flush our journal, *and* | ||
698 | * release any locks that are still held. | ||
699 | * Set the SHUTDOWN flag and release the trans lock; | ||
700 | * the commit thread will take the trans lock for us below. */ | ||
701 | journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN; | ||
702 | |||
703 | /* OCFS2_JOURNAL_IN_SHUTDOWN signals commit_cache not to | ||
704 | * drop the trans_lock (which we want to hold until we | ||
705 | * completely destroy the journal). */ | ||
706 | if (osb->commit_task) { | ||
707 | /* Wait for the commit thread */ | ||
708 | mlog(0, "Waiting for ocfs2commit to exit....\n"); | ||
709 | kthread_stop(osb->commit_task); | ||
710 | osb->commit_task = NULL; | ||
711 | } | ||
712 | |||
713 | BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); | ||
714 | |||
715 | status = ocfs2_journal_toggle_dirty(osb, 0); | ||
716 | if (status < 0) | ||
717 | mlog_errno(status); | ||
718 | |||
719 | /* Shutdown the kernel journal system */ | ||
720 | journal_destroy(journal->j_journal); | ||
721 | |||
722 | OCFS2_I(inode)->ip_open_count--; | ||
723 | |||
724 | /* unlock our journal */ | ||
725 | ocfs2_meta_unlock(inode, 1); | ||
726 | |||
727 | brelse(journal->j_bh); | ||
728 | journal->j_bh = NULL; | ||
729 | |||
730 | journal->j_state = OCFS2_JOURNAL_FREE; | ||
731 | |||
732 | // up_write(&journal->j_trans_barrier); | ||
733 | done: | ||
734 | if (inode) | ||
735 | iput(inode); | ||
736 | mlog_exit_void(); | ||
737 | } | ||
738 | |||
739 | static void ocfs2_clear_journal_error(struct super_block *sb, | ||
740 | journal_t *journal, | ||
741 | int slot) | ||
742 | { | ||
743 | int olderr; | ||
744 | |||
745 | olderr = journal_errno(journal); | ||
746 | if (olderr) { | ||
747 | mlog(ML_ERROR, "File system error %d recorded in " | ||
748 | "journal %u.\n", olderr, slot); | ||
749 | mlog(ML_ERROR, "File system on device %s needs checking.\n", | ||
750 | sb->s_id); | ||
751 | |||
752 | journal_ack_err(journal); | ||
753 | journal_clear_err(journal); | ||
754 | } | ||
755 | } | ||
756 | |||
757 | int ocfs2_journal_load(struct ocfs2_journal *journal) | ||
758 | { | ||
759 | int status = 0; | ||
760 | struct ocfs2_super *osb; | ||
761 | |||
762 | mlog_entry_void(); | ||
763 | |||
764 | if (!journal) | ||
765 | BUG(); | ||
766 | |||
767 | osb = journal->j_osb; | ||
768 | |||
769 | status = journal_load(journal->j_journal); | ||
770 | if (status < 0) { | ||
771 | mlog(ML_ERROR, "Failed to load journal!\n"); | ||
772 | goto done; | ||
773 | } | ||
774 | |||
775 | ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); | ||
776 | |||
777 | status = ocfs2_journal_toggle_dirty(osb, 1); | ||
778 | if (status < 0) { | ||
779 | mlog_errno(status); | ||
780 | goto done; | ||
781 | } | ||
782 | |||
783 | /* Launch the commit thread */ | ||
784 | osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d", | ||
785 | osb->osb_id); | ||
786 | if (IS_ERR(osb->commit_task)) { | ||
787 | status = PTR_ERR(osb->commit_task); | ||
788 | osb->commit_task = NULL; | ||
789 | mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d", | ||
790 | status); | ||
791 | goto done; | ||
792 | } | ||
793 | |||
794 | done: | ||
795 | mlog_exit(status); | ||
796 | return status; | ||
797 | } | ||
798 | |||
799 | |||
800 | /* 'full' flag tells us whether we clear out all blocks or if we just | ||
801 | * mark the journal clean */ | ||
802 | int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) | ||
803 | { | ||
804 | int status; | ||
805 | |||
806 | mlog_entry_void(); | ||
807 | |||
808 | if (!journal) | ||
809 | BUG(); | ||
810 | |||
811 | status = journal_wipe(journal->j_journal, full); | ||
812 | if (status < 0) { | ||
813 | mlog_errno(status); | ||
814 | goto bail; | ||
815 | } | ||
816 | |||
817 | status = ocfs2_journal_toggle_dirty(journal->j_osb, 0); | ||
818 | if (status < 0) | ||
819 | mlog_errno(status); | ||
820 | |||
821 | bail: | ||
822 | mlog_exit(status); | ||
823 | return status; | ||
824 | } | ||
825 | |||
826 | /* | ||
827 | * JBD might read a cached version of another node's journal file. We | ||
828 | * don't want this, as the file changes often and we get no | ||
829 | * notification of those changes. The only way to be sure that we've | ||
830 | * got the most up-to-date version of those blocks is to force | ||
831 | * read them off disk. Just searching through the buffer cache won't | ||
832 | * work as there may be pages backing this file which are still marked | ||
833 | * up to date. We know things can't change on this file underneath us | ||
834 | * as we have the lock by now :) | ||
835 | */ | ||
836 | static int ocfs2_force_read_journal(struct inode *inode) | ||
837 | { | ||
838 | int status = 0; | ||
839 | int i, p_blocks; | ||
840 | u64 v_blkno, p_blkno; | ||
841 | #define CONCURRENT_JOURNAL_FILL 32 | ||
842 | struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; | ||
843 | |||
844 | mlog_entry_void(); | ||
845 | |||
846 | BUG_ON(inode->i_blocks != | ||
847 | ocfs2_align_bytes_to_sectors(i_size_read(inode))); | ||
848 | |||
849 | memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); | ||
850 | |||
851 | mlog(0, "Force reading %lu blocks\n", | ||
852 | (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))); | ||
853 | |||
854 | v_blkno = 0; | ||
855 | while (v_blkno < | ||
856 | (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) { | ||
857 | |||
858 | status = ocfs2_extent_map_get_blocks(inode, v_blkno, | ||
859 | 1, &p_blkno, | ||
860 | &p_blocks); | ||
861 | if (status < 0) { | ||
862 | mlog_errno(status); | ||
863 | goto bail; | ||
864 | } | ||
865 | |||
866 | if (p_blocks > CONCURRENT_JOURNAL_FILL) | ||
867 | p_blocks = CONCURRENT_JOURNAL_FILL; | ||
868 | |||
869 | status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), | ||
870 | p_blkno, p_blocks, bhs, 0, | ||
871 | inode); | ||
872 | if (status < 0) { | ||
873 | mlog_errno(status); | ||
874 | goto bail; | ||
875 | } | ||
876 | |||
877 | for(i = 0; i < p_blocks; i++) { | ||
878 | brelse(bhs[i]); | ||
879 | bhs[i] = NULL; | ||
880 | } | ||
881 | |||
882 | v_blkno += p_blocks; | ||
883 | } | ||
884 | |||
885 | bail: | ||
886 | for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) | ||
887 | if (bhs[i]) | ||
888 | brelse(bhs[i]); | ||
889 | mlog_exit(status); | ||
890 | return status; | ||
891 | } | ||
892 | |||
893 | struct ocfs2_la_recovery_item { | ||
894 | struct list_head lri_list; | ||
895 | int lri_slot; | ||
896 | struct ocfs2_dinode *lri_la_dinode; | ||
897 | struct ocfs2_dinode *lri_tl_dinode; | ||
898 | }; | ||
899 | |||
900 | /* Does the second half of the recovery process. By this point, the | ||
901 | * node is marked clean and can actually be considered recovered, | ||
902 | * hence it's no longer in the recovery map, but there's still some | ||
903 | * cleanup we can do which shouldn't happen within the recovery thread | ||
904 | * as locking in that context becomes very difficult if we are to take | ||
905 | * recovering nodes into account. | ||
906 | * | ||
907 | * NOTE: This function can and will sleep on recovery of other nodes | ||
908 | * during cluster locking, just like any other ocfs2 process. | ||
909 | */ | ||
910 | void ocfs2_complete_recovery(void *data) | ||
911 | { | ||
912 | int ret; | ||
913 | struct ocfs2_super *osb = data; | ||
914 | struct ocfs2_journal *journal = osb->journal; | ||
915 | struct ocfs2_dinode *la_dinode, *tl_dinode; | ||
916 | struct ocfs2_la_recovery_item *item; | ||
917 | struct list_head *p, *n; | ||
918 | LIST_HEAD(tmp_la_list); | ||
919 | |||
920 | mlog_entry_void(); | ||
921 | |||
922 | mlog(0, "completing recovery from keventd\n"); | ||
923 | |||
924 | spin_lock(&journal->j_lock); | ||
925 | list_splice_init(&journal->j_la_cleanups, &tmp_la_list); | ||
926 | spin_unlock(&journal->j_lock); | ||
927 | |||
928 | list_for_each_safe(p, n, &tmp_la_list) { | ||
929 | item = list_entry(p, struct ocfs2_la_recovery_item, lri_list); | ||
930 | list_del_init(&item->lri_list); | ||
931 | |||
932 | mlog(0, "Complete recovery for slot %d\n", item->lri_slot); | ||
933 | |||
934 | la_dinode = item->lri_la_dinode; | ||
935 | if (la_dinode) { | ||
936 | mlog(0, "Clean up local alloc %"MLFu64"\n", | ||
937 | le64_to_cpu(la_dinode->i_blkno)); | ||
938 | |||
939 | ret = ocfs2_complete_local_alloc_recovery(osb, | ||
940 | la_dinode); | ||
941 | if (ret < 0) | ||
942 | mlog_errno(ret); | ||
943 | |||
944 | kfree(la_dinode); | ||
945 | } | ||
946 | |||
947 | tl_dinode = item->lri_tl_dinode; | ||
948 | if (tl_dinode) { | ||
949 | mlog(0, "Clean up truncate log %"MLFu64"\n", | ||
950 | le64_to_cpu(tl_dinode->i_blkno)); | ||
951 | |||
952 | ret = ocfs2_complete_truncate_log_recovery(osb, | ||
953 | tl_dinode); | ||
954 | if (ret < 0) | ||
955 | mlog_errno(ret); | ||
956 | |||
957 | kfree(tl_dinode); | ||
958 | } | ||
959 | |||
960 | ret = ocfs2_recover_orphans(osb, item->lri_slot); | ||
961 | if (ret < 0) | ||
962 | mlog_errno(ret); | ||
963 | |||
964 | kfree(item); | ||
965 | } | ||
966 | |||
967 | mlog(0, "Recovery completion\n"); | ||
968 | mlog_exit_void(); | ||
969 | } | ||
970 | |||
971 | /* NOTE: This function always eats your references to la_dinode and | ||
972 | * tl_dinode, either manually on error, or by passing them to | ||
973 | * ocfs2_complete_recovery */ | ||
974 | static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, | ||
975 | int slot_num, | ||
976 | struct ocfs2_dinode *la_dinode, | ||
977 | struct ocfs2_dinode *tl_dinode) | ||
978 | { | ||
979 | struct ocfs2_la_recovery_item *item; | ||
980 | |||
981 | item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL); | ||
982 | if (!item) { | ||
983 | /* Though we wish to avoid it, we are in fact safe in | ||
984 | * skipping local alloc cleanup as fsck.ocfs2 is more | ||
985 | * than capable of reclaiming unused space. */ | ||
986 | if (la_dinode) | ||
987 | kfree(la_dinode); | ||
988 | |||
989 | if (tl_dinode) | ||
990 | kfree(tl_dinode); | ||
991 | |||
992 | mlog_errno(-ENOMEM); | ||
993 | return; | ||
994 | } | ||
995 | |||
996 | INIT_LIST_HEAD(&item->lri_list); | ||
997 | item->lri_la_dinode = la_dinode; | ||
998 | item->lri_slot = slot_num; | ||
999 | item->lri_tl_dinode = tl_dinode; | ||
1000 | |||
1001 | spin_lock(&journal->j_lock); | ||
1002 | list_add_tail(&item->lri_list, &journal->j_la_cleanups); | ||
1003 | queue_work(ocfs2_wq, &journal->j_recovery_work); | ||
1004 | spin_unlock(&journal->j_lock); | ||
1005 | } | ||
1006 | |||
1007 | /* Called by the mount code to queue the last part of | ||
1008 | * recovery for its own slot. */ | ||
1009 | void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) | ||
1010 | { | ||
1011 | struct ocfs2_journal *journal = osb->journal; | ||
1012 | |||
1013 | if (osb->dirty) { | ||
1014 | /* No need to queue up our truncate_log as regular | ||
1015 | * cleanup will catch that. */ | ||
1016 | ocfs2_queue_recovery_completion(journal, | ||
1017 | osb->slot_num, | ||
1018 | osb->local_alloc_copy, | ||
1019 | NULL); | ||
1020 | ocfs2_schedule_truncate_log_flush(osb, 0); | ||
1021 | |||
1022 | osb->local_alloc_copy = NULL; | ||
1023 | osb->dirty = 0; | ||
1024 | } | ||
1025 | } | ||
1026 | |||
1027 | static int __ocfs2_recovery_thread(void *arg) | ||
1028 | { | ||
1029 | int status, node_num; | ||
1030 | struct ocfs2_super *osb = arg; | ||
1031 | |||
1032 | mlog_entry_void(); | ||
1033 | |||
1034 | status = ocfs2_wait_on_mount(osb); | ||
1035 | if (status < 0) { | ||
1036 | goto bail; | ||
1037 | } | ||
1038 | |||
1039 | restart: | ||
1040 | status = ocfs2_super_lock(osb, 1); | ||
1041 | if (status < 0) { | ||
1042 | mlog_errno(status); | ||
1043 | goto bail; | ||
1044 | } | ||
1045 | |||
1046 | while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { | ||
1047 | node_num = ocfs2_node_map_first_set_bit(osb, | ||
1048 | &osb->recovery_map); | ||
1049 | if (node_num == O2NM_INVALID_NODE_NUM) { | ||
1050 | mlog(0, "Out of nodes to recover.\n"); | ||
1051 | break; | ||
1052 | } | ||
1053 | |||
1054 | status = ocfs2_recover_node(osb, node_num); | ||
1055 | if (status < 0) { | ||
1056 | mlog(ML_ERROR, | ||
1057 | "Error %d recovering node %d on device (%u,%u)!\n", | ||
1058 | status, node_num, | ||
1059 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); | ||
1060 | mlog(ML_ERROR, "Volume requires unmount.\n"); | ||
1061 | continue; | ||
1062 | } | ||
1063 | |||
1064 | ocfs2_recovery_map_clear(osb, node_num); | ||
1065 | } | ||
1066 | ocfs2_super_unlock(osb, 1); | ||
1067 | |||
1068 | /* We always run recovery on our own orphan dir - the dead | ||
1069 | * node(s) may have voted "no" on an inode delete earlier. A | ||
1070 | * revote is therefore required. */ | ||
1071 | ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, | ||
1072 | NULL); | ||
1073 | |||
1074 | bail: | ||
1075 | down(&osb->recovery_lock); | ||
1076 | if (!status && | ||
1077 | !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { | ||
1078 | up(&osb->recovery_lock); | ||
1079 | goto restart; | ||
1080 | } | ||
1081 | |||
1082 | osb->recovery_thread_task = NULL; | ||
1083 | mb(); /* sync with ocfs2_recovery_thread_running */ | ||
1084 | wake_up(&osb->recovery_event); | ||
1085 | |||
1086 | up(&osb->recovery_lock); | ||
1087 | |||
1088 | mlog_exit(status); | ||
1089 | /* no one is calling kthread_stop() for us, so the kthread() API | ||
1090 | * requires that we call do_exit(). And it isn't exported, but | ||
1091 | * complete_and_exit() seems to be a minimal wrapper around it. */ | ||
1092 | complete_and_exit(NULL, status); | ||
1093 | return status; | ||
1094 | } | ||
1095 | |||
1096 | void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) | ||
1097 | { | ||
1098 | mlog_entry("(node_num=%d, osb->node_num = %d)\n", | ||
1099 | node_num, osb->node_num); | ||
1100 | |||
1101 | down(&osb->recovery_lock); | ||
1102 | if (osb->disable_recovery) | ||
1103 | goto out; | ||
1104 | |||
1105 | /* People waiting on recovery will wait on | ||
1106 | * the recovery map to empty. */ | ||
1107 | if (!ocfs2_recovery_map_set(osb, node_num)) | ||
1108 | mlog(0, "node %d already in recovery.\n", node_num); | ||
1109 | |||
1110 | mlog(0, "starting recovery thread...\n"); | ||
1111 | |||
1112 | if (osb->recovery_thread_task) | ||
1113 | goto out; | ||
1114 | |||
1115 | osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, | ||
1116 | "ocfs2rec-%d", osb->osb_id); | ||
1117 | if (IS_ERR(osb->recovery_thread_task)) { | ||
1118 | mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); | ||
1119 | osb->recovery_thread_task = NULL; | ||
1120 | } | ||
1121 | |||
1122 | out: | ||
1123 | up(&osb->recovery_lock); | ||
1124 | wake_up(&osb->recovery_event); | ||
1125 | |||
1126 | mlog_exit_void(); | ||
1127 | } | ||
1128 | |||
1129 | /* Does the actual journal replay and marks the journal inode as | ||
1130 | * clean. Will only replay if the journal inode is marked dirty. */ | ||
1131 | static int ocfs2_replay_journal(struct ocfs2_super *osb, | ||
1132 | int node_num, | ||
1133 | int slot_num) | ||
1134 | { | ||
1135 | int status; | ||
1136 | int got_lock = 0; | ||
1137 | unsigned int flags; | ||
1138 | struct inode *inode = NULL; | ||
1139 | struct ocfs2_dinode *fe; | ||
1140 | journal_t *journal = NULL; | ||
1141 | struct buffer_head *bh = NULL; | ||
1142 | |||
1143 | inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, | ||
1144 | slot_num); | ||
1145 | if (inode == NULL) { | ||
1146 | status = -EACCES; | ||
1147 | mlog_errno(status); | ||
1148 | goto done; | ||
1149 | } | ||
1150 | if (is_bad_inode(inode)) { | ||
1151 | status = -EACCES; | ||
1152 | iput(inode); | ||
1153 | inode = NULL; | ||
1154 | mlog_errno(status); | ||
1155 | goto done; | ||
1156 | } | ||
1157 | SET_INODE_JOURNAL(inode); | ||
1158 | |||
1159 | status = ocfs2_meta_lock_full(inode, NULL, &bh, 1, | ||
1160 | OCFS2_META_LOCK_RECOVERY); | ||
1161 | if (status < 0) { | ||
1162 | mlog(0, "status returned from ocfs2_meta_lock=%d\n", status); | ||
1163 | if (status != -ERESTARTSYS) | ||
1164 | mlog(ML_ERROR, "Could not lock journal!\n"); | ||
1165 | goto done; | ||
1166 | } | ||
1167 | got_lock = 1; | ||
1168 | |||
1169 | fe = (struct ocfs2_dinode *) bh->b_data; | ||
1170 | |||
1171 | flags = le32_to_cpu(fe->id1.journal1.ij_flags); | ||
1172 | |||
1173 | if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { | ||
1174 | mlog(0, "No recovery required for node %d\n", node_num); | ||
1175 | goto done; | ||
1176 | } | ||
1177 | |||
1178 | mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", | ||
1179 | node_num, slot_num, | ||
1180 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); | ||
1181 | |||
1182 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
1183 | |||
1184 | status = ocfs2_force_read_journal(inode); | ||
1185 | if (status < 0) { | ||
1186 | mlog_errno(status); | ||
1187 | goto done; | ||
1188 | } | ||
1189 | |||
1190 | mlog(0, "calling journal_init_inode\n"); | ||
1191 | journal = journal_init_inode(inode); | ||
1192 | if (journal == NULL) { | ||
1193 | mlog(ML_ERROR, "Linux journal layer error\n"); | ||
1194 | status = -EIO; | ||
1195 | goto done; | ||
1196 | } | ||
1197 | |||
1198 | status = journal_load(journal); | ||
1199 | if (status < 0) { | ||
1200 | mlog_errno(status); | ||
1201 | if (!igrab(inode)) | ||
1202 | BUG(); | ||
1203 | journal_destroy(journal); | ||
1204 | goto done; | ||
1205 | } | ||
1206 | |||
1207 | ocfs2_clear_journal_error(osb->sb, journal, slot_num); | ||
1208 | |||
1209 | /* wipe the journal */ | ||
1210 | mlog(0, "flushing the journal.\n"); | ||
1211 | journal_lock_updates(journal); | ||
1212 | status = journal_flush(journal); | ||
1213 | journal_unlock_updates(journal); | ||
1214 | if (status < 0) | ||
1215 | mlog_errno(status); | ||
1216 | |||
1217 | /* This will mark the node clean */ | ||
1218 | flags = le32_to_cpu(fe->id1.journal1.ij_flags); | ||
1219 | flags &= ~OCFS2_JOURNAL_DIRTY_FL; | ||
1220 | fe->id1.journal1.ij_flags = cpu_to_le32(flags); | ||
1221 | |||
1222 | status = ocfs2_write_block(osb, bh, inode); | ||
1223 | if (status < 0) | ||
1224 | mlog_errno(status); | ||
1225 | |||
1226 | if (!igrab(inode)) | ||
1227 | BUG(); | ||
1228 | |||
1229 | journal_destroy(journal); | ||
1230 | |||
1231 | done: | ||
1232 | /* drop the lock on this node's journal */ | ||
1233 | if (got_lock) | ||
1234 | ocfs2_meta_unlock(inode, 1); | ||
1235 | |||
1236 | if (inode) | ||
1237 | iput(inode); | ||
1238 | |||
1239 | if (bh) | ||
1240 | brelse(bh); | ||
1241 | |||
1242 | mlog_exit(status); | ||
1243 | return status; | ||
1244 | } | ||
1245 | |||
1246 | /* | ||
1247 | * Do the most important parts of node recovery: | ||
1248 | * - Replay its journal | ||
1249 | * - Stamp a clean local allocator file | ||
1250 | * - Stamp a clean truncate log | ||
1251 | * - Mark the node clean | ||
1252 | * | ||
1253 | * If this function completes without error, a node in OCFS2 can be | ||
1254 | * said to have been safely recovered. As a result, failure during the | ||
1255 | * second part of a node's recovery process (local alloc recovery) is | ||
1256 | * far less concerning. | ||
1257 | */ | ||
1258 | static int ocfs2_recover_node(struct ocfs2_super *osb, | ||
1259 | int node_num) | ||
1260 | { | ||
1261 | int status = 0; | ||
1262 | int slot_num; | ||
1263 | struct ocfs2_slot_info *si = osb->slot_info; | ||
1264 | struct ocfs2_dinode *la_copy = NULL; | ||
1265 | struct ocfs2_dinode *tl_copy = NULL; | ||
1266 | |||
1267 | mlog_entry("(node_num=%d, osb->node_num = %d)\n", | ||
1268 | node_num, osb->node_num); | ||
1269 | |||
1270 | mlog(0, "checking node %d\n", node_num); | ||
1271 | |||
1272 | /* Should not ever be called to recover ourselves -- in that | ||
1273 | * case we should've called ocfs2_journal_load instead. */ | ||
1274 | if (osb->node_num == node_num) | ||
1275 | BUG(); | ||
1276 | |||
1277 | slot_num = ocfs2_node_num_to_slot(si, node_num); | ||
1278 | if (slot_num == OCFS2_INVALID_SLOT) { | ||
1279 | status = 0; | ||
1280 | mlog(0, "no slot for this node, so no recovery required.\n"); | ||
1281 | goto done; | ||
1282 | } | ||
1283 | |||
1284 | mlog(0, "node %d was using slot %d\n", node_num, slot_num); | ||
1285 | |||
1286 | status = ocfs2_replay_journal(osb, node_num, slot_num); | ||
1287 | if (status < 0) { | ||
1288 | mlog_errno(status); | ||
1289 | goto done; | ||
1290 | } | ||
1291 | |||
1292 | /* Stamp a clean local alloc file AFTER recovering the journal... */ | ||
1293 | status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy); | ||
1294 | if (status < 0) { | ||
1295 | mlog_errno(status); | ||
1296 | goto done; | ||
1297 | } | ||
1298 | |||
1299 | /* An error from begin_truncate_log_recovery is not | ||
1300 | * serious enough to warrant halting the rest of | ||
1301 | * recovery. */ | ||
1302 | status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy); | ||
1303 | if (status < 0) | ||
1304 | mlog_errno(status); | ||
1305 | |||
1306 | /* Likewise, this would be a strange but ultimately not so | ||
1307 | * harmful place to get an error... */ | ||
1308 | ocfs2_clear_slot(si, slot_num); | ||
1309 | status = ocfs2_update_disk_slots(osb, si); | ||
1310 | if (status < 0) | ||
1311 | mlog_errno(status); | ||
1312 | |||
1313 | /* This will kfree the memory pointed to by la_copy and tl_copy */ | ||
1314 | ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, | ||
1315 | tl_copy); | ||
1316 | |||
1317 | status = 0; | ||
1318 | done: | ||
1319 | |||
1320 | mlog_exit(status); | ||
1321 | return status; | ||
1322 | } | ||
1323 | |||
1324 | /* Test node liveness by trylocking its journal. If we get the lock, | ||
1325 | * we drop it here. Return 0 if we got the lock, -EAGAIN if the node is | ||
1326 | * still alive (we couldn't get the lock), and < 0 on error. */ | ||
1327 | static int ocfs2_trylock_journal(struct ocfs2_super *osb, | ||
1328 | int slot_num) | ||
1329 | { | ||
1330 | int status, flags; | ||
1331 | struct inode *inode = NULL; | ||
1332 | |||
1333 | inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, | ||
1334 | slot_num); | ||
1335 | if (inode == NULL) { | ||
1336 | mlog(ML_ERROR, "access error\n"); | ||
1337 | status = -EACCES; | ||
1338 | goto bail; | ||
1339 | } | ||
1340 | if (is_bad_inode(inode)) { | ||
1341 | mlog(ML_ERROR, "access error (bad inode)\n"); | ||
1342 | iput(inode); | ||
1343 | inode = NULL; | ||
1344 | status = -EACCES; | ||
1345 | goto bail; | ||
1346 | } | ||
1347 | SET_INODE_JOURNAL(inode); | ||
1348 | |||
1349 | flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; | ||
1350 | status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags); | ||
1351 | if (status < 0) { | ||
1352 | if (status != -EAGAIN) | ||
1353 | mlog_errno(status); | ||
1354 | goto bail; | ||
1355 | } | ||
1356 | |||
1357 | ocfs2_meta_unlock(inode, 1); | ||
1358 | bail: | ||
1359 | if (inode) | ||
1360 | iput(inode); | ||
1361 | |||
1362 | return status; | ||
1363 | } | ||
1364 | |||
1365 | /* Call this underneath ocfs2_super_lock. It also assumes that the | ||
1366 | * slot info struct has been updated from disk. */ | ||
1367 | int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) | ||
1368 | { | ||
1369 | int status, i, node_num; | ||
1370 | struct ocfs2_slot_info *si = osb->slot_info; | ||
1371 | |||
1372 | /* This is called with the super block cluster lock, so we | ||
1373 | * know that the slot map can't change underneath us. */ | ||
1374 | |||
1375 | spin_lock(&si->si_lock); | ||
1376 | for(i = 0; i < si->si_num_slots; i++) { | ||
1377 | if (i == osb->slot_num) | ||
1378 | continue; | ||
1379 | if (ocfs2_is_empty_slot(si, i)) | ||
1380 | continue; | ||
1381 | |||
1382 | node_num = si->si_global_node_nums[i]; | ||
1383 | if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num)) | ||
1384 | continue; | ||
1385 | spin_unlock(&si->si_lock); | ||
1386 | |||
1387 | /* Ok, we have a slot occupied by another node which | ||
1388 | * is not in the recovery map. We trylock its journal | ||
1389 | * file here to test if it's alive. */ | ||
1390 | status = ocfs2_trylock_journal(osb, i); | ||
1391 | if (!status) { | ||
1392 | /* Since we're called from mount, we know that | ||
1393 | * the recovery thread can't race us on | ||
1394 | * setting / checking the recovery bits. */ | ||
1395 | ocfs2_recovery_thread(osb, node_num); | ||
1396 | } else if ((status < 0) && (status != -EAGAIN)) { | ||
1397 | mlog_errno(status); | ||
1398 | goto bail; | ||
1399 | } | ||
1400 | |||
1401 | spin_lock(&si->si_lock); | ||
1402 | } | ||
1403 | spin_unlock(&si->si_lock); | ||
1404 | |||
1405 | status = 0; | ||
1406 | bail: | ||
1407 | mlog_exit(status); | ||
1408 | return status; | ||
1409 | } | ||
1410 | |||
1411 | static int ocfs2_recover_orphans(struct ocfs2_super *osb, | ||
1412 | int slot) | ||
1413 | { | ||
1414 | int status = 0; | ||
1415 | int have_disk_lock = 0; | ||
1416 | struct inode *inode = NULL; | ||
1417 | struct inode *iter; | ||
1418 | struct inode *orphan_dir_inode = NULL; | ||
1419 | unsigned long offset, blk, local; | ||
1420 | struct buffer_head *bh = NULL; | ||
1421 | struct ocfs2_dir_entry *de; | ||
1422 | struct super_block *sb = osb->sb; | ||
1423 | struct ocfs2_inode_info *oi; | ||
1424 | |||
1425 | mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); | ||
1426 | |||
1427 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, | ||
1428 | ORPHAN_DIR_SYSTEM_INODE, | ||
1429 | slot); | ||
1430 | if (!orphan_dir_inode) { | ||
1431 | status = -ENOENT; | ||
1432 | mlog_errno(status); | ||
1433 | goto out; | ||
1434 | } | ||
1435 | |||
1436 | down(&orphan_dir_inode->i_sem); | ||
1437 | status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0); | ||
1438 | if (status < 0) { | ||
1439 | up(&orphan_dir_inode->i_sem); | ||
1440 | mlog_errno(status); | ||
1441 | goto out; | ||
1442 | } | ||
1443 | have_disk_lock = 1; | ||
1444 | |||
1445 | offset = 0; | ||
1446 | iter = NULL; | ||
1447 | while(offset < i_size_read(orphan_dir_inode)) { | ||
1448 | blk = offset >> sb->s_blocksize_bits; | ||
1449 | |||
1450 | bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0); | ||
1451 | if (!bh) | ||
1452 | status = -EINVAL; | ||
1453 | if (status < 0) { | ||
1454 | up(&orphan_dir_inode->i_sem); | ||
1455 | if (bh) | ||
1456 | brelse(bh); | ||
1457 | mlog_errno(status); | ||
1458 | goto out; | ||
1459 | } | ||
1460 | |||
1461 | local = 0; | ||
1462 | while(offset < i_size_read(orphan_dir_inode) | ||
1463 | && local < sb->s_blocksize) { | ||
1464 | de = (struct ocfs2_dir_entry *) (bh->b_data + local); | ||
1465 | |||
1466 | if (!ocfs2_check_dir_entry(orphan_dir_inode, | ||
1467 | de, bh, local)) { | ||
1468 | up(&orphan_dir_inode->i_sem); | ||
1469 | status = -EINVAL; | ||
1470 | mlog_errno(status); | ||
1471 | brelse(bh); | ||
1472 | goto out; | ||
1473 | } | ||
1474 | |||
1475 | local += le16_to_cpu(de->rec_len); | ||
1476 | offset += le16_to_cpu(de->rec_len); | ||
1477 | |||
1478 | /* I guess we silently fail on no inode? */ | ||
1479 | if (!le64_to_cpu(de->inode)) | ||
1480 | continue; | ||
1481 | if (de->file_type > OCFS2_FT_MAX) { | ||
1482 | mlog(ML_ERROR, | ||
1483 | "block %llu contains invalid de: " | ||
1484 | "inode = %"MLFu64", rec_len = %u, " | ||
1485 | "name_len = %u, file_type = %u, " | ||
1486 | "name='%.*s'\n", | ||
1487 | (unsigned long long)bh->b_blocknr, | ||
1488 | le64_to_cpu(de->inode), | ||
1489 | le16_to_cpu(de->rec_len), | ||
1490 | de->name_len, | ||
1491 | de->file_type, | ||
1492 | de->name_len, | ||
1493 | de->name); | ||
1494 | continue; | ||
1495 | } | ||
1496 | if (de->name_len == 1 && !strncmp(".", de->name, 1)) | ||
1497 | continue; | ||
1498 | if (de->name_len == 2 && !strncmp("..", de->name, 2)) | ||
1499 | continue; | ||
1500 | |||
1501 | iter = ocfs2_iget(osb, le64_to_cpu(de->inode)); | ||
1502 | if (IS_ERR(iter)) | ||
1503 | continue; | ||
1504 | |||
1505 | mlog(0, "queue orphan %"MLFu64"\n", | ||
1506 | OCFS2_I(iter)->ip_blkno); | ||
1507 | OCFS2_I(iter)->ip_next_orphan = inode; | ||
1508 | inode = iter; | ||
1509 | } | ||
1510 | brelse(bh); | ||
1511 | } | ||
1512 | up(&orphan_dir_inode->i_sem); | ||
1513 | |||
1514 | ocfs2_meta_unlock(orphan_dir_inode, 0); | ||
1515 | have_disk_lock = 0; | ||
1516 | |||
1517 | iput(orphan_dir_inode); | ||
1518 | orphan_dir_inode = NULL; | ||
1519 | |||
1520 | while (inode) { | ||
1521 | oi = OCFS2_I(inode); | ||
1522 | mlog(0, "iput orphan %"MLFu64"\n", oi->ip_blkno); | ||
1523 | |||
1524 | iter = oi->ip_next_orphan; | ||
1525 | |||
1526 | spin_lock(&oi->ip_lock); | ||
1527 | /* Delete voting may have set these on the assumption | ||
1528 | * that the other node would wipe them successfully. | ||
1529 | * If they are still in the node's orphan dir, we need | ||
1530 | * to reset that state. */ | ||
1531 | oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); | ||
1532 | |||
1533 | /* Set the proper information to get us going into | ||
1534 | * ocfs2_delete_inode. */ | ||
1535 | oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; | ||
1536 | oi->ip_orphaned_slot = slot; | ||
1537 | spin_unlock(&oi->ip_lock); | ||
1538 | |||
1539 | iput(inode); | ||
1540 | |||
1541 | inode = iter; | ||
1542 | } | ||
1543 | |||
1544 | out: | ||
1545 | if (have_disk_lock) | ||
1546 | ocfs2_meta_unlock(orphan_dir_inode, 0); | ||
1547 | |||
1548 | if (orphan_dir_inode) | ||
1549 | iput(orphan_dir_inode); | ||
1550 | |||
1551 | return status; | ||
1552 | } | ||
1553 | |||
1554 | static int ocfs2_wait_on_mount(struct ocfs2_super *osb) | ||
1555 | { | ||
1556 | /* This check is good because ocfs2 will wait on our recovery | ||
1557 | * thread before changing vol_state to something other than MOUNTED | ||
1558 | * or DISABLED. */ | ||
1559 | wait_event(osb->osb_mount_event, | ||
1560 | atomic_read(&osb->vol_state) == VOLUME_MOUNTED || | ||
1561 | atomic_read(&osb->vol_state) == VOLUME_DISABLED); | ||
1562 | |||
1563 | /* If there's an error on mount, then we may never get to the | ||
1564 | * MOUNTED state, but VOLUME_DISABLED is set right before | ||
1565 | * dismount_volume(), so we can trust it. */ | ||
1566 | if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) { | ||
1567 | mlog(0, "mount error, exiting!\n"); | ||
1568 | return -EBUSY; | ||
1569 | } | ||
1570 | |||
1571 | return 0; | ||
1572 | } | ||
1573 | |||
1574 | static int ocfs2_commit_thread(void *arg) | ||
1575 | { | ||
1576 | int status; | ||
1577 | struct ocfs2_super *osb = arg; | ||
1578 | struct ocfs2_journal *journal = osb->journal; | ||
1579 | |||
1580 | /* We can trust j_num_trans here because _should_stop() is only set in | ||
1581 | * shutdown, and nobody other than ourselves should be able to start | ||
1582 | * transactions. Committing on shutdown might take a few iterations | ||
1583 | * as final transactions put deleted inodes on the list. */ | ||
1584 | while (!(kthread_should_stop() && | ||
1585 | atomic_read(&journal->j_num_trans) == 0)) { | ||
1586 | |||
1587 | wait_event_interruptible_timeout(osb->checkpoint_event, | ||
1588 | atomic_read(&journal->j_num_trans) | ||
1589 | || kthread_should_stop(), | ||
1590 | OCFS2_CHECKPOINT_INTERVAL); | ||
1591 | |||
1592 | status = ocfs2_commit_cache(osb); | ||
1593 | if (status < 0) | ||
1594 | mlog_errno(status); | ||
1595 | |||
1596 | if (kthread_should_stop() && atomic_read(&journal->j_num_trans)) { | ||
1597 | mlog(ML_KTHREAD, | ||
1598 | "commit_thread: %u transactions pending on " | ||
1599 | "shutdown\n", | ||
1600 | atomic_read(&journal->j_num_trans)); | ||
1601 | } | ||
1602 | } | ||
1603 | |||
1604 | return 0; | ||
1605 | } | ||
1606 | |||
1607 | /* Look for a dirty journal without taking any cluster locks. Used for | ||
1608 | * hard readonly access to determine whether the file system journals | ||
1609 | * require recovery. */ | ||
1610 | int ocfs2_check_journals_nolocks(struct ocfs2_super *osb) | ||
1611 | { | ||
1612 | int ret = 0; | ||
1613 | unsigned int slot; | ||
1614 | struct buffer_head *di_bh; | ||
1615 | struct ocfs2_dinode *di; | ||
1616 | struct inode *journal = NULL; | ||
1617 | |||
1618 | for(slot = 0; slot < osb->max_slots; slot++) { | ||
1619 | journal = ocfs2_get_system_file_inode(osb, | ||
1620 | JOURNAL_SYSTEM_INODE, | ||
1621 | slot); | ||
1622 | if (!journal || is_bad_inode(journal)) { | ||
1623 | ret = -EACCES; | ||
1624 | mlog_errno(ret); | ||
1625 | goto out; | ||
1626 | } | ||
1627 | |||
1628 | di_bh = NULL; | ||
1629 | ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh, | ||
1630 | 0, journal); | ||
1631 | if (ret < 0) { | ||
1632 | mlog_errno(ret); | ||
1633 | goto out; | ||
1634 | } | ||
1635 | |||
1636 | di = (struct ocfs2_dinode *) di_bh->b_data; | ||
1637 | |||
1638 | if (le32_to_cpu(di->id1.journal1.ij_flags) & | ||
1639 | OCFS2_JOURNAL_DIRTY_FL) | ||
1640 | ret = -EROFS; | ||
1641 | |||
1642 | brelse(di_bh); | ||
1643 | if (ret) | ||
1644 | break; | ||
/* drop this slot's journal inode reference before grabbing the next
 * slot's; otherwise every journal inode but the last is leaked */
iput(journal);
journal = NULL;
1645 | } | ||
1646 | |||
1647 | out: | ||
1648 | if (journal) | ||
1649 | iput(journal); | ||
1650 | |||
1651 | return ret; | ||
1652 | } | ||
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h new file mode 100644 index 000000000000..7d0a816184fa --- /dev/null +++ b/fs/ocfs2/journal.h | |||
@@ -0,0 +1,457 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * journal.h | ||
5 | * | ||
6 | * Defines the journalling API and structures. | ||
7 | * | ||
8 | * Copyright (C) 2003, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_JOURNAL_H | ||
27 | #define OCFS2_JOURNAL_H | ||
28 | |||
29 | #include <linux/fs.h> | ||
30 | #include <linux/jbd.h> | ||
31 | |||
32 | #define OCFS2_CHECKPOINT_INTERVAL (8 * HZ) | ||
33 | |||
34 | enum ocfs2_journal_state { | ||
35 | OCFS2_JOURNAL_FREE = 0, | ||
36 | OCFS2_JOURNAL_LOADED, | ||
37 | OCFS2_JOURNAL_IN_SHUTDOWN, | ||
38 | }; | ||
39 | |||
40 | struct ocfs2_super; | ||
41 | struct ocfs2_dinode; | ||
42 | struct ocfs2_journal_handle; | ||
43 | |||
44 | struct ocfs2_journal { | ||
45 | enum ocfs2_journal_state j_state; /* Journal's current state */ | ||
46 | |||
47 | journal_t *j_journal; /* The kernel's journal type */ | ||
48 | struct inode *j_inode; /* Kernel inode pointing to | ||
49 | * this journal */ | ||
50 | struct ocfs2_super *j_osb; /* pointer to the super | ||
51 | * block for the node | ||
52 | * we're currently | ||
53 | * running on -- not | ||
54 | * necessarily the super | ||
55 | * block from the node | ||
56 | * which we usually run | ||
57 | * from (recovery, | ||
58 | * etc) */ | ||
59 | struct buffer_head *j_bh; /* Journal disk inode block */ | ||
60 | atomic_t j_num_trans; /* Number of transactions | ||
61 | * currently in the system. */ | ||
62 | unsigned long j_trans_id; | ||
63 | struct rw_semaphore j_trans_barrier; | ||
64 | wait_queue_head_t j_checkpointed; | ||
65 | |||
66 | spinlock_t j_lock; | ||
67 | struct list_head j_la_cleanups; | ||
68 | struct work_struct j_recovery_work; | ||
69 | }; | ||
70 | |||
71 | extern spinlock_t trans_inc_lock; | ||
72 | |||
73 | /* wrap j_trans_id so we never have it equal to zero. */ | ||
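/* (Zero is reserved as a sentinel: ocfs2_inode_is_new() below resets
 * ip_created_trans to 0 once an inode has been checkpointed, so a live
 * transaction id must never be 0.) */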
74 | static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j) | ||
75 | { | ||
76 | unsigned long old_id; | ||
77 | spin_lock(&trans_inc_lock); | ||
78 | old_id = j->j_trans_id++; | ||
79 | if (unlikely(!j->j_trans_id)) | ||
80 | j->j_trans_id = 1; | ||
81 | spin_unlock(&trans_inc_lock); | ||
82 | return old_id; | ||
83 | } | ||
84 | |||
85 | static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal, | ||
86 | struct inode *inode) | ||
87 | { | ||
88 | spin_lock(&trans_inc_lock); | ||
89 | OCFS2_I(inode)->ip_last_trans = journal->j_trans_id; | ||
90 | spin_unlock(&trans_inc_lock); | ||
91 | } | ||
92 | |||
93 | /* Used to figure out whether it's safe to drop a metadata lock on an | ||
94 | * inode. Returns true if all the inode's changes have been | ||
95 | * checkpointed to disk. You should be holding the spinlock on the | ||
96 | * metadata lock while calling this to be sure that nobody can take | ||
97 | * the lock and put it on another transaction. */ | ||
98 | static inline int ocfs2_inode_fully_checkpointed(struct inode *inode) | ||
99 | { | ||
100 | int ret; | ||
101 | struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal; | ||
102 | |||
103 | spin_lock(&trans_inc_lock); | ||
104 | ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans); | ||
105 | spin_unlock(&trans_inc_lock); | ||
106 | return ret; | ||
107 | } | ||
108 | |||
109 | /* Convenience function to check if an inode is still new (has never | ||
110 | * hit disk). Will do you a favor and set created_trans = 0 when you've | ||
111 | * been checkpointed. Returns '1' if the inode is still new. */ | ||
112 | static inline int ocfs2_inode_is_new(struct inode *inode) | ||
113 | { | ||
114 | int ret; | ||
115 | |||
116 | /* System files are never "new" as they're written out by | ||
117 | * mkfs. This helps us early during mount, before we have the | ||
118 | * journal open and j_trans_id could be junk. */ | ||
119 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) | ||
120 | return 0; | ||
121 | spin_lock(&trans_inc_lock); | ||
122 | ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id, | ||
123 | OCFS2_I(inode)->ip_created_trans)); | ||
124 | if (!ret) | ||
125 | OCFS2_I(inode)->ip_created_trans = 0; | ||
126 | spin_unlock(&trans_inc_lock); | ||
127 | return ret; | ||
128 | } | ||
129 | |||
130 | static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, | ||
131 | struct inode *inode) | ||
132 | { | ||
133 | spin_lock(&trans_inc_lock); | ||
134 | OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id; | ||
135 | spin_unlock(&trans_inc_lock); | ||
136 | } | ||
137 | |||
138 | extern kmem_cache_t *ocfs2_lock_cache; | ||
139 | |||
140 | struct ocfs2_journal_lock { | ||
141 | struct inode *jl_inode; | ||
142 | struct list_head jl_lock_list; | ||
143 | }; | ||
144 | |||
145 | struct ocfs2_journal_handle { | ||
146 | handle_t *k_handle; /* kernel handle. */ | ||
147 | struct ocfs2_journal *journal; | ||
148 | u32 flags; /* see flags below. */ | ||
149 | int max_buffs; /* Buffs reserved by this handle */ | ||
150 | |||
151 | /* The following two fields are for ocfs2_handle_add_lock */ | ||
152 | int num_locks; | ||
153 | struct list_head locks; /* A bunch of locks to | ||
154 | * release on commit. This | ||
155 | * should be a list_head */ | ||
156 | |||
157 | struct list_head inode_list; | ||
158 | }; | ||
159 | |||
160 | #define OCFS2_HANDLE_STARTED 1 | ||
161 | /* should we sync-commit this handle? */ | ||
162 | #define OCFS2_HANDLE_SYNC 2 | ||
163 | static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle) | ||
164 | { | ||
165 | return handle->flags & OCFS2_HANDLE_STARTED; | ||
166 | } | ||
167 | |||
168 | static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync) | ||
169 | { | ||
170 | if (sync) | ||
171 | handle->flags |= OCFS2_HANDLE_SYNC; | ||
172 | else | ||
173 | handle->flags &= ~OCFS2_HANDLE_SYNC; | ||
174 | } | ||
175 | |||
176 | /* Exported only for the journal struct init code in super.c. Do not call. */ | ||
177 | void ocfs2_complete_recovery(void *data); | ||
178 | |||
179 | /* | ||
180 | * Journal Control: | ||
181 | * Initialize, Load, Shutdown, Wipe a journal. | ||
182 | * | ||
183 | * ocfs2_journal_init - Initialize journal structures in the OSB. | ||
184 | * ocfs2_journal_load - Load the given journal off disk. Replay it if | ||
185 | * there are transactions still in there. | ||
186 | * ocfs2_journal_shutdown - Shutdown a journal, this will flush all | ||
187 | * uncommitted, uncheckpointed transactions. | ||
188 | * ocfs2_journal_wipe - Wipe transactions from a journal. Optionally | ||
189 | * zero out each block. | ||
190 | * ocfs2_recovery_thread - Perform recovery on a node. osb is our own osb. | ||
191 | * ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat | ||
192 | * event on. | ||
193 | * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. | ||
194 | */ | ||
195 | void ocfs2_set_journal_params(struct ocfs2_super *osb); | ||
196 | int ocfs2_journal_init(struct ocfs2_journal *journal, | ||
197 | int *dirty); | ||
198 | void ocfs2_journal_shutdown(struct ocfs2_super *osb); | ||
199 | int ocfs2_journal_wipe(struct ocfs2_journal *journal, | ||
200 | int full); | ||
201 | int ocfs2_journal_load(struct ocfs2_journal *journal); | ||
202 | int ocfs2_check_journals_nolocks(struct ocfs2_super *osb); | ||
203 | void ocfs2_recovery_thread(struct ocfs2_super *osb, | ||
204 | int node_num); | ||
205 | int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); | ||
206 | void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); | ||
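/*
 * Rough mount-path usage of the functions above (a sketch only; the
 * real call sites live in super.c and include error handling):
 *
 *	ocfs2_journal_init(osb->journal, &dirty);
 *	ocfs2_journal_load(osb->journal);	<- replays if dirty
 *	...normal operation...
 *	ocfs2_journal_shutdown(osb);		<- at dismount
 */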
207 | |||
208 | static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) | ||
209 | { | ||
210 | atomic_set(&osb->needs_checkpoint, 1); | ||
211 | wake_up(&osb->checkpoint_event); | ||
212 | } | ||
213 | |||
214 | static inline void ocfs2_checkpoint_inode(struct inode *inode) | ||
215 | { | ||
216 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
217 | |||
218 | if (!ocfs2_inode_fully_checkpointed(inode)) { | ||
219 | /* WARNING: This only kicks off a single | ||
220 | * checkpoint. If someone races you and adds more | ||
221 | * metadata to the journal, you won't know, and will | ||
222 | * wind up waiting *a lot* longer than necessary. Right | ||
223 | * now we only use this in clear_inode so that's | ||
224 | * OK. */ | ||
225 | ocfs2_start_checkpoint(osb); | ||
226 | |||
227 | wait_event(osb->journal->j_checkpointed, | ||
228 | ocfs2_inode_fully_checkpointed(inode)); | ||
229 | } | ||
230 | } | ||
231 | |||
232 | /* | ||
233 | * Transaction Handling: | ||
234 | * Manage the lifetime of a transaction handle. | ||
235 | * | ||
236 | * ocfs2_alloc_handle - Only allocate a handle so we can start putting | ||
237 | * cluster locks on it. To actually change blocks, | ||
238 | * call ocfs2_start_trans with the handle returned | ||
239 | * from this function. You may call ocfs2_commit_trans | ||
240 | * at any time in the lifetime of a handle. | ||
241 | * ocfs2_start_trans - Begin a transaction. Give it an upper estimate of | ||
242 | * the number of blocks that will be changed during | ||
243 | * this handle. | ||
244 | * ocfs2_commit_trans - Complete a handle. | ||
245 | * ocfs2_extend_trans - Extend a handle by nblocks credits. This may | ||
246 | * commit the handle to disk in the process, but will | ||
247 | * not release any locks taken during the transaction. | ||
248 | * ocfs2_journal_access - Notify the handle that we want to journal this | ||
249 | * buffer. Will have to call ocfs2_journal_dirty once | ||
250 | * we've actually dirtied it. Type is an OCFS2_JOURNAL_ACCESS_* value. | ||
251 | * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. | ||
252 | * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before | ||
253 | * the current handle commits. | ||
254 | * ocfs2_handle_add_lock - Sometimes we need to delay lock release | ||
255 | * until after a transaction has been completed. Use | ||
256 | * ocfs2_handle_add_lock to indicate that a lock needs | ||
257 | * to be released at the end of that handle. Locks | ||
258 | * will be released in the order that they are added. | ||
259 | * ocfs2_handle_add_inode - Add a locked inode to a transaction. | ||
260 | */ | ||
261 | |||
262 | /* You must always start_trans with a number of buffs > 0, but it's | ||
263 | * perfectly legal to go through an entire transaction without having | ||
264 | * dirtied any buffers. */ | ||
265 | struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb); | ||
266 | struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb, | ||
267 | struct ocfs2_journal_handle *handle, | ||
268 | int max_buffs); | ||
269 | void ocfs2_commit_trans(struct ocfs2_journal_handle *handle); | ||
270 | int ocfs2_extend_trans(struct ocfs2_journal_handle *handle, | ||
271 | int nblocks); | ||
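/*
 * Minimal handle lifecycle under the rules above (a sketch; error
 * handling omitted, and the access/dirty steps are illustrated in the
 * comment block further below):
 *
 *	handle = ocfs2_alloc_handle(osb);
 *	handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS);
 *	...ocfs2_journal_access()/ocfs2_journal_dirty() on each buffer...
 *	ocfs2_commit_trans(handle);
 */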
272 | |||
273 | /* | ||
274 | * Create access is for when we get a newly created buffer and we're | ||
275 | * not gonna read it off disk, but rather fill it ourselves. Right | ||
276 | * now, we don't do anything special with this (it turns into a write | ||
277 | * request), but this is a good placeholder in case we do... | ||
278 | * | ||
279 | * Write access is for when we read a block off disk and are going to | ||
280 | * modify it. This way the journalling layer knows it may need to make | ||
281 | * a copy of that block (if it's part of another, uncommitted | ||
282 | * transaction) before we do so. | ||
283 | */ | ||
284 | #define OCFS2_JOURNAL_ACCESS_CREATE 0 | ||
285 | #define OCFS2_JOURNAL_ACCESS_WRITE 1 | ||
286 | #define OCFS2_JOURNAL_ACCESS_UNDO 2 | ||
287 | |||
288 | int ocfs2_journal_access(struct ocfs2_journal_handle *handle, | ||
289 | struct inode *inode, | ||
290 | struct buffer_head *bh, | ||
291 | int type); | ||
292 | /* | ||
293 | * A word about the journal_access/journal_dirty "dance". It is | ||
294 | * entirely legal to journal_access a buffer more than once (as long | ||
295 | * as the access type is the same -- I'm not sure what will happen if | ||
296 | * access type is different, but this should never happen anyway). It is | ||
297 | * also legal to journal_dirty a buffer more than once. In fact, you | ||
298 | * can even journal_access a buffer after you've done a | ||
299 | * journal_access/journal_dirty pair. The only thing you cannot do | ||
300 | * however, is journal_dirty a buffer which you haven't yet passed to | ||
301 | * journal_access at least once. | ||
302 | * | ||
303 | * That said, 99% of the time this doesn't matter and this is what the | ||
304 | * path looks like: | ||
305 | * | ||
306 | * <read a bh> | ||
307 | * ocfs2_journal_access(handle, bh, OCFS2_JOURNAL_ACCESS_WRITE); | ||
308 | * <modify the bh> | ||
309 | * ocfs2_journal_dirty(handle, bh); | ||
310 | */ | ||
311 | int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle, | ||
312 | struct buffer_head *bh); | ||
313 | int ocfs2_journal_dirty_data(handle_t *handle, | ||
314 | struct buffer_head *bh); | ||
315 | int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle, | ||
316 | struct inode *inode); | ||
317 | /* | ||
318 | * Use this to protect from other processes reading buffer state while | ||
319 | * it's in flight. | ||
320 | */ | ||
321 | void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle, | ||
322 | struct inode *inode); | ||
323 | |||
324 | /* | ||
325 | * Credit Macros: | ||
326 | * Convenience macros to calculate number of credits needed. | ||
327 | * | ||
328 | * For convenience's sake, I have a set of macros here which calculate | ||
329 | * the *maximum* number of sectors which will be changed for various | ||
330 | * metadata updates. | ||
331 | */ | ||
332 | |||
333 | /* simple file updates like chmod, etc. */ | ||
334 | #define OCFS2_INODE_UPDATE_CREDITS 1 | ||
335 | |||
336 | /* get one bit out of a suballocator: dinode + group descriptor + | ||
337 | * prev. group desc. if we relink. */ | ||
338 | #define OCFS2_SUBALLOC_ALLOC (3) | ||
339 | |||
340 | /* dinode + group descriptor update. We don't relink on free yet. */ | ||
341 | #define OCFS2_SUBALLOC_FREE (2) | ||
342 | |||
343 | #define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS | ||
344 | #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ | ||
345 | + OCFS2_TRUNCATE_LOG_UPDATE) | ||
346 | |||
347 | /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + | ||
348 | * bitmap block for the new bit) */ | ||
349 | #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) | ||
350 | |||
351 | /* parent fe, parent block, new file entry, inode alloc fe, inode alloc | ||
352 | * group descriptor + mkdir/symlink blocks */ | ||
353 | #define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \ | ||
354 | + OCFS2_DIR_LINK_ADDITIONAL_CREDITS) | ||
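/* With the values above this works out to 3 + 3 + (1 + 2) = 9 credits. */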
355 | |||
356 | /* local alloc metadata change + main bitmap updates */ | ||
357 | #define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ | ||
358 | + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE) | ||
359 | |||
360 | /* used when we don't need an allocation change for a dir extend. One | ||
361 | * for the dinode, one for the new block. */ | ||
362 | #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) | ||
363 | |||
364 | /* file update (nlink, etc) + dir entry block */ | ||
365 | #define OCFS2_LINK_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) | ||
366 | |||
367 | /* inode + dir inode (if we unlink a dir), + dir entry block + orphan | ||
368 | * dir inode link */ | ||
369 | #define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \ | ||
370 | + OCFS2_LINK_CREDITS) | ||
371 | |||
372 | /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + | ||
373 | * inode alloc group descriptor */ | ||
374 | #define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1) | ||
375 | |||
376 | /* dinode update, old dir dinode update, new dir dinode update, old | ||
377 | * dir dir entry, new dir dir entry, dir entry update for renaming | ||
378 | * directory + target unlink */ | ||
379 | #define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ | ||
380 | + OCFS2_UNLINK_CREDITS) | ||
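/* Plugging in the numbers above: LINK = 1 + 1 = 2, UNLINK = 2 + 1 + 2 = 5,
 * DELETE_INODE = 3 + 1 + 1 = 5, and RENAME = 3 + 3 + 5 = 11 credits. */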
381 | |||
382 | static inline int ocfs2_calc_extend_credits(struct super_block *sb, | ||
383 | struct ocfs2_dinode *fe, | ||
384 | u32 bits_wanted) | ||
385 | { | ||
386 | int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks; | ||
387 | |||
388 | /* bitmap dinode, group desc. + relinked group. */ | ||
389 | bitmap_blocks = OCFS2_SUBALLOC_ALLOC; | ||
390 | |||
391 | /* we might need to shift tree depth, so let's assume an | ||
392 | * absolute worst case of complete fragmentation. Even with | ||
393 | * that, we only need one update for the dinode, and then | ||
394 | * however many metadata chunks needed * a remaining suballoc | ||
395 | * alloc. */ | ||
396 | sysfile_bitmap_blocks = 1 + | ||
397 | (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe); | ||
398 | |||
399 | /* this does not include *new* metadata blocks, which are | ||
400 | * accounted for in sysfile_bitmap_blocks. fe + | ||
401 | * prev. last_eb_blk + blocks along edge of tree. | ||
402 | * calc_symlink_credits passes because we just need 1 | ||
403 | * credit for the dinode there. */ | ||
404 | dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth); | ||
405 | |||
406 | return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks; | ||
407 | } | ||
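/* Example: for a dinode with l_tree_depth == 0, and assuming
 * ocfs2_extend_meta_needed() reports no new metadata blocks, this
 * evaluates to 3 (bitmap) + 1 (sysfile bitmap) + 2 (dinode blocks)
 * = 6 credits. */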
408 | |||
409 | static inline int ocfs2_calc_symlink_credits(struct super_block *sb) | ||
410 | { | ||
411 | int blocks = OCFS2_MKNOD_CREDITS; | ||
412 | |||
413 | /* links can be longer than one block so we may update many | ||
414 | * within our single allocated extent. */ | ||
415 | blocks += ocfs2_clusters_to_blocks(sb, 1); | ||
416 | |||
417 | return blocks; | ||
418 | } | ||
419 | |||
420 | static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, | ||
421 | unsigned int cpg) | ||
422 | { | ||
423 | int blocks; | ||
424 | int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1; | ||
425 | /* parent inode update + new block group header + bitmap inode update | ||
426 | + bitmap blocks affected */ | ||
427 | blocks = 1 + 1 + 1 + bitmap_blocks; | ||
428 | return blocks; | ||
429 | } | ||
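/* With OCFS2_SUBALLOC_ALLOC == 3 this is always 1 + 1 + 1 + 4 = 7 credits;
 * note that cpg is currently unused in the calculation. */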
430 | |||
431 | static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, | ||
432 | unsigned int clusters_to_del, | ||
433 | struct ocfs2_dinode *fe, | ||
434 | struct ocfs2_extent_list *last_el) | ||
435 | { | ||
436 | /* for dinode + all headers in this pass + update to next leaf */ | ||
437 | u16 next_free = le16_to_cpu(last_el->l_next_free_rec); | ||
438 | u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth); | ||
439 | int credits = 1 + tree_depth + 1; | ||
440 | int i; | ||
441 | |||
442 | i = next_free - 1; | ||
443 | BUG_ON(i < 0); | ||
444 | |||
445 | /* We may be deleting metadata blocks, so metadata alloc dinode + | ||
446 | one desc. block for each possible delete. */ | ||
447 | if (tree_depth && next_free == 1 && | ||
448 | le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del) | ||
449 | credits += 1 + tree_depth; | ||
450 | |||
451 | /* update to the truncate log. */ | ||
452 | credits += OCFS2_TRUNCATE_LOG_UPDATE; | ||
453 | |||
454 | return credits; | ||
455 | } | ||
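/* Example: with tree_depth == 1 and a truncate that does not empty the
 * last extent record, this is (1 + 1 + 1) for the dinode, headers and
 * next leaf, plus OCFS2_TRUNCATE_LOG_UPDATE, i.e. 4 credits total. */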
456 | |||
457 | #endif /* OCFS2_JOURNAL_H */ | ||
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c new file mode 100644 index 000000000000..fe373a2101d9 --- /dev/null +++ b/fs/ocfs2/localalloc.c | |||
@@ -0,0 +1,983 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * localalloc.c | ||
5 | * | ||
6 | * Node local data allocation | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/bitops.h> | ||
31 | |||
32 | #define MLOG_MASK_PREFIX ML_DISK_ALLOC | ||
33 | #include <cluster/masklog.h> | ||
34 | |||
35 | #include "ocfs2.h" | ||
36 | |||
37 | #include "alloc.h" | ||
38 | #include "dlmglue.h" | ||
39 | #include "inode.h" | ||
40 | #include "journal.h" | ||
41 | #include "localalloc.h" | ||
42 | #include "suballoc.h" | ||
43 | #include "super.h" | ||
44 | #include "sysfile.h" | ||
45 | |||
46 | #include "buffer_head_io.h" | ||
47 | |||
48 | #define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) | ||
49 | |||
50 | static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb); | ||
51 | |||
52 | static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); | ||
53 | |||
54 | static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, | ||
55 | struct ocfs2_dinode *alloc, | ||
56 | u32 numbits); | ||
57 | |||
58 | static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); | ||
59 | |||
60 | static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, | ||
61 | struct ocfs2_journal_handle *handle, | ||
62 | struct ocfs2_dinode *alloc, | ||
63 | struct inode *main_bm_inode, | ||
64 | struct buffer_head *main_bm_bh); | ||
65 | |||
66 | static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, | ||
67 | struct ocfs2_journal_handle *handle, | ||
68 | struct ocfs2_alloc_context **ac, | ||
69 | struct inode **bitmap_inode, | ||
70 | struct buffer_head **bitmap_bh); | ||
71 | |||
72 | static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, | ||
73 | struct ocfs2_journal_handle *handle, | ||
74 | struct ocfs2_alloc_context *ac); | ||
75 | |||
76 | static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, | ||
77 | struct inode *local_alloc_inode); | ||
78 | |||
79 | /* | ||
80 | * Determine how large our local alloc window should be, in bits. | ||
81 | * | ||
82 | * These values (and the behavior in ocfs2_alloc_should_use_local) have | ||
83 | * been chosen so that most allocations, including new block groups, go | ||
84 | * through local alloc. | ||
85 | */ | ||
86 | static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) | ||
87 | { | ||
88 | BUG_ON(osb->s_clustersize_bits < 12); | ||
89 | |||
90 | return 2048 >> (osb->s_clustersize_bits - 12); | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * Tell us whether a given allocation should use the local alloc | ||
95 | * file. Otherwise, it has to go to the main bitmap. | ||
96 | */ | ||
97 | int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) | ||
98 | { | ||
99 | int la_bits = ocfs2_local_alloc_window_bits(osb); | ||
100 | |||
101 | if (osb->local_alloc_state != OCFS2_LA_ENABLED) | ||
102 | return 0; | ||
103 | |||
104 | /* la_bits should be at least twice the size (in clusters) of | ||
105 | * a new block group. We want to be sure block group | ||
106 | * allocations go through the local alloc, so allow an | ||
107 | * allocation to take up to half the bitmap. */ | ||
108 | if (bits > (la_bits / 2)) | ||
109 | return 0; | ||
110 | |||
111 | return 1; | ||
112 | } | ||
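
As a concrete check of the arithmetic in the two functions above, here is a small standalone sketch (our own illustrative names, not kernel API) showing how the window shrinks as clusters grow and which requests qualify for the local alloc:

#include <stdio.h>
#include <stdint.h>

/* With 4K clusters (clustersize_bits == 12) the window is 2048 bits;
 * it halves for every doubling of the cluster size, so a full window
 * always covers 8MB of space. */
static int window_bits(int clustersize_bits)
{
	return 2048 >> (clustersize_bits - 12);
}

/* Mirrors the decision in ocfs2_alloc_should_use_local(): only
 * allocations of up to half the window are served locally. */
static int should_use_local(int clustersize_bits, uint64_t bits)
{
	return bits <= (uint64_t)(window_bits(clustersize_bits) / 2);
}

int main(void)
{
	int csb;

	for (csb = 12; csb <= 20; csb++)
		printf("cluster size %8u: window %4d bits, "
		       "100-cluster request %s local alloc\n",
		       1u << csb, window_bits(csb),
		       should_use_local(csb, 100) ? "uses" : "skips");
	return 0;
}
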
113 | |||
114 | int ocfs2_load_local_alloc(struct ocfs2_super *osb) | ||
115 | { | ||
116 | int status = 0; | ||
117 | struct ocfs2_dinode *alloc = NULL; | ||
118 | struct buffer_head *alloc_bh = NULL; | ||
119 | u32 num_used; | ||
120 | struct inode *inode = NULL; | ||
121 | struct ocfs2_local_alloc *la; | ||
122 | |||
123 | mlog_entry_void(); | ||
124 | |||
125 | /* read the alloc off disk */ | ||
126 | inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, | ||
127 | osb->slot_num); | ||
128 | if (!inode) { | ||
129 | status = -EINVAL; | ||
130 | mlog_errno(status); | ||
131 | goto bail; | ||
132 | } | ||
133 | |||
134 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, | ||
135 | &alloc_bh, 0, inode); | ||
136 | if (status < 0) { | ||
137 | mlog_errno(status); | ||
138 | goto bail; | ||
139 | } | ||
140 | |||
141 | alloc = (struct ocfs2_dinode *) alloc_bh->b_data; | ||
142 | la = OCFS2_LOCAL_ALLOC(alloc); | ||
143 | |||
144 | if (!(le32_to_cpu(alloc->i_flags) & | ||
145 | (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) { | ||
146 | mlog(ML_ERROR, "Invalid local alloc inode, %"MLFu64"\n", | ||
147 | OCFS2_I(inode)->ip_blkno); | ||
148 | status = -EINVAL; | ||
149 | goto bail; | ||
150 | } | ||
151 | |||
152 | if ((la->la_size == 0) || | ||
153 | (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) { | ||
154 | mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n", | ||
155 | le16_to_cpu(la->la_size)); | ||
156 | status = -EINVAL; | ||
157 | goto bail; | ||
158 | } | ||
159 | |||
160 | /* do a little verification. */ | ||
161 | num_used = ocfs2_local_alloc_count_bits(alloc); | ||
162 | |||
163 | /* hopefully the local alloc has always been recovered before | ||
164 | * we load it. */ | ||
165 | if (num_used | ||
166 | || alloc->id1.bitmap1.i_used | ||
167 | || alloc->id1.bitmap1.i_total | ||
168 | || la->la_bm_off) | ||
169 | mlog(ML_ERROR, "Local alloc hasn't been recovered!\n" | ||
170 | "found = %u, set = %u, taken = %u, off = %u\n", | ||
171 | num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), | ||
172 | le32_to_cpu(alloc->id1.bitmap1.i_total), | ||
173 | OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); | ||
174 | |||
175 | osb->local_alloc_bh = alloc_bh; | ||
176 | osb->local_alloc_state = OCFS2_LA_ENABLED; | ||
177 | |||
178 | bail: | ||
179 | if (status < 0) | ||
180 | if (alloc_bh) | ||
181 | brelse(alloc_bh); | ||
182 | if (inode) | ||
183 | iput(inode); | ||
184 | |||
185 | mlog_exit(status); | ||
186 | return status; | ||
187 | } | ||
188 | |||
189 | /* | ||
190 | * return any unused bits to the bitmap and write out a clean | ||
191 | * local_alloc. | ||
192 | * | ||
193 | * This uses the local alloc buffer head cached on the osb; be warned | ||
194 | * that osb->local_alloc_bh *will* be returned brelse'd and NULL'd | ||
195 | * out. */ | ||
196 | void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) | ||
197 | { | ||
198 | int status; | ||
199 | struct ocfs2_journal_handle *handle = NULL; | ||
200 | struct inode *local_alloc_inode = NULL; | ||
201 | struct buffer_head *bh = NULL; | ||
202 | struct buffer_head *main_bm_bh = NULL; | ||
203 | struct inode *main_bm_inode = NULL; | ||
204 | struct ocfs2_dinode *alloc_copy = NULL; | ||
205 | struct ocfs2_dinode *alloc = NULL; | ||
206 | |||
207 | mlog_entry_void(); | ||
208 | |||
209 | if (osb->local_alloc_state == OCFS2_LA_UNUSED) | ||
210 | goto bail; | ||
211 | |||
212 | local_alloc_inode = | ||
213 | ocfs2_get_system_file_inode(osb, | ||
214 | LOCAL_ALLOC_SYSTEM_INODE, | ||
215 | osb->slot_num); | ||
216 | if (!local_alloc_inode) { | ||
217 | status = -ENOENT; | ||
218 | mlog_errno(status); | ||
219 | goto bail; | ||
220 | } | ||
221 | |||
222 | osb->local_alloc_state = OCFS2_LA_DISABLED; | ||
223 | |||
224 | handle = ocfs2_alloc_handle(osb); | ||
225 | if (!handle) { | ||
226 | status = -ENOMEM; | ||
227 | mlog_errno(status); | ||
228 | goto bail; | ||
229 | } | ||
230 | |||
231 | main_bm_inode = ocfs2_get_system_file_inode(osb, | ||
232 | GLOBAL_BITMAP_SYSTEM_INODE, | ||
233 | OCFS2_INVALID_SLOT); | ||
234 | if (!main_bm_inode) { | ||
235 | status = -EINVAL; | ||
236 | mlog_errno(status); | ||
237 | goto bail; | ||
238 | } | ||
239 | |||
240 | ocfs2_handle_add_inode(handle, main_bm_inode); | ||
241 | status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1); | ||
242 | if (status < 0) { | ||
243 | mlog_errno(status); | ||
244 | goto bail; | ||
245 | } | ||
246 | |||
247 | /* WINDOW_MOVE_CREDITS is a bit heavy... */ | ||
248 | handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS); | ||
249 | if (IS_ERR(handle)) { | ||
250 | mlog_errno(PTR_ERR(handle)); | ||
251 | handle = NULL; | ||
252 | goto bail; | ||
253 | } | ||
254 | |||
255 | bh = osb->local_alloc_bh; | ||
256 | alloc = (struct ocfs2_dinode *) bh->b_data; | ||
257 | |||
258 | alloc_copy = kmalloc(bh->b_size, GFP_KERNEL); | ||
259 | if (!alloc_copy) { | ||
260 | status = -ENOMEM; | ||
261 | goto bail; | ||
262 | } | ||
263 | memcpy(alloc_copy, alloc, bh->b_size); | ||
264 | |||
265 | status = ocfs2_journal_access(handle, local_alloc_inode, bh, | ||
266 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
267 | if (status < 0) { | ||
268 | mlog_errno(status); | ||
269 | goto bail; | ||
270 | } | ||
271 | |||
272 | ocfs2_clear_local_alloc(alloc); | ||
273 | |||
274 | status = ocfs2_journal_dirty(handle, bh); | ||
275 | if (status < 0) { | ||
276 | mlog_errno(status); | ||
277 | goto bail; | ||
278 | } | ||
279 | |||
280 | brelse(bh); | ||
281 | osb->local_alloc_bh = NULL; | ||
282 | osb->local_alloc_state = OCFS2_LA_UNUSED; | ||
283 | |||
284 | status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, | ||
285 | main_bm_inode, main_bm_bh); | ||
286 | if (status < 0) | ||
287 | mlog_errno(status); | ||
288 | |||
289 | bail: | ||
290 | if (handle) | ||
291 | ocfs2_commit_trans(handle); | ||
292 | |||
293 | if (main_bm_bh) | ||
294 | brelse(main_bm_bh); | ||
295 | |||
296 | if (main_bm_inode) | ||
297 | iput(main_bm_inode); | ||
298 | |||
299 | if (local_alloc_inode) | ||
300 | iput(local_alloc_inode); | ||
301 | |||
302 | if (alloc_copy) | ||
303 | kfree(alloc_copy); | ||
304 | |||
305 | mlog_exit_void(); | ||
306 | } | ||
307 | |||
308 | /* | ||
309 | * We want to free the bitmap bits outside of any recovery context as | ||
310 | * we'll need a cluster lock to do so, but we must clear the local | ||
311 | * alloc before giving up the recovered node's journal. To solve this, | ||
312 | * we kmalloc a copy of the local alloc before it's changed, for the | ||
313 | * caller to process with ocfs2_complete_local_alloc_recovery. | ||
314 | */ | ||
315 | int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, | ||
316 | int slot_num, | ||
317 | struct ocfs2_dinode **alloc_copy) | ||
318 | { | ||
319 | int status = 0; | ||
320 | struct buffer_head *alloc_bh = NULL; | ||
321 | struct inode *inode = NULL; | ||
322 | struct ocfs2_dinode *alloc; | ||
323 | |||
324 | mlog_entry("(slot_num = %d)\n", slot_num); | ||
325 | |||
326 | *alloc_copy = NULL; | ||
327 | |||
328 | inode = ocfs2_get_system_file_inode(osb, | ||
329 | LOCAL_ALLOC_SYSTEM_INODE, | ||
330 | slot_num); | ||
331 | if (!inode) { | ||
332 | status = -EINVAL; | ||
333 | mlog_errno(status); | ||
334 | goto bail; | ||
335 | } | ||
336 | |||
337 | down(&inode->i_sem); | ||
338 | |||
339 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, | ||
340 | &alloc_bh, 0, inode); | ||
341 | if (status < 0) { | ||
342 | mlog_errno(status); | ||
343 | goto bail; | ||
344 | } | ||
345 | |||
346 | *alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL); | ||
347 | if (!(*alloc_copy)) { | ||
348 | status = -ENOMEM; | ||
349 | goto bail; | ||
350 | } | ||
351 | memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size); | ||
352 | |||
353 | alloc = (struct ocfs2_dinode *) alloc_bh->b_data; | ||
354 | ocfs2_clear_local_alloc(alloc); | ||
355 | |||
356 | status = ocfs2_write_block(osb, alloc_bh, inode); | ||
357 | if (status < 0) | ||
358 | mlog_errno(status); | ||
359 | |||
360 | bail: | ||
361 | if ((status < 0) && (*alloc_copy)) { | ||
362 | kfree(*alloc_copy); | ||
363 | *alloc_copy = NULL; | ||
364 | } | ||
365 | |||
366 | if (alloc_bh) | ||
367 | brelse(alloc_bh); | ||
368 | |||
369 | if (inode) { | ||
370 | up(&inode->i_sem); | ||
371 | iput(inode); | ||
372 | } | ||
373 | |||
374 | mlog_exit(status); | ||
375 | return status; | ||
376 | } | ||
377 | |||
378 | /* | ||
379 | * Step 2: By now, we've completed the journal recovery, we've stamped | ||
380 | * a clean local alloc on disk and dropped the node out of the | ||
381 | * recovery map. DLM locks will no longer stall, so let's clear out the | ||
382 | * main bitmap. | ||
383 | */ | ||
384 | int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, | ||
385 | struct ocfs2_dinode *alloc) | ||
386 | { | ||
387 | int status; | ||
388 | struct ocfs2_journal_handle *handle = NULL; | ||
389 | struct buffer_head *main_bm_bh = NULL; | ||
390 | struct inode *main_bm_inode = NULL; | ||
391 | |||
392 | mlog_entry_void(); | ||
393 | |||
394 | handle = ocfs2_alloc_handle(osb); | ||
395 | if (!handle) { | ||
396 | status = -ENOMEM; | ||
397 | mlog_errno(status); | ||
398 | goto bail; | ||
399 | } | ||
400 | |||
401 | main_bm_inode = ocfs2_get_system_file_inode(osb, | ||
402 | GLOBAL_BITMAP_SYSTEM_INODE, | ||
403 | OCFS2_INVALID_SLOT); | ||
404 | if (!main_bm_inode) { | ||
405 | status = -EINVAL; | ||
406 | mlog_errno(status); | ||
407 | goto bail; | ||
408 | } | ||
409 | |||
410 | ocfs2_handle_add_inode(handle, main_bm_inode); | ||
411 | status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1); | ||
412 | if (status < 0) { | ||
413 | mlog_errno(status); | ||
414 | goto bail; | ||
415 | } | ||
416 | |||
417 | handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS); | ||
418 | if (IS_ERR(handle)) { | ||
419 | status = PTR_ERR(handle); | ||
420 | handle = NULL; | ||
421 | mlog_errno(status); | ||
422 | goto bail; | ||
423 | } | ||
424 | |||
425 | /* we want the bitmap change to be recorded on disk asap */ | ||
426 | ocfs2_handle_set_sync(handle, 1); | ||
427 | |||
428 | status = ocfs2_sync_local_to_main(osb, handle, alloc, | ||
429 | main_bm_inode, main_bm_bh); | ||
430 | if (status < 0) | ||
431 | mlog_errno(status); | ||
432 | |||
433 | bail: | ||
434 | if (handle) | ||
435 | ocfs2_commit_trans(handle); | ||
436 | |||
437 | if (main_bm_bh) | ||
438 | brelse(main_bm_bh); | ||
439 | |||
440 | if (main_bm_inode) | ||
441 | iput(main_bm_inode); | ||
442 | |||
443 | mlog_exit(status); | ||
444 | return status; | ||
445 | } | ||
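
The split between these two recovery functions is easiest to see as a caller contract: step one snapshots the dead node's local alloc and stamps it clean, step two processes the snapshot later, once locks no longer stall. A toy userspace model of that contract, with purely illustrative names and a printf standing in for the bitmap free:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct alloc { unsigned used, total; };

static struct alloc ondisk = { 42, 2048 };	/* the dead node's local alloc */

static int begin_recovery(struct alloc **copy)
{
	*copy = malloc(sizeof(**copy));
	if (!*copy)
		return -1;
	memcpy(*copy, &ondisk, sizeof(ondisk));	/* snapshot it... */
	memset(&ondisk, 0, sizeof(ondisk));	/* ...then stamp it clean */
	return 0;
}

static void complete_recovery(const struct alloc *copy)
{
	printf("freeing %u unused bits in the main bitmap\n",
	       copy->total - copy->used);
}

int main(void)
{
	struct alloc *copy;

	if (begin_recovery(&copy))
		return 1;
	/* ...journal recovery finishes here; locks no longer stall... */
	complete_recovery(copy);
	free(copy);
	return 0;
}
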
446 | |||
447 | /* | ||
448 | * make sure we've got at least bits_wanted contiguous bits in the | ||
449 | * local alloc. You lose them when you drop i_sem. | ||
450 | * | ||
451 | * We will add ourselves to the transaction passed in, but may start | ||
452 | * our own in order to shift windows. | ||
453 | */ | ||
454 | int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, | ||
455 | struct ocfs2_journal_handle *passed_handle, | ||
456 | u32 bits_wanted, | ||
457 | struct ocfs2_alloc_context *ac) | ||
458 | { | ||
459 | int status; | ||
460 | struct ocfs2_dinode *alloc; | ||
461 | struct inode *local_alloc_inode; | ||
462 | unsigned int free_bits; | ||
463 | |||
464 | mlog_entry_void(); | ||
465 | |||
466 | BUG_ON(!passed_handle); | ||
467 | BUG_ON(!ac); | ||
468 | BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED); | ||
469 | |||
470 | local_alloc_inode = | ||
471 | ocfs2_get_system_file_inode(osb, | ||
472 | LOCAL_ALLOC_SYSTEM_INODE, | ||
473 | osb->slot_num); | ||
474 | if (!local_alloc_inode) { | ||
475 | status = -ENOENT; | ||
476 | mlog_errno(status); | ||
477 | goto bail; | ||
478 | } | ||
479 | ocfs2_handle_add_inode(passed_handle, local_alloc_inode); | ||
480 | |||
481 | if (osb->local_alloc_state != OCFS2_LA_ENABLED) { | ||
482 | status = -ENOSPC; | ||
483 | goto bail; | ||
484 | } | ||
485 | |||
486 | if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) { | ||
487 | mlog(0, "Asking for more than my max window size!\n"); | ||
488 | status = -ENOSPC; | ||
489 | goto bail; | ||
490 | } | ||
491 | |||
492 | alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; | ||
493 | |||
494 | if (le32_to_cpu(alloc->id1.bitmap1.i_used) != | ||
495 | ocfs2_local_alloc_count_bits(alloc)) { | ||
496 | ocfs2_error(osb->sb, "local alloc inode %"MLFu64" says it has " | ||
497 | "%u free bits, but a count shows %u", | ||
498 | le64_to_cpu(alloc->i_blkno), | ||
499 | le32_to_cpu(alloc->id1.bitmap1.i_used), | ||
500 | ocfs2_local_alloc_count_bits(alloc)); | ||
501 | status = -EIO; | ||
502 | goto bail; | ||
503 | } | ||
504 | |||
505 | free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - | ||
506 | le32_to_cpu(alloc->id1.bitmap1.i_used); | ||
507 | if (bits_wanted > free_bits) { | ||
508 | /* uhoh, window change time. */ | ||
509 | status = | ||
510 | ocfs2_local_alloc_slide_window(osb, local_alloc_inode); | ||
511 | if (status < 0) { | ||
512 | if (status != -ENOSPC) | ||
513 | mlog_errno(status); | ||
514 | goto bail; | ||
515 | } | ||
516 | } | ||
517 | |||
518 | ac->ac_inode = igrab(local_alloc_inode); | ||
519 | get_bh(osb->local_alloc_bh); | ||
520 | ac->ac_bh = osb->local_alloc_bh; | ||
521 | ac->ac_which = OCFS2_AC_USE_LOCAL; | ||
522 | status = 0; | ||
523 | bail: | ||
524 | if (local_alloc_inode) | ||
525 | iput(local_alloc_inode); | ||
526 | |||
527 | mlog_exit(status); | ||
528 | return status; | ||
529 | } | ||
530 | |||
531 | int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, | ||
532 | struct ocfs2_journal_handle *handle, | ||
533 | struct ocfs2_alloc_context *ac, | ||
534 | u32 min_bits, | ||
535 | u32 *bit_off, | ||
536 | u32 *num_bits) | ||
537 | { | ||
538 | int status, start; | ||
539 | struct inode *local_alloc_inode; | ||
540 | u32 bits_wanted; | ||
541 | void *bitmap; | ||
542 | struct ocfs2_dinode *alloc; | ||
543 | struct ocfs2_local_alloc *la; | ||
544 | |||
545 | mlog_entry_void(); | ||
546 | BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); | ||
547 | |||
548 | bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; | ||
549 | local_alloc_inode = ac->ac_inode; | ||
550 | alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; | ||
551 | la = OCFS2_LOCAL_ALLOC(alloc); | ||
552 | |||
553 | start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); | ||
554 | if (start == -1) { | ||
555 | /* TODO: Shouldn't we just BUG here? */ | ||
556 | status = -ENOSPC; | ||
557 | mlog_errno(status); | ||
558 | goto bail; | ||
559 | } | ||
560 | |||
561 | bitmap = la->la_bitmap; | ||
562 | *bit_off = le32_to_cpu(la->la_bm_off) + start; | ||
563 | /* local alloc is always contiguous by nature -- we never | ||
564 | * delete bits from it! */ | ||
565 | *num_bits = bits_wanted; | ||
566 | |||
567 | status = ocfs2_journal_access(handle, local_alloc_inode, | ||
568 | osb->local_alloc_bh, | ||
569 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
570 | if (status < 0) { | ||
571 | mlog_errno(status); | ||
572 | goto bail; | ||
573 | } | ||
574 | |||
575 | while(bits_wanted--) | ||
576 | ocfs2_set_bit(start++, bitmap); | ||
577 | |||
578 | alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits + | ||
579 | le32_to_cpu(alloc->id1.bitmap1.i_used)); | ||
580 | |||
581 | status = ocfs2_journal_dirty(handle, osb->local_alloc_bh); | ||
582 | if (status < 0) { | ||
583 | mlog_errno(status); | ||
584 | goto bail; | ||
585 | } | ||
586 | |||
587 | status = 0; | ||
588 | bail: | ||
589 | mlog_exit(status); | ||
590 | return status; | ||
591 | } | ||
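
A compact sketch of what the claim path does with the window, assuming a plain byte-addressed bitmap (all names here are illustrative): the window-local first-fit start is turned into a global offset by adding la_bm_off, the bits are set, and i_used is bumped by the same count:

#include <stdio.h>
#include <stdint.h>

static void set_bit(uint8_t *bm, int bit)
{
	bm[bit / 8] |= (uint8_t)(1u << (bit % 8));
}

int main(void)
{
	uint8_t bitmap[256] = { 0 };
	uint32_t la_bm_off = 51200;	/* window's first cluster in the volume */
	uint32_t i_used = 0;
	int start = 17;			/* window-local first-fit offset */
	uint32_t bits_wanted = 4, n = bits_wanted;

	printf("global bit_off = %u, num_bits = %u\n",
	       la_bm_off + (uint32_t)start, bits_wanted);
	while (n--)
		set_bit(bitmap, start++);
	i_used += bits_wanted;
	printf("i_used is now %u\n", i_used);
	return 0;
}
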
592 | |||
593 | static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) | ||
594 | { | ||
595 | int i; | ||
596 | u8 *buffer; | ||
597 | u32 count = 0; | ||
598 | struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); | ||
599 | |||
600 | mlog_entry_void(); | ||
601 | |||
602 | buffer = la->la_bitmap; | ||
603 | for (i = 0; i < le16_to_cpu(la->la_size); i++) | ||
604 | count += hweight8(buffer[i]); | ||
605 | |||
606 | mlog_exit(count); | ||
607 | return count; | ||
608 | } | ||
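
hweight8() is just an 8-bit population count summed over the bitmap; a userspace analogue, assuming a GCC-style __builtin_popcount, looks like this:

#include <stdio.h>
#include <stdint.h>

static uint32_t count_bits(const uint8_t *bitmap, uint16_t size)
{
	uint32_t count = 0;
	uint16_t i;

	for (i = 0; i < size; i++)
		count += (uint32_t)__builtin_popcount(bitmap[i]);
	return count;
}

int main(void)
{
	uint8_t bm[4] = { 0xff, 0x0f, 0x01, 0x00 };

	printf("%u bits set\n", count_bits(bm, sizeof(bm)));	/* prints 13 */
	return 0;
}
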
609 | |||
610 | static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, | ||
611 | struct ocfs2_dinode *alloc, | ||
612 | u32 numbits) | ||
613 | { | ||
614 | int numfound, bitoff, left, startoff, lastzero; | ||
615 | void *bitmap = NULL; | ||
616 | |||
617 | mlog_entry("(numbits wanted = %u)\n", numbits); | ||
618 | |||
619 | if (!alloc->id1.bitmap1.i_total) { | ||
620 | mlog(0, "No bits in my window!\n"); | ||
621 | bitoff = -1; | ||
622 | goto bail; | ||
623 | } | ||
624 | |||
625 | bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; | ||
626 | |||
627 | numfound = bitoff = startoff = 0; | ||
628 | lastzero = -1; | ||
629 | left = le32_to_cpu(alloc->id1.bitmap1.i_total); | ||
630 | while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) { | ||
631 | if (bitoff == left) { | ||
632 | /* mlog(0, "bitoff (%d) == left", bitoff); */ | ||
633 | break; | ||
634 | } | ||
635 | /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, " | ||
636 | "numfound = %d\n", bitoff, startoff, numfound);*/ | ||
637 | |||
638 | /* Ok, we found a zero bit... is it contiguous, or do we | ||
639 | * start over? */ | ||
640 | if (bitoff == startoff) { | ||
641 | /* we found a zero */ | ||
642 | numfound++; | ||
643 | startoff++; | ||
644 | } else { | ||
645 | /* got a zero after some ones */ | ||
646 | numfound = 1; | ||
647 | startoff = bitoff+1; | ||
648 | } | ||
649 | /* we got everything we needed */ | ||
650 | if (numfound == numbits) { | ||
651 | /* mlog(0, "Found it all!\n"); */ | ||
652 | break; | ||
653 | } | ||
654 | } | ||
655 | |||
656 | mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, | ||
657 | numfound); | ||
658 | |||
659 | if (numfound == numbits) | ||
660 | bitoff = startoff - numfound; | ||
661 | else | ||
662 | bitoff = -1; | ||
663 | |||
664 | bail: | ||
665 | mlog_exit(bitoff); | ||
666 | return bitoff; | ||
667 | } | ||
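
Stripped of the mlog scaffolding and ocfs2_find_next_zero_bit(), the search above is a first-fit scan for a run of clear bits. A minimal standalone version of the same startoff/numfound bookkeeping (illustrative, not the kernel code):

#include <stdio.h>
#include <stdint.h>

static int test_bit(const uint8_t *bm, int bit)
{
	return (bm[bit / 8] >> (bit % 8)) & 1;
}

/* First-fit search for 'numbits' consecutive zero bits in a bitmap
 * of 'total' bits.  Returns the offset of the run, or -1. */
static int find_clear_bits(const uint8_t *bm, int total, int numbits)
{
	int bitoff, numfound = 0;

	for (bitoff = 0; bitoff < total; bitoff++) {
		if (test_bit(bm, bitoff))
			numfound = 0;	/* run broken, start over */
		else if (++numfound == numbits)
			return bitoff - numbits + 1;
	}
	return -1;
}

int main(void)
{
	uint8_t bm[2] = { 0x17, 0x00 };	/* bits 0,1,2,4 set */

	printf("%d\n", find_clear_bits(bm, 16, 3));	/* prints 5 */
	return 0;
}
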
668 | |||
669 | static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc) | ||
670 | { | ||
671 | struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); | ||
672 | int i; | ||
673 | mlog_entry_void(); | ||
674 | |||
675 | alloc->id1.bitmap1.i_total = 0; | ||
676 | alloc->id1.bitmap1.i_used = 0; | ||
677 | la->la_bm_off = 0; | ||
678 | for(i = 0; i < le16_to_cpu(la->la_size); i++) | ||
679 | la->la_bitmap[i] = 0; | ||
680 | |||
681 | mlog_exit_void(); | ||
682 | } | ||
683 | |||
684 | #if 0 | ||
685 | /* turn this on and uncomment below to aid debugging window shifts. */ | ||
686 | static void ocfs2_verify_zero_bits(unsigned long *bitmap, | ||
687 | unsigned int start, | ||
688 | unsigned int count) | ||
689 | { | ||
690 | unsigned int tmp = count; | ||
691 | while(tmp--) { | ||
692 | if (ocfs2_test_bit(start + tmp, bitmap)) { | ||
693 | printk("ocfs2_verify_zero_bits: start = %u, count = " | ||
694 | "%u\n", start, count); | ||
695 | printk("ocfs2_verify_zero_bits: bit %u is set!", | ||
696 | start + tmp); | ||
697 | BUG(); | ||
698 | } | ||
699 | } | ||
700 | } | ||
701 | #endif | ||
702 | |||
703 | /* | ||
704 | * sync the local alloc to main bitmap. | ||
705 | * | ||
706 | * assumes you've already locked the main bitmap -- the bitmap inode | ||
707 | * passed is used for caching. | ||
708 | */ | ||
709 | static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, | ||
710 | struct ocfs2_journal_handle *handle, | ||
711 | struct ocfs2_dinode *alloc, | ||
712 | struct inode *main_bm_inode, | ||
713 | struct buffer_head *main_bm_bh) | ||
714 | { | ||
715 | int status = 0; | ||
716 | int bit_off, left, count, start; | ||
717 | u64 la_start_blk; | ||
718 | u64 blkno; | ||
719 | void *bitmap; | ||
720 | struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); | ||
721 | |||
722 | mlog_entry("total = %u, COUNT = %u, used = %u\n", | ||
723 | le32_to_cpu(alloc->id1.bitmap1.i_total), | ||
724 | ocfs2_local_alloc_count_bits(alloc), | ||
725 | le32_to_cpu(alloc->id1.bitmap1.i_used)); | ||
726 | |||
727 | if (!alloc->id1.bitmap1.i_total) { | ||
728 | mlog(0, "nothing to sync!\n"); | ||
729 | goto bail; | ||
730 | } | ||
731 | |||
732 | if (le32_to_cpu(alloc->id1.bitmap1.i_used) == | ||
733 | le32_to_cpu(alloc->id1.bitmap1.i_total)) { | ||
734 | mlog(0, "all bits were taken!\n"); | ||
735 | goto bail; | ||
736 | } | ||
737 | |||
738 | la_start_blk = ocfs2_clusters_to_blocks(osb->sb, | ||
739 | le32_to_cpu(la->la_bm_off)); | ||
740 | bitmap = la->la_bitmap; | ||
741 | start = count = bit_off = 0; | ||
742 | left = le32_to_cpu(alloc->id1.bitmap1.i_total); | ||
743 | |||
744 | while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) | ||
745 | != -1) { | ||
746 | if ((bit_off < left) && (bit_off == start)) { | ||
747 | count++; | ||
748 | start++; | ||
749 | continue; | ||
750 | } | ||
751 | if (count) { | ||
752 | blkno = la_start_blk + | ||
753 | ocfs2_clusters_to_blocks(osb->sb, | ||
754 | start - count); | ||
755 | |||
756 | mlog(0, "freeing %u bits starting at local " | ||
757 | "alloc bit %u (la_start_blk = %"MLFu64", " | ||
758 | "blkno = %"MLFu64")\n", count, start - count, | ||
759 | la_start_blk, blkno); | ||
760 | |||
761 | status = ocfs2_free_clusters(handle, main_bm_inode, | ||
762 | main_bm_bh, blkno, count); | ||
763 | if (status < 0) { | ||
764 | mlog_errno(status); | ||
765 | goto bail; | ||
766 | } | ||
767 | } | ||
768 | if (bit_off >= left) | ||
769 | break; | ||
770 | count = 1; | ||
771 | start = bit_off + 1; | ||
772 | } | ||
773 | |||
774 | bail: | ||
775 | mlog_exit(status); | ||
776 | return status; | ||
777 | } | ||
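
The loop above is a run-length walk: consecutive clear bits are batched so each maximal run of unused clusters costs a single ocfs2_free_clusters() call. A self-contained sketch of that batching, with a printf standing in for the free:

#include <stdio.h>
#include <stdint.h>

static int test_bit(const uint8_t *bm, int bit)
{
	return (bm[bit / 8] >> (bit % 8)) & 1;
}

/* Enumerate maximal runs of clear bits in a 'total'-bit bitmap. */
static void free_clear_runs(const uint8_t *bm, int total)
{
	int bit, start = -1;

	for (bit = 0; bit <= total; bit++) {
		if (bit < total && !test_bit(bm, bit)) {
			if (start < 0)
				start = bit;	/* run begins */
		} else if (start >= 0) {
			printf("free %d bits at offset %d\n",
			       bit - start, start);
			start = -1;		/* run ends */
		}
	}
}

int main(void)
{
	uint8_t bm[2] = { 0x17, 0xf0 };	/* bits 0,1,2,4 and 12-15 set */

	free_clear_runs(bm, 16);	/* frees 1 bit at 3, 7 bits at 5 */
	return 0;
}
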
778 | |||
779 | static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, | ||
780 | struct ocfs2_journal_handle *handle, | ||
781 | struct ocfs2_alloc_context **ac, | ||
782 | struct inode **bitmap_inode, | ||
783 | struct buffer_head **bitmap_bh) | ||
784 | { | ||
785 | int status; | ||
786 | |||
787 | *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); | ||
788 | if (!(*ac)) { | ||
789 | status = -ENOMEM; | ||
790 | mlog_errno(status); | ||
791 | goto bail; | ||
792 | } | ||
793 | |||
794 | (*ac)->ac_handle = handle; | ||
795 | (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb); | ||
796 | |||
797 | status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); | ||
798 | if (status < 0) { | ||
799 | if (status != -ENOSPC) | ||
800 | mlog_errno(status); | ||
801 | goto bail; | ||
802 | } | ||
803 | |||
804 | *bitmap_inode = (*ac)->ac_inode; | ||
805 | igrab(*bitmap_inode); | ||
806 | *bitmap_bh = (*ac)->ac_bh; | ||
807 | get_bh(*bitmap_bh); | ||
808 | status = 0; | ||
809 | bail: | ||
810 | if ((status < 0) && *ac) { | ||
811 | ocfs2_free_alloc_context(*ac); | ||
812 | *ac = NULL; | ||
813 | } | ||
814 | |||
815 | mlog_exit(status); | ||
816 | return status; | ||
817 | } | ||
818 | |||
819 | /* | ||
820 | * The passed alloc context should already hold the main bitmap lock. | ||
821 | */ | ||
822 | static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, | ||
823 | struct ocfs2_journal_handle *handle, | ||
824 | struct ocfs2_alloc_context *ac) | ||
825 | { | ||
826 | int status = 0; | ||
827 | u32 cluster_off, cluster_count; | ||
828 | struct ocfs2_dinode *alloc = NULL; | ||
829 | struct ocfs2_local_alloc *la; | ||
830 | |||
831 | mlog_entry_void(); | ||
832 | |||
833 | alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; | ||
834 | la = OCFS2_LOCAL_ALLOC(alloc); | ||
835 | |||
836 | if (alloc->id1.bitmap1.i_total) | ||
837 | mlog(0, "asking me to alloc a new window over a non-empty " | ||
838 | "one\n"); | ||
839 | |||
840 | mlog(0, "Allocating %u clusters for a new window.\n", | ||
841 | ocfs2_local_alloc_window_bits(osb)); | ||
842 | /* we used the generic suballoc reserve function, but we set | ||
843 | * everything up nicely, so there's no reason why we can't use | ||
844 | * the more specific cluster api to claim bits. */ | ||
845 | status = ocfs2_claim_clusters(osb, handle, ac, | ||
846 | ocfs2_local_alloc_window_bits(osb), | ||
847 | &cluster_off, &cluster_count); | ||
848 | if (status < 0) { | ||
849 | if (status != -ENOSPC) | ||
850 | mlog_errno(status); | ||
851 | goto bail; | ||
852 | } | ||
853 | |||
854 | la->la_bm_off = cpu_to_le32(cluster_off); | ||
855 | alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count); | ||
856 | /* just in case... In the future, when we find space ourselves, | ||
857 | * it won't have to be all contiguous -- but we'll then have to | ||
858 | * set all previously used bits in the bitmap and update | ||
859 | * la_bits_set before setting the bits in the main bitmap. */ | ||
860 | alloc->id1.bitmap1.i_used = 0; | ||
861 | memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, | ||
862 | le16_to_cpu(la->la_size)); | ||
863 | |||
864 | mlog(0, "New window allocated:\n"); | ||
865 | mlog(0, "window la_bm_off = %u\n", | ||
866 | OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); | ||
867 | mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total)); | ||
868 | |||
869 | bail: | ||
870 | mlog_exit(status); | ||
871 | return status; | ||
872 | } | ||
873 | |||
874 | /* Note that we do *NOT* lock the local alloc inode here as | ||
875 | * it's been locked already for us. */ | ||
876 | static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, | ||
877 | struct inode *local_alloc_inode) | ||
878 | { | ||
879 | int status = 0; | ||
880 | struct buffer_head *main_bm_bh = NULL; | ||
881 | struct inode *main_bm_inode = NULL; | ||
882 | struct ocfs2_journal_handle *handle = NULL; | ||
883 | struct ocfs2_dinode *alloc; | ||
884 | struct ocfs2_dinode *alloc_copy = NULL; | ||
885 | struct ocfs2_alloc_context *ac = NULL; | ||
886 | |||
887 | mlog_entry_void(); | ||
888 | |||
889 | handle = ocfs2_alloc_handle(osb); | ||
890 | if (!handle) { | ||
891 | status = -ENOMEM; | ||
892 | mlog_errno(status); | ||
893 | goto bail; | ||
894 | } | ||
895 | |||
896 | /* This will lock the main bitmap for us. */ | ||
897 | status = ocfs2_local_alloc_reserve_for_window(osb, | ||
898 | handle, | ||
899 | &ac, | ||
900 | &main_bm_inode, | ||
901 | &main_bm_bh); | ||
902 | if (status < 0) { | ||
903 | if (status != -ENOSPC) | ||
904 | mlog_errno(status); | ||
905 | goto bail; | ||
906 | } | ||
907 | |||
908 | handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS); | ||
909 | if (IS_ERR(handle)) { | ||
910 | status = PTR_ERR(handle); | ||
911 | handle = NULL; | ||
912 | mlog_errno(status); | ||
913 | goto bail; | ||
914 | } | ||
915 | |||
916 | alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; | ||
917 | |||
918 | /* We want to clear the local alloc before doing anything | ||
919 | * else, so that if we error later during this operation, | ||
920 | * local alloc shutdown won't try to double free main bitmap | ||
921 | * bits. Make a copy so the sync function knows which bits to | ||
922 | * free. */ | ||
923 | alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL); | ||
924 | if (!alloc_copy) { | ||
925 | status = -ENOMEM; | ||
926 | mlog_errno(status); | ||
927 | goto bail; | ||
928 | } | ||
929 | memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); | ||
930 | |||
931 | status = ocfs2_journal_access(handle, local_alloc_inode, | ||
932 | osb->local_alloc_bh, | ||
933 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
934 | if (status < 0) { | ||
935 | mlog_errno(status); | ||
936 | goto bail; | ||
937 | } | ||
938 | |||
939 | ocfs2_clear_local_alloc(alloc); | ||
940 | |||
941 | status = ocfs2_journal_dirty(handle, osb->local_alloc_bh); | ||
942 | if (status < 0) { | ||
943 | mlog_errno(status); | ||
944 | goto bail; | ||
945 | } | ||
946 | |||
947 | status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, | ||
948 | main_bm_inode, main_bm_bh); | ||
949 | if (status < 0) { | ||
950 | mlog_errno(status); | ||
951 | goto bail; | ||
952 | } | ||
953 | |||
954 | status = ocfs2_local_alloc_new_window(osb, handle, ac); | ||
955 | if (status < 0) { | ||
956 | if (status != -ENOSPC) | ||
957 | mlog_errno(status); | ||
958 | goto bail; | ||
959 | } | ||
960 | |||
961 | atomic_inc(&osb->alloc_stats.moves); | ||
962 | |||
963 | status = 0; | ||
964 | bail: | ||
965 | if (handle) | ||
966 | ocfs2_commit_trans(handle); | ||
967 | |||
968 | if (main_bm_bh) | ||
969 | brelse(main_bm_bh); | ||
970 | |||
971 | if (main_bm_inode) | ||
972 | iput(main_bm_inode); | ||
973 | |||
974 | if (alloc_copy) | ||
975 | kfree(alloc_copy); | ||
976 | |||
977 | if (ac) | ||
978 | ocfs2_free_alloc_context(ac); | ||
979 | |||
980 | mlog_exit(status); | ||
981 | return status; | ||
982 | } | ||
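
The ordering here -- snapshot, clear the live window, sync from the snapshot, then claim the new window -- is what makes a mid-slide crash leak clusters at worst rather than double-free them. A toy model of that ordering (illustrative names and stub operations only):

#include <stdio.h>
#include <string.h>

struct window { int off, total, used; };

static int sync_to_main(const struct window *copy)
{
	printf("returning %d unused clusters at offset %d to main bitmap\n",
	       copy->total - copy->used, copy->off);
	return 0;
}

static int new_window(struct window *w, int off, int total)
{
	w->off = off;
	w->total = total;
	w->used = 0;
	return 0;
}

int main(void)
{
	struct window la = { .off = 100, .total = 2048, .used = 2000 };
	struct window copy = la;	/* snapshot before clearing */

	/* Clear first: a crash after this point can leak clusters,
	 * but can never double-free them. */
	memset(&la, 0, sizeof(la));

	if (sync_to_main(&copy))	/* free unused bits from the snapshot */
		return 1;
	return new_window(&la, 4096, 2048);	/* claim the next window */
}
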
983 | |||
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h new file mode 100644 index 000000000000..30f88ce14e46 --- /dev/null +++ b/fs/ocfs2/localalloc.h | |||
@@ -0,0 +1,56 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * localalloc.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_LOCALALLOC_H | ||
27 | #define OCFS2_LOCALALLOC_H | ||
28 | |||
29 | int ocfs2_load_local_alloc(struct ocfs2_super *osb); | ||
30 | |||
31 | void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); | ||
32 | |||
33 | int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, | ||
34 | int node_num, | ||
35 | struct ocfs2_dinode **alloc_copy); | ||
36 | |||
37 | int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, | ||
38 | struct ocfs2_dinode *alloc); | ||
39 | |||
40 | int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, | ||
41 | u64 bits); | ||
42 | |||
43 | struct ocfs2_alloc_context; | ||
44 | int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, | ||
45 | struct ocfs2_journal_handle *passed_handle, | ||
46 | u32 bits_wanted, | ||
47 | struct ocfs2_alloc_context *ac); | ||
48 | |||
49 | int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, | ||
50 | struct ocfs2_journal_handle *handle, | ||
51 | struct ocfs2_alloc_context *ac, | ||
52 | u32 min_bits, | ||
53 | u32 *bit_off, | ||
54 | u32 *num_bits); | ||
55 | |||
56 | #endif /* OCFS2_LOCALALLOC_H */ | ||
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c new file mode 100644 index 000000000000..afdeec4b0eef --- /dev/null +++ b/fs/ocfs2/mmap.c | |||
@@ -0,0 +1,102 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * mmap.c | ||
5 | * | ||
6 | * Code to deal with the mess that is clustered mmap. | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/pagemap.h> | ||
31 | #include <linux/uio.h> | ||
32 | #include <linux/signal.h> | ||
33 | #include <linux/rbtree.h> | ||
34 | |||
35 | #define MLOG_MASK_PREFIX ML_FILE_IO | ||
36 | #include <cluster/masklog.h> | ||
37 | |||
38 | #include "ocfs2.h" | ||
39 | |||
40 | #include "dlmglue.h" | ||
41 | #include "file.h" | ||
42 | #include "inode.h" | ||
43 | #include "mmap.h" | ||
44 | |||
45 | static struct page *ocfs2_nopage(struct vm_area_struct * area, | ||
46 | unsigned long address, | ||
47 | int *type) | ||
48 | { | ||
49 | struct inode *inode = area->vm_file->f_dentry->d_inode; | ||
50 | struct page *page = NOPAGE_SIGBUS; | ||
51 | sigset_t blocked, oldset; | ||
52 | int ret; | ||
53 | |||
54 | mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address); | ||
55 | |||
56 | /* The best way to deal with signals in this path is | ||
57 | * to block them upfront, rather than allowing the | ||
58 | * locking paths to return -ERESTARTSYS. */ | ||
59 | sigfillset(&blocked); | ||
60 | |||
61 | /* We should technically never get an error back | ||
62 | * from sigprocmask */ | ||
63 | ret = sigprocmask(SIG_BLOCK, &blocked, &oldset); | ||
64 | if (ret < 0) { | ||
65 | mlog_errno(ret); | ||
66 | goto out; | ||
67 | } | ||
68 | |||
69 | page = filemap_nopage(area, address, type); | ||
70 | |||
71 | ret = sigprocmask(SIG_SETMASK, &oldset, NULL); | ||
72 | if (ret < 0) | ||
73 | mlog_errno(ret); | ||
74 | out: | ||
75 | mlog_exit_ptr(page); | ||
76 | return page; | ||
77 | } | ||
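
The block-all-signals/restore dance above is plain sigprocmask() usage; a runnable userspace demonstration of the same pattern:

#include <stdio.h>
#include <signal.h>

int main(void)
{
	sigset_t blocked, oldset;

	/* Fill a set with every signal and swap it in, saving the
	 * caller's mask.  In the kernel path this keeps the cluster
	 * locking under filemap_nopage() from seeing -ERESTARTSYS. */
	sigfillset(&blocked);
	if (sigprocmask(SIG_BLOCK, &blocked, &oldset) < 0) {
		perror("sigprocmask");
		return 1;
	}

	/* ...critical region: signals are held pending here... */
	printf("signals blocked\n");

	/* Restore the original mask; pending signals get delivered. */
	if (sigprocmask(SIG_SETMASK, &oldset, NULL) < 0) {
		perror("sigprocmask");
		return 1;
	}
	printf("signals restored\n");
	return 0;
}
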
78 | |||
79 | static struct vm_operations_struct ocfs2_file_vm_ops = { | ||
80 | .nopage = ocfs2_nopage, | ||
81 | }; | ||
82 | |||
83 | int ocfs2_mmap(struct file *file, | ||
84 | struct vm_area_struct *vma) | ||
85 | { | ||
86 | struct address_space *mapping = file->f_dentry->d_inode->i_mapping; | ||
87 | struct inode *inode = mapping->host; | ||
88 | |||
89 | /* We don't want to support shared writable mappings yet. */ | ||
90 | if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) | ||
91 | && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { | ||
92 | mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); | ||
93 | /* This is -EINVAL because generic_file_readonly_mmap | ||
94 | * returns it in a similar situation. */ | ||
95 | return -EINVAL; | ||
96 | } | ||
97 | |||
98 | update_atime(inode); | ||
99 | vma->vm_ops = &ocfs2_file_vm_ops; | ||
100 | return 0; | ||
101 | } | ||
102 | |||
diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h new file mode 100644 index 000000000000..1274ee0f1fe2 --- /dev/null +++ b/fs/ocfs2/mmap.h | |||
@@ -0,0 +1,6 @@ | |||
1 | #ifndef OCFS2_MMAP_H | ||
2 | #define OCFS2_MMAP_H | ||
3 | |||
4 | int ocfs2_mmap(struct file *file, struct vm_area_struct *vma); | ||
5 | |||
6 | #endif /* OCFS2_MMAP_H */ | ||
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c new file mode 100644 index 000000000000..f6b77ff1d2bf --- /dev/null +++ b/fs/ocfs2/namei.c | |||
@@ -0,0 +1,2264 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * namei.c | ||
5 | * | ||
6 | * Create and rename file, directory, symlinks | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * Portions of this code from linux/fs/ext3/dir.c | ||
11 | * | ||
12 | * Copyright (C) 1992, 1993, 1994, 1995 | ||
13 | * Remy Card (card@masi.ibp.fr) | ||
14 | * Laboratoire MASI - Institut Blaise Pascal | ||
15 | * Universite Pierre et Marie Curie (Paris VI) | ||
16 | * | ||
17 | * from | ||
18 | * | ||
19 | * linux/fs/minix/dir.c | ||
20 | * | ||
21 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
22 | * | ||
23 | * This program is free software; you can redistribute it and/or | ||
24 | * modify it under the terms of the GNU General Public | ||
25 | * License as published by the Free Software Foundation; either | ||
26 | * version 2 of the License, or (at your option) any later version. | ||
27 | * | ||
28 | * This program is distributed in the hope that it will be useful, | ||
29 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
30 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
31 | * General Public License for more details. | ||
32 | * | ||
33 | * You should have received a copy of the GNU General Public | ||
34 | * License along with this program; if not, write to the | ||
35 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
36 | * Boston, MA 021110-1307, USA. | ||
37 | */ | ||
38 | |||
39 | #include <linux/fs.h> | ||
40 | #include <linux/types.h> | ||
41 | #include <linux/slab.h> | ||
42 | #include <linux/highmem.h> | ||
43 | |||
44 | #define MLOG_MASK_PREFIX ML_NAMEI | ||
45 | #include <cluster/masklog.h> | ||
46 | |||
47 | #include "ocfs2.h" | ||
48 | |||
49 | #include "alloc.h" | ||
50 | #include "dcache.h" | ||
51 | #include "dir.h" | ||
52 | #include "dlmglue.h" | ||
53 | #include "extent_map.h" | ||
54 | #include "file.h" | ||
55 | #include "inode.h" | ||
56 | #include "journal.h" | ||
57 | #include "namei.h" | ||
58 | #include "suballoc.h" | ||
59 | #include "symlink.h" | ||
60 | #include "sysfile.h" | ||
61 | #include "uptodate.h" | ||
62 | #include "vote.h" | ||
63 | |||
64 | #include "buffer_head_io.h" | ||
65 | |||
66 | #define NAMEI_RA_CHUNKS 2 | ||
67 | #define NAMEI_RA_BLOCKS 4 | ||
68 | #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) | ||
69 | #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) | ||
70 | |||
71 | static int inline ocfs2_search_dirblock(struct buffer_head *bh, | ||
72 | struct inode *dir, | ||
73 | const char *name, int namelen, | ||
74 | unsigned long offset, | ||
75 | struct ocfs2_dir_entry **res_dir); | ||
76 | |||
77 | static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle, | ||
78 | struct inode *dir, | ||
79 | struct ocfs2_dir_entry *de_del, | ||
80 | struct buffer_head *bh); | ||
81 | |||
82 | static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle, | ||
83 | struct inode *dir, | ||
84 | const char *name, int namelen, | ||
85 | struct inode *inode, u64 blkno, | ||
86 | struct buffer_head *parent_fe_bh, | ||
87 | struct buffer_head *insert_bh); | ||
88 | |||
89 | static int ocfs2_mknod_locked(struct ocfs2_super *osb, | ||
90 | struct inode *dir, | ||
91 | struct dentry *dentry, int mode, | ||
92 | dev_t dev, | ||
93 | struct buffer_head **new_fe_bh, | ||
94 | struct buffer_head *parent_fe_bh, | ||
95 | struct ocfs2_journal_handle *handle, | ||
96 | struct inode **ret_inode, | ||
97 | struct ocfs2_alloc_context *inode_ac); | ||
98 | |||
99 | static int ocfs2_fill_new_dir(struct ocfs2_super *osb, | ||
100 | struct ocfs2_journal_handle *handle, | ||
101 | struct inode *parent, | ||
102 | struct inode *inode, | ||
103 | struct buffer_head *fe_bh, | ||
104 | struct ocfs2_alloc_context *data_ac); | ||
105 | |||
106 | static int ocfs2_double_lock(struct ocfs2_super *osb, | ||
107 | struct ocfs2_journal_handle *handle, | ||
108 | struct buffer_head **bh1, | ||
109 | struct inode *inode1, | ||
110 | struct buffer_head **bh2, | ||
111 | struct inode *inode2); | ||
112 | |||
113 | static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, | ||
114 | struct ocfs2_journal_handle *handle, | ||
115 | struct inode *inode, | ||
116 | char *name, | ||
117 | struct buffer_head **de_bh); | ||
118 | |||
119 | static int ocfs2_orphan_add(struct ocfs2_super *osb, | ||
120 | struct ocfs2_journal_handle *handle, | ||
121 | struct inode *inode, | ||
122 | struct ocfs2_dinode *fe, | ||
123 | char *name, | ||
124 | struct buffer_head *de_bh); | ||
125 | |||
126 | static int ocfs2_create_symlink_data(struct ocfs2_super *osb, | ||
127 | struct ocfs2_journal_handle *handle, | ||
128 | struct inode *inode, | ||
129 | const char *symname); | ||
130 | |||
131 | static inline int ocfs2_add_entry(struct ocfs2_journal_handle *handle, | ||
132 | struct dentry *dentry, | ||
133 | struct inode *inode, u64 blkno, | ||
134 | struct buffer_head *parent_fe_bh, | ||
135 | struct buffer_head *insert_bh) | ||
136 | { | ||
137 | return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, | ||
138 | dentry->d_name.name, dentry->d_name.len, | ||
139 | inode, blkno, parent_fe_bh, insert_bh); | ||
140 | } | ||
141 | |||
142 | /* An orphan dir name is an 8-byte value, printed as a hex string */ | ||
143 | #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) | ||
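
So an orphan name is 2 * sizeof(u64) = 16 hex characters. A hypothetical illustration of the encoding (the kernel's own formatting call may differ):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t blkno = 513029;	/* the orphaned inode's block number */
	char name[16 + 1];

	/* Zero-padded hex gives a fixed 16-character name. */
	snprintf(name, sizeof(name), "%016llx",
		 (unsigned long long)blkno);
	printf("orphan name: %s (%zu chars)\n", name, sizeof(name) - 1);
	return 0;
}
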
144 | |||
145 | static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, | ||
146 | struct nameidata *nd) | ||
147 | { | ||
148 | int status; | ||
149 | u64 blkno; | ||
150 | struct buffer_head *dirent_bh = NULL; | ||
151 | struct inode *inode = NULL; | ||
152 | struct dentry *ret; | ||
153 | struct ocfs2_dir_entry *dirent; | ||
154 | struct ocfs2_inode_info *oi; | ||
155 | |||
156 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, | ||
157 | dentry->d_name.len, dentry->d_name.name); | ||
158 | |||
159 | if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) { | ||
160 | ret = ERR_PTR(-ENAMETOOLONG); | ||
161 | goto bail; | ||
162 | } | ||
163 | |||
164 | mlog(0, "find name %.*s in directory %"MLFu64"\n", dentry->d_name.len, | ||
165 | dentry->d_name.name, OCFS2_I(dir)->ip_blkno); | ||
166 | |||
167 | status = ocfs2_meta_lock(dir, NULL, NULL, 0); | ||
168 | if (status < 0) { | ||
169 | if (status != -ENOENT) | ||
170 | mlog_errno(status); | ||
171 | ret = ERR_PTR(status); | ||
172 | goto bail; | ||
173 | } | ||
174 | |||
175 | status = ocfs2_find_files_on_disk(dentry->d_name.name, | ||
176 | dentry->d_name.len, &blkno, | ||
177 | dir, &dirent_bh, &dirent); | ||
178 | if (status < 0) | ||
179 | goto bail_add; | ||
180 | |||
181 | inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno); | ||
182 | if (IS_ERR(inode)) { | ||
183 | mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno); | ||
184 | ret = ERR_PTR(-EACCES); | ||
185 | goto bail_unlock; | ||
186 | } | ||
187 | |||
188 | oi = OCFS2_I(inode); | ||
189 | /* Clear any orphaned state... If we were able to look up the | ||
190 | * inode from a directory, it certainly can't be orphaned. We | ||
191 | * might have the bad state from a node which intended to | ||
192 | * orphan this inode but crashed before it could commit the | ||
193 | * unlink. */ | ||
194 | spin_lock(&oi->ip_lock); | ||
195 | oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; | ||
196 | oi->ip_orphaned_slot = OCFS2_INVALID_SLOT; | ||
197 | spin_unlock(&oi->ip_lock); | ||
198 | |||
199 | bail_add: | ||
200 | |||
201 | dentry->d_op = &ocfs2_dentry_ops; | ||
202 | ret = d_splice_alias(inode, dentry); | ||
203 | |||
204 | bail_unlock: | ||
205 | /* Don't drop the cluster lock until *after* the d_add -- | ||
206 | * an unlink on another node will message us to remove that | ||
207 | * dentry under this lock; otherwise we could race with the | ||
208 | * vote thread and end up with a stale dentry. */ | ||
209 | ocfs2_meta_unlock(dir, 0); | ||
210 | |||
211 | bail: | ||
212 | if (dirent_bh) | ||
213 | brelse(dirent_bh); | ||
214 | |||
215 | mlog_exit_ptr(ret); | ||
216 | |||
217 | return ret; | ||
218 | } | ||
219 | |||
220 | static int ocfs2_fill_new_dir(struct ocfs2_super *osb, | ||
221 | struct ocfs2_journal_handle *handle, | ||
222 | struct inode *parent, | ||
223 | struct inode *inode, | ||
224 | struct buffer_head *fe_bh, | ||
225 | struct ocfs2_alloc_context *data_ac) | ||
226 | { | ||
227 | int status; | ||
228 | struct buffer_head *new_bh = NULL; | ||
229 | struct ocfs2_dir_entry *de = NULL; | ||
230 | |||
231 | mlog_entry_void(); | ||
232 | |||
233 | status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, | ||
234 | data_ac, NULL, &new_bh); | ||
235 | if (status < 0) { | ||
236 | mlog_errno(status); | ||
237 | goto bail; | ||
238 | } | ||
239 | |||
240 | ocfs2_set_new_buffer_uptodate(inode, new_bh); | ||
241 | |||
242 | status = ocfs2_journal_access(handle, inode, new_bh, | ||
243 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
244 | if (status < 0) { | ||
245 | mlog_errno(status); | ||
246 | goto bail; | ||
247 | } | ||
248 | memset(new_bh->b_data, 0, osb->sb->s_blocksize); | ||
249 | |||
250 | de = (struct ocfs2_dir_entry *) new_bh->b_data; | ||
251 | de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); | ||
252 | de->name_len = 1; | ||
253 | de->rec_len = | ||
254 | cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); | ||
255 | strcpy(de->name, "."); | ||
256 | ocfs2_set_de_type(de, S_IFDIR); | ||
257 | de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len)); | ||
258 | de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno); | ||
259 | de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize - | ||
260 | OCFS2_DIR_REC_LEN(1)); | ||
261 | de->name_len = 2; | ||
262 | strcpy(de->name, ".."); | ||
263 | ocfs2_set_de_type(de, S_IFDIR); | ||
264 | |||
265 | status = ocfs2_journal_dirty(handle, new_bh); | ||
266 | if (status < 0) { | ||
267 | mlog_errno(status); | ||
268 | goto bail; | ||
269 | } | ||
270 | |||
271 | i_size_write(inode, inode->i_sb->s_blocksize); | ||
272 | inode->i_nlink = 2; | ||
273 | inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize); | ||
274 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | ||
275 | if (status < 0) { | ||
276 | mlog_errno(status); | ||
277 | goto bail; | ||
278 | } | ||
279 | |||
280 | status = 0; | ||
281 | bail: | ||
282 | if (new_bh) | ||
283 | brelse(new_bh); | ||
284 | |||
285 | mlog_exit(status); | ||
286 | return status; | ||
287 | } | ||
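
The "." and ".." record lengths follow the usual ext2-style rule: round the 12-byte dirent header plus the name up to 4 bytes, and let the last entry absorb the remainder of the block. A sketch under that assumption (the macro body here is our guess at OCFS2_DIR_REC_LEN, not a quote of the header):

#include <stdio.h>

/* 12 bytes of fixed header (inode, rec_len, name_len, file_type)
 * plus the name, rounded up to a 4-byte boundary -- assumed to
 * match OCFS2_DIR_REC_LEN. */
#define DIR_REC_LEN(name_len)	(((name_len) + 12 + 3) & ~3)

int main(void)
{
	unsigned blocksize = 4096;
	unsigned dot = DIR_REC_LEN(1);		/* "."  -> 16 bytes */
	unsigned dotdot = blocksize - dot;	/* ".." eats the rest */

	printf("\".\"  entry: rec_len %u\n", dot);
	printf("\"..\" entry: rec_len %u (covers the rest of the block)\n",
	       dotdot);
	return 0;
}
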
288 | |||
289 | static int ocfs2_mknod(struct inode *dir, | ||
290 | struct dentry *dentry, | ||
291 | int mode, | ||
292 | dev_t dev) | ||
293 | { | ||
294 | int status = 0; | ||
295 | struct buffer_head *parent_fe_bh = NULL; | ||
296 | struct ocfs2_journal_handle *handle = NULL; | ||
297 | struct ocfs2_super *osb; | ||
298 | struct ocfs2_dinode *dirfe; | ||
299 | struct buffer_head *new_fe_bh = NULL; | ||
300 | struct buffer_head *de_bh = NULL; | ||
301 | struct inode *inode = NULL; | ||
302 | struct ocfs2_alloc_context *inode_ac = NULL; | ||
303 | struct ocfs2_alloc_context *data_ac = NULL; | ||
304 | |||
305 | mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, | ||
306 | (unsigned long)dev, dentry->d_name.len, | ||
307 | dentry->d_name.name); | ||
308 | |||
309 | /* get our super block */ | ||
310 | osb = OCFS2_SB(dir->i_sb); | ||
311 | |||
312 | if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) { | ||
313 | mlog(ML_ERROR, "inode %"MLFu64" has i_nlink of %u\n", | ||
314 | OCFS2_I(dir)->ip_blkno, dir->i_nlink); | ||
315 | status = -EMLINK; | ||
316 | goto leave; | ||
317 | } | ||
318 | |||
319 | handle = ocfs2_alloc_handle(osb); | ||
320 | if (handle == NULL) { | ||
321 | status = -ENOMEM; | ||
322 | mlog_errno(status); | ||
323 | goto leave; | ||
324 | } | ||
325 | |||
326 | status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1); | ||
327 | if (status < 0) { | ||
328 | if (status != -ENOENT) | ||
329 | mlog_errno(status); | ||
330 | goto leave; | ||
331 | } | ||
332 | |||
333 | dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; | ||
334 | if (!dirfe->i_links_count) { | ||
335 | /* can't make a file in a deleted directory. */ | ||
336 | status = -ENOENT; | ||
337 | goto leave; | ||
338 | } | ||
339 | |||
340 | status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, | ||
341 | dentry->d_name.len); | ||
342 | if (status) | ||
343 | goto leave; | ||
344 | |||
345 | /* get a spot inside the dir. */ | ||
346 | status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, | ||
347 | dentry->d_name.name, | ||
348 | dentry->d_name.len, &de_bh); | ||
349 | if (status < 0) { | ||
350 | mlog_errno(status); | ||
351 | goto leave; | ||
352 | } | ||
353 | |||
354 | /* reserve an inode spot */ | ||
355 | status = ocfs2_reserve_new_inode(osb, handle, &inode_ac); | ||
356 | if (status < 0) { | ||
357 | if (status != -ENOSPC) | ||
358 | mlog_errno(status); | ||
359 | goto leave; | ||
360 | } | ||
361 | |||
362 | /* are we making a directory? If so, reserve a cluster for its | ||
363 | * first extent. */ | ||
364 | if (S_ISDIR(mode)) { | ||
365 | status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac); | ||
366 | if (status < 0) { | ||
367 | if (status != -ENOSPC) | ||
368 | mlog_errno(status); | ||
369 | goto leave; | ||
370 | } | ||
371 | } | ||
372 | |||
373 | handle = ocfs2_start_trans(osb, handle, OCFS2_MKNOD_CREDITS); | ||
374 | if (IS_ERR(handle)) { | ||
375 | status = PTR_ERR(handle); | ||
376 | handle = NULL; | ||
377 | mlog_errno(status); | ||
378 | goto leave; | ||
379 | } | ||
380 | |||
381 | /* do the real work now. */ | ||
382 | status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev, | ||
383 | &new_fe_bh, parent_fe_bh, handle, | ||
384 | &inode, inode_ac); | ||
385 | if (status < 0) { | ||
386 | mlog_errno(status); | ||
387 | goto leave; | ||
388 | } | ||
389 | |||
390 | if (S_ISDIR(mode)) { | ||
391 | status = ocfs2_fill_new_dir(osb, handle, dir, inode, | ||
392 | new_fe_bh, data_ac); | ||
393 | if (status < 0) { | ||
394 | mlog_errno(status); | ||
395 | goto leave; | ||
396 | } | ||
397 | |||
398 | status = ocfs2_journal_access(handle, dir, parent_fe_bh, | ||
399 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
400 | if (status < 0) { | ||
401 | mlog_errno(status); | ||
402 | goto leave; | ||
403 | } | ||
404 | le16_add_cpu(&dirfe->i_links_count, 1); | ||
405 | status = ocfs2_journal_dirty(handle, parent_fe_bh); | ||
406 | if (status < 0) { | ||
407 | mlog_errno(status); | ||
408 | goto leave; | ||
409 | } | ||
410 | dir->i_nlink++; | ||
411 | } | ||
412 | |||
413 | status = ocfs2_add_entry(handle, dentry, inode, | ||
414 | OCFS2_I(inode)->ip_blkno, parent_fe_bh, | ||
415 | de_bh); | ||
416 | if (status < 0) { | ||
417 | mlog_errno(status); | ||
418 | goto leave; | ||
419 | } | ||
420 | |||
421 | insert_inode_hash(inode); | ||
422 | dentry->d_op = &ocfs2_dentry_ops; | ||
423 | d_instantiate(dentry, inode); | ||
424 | status = 0; | ||
425 | leave: | ||
426 | if (handle) | ||
427 | ocfs2_commit_trans(handle); | ||
428 | |||
429 | if (status == -ENOSPC) | ||
430 | mlog(0, "Disk is full\n"); | ||
431 | |||
432 | if (new_fe_bh) | ||
433 | brelse(new_fe_bh); | ||
434 | |||
435 | if (de_bh) | ||
436 | brelse(de_bh); | ||
437 | |||
438 | if (parent_fe_bh) | ||
439 | brelse(parent_fe_bh); | ||
440 | |||
441 | if ((status < 0) && inode) | ||
442 | iput(inode); | ||
443 | |||
444 | if (inode_ac) | ||
445 | ocfs2_free_alloc_context(inode_ac); | ||
446 | |||
447 | if (data_ac) | ||
448 | ocfs2_free_alloc_context(data_ac); | ||
449 | |||
450 | mlog_exit(status); | ||
451 | |||
452 | return status; | ||
453 | } | ||
454 | |||
455 | static int ocfs2_mknod_locked(struct ocfs2_super *osb, | ||
456 | struct inode *dir, | ||
457 | struct dentry *dentry, int mode, | ||
458 | dev_t dev, | ||
459 | struct buffer_head **new_fe_bh, | ||
460 | struct buffer_head *parent_fe_bh, | ||
461 | struct ocfs2_journal_handle *handle, | ||
462 | struct inode **ret_inode, | ||
463 | struct ocfs2_alloc_context *inode_ac) | ||
464 | { | ||
465 | int status = 0; | ||
466 | struct ocfs2_dinode *fe = NULL; | ||
467 | struct ocfs2_extent_list *fel; | ||
468 | u64 fe_blkno = 0; | ||
469 | u16 suballoc_bit; | ||
470 | struct inode *inode = NULL; | ||
471 | |||
472 | mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, | ||
473 | (unsigned long)dev, dentry->d_name.len, | ||
474 | dentry->d_name.name); | ||
475 | |||
476 | *new_fe_bh = NULL; | ||
477 | *ret_inode = NULL; | ||
478 | |||
479 | status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, | ||
480 | &fe_blkno); | ||
481 | if (status < 0) { | ||
482 | mlog_errno(status); | ||
483 | goto leave; | ||
484 | } | ||
485 | |||
486 | inode = new_inode(dir->i_sb); | ||
487 | if (!inode) { | ||
488 | status = -ENOMEM; | ||
489 | mlog(ML_ERROR, "new_inode failed!\n"); | ||
490 | goto leave; | ||
491 | } | ||
492 | |||
493 | /* populate as many fields early on as possible - many of | ||
494 | * these are used by the support functions here and in | ||
495 | * callers. */ | ||
496 | inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); | ||
497 | OCFS2_I(inode)->ip_blkno = fe_blkno; | ||
498 | if (S_ISDIR(mode)) | ||
499 | inode->i_nlink = 2; | ||
500 | else | ||
501 | inode->i_nlink = 1; | ||
502 | inode->i_mode = mode; | ||
503 | spin_lock(&osb->osb_lock); | ||
504 | inode->i_generation = osb->s_next_generation++; | ||
505 | spin_unlock(&osb->osb_lock); | ||
506 | |||
507 | *new_fe_bh = sb_getblk(osb->sb, fe_blkno); | ||
508 | if (!*new_fe_bh) { | ||
509 | status = -EIO; | ||
510 | mlog_errno(status); | ||
511 | goto leave; | ||
512 | } | ||
513 | ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); | ||
514 | |||
515 | status = ocfs2_journal_access(handle, inode, *new_fe_bh, | ||
516 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
517 | if (status < 0) { | ||
518 | mlog_errno(status); | ||
519 | goto leave; | ||
520 | } | ||
521 | |||
522 | fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data; | ||
523 | memset(fe, 0, osb->sb->s_blocksize); | ||
524 | |||
525 | fe->i_generation = cpu_to_le32(inode->i_generation); | ||
526 | fe->i_fs_generation = cpu_to_le32(osb->fs_generation); | ||
527 | fe->i_blkno = cpu_to_le64(fe_blkno); | ||
528 | fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); | ||
529 | fe->i_suballoc_slot = cpu_to_le16(osb->slot_num); | ||
530 | fe->i_uid = cpu_to_le32(current->fsuid); | ||
531 | if (dir->i_mode & S_ISGID) { | ||
532 | fe->i_gid = cpu_to_le32(dir->i_gid); | ||
533 | if (S_ISDIR(mode)) | ||
534 | mode |= S_ISGID; | ||
535 | } else | ||
536 | fe->i_gid = cpu_to_le32(current->fsgid); | ||
537 | fe->i_mode = cpu_to_le16(mode); | ||
538 | if (S_ISCHR(mode) || S_ISBLK(mode)) | ||
539 | fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); | ||
540 | |||
541 | fe->i_links_count = cpu_to_le16(inode->i_nlink); | ||
542 | |||
543 | fe->i_last_eb_blk = 0; | ||
544 | strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); | ||
545 | le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL); | ||
546 | fe->i_atime = fe->i_ctime = fe->i_mtime = | ||
547 | cpu_to_le64(CURRENT_TIME.tv_sec); | ||
548 | fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = | ||
549 | cpu_to_le32(CURRENT_TIME.tv_nsec); | ||
550 | fe->i_dtime = 0; | ||
551 | |||
552 | fel = &fe->id2.i_list; | ||
553 | fel->l_tree_depth = 0; | ||
554 | fel->l_next_free_rec = 0; | ||
555 | fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); | ||
556 | |||
557 | status = ocfs2_journal_dirty(handle, *new_fe_bh); | ||
558 | if (status < 0) { | ||
559 | mlog_errno(status); | ||
560 | goto leave; | ||
561 | } | ||
562 | |||
563 | if (ocfs2_populate_inode(inode, fe, 1) < 0) { | ||
564 | mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, " | ||
565 | "i_blkno=%"MLFu64", i_ino=%lu\n", | ||
566 | (unsigned long long) (*new_fe_bh)->b_blocknr, | ||
567 | fe->i_blkno, inode->i_ino); | ||
568 | BUG(); | ||
569 | } | ||
570 | |||
571 | ocfs2_inode_set_new(osb, inode); | ||
572 | status = ocfs2_create_new_inode_locks(inode); | ||
573 | if (status < 0) | ||
574 | mlog_errno(status); | ||
575 | |||
576 | status = 0; /* error in ocfs2_create_new_inode_locks is not | ||
577 | * critical */ | ||
578 | |||
579 | *ret_inode = inode; | ||
580 | leave: | ||
581 | if (status < 0) { | ||
582 | if (*new_fe_bh) { | ||
583 | brelse(*new_fe_bh); | ||
584 | *new_fe_bh = NULL; | ||
585 | } | ||
586 | if (inode) | ||
587 | iput(inode); | ||
588 | } | ||
589 | |||
590 | mlog_exit(status); | ||
591 | return status; | ||
592 | } | ||
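The uid/gid assignment above follows the BSD setgid-directory rule: a new inode takes the creator's fsgid unless the parent directory carries S_ISGID, in which case it inherits the directory's group, and a new subdirectory keeps the setgid bit itself. A minimal userspace model of just that decision (the struct and function names are illustrative, not ocfs2 API):

    #include <sys/stat.h>
    #include <sys/types.h>

    /* Illustrative only: a bag for the two fields the rule produces. */
    struct new_ids {
            gid_t  gid;
            mode_t mode;
    };

    /* The setgid-directory inheritance rule used in the allocator above. */
    static void inherit_ids(struct new_ids *out, mode_t req_mode,
                            gid_t dir_gid, mode_t dir_mode, gid_t fsgid)
    {
            out->mode = req_mode;
            if (dir_mode & S_ISGID) {
                    out->gid = dir_gid;           /* take the directory's group */
                    if (S_ISDIR(req_mode))
                            out->mode |= S_ISGID; /* subdirectories stay setgid */
            } else {
                    out->gid = fsgid;             /* creator's filesystem gid */
            }
    }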
593 | |||
594 | static int ocfs2_mkdir(struct inode *dir, | ||
595 | struct dentry *dentry, | ||
596 | int mode) | ||
597 | { | ||
598 | int ret; | ||
599 | |||
600 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, | ||
601 | dentry->d_name.len, dentry->d_name.name); | ||
602 | ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0); | ||
603 | mlog_exit(ret); | ||
604 | |||
605 | return ret; | ||
606 | } | ||
607 | |||
608 | static int ocfs2_create(struct inode *dir, | ||
609 | struct dentry *dentry, | ||
610 | int mode, | ||
611 | struct nameidata *nd) | ||
612 | { | ||
613 | int ret; | ||
614 | |||
615 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, | ||
616 | dentry->d_name.len, dentry->d_name.name); | ||
617 | ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0); | ||
618 | mlog_exit(ret); | ||
619 | |||
620 | return ret; | ||
621 | } | ||
622 | |||
623 | static int ocfs2_link(struct dentry *old_dentry, | ||
624 | struct inode *dir, | ||
625 | struct dentry *dentry) | ||
626 | { | ||
627 | struct ocfs2_journal_handle *handle = NULL; | ||
628 | struct inode *inode = old_dentry->d_inode; | ||
629 | int err; | ||
630 | struct buffer_head *fe_bh = NULL; | ||
631 | struct buffer_head *parent_fe_bh = NULL; | ||
632 | struct buffer_head *de_bh = NULL; | ||
633 | struct ocfs2_dinode *fe = NULL; | ||
634 | struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); | ||
635 | |||
636 | mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, | ||
637 | old_dentry->d_name.len, old_dentry->d_name.name, | ||
638 | dentry->d_name.len, dentry->d_name.name); | ||
639 | |||
640 | if (S_ISDIR(inode->i_mode)) { | ||
641 | err = -EPERM; | ||
642 | goto bail; | ||
643 | } | ||
644 | |||
645 | if (inode->i_nlink >= OCFS2_LINK_MAX) { | ||
646 | err = -EMLINK; | ||
647 | goto bail; | ||
648 | } | ||
649 | |||
650 | handle = ocfs2_alloc_handle(osb); | ||
651 | if (handle == NULL) { | ||
652 | err = -ENOMEM; | ||
653 | goto bail; | ||
654 | } | ||
655 | |||
656 | err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1); | ||
657 | if (err < 0) { | ||
658 | if (err != -ENOENT) | ||
659 | mlog_errno(err); | ||
660 | goto bail; | ||
661 | } | ||
662 | |||
663 | err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, | ||
664 | dentry->d_name.len); | ||
665 | if (err) | ||
666 | goto bail; | ||
667 | |||
668 | err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, | ||
669 | dentry->d_name.name, | ||
670 | dentry->d_name.len, &de_bh); | ||
671 | if (err < 0) { | ||
672 | mlog_errno(err); | ||
673 | goto bail; | ||
674 | } | ||
675 | |||
676 | err = ocfs2_meta_lock(inode, handle, &fe_bh, 1); | ||
677 | if (err < 0) { | ||
678 | if (err != -ENOENT) | ||
679 | mlog_errno(err); | ||
680 | goto bail; | ||
681 | } | ||
682 | |||
683 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
684 | if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) { | ||
685 | err = -EMLINK; | ||
686 | goto bail; | ||
687 | } | ||
688 | |||
689 | handle = ocfs2_start_trans(osb, handle, OCFS2_LINK_CREDITS); | ||
690 | if (IS_ERR(handle)) { | ||
691 | err = PTR_ERR(handle); | ||
692 | handle = NULL; | ||
693 | mlog_errno(err); | ||
694 | goto bail; | ||
695 | } | ||
696 | |||
697 | err = ocfs2_journal_access(handle, inode, fe_bh, | ||
698 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
699 | if (err < 0) { | ||
700 | mlog_errno(err); | ||
701 | goto bail; | ||
702 | } | ||
703 | |||
704 | inode->i_nlink++; | ||
705 | inode->i_ctime = CURRENT_TIME; | ||
706 | fe->i_links_count = cpu_to_le16(inode->i_nlink); | ||
707 | fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); | ||
708 | fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); | ||
709 | |||
710 | err = ocfs2_journal_dirty(handle, fe_bh); | ||
711 | if (err < 0) { | ||
712 | le16_add_cpu(&fe->i_links_count, -1); | ||
713 | inode->i_nlink--; | ||
714 | mlog_errno(err); | ||
715 | goto bail; | ||
716 | } | ||
717 | |||
718 | err = ocfs2_add_entry(handle, dentry, inode, | ||
719 | OCFS2_I(inode)->ip_blkno, | ||
720 | parent_fe_bh, de_bh); | ||
721 | if (err) { | ||
722 | le16_add_cpu(&fe->i_links_count, -1); | ||
723 | inode->i_nlink--; | ||
724 | mlog_errno(err); | ||
725 | goto bail; | ||
726 | } | ||
727 | |||
728 | atomic_inc(&inode->i_count); | ||
729 | dentry->d_op = &ocfs2_dentry_ops; | ||
730 | d_instantiate(dentry, inode); | ||
731 | bail: | ||
732 | if (handle) | ||
733 | ocfs2_commit_trans(handle); | ||
734 | if (de_bh) | ||
735 | brelse(de_bh); | ||
736 | if (fe_bh) | ||
737 | brelse(fe_bh); | ||
738 | if (parent_fe_bh) | ||
739 | brelse(parent_fe_bh); | ||
740 | |||
741 | mlog_exit(err); | ||
742 | |||
743 | return err; | ||
744 | } | ||
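ocfs2_link() shows the journaling discipline used throughout this file: declare write intent with ocfs2_journal_access(), update the in-memory inode and the buffered disk copy together, then ocfs2_journal_dirty() the buffer, undoing the in-memory change on failure. A condensed sketch with stand-in types (nothing below is the real ocfs2 or JBD API):

    #include <stdint.h>

    /* Stand-ins for the real ocfs2/JBD types and calls. */
    struct buf { uint16_t links_count; /* little-endian on disk */ };
    struct txn { int unused; };
    static int journal_access(struct txn *t, struct buf *b) { (void)t; (void)b; return 0; }
    static int journal_dirty(struct txn *t, struct buf *b)  { (void)t; (void)b; return 0; }

    /* access -> modify -> dirty, with rollback, as in ocfs2_link(). */
    static int bump_nlink(unsigned int *nlink, struct buf *bh, struct txn *handle)
    {
            int err = journal_access(handle, bh); /* declare write intent first */
            if (err < 0)
                    return err;

            (*nlink)++;                           /* in-memory counter ...       */
            bh->links_count = (uint16_t)*nlink;   /* ... mirrored into the buffer
                                                     (cpu_to_le16 in the kernel) */

            err = journal_dirty(handle, bh);      /* hand the buffer to the log  */
            if (err < 0)
                    (*nlink)--;                   /* change never logged: undo   */
            return err;
    }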
745 | |||
746 | static int ocfs2_unlink(struct inode *dir, | ||
747 | struct dentry *dentry) | ||
748 | { | ||
749 | int status; | ||
750 | unsigned int saved_nlink = 0; | ||
751 | struct inode *inode = dentry->d_inode; | ||
752 | struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); | ||
753 | u64 blkno; | ||
754 | struct ocfs2_dinode *fe = NULL; | ||
755 | struct buffer_head *fe_bh = NULL; | ||
756 | struct buffer_head *parent_node_bh = NULL; | ||
757 | struct ocfs2_journal_handle *handle = NULL; | ||
758 | struct ocfs2_dir_entry *dirent = NULL; | ||
759 | struct buffer_head *dirent_bh = NULL; | ||
760 | char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; | ||
761 | struct buffer_head *orphan_entry_bh = NULL; | ||
762 | |||
763 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, | ||
764 | dentry->d_name.len, dentry->d_name.name); | ||
765 | |||
766 | BUG_ON(dentry->d_parent->d_inode != dir); | ||
767 | |||
768 | mlog(0, "ino = %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); | ||
769 | |||
770 | if (inode == osb->root_inode) { | ||
771 | mlog(0, "Cannot delete the root directory\n"); | ||
772 | status = -EPERM; | ||
773 | goto leave; | ||
774 | } | ||
775 | |||
776 | handle = ocfs2_alloc_handle(osb); | ||
777 | if (handle == NULL) { | ||
778 | status = -ENOMEM; | ||
779 | mlog_errno(status); | ||
780 | goto leave; | ||
781 | } | ||
782 | |||
783 | status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1); | ||
784 | if (status < 0) { | ||
785 | if (status != -ENOENT) | ||
786 | mlog_errno(status); | ||
787 | goto leave; | ||
788 | } | ||
789 | |||
790 | status = ocfs2_find_files_on_disk(dentry->d_name.name, | ||
791 | dentry->d_name.len, &blkno, | ||
792 | dir, &dirent_bh, &dirent); | ||
793 | if (status < 0) { | ||
794 | if (status != -ENOENT) | ||
795 | mlog_errno(status); | ||
796 | goto leave; | ||
797 | } | ||
798 | |||
799 | if (OCFS2_I(inode)->ip_blkno != blkno) { | ||
800 | status = -ENOENT; | ||
801 | |||
802 | mlog(0, "ip_blkno (%"MLFu64") != dirent blkno (%"MLFu64") " | ||
803 | "ip_flags = %x\n", OCFS2_I(inode)->ip_blkno, blkno, | ||
804 | OCFS2_I(inode)->ip_flags); | ||
805 | goto leave; | ||
806 | } | ||
807 | |||
808 | status = ocfs2_meta_lock(inode, handle, &fe_bh, 1); | ||
809 | if (status < 0) { | ||
810 | if (status != -ENOENT) | ||
811 | mlog_errno(status); | ||
812 | goto leave; | ||
813 | } | ||
814 | |||
815 | if (S_ISDIR(inode->i_mode)) { | ||
816 | if (!ocfs2_empty_dir(inode)) { | ||
817 | status = -ENOTEMPTY; | ||
818 | goto leave; | ||
819 | } else if (inode->i_nlink != 2) { | ||
820 | status = -ENOTEMPTY; | ||
821 | goto leave; | ||
822 | } | ||
823 | } | ||
824 | |||
825 | /* There are still a few steps left until we can consider the | ||
826 | * unlink to have succeeded. Save off nlink here before | ||
827 | * modification so we can set it back in case we hit an issue | ||
828 | * before commit. */ | ||
829 | saved_nlink = inode->i_nlink; | ||
830 | if (S_ISDIR(inode->i_mode)) | ||
831 | inode->i_nlink = 0; | ||
832 | else | ||
833 | inode->i_nlink--; | ||
834 | |||
835 | status = ocfs2_request_unlink_vote(inode, dentry, | ||
836 | (unsigned int) inode->i_nlink); | ||
837 | if (status < 0) { | ||
838 | /* This vote should succeed under all normal | ||
839 | * circumstances. */ | ||
840 | mlog_errno(status); | ||
841 | goto leave; | ||
842 | } | ||
843 | |||
844 | if (!inode->i_nlink) { | ||
845 | status = ocfs2_prepare_orphan_dir(osb, handle, inode, | ||
846 | orphan_name, | ||
847 | &orphan_entry_bh); | ||
848 | if (status < 0) { | ||
849 | mlog_errno(status); | ||
850 | goto leave; | ||
851 | } | ||
852 | } | ||
853 | |||
854 | handle = ocfs2_start_trans(osb, handle, OCFS2_UNLINK_CREDITS); | ||
855 | if (IS_ERR(handle)) { | ||
856 | status = PTR_ERR(handle); | ||
857 | handle = NULL; | ||
858 | mlog_errno(status); | ||
859 | goto leave; | ||
860 | } | ||
861 | |||
862 | status = ocfs2_journal_access(handle, inode, fe_bh, | ||
863 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
864 | if (status < 0) { | ||
865 | mlog_errno(status); | ||
866 | goto leave; | ||
867 | } | ||
868 | |||
869 | fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
870 | |||
871 | if (!inode->i_nlink) { | ||
872 | status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, | ||
873 | orphan_entry_bh); | ||
874 | if (status < 0) { | ||
875 | mlog_errno(status); | ||
876 | goto leave; | ||
877 | } | ||
878 | } | ||
879 | |||
880 | /* delete the name from the parent dir */ | ||
881 | status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh); | ||
882 | if (status < 0) { | ||
883 | mlog_errno(status); | ||
884 | goto leave; | ||
885 | } | ||
886 | |||
887 | /* We can set nlink on the dinode now. Clear the saved version | ||
888 | * so that it doesn't get restored later. */ | ||
889 | fe->i_links_count = cpu_to_le16(inode->i_nlink); | ||
890 | saved_nlink = 0; | ||
891 | |||
892 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
893 | if (status < 0) { | ||
894 | mlog_errno(status); | ||
895 | goto leave; | ||
896 | } | ||
897 | |||
898 | if (S_ISDIR(inode->i_mode)) { | ||
899 | dir->i_nlink--; | ||
900 | status = ocfs2_mark_inode_dirty(handle, dir, | ||
901 | parent_node_bh); | ||
902 | if (status < 0) { | ||
903 | mlog_errno(status); | ||
904 | dir->i_nlink++; | ||
905 | } | ||
906 | } | ||
907 | |||
908 | leave: | ||
909 | if (status < 0 && saved_nlink) | ||
910 | inode->i_nlink = saved_nlink; | ||
911 | |||
912 | if (handle) | ||
913 | ocfs2_commit_trans(handle); | ||
914 | |||
915 | if (fe_bh) | ||
916 | brelse(fe_bh); | ||
917 | |||
918 | if (dirent_bh) | ||
919 | brelse(dirent_bh); | ||
920 | |||
921 | if (parent_node_bh) | ||
922 | brelse(parent_node_bh); | ||
923 | |||
924 | if (orphan_entry_bh) | ||
925 | brelse(orphan_entry_bh); | ||
926 | |||
927 | mlog_exit(status); | ||
928 | |||
929 | return status; | ||
930 | } | ||
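The saved_nlink handling above is a snapshot/disarm idiom: i_nlink is updated in memory early because the unlink vote and the orphan-dir decision need the new value, the old value is kept for rollback, and once the dinode change is journaled the snapshot is zeroed so the shared error path stops restoring it. The pattern in isolation (a sketch, not the kernel code):

    /* Snapshot i_nlink, mutate early, restore if the dinode write
     * fails, disarm once the new count is journaled. */
    static int unlink_nlink_model(unsigned int *nlink, int is_dir,
                                  int (*journal_dinode)(unsigned int))
    {
            unsigned int saved_nlink = *nlink;  /* snapshot before mutating  */

            *nlink = is_dir ? 0 : *nlink - 1;   /* vote/orphan code wants this */

            if (journal_dinode(*nlink) < 0) {
                    *nlink = saved_nlink;       /* disk untouched: roll back */
                    return -1;
            }
            /* The real code zeroes saved_nlink here so the shared error
             * path after this point no longer restores the old value. */
            return 0;
    }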
931 | |||
932 | /* | ||
933 | * The only place this should be used is rename! | ||
934 | * If the two inodes have the same id, only the first one is locked. | ||
935 | */ | ||
936 | static int ocfs2_double_lock(struct ocfs2_super *osb, | ||
937 | struct ocfs2_journal_handle *handle, | ||
938 | struct buffer_head **bh1, | ||
939 | struct inode *inode1, | ||
940 | struct buffer_head **bh2, | ||
941 | struct inode *inode2) | ||
942 | { | ||
943 | int status; | ||
944 | struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); | ||
945 | struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); | ||
946 | struct buffer_head **tmpbh; | ||
947 | struct inode *tmpinode; | ||
948 | |||
949 | mlog_entry("(inode1 = %"MLFu64", inode2 = %"MLFu64")\n", | ||
950 | oi1->ip_blkno, oi2->ip_blkno); | ||
951 | |||
952 | BUG_ON(!handle); | ||
953 | |||
954 | if (*bh1) | ||
955 | *bh1 = NULL; | ||
956 | if (*bh2) | ||
957 | *bh2 = NULL; | ||
958 | |||
959 | /* we always want to lock the one with the lower lockid first. */ | ||
960 | if (oi1->ip_blkno != oi2->ip_blkno) { | ||
961 | if (oi1->ip_blkno < oi2->ip_blkno) { | ||
962 | /* switch id1 and id2 around */ | ||
963 | mlog(0, "switching them around...\n"); | ||
964 | tmpbh = bh2; | ||
965 | bh2 = bh1; | ||
966 | bh1 = tmpbh; | ||
967 | |||
968 | tmpinode = inode2; | ||
969 | inode2 = inode1; | ||
970 | inode1 = tmpinode; | ||
971 | } | ||
972 | /* lock id2 */ | ||
973 | status = ocfs2_meta_lock(inode2, handle, bh2, 1); | ||
974 | if (status < 0) { | ||
975 | if (status != -ENOENT) | ||
976 | mlog_errno(status); | ||
977 | goto bail; | ||
978 | } | ||
979 | } | ||
980 | /* lock id1 */ | ||
981 | status = ocfs2_meta_lock(inode1, handle, bh1, 1); | ||
982 | if (status < 0) { | ||
983 | if (status != -ENOENT) | ||
984 | mlog_errno(status); | ||
985 | goto bail; | ||
986 | } | ||
987 | bail: | ||
988 | mlog_exit(status); | ||
989 | return status; | ||
990 | } | ||
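Taking the two cluster locks in a globally consistent order, lower block number first, is what keeps concurrent renames on different nodes from deadlocking; the pointer swap above merely normalizes the argument order. The same idea in plain C, with pthread mutexes standing in for cluster locks and ids playing the role of ip_blkno:

    #include <pthread.h>

    struct res {
            unsigned long long id;      /* plays the role of ip_blkno */
            pthread_mutex_t lock;
    };

    /* Lock the lower id first, so two threads grabbing the same pair in
     * opposite argument order can never hold one lock each and wait. */
    static void double_lock(struct res *a, struct res *b)
    {
            if (a->id == b->id) {       /* same resource: take it once */
                    pthread_mutex_lock(&a->lock);
                    return;
            }
            if (a->id > b->id) {        /* normalize the order */
                    struct res *t = a;
                    a = b;
                    b = t;
            }
            pthread_mutex_lock(&a->lock);
            pthread_mutex_lock(&b->lock);
    }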
991 | |||
992 | #define PARENT_INO(buffer) \ | ||
993 | ((struct ocfs2_dir_entry *) \ | ||
994 | ((char *)buffer + \ | ||
995 | le16_to_cpu(((struct ocfs2_dir_entry *)buffer)->rec_len)))->inode | ||
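PARENT_INO relies on directory block layout: the first record of a directory's first block is ".", and its rec_len hops straight to "..", whose inode field names the parent (the rename code below also assigns through the macro to rewrite ".."). Spelled out in userspace, assuming the 12-byte dirent header (8-byte inode, then 2-byte rec_len) and a little-endian host; the kernel macro leaves the byteswap to its callers:

    #include <stdint.h>
    #include <string.h>

    static uint64_t parent_ino(const unsigned char *block)
    {
            uint16_t dot_rec_len;
            uint64_t parent;

            memcpy(&dot_rec_len, block + 8, sizeof(dot_rec_len)); /* "." rec_len */
            memcpy(&parent, block + dot_rec_len, sizeof(parent)); /* ".." inode  */
            return parent;
    }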
996 | |||
997 | static int ocfs2_rename(struct inode *old_dir, | ||
998 | struct dentry *old_dentry, | ||
999 | struct inode *new_dir, | ||
1000 | struct dentry *new_dentry) | ||
1001 | { | ||
1002 | int status = 0, rename_lock = 0; | ||
1003 | struct inode *old_inode = old_dentry->d_inode; | ||
1004 | struct inode *new_inode = new_dentry->d_inode; | ||
1005 | struct ocfs2_dinode *newfe = NULL; | ||
1006 | char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; | ||
1007 | struct buffer_head *orphan_entry_bh = NULL; | ||
1008 | struct buffer_head *newfe_bh = NULL; | ||
1009 | struct buffer_head *insert_entry_bh = NULL; | ||
1010 | struct ocfs2_super *osb = NULL; | ||
1011 | u64 newfe_blkno; | ||
1012 | struct ocfs2_journal_handle *handle = NULL; | ||
1013 | struct buffer_head *old_dir_bh = NULL; | ||
1014 | struct buffer_head *new_dir_bh = NULL; | ||
1015 | struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry | ||
1016 | // and new_dentry | ||
1017 | struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above | ||
1018 | struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir, | ||
1019 | // this is the 1st dirent bh | ||
1020 | nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink; | ||
1021 | unsigned int links_count; | ||
1022 | |||
1023 | /* At some point it might be nice to break this function up a | ||
1024 | * bit. */ | ||
1025 | |||
1026 | mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n", | ||
1027 | old_dir, old_dentry, new_dir, new_dentry, | ||
1028 | old_dentry->d_name.len, old_dentry->d_name.name, | ||
1029 | new_dentry->d_name.len, new_dentry->d_name.name); | ||
1030 | |||
1031 | osb = OCFS2_SB(old_dir->i_sb); | ||
1032 | |||
1033 | if (new_inode) { | ||
1034 | if (!igrab(new_inode)) | ||
1035 | BUG(); | ||
1036 | } | ||
1037 | |||
1038 | if (atomic_read(&old_dentry->d_count) > 2) { | ||
1039 | shrink_dcache_parent(old_dentry); | ||
1040 | if (atomic_read(&old_dentry->d_count) > 2) { | ||
1041 | status = -EBUSY; | ||
1042 | goto bail; | ||
1043 | } | ||
1044 | } | ||
1045 | |||
1046 | /* Assume a directory hierarchy as follows: | ||
1047 | * a/b/c | ||
1048 | * a/d | ||
1049 | * a,b,c, and d are all directories. | ||
1050 | * | ||
1051 | * from cwd of 'a' on both nodes: | ||
1052 | * node1: mv b/c d | ||
1053 | * node2: mv d b/c | ||
1054 | * | ||
1055 | * And that's why, just like the VFS, we need a file system | ||
1056 | * rename lock. */ | ||
1057 | if (old_dentry != new_dentry) { | ||
1058 | status = ocfs2_rename_lock(osb); | ||
1059 | if (status < 0) { | ||
1060 | mlog_errno(status); | ||
1061 | goto bail; | ||
1062 | } | ||
1063 | rename_lock = 1; | ||
1064 | } | ||
1065 | |||
1066 | handle = ocfs2_alloc_handle(osb); | ||
1067 | if (handle == NULL) { | ||
1068 | status = -ENOMEM; | ||
1069 | mlog_errno(status); | ||
1070 | goto bail; | ||
1071 | } | ||
1072 | |||
1073 | /* if old and new are the same, this'll just do one lock. */ | ||
1074 | status = ocfs2_double_lock(osb, handle, | ||
1075 | &old_dir_bh, old_dir, | ||
1076 | &new_dir_bh, new_dir); | ||
1077 | if (status < 0) { | ||
1078 | mlog_errno(status); | ||
1079 | goto bail; | ||
1080 | } | ||
1081 | |||
1082 | /* make sure both dirs have bhs | ||
1083 | * get an extra ref on old_dir_bh if old==new */ | ||
1084 | if (!new_dir_bh) { | ||
1085 | if (old_dir_bh) { | ||
1086 | new_dir_bh = old_dir_bh; | ||
1087 | get_bh(new_dir_bh); | ||
1088 | } else { | ||
1089 | mlog(ML_ERROR, "no old_dir_bh!\n"); | ||
1090 | status = -EIO; | ||
1091 | goto bail; | ||
1092 | } | ||
1093 | } | ||
1094 | |||
1095 | if (S_ISDIR(old_inode->i_mode)) { | ||
1096 | /* Directories actually require metadata updates to | ||
1097 | * the directory info so we can't get away with not | ||
1098 | * doing node locking on it. */ | ||
1099 | status = ocfs2_meta_lock(old_inode, handle, NULL, 1); | ||
1100 | if (status < 0) { | ||
1101 | if (status != -ENOENT) | ||
1102 | mlog_errno(status); | ||
1103 | goto bail; | ||
1104 | } | ||
1105 | |||
1106 | status = ocfs2_request_rename_vote(old_inode, old_dentry); | ||
1107 | if (status < 0) { | ||
1108 | mlog_errno(status); | ||
1109 | goto bail; | ||
1110 | } | ||
1111 | |||
1112 | status = -EIO; | ||
1113 | old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0); | ||
1114 | if (!old_inode_de_bh) | ||
1115 | goto bail; | ||
1116 | |||
1117 | status = -EIO; | ||
1118 | if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) != | ||
1119 | OCFS2_I(old_dir)->ip_blkno) | ||
1120 | goto bail; | ||
1121 | status = -EMLINK; | ||
1122 | if (!new_inode && new_dir != old_dir && | ||
1123 | new_dir->i_nlink >= OCFS2_LINK_MAX) | ||
1124 | goto bail; | ||
1125 | } else { | ||
1126 | /* Ah, the simple case - we're a file so just send a | ||
1127 | * message. */ | ||
1128 | status = ocfs2_request_rename_vote(old_inode, old_dentry); | ||
1129 | if (status < 0) { | ||
1130 | mlog_errno(status); | ||
1131 | goto bail; | ||
1132 | } | ||
1133 | } | ||
1134 | |||
1135 | status = -ENOENT; | ||
1136 | old_de_bh = ocfs2_find_entry(old_dentry->d_name.name, | ||
1137 | old_dentry->d_name.len, | ||
1138 | old_dir, &old_de); | ||
1139 | if (!old_de_bh) | ||
1140 | goto bail; | ||
1141 | |||
1142 | /* | ||
1143 | * The check on the inode number is _not_ due to possible IO errors. | ||
1144 | * We might rmdir the source, keep it as pwd of some process | ||
1145 | * and merrily kill the link to whatever was created under the | ||
1146 | * same name. Goodbye sticky bit ;-< | ||
1147 | */ | ||
1148 | if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno) | ||
1149 | goto bail; | ||
1150 | |||
1151 | /* check if the target already exists (in which case we need | ||
1152 | * to delete it) */ | ||
1153 | status = ocfs2_find_files_on_disk(new_dentry->d_name.name, | ||
1154 | new_dentry->d_name.len, | ||
1155 | &newfe_blkno, new_dir, &new_de_bh, | ||
1156 | &new_de); | ||
1157 | /* The only error we allow here is -ENOENT because the new | ||
1158 | * file not existing is perfectly valid. */ | ||
1159 | if ((status < 0) && (status != -ENOENT)) { | ||
1160 | /* If we cannot find the file specified, we should just | ||
1161 | * return the error. */ | ||
1162 | mlog_errno(status); | ||
1163 | goto bail; | ||
1164 | } | ||
1165 | |||
1166 | if (!new_de && new_inode) | ||
1167 | mlog(ML_ERROR, "inode %lu does not exist in its parent " | ||
1168 | "directory!", new_inode->i_ino); | ||
1169 | |||
1170 | /* In case we need to overwrite an existing file, we blow it | ||
1171 | * away first */ | ||
1172 | if (new_de) { | ||
1173 | /* VFS didn't think there existed an inode here, but | ||
1174 | * someone else in the cluster must have raced our | ||
1175 | * rename to create one. Today we error cleanly, in | ||
1176 | * the future we should consider calling iget to build | ||
1177 | * a new struct inode for this entry. */ | ||
1178 | if (!new_inode) { | ||
1179 | status = -EACCES; | ||
1180 | |||
1181 | mlog(0, "We found an inode for name %.*s but VFS " | ||
1182 | "didn't give us one.\n", new_dentry->d_name.len, | ||
1183 | new_dentry->d_name.name); | ||
1184 | goto bail; | ||
1185 | } | ||
1186 | |||
1187 | if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) { | ||
1188 | status = -EACCES; | ||
1189 | |||
1190 | mlog(0, "Inode blkno (%"MLFu64") and dir (%"MLFu64") " | ||
1191 | "disagree. ip_flags = %x\n", | ||
1192 | OCFS2_I(new_inode)->ip_blkno, newfe_blkno, | ||
1193 | OCFS2_I(new_inode)->ip_flags); | ||
1194 | goto bail; | ||
1195 | } | ||
1196 | |||
1197 | status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1); | ||
1198 | if (status < 0) { | ||
1199 | if (status != -ENOENT) | ||
1200 | mlog_errno(status); | ||
1201 | goto bail; | ||
1202 | } | ||
1203 | |||
1204 | if (S_ISDIR(new_inode->i_mode)) | ||
1205 | links_count = 0; | ||
1206 | else | ||
1207 | links_count = (unsigned int) (new_inode->i_nlink - 1); | ||
1208 | |||
1209 | status = ocfs2_request_unlink_vote(new_inode, new_dentry, | ||
1210 | links_count); | ||
1211 | if (status < 0) { | ||
1212 | mlog_errno(status); | ||
1213 | goto bail; | ||
1214 | } | ||
1215 | |||
1216 | newfe = (struct ocfs2_dinode *) newfe_bh->b_data; | ||
1217 | |||
1218 | mlog(0, "aha rename over existing... new_de=%p " | ||
1219 | "new_blkno=%"MLFu64" newfebh=%p bhblocknr=%llu\n", | ||
1220 | new_de, newfe_blkno, newfe_bh, newfe_bh ? | ||
1221 | (unsigned long long)newfe_bh->b_blocknr : 0ULL); | ||
1222 | |||
1223 | if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { | ||
1224 | status = ocfs2_prepare_orphan_dir(osb, handle, | ||
1225 | new_inode, | ||
1226 | orphan_name, | ||
1227 | &orphan_entry_bh); | ||
1228 | if (status < 0) { | ||
1229 | mlog_errno(status); | ||
1230 | goto bail; | ||
1231 | } | ||
1232 | } | ||
1233 | } else { | ||
1234 | BUG_ON(new_dentry->d_parent->d_inode != new_dir); | ||
1235 | |||
1236 | status = ocfs2_check_dir_for_entry(new_dir, | ||
1237 | new_dentry->d_name.name, | ||
1238 | new_dentry->d_name.len); | ||
1239 | if (status) | ||
1240 | goto bail; | ||
1241 | |||
1242 | status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh, | ||
1243 | new_dentry->d_name.name, | ||
1244 | new_dentry->d_name.len, | ||
1245 | &insert_entry_bh); | ||
1246 | if (status < 0) { | ||
1247 | mlog_errno(status); | ||
1248 | goto bail; | ||
1249 | } | ||
1250 | } | ||
1251 | |||
1252 | handle = ocfs2_start_trans(osb, handle, OCFS2_RENAME_CREDITS); | ||
1253 | if (IS_ERR(handle)) { | ||
1254 | status = PTR_ERR(handle); | ||
1255 | handle = NULL; | ||
1256 | mlog_errno(status); | ||
1257 | goto bail; | ||
1258 | } | ||
1259 | |||
1260 | if (new_de) { | ||
1261 | if (S_ISDIR(new_inode->i_mode)) { | ||
1262 | if (!ocfs2_empty_dir(new_inode) || | ||
1263 | new_inode->i_nlink != 2) { | ||
1264 | status = -ENOTEMPTY; | ||
1265 | goto bail; | ||
1266 | } | ||
1267 | } | ||
1268 | status = ocfs2_journal_access(handle, new_inode, newfe_bh, | ||
1269 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1270 | if (status < 0) { | ||
1271 | mlog_errno(status); | ||
1272 | goto bail; | ||
1273 | } | ||
1274 | |||
1275 | if (S_ISDIR(new_inode->i_mode) || | ||
1276 | (newfe->i_links_count == cpu_to_le16(1))){ | ||
1277 | status = ocfs2_orphan_add(osb, handle, new_inode, | ||
1278 | newfe, orphan_name, | ||
1279 | orphan_entry_bh); | ||
1280 | if (status < 0) { | ||
1281 | mlog_errno(status); | ||
1282 | goto bail; | ||
1283 | } | ||
1284 | } | ||
1285 | |||
1286 | /* change the dirent to point to the correct inode */ | ||
1287 | status = ocfs2_journal_access(handle, new_dir, new_de_bh, | ||
1288 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1289 | if (status < 0) { | ||
1290 | mlog_errno(status); | ||
1291 | goto bail; | ||
1292 | } | ||
1293 | new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno); | ||
1294 | new_de->file_type = old_de->file_type; | ||
1295 | new_dir->i_version++; | ||
1296 | status = ocfs2_journal_dirty(handle, new_de_bh); | ||
1297 | if (status < 0) { | ||
1298 | mlog_errno(status); | ||
1299 | goto bail; | ||
1300 | } | ||
1301 | |||
1302 | if (S_ISDIR(new_inode->i_mode)) | ||
1303 | newfe->i_links_count = 0; | ||
1304 | else | ||
1305 | le16_add_cpu(&newfe->i_links_count, -1); | ||
1306 | |||
1307 | status = ocfs2_journal_dirty(handle, newfe_bh); | ||
1308 | if (status < 0) { | ||
1309 | mlog_errno(status); | ||
1310 | goto bail; | ||
1311 | } | ||
1312 | } else { | ||
1313 | /* if the name was not found in new_dir, add it now */ | ||
1314 | status = ocfs2_add_entry(handle, new_dentry, old_inode, | ||
1315 | OCFS2_I(old_inode)->ip_blkno, | ||
1316 | new_dir_bh, insert_entry_bh); | ||
1317 | } | ||
1318 | |||
1319 | old_inode->i_ctime = CURRENT_TIME; | ||
1320 | mark_inode_dirty(old_inode); | ||
1321 | |||
1322 | /* now that the name has been added to new_dir, remove the old name */ | ||
1323 | status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); | ||
1324 | if (status < 0) { | ||
1325 | mlog_errno(status); | ||
1326 | goto bail; | ||
1327 | } | ||
1328 | |||
1329 | if (new_inode) { | ||
1330 | new_inode->i_nlink--; | ||
1331 | new_inode->i_ctime = CURRENT_TIME; | ||
1332 | } | ||
1333 | old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; | ||
1334 | if (old_inode_de_bh) { | ||
1335 | status = ocfs2_journal_access(handle, old_inode, | ||
1336 | old_inode_de_bh, | ||
1337 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1338 | PARENT_INO(old_inode_de_bh->b_data) = | ||
1339 | cpu_to_le64(OCFS2_I(new_dir)->ip_blkno); | ||
1340 | status = ocfs2_journal_dirty(handle, old_inode_de_bh); | ||
1341 | old_dir->i_nlink--; | ||
1342 | if (new_inode) { | ||
1343 | new_inode->i_nlink--; | ||
1344 | } else { | ||
1345 | new_dir->i_nlink++; | ||
1346 | mark_inode_dirty(new_dir); | ||
1347 | } | ||
1348 | } | ||
1349 | mark_inode_dirty(old_dir); | ||
1350 | if (new_inode) | ||
1351 | mark_inode_dirty(new_inode); | ||
1352 | |||
1353 | if (old_dir != new_dir) | ||
1354 | if (new_dir_nlink != new_dir->i_nlink) { | ||
1355 | if (!new_dir_bh) { | ||
1356 | mlog(ML_ERROR, "need to change nlink for new " | ||
1357 | "dir %"MLFu64" from %d to %d but bh is " | ||
1358 | "NULL\n", OCFS2_I(new_dir)->ip_blkno, | ||
1359 | (int)new_dir_nlink, new_dir->i_nlink); | ||
1360 | } else { | ||
1361 | struct ocfs2_dinode *fe; | ||
1362 | status = ocfs2_journal_access(handle, | ||
1363 | new_dir, | ||
1364 | new_dir_bh, | ||
1365 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1366 | fe = (struct ocfs2_dinode *) new_dir_bh->b_data; | ||
1367 | fe->i_links_count = cpu_to_le16(new_dir->i_nlink); | ||
1368 | status = ocfs2_journal_dirty(handle, new_dir_bh); | ||
1369 | } | ||
1370 | } | ||
1371 | |||
1372 | if (old_dir_nlink != old_dir->i_nlink) { | ||
1373 | if (!old_dir_bh) { | ||
1374 | mlog(ML_ERROR, "need to change nlink for old dir " | ||
1375 | "%"MLFu64" from %d to %d but bh is NULL!\n", | ||
1376 | OCFS2_I(old_dir)->ip_blkno, | ||
1377 | (int)old_dir_nlink, | ||
1378 | old_dir->i_nlink); | ||
1379 | } else { | ||
1380 | struct ocfs2_dinode *fe; | ||
1381 | status = ocfs2_journal_access(handle, old_dir, | ||
1382 | old_dir_bh, | ||
1383 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1384 | fe = (struct ocfs2_dinode *) old_dir_bh->b_data; | ||
1385 | fe->i_links_count = cpu_to_le16(old_dir->i_nlink); | ||
1386 | status = ocfs2_journal_dirty(handle, old_dir_bh); | ||
1387 | } | ||
1388 | } | ||
1389 | |||
1390 | status = 0; | ||
1391 | bail: | ||
1392 | if (rename_lock) | ||
1393 | ocfs2_rename_unlock(osb); | ||
1394 | |||
1395 | if (handle) | ||
1396 | ocfs2_commit_trans(handle); | ||
1397 | |||
1398 | if (new_inode) | ||
1399 | sync_mapping_buffers(old_inode->i_mapping); | ||
1400 | |||
1401 | if (new_inode) | ||
1402 | iput(new_inode); | ||
1403 | if (newfe_bh) | ||
1404 | brelse(newfe_bh); | ||
1405 | if (old_dir_bh) | ||
1406 | brelse(old_dir_bh); | ||
1407 | if (new_dir_bh) | ||
1408 | brelse(new_dir_bh); | ||
1409 | if (new_de_bh) | ||
1410 | brelse(new_de_bh); | ||
1411 | if (old_de_bh) | ||
1412 | brelse(old_de_bh); | ||
1413 | if (old_inode_de_bh) | ||
1414 | brelse(old_inode_de_bh); | ||
1415 | if (orphan_entry_bh) | ||
1416 | brelse(orphan_entry_bh); | ||
1417 | if (insert_entry_bh) | ||
1418 | brelse(insert_entry_bh); | ||
1419 | |||
1420 | mlog_exit(status); | ||
1421 | |||
1422 | return status; | ||
1423 | } | ||
1424 | |||
1425 | /* | ||
1426 | * we expect i_size = strlen(symname). Copy symname into the file | ||
1427 | * data, including the null terminator. | ||
1428 | */ | ||
1429 | static int ocfs2_create_symlink_data(struct ocfs2_super *osb, | ||
1430 | struct ocfs2_journal_handle *handle, | ||
1431 | struct inode *inode, | ||
1432 | const char *symname) | ||
1433 | { | ||
1434 | struct buffer_head **bhs = NULL; | ||
1435 | const char *c; | ||
1436 | struct super_block *sb = osb->sb; | ||
1437 | u64 p_blkno; | ||
1438 | int p_blocks; | ||
1439 | int virtual, blocks, status, i, bytes_left; | ||
1440 | |||
1441 | bytes_left = i_size_read(inode) + 1; | ||
1442 | /* we can't trust i_blocks because we're actually going to | ||
1443 | * write i_size + 1 bytes. */ | ||
1444 | blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits; | ||
1445 | |||
1446 | mlog_entry("i_blocks = %lu, i_size = %llu, blocks = %d\n", | ||
1447 | inode->i_blocks, i_size_read(inode), blocks); | ||
1448 | |||
1449 | /* Sanity check -- make sure we're going to fit. */ | ||
1450 | if (bytes_left > | ||
1451 | ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) { | ||
1452 | status = -EIO; | ||
1453 | mlog_errno(status); | ||
1454 | goto bail; | ||
1455 | } | ||
1456 | |||
1457 | bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL); | ||
1458 | if (!bhs) { | ||
1459 | status = -ENOMEM; | ||
1460 | mlog_errno(status); | ||
1461 | goto bail; | ||
1462 | } | ||
1463 | |||
1464 | status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, | ||
1465 | &p_blocks); | ||
1466 | if (status < 0) { | ||
1467 | mlog_errno(status); | ||
1468 | goto bail; | ||
1469 | } | ||
1470 | |||
1471 | /* symlink data can never be larger than one cluster so we know this | ||
1472 | * is all going to be contiguous, but do a sanity check | ||
1473 | * anyway. */ | ||
1474 | if ((p_blocks << sb->s_blocksize_bits) < bytes_left) { | ||
1475 | status = -EIO; | ||
1476 | mlog_errno(status); | ||
1477 | goto bail; | ||
1478 | } | ||
1479 | |||
1480 | virtual = 0; | ||
1481 | while (bytes_left > 0) { | ||
1482 | c = &symname[virtual * sb->s_blocksize]; | ||
1483 | |||
1484 | bhs[virtual] = sb_getblk(sb, p_blkno); | ||
1485 | if (!bhs[virtual]) { | ||
1486 | status = -ENOMEM; | ||
1487 | mlog_errno(status); | ||
1488 | goto bail; | ||
1489 | } | ||
1490 | ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]); | ||
1491 | |||
1492 | status = ocfs2_journal_access(handle, inode, bhs[virtual], | ||
1493 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
1494 | if (status < 0) { | ||
1495 | mlog_errno(status); | ||
1496 | goto bail; | ||
1497 | } | ||
1498 | |||
1499 | memset(bhs[virtual]->b_data, 0, sb->s_blocksize); | ||
1500 | |||
1501 | memcpy(bhs[virtual]->b_data, c, | ||
1502 | (bytes_left > sb->s_blocksize) ? sb->s_blocksize : | ||
1503 | bytes_left); | ||
1504 | |||
1505 | status = ocfs2_journal_dirty(handle, bhs[virtual]); | ||
1506 | if (status < 0) { | ||
1507 | mlog_errno(status); | ||
1508 | goto bail; | ||
1509 | } | ||
1510 | |||
1511 | virtual++; | ||
1512 | p_blkno++; | ||
1513 | bytes_left -= sb->s_blocksize; | ||
1514 | } | ||
1515 | |||
1516 | status = 0; | ||
1517 | bail: | ||
1518 | |||
1519 | if (bhs) { | ||
1520 | for (i = 0; i < blocks; i++) | ||
1521 | if (bhs[i]) | ||
1522 | brelse(bhs[i]); | ||
1523 | kfree(bhs); | ||
1524 | } | ||
1525 | |||
1526 | mlog_exit(status); | ||
1527 | return status; | ||
1528 | } | ||
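The block count above is ceiling division by the block size, done with shifts since s_blocksize is a power of two; the +1 covers the NUL terminator written along with the target. A small self-checking example (512 is just a stand-in block size):

    #include <assert.h>
    #include <string.h>

    /* Blocks needed for the target plus its NUL; bs is a power of two. */
    static int symlink_blocks(const char *symname, unsigned int bs)
    {
            unsigned int bytes = (unsigned int)strlen(symname) + 1; /* i_size + 1 */
            return (int)((bytes + bs - 1) / bs); /* same as the shift above */
    }

    int main(void)
    {
            char big[513];

            assert(symlink_blocks("target", 512) == 1); /* 7 bytes           */
            assert(symlink_blocks("", 512) == 1);       /* just the NUL      */
            memset(big, 'x', 512);
            big[512] = '\0';                            /* 513 bytes total   */
            assert(symlink_blocks(big, 512) == 2);      /* spills to block 2 */
            return 0;
    }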
1529 | |||
1530 | static int ocfs2_symlink(struct inode *dir, | ||
1531 | struct dentry *dentry, | ||
1532 | const char *symname) | ||
1533 | { | ||
1534 | int status, l, credits; | ||
1535 | u64 newsize; | ||
1536 | struct ocfs2_super *osb = NULL; | ||
1537 | struct inode *inode = NULL; | ||
1538 | struct super_block *sb; | ||
1539 | struct buffer_head *new_fe_bh = NULL; | ||
1540 | struct buffer_head *de_bh = NULL; | ||
1541 | struct buffer_head *parent_fe_bh = NULL; | ||
1542 | struct ocfs2_dinode *fe = NULL; | ||
1543 | struct ocfs2_dinode *dirfe; | ||
1544 | struct ocfs2_journal_handle *handle = NULL; | ||
1545 | struct ocfs2_alloc_context *inode_ac = NULL; | ||
1546 | struct ocfs2_alloc_context *data_ac = NULL; | ||
1547 | |||
1548 | mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, | ||
1549 | dentry, symname, dentry->d_name.len, dentry->d_name.name); | ||
1550 | |||
1551 | sb = dir->i_sb; | ||
1552 | osb = OCFS2_SB(sb); | ||
1553 | |||
1554 | l = strlen(symname) + 1; | ||
1555 | |||
1556 | credits = ocfs2_calc_symlink_credits(sb); | ||
1557 | |||
1558 | handle = ocfs2_alloc_handle(osb); | ||
1559 | if (handle == NULL) { | ||
1560 | status = -ENOMEM; | ||
1561 | mlog_errno(status); | ||
1562 | goto bail; | ||
1563 | } | ||
1564 | |||
1565 | /* lock the parent directory */ | ||
1566 | status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1); | ||
1567 | if (status < 0) { | ||
1568 | if (status != -ENOENT) | ||
1569 | mlog_errno(status); | ||
1570 | goto bail; | ||
1571 | } | ||
1572 | |||
1573 | dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; | ||
1574 | if (!dirfe->i_links_count) { | ||
1575 | /* can't make a file in a deleted directory. */ | ||
1576 | status = -ENOENT; | ||
1577 | goto bail; | ||
1578 | } | ||
1579 | |||
1580 | status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, | ||
1581 | dentry->d_name.len); | ||
1582 | if (status) | ||
1583 | goto bail; | ||
1584 | |||
1585 | status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, | ||
1586 | dentry->d_name.name, | ||
1587 | dentry->d_name.len, &de_bh); | ||
1588 | if (status < 0) { | ||
1589 | mlog_errno(status); | ||
1590 | goto bail; | ||
1591 | } | ||
1592 | |||
1593 | status = ocfs2_reserve_new_inode(osb, handle, &inode_ac); | ||
1594 | if (status < 0) { | ||
1595 | if (status != -ENOSPC) | ||
1596 | mlog_errno(status); | ||
1597 | goto bail; | ||
1598 | } | ||
1599 | |||
1600 | /* don't reserve bitmap space for fast symlinks. */ | ||
1601 | if (l > ocfs2_fast_symlink_chars(sb)) { | ||
1602 | status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac); | ||
1603 | if (status < 0) { | ||
1604 | if (status != -ENOSPC) | ||
1605 | mlog_errno(status); | ||
1606 | goto bail; | ||
1607 | } | ||
1608 | } | ||
1609 | |||
1610 | handle = ocfs2_start_trans(osb, handle, credits); | ||
1611 | if (IS_ERR(handle)) { | ||
1612 | status = PTR_ERR(handle); | ||
1613 | handle = NULL; | ||
1614 | mlog_errno(status); | ||
1615 | goto bail; | ||
1616 | } | ||
1617 | |||
1618 | status = ocfs2_mknod_locked(osb, dir, dentry, | ||
1619 | S_IFLNK | S_IRWXUGO, 0, | ||
1620 | &new_fe_bh, parent_fe_bh, handle, | ||
1621 | &inode, inode_ac); | ||
1622 | if (status < 0) { | ||
1623 | mlog_errno(status); | ||
1624 | goto bail; | ||
1625 | } | ||
1626 | |||
1627 | fe = (struct ocfs2_dinode *) new_fe_bh->b_data; | ||
1628 | inode->i_rdev = 0; | ||
1629 | newsize = l - 1; | ||
1630 | if (l > ocfs2_fast_symlink_chars(sb)) { | ||
1631 | inode->i_op = &ocfs2_symlink_inode_operations; | ||
1632 | status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh, | ||
1633 | handle, data_ac, NULL, | ||
1634 | NULL); | ||
1635 | if (status < 0) { | ||
1636 | if (status != -ENOSPC && status != -EINTR) { | ||
1637 | mlog(ML_ERROR, "Failed to extend file to " | ||
1638 | "%"MLFu64"\n", | ||
1639 | newsize); | ||
1640 | mlog_errno(status); | ||
1641 | status = -ENOSPC; | ||
1642 | } | ||
1643 | goto bail; | ||
1644 | } | ||
1645 | i_size_write(inode, newsize); | ||
1646 | inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize); | ||
1647 | } else { | ||
1648 | inode->i_op = &ocfs2_fast_symlink_inode_operations; | ||
1649 | memcpy((char *) fe->id2.i_symlink, symname, l); | ||
1650 | i_size_write(inode, newsize); | ||
1651 | inode->i_blocks = 0; | ||
1652 | } | ||
1653 | |||
1654 | status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh); | ||
1655 | if (status < 0) { | ||
1656 | mlog_errno(status); | ||
1657 | goto bail; | ||
1658 | } | ||
1659 | |||
1660 | if (!ocfs2_inode_is_fast_symlink(inode)) { | ||
1661 | status = ocfs2_create_symlink_data(osb, handle, inode, | ||
1662 | symname); | ||
1663 | if (status < 0) { | ||
1664 | mlog_errno(status); | ||
1665 | goto bail; | ||
1666 | } | ||
1667 | } | ||
1668 | |||
1669 | status = ocfs2_add_entry(handle, dentry, inode, | ||
1670 | le64_to_cpu(fe->i_blkno), parent_fe_bh, | ||
1671 | de_bh); | ||
1672 | if (status < 0) { | ||
1673 | mlog_errno(status); | ||
1674 | goto bail; | ||
1675 | } | ||
1676 | |||
1677 | insert_inode_hash(inode); | ||
1678 | dentry->d_op = &ocfs2_dentry_ops; | ||
1679 | d_instantiate(dentry, inode); | ||
1680 | bail: | ||
1681 | if (handle) | ||
1682 | ocfs2_commit_trans(handle); | ||
1683 | if (new_fe_bh) | ||
1684 | brelse(new_fe_bh); | ||
1685 | if (parent_fe_bh) | ||
1686 | brelse(parent_fe_bh); | ||
1687 | if (de_bh) | ||
1688 | brelse(de_bh); | ||
1689 | if (inode_ac) | ||
1690 | ocfs2_free_alloc_context(inode_ac); | ||
1691 | if (data_ac) | ||
1692 | ocfs2_free_alloc_context(data_ac); | ||
1693 | if ((status < 0) && inode) | ||
1694 | iput(inode); | ||
1695 | |||
1696 | mlog_exit(status); | ||
1697 | |||
1698 | return status; | ||
1699 | } | ||
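Whether a symlink is "fast" (target stored inline in the dinode) or needs a data cluster is decided purely by length: l, which includes the NUL, is compared against ocfs2_fast_symlink_chars(sb). The dispatch in miniature, with the inline capacity as a parameter since its real value depends on the block size:

    #include <string.h>

    /* fast_chars models ocfs2_fast_symlink_chars(sb): the room left in
     * the dinode. Fast symlinks store the target (and its NUL) inline. */
    static int is_fast_symlink(const char *symname, size_t fast_chars)
    {
            return strlen(symname) + 1 <= fast_chars;
    }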
1700 | |||
1701 | int ocfs2_check_dir_entry(struct inode * dir, | ||
1702 | struct ocfs2_dir_entry * de, | ||
1703 | struct buffer_head * bh, | ||
1704 | unsigned long offset) | ||
1705 | { | ||
1706 | const char *error_msg = NULL; | ||
1707 | const int rlen = le16_to_cpu(de->rec_len); | ||
1708 | |||
1709 | if (rlen < OCFS2_DIR_REC_LEN(1)) | ||
1710 | error_msg = "rec_len is smaller than minimal"; | ||
1711 | else if (rlen % 4 != 0) | ||
1712 | error_msg = "rec_len % 4 != 0"; | ||
1713 | else if (rlen < OCFS2_DIR_REC_LEN(de->name_len)) | ||
1714 | error_msg = "rec_len is too small for name_len"; | ||
1715 | else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) | ||
1716 | error_msg = "directory entry across blocks"; | ||
1717 | |||
1718 | if (error_msg != NULL) | ||
1719 | mlog(ML_ERROR, "bad entry in directory #%"MLFu64": %s - " | ||
1720 | "offset=%lu, inode=%"MLFu64", rec_len=%d, name_len=%d\n", | ||
1721 | OCFS2_I(dir)->ip_blkno, error_msg, offset, | ||
1722 | le64_to_cpu(de->inode), rlen, de->name_len); | ||
1723 | return error_msg == NULL ? 1 : 0; | ||
1724 | } | ||
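The four checks above reject a corrupt rec_len that is below the minimum record size, unaligned, too small to cover its own name, or running past the end of the block. A userspace model, with the record-length macro reconstructed on the assumption of a 12-byte fixed header and 4-byte name padding (the ext2-style scheme):

    #include <stddef.h>

    /* Assumed layout: 8 (inode) + 2 (rec_len) + 1 (name_len) + 1 (file_type)
     * = 12 fixed bytes, with the name padded out to a 4-byte boundary. */
    #define DIR_MEMBER_LEN 12u
    #define DIR_REC_LEN(name_len) (((name_len) + DIR_MEMBER_LEN + 3u) & ~3u)

    static int dirent_ok(unsigned int rlen, unsigned int name_len,
                         size_t offset, size_t blocksize)
    {
            if (rlen < DIR_REC_LEN(1u))
                    return 0;               /* smaller than minimal    */
            if (rlen % 4 != 0)
                    return 0;               /* unaligned               */
            if (rlen < DIR_REC_LEN(name_len))
                    return 0;               /* too small for name_len  */
            if (offset + rlen > blocksize)
                    return 0;               /* entry crosses the block */
            return 1;
    }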
1725 | |||
1726 | /* we don't always have a dentry for what we want to add, so callers | ||
1727 | * like the orphan dir code can use this instead. | ||
1728 | * | ||
1729 | * If you pass me insert_bh, I'll skip the search of the other dir | ||
1730 | * blocks and put the record in there. | ||
1731 | */ | ||
1732 | static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle, | ||
1733 | struct inode *dir, | ||
1734 | const char *name, int namelen, | ||
1735 | struct inode *inode, u64 blkno, | ||
1736 | struct buffer_head *parent_fe_bh, | ||
1737 | struct buffer_head *insert_bh) | ||
1738 | { | ||
1739 | unsigned long offset; | ||
1740 | unsigned short rec_len; | ||
1741 | struct ocfs2_dir_entry *de, *de1; | ||
1742 | struct super_block *sb; | ||
1743 | int retval, status; | ||
1744 | |||
1745 | mlog_entry_void(); | ||
1746 | |||
1747 | sb = dir->i_sb; | ||
1748 | |||
1749 | if (!namelen) | ||
1750 | return -EINVAL; | ||
1751 | |||
1752 | rec_len = OCFS2_DIR_REC_LEN(namelen); | ||
1753 | offset = 0; | ||
1754 | de = (struct ocfs2_dir_entry *) insert_bh->b_data; | ||
1755 | while (1) { | ||
1756 | BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data); | ||
1757 | /* These checks should already have been passed by the | ||
1758 | * prepare function, but we keep them here as a | ||
1759 | * safety net anyway. */ | ||
1760 | if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) { | ||
1761 | retval = -ENOENT; | ||
1762 | goto bail; | ||
1763 | } | ||
1764 | if (ocfs2_match(namelen, name, de)) { | ||
1765 | retval = -EEXIST; | ||
1766 | goto bail; | ||
1767 | } | ||
1768 | if (((le64_to_cpu(de->inode) == 0) && | ||
1769 | (le16_to_cpu(de->rec_len) >= rec_len)) || | ||
1770 | (le16_to_cpu(de->rec_len) >= | ||
1771 | (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) { | ||
1772 | status = ocfs2_journal_access(handle, dir, insert_bh, | ||
1773 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1774 | /* By now the buffer is marked for journaling */ | ||
1775 | offset += le16_to_cpu(de->rec_len); | ||
1776 | if (le64_to_cpu(de->inode)) { | ||
1777 | de1 = (struct ocfs2_dir_entry *)((char *) de + | ||
1778 | OCFS2_DIR_REC_LEN(de->name_len)); | ||
1779 | de1->rec_len = | ||
1780 | cpu_to_le16(le16_to_cpu(de->rec_len) - | ||
1781 | OCFS2_DIR_REC_LEN(de->name_len)); | ||
1782 | de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); | ||
1783 | de = de1; | ||
1784 | } | ||
1785 | de->file_type = OCFS2_FT_UNKNOWN; | ||
1786 | if (blkno) { | ||
1787 | de->inode = cpu_to_le64(blkno); | ||
1788 | ocfs2_set_de_type(de, inode->i_mode); | ||
1789 | } else | ||
1790 | de->inode = 0; | ||
1791 | de->name_len = namelen; | ||
1792 | memcpy(de->name, name, namelen); | ||
1793 | |||
1794 | dir->i_mtime = dir->i_ctime = CURRENT_TIME; | ||
1795 | dir->i_version++; | ||
1796 | status = ocfs2_journal_dirty(handle, insert_bh); | ||
1797 | retval = 0; | ||
1798 | goto bail; | ||
1799 | } | ||
1800 | offset += le16_to_cpu(de->rec_len); | ||
1801 | de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len)); | ||
1802 | } | ||
1803 | |||
1804 | /* The BUG_ON above should prevent us from ever | ||
1805 | * getting here. */ | ||
1806 | retval = -ENOSPC; | ||
1807 | bail: | ||
1808 | |||
1809 | mlog_exit(retval); | ||
1810 | return retval; | ||
1811 | } | ||
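A record of `need` bytes fits a slot in two cases: the slot is free (inode == 0) and at least `need` bytes long, or it is live but padded enough that trimming the live entry to its minimal record length leaves `need` bytes behind. The split arithmetic, modeled with the same assumed DIR_REC_LEN as in the earlier sketch:

    #include <stdint.h>

    #define DIR_MEMBER_LEN 12u              /* as in the earlier sketch */
    #define DIR_REC_LEN(name_len) (((name_len) + DIR_MEMBER_LEN + 3u) & ~3u)

    /* Does a record of 'need' bytes fit this slot? On a live-entry split,
     * *live becomes the trimmed entry's rec_len and *tail the new hole. */
    static int slot_fits(uint64_t slot_inode, unsigned int slot_rec_len,
                         unsigned int slot_name_len, unsigned int need,
                         unsigned int *live, unsigned int *tail)
    {
            if (slot_inode == 0 && slot_rec_len >= need) {
                    *live = 0;                          /* reuse the whole hole */
                    *tail = slot_rec_len;
                    return 1;
            }
            if (slot_rec_len >= DIR_REC_LEN(slot_name_len) + need) {
                    *live = DIR_REC_LEN(slot_name_len); /* trim the live entry  */
                    *tail = slot_rec_len - *live;       /* remainder is ours    */
                    return 1;
            }
            return 0;
    }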
1812 | |||
1813 | |||
1814 | /* | ||
1815 | * ocfs2_delete_entry deletes a directory entry by merging it with the | ||
1816 | * previous entry | ||
1817 | */ | ||
1818 | static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle, | ||
1819 | struct inode *dir, | ||
1820 | struct ocfs2_dir_entry *de_del, | ||
1821 | struct buffer_head *bh) | ||
1822 | { | ||
1823 | struct ocfs2_dir_entry *de, *pde; | ||
1824 | int i, status = -ENOENT; | ||
1825 | |||
1826 | mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh); | ||
1827 | |||
1828 | i = 0; | ||
1829 | pde = NULL; | ||
1830 | de = (struct ocfs2_dir_entry *) bh->b_data; | ||
1831 | while (i < bh->b_size) { | ||
1832 | if (!ocfs2_check_dir_entry(dir, de, bh, i)) { | ||
1833 | status = -EIO; | ||
1834 | mlog_errno(status); | ||
1835 | goto bail; | ||
1836 | } | ||
1837 | if (de == de_del) { | ||
1838 | status = ocfs2_journal_access(handle, dir, bh, | ||
1839 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1840 | if (status < 0) { | ||
1841 | status = -EIO; | ||
1842 | mlog_errno(status); | ||
1843 | goto bail; | ||
1844 | } | ||
1845 | if (pde) | ||
1846 | pde->rec_len = | ||
1847 | cpu_to_le16(le16_to_cpu(pde->rec_len) + | ||
1848 | le16_to_cpu(de->rec_len)); | ||
1849 | else | ||
1850 | de->inode = 0; | ||
1851 | dir->i_version++; | ||
1852 | status = ocfs2_journal_dirty(handle, bh); | ||
1853 | goto bail; | ||
1854 | } | ||
1855 | i += le16_to_cpu(de->rec_len); | ||
1856 | pde = de; | ||
1857 | de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len)); | ||
1858 | } | ||
1859 | bail: | ||
1860 | mlog_exit(status); | ||
1861 | return status; | ||
1862 | } | ||
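Deletion never compacts a block: the victim's record length is folded into the previous entry's rec_len, or, for the first entry of a block, the inode field is simply zeroed, leaving a hole that __ocfs2_add_entry() can reuse. In miniature (endianness ignored; the kernel does the le16 arithmetic explicitly):

    #include <stdint.h>

    /* prev_rec_len is NULL when the victim is first in its block. */
    static void delete_dirent(uint16_t *prev_rec_len,
                              uint16_t victim_rec_len, uint64_t *victim_inode)
    {
            if (prev_rec_len)
                    *prev_rec_len += victim_rec_len; /* absorb the victim    */
            else
                    *victim_inode = 0;               /* first entry: free it */
    }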
1863 | |||
1864 | /* | ||
1865 | * Returns 0 if not found, -1 on failure, and 1 on success | ||
1866 | */ | ||
1867 | static inline int ocfs2_search_dirblock(struct buffer_head *bh, | ||
1868 | struct inode *dir, | ||
1869 | const char *name, int namelen, | ||
1870 | unsigned long offset, | ||
1871 | struct ocfs2_dir_entry **res_dir) | ||
1872 | { | ||
1873 | struct ocfs2_dir_entry *de; | ||
1874 | char *dlimit, *de_buf; | ||
1875 | int de_len; | ||
1876 | int ret = 0; | ||
1877 | |||
1878 | mlog_entry_void(); | ||
1879 | |||
1880 | de_buf = bh->b_data; | ||
1881 | dlimit = de_buf + dir->i_sb->s_blocksize; | ||
1882 | |||
1883 | while (de_buf < dlimit) { | ||
1884 | /* this code is executed quadratically often, so do | ||
1885 | * only minimal checking `by hand' */ | ||
1886 | |||
1887 | de = (struct ocfs2_dir_entry *) de_buf; | ||
1888 | |||
1889 | if (de_buf + namelen <= dlimit && | ||
1890 | ocfs2_match(namelen, name, de)) { | ||
1891 | /* found a match - just to be sure, do a full check */ | ||
1892 | if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { | ||
1893 | ret = -1; | ||
1894 | goto bail; | ||
1895 | } | ||
1896 | *res_dir = de; | ||
1897 | ret = 1; | ||
1898 | goto bail; | ||
1899 | } | ||
1900 | |||
1901 | /* prevent looping on a bad block */ | ||
1902 | de_len = le16_to_cpu(de->rec_len); | ||
1903 | if (de_len <= 0) { | ||
1904 | ret = -1; | ||
1905 | goto bail; | ||
1906 | } | ||
1907 | |||
1908 | de_buf += de_len; | ||
1909 | offset += de_len; | ||
1910 | } | ||
1911 | |||
1912 | bail: | ||
1913 | mlog_exit(ret); | ||
1914 | return ret; | ||
1915 | } | ||
1916 | |||
1917 | struct buffer_head *ocfs2_find_entry(const char *name, int namelen, | ||
1918 | struct inode *dir, | ||
1919 | struct ocfs2_dir_entry **res_dir) | ||
1920 | { | ||
1921 | struct super_block *sb; | ||
1922 | struct buffer_head *bh_use[NAMEI_RA_SIZE]; | ||
1923 | struct buffer_head *bh, *ret = NULL; | ||
1924 | unsigned long start, block, b; | ||
1925 | int ra_max = 0; /* Number of bh's in the readahead | ||
1926 | buffer, bh_use[] */ | ||
1927 | int ra_ptr = 0; /* Current index into readahead | ||
1928 | buffer */ | ||
1929 | int num = 0; | ||
1930 | int nblocks, i, err; | ||
1931 | |||
1932 | mlog_entry_void(); | ||
1933 | |||
1934 | *res_dir = NULL; | ||
1935 | sb = dir->i_sb; | ||
1936 | |||
1937 | nblocks = i_size_read(dir) >> sb->s_blocksize_bits; | ||
1938 | start = OCFS2_I(dir)->ip_dir_start_lookup; | ||
1939 | if (start >= nblocks) | ||
1940 | start = 0; | ||
1941 | block = start; | ||
1942 | |||
1943 | restart: | ||
1944 | do { | ||
1945 | /* | ||
1946 | * We deal with the read-ahead logic here. | ||
1947 | */ | ||
1948 | if (ra_ptr >= ra_max) { | ||
1949 | /* Refill the readahead buffer */ | ||
1950 | ra_ptr = 0; | ||
1951 | b = block; | ||
1952 | for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { | ||
1953 | /* | ||
1954 | * Terminate if we reach the end of the | ||
1955 | * directory and must wrap, or if our | ||
1956 | * search has finished at this block. | ||
1957 | */ | ||
1958 | if (b >= nblocks || (num && block == start)) { | ||
1959 | bh_use[ra_max] = NULL; | ||
1960 | break; | ||
1961 | } | ||
1962 | num++; | ||
1963 | |||
1964 | /* XXX: questionable readahead stuff here */ | ||
1965 | bh = ocfs2_bread(dir, b++, &err, 1); | ||
1966 | bh_use[ra_max] = bh; | ||
1967 | #if 0 // ??? | ||
1968 | if (bh) | ||
1969 | ll_rw_block(READ, 1, &bh); | ||
1970 | #endif | ||
1971 | } | ||
1972 | } | ||
1973 | if ((bh = bh_use[ra_ptr++]) == NULL) | ||
1974 | goto next; | ||
1975 | wait_on_buffer(bh); | ||
1976 | if (!buffer_uptodate(bh)) { | ||
1977 | /* read error, skip block & hope for the best */ | ||
1978 | brelse(bh); | ||
1979 | goto next; | ||
1980 | } | ||
1981 | i = ocfs2_search_dirblock(bh, dir, name, namelen, | ||
1982 | block << sb->s_blocksize_bits, | ||
1983 | res_dir); | ||
1984 | if (i == 1) { | ||
1985 | OCFS2_I(dir)->ip_dir_start_lookup = block; | ||
1986 | ret = bh; | ||
1987 | goto cleanup_and_exit; | ||
1988 | } else { | ||
1989 | brelse(bh); | ||
1990 | if (i < 0) | ||
1991 | goto cleanup_and_exit; | ||
1992 | } | ||
1993 | next: | ||
1994 | if (++block >= nblocks) | ||
1995 | block = 0; | ||
1996 | } while (block != start); | ||
1997 | |||
1998 | /* | ||
1999 | * If the directory has grown while we were searching, then | ||
2000 | * search the last part of the directory before giving up. | ||
2001 | */ | ||
2002 | block = nblocks; | ||
2003 | nblocks = i_size_read(dir) >> sb->s_blocksize_bits; | ||
2004 | if (block < nblocks) { | ||
2005 | start = 0; | ||
2006 | goto restart; | ||
2007 | } | ||
2008 | |||
2009 | cleanup_and_exit: | ||
2010 | /* Clean up the read-ahead blocks */ | ||
2011 | for (; ra_ptr < ra_max; ra_ptr++) | ||
2012 | brelse(bh_use[ra_ptr]); | ||
2013 | |||
2014 | mlog_exit_ptr(ret); | ||
2015 | return ret; | ||
2016 | } | ||
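The search order above starts at the cached per-inode hint (ip_dir_start_lookup), wraps past the last block to block 0, and stops once it returns to where it began, so a hot directory is usually hit on the first probe. The traversal skeleton, minus the readahead (a sketch; probe() stands in for reading and scanning one block):

    /* probe() returns nonzero when the name is found in block blk. */
    static long find_block(long start, long nblocks,
                           int (*probe)(long blk, void *arg), void *arg)
    {
            long blk, first;

            if (nblocks <= 0)
                    return -1;
            if (start < 0 || start >= nblocks)
                    start = 0;              /* stale hint: restart at 0 */
            blk = first = start;
            do {
                    if (probe(blk, arg))
                            return blk;     /* caller saves this as the hint */
                    if (++blk >= nblocks)
                            blk = 0;        /* wrap */
            } while (blk != first);
            return -1;
    }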
2017 | |||
2018 | static int ocfs2_blkno_stringify(u64 blkno, char *name) | ||
2019 | { | ||
2020 | int status, namelen; | ||
2021 | |||
2022 | mlog_entry_void(); | ||
2023 | |||
2024 | namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016"MLFx64, | ||
2025 | blkno); | ||
2026 | if (namelen <= 0) { | ||
2027 | if (namelen) | ||
2028 | status = namelen; | ||
2029 | else | ||
2030 | status = -EINVAL; | ||
2031 | mlog_errno(status); | ||
2032 | goto bail; | ||
2033 | } | ||
2034 | if (namelen != OCFS2_ORPHAN_NAMELEN) { | ||
2035 | status = -EINVAL; | ||
2036 | mlog_errno(status); | ||
2037 | goto bail; | ||
2038 | } | ||
2039 | |||
2040 | mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name, | ||
2041 | namelen); | ||
2042 | |||
2043 | status = 0; | ||
2044 | bail: | ||
2045 | mlog_exit(status); | ||
2046 | return status; | ||
2047 | } | ||
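Orphan entries are named after the inode's block number, zero-padded to exactly 16 hex digits, which is why OCFS2_ORPHAN_NAMELEN is 16 and why any node can reconstruct the name from the blkno alone during recovery. For example (the blkno value is hypothetical):

    #include <assert.h>
    #include <inttypes.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char name[16 + 1];              /* OCFS2_ORPHAN_NAMELEN + 1 */
            uint64_t blkno = 0x1d30f;       /* hypothetical ip_blkno    */
            int n = snprintf(name, sizeof(name), "%016" PRIx64, blkno);

            assert(n == 16);                /* always exactly 16 chars  */
            assert(strcmp(name, "000000000001d30f") == 0);
            return 0;
    }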
2048 | |||
2049 | static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, | ||
2050 | struct ocfs2_journal_handle *handle, | ||
2051 | struct inode *inode, | ||
2052 | char *name, | ||
2053 | struct buffer_head **de_bh) | ||
2054 | { | ||
2055 | struct inode *orphan_dir_inode = NULL; | ||
2056 | struct buffer_head *orphan_dir_bh = NULL; | ||
2057 | int status = 0; | ||
2058 | |||
2059 | status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); | ||
2060 | if (status < 0) { | ||
2061 | mlog_errno(status); | ||
2062 | goto leave; | ||
2063 | } | ||
2064 | |||
2065 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, | ||
2066 | ORPHAN_DIR_SYSTEM_INODE, | ||
2067 | osb->slot_num); | ||
2068 | if (!orphan_dir_inode) { | ||
2069 | status = -ENOENT; | ||
2070 | mlog_errno(status); | ||
2071 | goto leave; | ||
2072 | } | ||
2073 | |||
2074 | ocfs2_handle_add_inode(handle, orphan_dir_inode); | ||
2075 | status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1); | ||
2076 | if (status < 0) { | ||
2077 | mlog_errno(status); | ||
2078 | goto leave; | ||
2079 | } | ||
2080 | |||
2081 | status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, | ||
2082 | orphan_dir_bh, name, | ||
2083 | OCFS2_ORPHAN_NAMELEN, de_bh); | ||
2084 | if (status < 0) { | ||
2085 | mlog_errno(status); | ||
2086 | goto leave; | ||
2087 | } | ||
2088 | |||
2089 | leave: | ||
2090 | if (orphan_dir_inode) | ||
2091 | iput(orphan_dir_inode); | ||
2092 | |||
2093 | if (orphan_dir_bh) | ||
2094 | brelse(orphan_dir_bh); | ||
2095 | |||
2096 | mlog_exit(status); | ||
2097 | return status; | ||
2098 | } | ||
2099 | |||
2100 | static int ocfs2_orphan_add(struct ocfs2_super *osb, | ||
2101 | struct ocfs2_journal_handle *handle, | ||
2102 | struct inode *inode, | ||
2103 | struct ocfs2_dinode *fe, | ||
2104 | char *name, | ||
2105 | struct buffer_head *de_bh) | ||
2106 | { | ||
2107 | struct inode *orphan_dir_inode = NULL; | ||
2108 | struct buffer_head *orphan_dir_bh = NULL; | ||
2109 | int status = 0; | ||
2110 | struct ocfs2_dinode *orphan_fe; | ||
2111 | |||
2112 | mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); | ||
2113 | |||
2114 | orphan_dir_inode = ocfs2_get_system_file_inode(osb, | ||
2115 | ORPHAN_DIR_SYSTEM_INODE, | ||
2116 | osb->slot_num); | ||
2117 | if (!orphan_dir_inode) { | ||
2118 | status = -ENOENT; | ||
2119 | mlog_errno(status); | ||
2120 | goto leave; | ||
2121 | } | ||
2122 | |||
2123 | status = ocfs2_read_block(osb, | ||
2124 | OCFS2_I(orphan_dir_inode)->ip_blkno, | ||
2125 | &orphan_dir_bh, OCFS2_BH_CACHED, | ||
2126 | orphan_dir_inode); | ||
2127 | if (status < 0) { | ||
2128 | mlog_errno(status); | ||
2129 | goto leave; | ||
2130 | } | ||
2131 | |||
2132 | status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh, | ||
2133 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2134 | if (status < 0) { | ||
2135 | mlog_errno(status); | ||
2136 | goto leave; | ||
2137 | } | ||
2138 | |||
2139 | /* we're a cluster filesystem, and nlink can change on disk from | ||
2140 | * underneath us... */ | ||
2141 | orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; | ||
2142 | if (S_ISDIR(inode->i_mode)) | ||
2143 | le16_add_cpu(&orphan_fe->i_links_count, 1); | ||
2144 | orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); | ||
2145 | |||
2146 | status = ocfs2_journal_dirty(handle, orphan_dir_bh); | ||
2147 | if (status < 0) { | ||
2148 | mlog_errno(status); | ||
2149 | goto leave; | ||
2150 | } | ||
2151 | |||
2152 | status = __ocfs2_add_entry(handle, orphan_dir_inode, name, | ||
2153 | OCFS2_ORPHAN_NAMELEN, inode, | ||
2154 | OCFS2_I(inode)->ip_blkno, | ||
2155 | orphan_dir_bh, de_bh); | ||
2156 | if (status < 0) { | ||
2157 | mlog_errno(status); | ||
2158 | goto leave; | ||
2159 | } | ||
2160 | |||
2161 | le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); | ||
2162 | |||
2163 | /* Record which orphan dir our inode now resides | ||
2164 | * in. delete_inode will use this to determine which orphan | ||
2165 | * dir to lock. */ | ||
2166 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
2167 | OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num; | ||
2168 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
2169 | |||
2170 | mlog(0, "Inode %"MLFu64" orphaned in slot %d\n", | ||
2171 | OCFS2_I(inode)->ip_blkno, osb->slot_num); | ||
2172 | |||
2173 | leave: | ||
2174 | if (orphan_dir_inode) | ||
2175 | iput(orphan_dir_inode); | ||
2176 | |||
2177 | if (orphan_dir_bh) | ||
2178 | brelse(orphan_dir_bh); | ||
2179 | |||
2180 | mlog_exit(status); | ||
2181 | return status; | ||
2182 | } | ||
2183 | |||
2184 | /* unlike orphan_add, we expect the orphan dir to already be locked here. */ | ||
2185 | int ocfs2_orphan_del(struct ocfs2_super *osb, | ||
2186 | struct ocfs2_journal_handle *handle, | ||
2187 | struct inode *orphan_dir_inode, | ||
2188 | struct inode *inode, | ||
2189 | struct buffer_head *orphan_dir_bh) | ||
2190 | { | ||
2191 | char name[OCFS2_ORPHAN_NAMELEN + 1]; | ||
2192 | struct ocfs2_dinode *orphan_fe; | ||
2193 | int status = 0; | ||
2194 | struct buffer_head *target_de_bh = NULL; | ||
2195 | struct ocfs2_dir_entry *target_de = NULL; | ||
2196 | |||
2197 | mlog_entry_void(); | ||
2198 | |||
2199 | status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); | ||
2200 | if (status < 0) { | ||
2201 | mlog_errno(status); | ||
2202 | goto leave; | ||
2203 | } | ||
2204 | |||
2205 | mlog(0, "removing '%s' from orphan dir %"MLFu64" (namelen=%d)\n", | ||
2206 | name, OCFS2_I(orphan_dir_inode)->ip_blkno, OCFS2_ORPHAN_NAMELEN); | ||
2207 | |||
2208 | /* find its spot in the orphan directory */ | ||
2209 | target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, | ||
2210 | orphan_dir_inode, &target_de); | ||
2211 | if (!target_de_bh) { | ||
2212 | status = -ENOENT; | ||
2213 | mlog_errno(status); | ||
2214 | goto leave; | ||
2215 | } | ||
2216 | |||
2217 | /* remove it from the orphan directory */ | ||
2218 | status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de, | ||
2219 | target_de_bh); | ||
2220 | if (status < 0) { | ||
2221 | mlog_errno(status); | ||
2222 | goto leave; | ||
2223 | } | ||
2224 | |||
2225 | status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh, | ||
2226 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2227 | if (status < 0) { | ||
2228 | mlog_errno(status); | ||
2229 | goto leave; | ||
2230 | } | ||
2231 | |||
2232 | /* do the i_nlink dance! :) */ | ||
2233 | orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; | ||
2234 | if (S_ISDIR(inode->i_mode)) | ||
2235 | le16_add_cpu(&orphan_fe->i_links_count, -1); | ||
2236 | orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); | ||
2237 | |||
2238 | status = ocfs2_journal_dirty(handle, orphan_dir_bh); | ||
2239 | if (status < 0) { | ||
2240 | mlog_errno(status); | ||
2241 | goto leave; | ||
2242 | } | ||
2243 | |||
2244 | leave: | ||
2245 | if (target_de_bh) | ||
2246 | brelse(target_de_bh); | ||
2247 | |||
2248 | mlog_exit(status); | ||
2249 | return status; | ||
2250 | } | ||
2251 | |||
2252 | struct inode_operations ocfs2_dir_iops = { | ||
2253 | .create = ocfs2_create, | ||
2254 | .lookup = ocfs2_lookup, | ||
2255 | .link = ocfs2_link, | ||
2256 | .unlink = ocfs2_unlink, | ||
2257 | .rmdir = ocfs2_unlink, | ||
2258 | .symlink = ocfs2_symlink, | ||
2259 | .mkdir = ocfs2_mkdir, | ||
2260 | .mknod = ocfs2_mknod, | ||
2261 | .rename = ocfs2_rename, | ||
2262 | .setattr = ocfs2_setattr, | ||
2263 | .getattr = ocfs2_getattr, | ||
2264 | }; | ||
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
new file mode 100644
index 000000000000..deaaa97dbf0b
--- /dev/null
+++ b/fs/ocfs2/namei.h
@@ -0,0 +1,58 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * namei.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_NAMEI_H | ||
27 | #define OCFS2_NAMEI_H | ||
28 | |||
29 | extern struct inode_operations ocfs2_dir_iops; | ||
30 | |||
31 | struct dentry *ocfs2_get_parent(struct dentry *child); | ||
32 | |||
33 | int ocfs2_check_dir_entry (struct inode *dir, | ||
34 | struct ocfs2_dir_entry *de, | ||
35 | struct buffer_head *bh, | ||
36 | unsigned long offset); | ||
37 | struct buffer_head *ocfs2_find_entry(const char *name, | ||
38 | int namelen, | ||
39 | struct inode *dir, | ||
40 | struct ocfs2_dir_entry **res_dir); | ||
41 | int ocfs2_orphan_del(struct ocfs2_super *osb, | ||
42 | struct ocfs2_journal_handle *handle, | ||
43 | struct inode *orphan_dir_inode, | ||
44 | struct inode *inode, | ||
45 | struct buffer_head *orphan_dir_bh); | ||
46 | |||
47 | static inline int ocfs2_match(int len, | ||
48 | const char * const name, | ||
49 | struct ocfs2_dir_entry *de) | ||
50 | { | ||
51 | if (len != de->name_len) | ||
52 | return 0; | ||
53 | if (!de->inode) | ||
54 | return 0; | ||
55 | return !memcmp(name, de->name, len); | ||
56 | } | ||
57 | |||
58 | #endif /* OCFS2_NAMEI_H */ | ||
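ocfs2_match() only compares one candidate entry; walking a directory block is driven by each entry's rec_len. A hedged sketch of that iteration (illustrative only; the real scan is inside ocfs2_find_entry(), whose body is not part of this hunk, and OCFS2_DIR_REC_LEN() comes from ocfs2_fs.h below):

	static struct ocfs2_dir_entry *example_scan_block(struct buffer_head *bh,
							  struct inode *dir,
							  const char *name,
							  int namelen)
	{
		char *p = bh->b_data;
		char *end = bh->b_data + dir->i_sb->s_blocksize;

		while (p < end) {
			struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)p;

			if (ocfs2_match(namelen, name, de))
				return de;	/* live, matching entry */
			if (le16_to_cpu(de->rec_len) < OCFS2_DIR_REC_LEN(1))
				return NULL;	/* corrupt entry; stop */
			p += le16_to_cpu(de->rec_len);
		}
		return NULL;
	}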
diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h new file mode 100644 index 000000000000..0b499bccec5a --- /dev/null +++ b/fs/ocfs2/ocfs1_fs_compat.h | |||
@@ -0,0 +1,109 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ocfs1_fs_compat.h | ||
5 | * | ||
6 | * OCFS1 volume header definitions. OCFS2 creates valid but unmountable | ||
7 | * OCFS1 volume headers on the first two sectors of an OCFS2 volume. | ||
8 | * This allows an OCFS1 volume to see the partition and cleanly fail to | ||
9 | * mount it. | ||
10 | * | ||
11 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
12 | * | ||
13 | * This program is free software; you can redistribute it and/or | ||
14 | * modify it under the terms of the GNU General Public | ||
15 | * License, version 2, as published by the Free Software Foundation. | ||
16 | * | ||
17 | * This program is distributed in the hope that it will be useful, | ||
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
20 | * General Public License for more details. | ||
21 | * | ||
22 | * You should have received a copy of the GNU General Public | ||
23 | * License along with this program; if not, write to the | ||
24 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
25 | * Boston, MA 021110-1307, USA. | ||
26 | */ | ||
27 | |||
28 | #ifndef _OCFS1_FS_COMPAT_H | ||
29 | #define _OCFS1_FS_COMPAT_H | ||
30 | |||
31 | #define OCFS1_MAX_VOL_SIGNATURE_LEN 128 | ||
32 | #define OCFS1_MAX_MOUNT_POINT_LEN 128 | ||
33 | #define OCFS1_MAX_VOL_ID_LENGTH 16 | ||
34 | #define OCFS1_MAX_VOL_LABEL_LEN 64 | ||
35 | #define OCFS1_MAX_CLUSTER_NAME_LEN 64 | ||
36 | |||
37 | #define OCFS1_MAJOR_VERSION (2) | ||
38 | #define OCFS1_MINOR_VERSION (0) | ||
39 | #define OCFS1_VOLUME_SIGNATURE "OracleCFS" | ||
40 | |||
41 | /* | ||
42 | * OCFS1 superblock. Lives at sector 0. | ||
43 | */ | ||
44 | struct ocfs1_vol_disk_hdr | ||
45 | { | ||
46 | /*00*/ __u32 minor_version; | ||
47 | __u32 major_version; | ||
48 | /*08*/ __u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN]; | ||
49 | /*88*/ __u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN]; | ||
50 | /*108*/ __u64 serial_num; | ||
51 | /*110*/ __u64 device_size; | ||
52 | __u64 start_off; | ||
53 | /*120*/ __u64 bitmap_off; | ||
54 | __u64 publ_off; | ||
55 | /*130*/ __u64 vote_off; | ||
56 | __u64 root_bitmap_off; | ||
57 | /*140*/ __u64 data_start_off; | ||
58 | __u64 root_bitmap_size; | ||
59 | /*150*/ __u64 root_off; | ||
60 | __u64 root_size; | ||
61 | /*160*/ __u64 cluster_size; | ||
62 | __u64 num_nodes; | ||
63 | /*170*/ __u64 num_clusters; | ||
64 | __u64 dir_node_size; | ||
65 | /*180*/ __u64 file_node_size; | ||
66 | __u64 internal_off; | ||
67 | /*190*/ __u64 node_cfg_off; | ||
68 | __u64 node_cfg_size; | ||
69 | /*1A0*/ __u64 new_cfg_off; | ||
70 | __u32 prot_bits; | ||
71 | __s32 excl_mount; | ||
72 | /*1B0*/ | ||
73 | }; | ||
74 | |||
75 | |||
76 | struct ocfs1_disk_lock | ||
77 | { | ||
78 | /*00*/ __u32 curr_master; | ||
79 | __u8 file_lock; | ||
80 | __u8 compat_pad[3]; /* Not in original definition. Used to | ||
81 | make the already existing alignment | ||
82 | explicit */ | ||
83 | __u64 last_write_time; | ||
84 | /*10*/ __u64 last_read_time; | ||
85 | __u32 writer_node_num; | ||
86 | __u32 reader_node_num; | ||
87 | /*20*/ __u64 oin_node_map; | ||
88 | __u64 dlock_seq_num; | ||
89 | /*30*/ | ||
90 | }; | ||
91 | |||
92 | /* | ||
93 | * OCFS1 volume label. Lives at sector 1. | ||
94 | */ | ||
95 | struct ocfs1_vol_label | ||
96 | { | ||
97 | /*00*/ struct ocfs1_disk_lock disk_lock; | ||
98 | /*30*/ __u8 label[OCFS1_MAX_VOL_LABEL_LEN]; | ||
99 | /*70*/ __u16 label_len; | ||
100 | /*72*/ __u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH]; | ||
101 | /*82*/ __u16 vol_id_len; | ||
102 | /*84*/ __u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN]; | ||
103 | /*A4*/ __u16 cluster_name_len; | ||
104 | /*A6*/ | ||
105 | }; | ||
106 | |||
107 | |||
108 | #endif /* _OCFS1_FS_COMPAT_H */ | ||
109 | |||
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h new file mode 100644 index 000000000000..f468c600cf92 --- /dev/null +++ b/fs/ocfs2/ocfs2.h | |||
@@ -0,0 +1,464 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ocfs2.h | ||
5 | * | ||
6 | * Defines macros and structures used in OCFS2 | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_H | ||
27 | #define OCFS2_H | ||
28 | |||
29 | #include <linux/spinlock.h> | ||
30 | #include <linux/sched.h> | ||
31 | #include <linux/wait.h> | ||
32 | #include <linux/list.h> | ||
33 | #include <linux/rbtree.h> | ||
34 | #include <linux/workqueue.h> | ||
35 | #include <linux/kref.h> | ||
36 | |||
37 | #include "cluster/nodemanager.h" | ||
38 | #include "cluster/heartbeat.h" | ||
39 | #include "cluster/tcp.h" | ||
40 | |||
41 | #include "dlm/dlmapi.h" | ||
42 | |||
43 | #include "ocfs2_fs.h" | ||
44 | #include "endian.h" | ||
45 | #include "ocfs2_lockid.h" | ||
46 | |||
47 | struct ocfs2_extent_map { | ||
48 | u32 em_clusters; | ||
49 | struct rb_root em_extents; | ||
50 | }; | ||
51 | |||
52 | /* Most user visible OCFS2 inodes will have very few pieces of | ||
53 | * metadata, but larger files (including bitmaps, etc) must be taken | ||
54 | * into account when designing an access scheme. We allow a small | ||
55 | * number of inlined blocks to be stored in an array and grow the | ||
56 | * structure into an rb tree when necessary. */ | ||
57 | #define OCFS2_INODE_MAX_CACHE_ARRAY 2 | ||
58 | |||
59 | struct ocfs2_caching_info { | ||
60 | unsigned int ci_num_cached; | ||
61 | union { | ||
62 | sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY]; | ||
63 | struct rb_root ci_tree; | ||
64 | } ci_cache; | ||
65 | }; | ||
66 | |||
67 | /* this limits us to 256 nodes; | ||
68 | * if we need more, we can kmalloc the map */ | ||
69 | #define OCFS2_NODE_MAP_MAX_NODES 256 | ||
70 | struct ocfs2_node_map { | ||
71 | u16 num_nodes; | ||
72 | unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)]; | ||
73 | }; | ||
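Cluster events can update these maps concurrently, so ocfs2_super (below) carries a dedicated node_map_lock and the bitmap helpers take it around each bit operation. A minimal sketch of a set helper, assuming the real ocfs2_node_map_set_bit() in heartbeat.c follows this shape:

	static void example_node_map_set_bit(struct ocfs2_super *osb,
					     struct ocfs2_node_map *map,
					     int bit)
	{
		spin_lock(&osb->node_map_lock);
		set_bit(bit, map->map);
		spin_unlock(&osb->node_map_lock);
	}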
74 | |||
75 | enum ocfs2_ast_action { | ||
76 | OCFS2_AST_INVALID = 0, | ||
77 | OCFS2_AST_ATTACH, | ||
78 | OCFS2_AST_CONVERT, | ||
79 | OCFS2_AST_DOWNCONVERT, | ||
80 | }; | ||
81 | |||
82 | /* actions for an unlockast function to take. */ | ||
83 | enum ocfs2_unlock_action { | ||
84 | OCFS2_UNLOCK_INVALID = 0, | ||
85 | OCFS2_UNLOCK_CANCEL_CONVERT, | ||
86 | OCFS2_UNLOCK_DROP_LOCK, | ||
87 | }; | ||
88 | |||
89 | /* ocfs2_lock_res->l_flags flags. */ | ||
90 | #define OCFS2_LOCK_ATTACHED (0x00000001) /* have we initialized | ||
91 | * the lvb */ | ||
92 | #define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in | ||
93 | * dlm_lock */ | ||
94 | #define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to | ||
95 | * downconvert */ | ||
96 | #define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */ | ||
97 | #define OCFS2_LOCK_NEEDS_REFRESH (0x00000010) | ||
98 | #define OCFS2_LOCK_REFRESHING (0x00000020) | ||
99 | #define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization | ||
100 | * for shutdown paths */ | ||
101 | #define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track | ||
102 | * when to skip queueing | ||
103 | * a lock because it's | ||
104 | * about to be | ||
105 | * dropped. */ | ||
106 | #define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ | ||
107 | |||
108 | struct ocfs2_lock_res_ops; | ||
109 | |||
110 | typedef void (*ocfs2_lock_callback)(int status, unsigned long data); | ||
111 | |||
112 | struct ocfs2_lock_res { | ||
113 | void *l_priv; | ||
114 | struct ocfs2_lock_res_ops *l_ops; | ||
115 | spinlock_t l_lock; | ||
116 | |||
117 | struct list_head l_blocked_list; | ||
118 | struct list_head l_mask_waiters; | ||
119 | |||
120 | enum ocfs2_lock_type l_type; | ||
121 | unsigned long l_flags; | ||
122 | char l_name[OCFS2_LOCK_ID_MAX_LEN]; | ||
123 | int l_level; | ||
124 | unsigned int l_ro_holders; | ||
125 | unsigned int l_ex_holders; | ||
126 | struct dlm_lockstatus l_lksb; | ||
127 | |||
128 | /* used from AST/BAST funcs. */ | ||
129 | enum ocfs2_ast_action l_action; | ||
130 | enum ocfs2_unlock_action l_unlock_action; | ||
131 | int l_requested; | ||
132 | int l_blocking; | ||
133 | |||
134 | wait_queue_head_t l_event; | ||
135 | |||
136 | struct list_head l_debug_list; | ||
137 | }; | ||
138 | |||
139 | struct ocfs2_dlm_debug { | ||
140 | struct kref d_refcnt; | ||
141 | struct dentry *d_locking_state; | ||
142 | struct list_head d_lockres_tracking; | ||
143 | }; | ||
144 | |||
145 | enum ocfs2_vol_state | ||
146 | { | ||
147 | VOLUME_INIT = 0, | ||
148 | VOLUME_MOUNTED, | ||
149 | VOLUME_DISMOUNTED, | ||
150 | VOLUME_DISABLED | ||
151 | }; | ||
152 | |||
153 | struct ocfs2_alloc_stats | ||
154 | { | ||
155 | atomic_t moves; | ||
156 | atomic_t local_data; | ||
157 | atomic_t bitmap_data; | ||
158 | atomic_t bg_allocs; | ||
159 | atomic_t bg_extends; | ||
160 | }; | ||
161 | |||
162 | enum ocfs2_local_alloc_state | ||
163 | { | ||
164 | OCFS2_LA_UNUSED = 0, | ||
165 | OCFS2_LA_ENABLED, | ||
166 | OCFS2_LA_DISABLED | ||
167 | }; | ||
168 | |||
169 | enum ocfs2_mount_options | ||
170 | { | ||
171 | OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */ | ||
172 | OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ | ||
173 | OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ | ||
174 | OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ | ||
175 | OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ | ||
176 | #ifdef OCFS2_ORACORE_WORKAROUNDS | ||
177 | OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */ | ||
178 | #endif | ||
179 | }; | ||
180 | |||
181 | #define OCFS2_OSB_SOFT_RO 0x0001 | ||
182 | #define OCFS2_OSB_HARD_RO 0x0002 | ||
183 | #define OCFS2_OSB_ERROR_FS 0x0004 | ||
184 | |||
185 | struct ocfs2_journal; | ||
186 | struct ocfs2_journal_handle; | ||
187 | struct ocfs2_super | ||
188 | { | ||
189 | u32 osb_id; /* id used by the proc interface */ | ||
190 | struct task_struct *commit_task; | ||
191 | struct super_block *sb; | ||
192 | struct inode *root_inode; | ||
193 | struct inode *sys_root_inode; | ||
194 | struct inode *system_inodes[NUM_SYSTEM_INODES]; | ||
195 | |||
196 | struct ocfs2_slot_info *slot_info; | ||
197 | |||
198 | spinlock_t node_map_lock; | ||
199 | struct ocfs2_node_map mounted_map; | ||
200 | struct ocfs2_node_map recovery_map; | ||
201 | struct ocfs2_node_map umount_map; | ||
202 | |||
203 | u32 num_clusters; | ||
204 | u64 root_blkno; | ||
205 | u64 system_dir_blkno; | ||
206 | u64 bitmap_blkno; | ||
207 | u32 bitmap_cpg; | ||
208 | u8 *uuid; | ||
209 | char *uuid_str; | ||
210 | u8 *vol_label; | ||
211 | u64 first_cluster_group_blkno; | ||
212 | u32 fs_generation; | ||
213 | |||
214 | u32 s_feature_compat; | ||
215 | u32 s_feature_incompat; | ||
216 | u32 s_feature_ro_compat; | ||
217 | |||
218 | /* Protects s_next_generation, osb_flags. Could protect more on | ||
219 | * osb as it's very short lived. */ | ||
220 | spinlock_t osb_lock; | ||
221 | u32 s_next_generation; | ||
222 | unsigned long osb_flags; | ||
223 | |||
224 | unsigned long s_mount_opt; | ||
225 | |||
226 | u16 max_slots; | ||
227 | u16 num_nodes; | ||
228 | s16 node_num; | ||
229 | s16 slot_num; | ||
230 | int s_sectsize_bits; | ||
231 | int s_clustersize; | ||
232 | int s_clustersize_bits; | ||
233 | struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */ | ||
234 | |||
235 | atomic_t vol_state; | ||
236 | struct semaphore recovery_lock; | ||
237 | struct task_struct *recovery_thread_task; | ||
238 | int disable_recovery; | ||
239 | wait_queue_head_t checkpoint_event; | ||
240 | atomic_t needs_checkpoint; | ||
241 | struct ocfs2_journal *journal; | ||
242 | |||
243 | enum ocfs2_local_alloc_state local_alloc_state; | ||
244 | struct buffer_head *local_alloc_bh; | ||
245 | |||
246 | /* Next two fields are for local node slot recovery during | ||
247 | * mount. */ | ||
248 | int dirty; | ||
249 | struct ocfs2_dinode *local_alloc_copy; | ||
250 | |||
251 | struct ocfs2_alloc_stats alloc_stats; | ||
252 | char dev_str[20]; /* "major,minor" of the device */ | ||
253 | |||
254 | struct dlm_ctxt *dlm; | ||
255 | struct ocfs2_lock_res osb_super_lockres; | ||
256 | struct ocfs2_lock_res osb_rename_lockres; | ||
257 | struct dlm_eviction_cb osb_eviction_cb; | ||
258 | struct ocfs2_dlm_debug *osb_dlm_debug; | ||
259 | |||
260 | struct dentry *osb_debug_root; | ||
261 | |||
262 | wait_queue_head_t recovery_event; | ||
263 | |||
264 | spinlock_t vote_task_lock; | ||
265 | struct task_struct *vote_task; | ||
266 | wait_queue_head_t vote_event; | ||
267 | unsigned long vote_wake_sequence; | ||
268 | unsigned long vote_work_sequence; | ||
269 | |||
270 | struct list_head blocked_lock_list; | ||
271 | unsigned long blocked_lock_count; | ||
272 | |||
273 | struct list_head vote_list; | ||
274 | int vote_count; | ||
275 | |||
276 | u32 net_key; | ||
277 | spinlock_t net_response_lock; | ||
278 | unsigned int net_response_ids; | ||
279 | struct list_head net_response_list; | ||
280 | |||
281 | struct o2hb_callback_func osb_hb_up; | ||
282 | struct o2hb_callback_func osb_hb_down; | ||
283 | |||
284 | struct list_head osb_net_handlers; | ||
285 | |||
286 | wait_queue_head_t osb_mount_event; | ||
287 | |||
288 | /* Truncate log info */ | ||
289 | struct inode *osb_tl_inode; | ||
290 | struct buffer_head *osb_tl_bh; | ||
291 | struct work_struct osb_truncate_log_wq; | ||
292 | }; | ||
293 | |||
294 | #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) | ||
295 | #define OCFS2_MAX_OSB_ID 65536 | ||
296 | |||
297 | static inline int ocfs2_should_order_data(struct inode *inode) | ||
298 | { | ||
299 | if (!S_ISREG(inode->i_mode)) | ||
300 | return 0; | ||
301 | if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) | ||
302 | return 0; | ||
303 | return 1; | ||
304 | } | ||
305 | |||
306 | /* set / clear functions because cluster events can make these happen | ||
307 | * in parallel so we want the transitions to be atomic. this also | ||
308 | * means that any future flags osb_flags must be protected by spinlock | ||
309 | * too! */ | ||
310 | static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, | ||
311 | unsigned long flag) | ||
312 | { | ||
313 | spin_lock(&osb->osb_lock); | ||
314 | osb->osb_flags |= flag; | ||
315 | spin_unlock(&osb->osb_lock); | ||
316 | } | ||
317 | |||
318 | static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, | ||
319 | int hard) | ||
320 | { | ||
321 | spin_lock(&osb->osb_lock); | ||
322 | osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO); | ||
323 | if (hard) | ||
324 | osb->osb_flags |= OCFS2_OSB_HARD_RO; | ||
325 | else | ||
326 | osb->osb_flags |= OCFS2_OSB_SOFT_RO; | ||
327 | spin_unlock(&osb->osb_lock); | ||
328 | } | ||
329 | |||
330 | static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb) | ||
331 | { | ||
332 | int ret; | ||
333 | |||
334 | spin_lock(&osb->osb_lock); | ||
335 | ret = osb->osb_flags & OCFS2_OSB_HARD_RO; | ||
336 | spin_unlock(&osb->osb_lock); | ||
337 | |||
338 | return ret; | ||
339 | } | ||
340 | |||
341 | static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) | ||
342 | { | ||
343 | int ret; | ||
344 | |||
345 | spin_lock(&osb->osb_lock); | ||
346 | ret = osb->osb_flags & OCFS2_OSB_SOFT_RO; | ||
347 | spin_unlock(&osb->osb_lock); | ||
348 | |||
349 | return ret; | ||
350 | } | ||
351 | |||
352 | #define OCFS2_IS_VALID_DINODE(ptr) \ | ||
353 | (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) | ||
354 | |||
355 | #define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \ | ||
356 | typeof(__di) ____di = (__di); \ | ||
357 | ocfs2_error((__sb), \ | ||
358 | "Dinode # %"MLFu64" has bad signature %.*s", \ | ||
359 | (____di)->i_blkno, 7, \ | ||
360 | (____di)->i_signature); \ | ||
361 | } while (0) | ||
362 | |||
363 | #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ | ||
364 | (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) | ||
365 | |||
366 | #define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \ | ||
367 | typeof(__eb) ____eb = (__eb); \ | ||
368 | ocfs2_error((__sb), \ | ||
369 | "Extent Block # %"MLFu64" has bad signature %.*s", \ | ||
370 | (____eb)->h_blkno, 7, \ | ||
371 | (____eb)->h_signature); \ | ||
372 | } while (0) | ||
373 | |||
374 | #define OCFS2_IS_VALID_GROUP_DESC(ptr) \ | ||
375 | (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) | ||
376 | |||
377 | #define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \ | ||
378 | typeof(__gd) ____gd = (__gd); \ | ||
379 | ocfs2_error((__sb), \ | ||
380 | "Group Descriptor # %"MLFu64" has bad signature %.*s", \ | ||
381 | (____gd)->bg_blkno, 7, \ | ||
382 | (____gd)->bg_signature); \ | ||
383 | } while (0) | ||
384 | |||
385 | static inline unsigned long ino_from_blkno(struct super_block *sb, | ||
386 | u64 blkno) | ||
387 | { | ||
388 | return (unsigned long)(blkno & (u64)ULONG_MAX); | ||
389 | } | ||
390 | |||
391 | static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, | ||
392 | u32 clusters) | ||
393 | { | ||
394 | int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits - | ||
395 | sb->s_blocksize_bits; | ||
396 | |||
397 | return (u64)clusters << c_to_b_bits; | ||
398 | } | ||
399 | |||
400 | static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, | ||
401 | u64 blocks) | ||
402 | { | ||
403 | int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - | ||
404 | sb->s_blocksize_bits; | ||
405 | |||
406 | return (u32)(blocks >> b_to_c_bits); | ||
407 | } | ||
408 | |||
409 | static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, | ||
410 | u64 bytes) | ||
411 | { | ||
412 | int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; | ||
413 | unsigned int clusters; | ||
414 | |||
415 | bytes += OCFS2_SB(sb)->s_clustersize - 1; | ||
416 | /* OCFS2 just cannot have enough clusters to overflow this */ | ||
417 | clusters = (unsigned int)(bytes >> cl_bits); | ||
418 | |||
419 | return clusters; | ||
420 | } | ||
421 | |||
422 | static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, | ||
423 | u64 bytes) | ||
424 | { | ||
425 | bytes += sb->s_blocksize - 1; | ||
426 | return bytes >> sb->s_blocksize_bits; | ||
427 | } | ||
428 | |||
429 | static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb, | ||
430 | u32 clusters) | ||
431 | { | ||
432 | return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; | ||
433 | } | ||
434 | |||
435 | static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, | ||
436 | u64 bytes) | ||
437 | { | ||
438 | int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; | ||
439 | unsigned int clusters; | ||
440 | |||
441 | clusters = ocfs2_clusters_for_bytes(sb, bytes); | ||
442 | return (u64)clusters << cl_bits; | ||
443 | } | ||
444 | |||
445 | static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb, | ||
446 | u64 bytes) | ||
447 | { | ||
448 | u64 blocks; | ||
449 | |||
450 | blocks = ocfs2_blocks_for_bytes(sb, bytes); | ||
451 | return blocks << sb->s_blocksize_bits; | ||
452 | } | ||
453 | |||
454 | static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) | ||
455 | { | ||
456 | return (unsigned long)((bytes + 511) >> 9); | ||
457 | } | ||
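As a concrete check of the shift arithmetic above, under an assumed geometry of 4K blocks (s_blocksize_bits == 12) and 32K clusters (s_clustersize_bits == 15), the cluster-to-block shift works out to 3:

	static void example_unit_conversions(struct super_block *sb)
	{
		u64 blocks   = ocfs2_clusters_to_blocks(sb, 5);  /* 5 << 3  == 40     */
		u32 clusters = ocfs2_blocks_to_clusters(sb, 40); /* 40 >> 3 == 5      */
		u64 bytes    = ocfs2_clusters_to_bytes(sb, 5);   /* 5 << 15 == 163840 */

		(void)blocks; (void)clusters; (void)bytes;
	}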
458 | |||
459 | #define ocfs2_set_bit ext2_set_bit | ||
460 | #define ocfs2_clear_bit ext2_clear_bit | ||
461 | #define ocfs2_test_bit ext2_test_bit | ||
462 | #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit | ||
463 | #endif /* OCFS2_H */ | ||
464 | |||
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h new file mode 100644 index 000000000000..dfb8a5bedfc8 --- /dev/null +++ b/fs/ocfs2/ocfs2_fs.h | |||
@@ -0,0 +1,638 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ocfs2_fs.h | ||
5 | * | ||
6 | * On-disk structures for OCFS2. | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License, version 2, as published by the Free Software Foundation. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
17 | * General Public License for more details. | ||
18 | * | ||
19 | * You should have received a copy of the GNU General Public | ||
20 | * License along with this program; if not, write to the | ||
21 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
22 | * Boston, MA 021110-1307, USA. | ||
23 | */ | ||
24 | |||
25 | #ifndef _OCFS2_FS_H | ||
26 | #define _OCFS2_FS_H | ||
27 | |||
28 | /* Version */ | ||
29 | #define OCFS2_MAJOR_REV_LEVEL 0 | ||
30 | #define OCFS2_MINOR_REV_LEVEL 90 | ||
31 | |||
32 | /* | ||
33 | * An OCFS2 volume starts this way: | ||
34 | * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS. | ||
35 | * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS. | ||
36 | * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock. | ||
37 | * | ||
38 | * All other structures are found from the superblock information. | ||
39 | * | ||
40 | * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. E.g., for a | ||
41 | * blocksize of 2K, it is 4096 bytes into the disk. | ||
42 | */ | ||
43 | #define OCFS2_SUPER_BLOCK_BLKNO 2 | ||
44 | |||
45 | /* | ||
46 | * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could | ||
47 | * grow if needed. | ||
48 | */ | ||
49 | #define OCFS2_MIN_CLUSTERSIZE 4096 | ||
50 | #define OCFS2_MAX_CLUSTERSIZE 1048576 | ||
51 | |||
52 | /* | ||
53 | * Blocks cannot be bigger than clusters, so the maximum blocksize is the | ||
54 | * minimum cluster size. | ||
55 | */ | ||
56 | #define OCFS2_MIN_BLOCKSIZE 512 | ||
57 | #define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE | ||
58 | |||
59 | /* Filesystem magic number */ | ||
60 | #define OCFS2_SUPER_MAGIC 0x7461636f | ||
61 | |||
62 | /* Object signatures */ | ||
63 | #define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" | ||
64 | #define OCFS2_INODE_SIGNATURE "INODE01" | ||
65 | #define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" | ||
66 | #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" | ||
67 | |||
68 | /* Compatibility flags */ | ||
69 | #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ | ||
70 | ( OCFS2_SB(sb)->s_feature_compat & (mask) ) | ||
71 | #define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \ | ||
72 | ( OCFS2_SB(sb)->s_feature_ro_compat & (mask) ) | ||
73 | #define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \ | ||
74 | ( OCFS2_SB(sb)->s_feature_incompat & (mask) ) | ||
75 | #define OCFS2_SET_COMPAT_FEATURE(sb,mask) \ | ||
76 | OCFS2_SB(sb)->s_feature_compat |= (mask) | ||
77 | #define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \ | ||
78 | OCFS2_SB(sb)->s_feature_ro_compat |= (mask) | ||
79 | #define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \ | ||
80 | OCFS2_SB(sb)->s_feature_incompat |= (mask) | ||
81 | #define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \ | ||
82 | OCFS2_SB(sb)->s_feature_compat &= ~(mask) | ||
83 | #define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ | ||
84 | OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask) | ||
85 | #define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \ | ||
86 | OCFS2_SB(sb)->s_feature_incompat &= ~(mask) | ||
87 | |||
88 | #define OCFS2_FEATURE_COMPAT_SUPP 0 | ||
89 | #define OCFS2_FEATURE_INCOMPAT_SUPP 0 | ||
90 | #define OCFS2_FEATURE_RO_COMPAT_SUPP 0 | ||
91 | |||
92 | /* | ||
93 | * Heartbeat-only devices are missing journals and other files. The | ||
94 | * filesystem driver can't load them, but the library can. Never put | ||
95 | * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*. | ||
96 | */ | ||
97 | #define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002 | ||
98 | |||
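The *_SUPP masks above are what a mount-time gate compares against; any unknown incompat bit (including OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV, per the warning above) must refuse the mount. A hedged sketch of the usual check (the real one lives in super.c, outside this hunk):

	static int example_feature_gate(struct ocfs2_super *osb)
	{
		/* Unknown incompat bits: refuse to mount at all. */
		if (osb->s_feature_incompat & ~OCFS2_FEATURE_INCOMPAT_SUPP)
			return -EINVAL;
		/* Unknown ro-compat bits: a read-only mount is still safe. */
		if (osb->s_feature_ro_compat & ~OCFS2_FEATURE_RO_COMPAT_SUPP)
			return -EROFS;
		return 0;
	}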
99 | |||
100 | /* | ||
101 | * Flags on ocfs2_dinode.i_flags | ||
102 | */ | ||
103 | #define OCFS2_VALID_FL (0x00000001) /* Inode is valid */ | ||
104 | #define OCFS2_UNUSED2_FL (0x00000002) | ||
105 | #define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */ | ||
106 | #define OCFS2_UNUSED3_FL (0x00000008) | ||
107 | /* System inode flags */ | ||
108 | #define OCFS2_SYSTEM_FL (0x00000010) /* System inode */ | ||
109 | #define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */ | ||
110 | #define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */ | ||
111 | #define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */ | ||
112 | #define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */ | ||
113 | #define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ | ||
114 | #define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ | ||
115 | #define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ | ||
116 | |||
117 | /* | ||
118 | * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) | ||
119 | */ | ||
120 | #define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ | ||
121 | |||
122 | /* | ||
123 | * superblock s_state flags | ||
124 | */ | ||
125 | #define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */ | ||
126 | |||
127 | /* Limit of space in ocfs2_dir_entry */ | ||
128 | #define OCFS2_MAX_FILENAME_LEN 255 | ||
129 | |||
130 | /* Maximum slots on an ocfs2 file system */ | ||
131 | #define OCFS2_MAX_SLOTS 255 | ||
132 | |||
133 | /* Slot map indicator for an empty slot */ | ||
134 | #define OCFS2_INVALID_SLOT -1 | ||
135 | |||
136 | #define OCFS2_VOL_UUID_LEN 16 | ||
137 | #define OCFS2_MAX_VOL_LABEL_LEN 64 | ||
138 | |||
139 | /* Journal limits (in bytes) */ | ||
140 | #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) | ||
141 | #define OCFS2_MAX_JOURNAL_SIZE (500 * 1024 * 1024) | ||
142 | |||
143 | struct ocfs2_system_inode_info { | ||
144 | char *si_name; | ||
145 | int si_iflags; | ||
146 | int si_mode; | ||
147 | }; | ||
148 | |||
149 | /* System file index */ | ||
150 | enum { | ||
151 | BAD_BLOCK_SYSTEM_INODE = 0, | ||
152 | GLOBAL_INODE_ALLOC_SYSTEM_INODE, | ||
153 | SLOT_MAP_SYSTEM_INODE, | ||
154 | #define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE | ||
155 | HEARTBEAT_SYSTEM_INODE, | ||
156 | GLOBAL_BITMAP_SYSTEM_INODE, | ||
157 | #define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE | ||
158 | ORPHAN_DIR_SYSTEM_INODE, | ||
159 | EXTENT_ALLOC_SYSTEM_INODE, | ||
160 | INODE_ALLOC_SYSTEM_INODE, | ||
161 | JOURNAL_SYSTEM_INODE, | ||
162 | LOCAL_ALLOC_SYSTEM_INODE, | ||
163 | TRUNCATE_LOG_SYSTEM_INODE, | ||
164 | NUM_SYSTEM_INODES | ||
165 | }; | ||
166 | |||
167 | static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { | ||
168 | /* Global system inodes (single copy) */ | ||
169 | /* The first two are only used from userspace mkfs/tunefs */ | ||
170 | [BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 }, | ||
171 | [GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, | ||
172 | |||
173 | /* These are used by the running filesystem */ | ||
174 | [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, | ||
175 | [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, | ||
176 | [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, | ||
177 | |||
178 | /* Slot-specific system inodes (one copy per slot) */ | ||
179 | [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, | ||
180 | [EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, | ||
181 | [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, | ||
182 | [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, | ||
183 | [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, | ||
184 | [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 } | ||
185 | }; | ||
186 | |||
187 | /* Parameter passed from mount.ocfs2 to module */ | ||
188 | #define OCFS2_HB_NONE "heartbeat=none" | ||
189 | #define OCFS2_HB_LOCAL "heartbeat=local" | ||
190 | |||
191 | /* | ||
192 | * OCFS2 directory file types. Only the low 3 bits are used. The | ||
193 | * other bits are reserved for now. | ||
194 | */ | ||
195 | #define OCFS2_FT_UNKNOWN 0 | ||
196 | #define OCFS2_FT_REG_FILE 1 | ||
197 | #define OCFS2_FT_DIR 2 | ||
198 | #define OCFS2_FT_CHRDEV 3 | ||
199 | #define OCFS2_FT_BLKDEV 4 | ||
200 | #define OCFS2_FT_FIFO 5 | ||
201 | #define OCFS2_FT_SOCK 6 | ||
202 | #define OCFS2_FT_SYMLINK 7 | ||
203 | |||
204 | #define OCFS2_FT_MAX 8 | ||
205 | |||
206 | /* | ||
207 | * OCFS2_DIR_PAD defines directory entry boundaries | ||
208 | * | ||
209 | * NOTE: It must be a multiple of 4 | ||
210 | */ | ||
211 | #define OCFS2_DIR_PAD 4 | ||
212 | #define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1) | ||
213 | #define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name) | ||
214 | #define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ | ||
215 | OCFS2_DIR_ROUND) & \ | ||
216 | ~OCFS2_DIR_ROUND) | ||
217 | |||
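A worked instance of the rounding above: the fixed header before name is OCFS2_DIR_MEMBER_LEN = 8 + 2 + 1 + 1 = 12 bytes, so OCFS2_DIR_REC_LEN(1) = (1 + 12 + 3) & ~3 = 16 and OCFS2_DIR_REC_LEN(5) = (5 + 12 + 3) & ~3 = 20; every entry therefore starts on a 4-byte boundary.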
218 | #define OCFS2_LINK_MAX 32000 | ||
219 | |||
220 | #define S_SHIFT 12 | ||
221 | static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { | ||
222 | [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE, | ||
223 | [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR, | ||
224 | [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV, | ||
225 | [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV, | ||
226 | [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO, | ||
227 | [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK, | ||
228 | [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK, | ||
229 | }; | ||
230 | |||
231 | |||
232 | /* | ||
233 | * Convenience casts | ||
234 | */ | ||
235 | #define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) | ||
236 | |||
237 | /* | ||
238 | * On disk extent record for OCFS2 | ||
239 | * It describes a range of clusters on disk. | ||
240 | */ | ||
241 | struct ocfs2_extent_rec { | ||
242 | /*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ | ||
243 | __le32 e_clusters; /* Clusters covered by this extent */ | ||
244 | __le64 e_blkno; /* Physical disk offset, in blocks */ | ||
245 | /*10*/ | ||
246 | }; | ||
247 | |||
248 | struct ocfs2_chain_rec { | ||
249 | __le32 c_free; /* Number of free bits in this chain. */ | ||
250 | __le32 c_total; /* Number of total bits in this chain */ | ||
251 | __le64 c_blkno; /* Physical disk offset (blocks) of 1st group */ | ||
252 | }; | ||
253 | |||
254 | struct ocfs2_truncate_rec { | ||
255 | __le32 t_start; /* 1st cluster in this log */ | ||
256 | __le32 t_clusters; /* Number of total clusters covered */ | ||
257 | }; | ||
258 | |||
259 | /* | ||
260 | * On disk extent list for OCFS2 (node in the tree). Note that this | ||
261 | * is contained inside ocfs2_dinode or ocfs2_extent_block, so the | ||
262 | * offsets are relative to ocfs2_dinode.id2.i_list or | ||
263 | * ocfs2_extent_block.h_list, respectively. | ||
264 | */ | ||
265 | struct ocfs2_extent_list { | ||
266 | /*00*/ __le16 l_tree_depth; /* Extent tree depth from this | ||
267 | point. 0 means data extents | ||
268 | hang directly off this | ||
269 | header (a leaf) */ | ||
270 | __le16 l_count; /* Number of extent records */ | ||
271 | __le16 l_next_free_rec; /* Next unused extent slot */ | ||
272 | __le16 l_reserved1; | ||
273 | __le64 l_reserved2; /* Pad to | ||
274 | sizeof(ocfs2_extent_rec) */ | ||
275 | /*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */ | ||
276 | }; | ||
277 | |||
278 | /* | ||
279 | * On disk allocation chain list for OCFS2. Note that this is | ||
280 | * contained inside ocfs2_dinode, so the offsets are relative to | ||
281 | * ocfs2_dinode.id2.i_chain. | ||
282 | */ | ||
283 | struct ocfs2_chain_list { | ||
284 | /*00*/ __le16 cl_cpg; /* Clusters per Block Group */ | ||
285 | __le16 cl_bpc; /* Bits per cluster */ | ||
286 | __le16 cl_count; /* Total chains in this list */ | ||
287 | __le16 cl_next_free_rec; /* Next unused chain slot */ | ||
288 | __le64 cl_reserved1; | ||
289 | /*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */ | ||
290 | }; | ||
291 | |||
292 | /* | ||
293 | * On disk deallocation log for OCFS2. Note that this is | ||
294 | * contained inside ocfs2_dinode, so the offsets are relative to | ||
295 | * ocfs2_dinode.id2.i_dealloc. | ||
296 | */ | ||
297 | struct ocfs2_truncate_log { | ||
298 | /*00*/ __le16 tl_count; /* Total records in this log */ | ||
299 | __le16 tl_used; /* Number of records in use */ | ||
300 | __le32 tl_reserved1; | ||
301 | /*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */ | ||
302 | }; | ||
303 | |||
304 | /* | ||
305 | * On disk extent block (indirect block) for OCFS2 | ||
306 | */ | ||
307 | struct ocfs2_extent_block | ||
308 | { | ||
309 | /*00*/ __u8 h_signature[8]; /* Signature for verification */ | ||
310 | __le64 h_reserved1; | ||
311 | /*10*/ __le16 h_suballoc_slot; /* Slot suballocator this | ||
312 | extent_header belongs to */ | ||
313 | __le16 h_suballoc_bit; /* Bit offset in suballocator | ||
314 | block group */ | ||
315 | __le32 h_fs_generation; /* Must match super block */ | ||
316 | __le64 h_blkno; /* Offset on disk, in blocks */ | ||
317 | /*20*/ __le64 h_reserved3; | ||
318 | __le64 h_next_leaf_blk; /* Offset on disk, in blocks, | ||
319 | of next leaf header pointing | ||
320 | to data */ | ||
321 | /*30*/ struct ocfs2_extent_list h_list; /* Extent record list */ | ||
322 | /* Actual on-disk size is one block */ | ||
323 | }; | ||
324 | |||
325 | /* | ||
326 | * On disk superblock for OCFS2 | ||
327 | * Note that it is contained inside an ocfs2_dinode, so all offsets | ||
328 | * are relative to the start of ocfs2_dinode.id2. | ||
329 | */ | ||
330 | struct ocfs2_super_block { | ||
331 | /*00*/ __le16 s_major_rev_level; | ||
332 | __le16 s_minor_rev_level; | ||
333 | __le16 s_mnt_count; | ||
334 | __le16 s_max_mnt_count; | ||
335 | __le16 s_state; /* File system state */ | ||
336 | __le16 s_errors; /* Behaviour when detecting errors */ | ||
337 | __le32 s_checkinterval; /* Max time between checks */ | ||
338 | /*10*/ __le64 s_lastcheck; /* Time of last check */ | ||
339 | __le32 s_creator_os; /* OS */ | ||
340 | __le32 s_feature_compat; /* Compatible feature set */ | ||
341 | /*20*/ __le32 s_feature_incompat; /* Incompatible feature set */ | ||
342 | __le32 s_feature_ro_compat; /* Readonly-compatible feature set */ | ||
343 | __le64 s_root_blkno; /* Offset, in blocks, of root directory | ||
344 | dinode */ | ||
345 | /*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system | ||
346 | directory dinode */ | ||
347 | __le32 s_blocksize_bits; /* Blocksize for this fs */ | ||
348 | __le32 s_clustersize_bits; /* Clustersize for this fs */ | ||
349 | /*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts | ||
350 | before tunefs required */ | ||
351 | __le16 s_reserved1; | ||
352 | __le32 s_reserved2; | ||
353 | __le64 s_first_cluster_group; /* Block offset of 1st cluster | ||
354 | * group header */ | ||
355 | /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ | ||
356 | /*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ | ||
357 | /*A0*/ | ||
358 | }; | ||
359 | |||
360 | /* | ||
361 | * Local allocation bitmap for OCFS2 slots | ||
362 | * Note that it exists inside an ocfs2_dinode, so all offsets are | ||
363 | * relative to the start of ocfs2_dinode.id2. | ||
364 | */ | ||
365 | struct ocfs2_local_alloc | ||
366 | { | ||
367 | /*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */ | ||
368 | __le16 la_size; /* Size of included bitmap, in bytes */ | ||
369 | __le16 la_reserved1; | ||
370 | __le64 la_reserved2; | ||
371 | /*10*/ __u8 la_bitmap[0]; | ||
372 | }; | ||
373 | |||
374 | /* | ||
375 | * On disk inode for OCFS2 | ||
376 | */ | ||
377 | struct ocfs2_dinode { | ||
378 | /*00*/ __u8 i_signature[8]; /* Signature for validation */ | ||
379 | __le32 i_generation; /* Generation number */ | ||
380 | __le16 i_suballoc_slot; /* Slot suballocator this inode | ||
381 | belongs to */ | ||
382 | __le16 i_suballoc_bit; /* Bit offset in suballocator | ||
383 | block group */ | ||
384 | /*10*/ __le32 i_reserved0; | ||
385 | __le32 i_clusters; /* Cluster count */ | ||
386 | __le32 i_uid; /* Owner UID */ | ||
387 | __le32 i_gid; /* Owning GID */ | ||
388 | /*20*/ __le64 i_size; /* Size in bytes */ | ||
389 | __le16 i_mode; /* File mode */ | ||
390 | __le16 i_links_count; /* Links count */ | ||
391 | __le32 i_flags; /* File flags */ | ||
392 | /*30*/ __le64 i_atime; /* Access time */ | ||
393 | __le64 i_ctime; /* Creation time */ | ||
394 | /*40*/ __le64 i_mtime; /* Modification time */ | ||
395 | __le64 i_dtime; /* Deletion time */ | ||
396 | /*50*/ __le64 i_blkno; /* Offset on disk, in blocks */ | ||
397 | __le64 i_last_eb_blk; /* Pointer to last extent | ||
398 | block */ | ||
399 | /*60*/ __le32 i_fs_generation; /* Generation per fs-instance */ | ||
400 | __le32 i_atime_nsec; | ||
401 | __le32 i_ctime_nsec; | ||
402 | __le32 i_mtime_nsec; | ||
403 | /*70*/ __le64 i_reserved1[9]; | ||
404 | /*B8*/ union { | ||
405 | __le64 i_pad1; /* Generic way to refer to this | ||
406 | 64bit union */ | ||
407 | struct { | ||
408 | __le64 i_rdev; /* Device number */ | ||
409 | } dev1; | ||
410 | struct { /* Info for bitmap system | ||
411 | inodes */ | ||
412 | __le32 i_used; /* Bits (ie, clusters) used */ | ||
413 | __le32 i_total; /* Total bits (clusters) | ||
414 | available */ | ||
415 | } bitmap1; | ||
416 | struct { /* Info for journal system | ||
417 | inodes */ | ||
418 | __le32 ij_flags; /* Mounted, version, etc. */ | ||
419 | __le32 ij_pad; | ||
420 | } journal1; | ||
421 | } id1; /* Inode type dependent 1 */ | ||
422 | /*C0*/ union { | ||
423 | struct ocfs2_super_block i_super; | ||
424 | struct ocfs2_local_alloc i_lab; | ||
425 | struct ocfs2_chain_list i_chain; | ||
426 | struct ocfs2_extent_list i_list; | ||
427 | struct ocfs2_truncate_log i_dealloc; | ||
428 | __u8 i_symlink[0]; | ||
429 | } id2; | ||
430 | /* Actual on-disk size is one block */ | ||
431 | }; | ||
432 | |||
433 | /* | ||
434 | * On-disk directory entry structure for OCFS2 | ||
435 | * | ||
436 | * Packed as this structure could be accessed unaligned on 64-bit platforms | ||
437 | */ | ||
438 | struct ocfs2_dir_entry { | ||
439 | /*00*/ __le64 inode; /* Inode number */ | ||
440 | __le16 rec_len; /* Directory entry length */ | ||
441 | __u8 name_len; /* Name length */ | ||
442 | __u8 file_type; | ||
443 | /*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */ | ||
444 | /* Actual on-disk length specified by rec_len */ | ||
445 | } __attribute__ ((packed)); | ||
446 | |||
447 | /* | ||
448 | * On disk allocator group structure for OCFS2 | ||
449 | */ | ||
450 | struct ocfs2_group_desc | ||
451 | { | ||
452 | /*00*/ __u8 bg_signature[8]; /* Signature for validation */ | ||
453 | __le16 bg_size; /* Size of included bitmap in | ||
454 | bytes. */ | ||
455 | __le16 bg_bits; /* Bits represented by this | ||
456 | group. */ | ||
457 | __le16 bg_free_bits_count; /* Free bits count */ | ||
458 | __le16 bg_chain; /* What chain I am in. */ | ||
459 | /*10*/ __le32 bg_generation; | ||
460 | __le32 bg_reserved1; | ||
461 | __le64 bg_next_group; /* Next group in my list, in | ||
462 | blocks */ | ||
463 | /*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in | ||
464 | blocks */ | ||
465 | __le64 bg_blkno; /* Offset on disk, in blocks */ | ||
466 | /*30*/ __le64 bg_reserved2[2]; | ||
467 | /*40*/ __u8 bg_bitmap[0]; | ||
468 | }; | ||
469 | |||
470 | #ifdef __KERNEL__ | ||
471 | static inline int ocfs2_fast_symlink_chars(struct super_block *sb) | ||
472 | { | ||
473 | return sb->s_blocksize - | ||
474 | offsetof(struct ocfs2_dinode, id2.i_symlink); | ||
475 | } | ||
476 | |||
477 | static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) | ||
478 | { | ||
479 | int size; | ||
480 | |||
481 | size = sb->s_blocksize - | ||
482 | offsetof(struct ocfs2_dinode, id2.i_list.l_recs); | ||
483 | |||
484 | return size / sizeof(struct ocfs2_extent_rec); | ||
485 | } | ||
486 | |||
487 | static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) | ||
488 | { | ||
489 | int size; | ||
490 | |||
491 | size = sb->s_blocksize - | ||
492 | offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); | ||
493 | |||
494 | return size / sizeof(struct ocfs2_chain_rec); | ||
495 | } | ||
496 | |||
497 | static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb) | ||
498 | { | ||
499 | int size; | ||
500 | |||
501 | size = sb->s_blocksize - | ||
502 | offsetof(struct ocfs2_extent_block, h_list.l_recs); | ||
503 | |||
504 | return size / sizeof(struct ocfs2_extent_rec); | ||
505 | } | ||
506 | |||
507 | static inline u16 ocfs2_local_alloc_size(struct super_block *sb) | ||
508 | { | ||
509 | u16 size; | ||
510 | |||
511 | size = sb->s_blocksize - | ||
512 | offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); | ||
513 | |||
514 | return size; | ||
515 | } | ||
516 | |||
517 | static inline int ocfs2_group_bitmap_size(struct super_block *sb) | ||
518 | { | ||
519 | int size; | ||
520 | |||
521 | size = sb->s_blocksize - | ||
522 | offsetof(struct ocfs2_group_desc, bg_bitmap); | ||
523 | |||
524 | return size; | ||
525 | } | ||
526 | |||
527 | static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb) | ||
528 | { | ||
529 | int size; | ||
530 | |||
531 | size = sb->s_blocksize - | ||
532 | offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); | ||
533 | |||
534 | return size / sizeof(struct ocfs2_truncate_rec); | ||
535 | } | ||
536 | #else | ||
537 | static inline int ocfs2_fast_symlink_chars(int blocksize) | ||
538 | { | ||
539 | return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); | ||
540 | } | ||
541 | |||
542 | static inline int ocfs2_extent_recs_per_inode(int blocksize) | ||
543 | { | ||
544 | int size; | ||
545 | |||
546 | size = blocksize - | ||
547 | offsetof(struct ocfs2_dinode, id2.i_list.l_recs); | ||
548 | |||
549 | return size / sizeof(struct ocfs2_extent_rec); | ||
550 | } | ||
551 | |||
552 | static inline int ocfs2_chain_recs_per_inode(int blocksize) | ||
553 | { | ||
554 | int size; | ||
555 | |||
556 | size = blocksize - | ||
557 | offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); | ||
558 | |||
559 | return size / sizeof(struct ocfs2_chain_rec); | ||
560 | } | ||
561 | |||
562 | static inline int ocfs2_extent_recs_per_eb(int blocksize) | ||
563 | { | ||
564 | int size; | ||
565 | |||
566 | size = blocksize - | ||
567 | offsetof(struct ocfs2_extent_block, h_list.l_recs); | ||
568 | |||
569 | return size / sizeof(struct ocfs2_extent_rec); | ||
570 | } | ||
571 | |||
572 | static inline int ocfs2_local_alloc_size(int blocksize) | ||
573 | { | ||
574 | int size; | ||
575 | |||
576 | size = blocksize - | ||
577 | offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); | ||
578 | |||
579 | return size; | ||
580 | } | ||
581 | |||
582 | static inline int ocfs2_group_bitmap_size(int blocksize) | ||
583 | { | ||
584 | int size; | ||
585 | |||
586 | size = blocksize - | ||
587 | offsetof(struct ocfs2_group_desc, bg_bitmap); | ||
588 | |||
589 | return size; | ||
590 | } | ||
591 | |||
592 | static inline int ocfs2_truncate_recs_per_inode(int blocksize) | ||
593 | { | ||
594 | int size; | ||
595 | |||
596 | size = blocksize - | ||
597 | offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); | ||
598 | |||
599 | return size / sizeof(struct ocfs2_truncate_rec); | ||
600 | } | ||
601 | #endif /* __KERNEL__ */ | ||
602 | |||
603 | |||
604 | static inline int ocfs2_system_inode_is_global(int type) | ||
605 | { | ||
606 | return ((type >= 0) && | ||
607 | (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)); | ||
608 | } | ||
609 | |||
610 | static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, | ||
611 | int type, int slot) | ||
612 | { | ||
613 | int chars; | ||
614 | |||
615 | /* | ||
616 | * Global system inodes can only have one copy. Everything | ||
617 | * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode | ||
618 | * list has a copy per slot. | ||
619 | */ | ||
620 | if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) | ||
621 | chars = snprintf(buf, len, "%s", | ||
622 | ocfs2_system_inodes[type].si_name); | ||
623 | else | ||
624 | chars = snprintf(buf, len, | ||
625 | ocfs2_system_inodes[type].si_name, | ||
626 | slot); | ||
627 | |||
628 | return chars; | ||
629 | } | ||
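A usage sketch for the helper above: with the "journal:%04d" template from ocfs2_system_inodes, type JOURNAL_SYSTEM_INODE and slot 3 expand to "journal:0003" (the buffer size here is an arbitrary illustrative choice):

	char buf[32];

	ocfs2_sprintf_system_inode_name(buf, sizeof(buf),
					JOURNAL_SYSTEM_INODE, 3);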
630 | |||
631 | static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, | ||
632 | umode_t mode) | ||
633 | { | ||
634 | de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; | ||
635 | } | ||
636 | |||
637 | #endif /* _OCFS2_FS_H */ | ||
638 | |||
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h new file mode 100644 index 000000000000..7dd9e1e705b0 --- /dev/null +++ b/fs/ocfs2/ocfs2_lockid.h | |||
@@ -0,0 +1,73 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ocfs2_lockid.h | ||
5 | * | ||
6 | * Defines OCFS2 lockid bits. | ||
7 | * | ||
8 | * Copyright (C) 2002, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_LOCKID_H | ||
27 | #define OCFS2_LOCKID_H | ||
28 | |||
29 | /* lock ids are made up in the following manner: | ||
30 | * name[0] --> type | ||
31 | * name[1-6] --> 6 pad characters, reserved for now | ||
32 | * name[7-22] --> block number, expressed in hex as 16 chars | ||
33 | * name[23-30] --> i_generation, expressed in hex 8 chars | ||
34 | * name[31] --> '\0' */ | ||
35 | #define OCFS2_LOCK_ID_MAX_LEN 32 | ||
36 | #define OCFS2_LOCK_ID_PAD "000000" | ||
37 | |||
38 | enum ocfs2_lock_type { | ||
39 | OCFS2_LOCK_TYPE_META = 0, | ||
40 | OCFS2_LOCK_TYPE_DATA, | ||
41 | OCFS2_LOCK_TYPE_SUPER, | ||
42 | OCFS2_LOCK_TYPE_RENAME, | ||
43 | OCFS2_LOCK_TYPE_RW, | ||
44 | OCFS2_NUM_LOCK_TYPES | ||
45 | }; | ||
46 | |||
47 | static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) | ||
48 | { | ||
49 | char c; | ||
50 | switch (type) { | ||
51 | case OCFS2_LOCK_TYPE_META: | ||
52 | c = 'M'; | ||
53 | break; | ||
54 | case OCFS2_LOCK_TYPE_DATA: | ||
55 | c = 'D'; | ||
56 | break; | ||
57 | case OCFS2_LOCK_TYPE_SUPER: | ||
58 | c = 'S'; | ||
59 | break; | ||
60 | case OCFS2_LOCK_TYPE_RENAME: | ||
61 | c = 'R'; | ||
62 | break; | ||
63 | case OCFS2_LOCK_TYPE_RW: | ||
64 | c = 'W'; | ||
65 | break; | ||
66 | default: | ||
67 | c = '\0'; | ||
68 | } | ||
69 | |||
70 | return c; | ||
71 | } | ||
72 | |||
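Putting the layout comment and the type characters together, a lock name can be assembled with a single snprintf(). A hedged sketch (the driver's real builder lives in dlmglue.c, not in this hunk):

	/* 1 type char + 6 pad + 16 hex blkno + 8 hex generation + NUL == 32 */
	static void example_build_lock_name(char name[OCFS2_LOCK_ID_MAX_LEN],
					    enum ocfs2_lock_type type,
					    u64 blkno, u32 generation)
	{
		snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
			 ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD,
			 (unsigned long long)blkno, generation);
	}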
73 | #endif /* OCFS2_LOCKID_H */ | ||
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c new file mode 100644 index 000000000000..871627961d6d --- /dev/null +++ b/fs/ocfs2/slot_map.c | |||
@@ -0,0 +1,303 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * slot_map.c | ||
5 | * | ||
6 | * | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/types.h> | ||
27 | #include <linux/slab.h> | ||
28 | #include <linux/highmem.h> | ||
29 | #include <linux/smp_lock.h> | ||
30 | |||
31 | #define MLOG_MASK_PREFIX ML_SUPER | ||
32 | #include <cluster/masklog.h> | ||
33 | |||
34 | #include "ocfs2.h" | ||
35 | |||
36 | #include "dlmglue.h" | ||
37 | #include "extent_map.h" | ||
38 | #include "heartbeat.h" | ||
39 | #include "inode.h" | ||
40 | #include "slot_map.h" | ||
41 | #include "super.h" | ||
42 | #include "sysfile.h" | ||
43 | |||
44 | #include "buffer_head_io.h" | ||
45 | |||
46 | static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | ||
47 | s16 global); | ||
48 | static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, | ||
49 | s16 slot_num, | ||
50 | s16 node_num); | ||
51 | |||
52 | /* Use the slot information we've collected to create a map of mounted | ||
53 | * nodes. Should be holding an EX on the super block. Assumes slot info is | ||
54 | * up to date. Note that we call this *after* we find a slot, so our | ||
55 | * own node should be set in the map too... */ | ||
56 | void ocfs2_populate_mounted_map(struct ocfs2_super *osb) | ||
57 | { | ||
58 | int i; | ||
59 | struct ocfs2_slot_info *si = osb->slot_info; | ||
60 | |||
61 | spin_lock(&si->si_lock); | ||
62 | |||
63 | for (i = 0; i < si->si_size; i++) | ||
64 | if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT) | ||
65 | ocfs2_node_map_set_bit(osb, &osb->mounted_map, | ||
66 | si->si_global_node_nums[i]); | ||
67 | |||
68 | spin_unlock(&si->si_lock); | ||
69 | } | ||
70 | |||
71 | /* copy the slot information on disk into our in-memory slot_info struct. */ | ||
72 | void ocfs2_update_slot_info(struct ocfs2_slot_info *si) | ||
73 | { | ||
74 | int i; | ||
75 | __le16 *disk_info; | ||
76 | |||
77 | /* we don't read the slot block here as ocfs2_super_lock | ||
78 | * should've made sure we have the most recent copy. */ | ||
79 | spin_lock(&si->si_lock); | ||
80 | disk_info = (__le16 *) si->si_bh->b_data; | ||
81 | |||
82 | for (i = 0; i < si->si_size; i++) | ||
83 | si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]); | ||
84 | |||
85 | spin_unlock(&si->si_lock); | ||
86 | } | ||
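On disk the slot map file is nothing more than an array of little-endian 16-bit node numbers indexed by slot, which is why the loop above is a plain byte-swap pass. A hypothetical layout with nodes 0 and 3 mounted in slots 0 and 2:

	/*
	 *   slot:  0       1       2       3      ...
	 *   data:  0x0000  0xffff  0x0003  0xffff ...
	 *
	 * 0xffff is OCFS2_INVALID_SLOT (-1) stored as a little-endian u16.
	 */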
87 | |||
88 | /* post our slot info into its destination bh and write it | ||
89 | * out. */ | ||
90 | int ocfs2_update_disk_slots(struct ocfs2_super *osb, | ||
91 | struct ocfs2_slot_info *si) | ||
92 | { | ||
93 | int status, i; | ||
94 | __le16 *disk_info = (__le16 *) si->si_bh->b_data; | ||
95 | |||
96 | spin_lock(&si->si_lock); | ||
97 | for (i = 0; i < si->si_size; i++) | ||
98 | disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]); | ||
99 | spin_unlock(&si->si_lock); | ||
100 | |||
101 | status = ocfs2_write_block(osb, si->si_bh, si->si_inode); | ||
102 | if (status < 0) | ||
103 | mlog_errno(status); | ||
104 | |||
105 | return status; | ||
106 | } | ||
107 | |||
108 | /* try to find global node in the slot info. Returns | ||
109 | * OCFS2_INVALID_SLOT if nothing is found. */ | ||
110 | static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | ||
111 | s16 global) | ||
112 | { | ||
113 | int i; | ||
114 | s16 ret = OCFS2_INVALID_SLOT; | ||
115 | |||
116 | for (i = 0; i < si->si_num_slots; i++) { | ||
117 | if (global == si->si_global_node_nums[i]) { | ||
118 | ret = (s16) i; | ||
119 | break; | ||
120 | } | ||
121 | } | ||
122 | return ret; | ||
123 | } | ||
124 | |||
125 | static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si) | ||
126 | { | ||
127 | int i; | ||
128 | s16 ret = OCFS2_INVALID_SLOT; | ||
129 | |||
130 | for (i = 0; i < si->si_num_slots; i++) { | ||
131 | if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { | ||
132 | ret = (s16) i; | ||
133 | break; | ||
134 | } | ||
135 | } | ||
136 | return ret; | ||
137 | } | ||
138 | |||
139 | s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | ||
140 | s16 global) | ||
141 | { | ||
142 | s16 ret; | ||
143 | |||
144 | spin_lock(&si->si_lock); | ||
145 | ret = __ocfs2_node_num_to_slot(si, global); | ||
146 | spin_unlock(&si->si_lock); | ||
147 | return ret; | ||
148 | } | ||
149 | |||
150 | static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, | ||
151 | s16 slot_num, | ||
152 | s16 node_num) | ||
153 | { | ||
154 | BUG_ON(slot_num == OCFS2_INVALID_SLOT); | ||
155 | BUG_ON(slot_num >= si->si_num_slots); | ||
156 | BUG_ON((node_num != O2NM_INVALID_NODE_NUM) && | ||
157 | (node_num >= O2NM_MAX_NODES)); | ||
158 | |||
159 | si->si_global_node_nums[slot_num] = node_num; | ||
160 | } | ||
161 | |||
162 | void ocfs2_clear_slot(struct ocfs2_slot_info *si, | ||
163 | s16 slot_num) | ||
164 | { | ||
165 | spin_lock(&si->si_lock); | ||
166 | __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT); | ||
167 | spin_unlock(&si->si_lock); | ||
168 | } | ||
169 | |||
170 | int ocfs2_init_slot_info(struct ocfs2_super *osb) | ||
171 | { | ||
172 | int status, i; | ||
173 | u64 blkno; | ||
174 | struct inode *inode = NULL; | ||
175 | struct buffer_head *bh = NULL; | ||
176 | struct ocfs2_slot_info *si; | ||
177 | |||
178 | si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL); | ||
179 | if (!si) { | ||
180 | status = -ENOMEM; | ||
181 | mlog_errno(status); | ||
182 | goto bail; | ||
183 | } | ||
184 | |||
185 | spin_lock_init(&si->si_lock); | ||
186 | si->si_num_slots = osb->max_slots; | ||
187 | si->si_size = OCFS2_MAX_SLOTS; | ||
188 | |||
189 | for(i = 0; i < si->si_num_slots; i++) | ||
190 | si->si_global_node_nums[i] = OCFS2_INVALID_SLOT; | ||
191 | |||
192 | inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, | ||
193 | OCFS2_INVALID_SLOT); | ||
194 | if (!inode) { | ||
195 | status = -EINVAL; | ||
196 | mlog_errno(status); | ||
197 | goto bail; | ||
198 | } | ||
199 | si->si_inode = inode; /* si owns this ref now; the error path iputs it */ | ||
200 | status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); | ||
201 | if (status < 0) { | ||
202 | mlog_errno(status); | ||
203 | goto bail; | ||
204 | } | ||
205 | |||
206 | status = ocfs2_read_block(osb, blkno, &bh, 0, inode); | ||
207 | if (status < 0) { | ||
208 | mlog_errno(status); | ||
209 | goto bail; | ||
210 | } | ||
211 | |||
212 | si->si_bh = bh; | ||
213 | osb->slot_info = si; | ||
214 | |||
215 | bail: | ||
216 | if (status < 0 && si) | ||
217 | ocfs2_free_slot_info(si); | ||
218 | |||
219 | return status; | ||
220 | } | ||
221 | |||
222 | void ocfs2_free_slot_info(struct ocfs2_slot_info *si) | ||
223 | { | ||
224 | if (si->si_inode) | ||
225 | iput(si->si_inode); | ||
226 | if (si->si_bh) | ||
227 | brelse(si->si_bh); | ||
228 | kfree(si); | ||
229 | } | ||
230 | |||
231 | int ocfs2_find_slot(struct ocfs2_super *osb) | ||
232 | { | ||
233 | int status; | ||
234 | s16 slot; | ||
235 | struct ocfs2_slot_info *si; | ||
236 | |||
237 | mlog_entry_void(); | ||
238 | |||
239 | si = osb->slot_info; | ||
240 | |||
241 | ocfs2_update_slot_info(si); | ||
242 | |||
243 | spin_lock(&si->si_lock); | ||
244 | /* search for ourselves first and take the slot if it already | ||
245 | * exists. Perhaps we need to mark this in a variable for our | ||
246 | * own journal recovery? Possibly not, though we certainly | ||
247 | * need to warn the user */ | ||
248 | slot = __ocfs2_node_num_to_slot(si, osb->node_num); | ||
249 | if (slot == OCFS2_INVALID_SLOT) { | ||
250 | /* if no slot yet, then just take 1st available | ||
251 | * one. */ | ||
252 | slot = __ocfs2_find_empty_slot(si); | ||
253 | if (slot == OCFS2_INVALID_SLOT) { | ||
254 | spin_unlock(&si->si_lock); | ||
255 | mlog(ML_ERROR, "no free slots available!\n"); | ||
256 | status = -EINVAL; | ||
257 | goto bail; | ||
258 | } | ||
259 | } else | ||
260 | mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", | ||
261 | slot); | ||
262 | |||
263 | __ocfs2_fill_slot(si, slot, osb->node_num); | ||
264 | osb->slot_num = slot; | ||
265 | spin_unlock(&si->si_lock); | ||
266 | |||
267 | mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num); | ||
268 | |||
269 | status = ocfs2_update_disk_slots(osb, si); | ||
270 | if (status < 0) | ||
271 | mlog_errno(status); | ||
272 | |||
273 | bail: | ||
274 | mlog_exit(status); | ||
275 | return status; | ||
276 | } | ||
277 | |||
278 | void ocfs2_put_slot(struct ocfs2_super *osb) | ||
279 | { | ||
280 | int status; | ||
281 | struct ocfs2_slot_info *si = osb->slot_info; | ||
282 | |||
283 | if (!si) | ||
284 | return; | ||
285 | |||
286 | ocfs2_update_slot_info(si); | ||
287 | |||
288 | spin_lock(&si->si_lock); | ||
289 | __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT); | ||
290 | osb->slot_num = OCFS2_INVALID_SLOT; | ||
291 | spin_unlock(&si->si_lock); | ||
292 | |||
293 | status = ocfs2_update_disk_slots(osb, si); | ||
294 | if (status < 0) { | ||
295 | mlog_errno(status); | ||
296 | goto bail; | ||
297 | } | ||
298 | |||
299 | bail: | ||
300 | osb->slot_info = NULL; | ||
301 | ocfs2_free_slot_info(si); | ||
302 | } | ||
303 | |||
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h new file mode 100644 index 000000000000..d8c8ceed031b --- /dev/null +++ b/fs/ocfs2/slot_map.h | |||
@@ -0,0 +1,66 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * slot_map.h | ||
5 | * | ||
6 | * In-memory structures and routines for the OCFS2 node slot map. | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | |||
27 | #ifndef SLOTMAP_H | ||
28 | #define SLOTMAP_H | ||
29 | |||
30 | struct ocfs2_slot_info { | ||
31 | spinlock_t si_lock; | ||
32 | |||
33 | struct inode *si_inode; | ||
34 | struct buffer_head *si_bh; | ||
35 | unsigned int si_num_slots; | ||
36 | unsigned int si_size; | ||
37 | s16 si_global_node_nums[OCFS2_MAX_SLOTS]; | ||
38 | }; | ||
39 | |||
40 | int ocfs2_init_slot_info(struct ocfs2_super *osb); | ||
41 | void ocfs2_free_slot_info(struct ocfs2_slot_info *si); | ||
42 | |||
43 | int ocfs2_find_slot(struct ocfs2_super *osb); | ||
44 | void ocfs2_put_slot(struct ocfs2_super *osb); | ||
45 | |||
46 | void ocfs2_update_slot_info(struct ocfs2_slot_info *si); | ||
47 | int ocfs2_update_disk_slots(struct ocfs2_super *osb, | ||
48 | struct ocfs2_slot_info *si); | ||
49 | |||
50 | s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | ||
51 | s16 global); | ||
52 | void ocfs2_clear_slot(struct ocfs2_slot_info *si, | ||
53 | s16 slot_num); | ||
54 | |||
55 | void ocfs2_populate_mounted_map(struct ocfs2_super *osb); | ||
56 | |||
57 | static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, | ||
58 | int slot_num) | ||
59 | { | ||
60 | BUG_ON(slot_num == OCFS2_INVALID_SLOT); | ||
61 | assert_spin_locked(&si->si_lock); | ||
62 | |||
63 | return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT; | ||
64 | } | ||
65 | |||
66 | #endif | ||
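
Read together, the declarations above imply a small lifecycle for the slot map: build the in-memory map at mount, claim a slot, and give it back at unmount. A hedged sketch of how a mount path would drive this API (the surrounding mount/unmount context is simplified, error handling is abbreviated, and the sketch_* names are invented):

    /* sketch: mount-time use of the slot map API declared above */
    static int sketch_mount_slots(struct ocfs2_super *osb)
    {
        int status;

        /* read the slot_map system file into osb->slot_info */
        status = ocfs2_init_slot_info(osb);
        if (status < 0)
            return status;

        /* claim our slot (reusing it if a crashed mount left it
         * behind) and push the updated map back to disk */
        status = ocfs2_find_slot(osb);
        if (status < 0) {
            ocfs2_free_slot_info(osb->slot_info);
            osb->slot_info = NULL;
        }
        return status;
    }

    /* sketch: unmount returns the slot and frees the in-memory map */
    static void sketch_umount_slots(struct ocfs2_super *osb)
    {
        ocfs2_put_slot(osb);    /* also frees osb->slot_info */
    }
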
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c new file mode 100644 index 000000000000..c46c164aefbb --- /dev/null +++ b/fs/ocfs2/suballoc.c | |||
@@ -0,0 +1,1651 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * suballoc.c | ||
5 | * | ||
6 | * metadata alloc and free | ||
7 | * Inspired by ext3 block groups. | ||
8 | * | ||
9 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public | ||
13 | * License as published by the Free Software Foundation; either | ||
14 | * version 2 of the License, or (at your option) any later version. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
19 | * General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public | ||
22 | * License along with this program; if not, write to the | ||
23 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
24 | * Boston, MA 02111-1307, USA. | ||
25 | */ | ||
26 | |||
27 | #include <linux/fs.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/highmem.h> | ||
31 | |||
32 | #define MLOG_MASK_PREFIX ML_DISK_ALLOC | ||
33 | #include <cluster/masklog.h> | ||
34 | |||
35 | #include "ocfs2.h" | ||
36 | |||
37 | #include "alloc.h" | ||
38 | #include "dlmglue.h" | ||
39 | #include "inode.h" | ||
40 | #include "journal.h" | ||
41 | #include "localalloc.h" | ||
42 | #include "suballoc.h" | ||
43 | #include "super.h" | ||
44 | #include "sysfile.h" | ||
45 | #include "uptodate.h" | ||
46 | |||
47 | #include "buffer_head_io.h" | ||
48 | |||
49 | static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); | ||
50 | static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); | ||
51 | static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); | ||
52 | static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle, | ||
53 | struct inode *alloc_inode, | ||
54 | struct buffer_head *bg_bh, | ||
55 | u64 group_blkno, | ||
56 | u16 my_chain, | ||
57 | struct ocfs2_chain_list *cl); | ||
58 | static int ocfs2_block_group_alloc(struct ocfs2_super *osb, | ||
59 | struct inode *alloc_inode, | ||
60 | struct buffer_head *bh); | ||
61 | |||
62 | static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, | ||
63 | struct ocfs2_alloc_context *ac); | ||
64 | |||
65 | static int ocfs2_cluster_group_search(struct inode *inode, | ||
66 | struct buffer_head *group_bh, | ||
67 | u32 bits_wanted, u32 min_bits, | ||
68 | u16 *bit_off, u16 *bits_found); | ||
69 | static int ocfs2_block_group_search(struct inode *inode, | ||
70 | struct buffer_head *group_bh, | ||
71 | u32 bits_wanted, u32 min_bits, | ||
72 | u16 *bit_off, u16 *bits_found); | ||
73 | static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, | ||
74 | u32 bits_wanted, | ||
75 | u32 min_bits, | ||
76 | u16 *bit_off, | ||
77 | unsigned int *num_bits, | ||
78 | u64 *bg_blkno); | ||
79 | static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, | ||
80 | struct ocfs2_alloc_context *ac, | ||
81 | u32 bits_wanted, | ||
82 | u32 min_bits, | ||
83 | u16 *bit_off, | ||
84 | unsigned int *num_bits, | ||
85 | u64 *bg_blkno); | ||
86 | static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, | ||
87 | int nr); | ||
88 | static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, | ||
89 | struct buffer_head *bg_bh, | ||
90 | unsigned int bits_wanted, | ||
91 | u16 *bit_off, | ||
92 | u16 *bits_found); | ||
93 | static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle, | ||
94 | struct inode *alloc_inode, | ||
95 | struct ocfs2_group_desc *bg, | ||
96 | struct buffer_head *group_bh, | ||
97 | unsigned int bit_off, | ||
98 | unsigned int num_bits); | ||
99 | static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle, | ||
100 | struct inode *alloc_inode, | ||
101 | struct ocfs2_group_desc *bg, | ||
102 | struct buffer_head *group_bh, | ||
103 | unsigned int bit_off, | ||
104 | unsigned int num_bits); | ||
105 | |||
106 | static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle, | ||
107 | struct inode *alloc_inode, | ||
108 | struct buffer_head *fe_bh, | ||
109 | struct buffer_head *bg_bh, | ||
110 | struct buffer_head *prev_bg_bh, | ||
111 | u16 chain); | ||
112 | static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, | ||
113 | u32 wanted); | ||
114 | static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle, | ||
115 | struct inode *alloc_inode, | ||
116 | struct buffer_head *alloc_bh, | ||
117 | unsigned int start_bit, | ||
118 | u64 bg_blkno, | ||
119 | unsigned int count); | ||
120 | static inline u64 ocfs2_which_suballoc_group(u64 block, | ||
121 | unsigned int bit); | ||
122 | static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, | ||
123 | u64 bg_blkno, | ||
124 | u16 bg_bit_off); | ||
125 | static inline u64 ocfs2_which_cluster_group(struct inode *inode, | ||
126 | u32 cluster); | ||
127 | static inline void ocfs2_block_to_cluster_group(struct inode *inode, | ||
128 | u64 data_blkno, | ||
129 | u64 *bg_blkno, | ||
130 | u16 *bg_bit_off); | ||
131 | |||
132 | void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) | ||
133 | { | ||
134 | if (ac->ac_inode) | ||
135 | iput(ac->ac_inode); | ||
136 | if (ac->ac_bh) | ||
137 | brelse(ac->ac_bh); | ||
138 | kfree(ac); | ||
139 | } | ||
140 | |||
141 | static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) | ||
142 | { | ||
143 | return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); | ||
144 | } | ||
145 | |||
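
The helper above is the basic geometry of a chain allocator group: cl_cpg clusters per group times cl_bpc bits per cluster. The u32 cast matters because the product can exceed 16 bits. A worked example with hypothetical geometries (real values come from the allocator dinode's chain list):

    #include <stdint.h>
    #include <stdio.h>

    struct sketch_chain_list {  /* host-endian model of the fields used */
        uint16_t cl_cpg;        /* clusters per group */
        uint16_t cl_bpc;        /* bits per cluster   */
    };

    static uint32_t sketch_bits_per_group(const struct sketch_chain_list *cl)
    {
        /* widen before multiplying: 32256 * 4 would not fit in a u16 */
        return (uint32_t) cl->cl_cpg * cl->cl_bpc;
    }

    int main(void)
    {
        struct sketch_chain_list a = { .cl_cpg = 2048,  .cl_bpc = 1 };
        struct sketch_chain_list b = { .cl_cpg = 32256, .cl_bpc = 4 };

        printf("%u\n", sketch_bits_per_group(&a));  /* 2048   */
        printf("%u\n", sketch_bits_per_group(&b));  /* 129024 */
        return 0;
    }
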
146 | static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle, | ||
147 | struct inode *alloc_inode, | ||
148 | struct buffer_head *bg_bh, | ||
149 | u64 group_blkno, | ||
150 | u16 my_chain, | ||
151 | struct ocfs2_chain_list *cl) | ||
152 | { | ||
153 | int status = 0; | ||
154 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; | ||
155 | struct super_block * sb = alloc_inode->i_sb; | ||
156 | |||
157 | mlog_entry_void(); | ||
158 | |||
159 | if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { | ||
160 | ocfs2_error(alloc_inode->i_sb, "group block (%"MLFu64") " | ||
161 | "!= b_blocknr (%llu)", group_blkno, | ||
162 | (unsigned long long) bg_bh->b_blocknr); | ||
163 | status = -EIO; | ||
164 | goto bail; | ||
165 | } | ||
166 | |||
167 | status = ocfs2_journal_access(handle, | ||
168 | alloc_inode, | ||
169 | bg_bh, | ||
170 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
171 | if (status < 0) { | ||
172 | mlog_errno(status); | ||
173 | goto bail; | ||
174 | } | ||
175 | |||
176 | memset(bg, 0, sb->s_blocksize); | ||
177 | strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); | ||
178 | bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); | ||
179 | bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); | ||
180 | bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); | ||
181 | bg->bg_chain = cpu_to_le16(my_chain); | ||
182 | bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; | ||
183 | bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); | ||
184 | bg->bg_blkno = cpu_to_le64(group_blkno); | ||
185 | /* set the 1st bit in the bitmap to account for the descriptor block */ | ||
186 | ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); | ||
187 | bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); | ||
188 | |||
189 | status = ocfs2_journal_dirty(handle, bg_bh); | ||
190 | if (status < 0) | ||
191 | mlog_errno(status); | ||
192 | |||
193 | /* There is no need to zero out or otherwise initialize the | ||
194 | * other blocks in a group - All valid FS metadata in a block | ||
195 | * group stores the superblock fs_generation value at | ||
196 | * allocation time. */ | ||
197 | |||
198 | bail: | ||
199 | mlog_exit(status); | ||
200 | return status; | ||
201 | } | ||
202 | |||
203 | static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) | ||
204 | { | ||
205 | u16 curr, best; | ||
206 | |||
207 | best = curr = 0; | ||
208 | while (curr < le16_to_cpu(cl->cl_count)) { | ||
209 | if (le32_to_cpu(cl->cl_recs[best].c_total) > | ||
210 | le32_to_cpu(cl->cl_recs[curr].c_total)) | ||
211 | best = curr; | ||
212 | curr++; | ||
213 | } | ||
214 | return best; | ||
215 | } | ||
216 | |||
217 | /* | ||
218 | * We expect the block group allocator to already be locked. | ||
219 | */ | ||
220 | static int ocfs2_block_group_alloc(struct ocfs2_super *osb, | ||
221 | struct inode *alloc_inode, | ||
222 | struct buffer_head *bh) | ||
223 | { | ||
224 | int status, credits; | ||
225 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; | ||
226 | struct ocfs2_chain_list *cl; | ||
227 | struct ocfs2_alloc_context *ac = NULL; | ||
228 | struct ocfs2_journal_handle *handle = NULL; | ||
229 | u32 bit_off, num_bits; | ||
230 | u16 alloc_rec; | ||
231 | u64 bg_blkno; | ||
232 | struct buffer_head *bg_bh = NULL; | ||
233 | struct ocfs2_group_desc *bg; | ||
234 | |||
235 | BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); | ||
236 | |||
237 | mlog_entry_void(); | ||
238 | |||
239 | handle = ocfs2_alloc_handle(osb); | ||
240 | if (!handle) { | ||
241 | status = -ENOMEM; | ||
242 | mlog_errno(status); | ||
243 | goto bail; | ||
244 | } | ||
245 | |||
246 | cl = &fe->id2.i_chain; | ||
247 | status = ocfs2_reserve_clusters(osb, | ||
248 | handle, | ||
249 | le16_to_cpu(cl->cl_cpg), | ||
250 | &ac); | ||
251 | if (status < 0) { | ||
252 | if (status != -ENOSPC) | ||
253 | mlog_errno(status); | ||
254 | goto bail; | ||
255 | } | ||
256 | |||
257 | credits = ocfs2_calc_group_alloc_credits(osb->sb, | ||
258 | le16_to_cpu(cl->cl_cpg)); | ||
259 | handle = ocfs2_start_trans(osb, handle, credits); | ||
260 | if (IS_ERR(handle)) { | ||
261 | status = PTR_ERR(handle); | ||
262 | handle = NULL; | ||
263 | mlog_errno(status); | ||
264 | goto bail; | ||
265 | } | ||
266 | |||
267 | status = ocfs2_claim_clusters(osb, | ||
268 | handle, | ||
269 | ac, | ||
270 | le16_to_cpu(cl->cl_cpg), | ||
271 | &bit_off, | ||
272 | &num_bits); | ||
273 | if (status < 0) { | ||
274 | if (status != -ENOSPC) | ||
275 | mlog_errno(status); | ||
276 | goto bail; | ||
277 | } | ||
278 | |||
279 | alloc_rec = ocfs2_find_smallest_chain(cl); | ||
280 | |||
281 | /* setup the group */ | ||
282 | bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); | ||
283 | mlog(0, "new descriptor, record %u, at block %"MLFu64"\n", | ||
284 | alloc_rec, bg_blkno); | ||
285 | |||
286 | bg_bh = sb_getblk(osb->sb, bg_blkno); | ||
287 | if (!bg_bh) { | ||
288 | status = -EIO; | ||
289 | mlog_errno(status); | ||
290 | goto bail; | ||
291 | } | ||
292 | ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh); | ||
293 | |||
294 | status = ocfs2_block_group_fill(handle, | ||
295 | alloc_inode, | ||
296 | bg_bh, | ||
297 | bg_blkno, | ||
298 | alloc_rec, | ||
299 | cl); | ||
300 | if (status < 0) { | ||
301 | mlog_errno(status); | ||
302 | goto bail; | ||
303 | } | ||
304 | |||
305 | bg = (struct ocfs2_group_desc *) bg_bh->b_data; | ||
306 | |||
307 | status = ocfs2_journal_access(handle, alloc_inode, | ||
308 | bh, OCFS2_JOURNAL_ACCESS_WRITE); | ||
309 | if (status < 0) { | ||
310 | mlog_errno(status); | ||
311 | goto bail; | ||
312 | } | ||
313 | |||
314 | le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, | ||
315 | le16_to_cpu(bg->bg_free_bits_count)); | ||
316 | le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); | ||
317 | cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); | ||
318 | if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) | ||
319 | le16_add_cpu(&cl->cl_next_free_rec, 1); | ||
320 | |||
321 | le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) - | ||
322 | le16_to_cpu(bg->bg_free_bits_count)); | ||
323 | le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); | ||
324 | le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); | ||
325 | |||
326 | status = ocfs2_journal_dirty(handle, bh); | ||
327 | if (status < 0) { | ||
328 | mlog_errno(status); | ||
329 | goto bail; | ||
330 | } | ||
331 | |||
332 | spin_lock(&OCFS2_I(alloc_inode)->ip_lock); | ||
333 | OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
334 | fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, | ||
335 | le32_to_cpu(fe->i_clusters))); | ||
336 | spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); | ||
337 | i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); | ||
338 | alloc_inode->i_blocks = | ||
339 | ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode)); | ||
340 | |||
341 | status = 0; | ||
342 | bail: | ||
343 | if (handle) | ||
344 | ocfs2_commit_trans(handle); | ||
345 | |||
346 | if (ac) | ||
347 | ocfs2_free_alloc_context(ac); | ||
348 | |||
349 | if (bg_bh) | ||
350 | brelse(bg_bh); | ||
351 | |||
352 | mlog_exit(status); | ||
353 | return status; | ||
354 | } | ||
355 | |||
356 | static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, | ||
357 | struct ocfs2_alloc_context *ac) | ||
358 | { | ||
359 | int status; | ||
360 | u32 bits_wanted = ac->ac_bits_wanted; | ||
361 | struct inode *alloc_inode = ac->ac_inode; | ||
362 | struct buffer_head *bh = NULL; | ||
363 | struct ocfs2_journal_handle *handle = ac->ac_handle; | ||
364 | struct ocfs2_dinode *fe; | ||
365 | u32 free_bits; | ||
366 | |||
367 | mlog_entry_void(); | ||
368 | |||
369 | BUG_ON(handle->flags & OCFS2_HANDLE_STARTED); | ||
370 | |||
371 | ocfs2_handle_add_inode(handle, alloc_inode); | ||
372 | status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1); | ||
373 | if (status < 0) { | ||
374 | mlog_errno(status); | ||
375 | goto bail; | ||
376 | } | ||
377 | |||
378 | fe = (struct ocfs2_dinode *) bh->b_data; | ||
379 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
380 | OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); | ||
381 | status = -EIO; | ||
382 | goto bail; | ||
383 | } | ||
384 | if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { | ||
385 | ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator " | ||
386 | "# %"MLFu64, le64_to_cpu(fe->i_blkno)); | ||
387 | status = -EIO; | ||
388 | goto bail; | ||
389 | } | ||
390 | |||
391 | free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) - | ||
392 | le32_to_cpu(fe->id1.bitmap1.i_used); | ||
393 | |||
394 | if (bits_wanted > free_bits) { | ||
395 | /* cluster bitmap never grows */ | ||
396 | if (ocfs2_is_cluster_bitmap(alloc_inode)) { | ||
397 | mlog(0, "Disk Full: wanted=%u, free_bits=%u\n", | ||
398 | bits_wanted, free_bits); | ||
399 | status = -ENOSPC; | ||
400 | goto bail; | ||
401 | } | ||
402 | |||
403 | status = ocfs2_block_group_alloc(osb, alloc_inode, bh); | ||
404 | if (status < 0) { | ||
405 | if (status != -ENOSPC) | ||
406 | mlog_errno(status); | ||
407 | goto bail; | ||
408 | } | ||
409 | atomic_inc(&osb->alloc_stats.bg_extends); | ||
410 | |||
411 | /* You should never ask for this much metadata */ | ||
412 | BUG_ON(bits_wanted > | ||
413 | (le32_to_cpu(fe->id1.bitmap1.i_total) | ||
414 | - le32_to_cpu(fe->id1.bitmap1.i_used))); | ||
415 | } | ||
416 | |||
417 | get_bh(bh); | ||
418 | ac->ac_bh = bh; | ||
419 | bail: | ||
420 | if (bh) | ||
421 | brelse(bh); | ||
422 | |||
423 | mlog_exit(status); | ||
424 | return status; | ||
425 | } | ||
426 | |||
427 | int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, | ||
428 | struct ocfs2_journal_handle *handle, | ||
429 | struct ocfs2_dinode *fe, | ||
430 | struct ocfs2_alloc_context **ac) | ||
431 | { | ||
432 | int status; | ||
433 | struct inode *alloc_inode = NULL; | ||
434 | |||
435 | *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); | ||
436 | if (!(*ac)) { | ||
437 | status = -ENOMEM; | ||
438 | mlog_errno(status); | ||
439 | goto bail; | ||
440 | } | ||
441 | |||
442 | (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); | ||
443 | (*ac)->ac_handle = handle; | ||
444 | (*ac)->ac_which = OCFS2_AC_USE_META; | ||
445 | |||
446 | #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS | ||
447 | alloc_inode = ocfs2_get_system_file_inode(osb, | ||
448 | EXTENT_ALLOC_SYSTEM_INODE, | ||
449 | 0); | ||
450 | #else | ||
451 | alloc_inode = ocfs2_get_system_file_inode(osb, | ||
452 | EXTENT_ALLOC_SYSTEM_INODE, | ||
453 | osb->slot_num); | ||
454 | #endif | ||
455 | if (!alloc_inode) { | ||
456 | status = -ENOMEM; | ||
457 | mlog_errno(status); | ||
458 | goto bail; | ||
459 | } | ||
460 | |||
461 | (*ac)->ac_inode = igrab(alloc_inode); | ||
462 | (*ac)->ac_group_search = ocfs2_block_group_search; | ||
463 | |||
464 | status = ocfs2_reserve_suballoc_bits(osb, (*ac)); | ||
465 | if (status < 0) { | ||
466 | if (status != -ENOSPC) | ||
467 | mlog_errno(status); | ||
468 | goto bail; | ||
469 | } | ||
470 | |||
471 | status = 0; | ||
472 | bail: | ||
473 | if ((status < 0) && *ac) { | ||
474 | ocfs2_free_alloc_context(*ac); | ||
475 | *ac = NULL; | ||
476 | } | ||
477 | |||
478 | if (alloc_inode) | ||
479 | iput(alloc_inode); | ||
480 | |||
481 | mlog_exit(status); | ||
482 | return status; | ||
483 | } | ||
484 | |||
485 | int ocfs2_reserve_new_inode(struct ocfs2_super *osb, | ||
486 | struct ocfs2_journal_handle *handle, | ||
487 | struct ocfs2_alloc_context **ac) | ||
488 | { | ||
489 | int status; | ||
490 | struct inode *alloc_inode = NULL; | ||
491 | |||
492 | *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); | ||
493 | if (!(*ac)) { | ||
494 | status = -ENOMEM; | ||
495 | mlog_errno(status); | ||
496 | goto bail; | ||
497 | } | ||
498 | |||
499 | (*ac)->ac_bits_wanted = 1; | ||
500 | (*ac)->ac_handle = handle; | ||
501 | (*ac)->ac_which = OCFS2_AC_USE_INODE; | ||
502 | |||
503 | alloc_inode = ocfs2_get_system_file_inode(osb, | ||
504 | INODE_ALLOC_SYSTEM_INODE, | ||
505 | osb->slot_num); | ||
506 | if (!alloc_inode) { | ||
507 | status = -ENOMEM; | ||
508 | mlog_errno(status); | ||
509 | goto bail; | ||
510 | } | ||
511 | |||
512 | (*ac)->ac_inode = igrab(alloc_inode); | ||
513 | (*ac)->ac_group_search = ocfs2_block_group_search; | ||
514 | |||
515 | status = ocfs2_reserve_suballoc_bits(osb, *ac); | ||
516 | if (status < 0) { | ||
517 | if (status != -ENOSPC) | ||
518 | mlog_errno(status); | ||
519 | goto bail; | ||
520 | } | ||
521 | |||
522 | status = 0; | ||
523 | bail: | ||
524 | if ((status < 0) && *ac) { | ||
525 | ocfs2_free_alloc_context(*ac); | ||
526 | *ac = NULL; | ||
527 | } | ||
528 | |||
529 | if (alloc_inode) | ||
530 | iput(alloc_inode); | ||
531 | |||
532 | mlog_exit(status); | ||
533 | return status; | ||
534 | } | ||
535 | |||
536 | /* The local alloc code has to do the same thing, so rather than | ||
537 | * duplicate the logic there, it shares this helper. */ | ||
538 | int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, | ||
539 | struct ocfs2_alloc_context *ac) | ||
540 | { | ||
541 | int status; | ||
542 | |||
543 | ac->ac_inode = ocfs2_get_system_file_inode(osb, | ||
544 | GLOBAL_BITMAP_SYSTEM_INODE, | ||
545 | OCFS2_INVALID_SLOT); | ||
546 | if (!ac->ac_inode) { | ||
547 | status = -EINVAL; | ||
548 | mlog(ML_ERROR, "Could not get bitmap inode!\n"); | ||
549 | goto bail; | ||
550 | } | ||
551 | ac->ac_which = OCFS2_AC_USE_MAIN; | ||
552 | ac->ac_group_search = ocfs2_cluster_group_search; | ||
553 | |||
554 | status = ocfs2_reserve_suballoc_bits(osb, ac); | ||
555 | if (status < 0 && status != -ENOSPC) | ||
556 | mlog_errno(status); | ||
557 | bail: | ||
558 | return status; | ||
559 | } | ||
560 | |||
561 | /* Callers don't need to care which bitmap (local alloc or main) to | ||
562 | * use so we figure it out for them, but unfortunately this clutters | ||
563 | * things a bit. */ | ||
564 | int ocfs2_reserve_clusters(struct ocfs2_super *osb, | ||
565 | struct ocfs2_journal_handle *handle, | ||
566 | u32 bits_wanted, | ||
567 | struct ocfs2_alloc_context **ac) | ||
568 | { | ||
569 | int status; | ||
570 | |||
571 | mlog_entry_void(); | ||
572 | |||
573 | BUG_ON(!handle); | ||
574 | |||
575 | *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); | ||
576 | if (!(*ac)) { | ||
577 | status = -ENOMEM; | ||
578 | mlog_errno(status); | ||
579 | goto bail; | ||
580 | } | ||
581 | |||
582 | (*ac)->ac_bits_wanted = bits_wanted; | ||
583 | (*ac)->ac_handle = handle; | ||
584 | |||
585 | status = -ENOSPC; | ||
586 | if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { | ||
587 | status = ocfs2_reserve_local_alloc_bits(osb, | ||
588 | handle, | ||
589 | bits_wanted, | ||
590 | *ac); | ||
591 | if ((status < 0) && (status != -ENOSPC)) { | ||
592 | mlog_errno(status); | ||
593 | goto bail; | ||
594 | } else if (status == -ENOSPC) { | ||
595 | /* ocfs2_reserve_local_alloc_bits() will return -ENOSPC with | ||
596 | * the local alloc inode still locked, so we | ||
597 | * can change this safely here. */ | ||
598 | mlog(0, "Disabling local alloc\n"); | ||
599 | /* We set it to OCFS2_LA_DISABLED so that umount | ||
600 | * can clean up what's left of the local | ||
601 | * allocation */ | ||
602 | osb->local_alloc_state = OCFS2_LA_DISABLED; | ||
603 | } | ||
604 | } | ||
605 | |||
606 | if (status == -ENOSPC) { | ||
607 | status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); | ||
608 | if (status < 0) { | ||
609 | if (status != -ENOSPC) | ||
610 | mlog_errno(status); | ||
611 | goto bail; | ||
612 | } | ||
613 | } | ||
614 | |||
615 | status = 0; | ||
616 | bail: | ||
617 | if ((status < 0) && *ac) { | ||
618 | ocfs2_free_alloc_context(*ac); | ||
619 | *ac = NULL; | ||
620 | } | ||
621 | |||
622 | mlog_exit(status); | ||
623 | return status; | ||
624 | } | ||
625 | |||
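
ocfs2_reserve_clusters() above is a two-tier fallback: small requests try the node-local allocator first and drop to the shared cluster bitmap on -ENOSPC, disabling local alloc so unmount can clean up the leftover window. The control flow, reduced to a standalone sketch (the 256-bit threshold and the try_* helpers are invented stand-ins for ocfs2_alloc_should_use_local() and the real reservation calls):

    #include <errno.h>
    #include <stdio.h>

    enum la_state { LA_ENABLED, LA_DISABLED };

    /* stand-ins for the local-alloc and global-bitmap reservations */
    static int try_local(unsigned int bits)  { return bits <= 128 ? 0 : -ENOSPC; }
    static int try_global(unsigned int bits) { (void) bits; return 0; }

    /* two-tier fallback, as in ocfs2_reserve_clusters(): the local
     * window first, then the shared cluster bitmap on -ENOSPC */
    static int sketch_reserve(unsigned int bits_wanted, enum la_state *la)
    {
        int status = -ENOSPC;

        if (*la == LA_ENABLED && bits_wanted <= 256) {  /* "should use local" */
            status = try_local(bits_wanted);
            if (status == -ENOSPC)
                *la = LA_DISABLED;      /* let umount reclaim the window */
            else if (status < 0)
                return status;          /* hard error */
        }
        if (status == -ENOSPC)
            status = try_global(bits_wanted);
        return status;
    }

    int main(void)
    {
        enum la_state la = LA_ENABLED;
        int s;

        s = sketch_reserve(64, &la);    /* satisfied locally */
        printf("small: status=%d la=%d\n", s, la);
        s = sketch_reserve(200, &la);   /* falls back and disables local alloc */
        printf("mid:   status=%d la=%d\n", s, la);
        return 0;
    }
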
626 | /* | ||
627 | * More or less lifted from ext3. I'll leave their description below: | ||
628 | * | ||
629 | * "For ext3 allocations, we must not reuse any blocks which are | ||
630 | * allocated in the bitmap buffer's "last committed data" copy. This | ||
631 | * prevents deletes from freeing up the page for reuse until we have | ||
632 | * committed the delete transaction. | ||
633 | * | ||
634 | * If we didn't do this, then deleting something and reallocating it as | ||
635 | * data would allow the old block to be overwritten before the | ||
636 | * transaction committed (because we force data to disk before commit). | ||
637 | * This would lead to corruption if we crashed between overwriting the | ||
638 | * data and committing the delete. | ||
639 | * | ||
640 | * @@@ We may want to make this allocation behaviour conditional on | ||
641 | * data-writes at some point, and disable it for metadata allocations or | ||
642 | * sync-data inodes." | ||
643 | * | ||
644 | * Note: OCFS2 already does this differently for metadata vs data | ||
645 | * allocations, as those bitmaps are separate and undo access is never | ||
646 | * called on a metadata group descriptor. | ||
647 | */ | ||
648 | static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, | ||
649 | int nr) | ||
650 | { | ||
651 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; | ||
652 | |||
653 | if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) | ||
654 | return 0; | ||
655 | if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data) | ||
656 | return 1; | ||
657 | |||
658 | bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; | ||
659 | return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); | ||
660 | } | ||
661 | |||
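
The rule borrowed from ext3 reduces to: a bit is allocatable only if it is clear in both the live bitmap and the copy the last committed transaction saw. A standalone model of that predicate over two plain bitmaps (no jbd here; the sketch_* helpers are invented for illustration):

    #include <stdint.h>
    #include <stdio.h>

    static int sketch_test_bit(int nr, const uint8_t *bitmap)
    {
        return (bitmap[nr / 8] >> (nr % 8)) & 1;
    }

    /* allocatable only if clear in the live bitmap AND in the copy the
     * last committed transaction saw -- otherwise a crash between reuse
     * and commit could clobber freed-but-uncommitted data */
    static int sketch_bit_allocatable(int nr, const uint8_t *live,
                                      const uint8_t *committed)
    {
        if (sketch_test_bit(nr, live))
            return 0;       /* currently in use */
        if (committed && sketch_test_bit(nr, committed))
            return 0;       /* freed, but the free isn't committed yet */
        return 1;
    }

    int main(void)
    {
        uint8_t live[1]      = { 0x01 };    /* bit 0 allocated           */
        uint8_t committed[1] = { 0x03 };    /* bits 0 and 1 as of commit */

        printf("bit 0: %d\n", sketch_bit_allocatable(0, live, committed)); /* 0 */
        printf("bit 1: %d\n", sketch_bit_allocatable(1, live, committed)); /* 0 */
        printf("bit 2: %d\n", sketch_bit_allocatable(2, live, committed)); /* 1 */
        return 0;
    }
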
662 | static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, | ||
663 | struct buffer_head *bg_bh, | ||
664 | unsigned int bits_wanted, | ||
665 | u16 *bit_off, | ||
666 | u16 *bits_found) | ||
667 | { | ||
668 | void *bitmap; | ||
669 | u16 best_offset, best_size; | ||
670 | int offset, start, found, status = 0; | ||
671 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; | ||
672 | |||
673 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | ||
674 | OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg); | ||
675 | return -EIO; | ||
676 | } | ||
677 | |||
678 | found = start = best_offset = best_size = 0; | ||
679 | bitmap = bg->bg_bitmap; | ||
680 | |||
681 | while((offset = ocfs2_find_next_zero_bit(bitmap, | ||
682 | le16_to_cpu(bg->bg_bits), | ||
683 | start)) != -1) { | ||
684 | if (offset == le16_to_cpu(bg->bg_bits)) | ||
685 | break; | ||
686 | |||
687 | if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { | ||
688 | /* We found a zero, but we can't use it as it | ||
689 | * hasn't been put to disk yet! */ | ||
690 | found = 0; | ||
691 | start = offset + 1; | ||
692 | } else if (offset == start) { | ||
693 | /* we found a zero */ | ||
694 | found++; | ||
695 | /* move start to the next bit to test */ | ||
696 | start++; | ||
697 | } else { | ||
698 | /* got a zero after some ones */ | ||
699 | found = 1; | ||
700 | start = offset + 1; | ||
701 | } | ||
702 | if (found > best_size) { | ||
703 | best_size = found; | ||
704 | best_offset = start - found; | ||
705 | } | ||
706 | /* we got everything we needed */ | ||
707 | if (found == bits_wanted) { | ||
708 | /* mlog(0, "Found it all!\n"); */ | ||
709 | break; | ||
710 | } | ||
711 | } | ||
712 | |||
713 | /* XXX: I think the first clause is equivalent to the second | ||
714 | * - jlbec */ | ||
715 | if (found == bits_wanted) { | ||
716 | *bit_off = start - found; | ||
717 | *bits_found = found; | ||
718 | } else if (best_size) { | ||
719 | *bit_off = best_offset; | ||
720 | *bits_found = best_size; | ||
721 | } else { | ||
722 | status = -ENOSPC; | ||
723 | /* No error log here -- see the comment above | ||
724 | * ocfs2_test_bg_bit_allocatable */ | ||
725 | } | ||
726 | |||
727 | return status; | ||
728 | } | ||
729 | |||
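
The loop above is a best-run search: it remembers the longest run of usable zero bits seen so far and stops early once a run reaches bits_wanted, so a shorter run can still be returned when nothing big enough exists. The same scan over a plain bit array, with the committed-data test collapsed into an ordinary bit test:

    #include <stdint.h>
    #include <stdio.h>

    /* find the longest run of zero bits, stopping early at 'wanted';
     * returns 0 and fills *off/*found on success, -1 if no zero bits */
    static int sketch_find_clear_bits(const uint8_t *bitmap, int nbits,
                                      int wanted, int *off, int *found)
    {
        int i, run = 0, best_off = 0, best = 0;

        for (i = 0; i < nbits; i++) {
            if ((bitmap[i / 8] >> (i % 8)) & 1) {
                run = 0;            /* run broken by a set bit */
                continue;
            }
            run++;
            if (run > best) {
                best = run;
                best_off = i - run + 1;
            }
            if (best == wanted)
                break;              /* got everything we needed */
        }
        if (!best)
            return -1;
        *off = best_off;
        *found = best;
        return 0;
    }

    int main(void)
    {
        uint8_t map[1] = { 0x8d };  /* bits, lsb first: 1,0,1,1,0,0,0,1 */
        int off, found;

        if (!sketch_find_clear_bits(map, 8, 3, &off, &found))
            printf("off=%d found=%d\n", off, found);    /* off=4 found=3 */
        return 0;
    }

This also illustrates the XXX note in the kernel version: when the early stop fires, the just-completed run and the best run coincide, so the two result clauses return the same thing.
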
730 | static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle, | ||
731 | struct inode *alloc_inode, | ||
732 | struct ocfs2_group_desc *bg, | ||
733 | struct buffer_head *group_bh, | ||
734 | unsigned int bit_off, | ||
735 | unsigned int num_bits) | ||
736 | { | ||
737 | int status; | ||
738 | void *bitmap = bg->bg_bitmap; | ||
739 | int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; | ||
740 | |||
741 | mlog_entry_void(); | ||
742 | |||
743 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | ||
744 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); | ||
745 | status = -EIO; | ||
746 | goto bail; | ||
747 | } | ||
748 | BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); | ||
749 | |||
750 | mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, | ||
751 | num_bits); | ||
752 | |||
753 | if (ocfs2_is_cluster_bitmap(alloc_inode)) | ||
754 | journal_type = OCFS2_JOURNAL_ACCESS_UNDO; | ||
755 | |||
756 | status = ocfs2_journal_access(handle, | ||
757 | alloc_inode, | ||
758 | group_bh, | ||
759 | journal_type); | ||
760 | if (status < 0) { | ||
761 | mlog_errno(status); | ||
762 | goto bail; | ||
763 | } | ||
764 | |||
765 | le16_add_cpu(&bg->bg_free_bits_count, -num_bits); | ||
766 | |||
767 | while(num_bits--) | ||
768 | ocfs2_set_bit(bit_off++, bitmap); | ||
769 | |||
770 | status = ocfs2_journal_dirty(handle, | ||
771 | group_bh); | ||
772 | if (status < 0) { | ||
773 | mlog_errno(status); | ||
774 | goto bail; | ||
775 | } | ||
776 | |||
777 | bail: | ||
778 | mlog_exit(status); | ||
779 | return status; | ||
780 | } | ||
781 | |||
782 | /* find the one with the most empty bits */ | ||
783 | static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl) | ||
784 | { | ||
785 | u16 curr, best; | ||
786 | |||
787 | BUG_ON(!cl->cl_next_free_rec); | ||
788 | |||
789 | best = curr = 0; | ||
790 | while (curr < le16_to_cpu(cl->cl_next_free_rec)) { | ||
791 | if (le32_to_cpu(cl->cl_recs[curr].c_free) > | ||
792 | le32_to_cpu(cl->cl_recs[best].c_free)) | ||
793 | best = curr; | ||
794 | curr++; | ||
795 | } | ||
796 | |||
797 | BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec)); | ||
798 | return best; | ||
799 | } | ||
800 | |||
801 | static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle, | ||
802 | struct inode *alloc_inode, | ||
803 | struct buffer_head *fe_bh, | ||
804 | struct buffer_head *bg_bh, | ||
805 | struct buffer_head *prev_bg_bh, | ||
806 | u16 chain) | ||
807 | { | ||
808 | int status; | ||
809 | /* there is a really tiny chance the journal calls could fail, | ||
810 | * but we wouldn't want inconsistent blocks in *any* case. */ | ||
811 | u64 fe_ptr, bg_ptr, prev_bg_ptr; | ||
812 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; | ||
813 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; | ||
814 | struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; | ||
815 | |||
816 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
817 | OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); | ||
818 | status = -EIO; | ||
819 | goto out; | ||
820 | } | ||
821 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | ||
822 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); | ||
823 | status = -EIO; | ||
824 | goto out; | ||
825 | } | ||
826 | if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) { | ||
827 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg); | ||
828 | status = -EIO; | ||
829 | goto out; | ||
830 | } | ||
831 | |||
832 | mlog(0, "In suballoc %"MLFu64", chain %u, move group %"MLFu64" to " | ||
833 | "top, prev = %"MLFu64"\n", | ||
834 | fe->i_blkno, chain, bg->bg_blkno, prev_bg->bg_blkno); | ||
835 | |||
836 | fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); | ||
837 | bg_ptr = le64_to_cpu(bg->bg_next_group); | ||
838 | prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); | ||
839 | |||
840 | status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh, | ||
841 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
842 | if (status < 0) { | ||
843 | mlog_errno(status); | ||
844 | goto out_rollback; | ||
845 | } | ||
846 | |||
847 | prev_bg->bg_next_group = bg->bg_next_group; | ||
848 | |||
849 | status = ocfs2_journal_dirty(handle, prev_bg_bh); | ||
850 | if (status < 0) { | ||
851 | mlog_errno(status); | ||
852 | goto out_rollback; | ||
853 | } | ||
854 | |||
855 | status = ocfs2_journal_access(handle, alloc_inode, bg_bh, | ||
856 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
857 | if (status < 0) { | ||
858 | mlog_errno(status); | ||
859 | goto out_rollback; | ||
860 | } | ||
861 | |||
862 | bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; | ||
863 | |||
864 | status = ocfs2_journal_dirty(handle, bg_bh); | ||
865 | if (status < 0) { | ||
866 | mlog_errno(status); | ||
867 | goto out_rollback; | ||
868 | } | ||
869 | |||
870 | status = ocfs2_journal_access(handle, alloc_inode, fe_bh, | ||
871 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
872 | if (status < 0) { | ||
873 | mlog_errno(status); | ||
874 | goto out_rollback; | ||
875 | } | ||
876 | |||
877 | fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; | ||
878 | |||
879 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
880 | if (status < 0) { | ||
881 | mlog_errno(status); | ||
882 | goto out_rollback; | ||
883 | } | ||
884 | |||
885 | status = 0; | ||
886 | out_rollback: | ||
887 | if (status < 0) { | ||
888 | fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); | ||
889 | bg->bg_next_group = cpu_to_le64(bg_ptr); | ||
890 | prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); | ||
891 | } | ||
892 | out: | ||
893 | mlog_exit(status); | ||
894 | return status; | ||
895 | } | ||
896 | |||
897 | static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, | ||
898 | u32 wanted) | ||
899 | { | ||
900 | return le16_to_cpu(bg->bg_free_bits_count) > wanted; | ||
901 | } | ||
902 | |||
903 | /* return 0 on success, -ENOSPC to keep searching and any other < 0 | ||
904 | * value on error. */ | ||
905 | static int ocfs2_cluster_group_search(struct inode *inode, | ||
906 | struct buffer_head *group_bh, | ||
907 | u32 bits_wanted, u32 min_bits, | ||
908 | u16 *bit_off, u16 *bits_found) | ||
909 | { | ||
910 | int search = -ENOSPC; | ||
911 | int ret; | ||
912 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; | ||
913 | u16 tmp_off, tmp_found; | ||
914 | |||
915 | BUG_ON(!ocfs2_is_cluster_bitmap(inode)); | ||
916 | |||
917 | if (bg->bg_free_bits_count) { | ||
918 | ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), | ||
919 | group_bh, bits_wanted, | ||
920 | &tmp_off, &tmp_found); | ||
921 | if (ret) | ||
922 | return ret; | ||
923 | |||
924 | /* ocfs2_block_group_find_clear_bits() might | ||
925 | * return success, but we still want to return | ||
926 | * -ENOSPC unless it found the minimum number | ||
927 | * of bits. */ | ||
928 | if (min_bits <= tmp_found) { | ||
929 | *bit_off = tmp_off; | ||
930 | *bits_found = tmp_found; | ||
931 | search = 0; /* success */ | ||
932 | } | ||
933 | } | ||
934 | |||
935 | return search; | ||
936 | } | ||
937 | |||
938 | static int ocfs2_block_group_search(struct inode *inode, | ||
939 | struct buffer_head *group_bh, | ||
940 | u32 bits_wanted, u32 min_bits, | ||
941 | u16 *bit_off, u16 *bits_found) | ||
942 | { | ||
943 | int ret = -ENOSPC; | ||
944 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; | ||
945 | |||
946 | BUG_ON(min_bits != 1); | ||
947 | BUG_ON(ocfs2_is_cluster_bitmap(inode)); | ||
948 | |||
949 | if (bg->bg_free_bits_count) | ||
950 | ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), | ||
951 | group_bh, bits_wanted, | ||
952 | bit_off, bits_found); | ||
953 | |||
954 | return ret; | ||
955 | } | ||
956 | |||
957 | static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, | ||
958 | u32 bits_wanted, | ||
959 | u32 min_bits, | ||
960 | u16 *bit_off, | ||
961 | unsigned int *num_bits, | ||
962 | u64 *bg_blkno) | ||
963 | { | ||
964 | int status; | ||
965 | u16 chain, tmp_bits; | ||
966 | u32 tmp_used; | ||
967 | u64 next_group; | ||
968 | struct ocfs2_journal_handle *handle = ac->ac_handle; | ||
969 | struct inode *alloc_inode = ac->ac_inode; | ||
970 | struct buffer_head *group_bh = NULL; | ||
971 | struct buffer_head *prev_group_bh = NULL; | ||
972 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; | ||
973 | struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; | ||
974 | struct ocfs2_group_desc *bg; | ||
975 | |||
976 | chain = ac->ac_chain; | ||
977 | mlog(0, "trying to alloc %u bits from chain %u, inode %"MLFu64"\n", | ||
978 | bits_wanted, chain, OCFS2_I(alloc_inode)->ip_blkno); | ||
979 | |||
980 | status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), | ||
981 | le64_to_cpu(cl->cl_recs[chain].c_blkno), | ||
982 | &group_bh, OCFS2_BH_CACHED, alloc_inode); | ||
983 | if (status < 0) { | ||
984 | mlog_errno(status); | ||
985 | goto bail; | ||
986 | } | ||
987 | bg = (struct ocfs2_group_desc *) group_bh->b_data; | ||
988 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | ||
989 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); | ||
990 | status = -EIO; | ||
991 | goto bail; | ||
992 | } | ||
993 | |||
994 | status = -ENOSPC; | ||
995 | /* for now, the chain search is a bit simplistic. We just use | ||
996 | * the 1st group with any empty bits. */ | ||
997 | while ((status = ac->ac_group_search(alloc_inode, group_bh, | ||
998 | bits_wanted, min_bits, bit_off, | ||
999 | &tmp_bits)) == -ENOSPC) { | ||
1000 | if (!bg->bg_next_group) | ||
1001 | break; | ||
1002 | |||
1003 | if (prev_group_bh) { | ||
1004 | brelse(prev_group_bh); | ||
1005 | prev_group_bh = NULL; | ||
1006 | } | ||
1007 | next_group = le64_to_cpu(bg->bg_next_group); | ||
1008 | prev_group_bh = group_bh; | ||
1009 | group_bh = NULL; | ||
1010 | status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), | ||
1011 | next_group, &group_bh, | ||
1012 | OCFS2_BH_CACHED, alloc_inode); | ||
1013 | if (status < 0) { | ||
1014 | mlog_errno(status); | ||
1015 | goto bail; | ||
1016 | } | ||
1017 | bg = (struct ocfs2_group_desc *) group_bh->b_data; | ||
1018 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | ||
1019 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); | ||
1020 | status = -EIO; | ||
1021 | goto bail; | ||
1022 | } | ||
1023 | } | ||
1024 | if (status < 0) { | ||
1025 | if (status != -ENOSPC) | ||
1026 | mlog_errno(status); | ||
1027 | goto bail; | ||
1028 | } | ||
1029 | |||
1030 | mlog(0, "alloc succeeds: we give %u bits from block group %"MLFu64"\n", | ||
1031 | tmp_bits, bg->bg_blkno); | ||
1032 | |||
1033 | *num_bits = tmp_bits; | ||
1034 | |||
1035 | BUG_ON(*num_bits == 0); | ||
1036 | |||
1037 | /* | ||
1038 | * Keep track of previous block descriptor read. When | ||
1039 | * we find a target, if we have read more than X | ||
1040 | * number of descriptors, and the target is reasonably | ||
1041 | * empty, relink it to the top of its chain. | ||
1042 | * | ||
1043 | * We've read 0 extra blocks and only send one more to | ||
1044 | * the transaction, yet the next search has a | ||
1045 | * much easier time. | ||
1046 | * | ||
1047 | * Do this *after* figuring out how many bits we're taking out | ||
1048 | * of our target group. | ||
1049 | */ | ||
1050 | if (ac->ac_allow_chain_relink && | ||
1051 | (prev_group_bh) && | ||
1052 | (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { | ||
1053 | status = ocfs2_relink_block_group(handle, alloc_inode, | ||
1054 | ac->ac_bh, group_bh, | ||
1055 | prev_group_bh, chain); | ||
1056 | if (status < 0) { | ||
1057 | mlog_errno(status); | ||
1058 | goto bail; | ||
1059 | } | ||
1060 | } | ||
1061 | |||
1062 | /* Ok, claim our bits now: set the info on dinode, chainlist | ||
1063 | * and then the group */ | ||
1064 | status = ocfs2_journal_access(handle, | ||
1065 | alloc_inode, | ||
1066 | ac->ac_bh, | ||
1067 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1068 | if (status < 0) { | ||
1069 | mlog_errno(status); | ||
1070 | goto bail; | ||
1071 | } | ||
1072 | |||
1073 | tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); | ||
1074 | fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used); | ||
1075 | le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits)); | ||
1076 | |||
1077 | status = ocfs2_journal_dirty(handle, | ||
1078 | ac->ac_bh); | ||
1079 | if (status < 0) { | ||
1080 | mlog_errno(status); | ||
1081 | goto bail; | ||
1082 | } | ||
1083 | |||
1084 | status = ocfs2_block_group_set_bits(handle, | ||
1085 | alloc_inode, | ||
1086 | bg, | ||
1087 | group_bh, | ||
1088 | *bit_off, | ||
1089 | *num_bits); | ||
1090 | if (status < 0) { | ||
1091 | mlog_errno(status); | ||
1092 | goto bail; | ||
1093 | } | ||
1094 | |||
1095 | mlog(0, "Allocated %u bits from suballocator %"MLFu64"\n", | ||
1096 | *num_bits, fe->i_blkno); | ||
1097 | |||
1098 | *bg_blkno = le64_to_cpu(bg->bg_blkno); | ||
1099 | bail: | ||
1100 | if (group_bh) | ||
1101 | brelse(group_bh); | ||
1102 | if (prev_group_bh) | ||
1103 | brelse(prev_group_bh); | ||
1104 | |||
1105 | mlog_exit(status); | ||
1106 | return status; | ||
1107 | } | ||
1108 | |||
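
The relink that ocfs2_search_chain() may perform is a move-to-front on the singly linked list of group descriptors hanging off a chain record: three pointer updates (predecessor, the group itself, then the chain head in the dinode), each journalled, with a rollback if any step fails. The splice itself, as an in-memory model:

    #include <stdio.h>

    struct sketch_group {
        unsigned long long blkno;
        struct sketch_group *next;  /* models bg_next_group */
    };

    /* move 'bg' (whose predecessor is 'prev') to the head of the chain;
     * 'head' models cl_recs[chain].c_blkno in the allocator dinode */
    static void sketch_relink(struct sketch_group **head,
                              struct sketch_group *prev,
                              struct sketch_group *bg)
    {
        prev->next = bg->next;  /* 1: unlink bg from its spot    */
        bg->next = *head;       /* 2: point bg at the old head   */
        *head = bg;             /* 3: make bg the new chain head */
    }

    int main(void)
    {
        struct sketch_group c = { 300, NULL };
        struct sketch_group b = { 200, &c };
        struct sketch_group a = { 100, &b };
        struct sketch_group *head = &a, *g;

        sketch_relink(&head, &a, &b);   /* promote group 200 */
        for (g = head; g; g = g->next)
            printf("%llu ", g->blkno);  /* 200 100 300 */
        printf("\n");
        return 0;
    }

The three updates happen in the same order as the journalled version above, which is why the kernel code can roll back by restoring the three saved pointers if any journal call fails.
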
1109 | /* will give out up to bits_wanted contiguous bits. */ | ||
1110 | static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, | ||
1111 | struct ocfs2_alloc_context *ac, | ||
1112 | u32 bits_wanted, | ||
1113 | u32 min_bits, | ||
1114 | u16 *bit_off, | ||
1115 | unsigned int *num_bits, | ||
1116 | u64 *bg_blkno) | ||
1117 | { | ||
1118 | int status; | ||
1119 | u16 victim, i; | ||
1120 | struct ocfs2_chain_list *cl; | ||
1121 | struct ocfs2_dinode *fe; | ||
1122 | |||
1123 | mlog_entry_void(); | ||
1124 | |||
1125 | BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); | ||
1126 | BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); | ||
1127 | BUG_ON(!ac->ac_bh); | ||
1128 | |||
1129 | fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; | ||
1130 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
1131 | OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe); | ||
1132 | status = -EIO; | ||
1133 | goto bail; | ||
1134 | } | ||
1135 | if (le32_to_cpu(fe->id1.bitmap1.i_used) >= | ||
1136 | le32_to_cpu(fe->id1.bitmap1.i_total)) { | ||
1137 | ocfs2_error(osb->sb, "Chain allocator dinode %"MLFu64" has %u" | ||
1138 | "used bits but only %u total.", | ||
1139 | le64_to_cpu(fe->i_blkno), | ||
1140 | le32_to_cpu(fe->id1.bitmap1.i_used), | ||
1141 | le32_to_cpu(fe->id1.bitmap1.i_total)); | ||
1142 | status = -EIO; | ||
1143 | goto bail; | ||
1144 | } | ||
1145 | |||
1146 | cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; | ||
1147 | |||
1148 | victim = ocfs2_find_victim_chain(cl); | ||
1149 | ac->ac_chain = victim; | ||
1150 | ac->ac_allow_chain_relink = 1; | ||
1151 | |||
1152 | status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off, | ||
1153 | num_bits, bg_blkno); | ||
1154 | if (!status) | ||
1155 | goto bail; | ||
1156 | if (status < 0 && status != -ENOSPC) { | ||
1157 | mlog_errno(status); | ||
1158 | goto bail; | ||
1159 | } | ||
1160 | |||
1161 | mlog(0, "Search of victim chain %u came up with nothing, " | ||
1162 | "trying all chains now.\n", victim); | ||
1163 | |||
1164 | /* If we didn't pick a good victim, then just default to | ||
1165 | * searching each chain in order. Don't allow chain relinking | ||
1166 | * because we only calculate enough journal credits for one | ||
1167 | * relink per alloc. */ | ||
1168 | ac->ac_allow_chain_relink = 0; | ||
1169 | for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { | ||
1170 | if (i == victim) | ||
1171 | continue; | ||
1172 | if (!cl->cl_recs[i].c_free) | ||
1173 | continue; | ||
1174 | |||
1175 | ac->ac_chain = i; | ||
1176 | status = ocfs2_search_chain(ac, bits_wanted, min_bits, | ||
1177 | bit_off, num_bits, | ||
1178 | bg_blkno); | ||
1179 | if (!status) | ||
1180 | break; | ||
1181 | if (status < 0 && status != -ENOSPC) { | ||
1182 | mlog_errno(status); | ||
1183 | goto bail; | ||
1184 | } | ||
1185 | } | ||
1186 | bail: | ||
1187 | |||
1188 | mlog_exit(status); | ||
1189 | return status; | ||
1190 | } | ||
1191 | |||
1192 | int ocfs2_claim_metadata(struct ocfs2_super *osb, | ||
1193 | struct ocfs2_journal_handle *handle, | ||
1194 | struct ocfs2_alloc_context *ac, | ||
1195 | u32 bits_wanted, | ||
1196 | u16 *suballoc_bit_start, | ||
1197 | unsigned int *num_bits, | ||
1198 | u64 *blkno_start) | ||
1199 | { | ||
1200 | int status; | ||
1201 | u64 bg_blkno; | ||
1202 | |||
1203 | BUG_ON(!ac); | ||
1204 | BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); | ||
1205 | BUG_ON(ac->ac_which != OCFS2_AC_USE_META); | ||
1206 | BUG_ON(ac->ac_handle != handle); | ||
1207 | |||
1208 | status = ocfs2_claim_suballoc_bits(osb, | ||
1209 | ac, | ||
1210 | bits_wanted, | ||
1211 | 1, | ||
1212 | suballoc_bit_start, | ||
1213 | num_bits, | ||
1214 | &bg_blkno); | ||
1215 | if (status < 0) { | ||
1216 | mlog_errno(status); | ||
1217 | goto bail; | ||
1218 | } | ||
1219 | atomic_inc(&osb->alloc_stats.bg_allocs); | ||
1220 | |||
1221 | *blkno_start = bg_blkno + (u64) *suballoc_bit_start; | ||
1222 | ac->ac_bits_given += (*num_bits); | ||
1223 | status = 0; | ||
1224 | bail: | ||
1225 | mlog_exit(status); | ||
1226 | return status; | ||
1227 | } | ||
1228 | |||
1229 | int ocfs2_claim_new_inode(struct ocfs2_super *osb, | ||
1230 | struct ocfs2_journal_handle *handle, | ||
1231 | struct ocfs2_alloc_context *ac, | ||
1232 | u16 *suballoc_bit, | ||
1233 | u64 *fe_blkno) | ||
1234 | { | ||
1235 | int status; | ||
1236 | unsigned int num_bits; | ||
1237 | u64 bg_blkno; | ||
1238 | |||
1239 | mlog_entry_void(); | ||
1240 | |||
1241 | BUG_ON(!ac); | ||
1242 | BUG_ON(ac->ac_bits_given != 0); | ||
1243 | BUG_ON(ac->ac_bits_wanted != 1); | ||
1244 | BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); | ||
1245 | BUG_ON(ac->ac_handle != handle); | ||
1246 | |||
1247 | status = ocfs2_claim_suballoc_bits(osb, | ||
1248 | ac, | ||
1249 | 1, | ||
1250 | 1, | ||
1251 | suballoc_bit, | ||
1252 | &num_bits, | ||
1253 | &bg_blkno); | ||
1254 | if (status < 0) { | ||
1255 | mlog_errno(status); | ||
1256 | goto bail; | ||
1257 | } | ||
1258 | atomic_inc(&osb->alloc_stats.bg_allocs); | ||
1259 | |||
1260 | BUG_ON(num_bits != 1); | ||
1261 | |||
1262 | *fe_blkno = bg_blkno + (u64) (*suballoc_bit); | ||
1263 | ac->ac_bits_given++; | ||
1264 | status = 0; | ||
1265 | bail: | ||
1266 | mlog_exit(status); | ||
1267 | return status; | ||
1268 | } | ||
1269 | |||
1270 | /* translate a group descriptor blkno and its bitmap offset into a | ||
1271 | * disk cluster offset. */ | ||
1272 | static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, | ||
1273 | u64 bg_blkno, | ||
1274 | u16 bg_bit_off) | ||
1275 | { | ||
1276 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1277 | u32 cluster = 0; | ||
1278 | |||
1279 | BUG_ON(!ocfs2_is_cluster_bitmap(inode)); | ||
1280 | |||
1281 | if (bg_blkno != osb->first_cluster_group_blkno) | ||
1282 | cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); | ||
1283 | cluster += (u32) bg_bit_off; | ||
1284 | return cluster; | ||
1285 | } | ||
1286 | |||
1287 | /* given a cluster offset, calculate which cluster group it belongs to | ||
1288 | * and return the block offset of that group. */ | ||
1289 | static inline u64 ocfs2_which_cluster_group(struct inode *inode, | ||
1290 | u32 cluster) | ||
1291 | { | ||
1292 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1293 | u32 group_no; | ||
1294 | |||
1295 | BUG_ON(!ocfs2_is_cluster_bitmap(inode)); | ||
1296 | |||
1297 | group_no = cluster / osb->bitmap_cpg; | ||
1298 | if (!group_no) | ||
1299 | return osb->first_cluster_group_blkno; | ||
1300 | return ocfs2_clusters_to_blocks(inode->i_sb, | ||
1301 | group_no * osb->bitmap_cpg); | ||
1302 | } | ||
1303 | |||
1304 | /* given the block number of a cluster start, calculate the cluster | ||
1305 | * group and descriptor bitmap offset it corresponds to. */ | ||
1306 | static inline void ocfs2_block_to_cluster_group(struct inode *inode, | ||
1307 | u64 data_blkno, | ||
1308 | u64 *bg_blkno, | ||
1309 | u16 *bg_bit_off) | ||
1310 | { | ||
1311 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1312 | u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno); | ||
1313 | |||
1314 | BUG_ON(!ocfs2_is_cluster_bitmap(inode)); | ||
1315 | |||
1316 | *bg_blkno = ocfs2_which_cluster_group(inode, | ||
1317 | data_cluster); | ||
1318 | |||
1319 | if (*bg_blkno == osb->first_cluster_group_blkno) | ||
1320 | *bg_bit_off = (u16) data_cluster; | ||
1321 | else | ||
1322 | *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb, | ||
1323 | data_blkno - *bg_blkno); | ||
1324 | } | ||
1325 | |||
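
The three helpers above define the geometry between global cluster offsets, cluster groups, and bits within a group descriptor, with group 0 special-cased because it starts at a fixed block rather than on a cluster-group boundary. A self-contained model of the round trip (the blocks-per-cluster ratio, bitmap_cpg, and the first-group block number are hypothetical example values):

    #include <stdint.h>
    #include <stdio.h>

    /* hypothetical geometry: 4 blocks per cluster, 2048 bits per
     * cluster-group bitmap, first group descriptor at block 8 */
    #define BLOCKS_PER_CLUSTER  4ULL
    #define BITMAP_CPG          2048U
    #define FIRST_GROUP_BLKNO   8ULL

    static uint64_t clusters_to_blocks(uint32_t c) { return (uint64_t) c * BLOCKS_PER_CLUSTER; }
    static uint32_t blocks_to_clusters(uint64_t b) { return (uint32_t) (b / BLOCKS_PER_CLUSTER); }

    /* which group descriptor covers a given cluster offset */
    static uint64_t which_cluster_group(uint32_t cluster)
    {
        uint32_t group_no = cluster / BITMAP_CPG;

        if (!group_no)
            return FIRST_GROUP_BLKNO;   /* group 0 lives at a fixed block */
        return clusters_to_blocks(group_no * BITMAP_CPG);
    }

    /* group block plus bit offset back to a global cluster offset */
    static uint32_t desc_bit_to_cluster(uint64_t bg_blkno, uint16_t bit)
    {
        uint32_t cluster = 0;

        if (bg_blkno != FIRST_GROUP_BLKNO)
            cluster = blocks_to_clusters(bg_blkno);
        return cluster + bit;
    }

    int main(void)
    {
        uint32_t cluster = 5000;    /* arbitrary cluster in group 2 */
        uint64_t bg = which_cluster_group(cluster);
        uint16_t bit = (uint16_t) (cluster - blocks_to_clusters(bg));

        /* group 2 starts at cluster 4096 -> block 16384; bit 904 */
        printf("bg=%llu bit=%u back=%u\n",
               (unsigned long long) bg, bit, desc_bit_to_cluster(bg, bit));
        return 0;
    }
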
1326 | /* | ||
1327 | * min_bits - the minimum contiguous chunk from this total allocation we | ||
1328 | * can handle. Set it to what we asked for originally for a fully | ||
1329 | * contiguous allocation, or to '1' to indicate we can deal with extents | ||
1330 | * of any size. | ||
1331 | */ | ||
1332 | int ocfs2_claim_clusters(struct ocfs2_super *osb, | ||
1333 | struct ocfs2_journal_handle *handle, | ||
1334 | struct ocfs2_alloc_context *ac, | ||
1335 | u32 min_clusters, | ||
1336 | u32 *cluster_start, | ||
1337 | u32 *num_clusters) | ||
1338 | { | ||
1339 | int status; | ||
1340 | unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; | ||
1341 | u64 bg_blkno; | ||
1342 | u16 bg_bit_off; | ||
1343 | |||
1344 | mlog_entry_void(); | ||
1345 | |||
1346 | BUG_ON(!ac); | ||
1347 | BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); | ||
1348 | |||
1349 | BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL | ||
1350 | && ac->ac_which != OCFS2_AC_USE_MAIN); | ||
1351 | BUG_ON(ac->ac_handle != handle); | ||
1352 | |||
1353 | if (ac->ac_which == OCFS2_AC_USE_LOCAL) { | ||
1354 | status = ocfs2_claim_local_alloc_bits(osb, | ||
1355 | handle, | ||
1356 | ac, | ||
1357 | bits_wanted, | ||
1358 | cluster_start, | ||
1359 | num_clusters); | ||
1360 | if (!status) | ||
1361 | atomic_inc(&osb->alloc_stats.local_data); | ||
1362 | } else { | ||
1363 | if (min_clusters > (osb->bitmap_cpg - 1)) { | ||
1364 | /* The only paths asking for contiguousness | ||
1365 | * should know about this already. */ | ||
1366 | mlog(ML_ERROR, "minimum allocation requested exceeds " | ||
1367 | "group bitmap size!"); | ||
1368 | status = -ENOSPC; | ||
1369 | goto bail; | ||
1370 | } | ||
1371 | /* clamp the current request down to a realistic size. */ | ||
1372 | if (bits_wanted > (osb->bitmap_cpg - 1)) | ||
1373 | bits_wanted = osb->bitmap_cpg - 1; | ||
1374 | |||
1375 | status = ocfs2_claim_suballoc_bits(osb, | ||
1376 | ac, | ||
1377 | bits_wanted, | ||
1378 | min_clusters, | ||
1379 | &bg_bit_off, | ||
1380 | num_clusters, | ||
1381 | &bg_blkno); | ||
1382 | if (!status) { | ||
1383 | *cluster_start = | ||
1384 | ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, | ||
1385 | bg_blkno, | ||
1386 | bg_bit_off); | ||
1387 | atomic_inc(&osb->alloc_stats.bitmap_data); | ||
1388 | } | ||
1389 | } | ||
1390 | if (status < 0) { | ||
1391 | if (status != -ENOSPC) | ||
1392 | mlog_errno(status); | ||
1393 | goto bail; | ||
1394 | } | ||
1395 | |||
1396 | ac->ac_bits_given += *num_clusters; | ||
1397 | |||
1398 | bail: | ||
1399 | mlog_exit(status); | ||
1400 | return status; | ||
1401 | } | ||
1402 | |||
1403 | static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle, | ||
1404 | struct inode *alloc_inode, | ||
1405 | struct ocfs2_group_desc *bg, | ||
1406 | struct buffer_head *group_bh, | ||
1407 | unsigned int bit_off, | ||
1408 | unsigned int num_bits) | ||
1409 | { | ||
1410 | int status; | ||
1411 | unsigned int tmp; | ||
1412 | int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; | ||
1413 | struct ocfs2_group_desc *undo_bg = NULL; | ||
1414 | |||
1415 | mlog_entry_void(); | ||
1416 | |||
1417 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | ||
1418 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); | ||
1419 | status = -EIO; | ||
1420 | goto bail; | ||
1421 | } | ||
1422 | |||
1423 | mlog(0, "off = %u, num = %u\n", bit_off, num_bits); | ||
1424 | |||
1425 | if (ocfs2_is_cluster_bitmap(alloc_inode)) | ||
1426 | journal_type = OCFS2_JOURNAL_ACCESS_UNDO; | ||
1427 | |||
1428 | status = ocfs2_journal_access(handle, alloc_inode, group_bh, | ||
1429 | journal_type); | ||
1430 | if (status < 0) { | ||
1431 | mlog_errno(status); | ||
1432 | goto bail; | ||
1433 | } | ||
1434 | |||
1435 | if (ocfs2_is_cluster_bitmap(alloc_inode)) | ||
1436 | undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data; | ||
1437 | |||
1438 | tmp = num_bits; | ||
1439 | while(tmp--) { | ||
1440 | ocfs2_clear_bit((bit_off + tmp), | ||
1441 | (unsigned long *) bg->bg_bitmap); | ||
1442 | if (ocfs2_is_cluster_bitmap(alloc_inode)) | ||
1443 | ocfs2_set_bit(bit_off + tmp, | ||
1444 | (unsigned long *) undo_bg->bg_bitmap); | ||
1445 | } | ||
1446 | le16_add_cpu(&bg->bg_free_bits_count, num_bits); | ||
1447 | |||
1448 | status = ocfs2_journal_dirty(handle, group_bh); | ||
1449 | if (status < 0) | ||
1450 | mlog_errno(status); | ||
1451 | bail: | ||
1452 | return status; | ||
1453 | } | ||
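For the global cluster bitmap the clear is journalled with undo access: each bit is cleared in the live bitmap but re-set in the journal's committed copy (b_committed_data, as in ext3's use of the same mechanism), so until the transaction commits the cluster still looks allocated to anything consulting committed state and cannot be reused and then lost by a crash. A toy userspace illustration of the two-copy idea (names hypothetical, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		unsigned long live = 0xffUL;      /* bg->bg_bitmap      */
		unsigned long committed = 0xffUL; /* undo_bg->bg_bitmap */
		int bit = 3;

		live &= ~(1UL << bit);   /* ocfs2_clear_bit(bit, live) */
		committed |= 1UL << bit; /* ocfs2_set_bit(bit, undo)   */

		/* prints live=0xf7 committed=0xff: freed for new
		 * allocators, still allocated as far as crash
		 * recovery is concerned */
		printf("live=%#lx committed=%#lx\n", live, committed);
		return 0;
	}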
1454 | |||
1455 | /* | ||
1456 | * expects the suballoc inode to already be locked. | ||
1457 | */ | ||
1458 | static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle, | ||
1459 | struct inode *alloc_inode, | ||
1460 | struct buffer_head *alloc_bh, | ||
1461 | unsigned int start_bit, | ||
1462 | u64 bg_blkno, | ||
1463 | unsigned int count) | ||
1464 | { | ||
1465 | int status = 0; | ||
1466 | u32 tmp_used; | ||
1467 | struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); | ||
1468 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; | ||
1469 | struct ocfs2_chain_list *cl = &fe->id2.i_chain; | ||
1470 | struct buffer_head *group_bh = NULL; | ||
1471 | struct ocfs2_group_desc *group; | ||
1472 | |||
1473 | mlog_entry_void(); | ||
1474 | |||
1475 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
1476 | OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); | ||
1477 | status = -EIO; | ||
1478 | goto bail; | ||
1479 | } | ||
1480 | BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); | ||
1481 | |||
1482 | mlog(0, "suballocator %"MLFu64": freeing %u bits from group %"MLFu64 | ||
1483 | ", starting at %u\n", | ||
1484 | OCFS2_I(alloc_inode)->ip_blkno, count, bg_blkno, | ||
1485 | start_bit); | ||
1486 | |||
1487 | status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, | ||
1488 | alloc_inode); | ||
1489 | if (status < 0) { | ||
1490 | mlog_errno(status); | ||
1491 | goto bail; | ||
1492 | } | ||
1493 | |||
1494 | group = (struct ocfs2_group_desc *) group_bh->b_data; | ||
1495 | if (!OCFS2_IS_VALID_GROUP_DESC(group)) { | ||
1496 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group); | ||
1497 | status = -EIO; | ||
1498 | goto bail; | ||
1499 | } | ||
1500 | BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); | ||
1501 | |||
1502 | status = ocfs2_block_group_clear_bits(handle, alloc_inode, | ||
1503 | group, group_bh, | ||
1504 | start_bit, count); | ||
1505 | if (status < 0) { | ||
1506 | mlog_errno(status); | ||
1507 | goto bail; | ||
1508 | } | ||
1509 | |||
1510 | status = ocfs2_journal_access(handle, alloc_inode, alloc_bh, | ||
1511 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1512 | if (status < 0) { | ||
1513 | mlog_errno(status); | ||
1514 | goto bail; | ||
1515 | } | ||
1516 | |||
1517 | le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free, | ||
1518 | count); | ||
1519 | tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); | ||
1520 | fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); | ||
1521 | |||
1522 | status = ocfs2_journal_dirty(handle, alloc_bh); | ||
1523 | if (status < 0) { | ||
1524 | mlog_errno(status); | ||
1525 | goto bail; | ||
1526 | } | ||
1527 | |||
1528 | bail: | ||
1529 | if (group_bh) | ||
1530 | brelse(group_bh); | ||
1531 | |||
1532 | mlog_exit(status); | ||
1533 | return status; | ||
1534 | } | ||
1535 | |||
1536 | static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) | ||
1537 | { | ||
1538 | u64 group = block - (u64) bit; | ||
1539 | |||
1540 | return group; | ||
1541 | } | ||
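In other words, an object's block number is its group descriptor's block plus its suballocation bit, so the descriptor is recovered by simple subtraction: for an illustrative inode at block 5021 holding suballoc bit 29, the group descriptor sits at block 4992 (5021 - 29).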
1542 | |||
1543 | int ocfs2_free_dinode(struct ocfs2_journal_handle *handle, | ||
1544 | struct inode *inode_alloc_inode, | ||
1545 | struct buffer_head *inode_alloc_bh, | ||
1546 | struct ocfs2_dinode *di) | ||
1547 | { | ||
1548 | u64 blk = le64_to_cpu(di->i_blkno); | ||
1549 | u16 bit = le16_to_cpu(di->i_suballoc_bit); | ||
1550 | u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); | ||
1551 | |||
1552 | return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, | ||
1553 | inode_alloc_bh, bit, bg_blkno, 1); | ||
1554 | } | ||
1555 | |||
1556 | int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle, | ||
1557 | struct inode *eb_alloc_inode, | ||
1558 | struct buffer_head *eb_alloc_bh, | ||
1559 | struct ocfs2_extent_block *eb) | ||
1560 | { | ||
1561 | u64 blk = le64_to_cpu(eb->h_blkno); | ||
1562 | u16 bit = le16_to_cpu(eb->h_suballoc_bit); | ||
1563 | u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); | ||
1564 | |||
1565 | return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh, | ||
1566 | bit, bg_blkno, 1); | ||
1567 | } | ||
1568 | |||
1569 | int ocfs2_free_clusters(struct ocfs2_journal_handle *handle, | ||
1570 | struct inode *bitmap_inode, | ||
1571 | struct buffer_head *bitmap_bh, | ||
1572 | u64 start_blk, | ||
1573 | unsigned int num_clusters) | ||
1574 | { | ||
1575 | int status; | ||
1576 | u16 bg_start_bit; | ||
1577 | u64 bg_blkno; | ||
1578 | struct ocfs2_dinode *fe; | ||
1579 | |||
1580 | /* You can't ever have a contiguous set of clusters | ||
1581 | * bigger than a block group bitmap so we never have to worry | ||
1582 | * about looping on them. */ | ||
1583 | |||
1584 | mlog_entry_void(); | ||
1585 | |||
1586 | /* This is expensive. We can safely remove it once this stuff has | ||
1587 | * gotten tested really well. */ | ||
1588 | BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); | ||
1589 | |||
1590 | fe = (struct ocfs2_dinode *) bitmap_bh->b_data; | ||
1591 | |||
1592 | ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, | ||
1593 | &bg_start_bit); | ||
1594 | |||
1595 | mlog(0, "want to free %u clusters starting at block %"MLFu64"\n", | ||
1596 | num_clusters, start_blk); | ||
1597 | mlog(0, "bg_blkno = %"MLFu64", bg_start_bit = %u\n", | ||
1598 | bg_blkno, bg_start_bit); | ||
1599 | |||
1600 | status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, | ||
1601 | bg_start_bit, bg_blkno, | ||
1602 | num_clusters); | ||
1603 | if (status < 0) | ||
1604 | mlog_errno(status); | ||
1605 | |||
1606 | mlog_exit(status); | ||
1607 | return status; | ||
1608 | } | ||
1609 | |||
1610 | static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) | ||
1611 | { | ||
1612 | printk("Block Group:\n"); | ||
1613 | printk("bg_signature: %s\n", bg->bg_signature); | ||
1614 | printk("bg_size: %u\n", bg->bg_size); | ||
1615 | printk("bg_bits: %u\n", bg->bg_bits); | ||
1616 | printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count); | ||
1617 | printk("bg_chain: %u\n", bg->bg_chain); | ||
1618 | printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation)); | ||
1619 | printk("bg_next_group: %"MLFu64"\n", bg->bg_next_group); | ||
1620 | printk("bg_parent_dinode: %"MLFu64"\n", bg->bg_parent_dinode); | ||
1621 | printk("bg_blkno: %"MLFu64"\n", bg->bg_blkno); | ||
1622 | } | ||
1623 | |||
1624 | static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) | ||
1625 | { | ||
1626 | int i; | ||
1627 | |||
1628 | printk("Suballoc Inode %"MLFu64":\n", fe->i_blkno); | ||
1629 | printk("i_signature: %s\n", fe->i_signature); | ||
1630 | printk("i_size: %"MLFu64"\n", fe->i_size); | ||
1631 | printk("i_clusters: %u\n", fe->i_clusters); | ||
1632 | printk("i_generation: %u\n", | ||
1633 | le32_to_cpu(fe->i_generation)); | ||
1634 | printk("id1.bitmap1.i_used: %u\n", | ||
1635 | le32_to_cpu(fe->id1.bitmap1.i_used)); | ||
1636 | printk("id1.bitmap1.i_total: %u\n", | ||
1637 | le32_to_cpu(fe->id1.bitmap1.i_total)); | ||
1638 | printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg); | ||
1639 | printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc); | ||
1640 | printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count); | ||
1641 | printk("id2.i_chain.cl_next_free_rec: %u\n", | ||
1642 | fe->id2.i_chain.cl_next_free_rec); | ||
1643 | for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) { | ||
1644 | printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i, | ||
1645 | fe->id2.i_chain.cl_recs[i].c_free); | ||
1646 | printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, | ||
1647 | fe->id2.i_chain.cl_recs[i].c_total); | ||
1648 | printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %"MLFu64"\n", i, | ||
1649 | fe->id2.i_chain.cl_recs[i].c_blkno); | ||
1650 | } | ||
1651 | } | ||
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h new file mode 100644 index 000000000000..a76c82a7ceac --- /dev/null +++ b/fs/ocfs2/suballoc.h | |||
@@ -0,0 +1,132 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * suballoc.h | ||
5 | * | ||
6 | * Defines the sub-allocator API | ||
7 | * | ||
8 | * Copyright (C) 2003, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef _CHAINALLOC_H_ | ||
27 | #define _CHAINALLOC_H_ | ||
28 | |||
29 | typedef int (group_search_t)(struct inode *, | ||
30 | struct buffer_head *, | ||
31 | u32, | ||
32 | u32, | ||
33 | u16 *, | ||
34 | u16 *); | ||
35 | |||
36 | struct ocfs2_alloc_context { | ||
37 | struct inode *ac_inode; /* which bitmap are we allocating from? */ | ||
38 | struct buffer_head *ac_bh; /* file entry bh */ | ||
39 | u32 ac_bits_wanted; | ||
40 | u32 ac_bits_given; | ||
41 | #define OCFS2_AC_USE_LOCAL 1 | ||
42 | #define OCFS2_AC_USE_MAIN 2 | ||
43 | #define OCFS2_AC_USE_INODE 3 | ||
44 | #define OCFS2_AC_USE_META 4 | ||
45 | u32 ac_which; | ||
46 | struct ocfs2_journal_handle *ac_handle; | ||
47 | |||
48 | /* these are used by the chain search */ | ||
49 | u16 ac_chain; | ||
50 | int ac_allow_chain_relink; | ||
51 | group_search_t *ac_group_search; | ||
52 | }; | ||
53 | |||
54 | void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); | ||
55 | static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) | ||
56 | { | ||
57 | return ac->ac_bits_wanted - ac->ac_bits_given; | ||
58 | } | ||
59 | |||
60 | int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, | ||
61 | struct ocfs2_journal_handle *handle, | ||
62 | struct ocfs2_dinode *fe, | ||
63 | struct ocfs2_alloc_context **ac); | ||
64 | int ocfs2_reserve_new_inode(struct ocfs2_super *osb, | ||
65 | struct ocfs2_journal_handle *handle, | ||
66 | struct ocfs2_alloc_context **ac); | ||
67 | int ocfs2_reserve_clusters(struct ocfs2_super *osb, | ||
68 | struct ocfs2_journal_handle *handle, | ||
69 | u32 bits_wanted, | ||
70 | struct ocfs2_alloc_context **ac); | ||
71 | |||
72 | int ocfs2_claim_metadata(struct ocfs2_super *osb, | ||
73 | struct ocfs2_journal_handle *handle, | ||
74 | struct ocfs2_alloc_context *ac, | ||
75 | u32 bits_wanted, | ||
76 | u16 *suballoc_bit_start, | ||
77 | u32 *num_bits, | ||
78 | u64 *blkno_start); | ||
79 | int ocfs2_claim_new_inode(struct ocfs2_super *osb, | ||
80 | struct ocfs2_journal_handle *handle, | ||
81 | struct ocfs2_alloc_context *ac, | ||
82 | u16 *suballoc_bit, | ||
83 | u64 *fe_blkno); | ||
84 | int ocfs2_claim_clusters(struct ocfs2_super *osb, | ||
85 | struct ocfs2_journal_handle *handle, | ||
86 | struct ocfs2_alloc_context *ac, | ||
87 | u32 min_clusters, | ||
88 | u32 *cluster_start, | ||
89 | u32 *num_clusters); | ||
90 | |||
91 | int ocfs2_free_dinode(struct ocfs2_journal_handle *handle, | ||
92 | struct inode *inode_alloc_inode, | ||
93 | struct buffer_head *inode_alloc_bh, | ||
94 | struct ocfs2_dinode *di); | ||
95 | int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle, | ||
96 | struct inode *eb_alloc_inode, | ||
97 | struct buffer_head *eb_alloc_bh, | ||
98 | struct ocfs2_extent_block *eb); | ||
99 | int ocfs2_free_clusters(struct ocfs2_journal_handle *handle, | ||
100 | struct inode *bitmap_inode, | ||
101 | struct buffer_head *bitmap_bh, | ||
102 | u64 start_blk, | ||
103 | unsigned int num_clusters); | ||
104 | |||
105 | static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, | ||
106 | u64 bg_blkno) | ||
107 | { | ||
108 | /* This should work for all block group descriptors as only | ||
109 | * the 1st group descriptor of the cluster bitmap is | ||
110 | * different. */ | ||
111 | |||
112 | if (bg_blkno == osb->first_cluster_group_blkno) | ||
113 | return 0; | ||
114 | |||
115 | /* the rest of the block groups are located at the beginning | ||
116 | * of their 1st cluster, so a direct translation just | ||
117 | * works. */ | ||
118 | return ocfs2_blocks_to_clusters(osb->sb, bg_blkno); | ||
119 | } | ||
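As a worked example under an assumed geometry of 4K blocks and 32K clusters (8 blocks per cluster): a group descriptor at block 2048 translates directly to cluster 2048 / 8 = 256, while the descriptor at osb->first_cluster_group_blkno is special-cased to cluster 0 because, per the comment, it is the one group descriptor not located at the start of its first cluster.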
120 | |||
121 | static inline int ocfs2_is_cluster_bitmap(struct inode *inode) | ||
122 | { | ||
123 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
124 | return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno; | ||
125 | } | ||
126 | |||
127 | /* This is for local alloc ONLY. Others should use the task-specific | ||
128 | * apis above. */ | ||
129 | int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, | ||
130 | struct ocfs2_alloc_context *ac); | ||
131 | |||
132 | #endif /* _CHAINALLOC_H_ */ | ||
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c new file mode 100644 index 000000000000..48bf7f0ce544 --- /dev/null +++ b/fs/ocfs2/super.c | |||
@@ -0,0 +1,1733 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * super.c | ||
5 | * | ||
6 | * load/unload driver, mount/dismount volumes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/fs.h> | ||
28 | #include <linux/types.h> | ||
29 | #include <linux/slab.h> | ||
30 | #include <linux/highmem.h> | ||
31 | #include <linux/utsname.h> | ||
32 | #include <linux/init.h> | ||
33 | #include <linux/random.h> | ||
34 | #include <linux/statfs.h> | ||
35 | #include <linux/moduleparam.h> | ||
36 | #include <linux/blkdev.h> | ||
37 | #include <linux/socket.h> | ||
38 | #include <linux/inet.h> | ||
39 | #include <linux/parser.h> | ||
40 | #include <linux/crc32.h> | ||
41 | #include <linux/debugfs.h> | ||
42 | |||
43 | #include <cluster/nodemanager.h> | ||
44 | |||
45 | #define MLOG_MASK_PREFIX ML_SUPER | ||
46 | #include <cluster/masklog.h> | ||
47 | |||
48 | #include "ocfs2.h" | ||
49 | |||
50 | /* this should be the only file to include a version 1 header */ | ||
51 | #include "ocfs1_fs_compat.h" | ||
52 | |||
53 | #include "alloc.h" | ||
54 | #include "dlmglue.h" | ||
55 | #include "export.h" | ||
56 | #include "extent_map.h" | ||
57 | #include "heartbeat.h" | ||
58 | #include "inode.h" | ||
59 | #include "journal.h" | ||
60 | #include "localalloc.h" | ||
61 | #include "namei.h" | ||
62 | #include "slot_map.h" | ||
63 | #include "super.h" | ||
64 | #include "sysfile.h" | ||
65 | #include "uptodate.h" | ||
66 | #include "ver.h" | ||
67 | #include "vote.h" | ||
68 | |||
69 | #include "buffer_head_io.h" | ||
70 | |||
71 | /* | ||
72 | * Globals | ||
73 | */ | ||
74 | static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED; | ||
75 | |||
76 | static u32 osb_id; /* Keeps track of next available OSB Id */ | ||
77 | |||
78 | static kmem_cache_t *ocfs2_inode_cachep = NULL; | ||
79 | |||
80 | kmem_cache_t *ocfs2_lock_cache = NULL; | ||
81 | |||
82 | /* OCFS2 needs to schedule several different types of work which | ||
83 | * require cluster locking, disk I/O, recovery waits, etc. Since these | ||
84 | * types of work tend to be heavy we avoid using the kernel events | ||
85 | * workqueue and schedule on our own. */ | ||
86 | struct workqueue_struct *ocfs2_wq = NULL; | ||
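A sketch of how such heavy work might be queued on this private workqueue, using the three-argument INIT_WORK of this kernel era (the same form appears in ocfs2_initialize_super() below); the handler and helper names are illustrative:

	/* handlers queued here may block on cluster locks, disk I/O or
	 * recovery waits, which is why they stay off the shared kernel
	 * events workqueue */
	static void ocfs2_example_handler(void *data)
	{
		struct ocfs2_super *osb = data;
		/* ... heavy, possibly-blocking work ... */
	}

	static void ocfs2_example_queue(struct ocfs2_super *osb,
					struct work_struct *work)
	{
		INIT_WORK(work, ocfs2_example_handler, osb);
		queue_work(ocfs2_wq, work);
	}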
87 | |||
88 | static struct dentry *ocfs2_debugfs_root = NULL; | ||
89 | |||
90 | MODULE_AUTHOR("Oracle"); | ||
91 | MODULE_LICENSE("GPL"); | ||
92 | |||
93 | static int ocfs2_parse_options(struct super_block *sb, char *options, | ||
94 | unsigned long *mount_opt, int is_remount); | ||
95 | static void ocfs2_put_super(struct super_block *sb); | ||
96 | static int ocfs2_mount_volume(struct super_block *sb); | ||
97 | static int ocfs2_remount(struct super_block *sb, int *flags, char *data); | ||
98 | static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); | ||
99 | static int ocfs2_initialize_mem_caches(void); | ||
100 | static void ocfs2_free_mem_caches(void); | ||
101 | static void ocfs2_delete_osb(struct ocfs2_super *osb); | ||
102 | |||
103 | static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf); | ||
104 | |||
105 | static int ocfs2_sync_fs(struct super_block *sb, int wait); | ||
106 | |||
107 | static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); | ||
108 | static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); | ||
109 | static int ocfs2_release_system_inodes(struct ocfs2_super *osb); | ||
110 | static int ocfs2_fill_local_node_info(struct ocfs2_super *osb); | ||
111 | static int ocfs2_check_volume(struct ocfs2_super *osb); | ||
112 | static int ocfs2_verify_volume(struct ocfs2_dinode *di, | ||
113 | struct buffer_head *bh, | ||
114 | u32 sectsize); | ||
115 | static int ocfs2_initialize_super(struct super_block *sb, | ||
116 | struct buffer_head *bh, | ||
117 | int sector_size); | ||
118 | static int ocfs2_get_sector(struct super_block *sb, | ||
119 | struct buffer_head **bh, | ||
120 | int block, | ||
121 | int sect_size); | ||
122 | static void ocfs2_write_super(struct super_block *sb); | ||
123 | static struct inode *ocfs2_alloc_inode(struct super_block *sb); | ||
124 | static void ocfs2_destroy_inode(struct inode *inode); | ||
125 | |||
126 | static unsigned long long ocfs2_max_file_offset(unsigned int blockshift); | ||
127 | |||
128 | static struct super_operations ocfs2_sops = { | ||
129 | .statfs = ocfs2_statfs, | ||
130 | .alloc_inode = ocfs2_alloc_inode, | ||
131 | .destroy_inode = ocfs2_destroy_inode, | ||
132 | .drop_inode = ocfs2_drop_inode, | ||
133 | .clear_inode = ocfs2_clear_inode, | ||
134 | .delete_inode = ocfs2_delete_inode, | ||
135 | .sync_fs = ocfs2_sync_fs, | ||
136 | .write_super = ocfs2_write_super, | ||
137 | .put_super = ocfs2_put_super, | ||
138 | .remount_fs = ocfs2_remount, | ||
139 | }; | ||
140 | |||
141 | enum { | ||
142 | Opt_barrier, | ||
143 | Opt_err_panic, | ||
144 | Opt_err_ro, | ||
145 | Opt_intr, | ||
146 | Opt_nointr, | ||
147 | Opt_hb_none, | ||
148 | Opt_hb_local, | ||
149 | Opt_data_ordered, | ||
150 | Opt_data_writeback, | ||
151 | Opt_err, | ||
152 | }; | ||
153 | |||
154 | static match_table_t tokens = { | ||
155 | {Opt_barrier, "barrier=%u"}, | ||
156 | {Opt_err_panic, "errors=panic"}, | ||
157 | {Opt_err_ro, "errors=remount-ro"}, | ||
158 | {Opt_intr, "intr"}, | ||
159 | {Opt_nointr, "nointr"}, | ||
160 | {Opt_hb_none, OCFS2_HB_NONE}, | ||
161 | {Opt_hb_local, OCFS2_HB_LOCAL}, | ||
162 | {Opt_data_ordered, "data=ordered"}, | ||
163 | {Opt_data_writeback, "data=writeback"}, | ||
164 | {Opt_err, NULL} | ||
165 | }; | ||
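For example, mounting with -o data=writeback,barrier=1 leaves OCFS2_MOUNT_DATA_WRITEBACK and OCFS2_MOUNT_BARRIER set in the parsed flags, while errors=remount-ro clears OCFS2_MOUNT_ERRORS_PANIC, matching the default since *mount_opt starts at zero (see ocfs2_parse_options() below).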
166 | |||
167 | /* | ||
168 | * write_super and sync_fs ripped right out of ext3. | ||
169 | */ | ||
170 | static void ocfs2_write_super(struct super_block *sb) | ||
171 | { | ||
172 | if (down_trylock(&sb->s_lock) == 0) | ||
173 | BUG(); | ||
174 | sb->s_dirt = 0; | ||
175 | } | ||
176 | |||
177 | static int ocfs2_sync_fs(struct super_block *sb, int wait) | ||
178 | { | ||
179 | int status = 0; | ||
180 | tid_t target; | ||
181 | struct ocfs2_super *osb = OCFS2_SB(sb); | ||
182 | |||
183 | sb->s_dirt = 0; | ||
184 | |||
185 | if (ocfs2_is_hard_readonly(osb)) | ||
186 | return -EROFS; | ||
187 | |||
188 | if (wait) { | ||
189 | status = ocfs2_flush_truncate_log(osb); | ||
190 | if (status < 0) | ||
191 | mlog_errno(status); | ||
192 | } else { | ||
193 | ocfs2_schedule_truncate_log_flush(osb, 0); | ||
194 | } | ||
195 | |||
196 | if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) { | ||
197 | if (wait) | ||
198 | log_wait_commit(OCFS2_SB(sb)->journal->j_journal, | ||
199 | target); | ||
200 | } | ||
201 | return 0; | ||
202 | } | ||
203 | |||
204 | static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) | ||
205 | { | ||
206 | struct inode *new = NULL; | ||
207 | int status = 0; | ||
208 | int i; | ||
209 | |||
210 | mlog_entry_void(); | ||
211 | |||
212 | new = ocfs2_iget(osb, osb->root_blkno); | ||
213 | if (IS_ERR(new)) { | ||
214 | status = PTR_ERR(new); | ||
215 | mlog_errno(status); | ||
216 | goto bail; | ||
217 | } | ||
218 | osb->root_inode = new; | ||
219 | |||
220 | new = ocfs2_iget(osb, osb->system_dir_blkno); | ||
221 | if (IS_ERR(new)) { | ||
222 | status = PTR_ERR(new); | ||
223 | mlog_errno(status); | ||
224 | goto bail; | ||
225 | } | ||
226 | osb->sys_root_inode = new; | ||
227 | |||
228 | for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; | ||
229 | i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { | ||
230 | new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); | ||
231 | if (!new) { | ||
232 | ocfs2_release_system_inodes(osb); | ||
233 | status = -EINVAL; | ||
234 | mlog_errno(status); | ||
235 | /* FIXME: Should ERROR_RO_FS */ | ||
236 | mlog(ML_ERROR, "Unable to load system inode %d, " | ||
237 | "possibly corrupt fs?", i); | ||
238 | goto bail; | ||
239 | } | ||
240 | // the array now has one ref, so drop this one | ||
241 | iput(new); | ||
242 | } | ||
243 | |||
244 | bail: | ||
245 | mlog_exit(status); | ||
246 | return status; | ||
247 | } | ||
248 | |||
249 | static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) | ||
250 | { | ||
251 | struct inode *new = NULL; | ||
252 | int status = 0; | ||
253 | int i; | ||
254 | |||
255 | mlog_entry_void(); | ||
256 | |||
257 | for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; | ||
258 | i < NUM_SYSTEM_INODES; | ||
259 | i++) { | ||
260 | new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); | ||
261 | if (!new) { | ||
262 | ocfs2_release_system_inodes(osb); | ||
263 | status = -EINVAL; | ||
264 | mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", | ||
265 | status, i, osb->slot_num); | ||
266 | goto bail; | ||
267 | } | ||
268 | /* the array now has one ref, so drop this one */ | ||
269 | iput(new); | ||
270 | } | ||
271 | |||
272 | bail: | ||
273 | mlog_exit(status); | ||
274 | return status; | ||
275 | } | ||
276 | |||
277 | static int ocfs2_release_system_inodes(struct ocfs2_super *osb) | ||
278 | { | ||
279 | int status = 0, i; | ||
280 | struct inode *inode; | ||
281 | |||
282 | mlog_entry_void(); | ||
283 | |||
284 | for (i = 0; i < NUM_SYSTEM_INODES; i++) { | ||
285 | inode = osb->system_inodes[i]; | ||
286 | if (inode) { | ||
287 | iput(inode); | ||
288 | osb->system_inodes[i] = NULL; | ||
289 | } | ||
290 | } | ||
291 | |||
292 | inode = osb->sys_root_inode; | ||
293 | if (inode) { | ||
294 | iput(inode); | ||
295 | osb->sys_root_inode = NULL; | ||
296 | } | ||
297 | |||
298 | inode = osb->root_inode; | ||
299 | if (inode) { | ||
300 | iput(inode); | ||
301 | osb->root_inode = NULL; | ||
302 | } | ||
303 | |||
304 | mlog_exit(status); | ||
305 | return status; | ||
306 | } | ||
307 | |||
308 | /* We're allocating fs objects, use GFP_NOFS */ | ||
309 | static struct inode *ocfs2_alloc_inode(struct super_block *sb) | ||
310 | { | ||
311 | struct ocfs2_inode_info *oi; | ||
312 | |||
313 | oi = kmem_cache_alloc(ocfs2_inode_cachep, SLAB_NOFS); | ||
314 | if (!oi) | ||
315 | return NULL; | ||
316 | |||
317 | return &oi->vfs_inode; | ||
318 | } | ||
319 | |||
320 | static void ocfs2_destroy_inode(struct inode *inode) | ||
321 | { | ||
322 | kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); | ||
323 | } | ||
324 | |||
325 | /* From xfs_super.c:xfs_max_file_offset | ||
326 | * Copyright (c) 2000-2004 Silicon Graphics, Inc. | ||
327 | */ | ||
328 | static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) | ||
329 | { | ||
330 | unsigned int pagefactor = 1; | ||
331 | unsigned int bitshift = BITS_PER_LONG - 1; | ||
332 | |||
333 | /* Figure out maximum filesize, on Linux this can depend on | ||
334 | * the filesystem blocksize (on 32 bit platforms). | ||
335 | * __block_prepare_write does this in an [unsigned] long... | ||
336 | * page->index << (PAGE_CACHE_SHIFT - bbits) | ||
337 | * So, for page sized blocks (4K on 32 bit platforms), | ||
338 | * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is | ||
339 | * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) | ||
340 | * but for smaller blocksizes it is less (bbits = log2 bsize). | ||
341 | * Note1: get_block_t takes a long (implicit cast from above) | ||
342 | * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch | ||
343 | * can optionally convert the [unsigned] long from above into | ||
344 | * an [unsigned] long long. | ||
345 | */ | ||
346 | |||
347 | #if BITS_PER_LONG == 32 | ||
348 | # if defined(CONFIG_LBD) | ||
349 | BUG_ON(sizeof(sector_t) != 8); | ||
350 | pagefactor = PAGE_CACHE_SIZE; | ||
351 | bitshift = BITS_PER_LONG; | ||
352 | # else | ||
353 | pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift); | ||
354 | # endif | ||
355 | #endif | ||
356 | |||
357 | return (((unsigned long long)pagefactor) << bitshift) - 1; | ||
358 | } | ||
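Working the arithmetic for representative configurations: on a 64-bit kernel pagefactor stays 1 and bitshift is 63, so the limit is 2^63 - 1. On a 32-bit kernel without CONFIG_LBD, with 4K pages and 4K filesystem blocks, pagefactor is 4096 >> 0 = 4096 against bitshift 31, giving (4096 << 31) - 1 = 2^43 - 1, just under 8TB; with 512-byte blocks pagefactor drops to 512 and the limit to 2^40 - 1, just under 1TB.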
359 | |||
360 | static int ocfs2_remount(struct super_block *sb, int *flags, char *data) | ||
361 | { | ||
362 | int incompat_features; | ||
363 | int ret = 0; | ||
364 | unsigned long parsed_options; | ||
365 | struct ocfs2_super *osb = OCFS2_SB(sb); | ||
366 | |||
367 | if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { | ||
368 | ret = -EINVAL; | ||
369 | goto out; | ||
370 | } | ||
371 | |||
372 | if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != | ||
373 | (parsed_options & OCFS2_MOUNT_HB_LOCAL)) { | ||
374 | ret = -EINVAL; | ||
375 | mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); | ||
376 | goto out; | ||
377 | } | ||
378 | |||
379 | if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != | ||
380 | (parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) { | ||
381 | ret = -EINVAL; | ||
382 | mlog(ML_ERROR, "Cannot change data mode on remount\n"); | ||
383 | goto out; | ||
384 | } | ||
385 | |||
386 | /* We're going to/from readonly mode. */ | ||
387 | if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { | ||
388 | /* Lock here so the check of HARD_RO and the potential | ||
389 | * setting of SOFT_RO is atomic. */ | ||
390 | spin_lock(&osb->osb_lock); | ||
391 | if (osb->osb_flags & OCFS2_OSB_HARD_RO) { | ||
392 | mlog(ML_ERROR, "Remount on readonly device is forbidden.\n"); | ||
393 | ret = -EROFS; | ||
394 | goto unlock_osb; | ||
395 | } | ||
396 | |||
397 | if (*flags & MS_RDONLY) { | ||
398 | mlog(0, "Going to ro mode.\n"); | ||
399 | sb->s_flags |= MS_RDONLY; | ||
400 | osb->osb_flags |= OCFS2_OSB_SOFT_RO; | ||
401 | } else { | ||
402 | mlog(0, "Making ro filesystem writeable.\n"); | ||
403 | |||
404 | if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { | ||
405 | mlog(ML_ERROR, "Cannot remount RDWR " | ||
406 | "filesystem due to previous errors.\n"); | ||
407 | ret = -EROFS; | ||
408 | goto unlock_osb; | ||
409 | } | ||
410 | incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP); | ||
411 | if (incompat_features) { | ||
412 | mlog(ML_ERROR, "Cannot remount RDWR because " | ||
413 | "of unsupported optional features " | ||
414 | "(%x).\n", incompat_features); | ||
415 | ret = -EINVAL; | ||
416 | goto unlock_osb; | ||
417 | } | ||
418 | sb->s_flags &= ~MS_RDONLY; | ||
419 | osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; | ||
420 | } | ||
421 | unlock_osb: | ||
422 | spin_unlock(&osb->osb_lock); | ||
423 | } | ||
424 | |||
425 | if (!ret) { | ||
426 | if (!ocfs2_is_hard_readonly(osb)) | ||
427 | ocfs2_set_journal_params(osb); | ||
428 | |||
429 | /* Only save off the new mount options in case of a successful | ||
430 | * remount. */ | ||
431 | osb->s_mount_opt = parsed_options; | ||
432 | } | ||
433 | out: | ||
434 | return ret; | ||
435 | } | ||
436 | |||
437 | static int ocfs2_sb_probe(struct super_block *sb, | ||
438 | struct buffer_head **bh, | ||
439 | int *sector_size) | ||
440 | { | ||
441 | int status = 0, tmpstat; | ||
442 | struct ocfs1_vol_disk_hdr *hdr; | ||
443 | struct ocfs2_dinode *di; | ||
444 | int blksize; | ||
445 | |||
446 | *bh = NULL; | ||
447 | |||
448 | /* may be > 512 */ | ||
449 | *sector_size = bdev_hardsect_size(sb->s_bdev); | ||
450 | if (*sector_size > OCFS2_MAX_BLOCKSIZE) { | ||
451 | mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", | ||
452 | *sector_size, OCFS2_MAX_BLOCKSIZE); | ||
453 | status = -EINVAL; | ||
454 | goto bail; | ||
455 | } | ||
456 | |||
457 | /* Can this really happen? */ | ||
458 | if (*sector_size < OCFS2_MIN_BLOCKSIZE) | ||
459 | *sector_size = OCFS2_MIN_BLOCKSIZE; | ||
460 | |||
461 | /* check block zero for old format */ | ||
462 | status = ocfs2_get_sector(sb, bh, 0, *sector_size); | ||
463 | if (status < 0) { | ||
464 | mlog_errno(status); | ||
465 | goto bail; | ||
466 | } | ||
467 | hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data; | ||
468 | if (hdr->major_version == OCFS1_MAJOR_VERSION) { | ||
469 | mlog(ML_ERROR, "incompatible version: %u.%u\n", | ||
470 | hdr->major_version, hdr->minor_version); | ||
471 | status = -EINVAL; | ||
472 | } | ||
473 | if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE, | ||
474 | strlen(OCFS1_VOLUME_SIGNATURE)) == 0) { | ||
475 | mlog(ML_ERROR, "incompatible volume signature: %8s\n", | ||
476 | hdr->signature); | ||
477 | status = -EINVAL; | ||
478 | } | ||
479 | brelse(*bh); | ||
480 | *bh = NULL; | ||
481 | if (status < 0) { | ||
482 | mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be " | ||
483 | "upgraded before mounting with ocfs v2\n"); | ||
484 | goto bail; | ||
485 | } | ||
486 | |||
487 | /* | ||
488 | * Now check at magic offset for 512, 1024, 2048, 4096 | ||
489 | * blocksizes. 4096 is the maximum blocksize because it is | ||
490 | * the minimum clustersize. | ||
491 | */ | ||
492 | status = -EINVAL; | ||
493 | for (blksize = *sector_size; | ||
494 | blksize <= OCFS2_MAX_BLOCKSIZE; | ||
495 | blksize <<= 1) { | ||
496 | tmpstat = ocfs2_get_sector(sb, bh, | ||
497 | OCFS2_SUPER_BLOCK_BLKNO, | ||
498 | blksize); | ||
499 | if (tmpstat < 0) { | ||
500 | status = tmpstat; | ||
501 | mlog_errno(status); | ||
502 | goto bail; | ||
503 | } | ||
504 | di = (struct ocfs2_dinode *) (*bh)->b_data; | ||
505 | status = ocfs2_verify_volume(di, *bh, blksize); | ||
506 | if (status >= 0) | ||
507 | goto bail; | ||
508 | brelse(*bh); | ||
509 | *bh = NULL; | ||
510 | if (status != -EAGAIN) | ||
511 | break; | ||
512 | } | ||
513 | |||
514 | bail: | ||
515 | return status; | ||
516 | } | ||
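So on a device reporting 512-byte hardware sectors, this reads OCFS2_SUPER_BLOCK_BLKNO at blocksizes 512, 1024, 2048 and 4096 in turn; the loop only advances to the next size when ocfs2_verify_volume() returns -EAGAIN (presumably signalling that the magic was not found at that blocksize), and bails out immediately on success or on any other error.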
517 | |||
518 | static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | ||
519 | { | ||
520 | struct dentry *root; | ||
521 | int status, sector_size; | ||
522 | unsigned long parsed_opt; | ||
523 | struct inode *inode = NULL; | ||
524 | struct ocfs2_super *osb = NULL; | ||
525 | struct buffer_head *bh = NULL; | ||
526 | |||
527 | mlog_entry("%p, %p, %i", sb, data, silent); | ||
528 | |||
529 | /* for now we only have one cluster/node, make sure we see it | ||
530 | * in the heartbeat universe */ | ||
531 | if (!o2hb_check_local_node_heartbeating()) { | ||
532 | status = -EINVAL; | ||
533 | goto read_super_error; | ||
534 | } | ||
535 | |||
536 | /* probe for superblock */ | ||
537 | status = ocfs2_sb_probe(sb, &bh, §or_size); | ||
538 | if (status < 0) { | ||
539 | mlog(ML_ERROR, "superblock probe failed!\n"); | ||
540 | goto read_super_error; | ||
541 | } | ||
542 | |||
543 | status = ocfs2_initialize_super(sb, bh, sector_size); | ||
544 | osb = OCFS2_SB(sb); | ||
545 | if (status < 0) { | ||
546 | mlog_errno(status); | ||
547 | goto read_super_error; | ||
548 | } | ||
549 | brelse(bh); | ||
550 | bh = NULL; | ||
551 | |||
552 | if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { | ||
553 | status = -EINVAL; | ||
554 | goto read_super_error; | ||
555 | } | ||
556 | osb->s_mount_opt = parsed_opt; | ||
557 | |||
558 | sb->s_magic = OCFS2_SUPER_MAGIC; | ||
559 | |||
560 | /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, | ||
561 | * heartbeat=none */ | ||
562 | if (bdev_read_only(sb->s_bdev)) { | ||
563 | if (!(sb->s_flags & MS_RDONLY)) { | ||
564 | status = -EACCES; | ||
565 | mlog(ML_ERROR, "Readonly device detected but readonly " | ||
566 | "mount was not specified.\n"); | ||
567 | goto read_super_error; | ||
568 | } | ||
569 | |||
570 | /* You should not be able to start a local heartbeat | ||
571 | * on a readonly device. */ | ||
572 | if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { | ||
573 | status = -EROFS; | ||
574 | mlog(ML_ERROR, "Local heartbeat specified on readonly " | ||
575 | "device.\n"); | ||
576 | goto read_super_error; | ||
577 | } | ||
578 | |||
579 | status = ocfs2_check_journals_nolocks(osb); | ||
580 | if (status < 0) { | ||
581 | if (status == -EROFS) | ||
582 | mlog(ML_ERROR, "Recovery required on readonly " | ||
583 | "file system, but write access is " | ||
584 | "unavailable.\n"); | ||
585 | else | ||
586 | mlog_errno(status); | ||
587 | goto read_super_error; | ||
588 | } | ||
589 | |||
590 | ocfs2_set_ro_flag(osb, 1); | ||
591 | |||
592 | printk(KERN_NOTICE "Readonly device detected. No cluster " | ||
593 | "services will be utilized for this mount. Recovery " | ||
594 | "will be skipped.\n"); | ||
595 | } | ||
596 | |||
597 | if (!ocfs2_is_hard_readonly(osb)) { | ||
598 | /* If this isn't a hard readonly mount, then we need | ||
599 | * to make sure that heartbeat is in a valid state, | ||
600 | * and that we mark ourselves soft readonly if -o ro | ||
601 | * was specified. */ | ||
602 | if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { | ||
603 | mlog(ML_ERROR, "No heartbeat for device (%s)\n", | ||
604 | sb->s_id); | ||
605 | status = -EINVAL; | ||
606 | goto read_super_error; | ||
607 | } | ||
608 | |||
609 | if (sb->s_flags & MS_RDONLY) | ||
610 | ocfs2_set_ro_flag(osb, 0); | ||
611 | } | ||
612 | |||
613 | osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, | ||
614 | ocfs2_debugfs_root); | ||
615 | if (!osb->osb_debug_root) { | ||
616 | status = -EINVAL; | ||
617 | mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); | ||
618 | goto read_super_error; | ||
619 | } | ||
620 | |||
621 | status = ocfs2_mount_volume(sb); | ||
622 | if (osb->root_inode) | ||
623 | inode = igrab(osb->root_inode); | ||
624 | |||
625 | if (status < 0) | ||
626 | goto read_super_error; | ||
627 | |||
628 | if (!inode) { | ||
629 | status = -EIO; | ||
630 | mlog_errno(status); | ||
631 | goto read_super_error; | ||
632 | } | ||
633 | |||
634 | root = d_alloc_root(inode); | ||
635 | if (!root) { | ||
636 | status = -ENOMEM; | ||
637 | mlog_errno(status); | ||
638 | goto read_super_error; | ||
639 | } | ||
640 | |||
641 | sb->s_root = root; | ||
642 | |||
643 | ocfs2_complete_mount_recovery(osb); | ||
644 | |||
645 | printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s " | ||
646 | "data mode.\n", | ||
647 | MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num, | ||
648 | osb->slot_num, | ||
649 | osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : | ||
650 | "ordered"); | ||
651 | |||
652 | atomic_set(&osb->vol_state, VOLUME_MOUNTED); | ||
653 | wake_up(&osb->osb_mount_event); | ||
654 | |||
655 | mlog_exit(status); | ||
656 | return status; | ||
657 | |||
658 | read_super_error: | ||
659 | if (bh != NULL) | ||
660 | brelse(bh); | ||
661 | |||
662 | if (inode) | ||
663 | iput(inode); | ||
664 | |||
665 | if (osb) { | ||
666 | atomic_set(&osb->vol_state, VOLUME_DISABLED); | ||
667 | wake_up(&osb->osb_mount_event); | ||
668 | ocfs2_dismount_volume(sb, 1); | ||
669 | } | ||
670 | |||
671 | mlog_exit(status); | ||
672 | return status; | ||
673 | } | ||
674 | |||
675 | static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type, | ||
676 | int flags, | ||
677 | const char *dev_name, | ||
678 | void *data) | ||
679 | { | ||
680 | return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); | ||
681 | } | ||
682 | |||
683 | static struct file_system_type ocfs2_fs_type = { | ||
684 | .owner = THIS_MODULE, | ||
685 | .name = "ocfs2", | ||
686 | .get_sb = ocfs2_get_sb, /* is this called when we mount | ||
687 | * the fs? */ | ||
688 | .kill_sb = kill_block_super, /* set to the generic one | ||
689 | * right now, but do we | ||
690 | * need to change that? */ | ||
691 | .fs_flags = FS_REQUIRES_DEV, | ||
692 | .next = NULL | ||
693 | }; | ||
694 | |||
695 | static int ocfs2_parse_options(struct super_block *sb, | ||
696 | char *options, | ||
697 | unsigned long *mount_opt, | ||
698 | int is_remount) | ||
699 | { | ||
700 | int status; | ||
701 | char *p; | ||
702 | |||
703 | mlog_entry("remount: %d, options: \"%s\"\n", is_remount, | ||
704 | options ? options : "(none)"); | ||
705 | |||
706 | *mount_opt = 0; | ||
707 | |||
708 | if (!options) { | ||
709 | status = 1; | ||
710 | goto bail; | ||
711 | } | ||
712 | |||
713 | while ((p = strsep(&options, ",")) != NULL) { | ||
714 | int token, option; | ||
715 | substring_t args[MAX_OPT_ARGS]; | ||
716 | |||
717 | if (!*p) | ||
718 | continue; | ||
719 | |||
720 | token = match_token(p, tokens, args); | ||
721 | switch (token) { | ||
722 | case Opt_hb_local: | ||
723 | *mount_opt |= OCFS2_MOUNT_HB_LOCAL; | ||
724 | break; | ||
725 | case Opt_hb_none: | ||
726 | *mount_opt &= ~OCFS2_MOUNT_HB_LOCAL; | ||
727 | break; | ||
728 | case Opt_barrier: | ||
729 | if (match_int(&args[0], &option)) { | ||
730 | status = 0; | ||
731 | goto bail; | ||
732 | } | ||
733 | if (option) | ||
734 | *mount_opt |= OCFS2_MOUNT_BARRIER; | ||
735 | else | ||
736 | *mount_opt &= ~OCFS2_MOUNT_BARRIER; | ||
737 | break; | ||
738 | case Opt_intr: | ||
739 | *mount_opt &= ~OCFS2_MOUNT_NOINTR; | ||
740 | break; | ||
741 | case Opt_nointr: | ||
742 | *mount_opt |= OCFS2_MOUNT_NOINTR; | ||
743 | break; | ||
744 | case Opt_err_panic: | ||
745 | *mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; | ||
746 | break; | ||
747 | case Opt_err_ro: | ||
748 | *mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; | ||
749 | break; | ||
750 | case Opt_data_ordered: | ||
751 | *mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; | ||
752 | break; | ||
753 | case Opt_data_writeback: | ||
754 | *mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; | ||
755 | break; | ||
756 | default: | ||
757 | mlog(ML_ERROR, | ||
758 | "Unrecognized mount option \"%s\" " | ||
759 | "or missing value\n", p); | ||
760 | status = 0; | ||
761 | goto bail; | ||
762 | } | ||
763 | } | ||
764 | |||
765 | status = 1; | ||
766 | |||
767 | bail: | ||
768 | mlog_exit(status); | ||
769 | return status; | ||
770 | } | ||
771 | |||
772 | static int __init ocfs2_init(void) | ||
773 | { | ||
774 | int status; | ||
775 | |||
776 | mlog_entry_void(); | ||
777 | |||
778 | ocfs2_print_version(); | ||
779 | |||
780 | if (init_ocfs2_extent_maps()) | ||
781 | return -ENOMEM; | ||
782 | |||
783 | status = init_ocfs2_uptodate_cache(); | ||
784 | if (status < 0) { | ||
785 | mlog_errno(status); | ||
786 | goto leave; | ||
787 | } | ||
788 | |||
789 | status = ocfs2_initialize_mem_caches(); | ||
790 | if (status < 0) { | ||
791 | mlog_errno(status); | ||
792 | goto leave; | ||
793 | } | ||
794 | |||
795 | ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); | ||
796 | if (!ocfs2_wq) { | ||
797 | status = -ENOMEM; | ||
798 | goto leave; | ||
799 | } | ||
800 | |||
801 | spin_lock(&ocfs2_globals_lock); | ||
802 | osb_id = 0; | ||
803 | spin_unlock(&ocfs2_globals_lock); | ||
804 | |||
805 | ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); | ||
806 | if (!ocfs2_debugfs_root) { | ||
807 | status = -EFAULT; | ||
808 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); | ||
809 | } | ||
810 | |||
811 | leave: | ||
812 | if (status < 0) { | ||
813 | ocfs2_free_mem_caches(); | ||
814 | exit_ocfs2_uptodate_cache(); | ||
815 | exit_ocfs2_extent_maps(); | ||
816 | } | ||
817 | |||
818 | mlog_exit(status); | ||
819 | |||
820 | if (status >= 0) { | ||
821 | return register_filesystem(&ocfs2_fs_type); | ||
822 | } else | ||
823 | return status; | ||
824 | } | ||
825 | |||
826 | static void __exit ocfs2_exit(void) | ||
827 | { | ||
828 | mlog_entry_void(); | ||
829 | |||
830 | if (ocfs2_wq) { | ||
831 | flush_workqueue(ocfs2_wq); | ||
832 | destroy_workqueue(ocfs2_wq); | ||
833 | } | ||
834 | |||
835 | debugfs_remove(ocfs2_debugfs_root); | ||
836 | |||
837 | ocfs2_free_mem_caches(); | ||
838 | |||
839 | unregister_filesystem(&ocfs2_fs_type); | ||
840 | |||
841 | exit_ocfs2_extent_maps(); | ||
842 | |||
843 | exit_ocfs2_uptodate_cache(); | ||
844 | |||
845 | mlog_exit_void(); | ||
846 | } | ||
847 | |||
848 | static void ocfs2_put_super(struct super_block *sb) | ||
849 | { | ||
850 | mlog_entry("(0x%p)\n", sb); | ||
851 | |||
852 | ocfs2_sync_blockdev(sb); | ||
853 | ocfs2_dismount_volume(sb, 0); | ||
854 | |||
855 | mlog_exit_void(); | ||
856 | } | ||
857 | |||
858 | static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf) | ||
859 | { | ||
860 | struct ocfs2_super *osb; | ||
861 | u32 numbits, freebits; | ||
862 | int status; | ||
863 | struct ocfs2_dinode *bm_lock; | ||
864 | struct buffer_head *bh = NULL; | ||
865 | struct inode *inode = NULL; | ||
866 | |||
867 | mlog_entry("(%p, %p)\n", sb, buf); | ||
868 | |||
869 | osb = OCFS2_SB(sb); | ||
870 | |||
871 | inode = ocfs2_get_system_file_inode(osb, | ||
872 | GLOBAL_BITMAP_SYSTEM_INODE, | ||
873 | OCFS2_INVALID_SLOT); | ||
874 | if (!inode) { | ||
875 | mlog(ML_ERROR, "failed to get bitmap inode\n"); | ||
876 | status = -EIO; | ||
877 | goto bail; | ||
878 | } | ||
879 | |||
880 | status = ocfs2_meta_lock(inode, NULL, &bh, 0); | ||
881 | if (status < 0) { | ||
882 | mlog_errno(status); | ||
883 | goto bail; | ||
884 | } | ||
885 | |||
886 | bm_lock = (struct ocfs2_dinode *) bh->b_data; | ||
887 | |||
888 | numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total); | ||
889 | freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); | ||
890 | |||
891 | buf->f_type = OCFS2_SUPER_MAGIC; | ||
892 | buf->f_bsize = sb->s_blocksize; | ||
893 | buf->f_namelen = OCFS2_MAX_FILENAME_LEN; | ||
894 | buf->f_blocks = ((sector_t) numbits) * | ||
895 | (osb->s_clustersize >> osb->sb->s_blocksize_bits); | ||
896 | buf->f_bfree = ((sector_t) freebits) * | ||
897 | (osb->s_clustersize >> osb->sb->s_blocksize_bits); | ||
898 | buf->f_bavail = buf->f_bfree; | ||
899 | buf->f_files = numbits; | ||
900 | buf->f_ffree = freebits; | ||
901 | |||
902 | brelse(bh); | ||
903 | |||
904 | ocfs2_meta_unlock(inode, 0); | ||
905 | status = 0; | ||
906 | bail: | ||
907 | if (inode) | ||
908 | iput(inode); | ||
909 | |||
910 | mlog_exit(status); | ||
911 | |||
912 | return status; | ||
913 | } | ||
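Since each bitmap bit covers one cluster, the totals are scaled by the clusters-to-blocks ratio: assuming 4K blocks and 32K clusters, s_clustersize >> s_blocksize_bits is 8, so a bitmap with 1,000,000 bits of which 250,000 are used reports f_blocks = 8,000,000 and f_bfree = 6,000,000 4K blocks.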
914 | |||
915 | static void ocfs2_inode_init_once(void *data, | ||
916 | kmem_cache_t *cachep, | ||
917 | unsigned long flags) | ||
918 | { | ||
919 | struct ocfs2_inode_info *oi = data; | ||
920 | |||
921 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | ||
922 | SLAB_CTOR_CONSTRUCTOR) { | ||
923 | oi->ip_flags = 0; | ||
924 | oi->ip_open_count = 0; | ||
925 | spin_lock_init(&oi->ip_lock); | ||
926 | ocfs2_extent_map_init(&oi->vfs_inode); | ||
927 | INIT_LIST_HEAD(&oi->ip_handle_list); | ||
928 | INIT_LIST_HEAD(&oi->ip_io_markers); | ||
929 | oi->ip_handle = NULL; | ||
930 | oi->ip_created_trans = 0; | ||
931 | oi->ip_last_trans = 0; | ||
932 | oi->ip_dir_start_lookup = 0; | ||
933 | |||
934 | init_rwsem(&oi->ip_alloc_sem); | ||
935 | init_MUTEX(&(oi->ip_io_sem)); | ||
936 | |||
937 | oi->ip_blkno = 0ULL; | ||
938 | oi->ip_clusters = 0; | ||
939 | |||
940 | ocfs2_lock_res_init_once(&oi->ip_rw_lockres); | ||
941 | ocfs2_lock_res_init_once(&oi->ip_meta_lockres); | ||
942 | ocfs2_lock_res_init_once(&oi->ip_data_lockres); | ||
943 | |||
944 | ocfs2_metadata_cache_init(&oi->vfs_inode); | ||
945 | |||
946 | inode_init_once(&oi->vfs_inode); | ||
947 | } | ||
948 | } | ||
949 | |||
950 | static int ocfs2_initialize_mem_caches(void) | ||
951 | { | ||
952 | ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache", | ||
953 | sizeof(struct ocfs2_inode_info), | ||
954 | 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, | ||
955 | ocfs2_inode_init_once, NULL); | ||
956 | if (!ocfs2_inode_cachep) | ||
957 | return -ENOMEM; | ||
958 | |||
959 | ocfs2_lock_cache = kmem_cache_create("ocfs2_lock", | ||
960 | sizeof(struct ocfs2_journal_lock), | ||
961 | 0, | ||
962 | SLAB_NO_REAP|SLAB_HWCACHE_ALIGN, | ||
963 | NULL, NULL); | ||
964 | if (!ocfs2_lock_cache) | ||
965 | return -ENOMEM; | ||
966 | |||
967 | return 0; | ||
968 | } | ||
969 | |||
970 | static void ocfs2_free_mem_caches(void) | ||
971 | { | ||
972 | if (ocfs2_inode_cachep) | ||
973 | kmem_cache_destroy(ocfs2_inode_cachep); | ||
974 | if (ocfs2_lock_cache) | ||
975 | kmem_cache_destroy(ocfs2_lock_cache); | ||
976 | |||
977 | ocfs2_inode_cachep = NULL; | ||
978 | ocfs2_lock_cache = NULL; | ||
979 | } | ||
980 | |||
981 | static int ocfs2_get_sector(struct super_block *sb, | ||
982 | struct buffer_head **bh, | ||
983 | int block, | ||
984 | int sect_size) | ||
985 | { | ||
986 | if (!sb_set_blocksize(sb, sect_size)) { | ||
987 | mlog(ML_ERROR, "unable to set blocksize\n"); | ||
988 | return -EIO; | ||
989 | } | ||
990 | |||
991 | *bh = sb_getblk(sb, block); | ||
992 | if (!*bh) { | ||
993 | mlog_errno(-EIO); | ||
994 | return -EIO; | ||
995 | } | ||
996 | lock_buffer(*bh); | ||
997 | if (!buffer_dirty(*bh)) | ||
998 | clear_buffer_uptodate(*bh); | ||
999 | unlock_buffer(*bh); | ||
1000 | ll_rw_block(READ, 1, bh); | ||
1001 | wait_on_buffer(*bh); | ||
1002 | return 0; | ||
1003 | } | ||
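The lock_buffer()/clear_buffer_uptodate() dance forces a fresh read: ll_rw_block(READ, ...) skips buffers that are already uptodate, so clearing the flag (unless the buffer is dirty, whose contents must not be thrown away) guarantees the probe sees on-disk contents rather than stale cached data.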
1004 | |||
1005 | /* ocfs2 1.0 only allows one cluster and node identity per kernel image. */ | ||
1006 | static int ocfs2_fill_local_node_info(struct ocfs2_super *osb) | ||
1007 | { | ||
1008 | int status; | ||
1009 | |||
1010 | /* XXX hold a ref on the node while mounted? easy enough, if | ||
1011 | * desirable. */ | ||
1012 | osb->node_num = o2nm_this_node(); | ||
1013 | if (osb->node_num == O2NM_MAX_NODES) { | ||
1014 | mlog(ML_ERROR, "could not find this host's node number\n"); | ||
1015 | status = -ENOENT; | ||
1016 | goto bail; | ||
1017 | } | ||
1018 | |||
1019 | mlog(ML_NOTICE, "I am node %d\n", osb->node_num); | ||
1020 | |||
1021 | status = 0; | ||
1022 | bail: | ||
1023 | return status; | ||
1024 | } | ||
1025 | |||
1026 | static int ocfs2_mount_volume(struct super_block *sb) | ||
1027 | { | ||
1028 | int status = 0; | ||
1029 | int unlock_super = 0; | ||
1030 | struct ocfs2_super *osb = OCFS2_SB(sb); | ||
1031 | |||
1032 | mlog_entry_void(); | ||
1033 | |||
1034 | if (ocfs2_is_hard_readonly(osb)) | ||
1035 | goto leave; | ||
1036 | |||
1037 | status = ocfs2_fill_local_node_info(osb); | ||
1038 | if (status < 0) { | ||
1039 | mlog_errno(status); | ||
1040 | goto leave; | ||
1041 | } | ||
1042 | |||
1043 | status = ocfs2_register_hb_callbacks(osb); | ||
1044 | if (status < 0) { | ||
1045 | mlog_errno(status); | ||
1046 | goto leave; | ||
1047 | } | ||
1048 | |||
1049 | status = ocfs2_dlm_init(osb); | ||
1050 | if (status < 0) { | ||
1051 | mlog_errno(status); | ||
1052 | goto leave; | ||
1053 | } | ||
1054 | |||
1055 | /* requires vote_thread to be running. */ | ||
1056 | status = ocfs2_register_net_handlers(osb); | ||
1057 | if (status < 0) { | ||
1058 | mlog_errno(status); | ||
1059 | goto leave; | ||
1060 | } | ||
1061 | |||
1062 | status = ocfs2_super_lock(osb, 1); | ||
1063 | if (status < 0) { | ||
1064 | mlog_errno(status); | ||
1065 | goto leave; | ||
1066 | } | ||
1067 | unlock_super = 1; | ||
1068 | |||
1069 | /* This will load up the node map and add ourselves to it. */ | ||
1070 | status = ocfs2_find_slot(osb); | ||
1071 | if (status < 0) { | ||
1072 | mlog_errno(status); | ||
1073 | goto leave; | ||
1074 | } | ||
1075 | |||
1076 | ocfs2_populate_mounted_map(osb); | ||
1077 | |||
1078 | /* load all node-local system inodes */ | ||
1079 | status = ocfs2_init_local_system_inodes(osb); | ||
1080 | if (status < 0) { | ||
1081 | mlog_errno(status); | ||
1082 | goto leave; | ||
1083 | } | ||
1084 | |||
1085 | status = ocfs2_check_volume(osb); | ||
1086 | if (status < 0) { | ||
1087 | mlog_errno(status); | ||
1088 | goto leave; | ||
1089 | } | ||
1090 | |||
1091 | status = ocfs2_truncate_log_init(osb); | ||
1092 | if (status < 0) { | ||
1093 | mlog_errno(status); | ||
1094 | goto leave; | ||
1095 | } | ||
1096 | |||
1097 | /* This should be sent *after* we recovered our journal as it | ||
1098 | * will cause other nodes to unmark us as needing | ||
1099 | * recovery. However, we need to send it *before* dropping the | ||
1100 | * super block lock as otherwise their recovery threads might | ||
1101 | * try to clean us up while we're live! */ | ||
1102 | status = ocfs2_request_mount_vote(osb); | ||
1103 | if (status < 0) | ||
1104 | mlog_errno(status); | ||
1105 | |||
1106 | leave: | ||
1107 | if (unlock_super) | ||
1108 | ocfs2_super_unlock(osb, 1); | ||
1109 | |||
1110 | mlog_exit(status); | ||
1111 | return status; | ||
1112 | } | ||
1113 | |||
1114 | /* we can't grab the goofy sem lock from inside wait_event, so we use | ||
1115 | * memory barriers to make sure that we'll see the null task before | ||
1116 | * being woken up */ | ||
1117 | static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) | ||
1118 | { | ||
1119 | mb(); | ||
1120 | return osb->recovery_thread_task != NULL; | ||
1121 | } | ||
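This pairs with a store side in the recovery thread, which lies outside this hunk: the exiting thread is expected to NULL the task pointer, issue a matching barrier, and only then wake recovery_event, so a woken waiter cannot observe a stale non-NULL pointer. A sketch of the presumed pairing:

	/* waiter side (see ocfs2_dismount_volume() below) */
	wait_event(osb->recovery_event,
		   !ocfs2_recovery_thread_running(osb));

	/* presumed exiting-thread side */
	osb->recovery_thread_task = NULL;
	mb();
	wake_up(&osb->recovery_event);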
1122 | |||
1123 | static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | ||
1124 | { | ||
1125 | int tmp; | ||
1126 | struct ocfs2_super *osb = NULL; | ||
1127 | |||
1128 | mlog_entry("(0x%p)\n", sb); | ||
1129 | |||
1130 | BUG_ON(!sb); | ||
1131 | osb = OCFS2_SB(sb); | ||
1132 | BUG_ON(!osb); | ||
1133 | |||
1134 | ocfs2_shutdown_local_alloc(osb); | ||
1135 | |||
1136 | ocfs2_truncate_log_shutdown(osb); | ||
1137 | |||
1138 | /* disable any new recovery threads and wait for any currently | ||
1139 | * running ones to exit. Do this before setting the vol_state. */ | ||
1140 | down(&osb->recovery_lock); | ||
1141 | osb->disable_recovery = 1; | ||
1142 | up(&osb->recovery_lock); | ||
1143 | wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); | ||
1144 | |||
1145 | /* At this point, we know that no more recovery threads can be | ||
1146 | * launched, so wait for any recovery completion work to | ||
1147 | * complete. */ | ||
1148 | flush_workqueue(ocfs2_wq); | ||
1149 | |||
1150 | ocfs2_journal_shutdown(osb); | ||
1151 | |||
1152 | ocfs2_sync_blockdev(sb); | ||
1153 | |||
1154 | /* No dlm means we've failed during mount, so skip all the | ||
1155 | * steps which depended on that to complete. */ | ||
1156 | if (osb->dlm) { | ||
1157 | tmp = ocfs2_super_lock(osb, 1); | ||
1158 | if (tmp < 0) { | ||
1159 | mlog_errno(tmp); | ||
1160 | return; | ||
1161 | } | ||
1162 | |||
1163 | tmp = ocfs2_request_umount_vote(osb); | ||
1164 | if (tmp < 0) | ||
1165 | mlog_errno(tmp); | ||
1166 | |||
1167 | if (osb->slot_num != OCFS2_INVALID_SLOT) | ||
1168 | ocfs2_put_slot(osb); | ||
1169 | |||
1170 | ocfs2_super_unlock(osb, 1); | ||
1171 | } | ||
1172 | |||
1173 | ocfs2_release_system_inodes(osb); | ||
1174 | |||
1175 | if (osb->dlm) { | ||
1176 | ocfs2_unregister_net_handlers(osb); | ||
1177 | |||
1178 | ocfs2_dlm_shutdown(osb); | ||
1179 | } | ||
1180 | |||
1181 | ocfs2_clear_hb_callbacks(osb); | ||
1182 | |||
1183 | debugfs_remove(osb->osb_debug_root); | ||
1184 | |||
1185 | if (!mnt_err) | ||
1186 | ocfs2_stop_heartbeat(osb); | ||
1187 | |||
1188 | atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); | ||
1189 | |||
1190 | printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n", | ||
1191 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num); | ||
1192 | |||
1193 | ocfs2_delete_osb(osb); | ||
1194 | kfree(osb); | ||
1195 | sb->s_dev = 0; | ||
1196 | sb->s_fs_info = NULL; | ||
1197 | } | ||
1198 | |||
1199 | static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid, | ||
1200 | unsigned uuid_bytes) | ||
1201 | { | ||
1202 | int i, ret; | ||
1203 | char *ptr; | ||
1204 | |||
1205 | BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN); | ||
1206 | |||
1207 | osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); | ||
1208 | if (osb->uuid_str == NULL) | ||
1209 | return -ENOMEM; | ||
1210 | |||
1211 | memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN); | ||
1212 | |||
1213 | for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) { | ||
1214 | /* print with null */ | ||
1215 | ret = snprintf(ptr, 3, "%02X", uuid[i]); | ||
1216 | if (ret != 2) /* drop super cleans up */ | ||
1217 | return -EINVAL; | ||
1218 | /* then only advance past the last char */ | ||
1219 | ptr += 2; | ||
1220 | } | ||
1221 | |||
1222 | return 0; | ||
1223 | } | ||
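Each raw byte becomes two uppercase hex digits: snprintf() writes the pair plus a NUL into the three bytes of remaining room, and the pointer advances only two, so each pair overwrites the previous terminator and the buffer (2 * OCFS2_VOL_UUID_LEN + 1 bytes) ends NUL-terminated. Illustratively, raw bytes beginning 0xde 0xad 0xbe 0xef render as a string beginning "DEADBEEF".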
1224 | |||
1225 | static int ocfs2_initialize_super(struct super_block *sb, | ||
1226 | struct buffer_head *bh, | ||
1227 | int sector_size) | ||
1228 | { | ||
1229 | int status = 0; | ||
1230 | int i; | ||
1231 | struct ocfs2_dinode *di = NULL; | ||
1232 | struct inode *inode = NULL; | ||
1233 | struct buffer_head *bitmap_bh = NULL; | ||
1234 | struct ocfs2_journal *journal; | ||
1235 | __le32 uuid_net_key; | ||
1236 | struct ocfs2_super *osb; | ||
1237 | |||
1238 | mlog_entry_void(); | ||
1239 | |||
1240 | osb = kcalloc(1, sizeof(struct ocfs2_super), GFP_KERNEL); | ||
1241 | if (!osb) { | ||
1242 | status = -ENOMEM; | ||
1243 | mlog_errno(status); | ||
1244 | goto bail; | ||
1245 | } | ||
1246 | |||
1247 | sb->s_fs_info = osb; | ||
1248 | sb->s_op = &ocfs2_sops; | ||
1249 | sb->s_export_op = &ocfs2_export_ops; | ||
1250 | sb->s_flags |= MS_NOATIME; | ||
1251 | /* this is needed to support O_LARGEFILE */ | ||
1252 | sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits); | ||
1253 | |||
1254 | osb->sb = sb; | ||
1255 | /* Save off for ocfs2_rw_direct */ | ||
1256 | osb->s_sectsize_bits = blksize_bits(sector_size); | ||
1257 | if (!osb->s_sectsize_bits) | ||
1258 | BUG(); | ||
1259 | |||
1260 | osb->net_response_ids = 0; | ||
1261 | spin_lock_init(&osb->net_response_lock); | ||
1262 | INIT_LIST_HEAD(&osb->net_response_list); | ||
1263 | |||
1264 | INIT_LIST_HEAD(&osb->osb_net_handlers); | ||
1265 | init_waitqueue_head(&osb->recovery_event); | ||
1266 | spin_lock_init(&osb->vote_task_lock); | ||
1267 | init_waitqueue_head(&osb->vote_event); | ||
1268 | osb->vote_work_sequence = 0; | ||
1269 | osb->vote_wake_sequence = 0; | ||
1270 | INIT_LIST_HEAD(&osb->blocked_lock_list); | ||
1271 | osb->blocked_lock_count = 0; | ||
1272 | INIT_LIST_HEAD(&osb->vote_list); | ||
1273 | spin_lock_init(&osb->osb_lock); | ||
1274 | |||
1275 | atomic_set(&osb->alloc_stats.moves, 0); | ||
1276 | atomic_set(&osb->alloc_stats.local_data, 0); | ||
1277 | atomic_set(&osb->alloc_stats.bitmap_data, 0); | ||
1278 | atomic_set(&osb->alloc_stats.bg_allocs, 0); | ||
1279 | atomic_set(&osb->alloc_stats.bg_extends, 0); | ||
1280 | |||
1281 | ocfs2_init_node_maps(osb); | ||
1282 | |||
1283 | snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", | ||
1284 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); | ||
1285 | |||
1286 | init_MUTEX(&osb->recovery_lock); | ||
1287 | |||
1288 | osb->disable_recovery = 0; | ||
1289 | osb->recovery_thread_task = NULL; | ||
1290 | |||
1291 | init_waitqueue_head(&osb->checkpoint_event); | ||
1292 | atomic_set(&osb->needs_checkpoint, 0); | ||
1293 | |||
1294 | osb->node_num = O2NM_INVALID_NODE_NUM; | ||
1295 | osb->slot_num = OCFS2_INVALID_SLOT; | ||
1296 | |||
1297 | osb->local_alloc_state = OCFS2_LA_UNUSED; | ||
1298 | osb->local_alloc_bh = NULL; | ||
1299 | |||
1300 | ocfs2_setup_hb_callbacks(osb); | ||
1301 | |||
1302 | init_waitqueue_head(&osb->osb_mount_event); | ||
1303 | |||
1304 | osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); | ||
1305 | if (!osb->vol_label) { | ||
1306 | mlog(ML_ERROR, "unable to alloc vol label\n"); | ||
1307 | status = -ENOMEM; | ||
1308 | goto bail; | ||
1309 | } | ||
1310 | |||
1311 | osb->uuid = kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL); | ||
1312 | if (!osb->uuid) { | ||
1313 | mlog(ML_ERROR, "unable to alloc uuid\n"); | ||
1314 | status = -ENOMEM; | ||
1315 | goto bail; | ||
1316 | } | ||
1317 | |||
1318 | di = (struct ocfs2_dinode *)bh->b_data; | ||
1319 | |||
1320 | osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); | ||
1321 | if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { | ||
1322 | mlog(ML_ERROR, "Invalid number of node slots (%u)\n", | ||
1323 | osb->max_slots); | ||
1324 | status = -EINVAL; | ||
1325 | goto bail; | ||
1326 | } | ||
1327 | mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots); | ||
1328 | |||
1329 | osb->s_feature_compat = | ||
1330 | le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); | ||
1331 | osb->s_feature_ro_compat = | ||
1332 | le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat); | ||
1333 | osb->s_feature_incompat = | ||
1334 | le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat); | ||
1335 | |||
1336 | if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) { | ||
1337 | mlog(ML_ERROR, "couldn't mount because of unsupported " | ||
1338 | "optional features (%x).\n", i); | ||
1339 | status = -EINVAL; | ||
1340 | goto bail; | ||
1341 | } | ||
1342 | if (!(osb->sb->s_flags & MS_RDONLY) && | ||
1343 | (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { | ||
1344 | mlog(ML_ERROR, "couldn't mount RDWR because of " | ||
1345 | "unsupported optional features (%x).\n", i); | ||
1346 | status = -EINVAL; | ||
1347 | goto bail; | ||
1348 | } | ||
1349 | |||
1350 | get_random_bytes(&osb->s_next_generation, sizeof(u32)); | ||
1351 | |||
1352 | /* FIXME | ||
1353 | * This should be done in ocfs2_journal_init(), but unknown | ||
1354 | * ordering issues will cause the filesystem to crash. | ||
1355 | * If anyone wants to figure out what part of the code | ||
1356 | * refers to osb->journal before ocfs2_journal_init() is run, | ||
1357 | * be my guest. | ||
1358 | */ | ||
1359 | /* initialize our journal structure */ | ||
1360 | |||
1361 | journal = kcalloc(1, sizeof(struct ocfs2_journal), GFP_KERNEL); | ||
1362 | if (!journal) { | ||
1363 | mlog(ML_ERROR, "unable to alloc journal\n"); | ||
1364 | status = -ENOMEM; | ||
1365 | goto bail; | ||
1366 | } | ||
1367 | osb->journal = journal; | ||
1368 | journal->j_osb = osb; | ||
1369 | |||
1370 | atomic_set(&journal->j_num_trans, 0); | ||
1371 | init_rwsem(&journal->j_trans_barrier); | ||
1372 | init_waitqueue_head(&journal->j_checkpointed); | ||
1373 | spin_lock_init(&journal->j_lock); | ||
1374 | journal->j_trans_id = (unsigned long) 1; | ||
1375 | INIT_LIST_HEAD(&journal->j_la_cleanups); | ||
1376 | INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery, osb); | ||
1377 | journal->j_state = OCFS2_JOURNAL_FREE; | ||
1378 | |||
1379 | /* get some pseudo constants for clustersize bits */ | ||
1380 | osb->s_clustersize_bits = | ||
1381 | le32_to_cpu(di->id2.i_super.s_clustersize_bits); | ||
1382 | osb->s_clustersize = 1 << osb->s_clustersize_bits; | ||
1383 | mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits); | ||
1384 | |||
1385 | if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || | ||
1386 | osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { | ||
1387 | mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", | ||
1388 | osb->s_clustersize); | ||
1389 | status = -EINVAL; | ||
1390 | goto bail; | ||
1391 | } | ||
1392 | |||
1393 | if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1) | ||
1394 | > (u32)~0UL) { | ||
1395 | mlog(ML_ERROR, "Volume might try to write to blocks beyond " | ||
1396 | "what jbd can address in 32 bits.\n"); | ||
1397 | status = -EINVAL; | ||
1398 | goto bail; | ||
1399 | } | ||
1400 | |||
1401 | if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, | ||
1402 | sizeof(di->id2.i_super.s_uuid))) { | ||
1403 | mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); | ||
1404 | status = -ENOMEM; | ||
1405 | goto bail; | ||
1406 | } | ||
1407 | |||
1408 | memcpy(&uuid_net_key, &osb->uuid[0], sizeof(uuid_net_key)); | ||
1409 | osb->net_key = le32_to_cpu(uuid_net_key); | ||
1410 | |||
1411 | strncpy(osb->vol_label, di->id2.i_super.s_label, OCFS2_MAX_VOL_LABEL_LEN - 1); | ||
1412 | osb->vol_label[OCFS2_MAX_VOL_LABEL_LEN - 1] = '\0'; | ||
1413 | osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); | ||
1414 | osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); | ||
1415 | osb->first_cluster_group_blkno = | ||
1416 | le64_to_cpu(di->id2.i_super.s_first_cluster_group); | ||
1417 | osb->fs_generation = le32_to_cpu(di->i_fs_generation); | ||
1418 | mlog(0, "vol_label: %s\n", osb->vol_label); | ||
1419 | mlog(0, "uuid: %s\n", osb->uuid_str); | ||
1420 | mlog(0, "root_blkno=%"MLFu64", system_dir_blkno=%"MLFu64"\n", | ||
1421 | osb->root_blkno, osb->system_dir_blkno); | ||
1422 | |||
1423 | osb->osb_dlm_debug = ocfs2_new_dlm_debug(); | ||
1424 | if (!osb->osb_dlm_debug) { | ||
1425 | status = -ENOMEM; | ||
1426 | mlog_errno(status); | ||
1427 | goto bail; | ||
1428 | } | ||
1429 | |||
1430 | atomic_set(&osb->vol_state, VOLUME_INIT); | ||
1431 | |||
1432 | /* load root, system_dir, and all global system inodes */ | ||
1433 | status = ocfs2_init_global_system_inodes(osb); | ||
1434 | if (status < 0) { | ||
1435 | mlog_errno(status); | ||
1436 | goto bail; | ||
1437 | } | ||
1438 | |||
1439 | /* | ||
1440 | * global bitmap | ||
1441 | */ | ||
1442 | inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, | ||
1443 | OCFS2_INVALID_SLOT); | ||
1444 | if (!inode) { | ||
1445 | status = -EINVAL; | ||
1446 | mlog_errno(status); | ||
1447 | goto bail; | ||
1448 | } | ||
1449 | |||
1450 | osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; | ||
1451 | |||
1452 | status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0, | ||
1453 | inode); | ||
1454 | iput(inode); | ||
1455 | if (status < 0) { | ||
1456 | mlog_errno(status); | ||
1457 | goto bail; | ||
1458 | } | ||
1459 | |||
1460 | di = (struct ocfs2_dinode *) bitmap_bh->b_data; | ||
1461 | osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg); | ||
1462 | osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total); | ||
1463 | brelse(bitmap_bh); | ||
1464 | mlog(0, "cluster bitmap inode: %"MLFu64", clusters per group: %u\n", | ||
1465 | osb->bitmap_blkno, osb->bitmap_cpg); | ||
1466 | |||
1467 | status = ocfs2_init_slot_info(osb); | ||
1468 | if (status < 0) { | ||
1469 | mlog_errno(status); | ||
1470 | goto bail; | ||
1471 | } | ||
1472 | |||
1473 | /* Link this osb onto the global linked list of all osb structures. */ | ||
1474 | /* The global linked list is maintained for the whole driver. */ | ||
1475 | spin_lock(&ocfs2_globals_lock); | ||
1476 | osb->osb_id = osb_id; | ||
1477 | if (osb_id < OCFS2_MAX_OSB_ID) | ||
1478 | osb_id++; | ||
1479 | else { | ||
1480 | mlog(ML_ERROR, "Too many volumes mounted\n"); | ||
1481 | status = -ENOMEM; | ||
1482 | } | ||
1483 | spin_unlock(&ocfs2_globals_lock); | ||
1484 | |||
1485 | bail: | ||
1486 | mlog_exit(status); | ||
1487 | return status; | ||
1488 | } | ||
1489 | |||
1490 | /* | ||
1491 | * will return: -EAGAIN if it is ok to keep searching for superblocks | ||
1492 | * -EINVAL if there is a bad superblock | ||
1493 | * 0 on success | ||
1494 | */ | ||
1495 | static int ocfs2_verify_volume(struct ocfs2_dinode *di, | ||
1496 | struct buffer_head *bh, | ||
1497 | u32 blksz) | ||
1498 | { | ||
1499 | int status = -EAGAIN; | ||
1500 | |||
1501 | mlog_entry_void(); | ||
1502 | |||
1503 | if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, | ||
1504 | strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { | ||
1505 | status = -EINVAL; | ||
1506 | if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { | ||
1507 | mlog(ML_ERROR, "found superblock with incorrect block " | ||
1508 | "size: found %u, should be %u\n", | ||
1509 | 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), | ||
1510 | blksz); | ||
1511 | } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != | ||
1512 | OCFS2_MAJOR_REV_LEVEL || | ||
1513 | le16_to_cpu(di->id2.i_super.s_minor_rev_level) != | ||
1514 | OCFS2_MINOR_REV_LEVEL) { | ||
1515 | mlog(ML_ERROR, "found superblock with bad version: " | ||
1516 | "found %u.%u, should be %u.%u\n", | ||
1517 | le16_to_cpu(di->id2.i_super.s_major_rev_level), | ||
1518 | le16_to_cpu(di->id2.i_super.s_minor_rev_level), | ||
1519 | OCFS2_MAJOR_REV_LEVEL, | ||
1520 | OCFS2_MINOR_REV_LEVEL); | ||
1521 | } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) { | ||
1522 | mlog(ML_ERROR, "bad block number on superblock: " | ||
1523 | "found %"MLFu64", should be %llu\n", | ||
1524 | le64_to_cpu(di->i_blkno), (unsigned long long)bh->b_blocknr); | ||
1525 | } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || | ||
1526 | le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { | ||
1527 | mlog(ML_ERROR, "bad cluster size found: %u\n", | ||
1528 | 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits)); | ||
1529 | } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) { | ||
1530 | mlog(ML_ERROR, "bad root_blkno: 0\n"); | ||
1531 | } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { | ||
1532 | mlog(ML_ERROR, "bad system_dir_blkno: 0\n"); | ||
1533 | } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) { | ||
1534 | mlog(ML_ERROR, | ||
1535 | "Superblock slots found greater than file system " | ||
1536 | "maximum: found %u, max %u\n", | ||
1537 | le16_to_cpu(di->id2.i_super.s_max_slots), | ||
1538 | OCFS2_MAX_SLOTS); | ||
1539 | } else { | ||
1540 | /* found it! */ | ||
1541 | status = 0; | ||
1542 | } | ||
1543 | } | ||
1544 | |||
1545 | mlog_exit(status); | ||
1546 | return status; | ||
1547 | } | ||
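
The three-way return convention documented above implies a probe loop in the caller: -EAGAIN means no signature at this block size, so keep trying; -EINVAL means a signature was found but the superblock is bad; 0 ends the search. A hedged sketch of such a loop, assuming a hypothetical read_super_bh() helper (the driver's real probe loop lives earlier in super.c, outside this excerpt):

	/* Probe each supported block size until ocfs2_verify_volume()
	 * either accepts a superblock (0) or rejects one it found (-EINVAL). */
	static int probe_superblock(struct super_block *sb)
	{
		int status = -EAGAIN;
		u32 blksz;

		for (blksz = 512; blksz <= 4096 && status == -EAGAIN; blksz <<= 1) {
			struct buffer_head *bh = read_super_bh(sb, blksz); /* assumed */

			if (!bh)
				continue;
			status = ocfs2_verify_volume((struct ocfs2_dinode *)bh->b_data,
						     bh, blksz);
			brelse(bh);
		}
		return (status == -EAGAIN) ? -EINVAL : status;
	}
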
1548 | |||
1549 | static int ocfs2_check_volume(struct ocfs2_super *osb) | ||
1550 | { | ||
1551 | int status = 0; | ||
1552 | int dirty; | ||
1553 | struct ocfs2_dinode *local_alloc = NULL; /* only used if we | ||
1554 | * recover | ||
1555 | * ourselves. */ | ||
1556 | |||
1557 | mlog_entry_void(); | ||
1558 | |||
1559 | /* Init our journal object. */ | ||
1560 | status = ocfs2_journal_init(osb->journal, &dirty); | ||
1561 | if (status < 0) { | ||
1562 | mlog(ML_ERROR, "Could not initialize journal!\n"); | ||
1563 | goto finally; | ||
1564 | } | ||
1565 | |||
1566 | /* If the journal was unmounted cleanly then we don't want to | ||
1567 | * recover anything. Otherwise, journal_load will do that | ||
1568 | * dirty work for us :) */ | ||
1569 | if (!dirty) { | ||
1570 | status = ocfs2_journal_wipe(osb->journal, 0); | ||
1571 | if (status < 0) { | ||
1572 | mlog_errno(status); | ||
1573 | goto finally; | ||
1574 | } | ||
1575 | } else { | ||
1576 | mlog(ML_NOTICE, "File system was not unmounted cleanly, " | ||
1577 | "recovering volume.\n"); | ||
1578 | } | ||
1579 | |||
1580 | /* will play back anything left in the journal. */ | ||
1581 | ocfs2_journal_load(osb->journal); | ||
1582 | |||
1583 | if (dirty) { | ||
1584 | /* recover my local alloc if we didn't unmount cleanly. */ | ||
1585 | status = ocfs2_begin_local_alloc_recovery(osb, | ||
1586 | osb->slot_num, | ||
1587 | &local_alloc); | ||
1588 | if (status < 0) { | ||
1589 | mlog_errno(status); | ||
1590 | goto finally; | ||
1591 | } | ||
1592 | /* we complete the recovery process after we've marked | ||
1593 | * ourselves as mounted. */ | ||
1594 | } | ||
1595 | |||
1596 | mlog(0, "Journal loaded.\n"); | ||
1597 | |||
1598 | status = ocfs2_load_local_alloc(osb); | ||
1599 | if (status < 0) { | ||
1600 | mlog_errno(status); | ||
1601 | goto finally; | ||
1602 | } | ||
1603 | |||
1604 | if (dirty) { | ||
1605 | /* Recovery will be completed after we've mounted the | ||
1606 | * rest of the volume. */ | ||
1607 | osb->dirty = 1; | ||
1608 | osb->local_alloc_copy = local_alloc; | ||
1609 | local_alloc = NULL; | ||
1610 | } | ||
1611 | |||
1612 | /* go through each journal, trylock it, and if it's marked | ||
1613 | * dirty, set the bit in the recovery map and launch a | ||
1614 | * recovery thread for it. */ | ||
1615 | status = ocfs2_mark_dead_nodes(osb); | ||
1616 | if (status < 0) | ||
1617 | mlog_errno(status); | ||
1618 | |||
1619 | finally: | ||
1620 | if (local_alloc) | ||
1621 | kfree(local_alloc); | ||
1622 | |||
1623 | mlog_exit(status); | ||
1624 | return status; | ||
1625 | } | ||
1626 | |||
1627 | /* | ||
1628 | * This routine is called from dismount or close whenever a dismount | ||
1629 | * of the volume is requested and the osb open count becomes 1. | ||
1630 | * It removes the osb from the global list and frees all the | ||
1631 | * initialized resources and file objects. | ||
1632 | */ | ||
1633 | static void ocfs2_delete_osb(struct ocfs2_super *osb) | ||
1634 | { | ||
1635 | mlog_entry_void(); | ||
1636 | |||
1637 | /* This function assumes that the caller has the main osb resource */ | ||
1638 | |||
1639 | if (osb->slot_info) | ||
1640 | ocfs2_free_slot_info(osb->slot_info); | ||
1641 | |||
1642 | /* FIXME | ||
1643 | * This belongs in journal shutdown, but because we have to | ||
1644 | * allocate osb->journal at the start of ocfs2_initialize_super(), | ||
1645 | * we free it here. | ||
1646 | */ | ||
1647 | kfree(osb->journal); | ||
1648 | if (osb->local_alloc_copy) | ||
1649 | kfree(osb->local_alloc_copy); | ||
1650 | kfree(osb->uuid_str); | ||
1651 | ocfs2_put_dlm_debug(osb->osb_dlm_debug); | ||
1652 | memset(osb, 0, sizeof(struct ocfs2_super)); | ||
1653 | |||
1654 | mlog_exit_void(); | ||
1655 | } | ||
1656 | |||
1657 | /* Put OCFS2 into a readonly state, or (if the user specifies it), | ||
1658 | * panic(). We do not support continue-on-error operation. */ | ||
1659 | static void ocfs2_handle_error(struct super_block *sb) | ||
1660 | { | ||
1661 | struct ocfs2_super *osb = OCFS2_SB(sb); | ||
1662 | |||
1663 | if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) | ||
1664 | panic("OCFS2: (device %s): panic forced after error\n", | ||
1665 | sb->s_id); | ||
1666 | |||
1667 | ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); | ||
1668 | |||
1669 | if (sb->s_flags & MS_RDONLY && | ||
1670 | (ocfs2_is_soft_readonly(osb) || | ||
1671 | ocfs2_is_hard_readonly(osb))) | ||
1672 | return; | ||
1673 | |||
1674 | printk(KERN_CRIT "File system is now read-only due to the potential " | ||
1675 | "of on-disk corruption. Please run fsck.ocfs2 once the file " | ||
1676 | "system is unmounted.\n"); | ||
1677 | sb->s_flags |= MS_RDONLY; | ||
1678 | ocfs2_set_ro_flag(osb, 0); | ||
1679 | } | ||
1680 | |||
1681 | static char error_buf[1024]; | ||
1682 | |||
1683 | void __ocfs2_error(struct super_block *sb, | ||
1684 | const char *function, | ||
1685 | const char *fmt, ...) | ||
1686 | { | ||
1687 | va_list args; | ||
1688 | |||
1689 | va_start(args, fmt); | ||
1690 | vsnprintf(error_buf, sizeof(error_buf), fmt, args); | ||
1691 | va_end(args); | ||
1692 | |||
1693 | /* Not using mlog here because we want to show the actual | ||
1694 | * function the error came from. */ | ||
1695 | printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", | ||
1696 | sb->s_id, function, error_buf); | ||
1697 | |||
1698 | ocfs2_handle_error(sb); | ||
1699 | } | ||
1700 | |||
1701 | /* Handle critical errors. This is intentionally more drastic than | ||
1702 | * ocfs2_handle_error, so we only use for things like journal errors, | ||
1703 | * etc. */ | ||
1704 | void __ocfs2_abort(struct super_block* sb, | ||
1705 | const char *function, | ||
1706 | const char *fmt, ...) | ||
1707 | { | ||
1708 | va_list args; | ||
1709 | |||
1710 | va_start(args, fmt); | ||
1711 | vsnprintf(error_buf, sizeof(error_buf), fmt, args); | ||
1712 | va_end(args); | ||
1713 | |||
1714 | printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", | ||
1715 | sb->s_id, function, error_buf); | ||
1716 | |||
1717 | /* We don't have the cluster support yet to go straight to | ||
1718 | * hard readonly in here. Until then, we want to keep | ||
1719 | * ocfs2_abort() so that we can at least mark critical | ||
1720 | * errors. | ||
1721 | * | ||
1722 | * TODO: This should abort the journal and alert other nodes | ||
1723 | * that our slot needs recovery. */ | ||
1724 | |||
1725 | /* Force a panic(). This stinks, but it's better than letting | ||
1726 | * things continue without having a proper hard readonly | ||
1727 | * here. */ | ||
1728 | OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; | ||
1729 | ocfs2_handle_error(sb); | ||
1730 | } | ||
1731 | |||
1732 | module_init(ocfs2_init); | ||
1733 | module_exit(ocfs2_exit); | ||
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h new file mode 100644 index 000000000000..c564177dfbdc --- /dev/null +++ b/fs/ocfs2/super.h | |||
@@ -0,0 +1,44 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * super.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_SUPER_H | ||
27 | #define OCFS2_SUPER_H | ||
28 | |||
29 | extern struct workqueue_struct *ocfs2_wq; | ||
30 | |||
31 | int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, | ||
32 | int node_num); | ||
33 | |||
34 | void __ocfs2_error(struct super_block *sb, | ||
35 | const char *function, | ||
36 | const char *fmt, ...); | ||
37 | #define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) | ||
38 | |||
39 | void __ocfs2_abort(struct super_block *sb, | ||
40 | const char *function, | ||
41 | const char *fmt, ...); | ||
42 | #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) | ||
43 | |||
44 | #endif /* OCFS2_SUPER_H */ | ||
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c new file mode 100644 index 000000000000..f6986bd79e75 --- /dev/null +++ b/fs/ocfs2/symlink.c | |||
@@ -0,0 +1,180 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * linux/cluster/ssi/cfs/symlink.c | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License as | ||
8 | * published by the Free Software Foundation; either version 2 of | ||
9 | * the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE | ||
14 | * or NON INFRINGEMENT. See the GNU General Public License for more | ||
15 | * details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
20 | * | ||
21 | * Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net | ||
22 | * | ||
23 | * Copyright (C) 1992 Rick Sladkey | ||
24 | * | ||
25 | * Optimization changes Copyright (C) 1994 Florian La Roche | ||
26 | * | ||
27 | * Jun 7 1999, cache symlink lookups in the page cache. -DaveM | ||
28 | * | ||
29 | * Portions Copyright (C) 2001 Compaq Computer Corporation | ||
30 | * | ||
31 | * ocfs2 symlink handling code. | ||
32 | * | ||
33 | * Copyright (C) 2004, 2005 Oracle. | ||
34 | * | ||
35 | */ | ||
36 | |||
37 | #include <linux/fs.h> | ||
38 | #include <linux/types.h> | ||
39 | #include <linux/slab.h> | ||
40 | #include <linux/pagemap.h> | ||
41 | #include <linux/utsname.h> | ||
42 | |||
43 | #define MLOG_MASK_PREFIX ML_NAMEI | ||
44 | #include <cluster/masklog.h> | ||
45 | |||
46 | #include "ocfs2.h" | ||
47 | |||
48 | #include "alloc.h" | ||
49 | #include "file.h" | ||
50 | #include "inode.h" | ||
51 | #include "journal.h" | ||
52 | #include "symlink.h" | ||
53 | |||
54 | #include "buffer_head_io.h" | ||
55 | |||
56 | static char *ocfs2_page_getlink(struct dentry * dentry, | ||
57 | struct page **ppage); | ||
58 | static char *ocfs2_fast_symlink_getlink(struct inode *inode, | ||
59 | struct buffer_head **bh); | ||
60 | |||
61 | /* get the link contents into pagecache */ | ||
62 | static char *ocfs2_page_getlink(struct dentry * dentry, | ||
63 | struct page **ppage) | ||
64 | { | ||
65 | struct page * page; | ||
66 | struct address_space *mapping = dentry->d_inode->i_mapping; | ||
67 | page = read_cache_page(mapping, 0, | ||
68 | (filler_t *)mapping->a_ops->readpage, NULL); | ||
69 | if (IS_ERR(page)) | ||
70 | goto sync_fail; | ||
71 | wait_on_page_locked(page); | ||
72 | if (!PageUptodate(page)) | ||
73 | goto async_fail; | ||
74 | *ppage = page; | ||
75 | return kmap(page); | ||
76 | |||
77 | async_fail: | ||
78 | page_cache_release(page); | ||
79 | return ERR_PTR(-EIO); | ||
80 | |||
81 | sync_fail: | ||
82 | return (char*)page; | ||
83 | } | ||
84 | |||
85 | static char *ocfs2_fast_symlink_getlink(struct inode *inode, | ||
86 | struct buffer_head **bh) | ||
87 | { | ||
88 | int status; | ||
89 | char *link = NULL; | ||
90 | struct ocfs2_dinode *fe; | ||
91 | |||
92 | mlog_entry_void(); | ||
93 | |||
94 | status = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
95 | OCFS2_I(inode)->ip_blkno, | ||
96 | bh, | ||
97 | OCFS2_BH_CACHED, | ||
98 | inode); | ||
99 | if (status < 0) { | ||
100 | mlog_errno(status); | ||
101 | link = ERR_PTR(status); | ||
102 | goto bail; | ||
103 | } | ||
104 | |||
105 | fe = (struct ocfs2_dinode *) (*bh)->b_data; | ||
106 | link = (char *) fe->id2.i_symlink; | ||
107 | bail: | ||
108 | mlog_exit(status); | ||
109 | |||
110 | return link; | ||
111 | } | ||
112 | |||
113 | static int ocfs2_readlink(struct dentry *dentry, | ||
114 | char __user *buffer, | ||
115 | int buflen) | ||
116 | { | ||
117 | int ret; | ||
118 | char *link; | ||
119 | struct buffer_head *bh = NULL; | ||
120 | struct inode *inode = dentry->d_inode; | ||
121 | |||
122 | mlog_entry_void(); | ||
123 | |||
124 | link = ocfs2_fast_symlink_getlink(inode, &bh); | ||
125 | if (IS_ERR(link)) { | ||
126 | ret = PTR_ERR(link); | ||
127 | goto out; | ||
128 | } | ||
129 | |||
130 | ret = vfs_readlink(dentry, buffer, buflen, link); | ||
131 | |||
132 | brelse(bh); | ||
133 | out: | ||
134 | mlog_exit(ret); | ||
135 | return ret; | ||
136 | } | ||
137 | |||
138 | static void *ocfs2_follow_link(struct dentry *dentry, | ||
139 | struct nameidata *nd) | ||
140 | { | ||
141 | int status; | ||
142 | char *link; | ||
143 | struct inode *inode = dentry->d_inode; | ||
144 | struct page *page = NULL; | ||
145 | struct buffer_head *bh = NULL; | ||
146 | |||
147 | if (ocfs2_inode_is_fast_symlink(inode)) | ||
148 | link = ocfs2_fast_symlink_getlink(inode, &bh); | ||
149 | else | ||
150 | link = ocfs2_page_getlink(dentry, &page); | ||
151 | if (IS_ERR(link)) { | ||
152 | status = PTR_ERR(link); | ||
153 | mlog_errno(status); | ||
154 | goto bail; | ||
155 | } | ||
156 | |||
157 | status = vfs_follow_link(nd, link); | ||
158 | if (status) | ||
159 | mlog_errno(status); | ||
160 | bail: | ||
161 | if (page) { | ||
162 | kunmap(page); | ||
163 | page_cache_release(page); | ||
164 | } | ||
165 | if (bh) | ||
166 | brelse(bh); | ||
167 | |||
168 | return ERR_PTR(status); | ||
169 | } | ||
170 | |||
171 | struct inode_operations ocfs2_symlink_inode_operations = { | ||
172 | .readlink = page_readlink, | ||
173 | .follow_link = ocfs2_follow_link, | ||
174 | .getattr = ocfs2_getattr, | ||
175 | }; | ||
176 | struct inode_operations ocfs2_fast_symlink_inode_operations = { | ||
177 | .readlink = ocfs2_readlink, | ||
178 | .follow_link = ocfs2_follow_link, | ||
179 | .getattr = ocfs2_getattr, | ||
180 | }; | ||
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h new file mode 100644 index 000000000000..1ea9e4d9e9eb --- /dev/null +++ b/fs/ocfs2/symlink.h | |||
@@ -0,0 +1,42 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * symlink.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_SYMLINK_H | ||
27 | #define OCFS2_SYMLINK_H | ||
28 | |||
29 | extern struct inode_operations ocfs2_symlink_inode_operations; | ||
30 | extern struct inode_operations ocfs2_fast_symlink_inode_operations; | ||
31 | |||
32 | /* | ||
33 | * Test whether an inode is a fast symlink. | ||
34 | */ | ||
35 | static inline int ocfs2_inode_is_fast_symlink(struct inode *inode) | ||
36 | { | ||
37 | return (S_ISLNK(inode->i_mode) && | ||
38 | inode->i_blocks == 0); | ||
39 | } | ||
40 | |||
41 | |||
42 | #endif /* OCFS2_SYMLINK_H */ | ||
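
symlink.c above exports two operation tables because short link targets are stored directly in the inode (a "fast" symlink, recognizable by i_blocks == 0) while longer targets live in a data block read through the page cache. A sketch of how an inode-initialization path might dispatch between them, using the predicate above (hypothetical call site; the actual assignment happens in OCFS2's inode setup code):

	/* Pick the symlink operations table for a new in-core inode. */
	static void init_symlink_ops(struct inode *inode)
	{
		if (ocfs2_inode_is_fast_symlink(inode))
			inode->i_op = &ocfs2_fast_symlink_inode_operations;
		else
			inode->i_op = &ocfs2_symlink_inode_operations;
	}
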
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c new file mode 100644 index 000000000000..600a8bc5b541 --- /dev/null +++ b/fs/ocfs2/sysfile.c | |||
@@ -0,0 +1,131 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * sysfile.c | ||
5 | * | ||
6 | * Initialize, read, write, etc. system files. | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/fs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/highmem.h> | ||
30 | |||
31 | #include "ocfs2.h" | ||
32 | |||
33 | #define MLOG_MASK_PREFIX ML_INODE | ||
34 | #include <cluster/masklog.h> | ||
35 | |||
36 | #include "alloc.h" | ||
37 | #include "dir.h" | ||
38 | #include "inode.h" | ||
39 | #include "journal.h" | ||
40 | #include "sysfile.h" | ||
41 | |||
42 | #include "buffer_head_io.h" | ||
43 | |||
44 | static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, | ||
45 | int type, | ||
46 | u32 slot); | ||
47 | |||
48 | static inline int is_global_system_inode(int type); | ||
49 | static inline int is_in_system_inode_array(struct ocfs2_super *osb, | ||
50 | int type, | ||
51 | u32 slot); | ||
52 | |||
53 | static inline int is_global_system_inode(int type) | ||
54 | { | ||
55 | return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE && | ||
56 | type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; | ||
57 | } | ||
58 | |||
59 | static inline int is_in_system_inode_array(struct ocfs2_super *osb, | ||
60 | int type, | ||
61 | u32 slot) | ||
62 | { | ||
63 | return slot == osb->slot_num || is_global_system_inode(type); | ||
64 | } | ||
65 | |||
66 | struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, | ||
67 | int type, | ||
68 | u32 slot) | ||
69 | { | ||
70 | struct inode *inode = NULL; | ||
71 | struct inode **arr = NULL; | ||
72 | |||
73 | /* avoid the lookup if cached in local system file array */ | ||
74 | if (is_in_system_inode_array(osb, type, slot)) | ||
75 | arr = &(osb->system_inodes[type]); | ||
76 | |||
77 | if (arr && ((inode = *arr) != NULL)) { | ||
78 | /* get a ref in addition to the array ref */ | ||
79 | inode = igrab(inode); | ||
80 | if (!inode) | ||
81 | BUG(); | ||
82 | |||
83 | return inode; | ||
84 | } | ||
85 | |||
86 | /* this gets one ref thru iget */ | ||
87 | inode = _ocfs2_get_system_file_inode(osb, type, slot); | ||
88 | |||
89 | /* add one more if putting into array for first time */ | ||
90 | if (arr && inode) { | ||
91 | *arr = igrab(inode); | ||
92 | if (!*arr) | ||
93 | BUG(); | ||
94 | } | ||
95 | return inode; | ||
96 | } | ||
97 | |||
98 | static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, | ||
99 | int type, | ||
100 | u32 slot) | ||
101 | { | ||
102 | char namebuf[40]; | ||
103 | struct inode *inode = NULL; | ||
104 | u64 blkno; | ||
105 | struct buffer_head *dirent_bh = NULL; | ||
106 | struct ocfs2_dir_entry *de = NULL; | ||
107 | int status = 0; | ||
108 | |||
109 | ocfs2_sprintf_system_inode_name(namebuf, | ||
110 | sizeof(namebuf), | ||
111 | type, slot); | ||
112 | |||
113 | status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf), | ||
114 | &blkno, osb->sys_root_inode, | ||
115 | &dirent_bh, &de); | ||
116 | if (status < 0) { | ||
117 | goto bail; | ||
118 | } | ||
119 | |||
120 | inode = ocfs2_iget(osb, blkno); | ||
121 | if (IS_ERR(inode)) { | ||
122 | mlog_errno(PTR_ERR(inode)); | ||
123 | inode = NULL; | ||
124 | goto bail; | ||
125 | } | ||
126 | bail: | ||
127 | if (dirent_bh) | ||
128 | brelse(dirent_bh); | ||
129 | return inode; | ||
130 | } | ||
131 | |||
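
Callers treat the returned inode as an ordinary reference, as ocfs2_initialize_super() does above for the global bitmap: look the system inode up by type and slot, use it, then iput() the reference. A brief usage sketch (hypothetical wrapper around calls that appear in this patch):

	/* Look up the global bitmap system inode and report its block number. */
	static int read_global_bitmap_blkno(struct ocfs2_super *osb, u64 *blkno)
	{
		struct inode *inode;

		inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
						    OCFS2_INVALID_SLOT);
		if (!inode)
			return -EINVAL;

		*blkno = OCFS2_I(inode)->ip_blkno;
		iput(inode);
		return 0;
	}
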
diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h new file mode 100644 index 000000000000..cc9ea661ffc1 --- /dev/null +++ b/fs/ocfs2/sysfile.h | |||
@@ -0,0 +1,33 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * sysfile.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_SYSFILE_H | ||
27 | #define OCFS2_SYSFILE_H | ||
28 | |||
29 | struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb, | ||
30 | int type, | ||
31 | u32 slot); | ||
32 | |||
33 | #endif /* OCFS2_SYSFILE_H */ | ||
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c new file mode 100644 index 000000000000..3a0458fd3e1b --- /dev/null +++ b/fs/ocfs2/uptodate.c | |||
@@ -0,0 +1,544 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * uptodate.c | ||
5 | * | ||
6 | * Tracking the up-to-date-ness of a local buffer_head with respect to | ||
7 | * the cluster. | ||
8 | * | ||
9 | * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public | ||
13 | * License as published by the Free Software Foundation; either | ||
14 | * version 2 of the License, or (at your option) any later version. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
19 | * General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public | ||
22 | * License along with this program; if not, write to the | ||
23 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
24 | * Boston, MA 021110-1307, USA. | ||
25 | * | ||
26 | * Standard buffer head caching flags (uptodate, etc) are insufficient | ||
27 | * in a clustered environment - a buffer may be marked up to date on | ||
28 | * our local node but could have been modified by another cluster | ||
29 | * member. As a result an additional (and performant) caching scheme | ||
30 | * is required. A further requirement is that we consume as little | ||
31 | * memory as possible - we never pin buffer_head structures in order | ||
32 | * to cache them. | ||
33 | * | ||
34 | * We track the existence of up to date buffers on the inodes which | ||
35 | * are associated with them. Because we don't want to pin | ||
36 | * buffer_heads, this is only a (strong) hint and several other checks | ||
37 | * are made in the I/O path to ensure that we don't use a stale or | ||
38 | * invalid buffer without going to disk: | ||
39 | * - buffer_jbd is used liberally - if a bh is in the journal on | ||
40 | * this node then it *must* be up to date. | ||
41 | * - the standard buffer_uptodate() macro is used to detect buffers | ||
42 | * which may be invalid (even if we have an up to date tracking | ||
43 | * item for them) | ||
44 | * | ||
45 | * For a full understanding of how this code works together, one | ||
46 | * should read the callers in dlmglue.c, the I/O functions in | ||
47 | * buffer_head_io.c and ocfs2_journal_access in journal.c | ||
48 | */ | ||
49 | |||
50 | #include <linux/fs.h> | ||
51 | #include <linux/types.h> | ||
52 | #include <linux/slab.h> | ||
53 | #include <linux/highmem.h> | ||
54 | #include <linux/buffer_head.h> | ||
55 | #include <linux/rbtree.h> | ||
56 | #include <linux/jbd.h> | ||
57 | |||
58 | #define MLOG_MASK_PREFIX ML_UPTODATE | ||
59 | |||
60 | #include <cluster/masklog.h> | ||
61 | |||
62 | #include "ocfs2.h" | ||
63 | |||
64 | #include "inode.h" | ||
65 | #include "uptodate.h" | ||
66 | |||
67 | struct ocfs2_meta_cache_item { | ||
68 | struct rb_node c_node; | ||
69 | sector_t c_block; | ||
70 | }; | ||
71 | |||
72 | static kmem_cache_t *ocfs2_uptodate_cachep = NULL; | ||
73 | |||
74 | void ocfs2_metadata_cache_init(struct inode *inode) | ||
75 | { | ||
76 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
77 | struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; | ||
78 | |||
79 | oi->ip_flags |= OCFS2_INODE_CACHE_INLINE; | ||
80 | ci->ci_num_cached = 0; | ||
81 | } | ||
82 | |||
83 | /* No lock taken here as 'root' is not expected to be visible to other | ||
84 | * processes. */ | ||
85 | static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) | ||
86 | { | ||
87 | unsigned int purged = 0; | ||
88 | struct rb_node *node; | ||
89 | struct ocfs2_meta_cache_item *item; | ||
90 | |||
91 | while ((node = rb_last(root)) != NULL) { | ||
92 | item = rb_entry(node, struct ocfs2_meta_cache_item, c_node); | ||
93 | |||
94 | mlog(0, "Purge item %llu\n", | ||
95 | (unsigned long long) item->c_block); | ||
96 | |||
97 | rb_erase(&item->c_node, root); | ||
98 | kmem_cache_free(ocfs2_uptodate_cachep, item); | ||
99 | |||
100 | purged++; | ||
101 | } | ||
102 | return purged; | ||
103 | } | ||
104 | |||
105 | /* Called from the locking code and from ocfs2_clear_inode. Dump | ||
106 | * the cache for a given inode. | ||
107 | * | ||
108 | * This function is a few lines longer than necessary due to some | ||
109 | * accounting done here, but I think it's worth tracking down those | ||
110 | * bugs sooner -- Mark */ | ||
111 | void ocfs2_metadata_cache_purge(struct inode *inode) | ||
112 | { | ||
113 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
114 | unsigned int tree, to_purge, purged; | ||
115 | struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; | ||
116 | struct rb_root root = RB_ROOT; | ||
117 | |||
118 | spin_lock(&oi->ip_lock); | ||
119 | tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE); | ||
120 | to_purge = ci->ci_num_cached; | ||
121 | |||
122 | mlog(0, "Purge %u %s items from Inode %"MLFu64"\n", to_purge, | ||
123 | tree ? "tree" : "array", oi->ip_blkno); | ||
124 | |||
125 | /* If we're a tree, save off the root so that we can safely | ||
126 | * initialize the cache. We do the work to free tree members | ||
127 | * without the spinlock. */ | ||
128 | if (tree) | ||
129 | root = ci->ci_cache.ci_tree; | ||
130 | |||
131 | ocfs2_metadata_cache_init(inode); | ||
132 | spin_unlock(&oi->ip_lock); | ||
133 | |||
134 | purged = ocfs2_purge_copied_metadata_tree(&root); | ||
135 | /* If possible, track the number wiped so that we can more | ||
136 | * easily detect counting errors. Unfortunately, this is only | ||
137 | * meaningful for trees. */ | ||
138 | if (tree && purged != to_purge) | ||
139 | mlog(ML_ERROR, "Inode %"MLFu64", count = %u, purged = %u\n", | ||
140 | oi->ip_blkno, to_purge, purged); | ||
141 | } | ||
142 | |||
143 | /* Returns the index in the cache array, -1 if not found. | ||
144 | * Requires ip_lock. */ | ||
145 | static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci, | ||
146 | sector_t item) | ||
147 | { | ||
148 | int i; | ||
149 | |||
150 | for (i = 0; i < ci->ci_num_cached; i++) { | ||
151 | if (item == ci->ci_cache.ci_array[i]) | ||
152 | return i; | ||
153 | } | ||
154 | |||
155 | return -1; | ||
156 | } | ||
157 | |||
158 | /* Returns the cache item if found, otherwise NULL. | ||
159 | * Requires ip_lock. */ | ||
160 | static struct ocfs2_meta_cache_item * | ||
161 | ocfs2_search_cache_tree(struct ocfs2_caching_info *ci, | ||
162 | sector_t block) | ||
163 | { | ||
164 | struct rb_node * n = ci->ci_cache.ci_tree.rb_node; | ||
165 | struct ocfs2_meta_cache_item *item = NULL; | ||
166 | |||
167 | while (n) { | ||
168 | item = rb_entry(n, struct ocfs2_meta_cache_item, c_node); | ||
169 | |||
170 | if (block < item->c_block) | ||
171 | n = n->rb_left; | ||
172 | else if (block > item->c_block) | ||
173 | n = n->rb_right; | ||
174 | else | ||
175 | return item; | ||
176 | } | ||
177 | |||
178 | return NULL; | ||
179 | } | ||
180 | |||
181 | static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi, | ||
182 | struct buffer_head *bh) | ||
183 | { | ||
184 | int index = -1; | ||
185 | struct ocfs2_meta_cache_item *item = NULL; | ||
186 | |||
187 | spin_lock(&oi->ip_lock); | ||
188 | |||
189 | mlog(0, "Inode %"MLFu64", query block %llu (inline = %u)\n", | ||
190 | oi->ip_blkno, (unsigned long long) bh->b_blocknr, | ||
191 | !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)); | ||
192 | |||
193 | if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) | ||
194 | index = ocfs2_search_cache_array(&oi->ip_metadata_cache, | ||
195 | bh->b_blocknr); | ||
196 | else | ||
197 | item = ocfs2_search_cache_tree(&oi->ip_metadata_cache, | ||
198 | bh->b_blocknr); | ||
199 | |||
200 | spin_unlock(&oi->ip_lock); | ||
201 | |||
202 | mlog(0, "index = %d, item = %p\n", index, item); | ||
203 | |||
204 | return (index != -1) || (item != NULL); | ||
205 | } | ||
206 | |||
207 | /* Warning: even if it returns true, this does *not* guarantee that | ||
208 | * the block is stored in our inode metadata cache. */ | ||
209 | int ocfs2_buffer_uptodate(struct inode *inode, | ||
210 | struct buffer_head *bh) | ||
211 | { | ||
212 | /* Doesn't matter if the bh is in our cache or not -- if it's | ||
213 | * not marked uptodate then we know it can't have correct | ||
214 | * data. */ | ||
215 | if (!buffer_uptodate(bh)) | ||
216 | return 0; | ||
217 | |||
218 | /* OCFS2 does not allow multiple nodes to be changing the same | ||
219 | * block at the same time. */ | ||
220 | if (buffer_jbd(bh)) | ||
221 | return 1; | ||
222 | |||
223 | /* Ok, locally the buffer is marked as up to date, now search | ||
224 | * our cache to see if we can trust that. */ | ||
225 | return ocfs2_buffer_cached(OCFS2_I(inode), bh); | ||
226 | } | ||
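
The checks above are ordered cheapest-first: local uptodate state, then journal membership, then the per-inode cluster cache. A read path consumes the result by going to disk only when all three fail; a hedged sketch follows (hypothetical helper using 2.6.15-era buffer APIs; the driver's real I/O functions are in buffer_head_io.c):

	/* Read a metadata block only if it cannot be trusted. */
	static int read_block_checked(struct inode *inode, struct buffer_head *bh)
	{
		if (ocfs2_buffer_uptodate(inode, bh))
			return 0;		/* cached contents are valid */

		lock_buffer(bh);
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(READ, bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			return -EIO;

		ocfs2_set_buffer_uptodate(inode, bh);	/* record for the cluster */
		return 0;
	}
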
227 | |||
228 | /* Requires ip_lock */ | ||
229 | static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, | ||
230 | sector_t block) | ||
231 | { | ||
232 | BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY); | ||
233 | |||
234 | mlog(0, "block %llu takes position %u\n", (unsigned long long) block, | ||
235 | ci->ci_num_cached); | ||
236 | |||
237 | ci->ci_cache.ci_array[ci->ci_num_cached] = block; | ||
238 | ci->ci_num_cached++; | ||
239 | } | ||
240 | |||
241 | /* By now the caller should have checked that the item does *not* | ||
242 | * exist in the tree. | ||
243 | * Requires ip_lock. */ | ||
244 | static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci, | ||
245 | struct ocfs2_meta_cache_item *new) | ||
246 | { | ||
247 | sector_t block = new->c_block; | ||
248 | struct rb_node *parent = NULL; | ||
249 | struct rb_node **p = &ci->ci_cache.ci_tree.rb_node; | ||
250 | struct ocfs2_meta_cache_item *tmp; | ||
251 | |||
252 | mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block, | ||
253 | ci->ci_num_cached); | ||
254 | |||
255 | while(*p) { | ||
256 | parent = *p; | ||
257 | |||
258 | tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node); | ||
259 | |||
260 | if (block < tmp->c_block) | ||
261 | p = &(*p)->rb_left; | ||
262 | else if (block > tmp->c_block) | ||
263 | p = &(*p)->rb_right; | ||
264 | else { | ||
265 | /* This should never happen! */ | ||
266 | mlog(ML_ERROR, "Duplicate block %llu cached!\n", | ||
267 | (unsigned long long) block); | ||
268 | BUG(); | ||
269 | } | ||
270 | } | ||
271 | |||
272 | rb_link_node(&new->c_node, parent, p); | ||
273 | rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree); | ||
274 | ci->ci_num_cached++; | ||
275 | } | ||
276 | |||
277 | static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi, | ||
278 | struct ocfs2_caching_info *ci) | ||
279 | { | ||
280 | assert_spin_locked(&oi->ip_lock); | ||
281 | |||
282 | return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) && | ||
283 | (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY); | ||
284 | } | ||
285 | |||
286 | /* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the | ||
287 | * pointers in tree after we use them - this allows the caller to | ||
288 | * detect when to free in case of error. */ | ||
289 | static void ocfs2_expand_cache(struct ocfs2_inode_info *oi, | ||
290 | struct ocfs2_meta_cache_item **tree) | ||
291 | { | ||
292 | int i; | ||
293 | struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; | ||
294 | |||
295 | mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY, | ||
296 | "Inode %"MLFu64", num cached = %u, should be %u\n", | ||
297 | oi->ip_blkno, ci->ci_num_cached, | ||
298 | OCFS2_INODE_MAX_CACHE_ARRAY); | ||
299 | mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), | ||
300 | "Inode %"MLFu64" not marked as inline anymore!\n", | ||
301 | oi->ip_blkno); | ||
302 | assert_spin_locked(&oi->ip_lock); | ||
303 | |||
304 | /* Be careful to initialize the tree members *first* because | ||
305 | * once the ci_tree is used, the array is junk... */ | ||
306 | for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) | ||
307 | tree[i]->c_block = ci->ci_cache.ci_array[i]; | ||
308 | |||
309 | oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE; | ||
310 | ci->ci_cache.ci_tree = RB_ROOT; | ||
311 | /* this will be set again by __ocfs2_insert_cache_tree */ | ||
312 | ci->ci_num_cached = 0; | ||
313 | |||
314 | for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { | ||
315 | __ocfs2_insert_cache_tree(ci, tree[i]); | ||
316 | tree[i] = NULL; | ||
317 | } | ||
318 | |||
319 | mlog(0, "Expanded %"MLFu64" to a tree cache: flags 0x%x, num = %u\n", | ||
320 | oi->ip_blkno, oi->ip_flags, ci->ci_num_cached); | ||
321 | } | ||
322 | |||
323 | /* Slow path function - memory allocation is necessary. See the | ||
324 | * comment above ocfs2_set_buffer_uptodate for more information. */ | ||
325 | static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, | ||
326 | sector_t block, | ||
327 | int expand_tree) | ||
328 | { | ||
329 | int i; | ||
330 | struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; | ||
331 | struct ocfs2_meta_cache_item *new = NULL; | ||
332 | struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] = | ||
333 | { NULL, }; | ||
334 | |||
335 | mlog(0, "Inode %"MLFu64", block %llu, expand = %d\n", | ||
336 | oi->ip_blkno, (unsigned long long) block, expand_tree); | ||
337 | |||
338 | new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL); | ||
339 | if (!new) { | ||
340 | mlog_errno(-ENOMEM); | ||
341 | return; | ||
342 | } | ||
343 | new->c_block = block; | ||
344 | |||
345 | if (expand_tree) { | ||
346 | /* Do *not* allocate an array here - the removal code | ||
347 | * has no way of tracking that. */ | ||
348 | for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { | ||
349 | tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep, | ||
350 | GFP_KERNEL); | ||
351 | if (!tree[i]) { | ||
352 | mlog_errno(-ENOMEM); | ||
353 | goto out_free; | ||
354 | } | ||
355 | |||
356 | /* These are initialized in ocfs2_expand_cache! */ | ||
357 | } | ||
358 | } | ||
359 | |||
360 | spin_lock(&oi->ip_lock); | ||
361 | if (ocfs2_insert_can_use_array(oi, ci)) { | ||
362 | mlog(0, "Someone cleared the tree underneath us\n"); | ||
363 | /* Ok, items were removed from the cache in between | ||
364 | * locks. Detect this and revert back to the fast path */ | ||
365 | ocfs2_append_cache_array(ci, block); | ||
366 | spin_unlock(&oi->ip_lock); | ||
367 | goto out_free; | ||
368 | } | ||
369 | |||
370 | if (expand_tree) | ||
371 | ocfs2_expand_cache(oi, tree); | ||
372 | |||
373 | __ocfs2_insert_cache_tree(ci, new); | ||
374 | spin_unlock(&oi->ip_lock); | ||
375 | |||
376 | new = NULL; | ||
377 | out_free: | ||
378 | if (new) | ||
379 | kmem_cache_free(ocfs2_uptodate_cachep, new); | ||
380 | |||
381 | /* If these were used, then ocfs2_expand_cache re-set them to | ||
382 | * NULL for us. */ | ||
383 | if (tree[0]) { | ||
384 | for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) | ||
385 | if (tree[i]) | ||
386 | kmem_cache_free(ocfs2_uptodate_cachep, | ||
387 | tree[i]); | ||
388 | } | ||
389 | } | ||
390 | |||
391 | /* Item insertion is guarded by ip_io_sem, so the insertion path takes | ||
392 | * advantage of this by not rechecking for a duplicate insert during | ||
393 | * the slow case. Additionally, if the cache needs to be bumped up to | ||
394 | * a tree, the code will not recheck after acquiring the lock -- | ||
395 | * multiple paths cannot be expanding to a tree at the same time. | ||
396 | * | ||
397 | * The slow path takes into account that items can be removed | ||
398 | * (including the whole tree wiped and reset) when this process is out | ||
399 | * allocating memory. In those cases, it reverts to the fast | ||
400 | * path. | ||
401 | * | ||
402 | * Note that this function may actually fail to insert the block if | ||
403 | * memory cannot be allocated. This is not fatal however (but may | ||
404 | * result in a performance penalty) */ | ||
405 | void ocfs2_set_buffer_uptodate(struct inode *inode, | ||
406 | struct buffer_head *bh) | ||
407 | { | ||
408 | int expand; | ||
409 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
410 | struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; | ||
411 | |||
412 | /* The block may very well exist in our cache already, so avoid | ||
413 | * doing any more work in that case. */ | ||
414 | if (ocfs2_buffer_cached(oi, bh)) | ||
415 | return; | ||
416 | |||
417 | mlog(0, "Inode %"MLFu64", inserting block %llu\n", oi->ip_blkno, | ||
418 | (unsigned long long) bh->b_blocknr); | ||
419 | |||
420 | /* No need to recheck under spinlock - insertion is guarded by | ||
421 | * ip_io_sem */ | ||
422 | spin_lock(&oi->ip_lock); | ||
423 | if (ocfs2_insert_can_use_array(oi, ci)) { | ||
424 | /* Fast case - it's an array and there's a free | ||
425 | * spot. */ | ||
426 | ocfs2_append_cache_array(ci, bh->b_blocknr); | ||
427 | spin_unlock(&oi->ip_lock); | ||
428 | return; | ||
429 | } | ||
430 | |||
431 | expand = 0; | ||
432 | if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { | ||
433 | /* We need to bump things up to a tree. */ | ||
434 | expand = 1; | ||
435 | } | ||
436 | spin_unlock(&oi->ip_lock); | ||
437 | |||
438 | __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand); | ||
439 | } | ||
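
The fast/slow split above follows a general discipline: allocate with the spinlock dropped (GFP_KERNEL may sleep), then re-test the fast-path condition once the lock is retaken, because a concurrent purge can shrink the cache back to array form in the meantime. A self-contained userspace analogue, with a mutex and a list standing in for ip_lock and the rb-tree (all names illustrative, not OCFS2 symbols):

	#include <pthread.h>
	#include <stdlib.h>

	#define INLINE_MAX 8		/* analogue of OCFS2_INODE_MAX_CACHE_ARRAY */

	struct overflow {
		long block;
		struct overflow *next;
	};

	struct cache {
		pthread_mutex_t lock;
		int nr;				/* items in the inline array */
		long array[INLINE_MAX];		/* fast path */
		struct overflow *list;		/* slow path */
	};

	static void cache_insert(struct cache *c, long block)
	{
		struct overflow *new = malloc(sizeof(*new));	/* unlocked alloc */

		if (!new)
			return;			/* non-fatal: we only lose a hint */
		new->block = block;

		pthread_mutex_lock(&c->lock);
		if (c->nr < INLINE_MAX) {	/* recheck under the lock */
			c->array[c->nr++] = block;
			pthread_mutex_unlock(&c->lock);
			free(new);		/* fast path won; discard */
			return;
		}
		new->next = c->list;		/* slow path: link the new item */
		c->list = new;
		pthread_mutex_unlock(&c->lock);
	}
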
440 | |||
441 | /* Called against a newly allocated buffer. Most likely nobody should | ||
442 | * be able to read this sort of metadata while it's still being | ||
443 | * allocated, but this is careful to take ip_io_sem anyway. */ | ||
444 | void ocfs2_set_new_buffer_uptodate(struct inode *inode, | ||
445 | struct buffer_head *bh) | ||
446 | { | ||
447 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
448 | |||
449 | /* This should definitely *not* exist in our cache */ | ||
450 | BUG_ON(ocfs2_buffer_cached(oi, bh)); | ||
451 | |||
452 | set_buffer_uptodate(bh); | ||
453 | |||
454 | down(&oi->ip_io_sem); | ||
455 | ocfs2_set_buffer_uptodate(inode, bh); | ||
456 | up(&oi->ip_io_sem); | ||
457 | } | ||
458 | |||
459 | /* Requires ip_lock. */ | ||
460 | static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci, | ||
461 | int index) | ||
462 | { | ||
463 | sector_t *array = ci->ci_cache.ci_array; | ||
464 | int bytes; | ||
465 | |||
466 | BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY); | ||
467 | BUG_ON(index >= ci->ci_num_cached); | ||
468 | BUG_ON(!ci->ci_num_cached); | ||
469 | |||
470 | mlog(0, "remove index %d (num_cached = %u)\n", index, | ||
471 | ci->ci_num_cached); | ||
472 | |||
473 | ci->ci_num_cached--; | ||
474 | |||
475 | /* don't need to copy if the array is now empty, or if we | ||
476 | * removed at the tail */ | ||
477 | if (ci->ci_num_cached && index < ci->ci_num_cached) { | ||
478 | bytes = sizeof(sector_t) * (ci->ci_num_cached - index); | ||
479 | memmove(&array[index], &array[index + 1], bytes); | ||
480 | } | ||
481 | } | ||
482 | |||
483 | /* Requires ip_lock. */ | ||
484 | static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, | ||
485 | struct ocfs2_meta_cache_item *item) | ||
486 | { | ||
487 | mlog(0, "remove block %llu from tree\n", | ||
488 | (unsigned long long) item->c_block); | ||
489 | |||
490 | rb_erase(&item->c_node, &ci->ci_cache.ci_tree); | ||
491 | ci->ci_num_cached--; | ||
492 | } | ||
493 | |||
494 | /* Called when we remove a chunk of metadata from an inode. We don't | ||
495 | * bother reverting things to an inlined array in the case of a remove | ||
496 | * which moves us back under the limit. */ | ||
497 | void ocfs2_remove_from_cache(struct inode *inode, | ||
498 | struct buffer_head *bh) | ||
499 | { | ||
500 | int index; | ||
501 | sector_t block = bh->b_blocknr; | ||
502 | struct ocfs2_meta_cache_item *item = NULL; | ||
503 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
504 | struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; | ||
505 | |||
506 | spin_lock(&oi->ip_lock); | ||
507 | mlog(0, "Inode %"MLFu64", remove %llu, items = %u, array = %u\n", | ||
508 | oi->ip_blkno, (unsigned long long) block, ci->ci_num_cached, | ||
509 | !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)); | ||
510 | |||
511 | if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { | ||
512 | index = ocfs2_search_cache_array(ci, block); | ||
513 | if (index != -1) | ||
514 | ocfs2_remove_metadata_array(ci, index); | ||
515 | } else { | ||
516 | item = ocfs2_search_cache_tree(ci, block); | ||
517 | if (item) | ||
518 | ocfs2_remove_metadata_tree(ci, item); | ||
519 | } | ||
520 | spin_unlock(&oi->ip_lock); | ||
521 | |||
522 | if (item) | ||
523 | kmem_cache_free(ocfs2_uptodate_cachep, item); | ||
524 | } | ||
525 | |||
526 | int __init init_ocfs2_uptodate_cache(void) | ||
527 | { | ||
528 | ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", | ||
529 | sizeof(struct ocfs2_meta_cache_item), | ||
530 | 0, SLAB_HWCACHE_ALIGN, NULL, NULL); | ||
531 | if (!ocfs2_uptodate_cachep) | ||
532 | return -ENOMEM; | ||
533 | |||
534 | mlog(0, "%u inlined cache items per inode.\n", | ||
535 | OCFS2_INODE_MAX_CACHE_ARRAY); | ||
536 | |||
537 | return 0; | ||
538 | } | ||
539 | |||
540 | void __exit exit_ocfs2_uptodate_cache(void) | ||
541 | { | ||
542 | if (ocfs2_uptodate_cachep) | ||
543 | kmem_cache_destroy(ocfs2_uptodate_cachep); | ||
544 | } | ||
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h new file mode 100644 index 000000000000..e5aacdf4eabf --- /dev/null +++ b/fs/ocfs2/uptodate.h | |||
@@ -0,0 +1,44 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * uptodate.h | ||
5 | * | ||
6 | * Cluster uptodate tracking | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_UPTODATE_H | ||
27 | #define OCFS2_UPTODATE_H | ||
28 | |||
29 | int __init init_ocfs2_uptodate_cache(void); | ||
30 | void __exit exit_ocfs2_uptodate_cache(void); | ||
31 | |||
32 | void ocfs2_metadata_cache_init(struct inode *inode); | ||
33 | void ocfs2_metadata_cache_purge(struct inode *inode); | ||
34 | |||
35 | int ocfs2_buffer_uptodate(struct inode *inode, | ||
36 | struct buffer_head *bh); | ||
37 | void ocfs2_set_buffer_uptodate(struct inode *inode, | ||
38 | struct buffer_head *bh); | ||
39 | void ocfs2_set_new_buffer_uptodate(struct inode *inode, | ||
40 | struct buffer_head *bh); | ||
41 | void ocfs2_remove_from_cache(struct inode *inode, | ||
42 | struct buffer_head *bh); | ||
43 | |||
44 | #endif /* OCFS2_UPTODATE_H */ | ||
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c new file mode 100644 index 000000000000..5405ce121c99 --- /dev/null +++ b/fs/ocfs2/ver.c | |||
@@ -0,0 +1,43 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ver.c | ||
5 | * | ||
6 | * version string | ||
7 | * | ||
8 | * Copyright (C) 2002, 2005 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/string.h> | ||
28 | #include <linux/kernel.h> | ||
29 | |||
30 | #include "ver.h" | ||
31 | |||
32 | #define OCFS2_BUILD_VERSION "1.3.3" | ||
33 | |||
34 | #define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION | ||
35 | |||
36 | void ocfs2_print_version(void) | ||
37 | { | ||
38 | printk(KERN_INFO "%s\n", VERSION_STR); | ||
39 | } | ||
40 | |||
41 | MODULE_DESCRIPTION(VERSION_STR); | ||
42 | |||
43 | MODULE_VERSION(OCFS2_BUILD_VERSION); | ||
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h new file mode 100644 index 000000000000..d7395cb91d2f --- /dev/null +++ b/fs/ocfs2/ver.h | |||
@@ -0,0 +1,31 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ver.h | ||
5 | * | ||
6 | * Function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #ifndef OCFS2_VER_H | ||
27 | #define OCFS2_VER_H | ||
28 | |||
29 | void ocfs2_print_version(void); | ||
30 | |||
31 | #endif /* OCFS2_VER_H */ | ||
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c new file mode 100644 index 000000000000..021978e0576b --- /dev/null +++ b/fs/ocfs2/vote.c | |||
@@ -0,0 +1,1202 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * vote.c | ||
5 | * | ||
6 | * Inter-node vote request and response handling | ||
7 | * | ||
8 | * Copyright (C) 2003, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | #include <linux/types.h> | ||
27 | #include <linux/slab.h> | ||
28 | #include <linux/highmem.h> | ||
29 | #include <linux/smp_lock.h> | ||
30 | #include <linux/kthread.h> | ||
31 | |||
32 | #include <cluster/heartbeat.h> | ||
33 | #include <cluster/nodemanager.h> | ||
34 | #include <cluster/tcp.h> | ||
35 | |||
36 | #include <dlm/dlmapi.h> | ||
37 | |||
38 | #define MLOG_MASK_PREFIX ML_VOTE | ||
39 | #include <cluster/masklog.h> | ||
40 | |||
41 | #include "ocfs2.h" | ||
42 | |||
43 | #include "alloc.h" | ||
44 | #include "dlmglue.h" | ||
45 | #include "extent_map.h" | ||
46 | #include "heartbeat.h" | ||
47 | #include "inode.h" | ||
48 | #include "journal.h" | ||
49 | #include "slot_map.h" | ||
50 | #include "vote.h" | ||
51 | |||
52 | #include "buffer_head_io.h" | ||
53 | |||
54 | #define OCFS2_MESSAGE_TYPE_VOTE (0x1) | ||
55 | #define OCFS2_MESSAGE_TYPE_RESPONSE (0x2) | ||
56 | struct ocfs2_msg_hdr | ||
57 | { | ||
58 | __be32 h_response_id; /* used to lookup message handle on sending | ||
59 | * node. */ | ||
60 | __be32 h_request; | ||
61 | __be64 h_blkno; | ||
62 | __be32 h_generation; | ||
63 | __be32 h_node_num; /* node sending this particular message. */ | ||
64 | }; | ||
65 | |||
66 | /* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this | ||
67 | * for the network. */ | ||
68 | #define OCFS2_VOTE_FILENAME_LEN 256 | ||
69 | struct ocfs2_vote_msg | ||
70 | { | ||
71 | struct ocfs2_msg_hdr v_hdr; | ||
72 | union { | ||
73 | __be32 v_generic1; | ||
74 | __be32 v_orphaned_slot; /* Used during delete votes */ | ||
75 | __be32 v_nlink; /* Used during unlink votes */ | ||
76 | } md1; /* Message type dependent 1 */ | ||
77 | __be32 v_unlink_namelen; | ||
78 | __be64 v_unlink_parent; | ||
79 | u8 v_unlink_dirent[OCFS2_VOTE_FILENAME_LEN]; | ||
80 | }; | ||
81 | |||
82 | /* Responses are given these values to maintain backwards | ||
83 | * compatibility with older ocfs2 versions */ | ||
84 | #define OCFS2_RESPONSE_OK (0) | ||
85 | #define OCFS2_RESPONSE_BUSY (-16) | ||
86 | #define OCFS2_RESPONSE_BAD_MSG (-22) | ||
87 | |||
88 | struct ocfs2_response_msg | ||
89 | { | ||
90 | struct ocfs2_msg_hdr r_hdr; | ||
91 | __be32 r_response; | ||
92 | __be32 r_orphaned_slot; | ||
93 | }; | ||
94 | |||
95 | struct ocfs2_vote_work { | ||
96 | struct list_head w_list; | ||
97 | struct ocfs2_vote_msg w_msg; | ||
98 | }; | ||
99 | |||
100 | enum ocfs2_vote_request { | ||
101 | OCFS2_VOTE_REQ_INVALID = 0, | ||
102 | OCFS2_VOTE_REQ_DELETE, | ||
103 | OCFS2_VOTE_REQ_UNLINK, | ||
104 | OCFS2_VOTE_REQ_RENAME, | ||
105 | OCFS2_VOTE_REQ_MOUNT, | ||
106 | OCFS2_VOTE_REQ_UMOUNT, | ||
107 | OCFS2_VOTE_REQ_LAST | ||
108 | }; | ||
109 | |||
110 | static inline int ocfs2_is_valid_vote_request(int request) | ||
111 | { | ||
112 | return OCFS2_VOTE_REQ_INVALID < request && | ||
113 | request < OCFS2_VOTE_REQ_LAST; | ||
114 | } | ||
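
Every multi-byte field in these message structures is declared __be32 or __be64: values are converted to big-endian before they go on the wire and back to host order on receipt, never stored in host order in a message. A minimal sketch of the convention (value illustrative, not from this patch):

	/* Sketch only: the sender encodes, the receiver decodes. */
	struct ocfs2_msg_hdr hdr;
	enum ocfs2_vote_request request;

	hdr.h_request = cpu_to_be32(OCFS2_VOTE_REQ_MOUNT);	/* sender */
	request = be32_to_cpu(hdr.h_request);			/* receiver */
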
115 | |||
116 | typedef void (*ocfs2_net_response_callback)(void *priv, | ||
117 | struct ocfs2_response_msg *resp); | ||
118 | struct ocfs2_net_response_cb { | ||
119 | ocfs2_net_response_callback rc_cb; | ||
120 | void *rc_priv; | ||
121 | }; | ||
122 | |||
123 | struct ocfs2_net_wait_ctxt { | ||
124 | struct list_head n_list; | ||
125 | u32 n_response_id; | ||
126 | wait_queue_head_t n_event; | ||
127 | struct ocfs2_node_map n_node_map; | ||
128 | int n_response; /* an aggregate response. 0 if | ||
129 | * all nodes are go, < 0 on any | ||
130 | * negative response from any | ||
131 | * node or network error. */ | ||
132 | struct ocfs2_net_response_cb *n_callback; | ||
133 | }; | ||
134 | |||
135 | static void ocfs2_process_mount_request(struct ocfs2_super *osb, | ||
136 | unsigned int node_num) | ||
137 | { | ||
138 | mlog(0, "MOUNT vote from node %u\n", node_num); | ||
139 | /* The other node only sends us this message when it holds an EX | ||
140 | * lock on the superblock, so our recovery threads (if any have | ||
141 | * been launched) are waiting on it. */ | ||
142 | ocfs2_recovery_map_clear(osb, node_num); | ||
143 | ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num); | ||
144 | |||
145 | /* We clear the umount map here because a node may have been | ||
146 | * previously mounted, safely unmounted but never stopped | ||
147 | * heartbeating - in which case we'd have a stale entry. */ | ||
148 | ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); | ||
149 | } | ||
150 | |||
151 | static void ocfs2_process_umount_request(struct ocfs2_super *osb, | ||
152 | unsigned int node_num) | ||
153 | { | ||
154 | mlog(0, "UMOUNT vote from node %u\n", node_num); | ||
155 | ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num); | ||
156 | ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); | ||
157 | } | ||
158 | |||
159 | void ocfs2_mark_inode_remotely_deleted(struct inode *inode) | ||
160 | { | ||
161 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
162 | |||
163 | assert_spin_locked(&oi->ip_lock); | ||
164 | /* We set the SKIP_DELETE flag on the inode so we don't try to | ||
165 | * delete it in delete_inode ourselves, thus avoiding | ||
166 | * unnecessary lock pinging. If the other node failed to wipe | ||
167 | * the inode as a result of a crash, then recovery will pick | ||
168 | * up the slack. */ | ||
169 | oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE; | ||
170 | } | ||
171 | |||
172 | static int ocfs2_process_delete_request(struct inode *inode, | ||
173 | int *orphaned_slot) | ||
174 | { | ||
175 | int response = OCFS2_RESPONSE_BUSY; | ||
176 | |||
177 | mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n", | ||
178 | inode->i_ino, inode->i_nlink, *orphaned_slot); | ||
179 | |||
180 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
181 | |||
182 | /* Whatever our vote response is, we want to make sure that | ||
183 | * the orphaned slot is recorded properly on this node *and* | ||
184 | * on the requesting node. Technically, if the requesting node | ||
185 | * did not know which slot the inode is orphaned in but we | ||
186 | * respond with BUSY it doesn't actually need the orphaned | ||
187 | * slot, but it doesn't hurt to do it here anyway. */ | ||
188 | if ((*orphaned_slot) != OCFS2_INVALID_SLOT) { | ||
189 | mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != | ||
190 | OCFS2_INVALID_SLOT && | ||
191 | OCFS2_I(inode)->ip_orphaned_slot != | ||
192 | (*orphaned_slot), | ||
193 | "Inode %"MLFu64": This node thinks it's " | ||
194 | "orphaned in slot %d, messaged it's in %d\n", | ||
195 | OCFS2_I(inode)->ip_blkno, | ||
196 | OCFS2_I(inode)->ip_orphaned_slot, | ||
197 | *orphaned_slot); | ||
198 | |||
199 | mlog(0, "Setting orphaned slot for inode %"MLFu64" to %d\n", | ||
200 | OCFS2_I(inode)->ip_blkno, *orphaned_slot); | ||
201 | |||
202 | OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot; | ||
203 | } else { | ||
204 | mlog(0, "Sending back orphaned slot %d for inode %"MLFu64"\n", | ||
205 | OCFS2_I(inode)->ip_orphaned_slot, | ||
206 | OCFS2_I(inode)->ip_blkno); | ||
207 | |||
208 | *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; | ||
209 | } | ||
210 | |||
211 | /* vote no if the file is still open. */ | ||
212 | if (OCFS2_I(inode)->ip_open_count) { | ||
213 | mlog(0, "open count = %u\n", | ||
214 | OCFS2_I(inode)->ip_open_count); | ||
215 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
216 | goto done; | ||
217 | } | ||
218 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
219 | |||
220 | /* directories are a bit ugly... What if someone is sitting in | ||
221 | * it? We want to make sure the inode is removed completely as | ||
222 | * a result of the iput in process_vote. */ | ||
223 | if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) { | ||
224 | mlog(0, "i_count = %u\n", atomic_read(&inode->i_count)); | ||
225 | goto done; | ||
226 | } | ||
227 | |||
228 | if (filemap_fdatawrite(inode->i_mapping)) { | ||
229 | mlog(ML_ERROR, "Could not sync inode %"MLFu64" for delete!\n", | ||
230 | OCFS2_I(inode)->ip_blkno); | ||
231 | goto done; | ||
232 | } | ||
233 | sync_mapping_buffers(inode->i_mapping); | ||
234 | truncate_inode_pages(inode->i_mapping, 0); | ||
235 | ocfs2_extent_map_trunc(inode, 0); | ||
236 | |||
237 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
238 | /* double check open count - someone might have raced this | ||
239 | * thread into ocfs2_file_open while we were writing out | ||
240 | * data. If we're to allow a wipe of this inode now, we *must* | ||
241 | * hold the spinlock until we've marked it. */ | ||
242 | if (OCFS2_I(inode)->ip_open_count) { | ||
243 | mlog(0, "Raced to wipe! open count = %u\n", | ||
244 | OCFS2_I(inode)->ip_open_count); | ||
245 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
246 | goto done; | ||
247 | } | ||
248 | |||
249 | /* Mark the inode as being wiped from disk. */ | ||
250 | ocfs2_mark_inode_remotely_deleted(inode); | ||
251 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
252 | |||
253 | /* Not sure this is necessary anymore. */ | ||
254 | d_prune_aliases(inode); | ||
255 | |||
256 | /* If we get here, then we're voting 'yes', so commit the | ||
257 | * delete on our side. */ | ||
258 | response = OCFS2_RESPONSE_OK; | ||
259 | done: | ||
260 | return response; | ||
261 | } | ||
262 | |||
263 | static int ocfs2_match_dentry(struct dentry *dentry, | ||
264 | u64 parent_blkno, | ||
265 | unsigned int namelen, | ||
266 | const char *name) | ||
267 | { | ||
268 | struct inode *parent; | ||
269 | |||
270 | if (!dentry->d_parent) { | ||
271 | mlog(0, "Detached from parent.\n"); | ||
272 | return 0; | ||
273 | } | ||
274 | |||
275 | parent = dentry->d_parent->d_inode; | ||
276 | /* Negative parent dentry? */ | ||
277 | if (!parent) | ||
278 | return 0; | ||
279 | |||
280 | /* Name is in a different directory. */ | ||
281 | if (OCFS2_I(parent)->ip_blkno != parent_blkno) | ||
282 | return 0; | ||
283 | |||
284 | if (dentry->d_name.len != namelen) | ||
285 | return 0; | ||
286 | |||
287 | /* comparison above guarantees this is safe. */ | ||
288 | if (memcmp(dentry->d_name.name, name, namelen)) | ||
289 | return 0; | ||
290 | |||
291 | return 1; | ||
292 | } | ||
293 | |||
294 | static void ocfs2_process_dentry_request(struct inode *inode, | ||
295 | int rename, | ||
296 | unsigned int new_nlink, | ||
297 | u64 parent_blkno, | ||
298 | unsigned int namelen, | ||
299 | const char *name) | ||
300 | { | ||
301 | struct dentry *dentry = NULL; | ||
302 | struct list_head *p; | ||
303 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
304 | |||
305 | mlog(0, "parent %"MLFu64", namelen = %u, name = %.*s\n", parent_blkno, | ||
306 | namelen, namelen, name); | ||
307 | |||
308 | spin_lock(&dcache_lock); | ||
309 | |||
310 | /* Another node is removing this name from the system. It is | ||
311 | * up to us to find the corresponding dentry and if it exists, | ||
312 | * unhash it from the dcache. */ | ||
313 | list_for_each(p, &inode->i_dentry) { | ||
314 | dentry = list_entry(p, struct dentry, d_alias); | ||
315 | |||
316 | if (ocfs2_match_dentry(dentry, parent_blkno, namelen, name)) { | ||
317 | mlog(0, "dentry found: %.*s\n", | ||
318 | dentry->d_name.len, dentry->d_name.name); | ||
319 | |||
320 | dget_locked(dentry); | ||
321 | break; | ||
322 | } | ||
323 | |||
324 | dentry = NULL; | ||
325 | } | ||
326 | |||
327 | spin_unlock(&dcache_lock); | ||
328 | |||
329 | if (dentry) { | ||
330 | d_delete(dentry); | ||
331 | dput(dentry); | ||
332 | } | ||
333 | |||
334 | /* rename votes don't send link counts */ | ||
335 | if (!rename) { | ||
336 | mlog(0, "new_nlink = %u\n", new_nlink); | ||
337 | |||
338 | /* We don't have the proper locks here to directly | ||
339 | * change i_nlink and besides, the vote is sent | ||
340 | * *before* the operation so it may have failed on the | ||
341 | * other node. This passes a hint to ocfs2_drop_inode | ||
342 | * to force ocfs2_delete_inode, who will take the | ||
343 | * proper cluster locks to sort things out. */ | ||
344 | if (new_nlink == 0) { | ||
345 | spin_lock(&oi->ip_lock); | ||
346 | oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; | ||
347 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
348 | } | ||
349 | } | ||
350 | } | ||
351 | |||
352 | static void ocfs2_process_vote(struct ocfs2_super *osb, | ||
353 | struct ocfs2_vote_msg *msg) | ||
354 | { | ||
355 | int net_status, vote_response; | ||
356 | int orphaned_slot = 0; | ||
357 | int rename = 0; | ||
358 | unsigned int node_num, generation, new_nlink, namelen; | ||
359 | u64 blkno, parent_blkno; | ||
360 | enum ocfs2_vote_request request; | ||
361 | struct inode *inode = NULL; | ||
362 | struct ocfs2_msg_hdr *hdr = &msg->v_hdr; | ||
363 | struct ocfs2_response_msg response; | ||
364 | |||
365 | /* decode the network mumbo jumbo into local variables. */ | ||
366 | request = be32_to_cpu(hdr->h_request); | ||
367 | blkno = be64_to_cpu(hdr->h_blkno); | ||
368 | generation = be32_to_cpu(hdr->h_generation); | ||
369 | node_num = be32_to_cpu(hdr->h_node_num); | ||
370 | if (request == OCFS2_VOTE_REQ_DELETE) | ||
371 | orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot); | ||
372 | |||
373 | mlog(0, "processing vote: request = %u, blkno = %"MLFu64", " | ||
374 | "generation = %u, node_num = %u, priv1 = %u\n", request, | ||
375 | blkno, generation, node_num, be32_to_cpu(msg->md1.v_generic1)); | ||
376 | |||
377 | if (!ocfs2_is_valid_vote_request(request)) { | ||
378 | mlog(ML_ERROR, "Invalid vote request %d from node %u\n", | ||
379 | request, node_num); | ||
380 | vote_response = OCFS2_RESPONSE_BAD_MSG; | ||
381 | goto respond; | ||
382 | } | ||
383 | |||
384 | vote_response = OCFS2_RESPONSE_OK; | ||
385 | |||
386 | switch (request) { | ||
387 | case OCFS2_VOTE_REQ_UMOUNT: | ||
388 | ocfs2_process_umount_request(osb, node_num); | ||
389 | goto respond; | ||
390 | case OCFS2_VOTE_REQ_MOUNT: | ||
391 | ocfs2_process_mount_request(osb, node_num); | ||
392 | goto respond; | ||
393 | default: | ||
394 | /* avoids a gcc warning */ | ||
395 | break; | ||
396 | } | ||
397 | |||
398 | /* We cannot process the remaining message types before we're | ||
399 | * fully mounted. It's perfectly safe however to send a 'yes' | ||
400 | * response as we can't possibly have any of the state they're | ||
401 | * asking us to modify yet. */ | ||
402 | if (atomic_read(&osb->vol_state) == VOLUME_INIT) | ||
403 | goto respond; | ||
404 | |||
405 | /* If we get here, then the request is against an inode. */ | ||
406 | inode = ocfs2_ilookup_for_vote(osb, blkno, | ||
407 | request == OCFS2_VOTE_REQ_DELETE); | ||
408 | |||
409 | /* Not finding the inode is perfectly valid - it means we're | ||
410 | * not interested in what the other node is about to do to it | ||
411 | * so in those cases we automatically respond with an | ||
412 | * affirmative. Cluster locking ensures that we won't race | ||
413 | * interest in the inode with this vote request. */ | ||
414 | if (!inode) | ||
415 | goto respond; | ||
416 | |||
417 | /* Check generation values. It's possible for us to get a | ||
418 | * request against a stale inode. If so then we proceed as if | ||
419 | * we had not found an inode in the first place. */ | ||
420 | if (inode->i_generation != generation) { | ||
421 | mlog(0, "generation passed %u != inode generation = %u, " | ||
422 | "ip_flags = %x, ip_blkno = %"MLFu64", msg %"MLFu64", " | ||
423 | "i_count = %u, message type = %u\n", | ||
424 | generation, inode->i_generation, OCFS2_I(inode)->ip_flags, | ||
425 | OCFS2_I(inode)->ip_blkno, blkno, | ||
426 | atomic_read(&inode->i_count), request); | ||
427 | iput(inode); | ||
428 | inode = NULL; | ||
429 | goto respond; | ||
430 | } | ||
431 | |||
432 | switch (request) { | ||
433 | case OCFS2_VOTE_REQ_DELETE: | ||
434 | vote_response = ocfs2_process_delete_request(inode, | ||
435 | &orphaned_slot); | ||
436 | break; | ||
437 | case OCFS2_VOTE_REQ_RENAME: | ||
438 | rename = 1; | ||
439 | /* fall through */ | ||
440 | case OCFS2_VOTE_REQ_UNLINK: | ||
441 | parent_blkno = be64_to_cpu(msg->v_unlink_parent); | ||
442 | namelen = be32_to_cpu(msg->v_unlink_namelen); | ||
443 | /* new_nlink will be ignored in case of a rename vote */ | ||
444 | new_nlink = be32_to_cpu(msg->md1.v_nlink); | ||
445 | ocfs2_process_dentry_request(inode, rename, new_nlink, | ||
446 | parent_blkno, namelen, | ||
447 | msg->v_unlink_dirent); | ||
448 | break; | ||
449 | default: | ||
450 | mlog(ML_ERROR, "node %u, invalid request: %u\n", | ||
451 | node_num, request); | ||
452 | vote_response = OCFS2_RESPONSE_BAD_MSG; | ||
453 | } | ||
454 | |||
455 | respond: | ||
456 | /* Response structure is small so we just put it on the stack | ||
457 | * and stuff it inline. */ | ||
458 | memset(&response, 0, sizeof(struct ocfs2_response_msg)); | ||
459 | response.r_hdr.h_response_id = hdr->h_response_id; | ||
460 | response.r_hdr.h_blkno = hdr->h_blkno; | ||
461 | response.r_hdr.h_generation = hdr->h_generation; | ||
462 | response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); | ||
463 | response.r_response = cpu_to_be32(vote_response); | ||
464 | response.r_orphaned_slot = cpu_to_be32(orphaned_slot); | ||
465 | |||
466 | net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, | ||
467 | osb->net_key, | ||
468 | &response, | ||
469 | sizeof(struct ocfs2_response_msg), | ||
470 | node_num, | ||
471 | NULL); | ||
472 | /* We still want to error print for ENOPROTOOPT here. The | ||
473 | * sending node shouldn't have unregistered its net handler | ||
474 | * without sending an unmount vote first */ | ||
475 | if (net_status < 0 | ||
476 | && net_status != -ETIMEDOUT | ||
477 | && net_status != -ENOTCONN) | ||
478 | mlog(ML_ERROR, "message to node %u fails with error %d!\n", | ||
479 | node_num, net_status); | ||
480 | |||
481 | if (inode) | ||
482 | iput(inode); | ||
483 | } | ||
484 | |||
485 | static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) | ||
486 | { | ||
487 | unsigned long processed; | ||
488 | struct ocfs2_lock_res *lockres; | ||
489 | struct ocfs2_vote_work *work; | ||
490 | |||
491 | mlog_entry_void(); | ||
492 | |||
493 | spin_lock(&osb->vote_task_lock); | ||
494 | /* grab this early so we know to try again if a state change and | ||
495 | * wake happens part-way through our work */ | ||
496 | osb->vote_work_sequence = osb->vote_wake_sequence; | ||
497 | |||
498 | processed = osb->blocked_lock_count; | ||
499 | while (processed) { | ||
500 | BUG_ON(list_empty(&osb->blocked_lock_list)); | ||
501 | |||
502 | lockres = list_entry(osb->blocked_lock_list.next, | ||
503 | struct ocfs2_lock_res, l_blocked_list); | ||
504 | list_del_init(&lockres->l_blocked_list); | ||
505 | osb->blocked_lock_count--; | ||
506 | spin_unlock(&osb->vote_task_lock); | ||
507 | |||
508 | BUG_ON(!processed); | ||
509 | processed--; | ||
510 | |||
511 | ocfs2_process_blocked_lock(osb, lockres); | ||
512 | |||
513 | spin_lock(&osb->vote_task_lock); | ||
514 | } | ||
515 | |||
516 | while (osb->vote_count) { | ||
517 | BUG_ON(list_empty(&osb->vote_list)); | ||
518 | work = list_entry(osb->vote_list.next, | ||
519 | struct ocfs2_vote_work, w_list); | ||
520 | list_del(&work->w_list); | ||
521 | osb->vote_count--; | ||
522 | spin_unlock(&osb->vote_task_lock); | ||
523 | |||
524 | ocfs2_process_vote(osb, &work->w_msg); | ||
525 | kfree(work); | ||
526 | |||
527 | spin_lock(&osb->vote_task_lock); | ||
528 | } | ||
529 | spin_unlock(&osb->vote_task_lock); | ||
530 | |||
531 | mlog_exit_void(); | ||
532 | } | ||
533 | |||
534 | static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb) | ||
535 | { | ||
536 | int empty = 0; | ||
537 | |||
538 | spin_lock(&osb->vote_task_lock); | ||
539 | if (list_empty(&osb->blocked_lock_list) && | ||
540 | list_empty(&osb->vote_list)) | ||
541 | empty = 1; | ||
542 | |||
543 | spin_unlock(&osb->vote_task_lock); | ||
544 | return empty; | ||
545 | } | ||
546 | |||
547 | static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb) | ||
548 | { | ||
549 | int should_wake = 0; | ||
550 | |||
551 | spin_lock(&osb->vote_task_lock); | ||
552 | if (osb->vote_work_sequence != osb->vote_wake_sequence) | ||
553 | should_wake = 1; | ||
554 | spin_unlock(&osb->vote_task_lock); | ||
555 | |||
556 | return should_wake; | ||
557 | } | ||
558 | |||
559 | int ocfs2_vote_thread(void *arg) | ||
560 | { | ||
561 | int status = 0; | ||
562 | struct ocfs2_super *osb = arg; | ||
563 | |||
564 | /* only quit once we've been asked to stop and there is no more | ||
565 | * work available */ | ||
566 | while (!(kthread_should_stop() && | ||
567 | ocfs2_vote_thread_lists_empty(osb))) { | ||
568 | |||
569 | wait_event_interruptible(osb->vote_event, | ||
570 | ocfs2_vote_thread_should_wake(osb) || | ||
571 | kthread_should_stop()); | ||
572 | |||
573 | mlog(0, "vote_thread: awoken\n"); | ||
574 | |||
575 | ocfs2_vote_thread_do_work(osb); | ||
576 | } | ||
577 | |||
578 | osb->vote_task = NULL; | ||
579 | return status; | ||
580 | } | ||
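
The thread itself is expected to be managed with the standard kthread helpers (kthread.h is included above). A plausible call site is sketched below -- the actual mount/unmount hookup lives elsewhere in this patch series, and the thread name is illustrative:

	/* Sketch: start the vote thread at mount time... */
	osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
	if (IS_ERR(osb->vote_task)) {
		status = PTR_ERR(osb->vote_task);
		osb->vote_task = NULL;
	}

	/* ...and stop it at unmount; kthread_should_stop() then lets
	 * the loop above drain its lists before exiting. */
	if (osb->vote_task)
		kthread_stop(osb->vote_task);
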
581 | |||
582 | static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id) | ||
583 | { | ||
584 | struct ocfs2_net_wait_ctxt *w; | ||
585 | |||
586 | w = kcalloc(1, sizeof(*w), GFP_KERNEL); | ||
587 | if (!w) { | ||
588 | mlog_errno(-ENOMEM); | ||
589 | goto bail; | ||
590 | } | ||
591 | |||
592 | INIT_LIST_HEAD(&w->n_list); | ||
593 | init_waitqueue_head(&w->n_event); | ||
594 | ocfs2_node_map_init(&w->n_node_map); | ||
595 | w->n_response_id = response_id; | ||
596 | w->n_callback = NULL; | ||
597 | bail: | ||
598 | return w; | ||
599 | } | ||
600 | |||
601 | static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb) | ||
602 | { | ||
603 | unsigned int ret; | ||
604 | |||
605 | spin_lock(&osb->net_response_lock); | ||
606 | ret = ++osb->net_response_ids; | ||
607 | spin_unlock(&osb->net_response_lock); | ||
608 | |||
609 | return ret; | ||
610 | } | ||
611 | |||
612 | static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb, | ||
613 | struct ocfs2_net_wait_ctxt *w) | ||
614 | { | ||
615 | spin_lock(&osb->net_response_lock); | ||
616 | list_del(&w->n_list); | ||
617 | spin_unlock(&osb->net_response_lock); | ||
618 | } | ||
619 | |||
620 | static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb, | ||
621 | struct ocfs2_net_wait_ctxt *w) | ||
622 | { | ||
623 | spin_lock(&osb->net_response_lock); | ||
624 | list_add_tail(&w->n_list, | ||
625 | &osb->net_response_list); | ||
626 | spin_unlock(&osb->net_response_lock); | ||
627 | } | ||
628 | |||
629 | static void __ocfs2_mark_node_responded(struct ocfs2_super *osb, | ||
630 | struct ocfs2_net_wait_ctxt *w, | ||
631 | int node_num) | ||
632 | { | ||
633 | assert_spin_locked(&osb->net_response_lock); | ||
634 | |||
635 | ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num); | ||
636 | if (ocfs2_node_map_is_empty(osb, &w->n_node_map)) | ||
637 | wake_up(&w->n_event); | ||
638 | } | ||
639 | |||
640 | /* Intended to be called from the node down callback, we fake remove | ||
641 | * the node from all our response contexts */ | ||
642 | void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, | ||
643 | int node_num) | ||
644 | { | ||
645 | struct list_head *p; | ||
646 | struct ocfs2_net_wait_ctxt *w = NULL; | ||
647 | |||
648 | spin_lock(&osb->net_response_lock); | ||
649 | |||
650 | list_for_each(p, &osb->net_response_list) { | ||
651 | w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list); | ||
652 | |||
653 | __ocfs2_mark_node_responded(osb, w, node_num); | ||
654 | } | ||
655 | |||
656 | spin_unlock(&osb->net_response_lock); | ||
657 | } | ||
658 | |||
659 | static int ocfs2_broadcast_vote(struct ocfs2_super *osb, | ||
660 | struct ocfs2_vote_msg *request, | ||
661 | unsigned int response_id, | ||
662 | int *response, | ||
663 | struct ocfs2_net_response_cb *callback) | ||
664 | { | ||
665 | int status, i, remote_err; | ||
666 | struct ocfs2_net_wait_ctxt *w = NULL; | ||
667 | int dequeued = 0; | ||
668 | |||
669 | mlog_entry_void(); | ||
670 | |||
671 | w = ocfs2_new_net_wait_ctxt(response_id); | ||
672 | if (!w) { | ||
673 | status = -ENOMEM; | ||
674 | mlog_errno(status); | ||
675 | goto bail; | ||
676 | } | ||
677 | w->n_callback = callback; | ||
678 | |||
679 | /* we're pretty much ready to go at this point, and this fills | ||
680 | * in n_response which we need anyway... */ | ||
681 | ocfs2_queue_net_wait_ctxt(osb, w); | ||
682 | |||
683 | i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0); | ||
684 | |||
685 | while (i != O2NM_INVALID_NODE_NUM) { | ||
686 | if (i != osb->node_num) { | ||
687 | mlog(0, "trying to send request to node %i\n", i); | ||
688 | ocfs2_node_map_set_bit(osb, &w->n_node_map, i); | ||
689 | |||
690 | remote_err = 0; | ||
691 | status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE, | ||
692 | osb->net_key, | ||
693 | request, | ||
694 | sizeof(*request), | ||
695 | i, | ||
696 | &remote_err); | ||
697 | if (status == -ETIMEDOUT) { | ||
698 | mlog(0, "remote node %d timed out!\n", i); | ||
699 | status = -EAGAIN; | ||
700 | goto bail; | ||
701 | } | ||
702 | if (remote_err < 0) { | ||
703 | status = remote_err; | ||
704 | mlog(0, "remote error %d on node %d!\n", | ||
705 | remote_err, i); | ||
706 | mlog_errno(status); | ||
707 | goto bail; | ||
708 | } | ||
709 | if (status < 0) { | ||
710 | mlog_errno(status); | ||
711 | goto bail; | ||
712 | } | ||
713 | } | ||
714 | i++; | ||
715 | i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i); | ||
716 | mlog(0, "next is %d, i am %d\n", i, osb->node_num); | ||
717 | } | ||
718 | mlog(0, "done sending, now waiting on responses...\n"); | ||
719 | |||
720 | wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map)); | ||
721 | |||
722 | ocfs2_dequeue_net_wait_ctxt(osb, w); | ||
723 | dequeued = 1; | ||
724 | |||
725 | *response = w->n_response; | ||
726 | status = 0; | ||
727 | bail: | ||
728 | if (w) { | ||
729 | if (!dequeued) | ||
730 | ocfs2_dequeue_net_wait_ctxt(osb, w); | ||
731 | kfree(w); | ||
732 | } | ||
733 | |||
734 | mlog_exit(status); | ||
735 | return status; | ||
736 | } | ||
737 | |||
738 | static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, | ||
739 | u64 blkno, | ||
740 | unsigned int generation, | ||
741 | enum ocfs2_vote_request type, | ||
742 | u32 priv) | ||
743 | { | ||
744 | struct ocfs2_vote_msg *request; | ||
745 | struct ocfs2_msg_hdr *hdr; | ||
746 | |||
747 | BUG_ON(!ocfs2_is_valid_vote_request(type)); | ||
748 | |||
749 | request = kcalloc(1, sizeof(*request), GFP_KERNEL); | ||
750 | if (!request) { | ||
751 | mlog_errno(-ENOMEM); | ||
752 | } else { | ||
753 | hdr = &request->v_hdr; | ||
754 | hdr->h_node_num = cpu_to_be32(osb->node_num); | ||
755 | hdr->h_request = cpu_to_be32(type); | ||
756 | hdr->h_blkno = cpu_to_be64(blkno); | ||
757 | hdr->h_generation = cpu_to_be32(generation); | ||
758 | |||
759 | request->md1.v_generic1 = cpu_to_be32(priv); | ||
760 | } | ||
761 | |||
762 | return request; | ||
763 | } | ||
764 | |||
765 | /* Complete the buildup of a new vote request and process the | ||
766 | * broadcast return value. */ | ||
767 | static int ocfs2_do_request_vote(struct ocfs2_super *osb, | ||
768 | struct ocfs2_vote_msg *request, | ||
769 | struct ocfs2_net_response_cb *callback) | ||
770 | { | ||
771 | int status, response; | ||
772 | unsigned int response_id; | ||
773 | struct ocfs2_msg_hdr *hdr; | ||
774 | |||
775 | response_id = ocfs2_new_response_id(osb); | ||
776 | |||
777 | hdr = &request->v_hdr; | ||
778 | hdr->h_response_id = cpu_to_be32(response_id); | ||
779 | |||
780 | status = ocfs2_broadcast_vote(osb, request, response_id, &response, | ||
781 | callback); | ||
782 | if (status < 0) { | ||
783 | mlog_errno(status); | ||
784 | goto bail; | ||
785 | } | ||
786 | |||
787 | status = response; | ||
788 | bail: | ||
789 | |||
790 | return status; | ||
791 | } | ||
792 | |||
793 | static int ocfs2_request_vote(struct inode *inode, | ||
794 | struct ocfs2_vote_msg *request, | ||
795 | struct ocfs2_net_response_cb *callback) | ||
796 | { | ||
797 | int status; | ||
798 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
799 | |||
800 | if (ocfs2_inode_is_new(inode)) | ||
801 | return 0; | ||
802 | |||
803 | status = -EAGAIN; | ||
804 | while (status == -EAGAIN) { | ||
805 | if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) && | ||
806 | signal_pending(current)) | ||
807 | return -ERESTARTSYS; | ||
808 | |||
809 | status = ocfs2_super_lock(osb, 0); | ||
810 | if (status < 0) { | ||
811 | mlog_errno(status); | ||
812 | break; | ||
813 | } | ||
814 | |||
815 | status = 0; | ||
816 | if (!ocfs2_node_map_is_only(osb, &osb->mounted_map, | ||
817 | osb->node_num)) | ||
818 | status = ocfs2_do_request_vote(osb, request, callback); | ||
819 | |||
820 | ocfs2_super_unlock(osb, 0); | ||
821 | } | ||
822 | return status; | ||
823 | } | ||
824 | |||
825 | static void ocfs2_delete_response_cb(void *priv, | ||
826 | struct ocfs2_response_msg *resp) | ||
827 | { | ||
828 | int orphaned_slot, node; | ||
829 | struct inode *inode = priv; | ||
830 | |||
831 | orphaned_slot = be32_to_cpu(resp->r_orphaned_slot); | ||
832 | node = be32_to_cpu(resp->r_hdr.h_node_num); | ||
833 | mlog(0, "node %d tells us that inode %"MLFu64" is orphaned in slot " | ||
834 | "%d\n", node, OCFS2_I(inode)->ip_blkno, orphaned_slot); | ||
835 | |||
836 | /* The other node may not actually know which slot the inode | ||
837 | * is orphaned in. */ | ||
838 | if (orphaned_slot == OCFS2_INVALID_SLOT) | ||
839 | return; | ||
840 | |||
841 | /* Ok, the responding node knows which slot this inode is | ||
842 | * orphaned in. We verify that the information is correct and | ||
843 | * then record this in the inode. ocfs2_delete_inode will use | ||
844 | * this information to determine which lock to take. */ | ||
845 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
846 | mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot && | ||
847 | OCFS2_I(inode)->ip_orphaned_slot | ||
848 | != OCFS2_INVALID_SLOT, "Inode %"MLFu64": Node %d " | ||
849 | "says it's orphaned in slot %d, we think it's in %d\n", | ||
850 | OCFS2_I(inode)->ip_blkno, | ||
851 | be32_to_cpu(resp->r_hdr.h_node_num), | ||
852 | orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot); | ||
853 | |||
854 | OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot; | ||
855 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
856 | } | ||
857 | |||
858 | int ocfs2_request_delete_vote(struct inode *inode) | ||
859 | { | ||
860 | int orphaned_slot, status; | ||
861 | struct ocfs2_net_response_cb delete_cb; | ||
862 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
863 | struct ocfs2_vote_msg *request; | ||
864 | |||
865 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
866 | orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; | ||
867 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
868 | |||
869 | delete_cb.rc_cb = ocfs2_delete_response_cb; | ||
870 | delete_cb.rc_priv = inode; | ||
871 | |||
872 | mlog(0, "Inode %"MLFu64", we start thinking orphaned slot is %d\n", | ||
873 | OCFS2_I(inode)->ip_blkno, orphaned_slot); | ||
874 | |||
875 | status = -ENOMEM; | ||
876 | request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, | ||
877 | inode->i_generation, | ||
878 | OCFS2_VOTE_REQ_DELETE, orphaned_slot); | ||
879 | if (request) { | ||
880 | status = ocfs2_request_vote(inode, request, &delete_cb); | ||
881 | |||
882 | kfree(request); | ||
883 | } | ||
884 | |||
885 | return status; | ||
886 | } | ||
887 | |||
888 | static void ocfs2_setup_unlink_vote(struct ocfs2_vote_msg *request, | ||
889 | struct dentry *dentry) | ||
890 | { | ||
891 | struct inode *parent = dentry->d_parent->d_inode; | ||
892 | |||
893 | /* We need some values which will uniquely identify a dentry | ||
894 | * on the other nodes so that they can find it and run | ||
895 | * d_delete against it. Parent directory block and full name | ||
896 | * should suffice. */ | ||
897 | |||
898 | mlog(0, "unlink/rename request: parent: %"MLFu64" name: %.*s\n", | ||
899 | OCFS2_I(parent)->ip_blkno, dentry->d_name.len, | ||
900 | dentry->d_name.name); | ||
901 | |||
902 | request->v_unlink_parent = cpu_to_be64(OCFS2_I(parent)->ip_blkno); | ||
903 | request->v_unlink_namelen = cpu_to_be32(dentry->d_name.len); | ||
904 | memcpy(request->v_unlink_dirent, dentry->d_name.name, | ||
905 | dentry->d_name.len); | ||
906 | } | ||
907 | |||
908 | int ocfs2_request_unlink_vote(struct inode *inode, | ||
909 | struct dentry *dentry, | ||
910 | unsigned int nlink) | ||
911 | { | ||
912 | int status; | ||
913 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
914 | struct ocfs2_vote_msg *request; | ||
915 | |||
916 | if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN) | ||
917 | return -ENAMETOOLONG; | ||
918 | |||
919 | status = -ENOMEM; | ||
920 | request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, | ||
921 | inode->i_generation, | ||
922 | OCFS2_VOTE_REQ_UNLINK, nlink); | ||
923 | if (request) { | ||
924 | ocfs2_setup_unlink_vote(request, dentry); | ||
925 | |||
926 | status = ocfs2_request_vote(inode, request, NULL); | ||
927 | |||
928 | kfree(request); | ||
929 | } | ||
930 | return status; | ||
931 | } | ||
932 | |||
933 | int ocfs2_request_rename_vote(struct inode *inode, | ||
934 | struct dentry *dentry) | ||
935 | { | ||
936 | int status; | ||
937 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
938 | struct ocfs2_vote_msg *request; | ||
939 | |||
940 | if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN) | ||
941 | return -ENAMETOOLONG; | ||
942 | |||
943 | status = -ENOMEM; | ||
944 | request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, | ||
945 | inode->i_generation, | ||
946 | OCFS2_VOTE_REQ_RENAME, 0); | ||
947 | if (request) { | ||
948 | ocfs2_setup_unlink_vote(request, dentry); | ||
949 | |||
950 | status = ocfs2_request_vote(inode, request, NULL); | ||
951 | |||
952 | kfree(request); | ||
953 | } | ||
954 | return status; | ||
955 | } | ||
956 | |||
957 | int ocfs2_request_mount_vote(struct ocfs2_super *osb) | ||
958 | { | ||
959 | int status; | ||
960 | struct ocfs2_vote_msg *request = NULL; | ||
961 | |||
962 | request = ocfs2_new_vote_request(osb, 0ULL, 0, | ||
963 | OCFS2_VOTE_REQ_MOUNT, 0); | ||
964 | if (!request) { | ||
965 | status = -ENOMEM; | ||
966 | goto bail; | ||
967 | } | ||
968 | |||
969 | status = -EAGAIN; | ||
970 | while (status == -EAGAIN) { | ||
971 | if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) && | ||
972 | signal_pending(current)) { | ||
973 | status = -ERESTARTSYS; | ||
974 | goto bail; | ||
975 | } | ||
976 | |||
977 | if (ocfs2_node_map_is_only(osb, &osb->mounted_map, | ||
978 | osb->node_num)) { | ||
979 | status = 0; | ||
980 | goto bail; | ||
981 | } | ||
982 | |||
983 | status = ocfs2_do_request_vote(osb, request, NULL); | ||
984 | } | ||
985 | |||
986 | bail: | ||
987 | if (request) | ||
988 | kfree(request); | ||
989 | |||
990 | return status; | ||
991 | } | ||
992 | |||
993 | int ocfs2_request_umount_vote(struct ocfs2_super *osb) | ||
994 | { | ||
995 | int status; | ||
996 | struct ocfs2_vote_msg *request = NULL; | ||
997 | |||
998 | request = ocfs2_new_vote_request(osb, 0ULL, 0, | ||
999 | OCFS2_VOTE_REQ_UMOUNT, 0); | ||
1000 | if (!request) { | ||
1001 | status = -ENOMEM; | ||
1002 | goto bail; | ||
1003 | } | ||
1004 | |||
1005 | status = -EAGAIN; | ||
1006 | while (status == -EAGAIN) { | ||
1007 | /* Do not check signals on this vote... We really want | ||
1008 | * this one to go all the way through. */ | ||
1009 | |||
1010 | if (ocfs2_node_map_is_only(osb, &osb->mounted_map, | ||
1011 | osb->node_num)) { | ||
1012 | status = 0; | ||
1013 | goto bail; | ||
1014 | } | ||
1015 | |||
1016 | status = ocfs2_do_request_vote(osb, request, NULL); | ||
1017 | } | ||
1018 | |||
1019 | bail: | ||
1020 | if (request) | ||
1021 | kfree(request); | ||
1022 | |||
1023 | return status; | ||
1024 | } | ||
1025 | |||
1026 | /* TODO: This should eventually be a hash table! */ | ||
1027 | static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb, | ||
1028 | u32 response_id) | ||
1029 | { | ||
1030 | struct list_head *p; | ||
1031 | struct ocfs2_net_wait_ctxt *w = NULL; | ||
1032 | |||
1033 | list_for_each(p, &osb->net_response_list) { | ||
1034 | w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list); | ||
1035 | if (response_id == w->n_response_id) | ||
1036 | break; | ||
1037 | w = NULL; | ||
1038 | } | ||
1039 | |||
1040 | return w; | ||
1041 | } | ||
1042 | |||
1043 | /* Translate response codes into local node errno values */ | ||
1044 | static inline int ocfs2_translate_response(int response) | ||
1045 | { | ||
1046 | int ret; | ||
1047 | |||
1048 | switch (response) { | ||
1049 | case OCFS2_RESPONSE_OK: | ||
1050 | ret = 0; | ||
1051 | break; | ||
1052 | |||
1053 | case OCFS2_RESPONSE_BUSY: | ||
1054 | ret = -EBUSY; | ||
1055 | break; | ||
1056 | |||
1057 | default: | ||
1058 | ret = -EINVAL; | ||
1059 | } | ||
1060 | |||
1061 | return ret; | ||
1062 | } | ||
1063 | |||
1064 | static int ocfs2_handle_response_message(struct o2net_msg *msg, | ||
1065 | u32 len, | ||
1066 | void *data) | ||
1067 | { | ||
1068 | unsigned int response_id, node_num; | ||
1069 | int response_status; | ||
1070 | struct ocfs2_super *osb = data; | ||
1071 | struct ocfs2_response_msg *resp; | ||
1072 | struct ocfs2_net_wait_ctxt * w; | ||
1073 | struct ocfs2_net_response_cb *resp_cb; | ||
1074 | |||
1075 | resp = (struct ocfs2_response_msg *) msg->buf; | ||
1076 | |||
1077 | response_id = be32_to_cpu(resp->r_hdr.h_response_id); | ||
1078 | node_num = be32_to_cpu(resp->r_hdr.h_node_num); | ||
1079 | response_status = | ||
1080 | ocfs2_translate_response(be32_to_cpu(resp->r_response)); | ||
1081 | |||
1082 | mlog(0, "received response message:\n"); | ||
1083 | mlog(0, "h_response_id = %u\n", response_id); | ||
1084 | mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request)); | ||
1085 | mlog(0, "h_blkno = %"MLFu64"\n", be64_to_cpu(resp->r_hdr.h_blkno)); | ||
1086 | mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation)); | ||
1087 | mlog(0, "h_node_num = %u\n", node_num); | ||
1088 | mlog(0, "r_response = %d\n", response_status); | ||
1089 | |||
1090 | spin_lock(&osb->net_response_lock); | ||
1091 | w = __ocfs2_find_net_wait_ctxt(osb, response_id); | ||
1092 | if (!w) { | ||
1093 | mlog(0, "request not found!\n"); | ||
1094 | goto bail; | ||
1095 | } | ||
1096 | resp_cb = w->n_callback; | ||
1097 | |||
1098 | if (response_status && (!w->n_response)) { | ||
1099 | /* we only really need one negative response so don't | ||
1100 | * set it twice. */ | ||
1101 | w->n_response = response_status; | ||
1102 | } | ||
1103 | |||
1104 | if (resp_cb) { | ||
1105 | spin_unlock(&osb->net_response_lock); | ||
1106 | |||
1107 | resp_cb->rc_cb(resp_cb->rc_priv, resp); | ||
1108 | |||
1109 | spin_lock(&osb->net_response_lock); | ||
1110 | } | ||
1111 | |||
1112 | __ocfs2_mark_node_responded(osb, w, node_num); | ||
1113 | bail: | ||
1114 | spin_unlock(&osb->net_response_lock); | ||
1115 | |||
1116 | return 0; | ||
1117 | } | ||
1118 | |||
1119 | static int ocfs2_handle_vote_message(struct o2net_msg *msg, | ||
1120 | u32 len, | ||
1121 | void *data) | ||
1122 | { | ||
1123 | int status; | ||
1124 | struct ocfs2_super *osb = data; | ||
1125 | struct ocfs2_vote_work *work; | ||
1126 | |||
1127 | work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_KERNEL); | ||
1128 | if (!work) { | ||
1129 | status = -ENOMEM; | ||
1130 | mlog_errno(status); | ||
1131 | goto bail; | ||
1132 | } | ||
1133 | |||
1134 | INIT_LIST_HEAD(&work->w_list); | ||
1135 | memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg)); | ||
1136 | |||
1137 | mlog(0, "scheduling vote request:\n"); | ||
1138 | mlog(0, "h_response_id = %u\n", | ||
1139 | be32_to_cpu(work->w_msg.v_hdr.h_response_id)); | ||
1140 | mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request)); | ||
1141 | mlog(0, "h_blkno = %"MLFu64"\n", | ||
1142 | be64_to_cpu(work->w_msg.v_hdr.h_blkno)); | ||
1143 | mlog(0, "h_generation = %u\n", | ||
1144 | be32_to_cpu(work->w_msg.v_hdr.h_generation)); | ||
1145 | mlog(0, "h_node_num = %u\n", | ||
1146 | be32_to_cpu(work->w_msg.v_hdr.h_node_num)); | ||
1147 | mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1)); | ||
1148 | |||
1149 | spin_lock(&osb->vote_task_lock); | ||
1150 | list_add_tail(&work->w_list, &osb->vote_list); | ||
1151 | osb->vote_count++; | ||
1152 | spin_unlock(&osb->vote_task_lock); | ||
1153 | |||
1154 | ocfs2_kick_vote_thread(osb); | ||
1155 | |||
1156 | status = 0; | ||
1157 | bail: | ||
1158 | return status; | ||
1159 | } | ||
1160 | |||
1161 | void ocfs2_unregister_net_handlers(struct ocfs2_super *osb) | ||
1162 | { | ||
1163 | if (!osb->net_key) | ||
1164 | return; | ||
1165 | |||
1166 | o2net_unregister_handler_list(&osb->osb_net_handlers); | ||
1167 | |||
1168 | if (!list_empty(&osb->net_response_list)) | ||
1169 | mlog(ML_ERROR, "net response list not empty!\n"); | ||
1170 | |||
1171 | osb->net_key = 0; | ||
1172 | } | ||
1173 | |||
1174 | int ocfs2_register_net_handlers(struct ocfs2_super *osb) | ||
1175 | { | ||
1176 | int status = 0; | ||
1177 | |||
1178 | status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE, | ||
1179 | osb->net_key, | ||
1180 | sizeof(struct ocfs2_response_msg), | ||
1181 | ocfs2_handle_response_message, | ||
1182 | osb, &osb->osb_net_handlers); | ||
1183 | if (status) { | ||
1184 | mlog_errno(status); | ||
1185 | goto bail; | ||
1186 | } | ||
1187 | |||
1188 | status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE, | ||
1189 | osb->net_key, | ||
1190 | sizeof(struct ocfs2_vote_msg), | ||
1191 | ocfs2_handle_vote_message, | ||
1192 | osb, &osb->osb_net_handlers); | ||
1193 | if (status) { | ||
1194 | mlog_errno(status); | ||
1195 | goto bail; | ||
1196 | } | ||
1197 | bail: | ||
1198 | if (status < 0) | ||
1199 | ocfs2_unregister_net_handlers(osb); | ||
1200 | |||
1201 | return status; | ||
1202 | } | ||
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h new file mode 100644 index 000000000000..9cce60703466 --- /dev/null +++ b/fs/ocfs2/vote.h | |||
@@ -0,0 +1,56 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * vote.h | ||
5 | * | ||
6 | * Vote thread and vote request function prototypes | ||
7 | * | ||
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 02111-1307, USA. | ||
24 | */ | ||
25 | |||
26 | |||
27 | #ifndef VOTE_H | ||
28 | #define VOTE_H | ||
29 | |||
30 | int ocfs2_vote_thread(void *arg); | ||
31 | static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb) | ||
32 | { | ||
33 | spin_lock(&osb->vote_task_lock); | ||
34 | /* make sure the voting thread gets a swipe at whatever changes | ||
35 | * the caller may have made to the voting state */ | ||
36 | osb->vote_wake_sequence++; | ||
37 | spin_unlock(&osb->vote_task_lock); | ||
38 | wake_up(&osb->vote_event); | ||
39 | } | ||
40 | |||
41 | int ocfs2_request_delete_vote(struct inode *inode); | ||
42 | int ocfs2_request_unlink_vote(struct inode *inode, | ||
43 | struct dentry *dentry, | ||
44 | unsigned int nlink); | ||
45 | int ocfs2_request_rename_vote(struct inode *inode, | ||
46 | struct dentry *dentry); | ||
47 | int ocfs2_request_mount_vote(struct ocfs2_super *osb); | ||
48 | int ocfs2_request_umount_vote(struct ocfs2_super *osb); | ||
49 | int ocfs2_register_net_handlers(struct ocfs2_super *osb); | ||
50 | void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); | ||
51 | |||
52 | void ocfs2_mark_inode_remotely_deleted(struct inode *inode); | ||
53 | |||
54 | void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, | ||
55 | int node_num); | ||
56 | #endif | ||
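
As a sketch of how the request side is driven (hypothetical call site, not part of this patch): since vote.c notes that the vote is sent *before* the local operation, an unlink path would broadcast first so other nodes can drop their cached dentries, e.g.:

	/* Hypothetical caller in the unlink path; error handling
	 * abbreviated, nlink value illustrative. */
	status = ocfs2_request_unlink_vote(inode, dentry,
					   inode->i_nlink - 1);
	if (status < 0)
		mlog_errno(status);
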
diff --git a/include/linux/configfs.h b/include/linux/configfs.h new file mode 100644 index 000000000000..acffb8c9073a --- /dev/null +++ b/include/linux/configfs.h | |||
@@ -0,0 +1,205 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * configfs.h - definitions for the configfs filesystem | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public | ||
17 | * License along with this program; if not, write to the | ||
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
19 | * Boston, MA 02111-1307, USA. | ||
20 | * | ||
21 | * Based on sysfs: | ||
22 | * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel | ||
23 | * | ||
24 | * Based on kobject.h: | ||
25 | * Copyright (c) 2002-2003 Patrick Mochel | ||
26 | * Copyright (c) 2002-2003 Open Source Development Labs | ||
27 | * | ||
28 | * configfs Copyright (C) 2005 Oracle. All rights reserved. | ||
29 | * | ||
30 | * Please read Documentation/filesystems/configfs/configfs.txt before using the | ||
31 | * configfs interface, ESPECIALLY the parts about reference counts and | ||
32 | * item destructors. | ||
33 | */ | ||
34 | |||
35 | #ifndef _CONFIGFS_H_ | ||
36 | #define _CONFIGFS_H_ | ||
37 | |||
38 | #ifdef __KERNEL__ | ||
39 | |||
40 | #include <linux/types.h> | ||
41 | #include <linux/list.h> | ||
42 | #include <linux/kref.h> | ||
43 | |||
44 | #include <asm/atomic.h> | ||
45 | #include <asm/semaphore.h> | ||
46 | |||
47 | #define CONFIGFS_ITEM_NAME_LEN 20 | ||
48 | |||
49 | struct module; | ||
50 | |||
51 | struct configfs_item_operations; | ||
52 | struct configfs_group_operations; | ||
53 | struct configfs_attribute; | ||
54 | struct configfs_subsystem; | ||
55 | |||
56 | struct config_item { | ||
57 | char *ci_name; | ||
58 | char ci_namebuf[CONFIGFS_ITEM_NAME_LEN]; | ||
59 | struct kref ci_kref; | ||
60 | struct list_head ci_entry; | ||
61 | struct config_item *ci_parent; | ||
62 | struct config_group *ci_group; | ||
63 | struct config_item_type *ci_type; | ||
64 | struct dentry *ci_dentry; | ||
65 | }; | ||
66 | |||
67 | extern int config_item_set_name(struct config_item *, const char *, ...); | ||
68 | |||
69 | static inline char *config_item_name(struct config_item * item) | ||
70 | { | ||
71 | return item->ci_name; | ||
72 | } | ||
73 | |||
74 | extern void config_item_init(struct config_item *); | ||
75 | extern void config_item_init_type_name(struct config_item *item, | ||
76 | const char *name, | ||
77 | struct config_item_type *type); | ||
78 | extern void config_item_cleanup(struct config_item *); | ||
79 | |||
80 | extern struct config_item * config_item_get(struct config_item *); | ||
81 | extern void config_item_put(struct config_item *); | ||
82 | |||
83 | struct config_item_type { | ||
84 | struct module *ct_owner; | ||
85 | struct configfs_item_operations *ct_item_ops; | ||
86 | struct configfs_group_operations *ct_group_ops; | ||
87 | struct configfs_attribute **ct_attrs; | ||
88 | }; | ||
89 | |||
90 | |||
91 | /** | ||
92 | * group - a group of config_items of a specific type, belonging | ||
93 | * to a specific subsystem. | ||
94 | */ | ||
95 | |||
96 | struct config_group { | ||
97 | struct config_item cg_item; | ||
98 | struct list_head cg_children; | ||
99 | struct configfs_subsystem *cg_subsys; | ||
100 | struct config_group **default_groups; | ||
101 | }; | ||
102 | |||
103 | |||
104 | extern void config_group_init(struct config_group *group); | ||
105 | extern void config_group_init_type_name(struct config_group *group, | ||
106 | const char *name, | ||
107 | struct config_item_type *type); | ||
108 | |||
109 | |||
110 | static inline struct config_group *to_config_group(struct config_item *item) | ||
111 | { | ||
112 | return item ? container_of(item,struct config_group,cg_item) : NULL; | ||
113 | } | ||
114 | |||
115 | static inline struct config_group *config_group_get(struct config_group *group) | ||
116 | { | ||
117 | return group ? to_config_group(config_item_get(&group->cg_item)) : NULL; | ||
118 | } | ||
119 | |||
120 | static inline void config_group_put(struct config_group *group) | ||
121 | { | ||
122 | config_item_put(&group->cg_item); | ||
123 | } | ||
124 | |||
125 | extern struct config_item *config_group_find_obj(struct config_group *, const char *); | ||
126 | |||
127 | |||
128 | struct configfs_attribute { | ||
129 | char *ca_name; | ||
130 | struct module *ca_owner; | ||
131 | mode_t ca_mode; | ||
132 | }; | ||
133 | |||
134 | |||
135 | /* | ||
136 | * If allow_link() exists, the item can symlink(2) out to other | ||
137 | * items. If the item is a group, it may support mkdir(2). | ||
138 | * Groups supply one of make_group() and make_item(). If the | ||
139 | * group supports make_group(), one can create group children. If it | ||
140 | * supports make_item(), one can create config_item children. If it has | ||
141 | * default_groups on group->default_groups, it has automatically created | ||
142 | * group children. default_groups may coexist alongside make_group() or | ||
143 | * make_item(), but if the group wishes to have only default_groups | ||
144 | * children (disallowing mkdir(2)), it need not provide either function. | ||
145 | * If the group has commit(), it supports pending and committed (active) | ||
146 | * items. | ||
147 | */ | ||
148 | struct configfs_item_operations { | ||
149 | void (*release)(struct config_item *); | ||
150 | ssize_t (*show_attribute)(struct config_item *, struct configfs_attribute *,char *); | ||
151 | ssize_t (*store_attribute)(struct config_item *,struct configfs_attribute *,const char *, size_t); | ||
152 | int (*allow_link)(struct config_item *src, struct config_item *target); | ||
153 | int (*drop_link)(struct config_item *src, struct config_item *target); | ||
154 | }; | ||
155 | |||
156 | struct configfs_group_operations { | ||
157 | struct config_item *(*make_item)(struct config_group *group, const char *name); | ||
158 | struct config_group *(*make_group)(struct config_group *group, const char *name); | ||
159 | int (*commit_item)(struct config_item *item); | ||
160 | void (*drop_item)(struct config_group *group, struct config_item *item); | ||
161 | }; | ||
162 | |||
163 | |||
164 | |||
165 | /** | ||
166 | * Use these macros to make defining attributes easier. See include/linux/device.h | ||
167 | * for examples. | ||
168 | */ | ||
169 | |||
170 | #if 0 | ||
171 | #define __ATTR(_name,_mode,_show,_store) { \ | ||
172 | .attr = {.ca_name = __stringify(_name), .ca_mode = _mode, .ca_owner = THIS_MODULE }, \ | ||
173 | .show = _show, \ | ||
174 | .store = _store, \ | ||
175 | } | ||
176 | |||
177 | #define __ATTR_RO(_name) { \ | ||
178 | .attr = { .ca_name = __stringify(_name), .ca_mode = 0444, .ca_owner = THIS_MODULE }, \ | ||
179 | .show = _name##_show, \ | ||
180 | } | ||
181 | |||
182 | #define __ATTR_NULL { .attr = { .name = NULL } } | ||
183 | |||
184 | #define attr_name(_attr) (_attr).attr.name | ||
185 | #endif | ||
186 | |||
187 | |||
188 | struct configfs_subsystem { | ||
189 | struct config_group su_group; | ||
190 | struct semaphore su_sem; | ||
191 | }; | ||
192 | |||
193 | static inline struct configfs_subsystem *to_configfs_subsystem(struct config_group *group) | ||
194 | { | ||
195 | return group ? | ||
196 | container_of(group, struct configfs_subsystem, su_group) : | ||
197 | NULL; | ||
198 | } | ||
199 | |||
200 | int configfs_register_subsystem(struct configfs_subsystem *subsys); | ||
201 | void configfs_unregister_subsystem(struct configfs_subsystem *subsys); | ||
202 | |||
203 | #endif /* __KERNEL__ */ | ||
204 | |||
205 | #endif /* _CONFIGFS_H_ */ | ||
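
A minimal client of this interface registers a subsystem whose root group carries a config_item_type. The sketch below uses illustrative names and assumes the era's semaphore API (init_MUTEX); the sample code under Documentation/filesystems/configfs/ added by this patch is the authoritative example:

	/* Sketch only: a childless subsystem named "example". */
	static struct config_item_type example_type = {
		.ct_owner = THIS_MODULE,
	};

	static struct configfs_subsystem example_subsys = {
		.su_group = {
			.cg_item = {
				.ci_namebuf = "example",
				.ci_type = &example_type,
			},
		},
	};

	static int __init example_init(void)
	{
		/* Appears as a directory at the configfs mount point
		 * once registered. */
		config_group_init(&example_subsys.su_group);
		init_MUTEX(&example_subsys.su_sem);
		return configfs_register_subsystem(&example_subsys);
	}
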
diff --git a/include/linux/fs.h b/include/linux/fs.h index cc35b6ac778d..ed9a41a71e8b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -302,6 +302,37 @@ struct iattr { | |||
302 | */ | 302 | */ |
303 | #include <linux/quota.h> | 303 | #include <linux/quota.h> |
304 | 304 | ||
305 | /** | ||
306 | * enum positive_aop_returns - aop return codes with specific semantics | ||
307 | * | ||
308 | * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has | ||
309 | * completed, that the page is still locked, and | ||
310 | * should be considered active. The VM uses this hint | ||
311 | * to return the page to the active list -- it won't | ||
312 | * be a candidate for writeback again in the near | ||
313 | * future. Other callers must be careful to unlock | ||
314 | * the page if they get this return. Returned by | ||
315 | * writepage(). | ||
316 | * | ||
317 | * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has | ||
318 | * unlocked it and the page might have been truncated. | ||
319 | * The caller should back up to acquiring a new page and | ||
320 | * trying again. The aop will be taking reasonable | ||
321 | * precautions not to livelock. If the caller held a page | ||
322 | * reference, it should drop it before retrying. Returned | ||
323 | * by readpage(), prepare_write(), and commit_write(). | ||
324 | * | ||
325 | * address_space_operation functions return these large constants to indicate | ||
326 | * special semantics to the caller. These are much larger than the bytes in a | ||
327 | * page to allow for functions that return the number of bytes operated on in a | ||
328 | * given page. | ||
329 | */ | ||
330 | |||
331 | enum positive_aop_returns { | ||
332 | AOP_WRITEPAGE_ACTIVATE = 0x80000, | ||
333 | AOP_TRUNCATED_PAGE = 0x80001, | ||
334 | }; | ||
335 | |||
305 | /* | 336 | /* |
306 | * oh the beauties of C type declarations. | 337 | * oh the beauties of C type declarations. |
307 | */ | 338 | */ |
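On the filesystem side, AOP_TRUNCATED_PAGE lets an address_space operation back out when it must drop the page lock to take another lock first, the sort of ordering OCFS2's cluster locks require. A hedged sketch of a readpage() using it; all myfs_* helpers are hypothetical:

static int myfs_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	int ret;

	/* lock ordering puts the cluster lock ahead of the page lock */
	if (!myfs_try_cluster_lock(inode)) {
		unlock_page(page);
		myfs_cluster_lock(inode);	/* sleep without the page lock */
		myfs_cluster_unlock(inode);
		/* the page may have been truncated while it was unlocked */
		return AOP_TRUNCATED_PAGE;
	}

	ret = myfs_do_readpage(inode, page);	/* unlocks the page when done */
	myfs_cluster_unlock(inode);
	return ret;
}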
diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 343d883d69c5..64a36ba43b2f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h | |||
@@ -60,12 +60,6 @@ struct writeback_control { | |||
60 | }; | 60 | }; |
61 | 61 | ||
62 | /* | 62 | /* |
63 | * ->writepage() return values (make these much larger than a pagesize, in | ||
64 | * case some fs is returning number-of-bytes-written from writepage) | ||
65 | */ | ||
66 | #define WRITEPAGE_ACTIVATE 0x80000 /* IO was not started: activate page */ | ||
67 | |||
68 | /* | ||
69 | * fs/fs-writeback.c | 63 | * fs/fs-writeback.c |
70 | */ | 64 | */ |
71 | void writeback_inodes(struct writeback_control *wbc); | 65 | void writeback_inodes(struct writeback_control *wbc); |
diff --git a/mm/filemap.c b/mm/filemap.c index 33a28bfde158..6e1d08a2b8b9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -831,8 +831,13 @@ readpage: | |||
831 | /* Start the actual read. The read will unlock the page. */ | 831 | /* Start the actual read. The read will unlock the page. */ |
832 | error = mapping->a_ops->readpage(filp, page); | 832 | error = mapping->a_ops->readpage(filp, page); |
833 | 833 | ||
834 | if (unlikely(error)) | 834 | if (unlikely(error)) { |
835 | if (error == AOP_TRUNCATED_PAGE) { | ||
836 | page_cache_release(page); | ||
837 | goto find_page; | ||
838 | } | ||
835 | goto readpage_error; | 839 | goto readpage_error; |
840 | } | ||
836 | 841 | ||
837 | if (!PageUptodate(page)) { | 842 | if (!PageUptodate(page)) { |
838 | lock_page(page); | 843 | lock_page(page); |
@@ -1152,26 +1157,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) | |||
1152 | { | 1157 | { |
1153 | struct address_space *mapping = file->f_mapping; | 1158 | struct address_space *mapping = file->f_mapping; |
1154 | struct page *page; | 1159 | struct page *page; |
1155 | int error; | 1160 | int ret; |
1156 | 1161 | ||
1157 | page = page_cache_alloc_cold(mapping); | 1162 | do { |
1158 | if (!page) | 1163 | page = page_cache_alloc_cold(mapping); |
1159 | return -ENOMEM; | 1164 | if (!page) |
1165 | return -ENOMEM; | ||
1166 | |||
1167 | ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); | ||
1168 | if (ret == 0) | ||
1169 | ret = mapping->a_ops->readpage(file, page); | ||
1170 | else if (ret == -EEXIST) | ||
1171 | ret = 0; /* losing race to add is OK */ | ||
1160 | 1172 | ||
1161 | error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); | ||
1162 | if (!error) { | ||
1163 | error = mapping->a_ops->readpage(file, page); | ||
1164 | page_cache_release(page); | 1173 | page_cache_release(page); |
1165 | return error; | ||
1166 | } | ||
1167 | 1174 | ||
1168 | /* | 1175 | } while (ret == AOP_TRUNCATED_PAGE); |
1169 | * We arrive here in the unlikely event that someone | 1176 | |
1170 | * raced with us and added our page to the cache first | 1177 | return ret; |
1171 | * or we are out of memory for radix-tree nodes. | ||
1172 | */ | ||
1173 | page_cache_release(page); | ||
1174 | return error == -EEXIST ? 0 : error; | ||
1175 | } | 1178 | } |
1176 | 1179 | ||
1177 | #define MMAP_LOTSAMISS (100) | 1180 | #define MMAP_LOTSAMISS (100) |
@@ -1331,10 +1334,14 @@ page_not_uptodate: | |||
1331 | goto success; | 1334 | goto success; |
1332 | } | 1335 | } |
1333 | 1336 | ||
1334 | if (!mapping->a_ops->readpage(file, page)) { | 1337 | error = mapping->a_ops->readpage(file, page); |
1338 | if (!error) { | ||
1335 | wait_on_page_locked(page); | 1339 | wait_on_page_locked(page); |
1336 | if (PageUptodate(page)) | 1340 | if (PageUptodate(page)) |
1337 | goto success; | 1341 | goto success; |
1342 | } else if (error == AOP_TRUNCATED_PAGE) { | ||
1343 | page_cache_release(page); | ||
1344 | goto retry_find; | ||
1338 | } | 1345 | } |
1339 | 1346 | ||
1340 | /* | 1347 | /* |
@@ -1358,10 +1365,14 @@ page_not_uptodate: | |||
1358 | goto success; | 1365 | goto success; |
1359 | } | 1366 | } |
1360 | ClearPageError(page); | 1367 | ClearPageError(page); |
1361 | if (!mapping->a_ops->readpage(file, page)) { | 1368 | error = mapping->a_ops->readpage(file, page); |
1369 | if (!error) { | ||
1362 | wait_on_page_locked(page); | 1370 | wait_on_page_locked(page); |
1363 | if (PageUptodate(page)) | 1371 | if (PageUptodate(page)) |
1364 | goto success; | 1372 | goto success; |
1373 | } else if (error == AOP_TRUNCATED_PAGE) { | ||
1374 | page_cache_release(page); | ||
1375 | goto retry_find; | ||
1365 | } | 1376 | } |
1366 | 1377 | ||
1367 | /* | 1378 | /* |
@@ -1444,10 +1455,14 @@ page_not_uptodate: | |||
1444 | goto success; | 1455 | goto success; |
1445 | } | 1456 | } |
1446 | 1457 | ||
1447 | if (!mapping->a_ops->readpage(file, page)) { | 1458 | error = mapping->a_ops->readpage(file, page); |
1459 | if (!error) { | ||
1448 | wait_on_page_locked(page); | 1460 | wait_on_page_locked(page); |
1449 | if (PageUptodate(page)) | 1461 | if (PageUptodate(page)) |
1450 | goto success; | 1462 | goto success; |
1463 | } else if (error == AOP_TRUNCATED_PAGE) { | ||
1464 | page_cache_release(page); | ||
1465 | goto retry_find; | ||
1451 | } | 1466 | } |
1452 | 1467 | ||
1453 | /* | 1468 | /* |
@@ -1470,10 +1485,14 @@ page_not_uptodate: | |||
1470 | } | 1485 | } |
1471 | 1486 | ||
1472 | ClearPageError(page); | 1487 | ClearPageError(page); |
1473 | if (!mapping->a_ops->readpage(file, page)) { | 1488 | error = mapping->a_ops->readpage(file, page); |
1489 | if (!error) { | ||
1474 | wait_on_page_locked(page); | 1490 | wait_on_page_locked(page); |
1475 | if (PageUptodate(page)) | 1491 | if (PageUptodate(page)) |
1476 | goto success; | 1492 | goto success; |
1493 | } else if (error == AOP_TRUNCATED_PAGE) { | ||
1494 | page_cache_release(page); | ||
1495 | goto retry_find; | ||
1477 | } | 1496 | } |
1478 | 1497 | ||
1479 | /* | 1498 | /* |
@@ -1934,12 +1953,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
1934 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | 1953 | status = a_ops->prepare_write(file, page, offset, offset+bytes); |
1935 | if (unlikely(status)) { | 1954 | if (unlikely(status)) { |
1936 | loff_t isize = i_size_read(inode); | 1955 | loff_t isize = i_size_read(inode); |
1956 | |||
1957 | if (status != AOP_TRUNCATED_PAGE) | ||
1958 | unlock_page(page); | ||
1959 | page_cache_release(page); | ||
1960 | if (status == AOP_TRUNCATED_PAGE) | ||
1961 | continue; | ||
1937 | /* | 1962 | /* |
1938 | * prepare_write() may have instantiated a few blocks | 1963 | * prepare_write() may have instantiated a few blocks |
1939 | * outside i_size. Trim these off again. | 1964 | * outside i_size. Trim these off again. |
1940 | */ | 1965 | */ |
1941 | unlock_page(page); | ||
1942 | page_cache_release(page); | ||
1943 | if (pos + bytes > isize) | 1966 | if (pos + bytes > isize) |
1944 | vmtruncate(inode, isize); | 1967 | vmtruncate(inode, isize); |
1945 | break; | 1968 | break; |
@@ -1952,6 +1975,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
1952 | cur_iov, iov_base, bytes); | 1975 | cur_iov, iov_base, bytes); |
1953 | flush_dcache_page(page); | 1976 | flush_dcache_page(page); |
1954 | status = a_ops->commit_write(file, page, offset, offset+bytes); | 1977 | status = a_ops->commit_write(file, page, offset, offset+bytes); |
1978 | if (status == AOP_TRUNCATED_PAGE) { | ||
1979 | page_cache_release(page); | ||
1980 | continue; | ||
1981 | } | ||
1955 | if (likely(copied > 0)) { | 1982 | if (likely(copied > 0)) { |
1956 | if (!status) | 1983 | if (!status) |
1957 | status = copied; | 1984 | status = copied; |
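Every filemap.c hunk above applies the same caller-side idiom; condensed into one hypothetical helper, where grab_locked_page() stands in for whichever page lookup the real call site uses:

static struct page *myfs_read_one_page(struct file *file,
				       struct address_space *mapping,
				       unsigned long index)
{
	struct page *page;
	int error;

	do {
		page = grab_locked_page(mapping, index);
		if (!page)
			return NULL;
		error = mapping->a_ops->readpage(file, page);
		if (error)	/* on AOP_TRUNCATED_PAGE the aop unlocked it */
			page_cache_release(page);
	} while (error == AOP_TRUNCATED_PAGE);

	return error ? NULL : page;
}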
diff --git a/mm/readahead.c b/mm/readahead.c index 72e7adbb87c7..8d6eeaaa6296 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
158 | { | 158 | { |
159 | unsigned page_idx; | 159 | unsigned page_idx; |
160 | struct pagevec lru_pvec; | 160 | struct pagevec lru_pvec; |
161 | int ret = 0; | 161 | int ret; |
162 | 162 | ||
163 | if (mapping->a_ops->readpages) { | 163 | if (mapping->a_ops->readpages) { |
164 | ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); | 164 | ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); |
@@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
171 | list_del(&page->lru); | 171 | list_del(&page->lru); |
172 | if (!add_to_page_cache(page, mapping, | 172 | if (!add_to_page_cache(page, mapping, |
173 | page->index, GFP_KERNEL)) { | 173 | page->index, GFP_KERNEL)) { |
174 | mapping->a_ops->readpage(filp, page); | 174 | ret = mapping->a_ops->readpage(filp, page); |
175 | if (!pagevec_add(&lru_pvec, page)) | 175 | if (ret != AOP_TRUNCATED_PAGE) { |
176 | __pagevec_lru_add(&lru_pvec); | 176 | if (!pagevec_add(&lru_pvec, page)) |
177 | } else { | 177 | __pagevec_lru_add(&lru_pvec); |
178 | page_cache_release(page); | 178 | continue; |
179 | } /* else fall through to release */ | ||
179 | } | 180 | } |
181 | page_cache_release(page); | ||
180 | } | 182 | } |
181 | pagevec_lru_add(&lru_pvec); | 183 | pagevec_lru_add(&lru_pvec); |
184 | ret = 0; | ||
182 | out: | 185 | out: |
183 | return ret; | 186 | return ret; |
184 | } | 187 | } |
diff --git a/mm/shmem.c b/mm/shmem.c index dc25565a61e9..d9fc277940da 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -855,7 +855,7 @@ unlock: | |||
855 | swap_free(swap); | 855 | swap_free(swap); |
856 | redirty: | 856 | redirty: |
857 | set_page_dirty(page); | 857 | set_page_dirty(page); |
858 | return WRITEPAGE_ACTIVATE; /* Return with the page locked */ | 858 | return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */ |
859 | } | 859 | } |
860 | 860 | ||
861 | #ifdef CONFIG_NUMA | 861 | #ifdef CONFIG_NUMA |
diff --git a/mm/vmscan.c b/mm/vmscan.c index b0cd81c32de6..795a050fe471 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -367,7 +367,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) | |||
367 | res = mapping->a_ops->writepage(page, &wbc); | 367 | res = mapping->a_ops->writepage(page, &wbc); |
368 | if (res < 0) | 368 | if (res < 0) |
369 | handle_write_error(mapping, page, res); | 369 | handle_write_error(mapping, page, res); |
370 | if (res == WRITEPAGE_ACTIVATE) { | 370 | if (res == AOP_WRITEPAGE_ACTIVATE) { |
371 | ClearPageReclaim(page); | 371 | ClearPageReclaim(page); |
372 | return PAGE_ACTIVATE; | 372 | return PAGE_ACTIVATE; |
373 | } | 373 | } |
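On the write side, shmem is so far the only writepage() returning the renamed code; the contract it follows looks roughly like this sketch (myfs_* names hypothetical):

static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	if (!myfs_can_write_now(page)) {
		/*
		 * No I/O was started: keep the page dirty, return with it
		 * still locked, and let the VM move it back to the active
		 * list (vmscan maps this to PAGE_ACTIVATE above).
		 */
		set_page_dirty(page);
		return AOP_WRITEPAGE_ACTIVATE;
	}
	return myfs_do_writepage(page, wbc);	/* starts I/O, unlocks the page */
}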