author     Linus Torvalds <torvalds@woody.linux-foundation.org>  2007-07-16 13:52:55 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>  2007-07-16 13:52:55 -0400
commit     add096909da63ef32d6766f6771c07c9f16c6ee5
tree       58594bcf68cbb6f777d5270d098ab8ca69cbaee3
parent     e245befce7af0a1e1347079ed62695b059594bd4
parent     54c57dc3b6578356c0a428c767d4bf080254a2ee
Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2
* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (32 commits)
[PATCH] ocfs2: zero_user_page conversion
ocfs2: Support xfs style space reservation ioctls
ocfs2: support for removing file regions
ocfs2: update truncate handling of partial clusters
ocfs2: btree support for removal of arbitrary extents
ocfs2: Support creation of unwritten extents
ocfs2: support writing of unwritten extents
ocfs2: small cleanup of ocfs2_write_begin_nolock()
ocfs2: btree changes for unwritten extents
ocfs2: abstract btree growing calls
ocfs2: use all extent block suballocators
ocfs2: plug truncate into cached dealloc routines
ocfs2: simplify deallocation locking
ocfs2: harden buffer check during mapping of page blocks
ocfs2: shared writeable mmap
ocfs2: factor out write aops into nolock variants
ocfs2: rework ocfs2_buffered_write_cluster()
ocfs2: take ip_alloc_sem during entire truncate
ocfs2: Add "preferred slot" mount option
[KJ PATCH] Replacing memset(<addr>,0,PAGE_SIZE) with clear_page() in fs/ocfs2/dlm/dlmrecovery.c
...
39 files changed, 4623 insertions(+), 1054 deletions(-)
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
index b34cdb50eab4..d1b98257d000 100644
--- a/Documentation/filesystems/configfs/configfs.txt
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -238,6 +238,8 @@ config_item_type.
 	struct config_group *(*make_group)(struct config_group *group,
 					   const char *name);
 	int (*commit_item)(struct config_item *item);
+	void (*disconnect_notify)(struct config_group *group,
+				  struct config_item *item);
 	void (*drop_item)(struct config_group *group,
 			  struct config_item *item);
 };
@@ -268,6 +270,16 @@ the item in other threads, the memory is safe. It may take some time
 for the item to actually disappear from the subsystem's usage. But it
 is gone from configfs.
 
+When drop_item() is called, the item's linkage has already been torn
+down. It no longer has a reference on its parent and has no place in
+the item hierarchy. If a client needs to do some cleanup before this
+teardown happens, the subsystem can implement the
+ct_group_ops->disconnect_notify() method. The method is called after
+configfs has removed the item from the filesystem view but before the
+item is removed from its parent group. Like drop_item(),
+disconnect_notify() is void and cannot fail. Client subsystems should
+not drop any references here, as they still must do it in drop_item().
+
 A config_group cannot be removed while it still has child items. This
 is implemented in the configfs rmdir(2) code. ->drop_item() will not be
 called, as the item has not been dropped. rmdir(2) will fail, as the
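A minimal sketch of a client using the new callback, wired in next to drop_item(); my_child_make_item(), to_my_child() and my_child_shutdown() are hypothetical client helpers, not part of this patch:

    static void my_child_disconnect_notify(struct config_group *group,
                                           struct config_item *item)
    {
            /*
             * The item is already gone from the filesystem view but is
             * still linked into the hierarchy, so cleanup that walks
             * the tree is safe here.  Do NOT drop references; that
             * still happens in drop_item().
             */
            my_child_shutdown(to_my_child(item));
    }

    static void my_child_drop_item(struct config_group *group,
                                   struct config_item *item)
    {
            config_item_put(item);  /* drop make_item()'s initial ref */
    }

    static struct configfs_group_operations my_group_ops = {
            .make_item              = my_child_make_item,
            .disconnect_notify      = my_child_disconnect_notify,
            .drop_item              = my_child_drop_item,
    };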
@@ -280,18 +292,18 @@ tells configfs to make the subsystem appear in the file tree.
 
 	struct configfs_subsystem {
 		struct config_group su_group;
-		struct semaphore su_sem;
+		struct mutex su_mutex;
 	};
 
 	int configfs_register_subsystem(struct configfs_subsystem *subsys);
 	void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
 
-A subsystem consists of a toplevel config_group and a semaphore.
+A subsystem consists of a toplevel config_group and a mutex.
 The group is where child config_items are created. For a subsystem,
 this group is usually defined statically. Before calling
 configfs_register_subsystem(), the subsystem must have initialized the
 group via the usual group _init() functions, and it must also have
-initialized the semaphore.
+initialized the mutex.
 When the register call returns, the subsystem is live, and it
 will be visible via configfs. At that point, mkdir(2) can be called and
 the subsystem must be ready for it.
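Registration against the converted locking might look like this sketch; my_subsys and my_subsys_type are hypothetical names, and my_group_ops is the sketch above:

    static struct config_item_type my_subsys_type = {
            .ct_group_ops   = &my_group_ops,
            .ct_owner       = THIS_MODULE,
    };

    static struct configfs_subsystem my_subsys = {
            .su_group = {
                    .cg_item = {
                            .ci_namebuf     = "my_subsys",
                            .ci_type        = &my_subsys_type,
                    },
            },
    };

    static int __init my_subsys_init(void)
    {
            config_group_init(&my_subsys.su_group);
            mutex_init(&my_subsys.su_mutex);   /* was init_MUTEX(&su_sem) */
            return configfs_register_subsystem(&my_subsys);
    }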
@@ -303,7 +315,7 @@ subsystem/group and the simple_child item in configfs_example.c It
 shows a trivial object displaying and storing an attribute, and a simple
 group creating and destroying these children.
 
-[Hierarchy Navigation and the Subsystem Semaphore]
+[Hierarchy Navigation and the Subsystem Mutex]
 
 There is an extra bonus that configfs provides. The config_groups and
 config_items are arranged in a hierarchy due to the fact that they
@@ -314,19 +326,19 @@ and config_item->ci_parent structure members.
 
 A subsystem can navigate the cg_children list and the ci_parent pointer
 to see the tree created by the subsystem. This can race with configfs'
-management of the hierarchy, so configfs uses the subsystem semaphore to
+management of the hierarchy, so configfs uses the subsystem mutex to
 protect modifications. Whenever a subsystem wants to navigate the
 hierarchy, it must do so under the protection of the subsystem
-semaphore.
+mutex.
 
-A subsystem will be prevented from acquiring the semaphore while a newly
+A subsystem will be prevented from acquiring the mutex while a newly
 allocated item has not been linked into this hierarchy. Similarly, it
-will not be able to acquire the semaphore while a dropping item has not
+will not be able to acquire the mutex while a dropping item has not
 yet been unlinked. This means that an item's ci_parent pointer will
 never be NULL while the item is in configfs, and that an item will only
 be in its parent's cg_children list for the same duration. This allows
 a subsystem to trust ci_parent and cg_children while they hold the
-semaphore.
+mutex.
 
 [Item Aggregation Via symlink(2)]
 
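A minimal sketch of such a walk, reusing the hypothetical my_subsys from the previous sketch:

    static void my_subsys_list_children(void)
    {
            struct config_item *item;

            mutex_lock(&my_subsys.su_mutex);
            list_for_each_entry(item, &my_subsys.su_group.cg_children,
                                ci_entry) {
                    /* ci_parent is guaranteed non-NULL under su_mutex */
                    printk(KERN_INFO "%s/%s\n",
                           config_item_name(item->ci_parent),
                           config_item_name(item));
            }
            mutex_unlock(&my_subsys.su_mutex);
    }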
@@ -386,6 +398,33 @@ As a consequence of this, default_groups cannot be removed directly via
 rmdir(2). They also are not considered when rmdir(2) on the parent
 group is checking for children.
 
+[Dependent Subsystems]
+
+Sometimes other drivers depend on particular configfs items. For
+example, ocfs2 mounts depend on a heartbeat region item. If that
+region item is removed with rmdir(2), the ocfs2 mount must BUG or go
+readonly. Not happy.
+
+configfs provides two additional API calls: configfs_depend_item() and
+configfs_undepend_item(). A client driver can call
+configfs_depend_item() on an existing item to tell configfs that it is
+depended on. configfs will then return -EBUSY from rmdir(2) for that
+item. When the item is no longer depended on, the client driver calls
+configfs_undepend_item() on it.
+
+These APIs cannot be called underneath any configfs callbacks, as
+they will conflict. They can block and allocate. A client driver
+probably shouldn't be calling them of its own gumption. Rather it
+should be providing an API that external subsystems call.
+
+How does this work? Imagine the ocfs2 mount process. When it mounts,
+it asks for a heartbeat region item. This is done via a call into the
+heartbeat code. Inside the heartbeat code, the region item is looked
+up. Here, the heartbeat code calls configfs_depend_item(). If it
+succeeds, then heartbeat knows the region is safe to give to ocfs2.
+If it fails, it was being torn down anyway, and heartbeat can gracefully
+pass up an error.
+
 [Committable Items]
 
 NOTE: Committable items are currently unimplemented.
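The heartbeat pattern described above might be wrapped by the client driver roughly as follows; my_region_get()/my_region_put() are hypothetical, and config_group_find_item() (renamed from config_group_find_obj() in this merge) already takes a reference on the found item:

    struct config_item *my_region_get(const char *name)
    {
            struct config_item *item;
            int ret;

            mutex_lock(&my_subsys.su_mutex);
            item = config_group_find_item(&my_subsys.su_group, name);
            mutex_unlock(&my_subsys.su_mutex);
            if (!item)
                    return ERR_PTR(-ENOENT);

            /* Pin the region; rmdir(2) now returns -EBUSY for it. */
            ret = configfs_depend_item(&my_subsys, item);
            if (ret) {
                    /* The region was being torn down anyway. */
                    config_item_put(item);
                    return ERR_PTR(ret);
            }

            return item;
    }

    void my_region_put(struct config_item *item)
    {
            configfs_undepend_item(&my_subsys, item);
            config_item_put(item);
    }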
diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c
index 2d6a14a463e0..e56d49264b39 100644
--- a/Documentation/filesystems/configfs/configfs_example.c
+++ b/Documentation/filesystems/configfs/configfs_example.c
@@ -453,7 +453,7 @@ static int __init configfs_example_init(void)
 		subsys = example_subsys[i];
 
 		config_group_init(&subsys->su_group);
-		init_MUTEX(&subsys->su_sem);
+		mutex_init(&subsys->su_mutex);
 		ret = configfs_register_subsystem(subsys);
 		if (ret) {
 			printk(KERN_ERR "Error %d while registering subsystem %s\n",
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 7b48c034b312..3b0185fdf9a4 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -29,10 +29,11 @@
 
 struct configfs_dirent {
 	atomic_t		s_count;
+	int			s_dependent_count;
 	struct list_head	s_sibling;
 	struct list_head	s_children;
 	struct list_head	s_links;
 	void			* s_element;
 	int			s_type;
 	umode_t			s_mode;
 	struct dentry		* s_dentry;
@@ -41,8 +42,8 @@ struct configfs_dirent {
 
 #define CONFIGFS_ROOT		0x0001
 #define CONFIGFS_DIR		0x0002
 #define CONFIGFS_ITEM_ATTR	0x0004
 #define CONFIGFS_ITEM_LINK	0x0020
 #define CONFIGFS_USET_DIR	0x0040
 #define CONFIGFS_USET_DEFAULT	0x0080
 #define CONFIGFS_USET_DROPPING	0x0100
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5e6e37e58f36..2f436d4f1d6d 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -355,6 +355,10 @@ static int configfs_detach_prep(struct dentry *dentry)
 		/* Mark that we've taken i_mutex */
 		sd->s_type |= CONFIGFS_USET_DROPPING;
 
+		/*
+		 * Yup, recursive. If there's a problem, blame
+		 * deep nesting of default_groups
+		 */
 		ret = configfs_detach_prep(sd->s_dentry);
 		if (!ret)
 			continue;
@@ -562,7 +566,7 @@ static int populate_groups(struct config_group *group)
 
 /*
  * All of link_obj/unlink_obj/link_group/unlink_group require that
- * subsys->su_sem is held.
+ * subsys->su_mutex is held.
  */
 
 static void unlink_obj(struct config_item *item)
@@ -714,6 +718,28 @@ static void configfs_detach_group(struct config_item *item)
 }
 
 /*
+ * After the item has been detached from the filesystem view, we are
+ * ready to tear it out of the hierarchy. Notify the client before
+ * we do that so they can perform any cleanup that requires
+ * navigating the hierarchy. A client does not need to provide this
+ * callback. The subsystem mutex MUST be held by the caller, and
+ * references must be valid for both items. It also assumes the
+ * caller has validated ci_type.
+ */
+static void client_disconnect_notify(struct config_item *parent_item,
+				     struct config_item *item)
+{
+	struct config_item_type *type;
+
+	type = parent_item->ci_type;
+	BUG_ON(!type);
+
+	if (type->ct_group_ops && type->ct_group_ops->disconnect_notify)
+		type->ct_group_ops->disconnect_notify(to_config_group(parent_item),
+						      item);
+}
+
+/*
  * Drop the initial reference from make_item()/make_group()
  * This function assumes that reference is held on item
  * and that item holds a valid reference to the parent. Also, it
@@ -733,11 +759,244 @@ static void client_drop_item(struct config_item *parent_item,
 	 */
 	if (type->ct_group_ops && type->ct_group_ops->drop_item)
 		type->ct_group_ops->drop_item(to_config_group(parent_item),
 					      item);
 	else
 		config_item_put(item);
 }
 
+#ifdef DEBUG
+static void configfs_dump_one(struct configfs_dirent *sd, int level)
+{
+	printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd));
+
+#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type);
+	type_print(CONFIGFS_ROOT);
+	type_print(CONFIGFS_DIR);
+	type_print(CONFIGFS_ITEM_ATTR);
+	type_print(CONFIGFS_ITEM_LINK);
+	type_print(CONFIGFS_USET_DIR);
+	type_print(CONFIGFS_USET_DEFAULT);
+	type_print(CONFIGFS_USET_DROPPING);
+#undef type_print
+}
+
+static int configfs_dump(struct configfs_dirent *sd, int level)
+{
+	struct configfs_dirent *child_sd;
+	int ret = 0;
+
+	configfs_dump_one(sd, level);
+
+	if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT)))
+		return 0;
+
+	list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
+		ret = configfs_dump(child_sd, level + 2);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+#endif
+
+
+/*
+ * configfs_depend_item() and configfs_undepend_item()
+ *
+ * WARNING: Do not call these from a configfs callback!
+ *
+ * This describes these functions and their helpers.
+ *
+ * Allow another kernel system to depend on a config_item. If this
+ * happens, the item cannot go away until the dependent can live without
+ * it. The idea is to give client modules as simple an interface as
+ * possible. When a system asks them to depend on an item, they just
+ * call configfs_depend_item(). If the item is live and the client
+ * driver is in good shape, we'll happily do the work for them.
+ *
+ * Why is the locking complex? Because configfs uses the VFS to handle
+ * all locking, but this function is called outside the normal
+ * VFS->configfs path. So it must take VFS locks to prevent the
+ * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is
+ * why you can't call these functions underneath configfs callbacks.
+ *
+ * Note, btw, that this can be called at *any* time, even when a configfs
+ * subsystem isn't registered, or when configfs is loading or unloading.
+ * Just like configfs_register_subsystem(). So we take the same
+ * precautions. We pin the filesystem. We lock each i_mutex _in_order_
+ * on our way down the tree. If we can find the target item in the
+ * configfs tree, it must be part of the subsystem tree as well, so we
+ * do not need the subsystem mutex. Holding the i_mutex chain locks
+ * out mkdir() and rmdir(), who might be racing us.
+ */
+
+/*
+ * configfs_depend_prep()
+ *
+ * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
+ * attributes. This is similar to, but not the same as, configfs_detach_prep().
+ * Note that configfs_detach_prep() expects the parent to be locked when it
+ * is called, but we lock the parent *inside* configfs_depend_prep(). We
+ * do that so we can unlock it if we find nothing.
+ *
+ * Here we do a depth-first search of the dentry hierarchy looking for
+ * our object. We take i_mutex on each step of the way down. IT IS
+ * ESSENTIAL THAT i_mutex LOCKING IS ORDERED. If we come back up a branch,
+ * we'll drop the i_mutex.
+ *
+ * If the target is not found, -ENOENT is bubbled up and we have released
+ * all locks. If the target was found, the locks will be cleared by
+ * configfs_depend_rollback().
+ *
+ * This adds a requirement that all config_items be unique!
+ *
+ * This is recursive because the locking traversal is tricky. There isn't
+ * much on the stack, though, so folks that need this function - be careful
+ * about your stack! Patches will be accepted to make it iterative.
+ */
+static int configfs_depend_prep(struct dentry *origin,
+				struct config_item *target)
+{
+	struct configfs_dirent *child_sd, *sd = origin->d_fsdata;
+	int ret = 0;
+
+	BUG_ON(!origin || !sd);
+
+	/* Lock this guy on the way down */
+	mutex_lock(&sd->s_dentry->d_inode->i_mutex);
+	if (sd->s_element == target)  /* Boo-yah */
+		goto out;
+
+	list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
+		if (child_sd->s_type & CONFIGFS_DIR) {
+			ret = configfs_depend_prep(child_sd->s_dentry,
+						   target);
+			if (!ret)
+				goto out;  /* Child path boo-yah */
+		}
+	}
+
+	/* We looped all our children and didn't find target */
+	mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
+	ret = -ENOENT;
+
+out:
+	return ret;
+}
+
+/*
+ * This is ONLY called if configfs_depend_prep() did its job. So we can
+ * trust the entire path from item back up to origin.
+ *
+ * We walk backwards from item, unlocking each i_mutex. We finish by
+ * unlocking origin.
+ */
+static void configfs_depend_rollback(struct dentry *origin,
+				     struct config_item *item)
+{
+	struct dentry *dentry = item->ci_dentry;
+
+	while (dentry != origin) {
+		mutex_unlock(&dentry->d_inode->i_mutex);
+		dentry = dentry->d_parent;
+	}
+
+	mutex_unlock(&origin->d_inode->i_mutex);
+}
+
+int configfs_depend_item(struct configfs_subsystem *subsys,
+			 struct config_item *target)
+{
+	int ret;
+	struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
+	struct config_item *s_item = &subsys->su_group.cg_item;
+
+	/*
+	 * Pin the configfs filesystem. This means we can safely access
+	 * the root of the configfs filesystem.
+	 */
+	ret = configfs_pin_fs();
+	if (ret)
+		return ret;
+
+	/*
+	 * Next, lock the root directory. We're going to check that the
+	 * subsystem is really registered, and so we need to lock out
+	 * configfs_[un]register_subsystem().
+	 */
+	mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
+
+	root_sd = configfs_sb->s_root->d_fsdata;
+
+	list_for_each_entry(p, &root_sd->s_children, s_sibling) {
+		if (p->s_type & CONFIGFS_DIR) {
+			if (p->s_element == s_item) {
+				subsys_sd = p;
+				break;
+			}
+		}
+	}
+
+	if (!subsys_sd) {
+		ret = -ENOENT;
+		goto out_unlock_fs;
+	}
+
+	/* Ok, now we can trust subsys/s_item */
+
+	/* Scan the tree, locking i_mutex recursively, return 0 if found */
+	ret = configfs_depend_prep(subsys_sd->s_dentry, target);
+	if (ret)
+		goto out_unlock_fs;
+
+	/* We hold all i_mutexes from the subsystem down to the target */
+	p = target->ci_dentry->d_fsdata;
+	p->s_dependent_count += 1;
+
+	configfs_depend_rollback(subsys_sd->s_dentry, target);
+
+out_unlock_fs:
+	mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
+
+	/*
+	 * If we succeeded, the fs is pinned via other methods. If not,
+	 * we're done with it anyway. So release_fs() is always right.
+	 */
+	configfs_release_fs();
+
+	return ret;
+}
+EXPORT_SYMBOL(configfs_depend_item);
+
+/*
+ * Release the dependent linkage. This is much simpler than
+ * configfs_depend_item() because we know that the client driver is
+ * pinned, thus the subsystem is pinned, and therefore configfs is pinned.
+ */
+void configfs_undepend_item(struct configfs_subsystem *subsys,
+			    struct config_item *target)
+{
+	struct configfs_dirent *sd;
+
+	/*
+	 * Since we can trust everything is pinned, we just need i_mutex
+	 * on the item.
+	 */
+	mutex_lock(&target->ci_dentry->d_inode->i_mutex);
+
+	sd = target->ci_dentry->d_fsdata;
+	BUG_ON(sd->s_dependent_count < 1);
+
+	sd->s_dependent_count -= 1;
+
+	/*
+	 * After this unlock, we cannot trust the item to stay alive!
+	 * DO NOT REFERENCE item after this unlock.
+	 */
+	mutex_unlock(&target->ci_dentry->d_inode->i_mutex);
+}
+EXPORT_SYMBOL(configfs_undepend_item);
 
 static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
@@ -783,7 +1042,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
 
-	down(&subsys->su_sem);
+	mutex_lock(&subsys->su_mutex);
 	group = NULL;
 	item = NULL;
 	if (type->ct_group_ops->make_group) {
@@ -797,7 +1056,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		if (item)
 			link_obj(parent_item, item);
 	}
-	up(&subsys->su_sem);
+	mutex_unlock(&subsys->su_mutex);
 
 	kfree(name);
 	if (!item) {
@@ -841,13 +1100,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 out_unlink:
 	if (ret) {
 		/* Tear down everything we built up */
-		down(&subsys->su_sem);
+		mutex_lock(&subsys->su_mutex);
+
+		client_disconnect_notify(parent_item, item);
 		if (group)
 			unlink_group(group);
 		else
 			unlink_obj(item);
 		client_drop_item(parent_item, item);
-		up(&subsys->su_sem);
+
+		mutex_unlock(&subsys->su_mutex);
 
 		if (module_got)
 			module_put(owner);
@@ -881,6 +1143,13 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (sd->s_type & CONFIGFS_USET_DEFAULT)
 		return -EPERM;
 
+	/*
+	 * Here's where we check for dependents. We're protected by
+	 * i_mutex.
+	 */
+	if (sd->s_dependent_count)
+		return -EBUSY;
+
 	/* Get a working ref until we have the child */
 	parent_item = configfs_get_config_item(dentry->d_parent);
 	subsys = to_config_group(parent_item)->cg_subsys;
@@ -910,17 +1179,19 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (sd->s_type & CONFIGFS_USET_DIR) {
 		configfs_detach_group(item);
 
-		down(&subsys->su_sem);
+		mutex_lock(&subsys->su_mutex);
+		client_disconnect_notify(parent_item, item);
 		unlink_group(to_config_group(item));
 	} else {
 		configfs_detach_item(item);
 
-		down(&subsys->su_sem);
+		mutex_lock(&subsys->su_mutex);
+		client_disconnect_notify(parent_item, item);
 		unlink_obj(item);
 	}
 
 	client_drop_item(parent_item, item);
-	up(&subsys->su_sem);
+	mutex_unlock(&subsys->su_mutex);
 
 	/* Drop our reference from above */
 	config_item_put(item);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 3527c7c6def8..a3658f9a082c 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -27,19 +27,26 @@
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/mutex.h>
 #include <asm/uaccess.h>
-#include <asm/semaphore.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
 
+/*
+ * A simple attribute can only be 4096 characters. Why 4k? Because the
+ * original code limited it to PAGE_SIZE. That's a bad idea, though,
+ * because an attribute of 16k on ia64 won't work on x86. So we limit to
+ * 4k, our minimum common page size.
+ */
+#define SIMPLE_ATTR_SIZE 4096
 
 struct configfs_buffer {
 	size_t			count;
 	loff_t			pos;
 	char			* page;
 	struct configfs_item_operations	* ops;
-	struct semaphore	sem;
+	struct mutex		mutex;
 	int			needs_read_fill;
 };
 
@@ -69,7 +76,7 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
 
 	count = ops->show_attribute(item,attr,buffer->page);
 	buffer->needs_read_fill = 0;
-	BUG_ON(count > (ssize_t)PAGE_SIZE);
+	BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
 	if (count >= 0)
 		buffer->count = count;
 	else
@@ -102,7 +109,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
 	struct configfs_buffer * buffer = file->private_data;
 	ssize_t retval = 0;
 
-	down(&buffer->sem);
+	mutex_lock(&buffer->mutex);
 	if (buffer->needs_read_fill) {
 		if ((retval = fill_read_buffer(file->f_path.dentry,buffer)))
 			goto out;
@@ -112,7 +119,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
 	retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
 					 buffer->count);
 out:
-	up(&buffer->sem);
+	mutex_unlock(&buffer->mutex);
 	return retval;
 }
 
@@ -137,8 +144,8 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size
 	if (!buffer->page)
 		return -ENOMEM;
 
-	if (count >= PAGE_SIZE)
-		count = PAGE_SIZE - 1;
+	if (count >= SIMPLE_ATTR_SIZE)
+		count = SIMPLE_ATTR_SIZE - 1;
 	error = copy_from_user(buffer->page,buf,count);
 	buffer->needs_read_fill = 1;
 	/* if buf is assumed to contain a string, terminate it by \0,
@@ -193,13 +200,13 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
 	struct configfs_buffer * buffer = file->private_data;
 	ssize_t len;
 
-	down(&buffer->sem);
+	mutex_lock(&buffer->mutex);
 	len = fill_write_buffer(buffer, buf, count);
 	if (len > 0)
 		len = flush_write_buffer(file->f_path.dentry, buffer, count);
 	if (len > 0)
 		*ppos += len;
-	up(&buffer->sem);
+	mutex_unlock(&buffer->mutex);
 	return len;
 }
 
@@ -253,7 +260,7 @@ static int check_perm(struct inode * inode, struct file * file)
 		error = -ENOMEM;
 		goto Enomem;
 	}
-	init_MUTEX(&buffer->sem);
+	mutex_init(&buffer->mutex);
 	buffer->needs_read_fill = 1;
 	buffer->ops = ops;
 	file->private_data = buffer;
@@ -292,6 +299,7 @@ static int configfs_release(struct inode * inode, struct file * filp)
 	if (buffer) {
 		if (buffer->page)
 			free_page((unsigned long)buffer->page);
+		mutex_destroy(&buffer->mutex);
 		kfree(buffer);
 	}
 	return 0;
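On the client side, a show callback only ever sees one buffer page, and after this change at most 4096 bytes of it are usable. A hedged sketch (to_my_item() and its value field are hypothetical; SIMPLE_ATTR_SIZE itself is private to fs/configfs/file.c, so clients simply stay under 4k):

    static ssize_t my_item_show_attribute(struct config_item *item,
                                          struct configfs_attribute *attr,
                                          char *page)
    {
            struct my_item *mi = to_my_item(item);

            /* scnprintf() guarantees we stay within the 4k buffer */
            return scnprintf(page, 4096, "%d\n", mi->value);
    }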
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 24421209f854..76dc4c3e5d51 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -62,7 +62,6 @@ void config_item_init(struct config_item * item)
  * dynamically allocated string that @item->ci_name points to.
  * Otherwise, use the static @item->ci_namebuf array.
  */
-
 int config_item_set_name(struct config_item * item, const char * fmt, ...)
 {
 	int error = 0;
@@ -139,12 +138,7 @@ struct config_item * config_item_get(struct config_item * item)
 	return item;
 }
 
-/**
- * config_item_cleanup - free config_item resources.
- * @item: item.
- */
-
-void config_item_cleanup(struct config_item * item)
+static void config_item_cleanup(struct config_item * item)
 {
 	struct config_item_type * t = item->ci_type;
 	struct config_group * s = item->ci_group;
@@ -179,39 +173,35 @@ void config_item_put(struct config_item * item)
 	kref_put(&item->ci_kref, config_item_release);
 }
 
-
 /**
  * config_group_init - initialize a group for use
  * @k: group
  */
-
 void config_group_init(struct config_group *group)
 {
 	config_item_init(&group->cg_item);
 	INIT_LIST_HEAD(&group->cg_children);
 }
 
-
 /**
- * config_group_find_obj - search for item in group.
+ * config_group_find_item - search for item in group.
  * @group: group we're looking in.
  * @name: item's name.
  *
- * Lock group via @group->cg_subsys, and iterate over @group->cg_list,
- * looking for a matching config_item. If matching item is found
- * take a reference and return the item.
+ * Iterate over @group->cg_children, looking for a matching config_item.
+ * If a matching item is found, take a reference and return the item.
+ * The caller must have locked the group via @group->cg_subsys->su_mutex.
  */
-
-struct config_item * config_group_find_obj(struct config_group * group, const char * name)
+struct config_item *config_group_find_item(struct config_group *group,
+					   const char *name)
 {
 	struct list_head * entry;
 	struct config_item * ret = NULL;
 
-	/* XXX LOCKING! */
 	list_for_each(entry,&group->cg_children) {
 		struct config_item * item = to_item(entry);
 		if (config_item_name(item) &&
 		    !strcmp(config_item_name(item), name)) {
 			ret = config_item_get(item);
 			break;
 		}
@@ -219,9 +209,8 @@ struct config_item * config_group_find_obj(struct config_group * group, const ch
 	return ret;
 }
 
-
 EXPORT_SYMBOL(config_item_init);
 EXPORT_SYMBOL(config_group_init);
 EXPORT_SYMBOL(config_item_get);
 EXPORT_SYMBOL(config_item_put);
-EXPORT_SYMBOL(config_group_find_obj);
+EXPORT_SYMBOL(config_group_find_item);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 5069b2cb5a1f..2f8e3c81bc19 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -133,14 +133,6 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
 	return len;
 }
 
-#define __CONFIGFS_ATTR(_name,_mode,_read,_write) { \
-	.attr   = { .ca_name = __stringify(_name), \
-		    .ca_mode = _mode, \
-		    .ca_owner = THIS_MODULE }, \
-	.show   = _read, \
-	.store  = _write, \
-}
-
 #define CLUSTER_ATTR(name, check_zero) \
 static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \
 { \
@@ -615,7 +607,7 @@ static struct clusters clusters_root = {
 int dlm_config_init(void)
 {
 	config_group_init(&clusters_root.subsys.su_group);
-	init_MUTEX(&clusters_root.subsys.su_sem);
+	mutex_init(&clusters_root.subsys.su_mutex);
 	return configfs_register_subsystem(&clusters_root.subsys);
 }
 
@@ -759,9 +751,9 @@ static struct space *get_space(char *name)
 	if (!space_list)
 		return NULL;
 
-	down(&space_list->cg_subsys->su_sem);
-	i = config_group_find_obj(space_list, name);
-	up(&space_list->cg_subsys->su_sem);
+	mutex_lock(&space_list->cg_subsys->su_mutex);
+	i = config_group_find_item(space_list, name);
+	mutex_unlock(&space_list->cg_subsys->su_mutex);
 
 	return to_space(i);
 }
@@ -780,7 +772,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
 	if (!comm_list)
 		return NULL;
 
-	down(&clusters_root.subsys.su_sem);
+	mutex_lock(&clusters_root.subsys.su_mutex);
 
 	list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
 		cm = to_comm(i);
@@ -800,7 +792,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
 			break;
 		}
 	}
-	up(&clusters_root.subsys.su_sem);
+	mutex_unlock(&clusters_root.subsys.su_mutex);
 
 	if (!found)
 		cm = NULL;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19712a7d145f..f5e11f4fa952 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -50,6 +50,8 @@
 #include "buffer_head_io.h"
 
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
+static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
+					 struct ocfs2_extent_block *eb);
 
 /*
  * Structures which describe a path through a btree, and functions to
@@ -117,6 +119,31 @@ static void ocfs2_free_path(struct ocfs2_path *path)
 }
 
 /*
+ * Copy all the elements of src into dest. After this call, src could be
+ * freed without affecting dest.
+ *
+ * Both paths should have the same root. Any non-root elements of dest
+ * will be freed.
+ */
+static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
+{
+	int i;
+
+	BUG_ON(path_root_bh(dest) != path_root_bh(src));
+	BUG_ON(path_root_el(dest) != path_root_el(src));
+
+	ocfs2_reinit_path(dest, 1);
+
+	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
+		dest->p_node[i].bh = src->p_node[i].bh;
+		dest->p_node[i].el = src->p_node[i].el;
+
+		if (dest->p_node[i].bh)
+			get_bh(dest->p_node[i].bh);
+	}
+}
+
+/*
  * Make the *dest path the same as src and re-initialize src path to
  * have a root only.
  */
@@ -212,10 +239,41 @@ out:
 	return ret;
 }
 
+/*
+ * Return the index of the extent record which contains cluster #v_cluster.
+ * -1 is returned if it was not found.
+ *
+ * Should work fine on interior and exterior nodes.
+ */
+int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
+{
+	int ret = -1;
+	int i;
+	struct ocfs2_extent_rec *rec;
+	u32 rec_end, rec_start, clusters;
+
+	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
+
+		rec_start = le32_to_cpu(rec->e_cpos);
+		clusters = ocfs2_rec_clusters(el, rec);
+
+		rec_end = rec_start + clusters;
+
+		if (v_cluster >= rec_start && v_cluster < rec_end) {
+			ret = i;
+			break;
+		}
+	}
+
+	return ret;
+}
+
 enum ocfs2_contig_type {
 	CONTIG_NONE = 0,
 	CONTIG_LEFT,
-	CONTIG_RIGHT
+	CONTIG_RIGHT,
+	CONTIG_LEFTRIGHT,
 };
 
 
@@ -253,6 +311,14 @@ static enum ocfs2_contig_type
 {
 	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
 
+	/*
+	 * Refuse to coalesce extent records with different flag
+	 * fields - we don't want to mix unwritten extents with user
+	 * data.
+	 */
+	if (ext->e_flags != insert_rec->e_flags)
+		return CONTIG_NONE;
+
 	if (ocfs2_extents_adjacent(ext, insert_rec) &&
 	    ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
 			return CONTIG_RIGHT;
@@ -277,7 +343,14 @@ enum ocfs2_append_type {
 	APPEND_TAIL,
 };
 
+enum ocfs2_split_type {
+	SPLIT_NONE = 0,
+	SPLIT_LEFT,
+	SPLIT_RIGHT,
+};
+
 struct ocfs2_insert_type {
+	enum ocfs2_split_type	ins_split;
 	enum ocfs2_append_type	ins_appending;
 	enum ocfs2_contig_type	ins_contig;
 	int			ins_contig_index;
@@ -285,6 +358,13 @@ struct ocfs2_insert_type {
 	int			ins_tree_depth;
 };
 
+struct ocfs2_merge_ctxt {
+	enum ocfs2_contig_type	c_contig_type;
+	int			c_has_empty_extent;
+	int			c_split_covers_rec;
+	int			c_used_tail_recs;
+};
+
 /*
  * How many free extents have we got before we need more meta data?
 */
@@ -384,13 +464,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
 		strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
 		eb->h_blkno = cpu_to_le64(first_blkno);
 		eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
-
-#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
-		/* we always use slot zero's suballocator */
-		eb->h_suballoc_slot = 0;
-#else
 		eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
-#endif
 		eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
 		eb->h_list.l_count =
 			cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -461,7 +535,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 			    struct inode *inode,
 			    struct buffer_head *fe_bh,
 			    struct buffer_head *eb_bh,
-			    struct buffer_head *last_eb_bh,
+			    struct buffer_head **last_eb_bh,
 			    struct ocfs2_alloc_context *meta_ac)
 {
 	int status, new_blocks, i;
@@ -476,7 +550,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 
 	mlog_entry_void();
 
-	BUG_ON(!last_eb_bh);
+	BUG_ON(!last_eb_bh || !*last_eb_bh);
 
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
@@ -507,7 +581,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
+	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
 	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
 
 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -568,7 +642,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	 * journal_dirty erroring as it won't unless we've aborted the
 	 * handle (in which case we would never be here) so reserving
 	 * the write with journal_access is all we need to do. */
-	status = ocfs2_journal_access(handle, inode, last_eb_bh,
+	status = ocfs2_journal_access(handle, inode, *last_eb_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -601,10 +675,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	 * next_leaf on the previously last-extent-block. */
 	fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
 
-	eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
 
-	status = ocfs2_journal_dirty(handle, last_eb_bh);
+	status = ocfs2_journal_dirty(handle, *last_eb_bh);
 	if (status < 0)
 		mlog_errno(status);
 	status = ocfs2_journal_dirty(handle, fe_bh);
@@ -616,6 +690,14 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		mlog_errno(status);
 	}
 
+	/*
+	 * Some callers want to track the rightmost leaf so pass it
+	 * back here.
+	 */
+	brelse(*last_eb_bh);
+	get_bh(new_eb_bhs[0]);
+	*last_eb_bh = new_eb_bhs[0];
+
 	status = 0;
 bail:
 	if (new_eb_bhs) {
@@ -829,6 +911,87 @@ bail:
 }
 
 /*
+ * Grow a b-tree so that it has more records.
+ *
+ * We might shift the tree depth in which case existing paths should
+ * be considered invalid.
+ *
+ * Tree depth after the grow is returned via *final_depth.
+ *
+ * *last_eb_bh will be updated by ocfs2_add_branch().
+ */
+static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
+			   struct buffer_head *di_bh, int *final_depth,
+			   struct buffer_head **last_eb_bh,
+			   struct ocfs2_alloc_context *meta_ac)
+{
+	int ret, shift;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *bh = NULL;
+
+	BUG_ON(meta_ac == NULL);
+
+	shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
+	if (shift < 0) {
+		ret = shift;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* We traveled all the way to the bottom of the allocation tree
+	 * and didn't find room for any more extents - we need to add
+	 * another tree level */
+	if (shift) {
+		BUG_ON(bh);
+		mlog(0, "need to shift tree depth (current = %d)\n", depth);
+
+		/* ocfs2_shift_tree_depth will return us a buffer with
+		 * the new extent block (so we can pass that to
+		 * ocfs2_add_branch). */
+		ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
+					     meta_ac, &bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		depth++;
+		if (depth == 1) {
+			/*
+			 * Special case: we have room now if we shifted from
+			 * tree_depth 0, so no more work needs to be done.
+			 *
+			 * We won't be calling add_branch, so pass
+			 * back *last_eb_bh as the new leaf. At depth
+			 * zero, it should always be null so there's
+			 * no reason to brelse.
+			 */
+			BUG_ON(*last_eb_bh);
+			get_bh(bh);
+			*last_eb_bh = bh;
+			goto out;
+		}
+	}
+
+	/* call ocfs2_add_branch to add the final part of the tree with
+	 * the new data. */
+	mlog(0, "add branch. bh = %p\n", bh);
+	ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
+			       meta_ac);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	if (final_depth)
+		*final_depth = depth;
+	brelse(bh);
+	return ret;
+}
+
+/*
  * This is only valid for leaf nodes, which are the only ones that can
  * have empty extents anyway.
 */
@@ -934,6 +1097,22 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
 
 }
 
+static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
+{
+	int size, num_recs = le16_to_cpu(el->l_next_free_rec);
+
+	BUG_ON(num_recs == 0);
+
+	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+		num_recs--;
+		size = num_recs * sizeof(struct ocfs2_extent_rec);
+		memmove(&el->l_recs[0], &el->l_recs[1], size);
+		memset(&el->l_recs[num_recs], 0,
+		       sizeof(struct ocfs2_extent_rec));
+		el->l_next_free_rec = cpu_to_le16(num_recs);
+	}
+}
+
 /*
  * Create an empty extent record.
 *
@@ -1211,6 +1390,10 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
 	 * immediately to their right.
 	 */
 	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
+	if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
+		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
+		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
+	}
 	left_clusters -= le32_to_cpu(left_rec->e_cpos);
 	left_rec->e_int_clusters = cpu_to_le32(left_clusters);
 
@@ -1531,10 +1714,16 @@ out: | |||
1531 | return ret; | 1714 | return ret; |
1532 | } | 1715 | } |
1533 | 1716 | ||
1717 | /* | ||
1718 | * Extend the transaction by enough credits to complete the rotation, | ||
1719 | * and still leave at least the original number of credits allocated | ||
1720 | * to this transaction. | ||
1721 | */ | ||
1534 | static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, | 1722 | static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, |
1723 | int op_credits, | ||
1535 | struct ocfs2_path *path) | 1724 | struct ocfs2_path *path) |
1536 | { | 1725 | { |
1537 | int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; | 1726 | int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; |
1538 | 1727 | ||
1539 | if (handle->h_buffer_credits < credits) | 1728 | if (handle->h_buffer_credits < credits) |
1540 | return ocfs2_extend_trans(handle, credits); | 1729 | return ocfs2_extend_trans(handle, credits); |
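
The credit arithmetic here is worth spelling out; a hedged model (not a kernel API) of the same computation:

	/* Each level between the subtree root and the leaves dirties a left
	 * and a right buffer (hence the * 2), the shared subtree root adds
	 * one more, and op_credits re-reserves the caller's original
	 * allocation so the post-rotate work is not starved. */
	static int rotate_credits_needed(int tree_depth, int subtree_depth,
					 int op_credits)
	{
		return (tree_depth - subtree_depth) * 2 + 1 + op_credits;
	}
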
@@ -1568,6 +1757,29 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, | |||
1568 | return 0; | 1757 | return 0; |
1569 | } | 1758 | } |
1570 | 1759 | ||
1760 | static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos) | ||
1761 | { | ||
1762 | int next_free = le16_to_cpu(el->l_next_free_rec); | ||
1763 | unsigned int range; | ||
1764 | struct ocfs2_extent_rec *rec; | ||
1765 | |||
1766 | if (next_free == 0) | ||
1767 | return 0; | ||
1768 | |||
1769 | rec = &el->l_recs[0]; | ||
1770 | if (ocfs2_is_empty_extent(rec)) { | ||
1771 | /* Empty list. */ | ||
1772 | if (next_free == 1) | ||
1773 | return 0; | ||
1774 | rec = &el->l_recs[1]; | ||
1775 | } | ||
1776 | |||
1777 | range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); | ||
1778 | if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) | ||
1779 | return 1; | ||
1780 | return 0; | ||
1781 | } | ||
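
The containment test reduces to a half-open interval check; a standalone sketch with hypothetical names:

	/* A record at rec_cpos covering rec_clusters clusters spans the
	 * half-open range [rec_cpos, rec_cpos + rec_clusters). */
	static int rec_contains(unsigned int rec_cpos, unsigned int rec_clusters,
				unsigned int cpos)
	{
		return cpos >= rec_cpos && cpos < rec_cpos + rec_clusters;
	}

	/* e.g. a record at cpos 100 with 8 clusters contains 100..107, not 108. */
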
1782 | |||
1571 | /* | 1783 | /* |
1572 | * Rotate all the records in a btree right one record, starting at insert_cpos. | 1784 | * Rotate all the records in a btree right one record, starting at insert_cpos. |
1573 | * | 1785 | * |
@@ -1586,11 +1798,12 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, | |||
1586 | */ | 1798 | */ |
1587 | static int ocfs2_rotate_tree_right(struct inode *inode, | 1799 | static int ocfs2_rotate_tree_right(struct inode *inode, |
1588 | handle_t *handle, | 1800 | handle_t *handle, |
1801 | enum ocfs2_split_type split, | ||
1589 | u32 insert_cpos, | 1802 | u32 insert_cpos, |
1590 | struct ocfs2_path *right_path, | 1803 | struct ocfs2_path *right_path, |
1591 | struct ocfs2_path **ret_left_path) | 1804 | struct ocfs2_path **ret_left_path) |
1592 | { | 1805 | { |
1593 | int ret, start; | 1806 | int ret, start, orig_credits = handle->h_buffer_credits; |
1594 | u32 cpos; | 1807 | u32 cpos; |
1595 | struct ocfs2_path *left_path = NULL; | 1808 | struct ocfs2_path *left_path = NULL; |
1596 | 1809 | ||
@@ -1657,9 +1870,9 @@ static int ocfs2_rotate_tree_right(struct inode *inode, | |||
1657 | (unsigned long long) | 1870 | (unsigned long long) |
1658 | path_leaf_bh(left_path)->b_blocknr); | 1871 | path_leaf_bh(left_path)->b_blocknr); |
1659 | 1872 | ||
1660 | if (ocfs2_rotate_requires_path_adjustment(left_path, | 1873 | if (split == SPLIT_NONE && |
1874 | ocfs2_rotate_requires_path_adjustment(left_path, | ||
1661 | insert_cpos)) { | 1875 | insert_cpos)) { |
1662 | mlog(0, "Path adjustment required\n"); | ||
1663 | 1876 | ||
1664 | /* | 1877 | /* |
1665 | * We've rotated the tree as much as we | 1878 | * We've rotated the tree as much as we |
@@ -1687,7 +1900,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, | |||
1687 | right_path->p_tree_depth); | 1900 | right_path->p_tree_depth); |
1688 | 1901 | ||
1689 | ret = ocfs2_extend_rotate_transaction(handle, start, | 1902 | ret = ocfs2_extend_rotate_transaction(handle, start, |
1690 | right_path); | 1903 | orig_credits, right_path); |
1691 | if (ret) { | 1904 | if (ret) { |
1692 | mlog_errno(ret); | 1905 | mlog_errno(ret); |
1693 | goto out; | 1906 | goto out; |
@@ -1700,6 +1913,24 @@ static int ocfs2_rotate_tree_right(struct inode *inode, | |||
1700 | goto out; | 1913 | goto out; |
1701 | } | 1914 | } |
1702 | 1915 | ||
1916 | if (split != SPLIT_NONE && | ||
1917 | ocfs2_leftmost_rec_contains(path_leaf_el(right_path), | ||
1918 | insert_cpos)) { | ||
1919 | /* | ||
1920 | * A rotate moves the rightmost left leaf | ||
1921 | * record over to the leftmost right leaf | ||
1922 | * slot. If we're doing an extent split | ||
1923 | * instead of a real insert, then we have to | ||
1924 | * check that the extent to be split wasn't | ||
1925 | * just moved over. If it was, then we can | ||
1926 | * exit here, passing left_path back - | ||
1927 | * ocfs2_split_extent() is smart enough to | ||
1928 | * search both leaves. | ||
1929 | */ | ||
1930 | *ret_left_path = left_path; | ||
1931 | goto out_ret_path; | ||
1932 | } | ||
1933 | |||
1703 | /* | 1934 | /* |
1704 | * There is no need to re-read the next right path | 1935 | * There is no need to re-read the next right path |
1705 | * as we know that it'll be our current left | 1936 | * as we know that it'll be our current left |
@@ -1722,6 +1953,1031 @@ out_ret_path: | |||
1722 | return ret; | 1953 | return ret; |
1723 | } | 1954 | } |
1724 | 1955 | ||
1956 | static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, | ||
1957 | struct ocfs2_path *path) | ||
1958 | { | ||
1959 | int i, idx; | ||
1960 | struct ocfs2_extent_rec *rec; | ||
1961 | struct ocfs2_extent_list *el; | ||
1962 | struct ocfs2_extent_block *eb; | ||
1963 | u32 range; | ||
1964 | |||
1965 | /* Path should always be rightmost. */ | ||
1966 | eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; | ||
1967 | BUG_ON(eb->h_next_leaf_blk != 0ULL); | ||
1968 | |||
1969 | el = &eb->h_list; | ||
1970 | BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); | ||
1971 | idx = le16_to_cpu(el->l_next_free_rec) - 1; | ||
1972 | rec = &el->l_recs[idx]; | ||
1973 | range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); | ||
1974 | |||
1975 | for (i = 0; i < path->p_tree_depth; i++) { | ||
1976 | el = path->p_node[i].el; | ||
1977 | idx = le16_to_cpu(el->l_next_free_rec) - 1; | ||
1978 | rec = &el->l_recs[idx]; | ||
1979 | |||
1980 | rec->e_int_clusters = cpu_to_le32(range); | ||
1981 | le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos)); | ||
1982 | |||
1983 | ocfs2_journal_dirty(handle, path->p_node[i].bh); | ||
1984 | } | ||
1985 | } | ||
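
Every interior record on the rightmost path has to cover up to the new end of the rightmost leaf; a minimal model of that fixup (hypothetical types, plain integers for the le32 fields):

	struct edge { unsigned int cpos, int_clusters; };

	static void update_edges_model(struct edge *rightmost_path, int depth,
				       unsigned int leaf_range_end)
	{
		int i;

		/* Each interior record spans from its own cpos to the end of
		 * the rightmost leaf record below it. */
		for (i = 0; i < depth; i++)
			rightmost_path[i].int_clusters =
				leaf_range_end - rightmost_path[i].cpos;
	}
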
1986 | |||
1987 | static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, | ||
1988 | struct ocfs2_cached_dealloc_ctxt *dealloc, | ||
1989 | struct ocfs2_path *path, int unlink_start) | ||
1990 | { | ||
1991 | int ret, i; | ||
1992 | struct ocfs2_extent_block *eb; | ||
1993 | struct ocfs2_extent_list *el; | ||
1994 | struct buffer_head *bh; | ||
1995 | |||
1996 | for (i = unlink_start; i < path_num_items(path); i++) { | ||
1997 | bh = path->p_node[i].bh; | ||
1998 | |||
1999 | eb = (struct ocfs2_extent_block *)bh->b_data; | ||
2000 | /* | ||
2001 | * Not all nodes might have had their final count | ||
2002 | * decremented by the caller - handle this here. | ||
2003 | */ | ||
2004 | el = &eb->h_list; | ||
2005 | if (le16_to_cpu(el->l_next_free_rec) > 1) { | ||
2006 | mlog(ML_ERROR, | ||
2007 | "Inode %llu, attempted to remove extent block " | ||
2008 | "%llu with %u records\n", | ||
2009 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
2010 | (unsigned long long)le64_to_cpu(eb->h_blkno), | ||
2011 | le16_to_cpu(el->l_next_free_rec)); | ||
2012 | |||
2013 | ocfs2_journal_dirty(handle, bh); | ||
2014 | ocfs2_remove_from_cache(inode, bh); | ||
2015 | continue; | ||
2016 | } | ||
2017 | |||
2018 | el->l_next_free_rec = 0; | ||
2019 | memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); | ||
2020 | |||
2021 | ocfs2_journal_dirty(handle, bh); | ||
2022 | |||
2023 | ret = ocfs2_cache_extent_block_free(dealloc, eb); | ||
2024 | if (ret) | ||
2025 | mlog_errno(ret); | ||
2026 | |||
2027 | ocfs2_remove_from_cache(inode, bh); | ||
2028 | } | ||
2029 | } | ||
2030 | |||
2031 | static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle, | ||
2032 | struct ocfs2_path *left_path, | ||
2033 | struct ocfs2_path *right_path, | ||
2034 | int subtree_index, | ||
2035 | struct ocfs2_cached_dealloc_ctxt *dealloc) | ||
2036 | { | ||
2037 | int i; | ||
2038 | struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; | ||
2039 | struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el; | ||
2040 | struct ocfs2_extent_list *el; | ||
2041 | struct ocfs2_extent_block *eb; | ||
2042 | |||
2043 | el = path_leaf_el(left_path); | ||
2044 | |||
2045 | eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data; | ||
2046 | |||
2047 | for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) | ||
2048 | if (root_el->l_recs[i].e_blkno == eb->h_blkno) | ||
2049 | break; | ||
2050 | |||
2051 | BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec)); | ||
2052 | |||
2053 | memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); | ||
2054 | le16_add_cpu(&root_el->l_next_free_rec, -1); | ||
2055 | |||
2056 | eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; | ||
2057 | eb->h_next_leaf_blk = 0; | ||
2058 | |||
2059 | ocfs2_journal_dirty(handle, root_bh); | ||
2060 | ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); | ||
2061 | |||
2062 | ocfs2_unlink_path(inode, handle, dealloc, right_path, | ||
2063 | subtree_index + 1); | ||
2064 | } | ||
2065 | |||
2066 | static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, | ||
2067 | struct ocfs2_path *left_path, | ||
2068 | struct ocfs2_path *right_path, | ||
2069 | int subtree_index, | ||
2070 | struct ocfs2_cached_dealloc_ctxt *dealloc, | ||
2071 | int *deleted) | ||
2072 | { | ||
2073 | int ret, i, del_right_subtree = 0, right_has_empty = 0; | ||
2074 | struct buffer_head *root_bh, *di_bh = path_root_bh(right_path); | ||
2075 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | ||
2076 | struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; | ||
2077 | struct ocfs2_extent_block *eb; | ||
2078 | |||
2079 | *deleted = 0; | ||
2080 | |||
2081 | right_leaf_el = path_leaf_el(right_path); | ||
2082 | left_leaf_el = path_leaf_el(left_path); | ||
2083 | root_bh = left_path->p_node[subtree_index].bh; | ||
2084 | BUG_ON(root_bh != right_path->p_node[subtree_index].bh); | ||
2085 | |||
2086 | if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0])) | ||
2087 | return 0; | ||
2088 | |||
2089 | eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data; | ||
2090 | if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) { | ||
2091 | /* | ||
2092 | * It's legal for us to proceed if the right leaf is | ||
2093 | * the rightmost one and it has an empty extent. There | ||
2094 | * are two cases to handle - whether the leaf will be | ||
2095 | * empty after removal or not. If the leaf isn't empty | ||
2096 | * then just remove the empty extent up front. The | ||
2097 | * next block will handle empty leaves by flagging | ||
2098 | * them for unlink. | ||
2099 | * | ||
2100 | * Non-rightmost leaves will return -EAGAIN and the | ||
2101 | * caller can manually move the subtree and retry. | ||
2102 | */ | ||
2103 | |||
2104 | if (eb->h_next_leaf_blk != 0ULL) | ||
2105 | return -EAGAIN; | ||
2106 | |||
2107 | if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { | ||
2108 | ret = ocfs2_journal_access(handle, inode, | ||
2109 | path_leaf_bh(right_path), | ||
2110 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2111 | if (ret) { | ||
2112 | mlog_errno(ret); | ||
2113 | goto out; | ||
2114 | } | ||
2115 | |||
2116 | ocfs2_remove_empty_extent(right_leaf_el); | ||
2117 | } else | ||
2118 | right_has_empty = 1; | ||
2119 | } | ||
2120 | |||
2121 | if (eb->h_next_leaf_blk == 0ULL && | ||
2122 | le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) { | ||
2123 | /* | ||
2124 | * We have to update i_last_eb_blk during the | ||
2125 | * metadata delete. | ||
2126 | */ | ||
2127 | ret = ocfs2_journal_access(handle, inode, di_bh, | ||
2128 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2129 | if (ret) { | ||
2130 | mlog_errno(ret); | ||
2131 | goto out; | ||
2132 | } | ||
2133 | |||
2134 | del_right_subtree = 1; | ||
2135 | } | ||
2136 | |||
2137 | /* | ||
2138 | * Getting here with an empty extent in the right path implies | ||
2139 | * that it's the rightmost path and will be deleted. | ||
2140 | */ | ||
2141 | BUG_ON(right_has_empty && !del_right_subtree); | ||
2142 | |||
2143 | ret = ocfs2_journal_access(handle, inode, root_bh, | ||
2144 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2145 | if (ret) { | ||
2146 | mlog_errno(ret); | ||
2147 | goto out; | ||
2148 | } | ||
2149 | |||
2150 | for (i = subtree_index + 1; i < path_num_items(right_path); i++) { | ||
2151 | ret = ocfs2_journal_access(handle, inode, | ||
2152 | right_path->p_node[i].bh, | ||
2153 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2154 | if (ret) { | ||
2155 | mlog_errno(ret); | ||
2156 | goto out; | ||
2157 | } | ||
2158 | |||
2159 | ret = ocfs2_journal_access(handle, inode, | ||
2160 | left_path->p_node[i].bh, | ||
2161 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2162 | if (ret) { | ||
2163 | mlog_errno(ret); | ||
2164 | goto out; | ||
2165 | } | ||
2166 | } | ||
2167 | |||
2168 | if (!right_has_empty) { | ||
2169 | /* | ||
2170 | * Only do this if we're moving a real | ||
2171 | * record. Otherwise, the action is delayed until | ||
2172 | * after removal of the right path in which case we | ||
2173 | * can do a simple shift to remove the empty extent. | ||
2174 | */ | ||
2175 | ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]); | ||
2176 | memset(&right_leaf_el->l_recs[0], 0, | ||
2177 | sizeof(struct ocfs2_extent_rec)); | ||
2178 | } | ||
2179 | if (eb->h_next_leaf_blk == 0ULL) { | ||
2180 | /* | ||
2181 | * Move recs over to get rid of empty extent, decrease | ||
2182 | * next_free. This is allowed to remove the last | ||
2183 | * extent in our leaf (setting l_next_free_rec to | ||
2184 | * zero) - the delete code below won't care. | ||
2185 | */ | ||
2186 | ocfs2_remove_empty_extent(right_leaf_el); | ||
2187 | } | ||
2188 | |||
2189 | ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); | ||
2190 | if (ret) | ||
2191 | mlog_errno(ret); | ||
2192 | ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); | ||
2193 | if (ret) | ||
2194 | mlog_errno(ret); | ||
2195 | |||
2196 | if (del_right_subtree) { | ||
2197 | ocfs2_unlink_subtree(inode, handle, left_path, right_path, | ||
2198 | subtree_index, dealloc); | ||
2199 | ocfs2_update_edge_lengths(inode, handle, left_path); | ||
2200 | |||
2201 | eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; | ||
2202 | di->i_last_eb_blk = eb->h_blkno; | ||
2203 | |||
2204 | /* | ||
2205 | * Removal of the extent in the left leaf was skipped | ||
2206 | * above so we could delete the right path | ||
2207 | * 1st. | ||
2208 | */ | ||
2209 | if (right_has_empty) | ||
2210 | ocfs2_remove_empty_extent(left_leaf_el); | ||
2211 | |||
2212 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
2213 | if (ret) | ||
2214 | mlog_errno(ret); | ||
2215 | |||
2216 | *deleted = 1; | ||
2217 | } else | ||
2218 | ocfs2_complete_edge_insert(inode, handle, left_path, right_path, | ||
2219 | subtree_index); | ||
2220 | |||
2221 | out: | ||
2222 | return ret; | ||
2223 | } | ||
2224 | |||
2225 | /* | ||
2226 | * Given a full path, determine what cpos value would return us a path | ||
2227 | * containing the leaf immediately to the right of the current one. | ||
2228 | * | ||
2229 | * Will return zero if the path passed in is already the rightmost path. | ||
2230 | * | ||
2231 | * This looks similar to, but is subtly different from, | ||
2232 | * ocfs2_find_cpos_for_left_leaf(). | ||
2233 | */ | ||
2234 | static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, | ||
2235 | struct ocfs2_path *path, u32 *cpos) | ||
2236 | { | ||
2237 | int i, j, ret = 0; | ||
2238 | u64 blkno; | ||
2239 | struct ocfs2_extent_list *el; | ||
2240 | |||
2241 | *cpos = 0; | ||
2242 | |||
2243 | if (path->p_tree_depth == 0) | ||
2244 | return 0; | ||
2245 | |||
2246 | blkno = path_leaf_bh(path)->b_blocknr; | ||
2247 | |||
2248 | /* Start at the tree node just above the leaf and work our way up. */ | ||
2249 | i = path->p_tree_depth - 1; | ||
2250 | while (i >= 0) { | ||
2251 | int next_free; | ||
2252 | |||
2253 | el = path->p_node[i].el; | ||
2254 | |||
2255 | /* | ||
2256 | * Find the extent record just after the one in our | ||
2257 | * path. | ||
2258 | */ | ||
2259 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
2260 | for (j = 0; j < next_free; j++) { | ||
2261 | if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { | ||
2262 | if (j == (next_free - 1)) { | ||
2263 | if (i == 0) { | ||
2264 | /* | ||
2265 | * We've determined that the | ||
2266 | * path specified is already | ||
2267 | * the rightmost one - return a | ||
2268 | * cpos of zero. | ||
2269 | */ | ||
2270 | goto out; | ||
2271 | } | ||
2272 | /* | ||
2273 | * The rightmost record points to our | ||
2274 | * leaf - we need to travel up the | ||
2275 | * tree one level. | ||
2276 | */ | ||
2277 | goto next_node; | ||
2278 | } | ||
2279 | |||
2280 | *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos); | ||
2281 | goto out; | ||
2282 | } | ||
2283 | } | ||
2284 | |||
2285 | /* | ||
2286 | * If we got here, we never found a valid node where | ||
2287 | * the tree indicated one should be. | ||
2288 | */ | ||
2289 | ocfs2_error(sb, | ||
2290 | "Invalid extent tree at extent block %llu\n", | ||
2291 | (unsigned long long)blkno); | ||
2292 | ret = -EROFS; | ||
2293 | goto out; | ||
2294 | |||
2295 | next_node: | ||
2296 | blkno = path->p_node[i].bh->b_blocknr; | ||
2297 | i--; | ||
2298 | } | ||
2299 | |||
2300 | out: | ||
2301 | return ret; | ||
2302 | } | ||
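
The walk above can be pictured with a compact model: each path level remembers which child record it descended through, and the first level (scanning upward) where that child is not the last record yields the right neighbor's cpos. A sketch with hypothetical names:

	struct level {
		int child_idx;			/* record we descended through */
		int nr_recs;			/* l_next_free_rec at this level */
		unsigned int *rec_cpos;		/* cpos of each record here */
	};

	/* Returns the cpos of the leaf to the right, 0 for the rightmost path. */
	static unsigned int right_leaf_cpos_model(struct level *path, int depth)
	{
		int i;

		for (i = depth - 1; i >= 0; i--) {
			struct level *lv = &path[i];

			if (lv->child_idx < lv->nr_recs - 1)
				return lv->rec_cpos[lv->child_idx + 1];
			/* Last record at this level - try one level up. */
		}
		return 0;
	}
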
2303 | |||
2304 | static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, | ||
2305 | handle_t *handle, | ||
2306 | struct buffer_head *bh, | ||
2307 | struct ocfs2_extent_list *el) | ||
2308 | { | ||
2309 | int ret; | ||
2310 | |||
2311 | if (!ocfs2_is_empty_extent(&el->l_recs[0])) | ||
2312 | return 0; | ||
2313 | |||
2314 | ret = ocfs2_journal_access(handle, inode, bh, | ||
2315 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2316 | if (ret) { | ||
2317 | mlog_errno(ret); | ||
2318 | goto out; | ||
2319 | } | ||
2320 | |||
2321 | ocfs2_remove_empty_extent(el); | ||
2322 | |||
2323 | ret = ocfs2_journal_dirty(handle, bh); | ||
2324 | if (ret) | ||
2325 | mlog_errno(ret); | ||
2326 | |||
2327 | out: | ||
2328 | return ret; | ||
2329 | } | ||
2330 | |||
2331 | static int __ocfs2_rotate_tree_left(struct inode *inode, | ||
2332 | handle_t *handle, int orig_credits, | ||
2333 | struct ocfs2_path *path, | ||
2334 | struct ocfs2_cached_dealloc_ctxt *dealloc, | ||
2335 | struct ocfs2_path **empty_extent_path) | ||
2336 | { | ||
2337 | int ret, subtree_root, deleted; | ||
2338 | u32 right_cpos; | ||
2339 | struct ocfs2_path *left_path = NULL; | ||
2340 | struct ocfs2_path *right_path = NULL; | ||
2341 | |||
2342 | BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); | ||
2343 | |||
2344 | *empty_extent_path = NULL; | ||
2345 | |||
2346 | ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path, | ||
2347 | &right_cpos); | ||
2348 | if (ret) { | ||
2349 | mlog_errno(ret); | ||
2350 | goto out; | ||
2351 | } | ||
2352 | |||
2353 | left_path = ocfs2_new_path(path_root_bh(path), | ||
2354 | path_root_el(path)); | ||
2355 | if (!left_path) { | ||
2356 | ret = -ENOMEM; | ||
2357 | mlog_errno(ret); | ||
2358 | goto out; | ||
2359 | } | ||
2360 | |||
2361 | ocfs2_cp_path(left_path, path); | ||
2362 | |||
2363 | right_path = ocfs2_new_path(path_root_bh(path), | ||
2364 | path_root_el(path)); | ||
2365 | if (!right_path) { | ||
2366 | ret = -ENOMEM; | ||
2367 | mlog_errno(ret); | ||
2368 | goto out; | ||
2369 | } | ||
2370 | |||
2371 | while (right_cpos) { | ||
2372 | ret = ocfs2_find_path(inode, right_path, right_cpos); | ||
2373 | if (ret) { | ||
2374 | mlog_errno(ret); | ||
2375 | goto out; | ||
2376 | } | ||
2377 | |||
2378 | subtree_root = ocfs2_find_subtree_root(inode, left_path, | ||
2379 | right_path); | ||
2380 | |||
2381 | mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", | ||
2382 | subtree_root, | ||
2383 | (unsigned long long) | ||
2384 | right_path->p_node[subtree_root].bh->b_blocknr, | ||
2385 | right_path->p_tree_depth); | ||
2386 | |||
2387 | ret = ocfs2_extend_rotate_transaction(handle, subtree_root, | ||
2388 | orig_credits, left_path); | ||
2389 | if (ret) { | ||
2390 | mlog_errno(ret); | ||
2391 | goto out; | ||
2392 | } | ||
2393 | |||
2394 | ret = ocfs2_rotate_subtree_left(inode, handle, left_path, | ||
2395 | right_path, subtree_root, | ||
2396 | dealloc, &deleted); | ||
2397 | if (ret == -EAGAIN) { | ||
2398 | /* | ||
2399 | * The rotation has to temporarily stop due to | ||
2400 | * the right subtree having an empty | ||
2401 | * extent. Pass it back to the caller for a | ||
2402 | * fixup. | ||
2403 | */ | ||
2404 | *empty_extent_path = right_path; | ||
2405 | right_path = NULL; | ||
2406 | goto out; | ||
2407 | } | ||
2408 | if (ret) { | ||
2409 | mlog_errno(ret); | ||
2410 | goto out; | ||
2411 | } | ||
2412 | |||
2413 | /* | ||
2414 | * The subtree rotate might have removed records on | ||
2415 | * the rightmost edge. If so, then rotation is | ||
2416 | * complete. | ||
2417 | */ | ||
2418 | if (deleted) | ||
2419 | break; | ||
2420 | |||
2421 | ocfs2_mv_path(left_path, right_path); | ||
2422 | |||
2423 | ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, | ||
2424 | &right_cpos); | ||
2425 | if (ret) { | ||
2426 | mlog_errno(ret); | ||
2427 | goto out; | ||
2428 | } | ||
2429 | } | ||
2430 | |||
2431 | out: | ||
2432 | ocfs2_free_path(right_path); | ||
2433 | ocfs2_free_path(left_path); | ||
2434 | |||
2435 | return ret; | ||
2436 | } | ||
2437 | |||
2438 | static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, | ||
2439 | struct ocfs2_path *path, | ||
2440 | struct ocfs2_cached_dealloc_ctxt *dealloc) | ||
2441 | { | ||
2442 | int ret, subtree_index; | ||
2443 | u32 cpos; | ||
2444 | struct ocfs2_path *left_path = NULL; | ||
2445 | struct ocfs2_dinode *di; | ||
2446 | struct ocfs2_extent_block *eb; | ||
2447 | struct ocfs2_extent_list *el; | ||
2448 | |||
2449 | /* | ||
2450 | * XXX: This code assumes that the root is an inode, which is | ||
2451 | * true for now but may change as tree code gets generic. | ||
2452 | */ | ||
2453 | di = (struct ocfs2_dinode *)path_root_bh(path)->b_data; | ||
2454 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
2455 | ret = -EIO; | ||
2456 | ocfs2_error(inode->i_sb, | ||
2457 | "Inode %llu has invalid path root", | ||
2458 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
2459 | goto out; | ||
2460 | } | ||
2461 | |||
2462 | /* | ||
2463 | * There are two ways we handle this, depending on | ||
2464 | * whether path is the only existing one. | ||
2465 | */ | ||
2466 | ret = ocfs2_extend_rotate_transaction(handle, 0, | ||
2467 | handle->h_buffer_credits, | ||
2468 | path); | ||
2469 | if (ret) { | ||
2470 | mlog_errno(ret); | ||
2471 | goto out; | ||
2472 | } | ||
2473 | |||
2474 | ret = ocfs2_journal_access_path(inode, handle, path); | ||
2475 | if (ret) { | ||
2476 | mlog_errno(ret); | ||
2477 | goto out; | ||
2478 | } | ||
2479 | |||
2480 | ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); | ||
2481 | if (ret) { | ||
2482 | mlog_errno(ret); | ||
2483 | goto out; | ||
2484 | } | ||
2485 | |||
2486 | if (cpos) { | ||
2487 | /* | ||
2488 | * We have a path to the left of this one - it needs | ||
2489 | * an update too. | ||
2490 | */ | ||
2491 | left_path = ocfs2_new_path(path_root_bh(path), | ||
2492 | path_root_el(path)); | ||
2493 | if (!left_path) { | ||
2494 | ret = -ENOMEM; | ||
2495 | mlog_errno(ret); | ||
2496 | goto out; | ||
2497 | } | ||
2498 | |||
2499 | ret = ocfs2_find_path(inode, left_path, cpos); | ||
2500 | if (ret) { | ||
2501 | mlog_errno(ret); | ||
2502 | goto out; | ||
2503 | } | ||
2504 | |||
2505 | ret = ocfs2_journal_access_path(inode, handle, left_path); | ||
2506 | if (ret) { | ||
2507 | mlog_errno(ret); | ||
2508 | goto out; | ||
2509 | } | ||
2510 | |||
2511 | subtree_index = ocfs2_find_subtree_root(inode, left_path, path); | ||
2512 | |||
2513 | ocfs2_unlink_subtree(inode, handle, left_path, path, | ||
2514 | subtree_index, dealloc); | ||
2515 | ocfs2_update_edge_lengths(inode, handle, left_path); | ||
2516 | |||
2517 | eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; | ||
2518 | di->i_last_eb_blk = eb->h_blkno; | ||
2519 | } else { | ||
2520 | /* | ||
2521 | * 'path' is also the leftmost path, which | ||
2522 | * means it must be the only one. This gets | ||
2523 | * handled differently because we want to | ||
2524 | * revert the inode back to having extents | ||
2525 | * in-line. | ||
2526 | */ | ||
2527 | ocfs2_unlink_path(inode, handle, dealloc, path, 1); | ||
2528 | |||
2529 | el = &di->id2.i_list; | ||
2530 | el->l_tree_depth = 0; | ||
2531 | el->l_next_free_rec = 0; | ||
2532 | memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); | ||
2533 | |||
2534 | di->i_last_eb_blk = 0; | ||
2535 | } | ||
2536 | |||
2537 | ocfs2_journal_dirty(handle, path_root_bh(path)); | ||
2538 | |||
2539 | out: | ||
2540 | ocfs2_free_path(left_path); | ||
2541 | return ret; | ||
2542 | } | ||
2543 | |||
2544 | /* | ||
2545 | * Left rotation of btree records. | ||
2546 | * | ||
2547 | * In many ways, this is (unsurprisingly) the opposite of right | ||
2548 | * rotation. We start at some non-rightmost path containing an empty | ||
2549 | * extent in the leaf block. The code works its way to the rightmost | ||
2550 | * path by rotating records to the left in every subtree. | ||
2551 | * | ||
2552 | * This is used by any code which reduces the number of extent records | ||
2553 | * in a leaf. After removal, an empty record should be placed in the | ||
2554 | * leftmost list position. | ||
2555 | * | ||
2556 | * This won't handle a length update of the rightmost path records if | ||
2557 | * the rightmost tree leaf record is removed so the caller is | ||
2558 | * responsible for detecting and correcting that. | ||
2559 | */ | ||
2560 | static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, | ||
2561 | struct ocfs2_path *path, | ||
2562 | struct ocfs2_cached_dealloc_ctxt *dealloc) | ||
2563 | { | ||
2564 | int ret, orig_credits = handle->h_buffer_credits; | ||
2565 | struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; | ||
2566 | struct ocfs2_extent_block *eb; | ||
2567 | struct ocfs2_extent_list *el; | ||
2568 | |||
2569 | el = path_leaf_el(path); | ||
2570 | if (!ocfs2_is_empty_extent(&el->l_recs[0])) | ||
2571 | return 0; | ||
2572 | |||
2573 | if (path->p_tree_depth == 0) { | ||
2574 | rightmost_no_delete: | ||
2575 | /* | ||
2576 | * In-inode extents. This is trivially handled, so do | ||
2577 | * it up front. | ||
2578 | */ | ||
2579 | ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, | ||
2580 | path_leaf_bh(path), | ||
2581 | path_leaf_el(path)); | ||
2582 | if (ret) | ||
2583 | mlog_errno(ret); | ||
2584 | goto out; | ||
2585 | } | ||
2586 | |||
2587 | /* | ||
2588 | * Handle the rightmost branch now. There are several cases: | ||
2589 | * 1) simple rotation leaving records in there. That's trivial. | ||
2590 | * 2) rotation requiring a branch delete - there's no more | ||
2591 | * records left. Two cases of this: | ||
2592 | * a) There are branches to the left. | ||
2593 | * b) This is also the leftmost (the only) branch. | ||
2594 | * | ||
2595 | * 1) is handled via ocfs2_rotate_rightmost_leaf_left() | ||
2596 | * 2a) we need the left branch so that we can update it with the unlink | ||
2597 | * 2b) we need to bring the inode back to inline extents. | ||
2598 | */ | ||
2599 | |||
2600 | eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; | ||
2601 | el = &eb->h_list; | ||
2602 | if (eb->h_next_leaf_blk == 0) { | ||
2603 | /* | ||
2604 | * This gets a bit tricky if we're going to delete the | ||
2605 | * rightmost path. Get the other cases out of the way | ||
2606 | * 1st. | ||
2607 | */ | ||
2608 | if (le16_to_cpu(el->l_next_free_rec) > 1) | ||
2609 | goto rightmost_no_delete; | ||
2610 | |||
2611 | if (le16_to_cpu(el->l_next_free_rec) == 0) { | ||
2612 | ret = -EIO; | ||
2613 | ocfs2_error(inode->i_sb, | ||
2614 | "Inode %llu has empty extent block at %llu", | ||
2615 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
2616 | (unsigned long long)le64_to_cpu(eb->h_blkno)); | ||
2617 | goto out; | ||
2618 | } | ||
2619 | |||
2620 | /* | ||
2621 | * XXX: The caller cannot trust "path" any more after | ||
2622 | * this as it will have been deleted. What do we do? | ||
2623 | * | ||
2624 | * In theory the rotate-for-merge code will never get | ||
2625 | * here because it'll always ask for a rotate in a | ||
2626 | * nonempty list. | ||
2627 | */ | ||
2628 | |||
2629 | ret = ocfs2_remove_rightmost_path(inode, handle, path, | ||
2630 | dealloc); | ||
2631 | if (ret) | ||
2632 | mlog_errno(ret); | ||
2633 | goto out; | ||
2634 | } | ||
2635 | |||
2636 | /* | ||
2637 | * Now we can loop, remembering the path we get from -EAGAIN | ||
2638 | * and restarting from there. | ||
2639 | */ | ||
2640 | try_rotate: | ||
2641 | ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, | ||
2642 | dealloc, &restart_path); | ||
2643 | if (ret && ret != -EAGAIN) { | ||
2644 | mlog_errno(ret); | ||
2645 | goto out; | ||
2646 | } | ||
2647 | |||
2648 | while (ret == -EAGAIN) { | ||
2649 | tmp_path = restart_path; | ||
2650 | restart_path = NULL; | ||
2651 | |||
2652 | ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, | ||
2653 | tmp_path, dealloc, | ||
2654 | &restart_path); | ||
2655 | if (ret && ret != -EAGAIN) { | ||
2656 | mlog_errno(ret); | ||
2657 | goto out; | ||
2658 | } | ||
2659 | |||
2660 | ocfs2_free_path(tmp_path); | ||
2661 | tmp_path = NULL; | ||
2662 | |||
2663 | if (ret == 0) | ||
2664 | goto try_rotate; | ||
2665 | } | ||
2666 | |||
2667 | out: | ||
2668 | ocfs2_free_path(tmp_path); | ||
2669 | ocfs2_free_path(restart_path); | ||
2670 | return ret; | ||
2671 | } | ||
2672 | |||
2673 | static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, | ||
2674 | int index) | ||
2675 | { | ||
2676 | struct ocfs2_extent_rec *rec = &el->l_recs[index]; | ||
2677 | unsigned int size; | ||
2678 | |||
2679 | if (rec->e_leaf_clusters == 0) { | ||
2680 | /* | ||
2681 | * We consumed all of the merged-from record. An empty | ||
2682 | * extent cannot exist anywhere but the 1st array | ||
2683 | * position, so move things over if the merged-from | ||
2684 | * record doesn't occupy that position. | ||
2685 | * | ||
2686 | * This creates a new empty extent so the caller | ||
2687 | * should be smart enough to have removed any existing | ||
2688 | * ones. | ||
2689 | */ | ||
2690 | if (index > 0) { | ||
2691 | BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); | ||
2692 | size = index * sizeof(struct ocfs2_extent_rec); | ||
2693 | memmove(&el->l_recs[1], &el->l_recs[0], size); | ||
2694 | } | ||
2695 | |||
2696 | /* | ||
2697 | * Always memset - the caller doesn't check whether it | ||
2698 | * created an empty extent, so there could be junk in | ||
2699 | * the other fields. | ||
2700 | */ | ||
2701 | memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); | ||
2702 | } | ||
2703 | } | ||
2704 | |||
2705 | /* | ||
2706 | * Remove split_rec clusters from the record at index and merge them | ||
2707 | * onto the beginning of the record at index + 1. | ||
2708 | */ | ||
2709 | static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh, | ||
2710 | handle_t *handle, | ||
2711 | struct ocfs2_extent_rec *split_rec, | ||
2712 | struct ocfs2_extent_list *el, int index) | ||
2713 | { | ||
2714 | int ret; | ||
2715 | unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); | ||
2716 | struct ocfs2_extent_rec *left_rec; | ||
2717 | struct ocfs2_extent_rec *right_rec; | ||
2718 | |||
2719 | BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); | ||
2720 | |||
2721 | left_rec = &el->l_recs[index]; | ||
2722 | right_rec = &el->l_recs[index + 1]; | ||
2723 | |||
2724 | ret = ocfs2_journal_access(handle, inode, bh, | ||
2725 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2726 | if (ret) { | ||
2727 | mlog_errno(ret); | ||
2728 | goto out; | ||
2729 | } | ||
2730 | |||
2731 | le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters); | ||
2732 | |||
2733 | le32_add_cpu(&right_rec->e_cpos, -split_clusters); | ||
2734 | le64_add_cpu(&right_rec->e_blkno, | ||
2735 | -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); | ||
2736 | le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters); | ||
2737 | |||
2738 | ocfs2_cleanup_merge(el, index); | ||
2739 | |||
2740 | ret = ocfs2_journal_dirty(handle, bh); | ||
2741 | if (ret) | ||
2742 | mlog_errno(ret); | ||
2743 | |||
2744 | out: | ||
2745 | return ret; | ||
2746 | } | ||
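
The record surgery above is pure boundary motion; a hedged standalone rendering (illustrative types only) makes the invariant visible - the split length moves from the tail of the left record to the head of the right one, and the combined cluster count never changes:

	struct xrec { unsigned int cpos, clusters; unsigned long long blkno; };

	static void merge_right_model(struct xrec *left, struct xrec *right,
				      unsigned int split_clusters,
				      unsigned int blocks_per_cluster)
	{
		/* Trim the split region off the tail of the left record... */
		left->clusters -= split_clusters;

		/* ...and extend the right record backwards over it. */
		right->cpos -= split_clusters;
		right->blkno -= (unsigned long long)split_clusters * blocks_per_cluster;
		right->clusters += split_clusters;
		/* left->clusters + right->clusters is unchanged. */
	}
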
2747 | |||
2748 | /* | ||
2749 | * Remove split_rec clusters from the record at index and merge them | ||
2750 | * onto the tail of the record at index - 1. | ||
2751 | */ | ||
2752 | static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, | ||
2753 | handle_t *handle, | ||
2754 | struct ocfs2_extent_rec *split_rec, | ||
2755 | struct ocfs2_extent_list *el, int index) | ||
2756 | { | ||
2757 | int ret, has_empty_extent = 0; | ||
2758 | unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); | ||
2759 | struct ocfs2_extent_rec *left_rec; | ||
2760 | struct ocfs2_extent_rec *right_rec; | ||
2761 | |||
2762 | BUG_ON(index <= 0); | ||
2763 | |||
2764 | left_rec = &el->l_recs[index - 1]; | ||
2765 | right_rec = &el->l_recs[index]; | ||
2766 | if (ocfs2_is_empty_extent(&el->l_recs[0])) | ||
2767 | has_empty_extent = 1; | ||
2768 | |||
2769 | ret = ocfs2_journal_access(handle, inode, bh, | ||
2770 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2771 | if (ret) { | ||
2772 | mlog_errno(ret); | ||
2773 | goto out; | ||
2774 | } | ||
2775 | |||
2776 | if (has_empty_extent && index == 1) { | ||
2777 | /* | ||
2778 | * The easy case - we can just plop the record right in. | ||
2779 | */ | ||
2780 | *left_rec = *split_rec; | ||
2781 | |||
2782 | has_empty_extent = 0; | ||
2783 | } else { | ||
2784 | le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); | ||
2785 | } | ||
2786 | |||
2787 | le32_add_cpu(&right_rec->e_cpos, split_clusters); | ||
2788 | le64_add_cpu(&right_rec->e_blkno, | ||
2789 | ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); | ||
2790 | le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters); | ||
2791 | |||
2792 | ocfs2_cleanup_merge(el, index); | ||
2793 | |||
2794 | ret = ocfs2_journal_dirty(handle, bh); | ||
2795 | if (ret) | ||
2796 | mlog_errno(ret); | ||
2797 | |||
2798 | out: | ||
2799 | return ret; | ||
2800 | } | ||
2801 | |||
2802 | static int ocfs2_try_to_merge_extent(struct inode *inode, | ||
2803 | handle_t *handle, | ||
2804 | struct ocfs2_path *left_path, | ||
2805 | int split_index, | ||
2806 | struct ocfs2_extent_rec *split_rec, | ||
2807 | struct ocfs2_cached_dealloc_ctxt *dealloc, | ||
2808 | struct ocfs2_merge_ctxt *ctxt) | ||
2809 | |||
2810 | { | ||
2811 | int ret = 0; | ||
2812 | struct ocfs2_extent_list *el = path_leaf_el(left_path); | ||
2813 | struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; | ||
2814 | |||
2815 | BUG_ON(ctxt->c_contig_type == CONTIG_NONE); | ||
2816 | |||
2817 | if (ctxt->c_split_covers_rec) { | ||
2824 | if (ctxt->c_has_empty_extent) { | ||
2825 | /* | ||
2826 | * The merge code will need to create an empty | ||
2827 | * extent to take the place of the newly | ||
2828 | * emptied slot. Remove any pre-existing empty | ||
2829 | * extents - having more than one in a leaf is | ||
2830 | * illegal. | ||
2831 | */ | ||
2832 | ret = ocfs2_rotate_tree_left(inode, handle, left_path, | ||
2833 | dealloc); | ||
2834 | if (ret) { | ||
2835 | mlog_errno(ret); | ||
2836 | goto out; | ||
2837 | } | ||
2838 | split_index--; | ||
2839 | rec = &el->l_recs[split_index]; | ||
2840 | } | ||
2841 | } | ||
2842 | |||
2843 | if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) { | ||
2844 | /* | ||
2845 | * Left-right contig implies this. | ||
2846 | */ | ||
2847 | BUG_ON(!ctxt->c_split_covers_rec); | ||
2848 | BUG_ON(split_index == 0); | ||
2849 | |||
2850 | /* | ||
2851 | * Since the leftright insert always covers the entire | ||
2852 | * extent, this call will delete the insert record | ||
2853 | * entirely, resulting in an empty extent record added to | ||
2854 | * the extent block. | ||
2855 | * | ||
2856 | * Since the adding of an empty extent shifts | ||
2857 | * everything back to the right, there's no need to | ||
2858 | * update split_index here. | ||
2859 | */ | ||
2860 | ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path), | ||
2861 | handle, split_rec, el, split_index); | ||
2862 | if (ret) { | ||
2863 | mlog_errno(ret); | ||
2864 | goto out; | ||
2865 | } | ||
2866 | |||
2867 | /* | ||
2868 | * We can only get this from logic error above. | ||
2869 | */ | ||
2870 | BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); | ||
2871 | |||
2872 | /* | ||
2873 | * The left merge left us with an empty extent, remove | ||
2874 | * it. | ||
2875 | */ | ||
2876 | ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc); | ||
2877 | if (ret) { | ||
2878 | mlog_errno(ret); | ||
2879 | goto out; | ||
2880 | } | ||
2881 | split_index--; | ||
2882 | rec = &el->l_recs[split_index]; | ||
2883 | |||
2884 | /* | ||
2885 | * Note that we don't pass split_rec here on purpose - | ||
2886 | * we've merged it into the left side. | ||
2887 | */ | ||
2888 | ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path), | ||
2889 | handle, rec, el, split_index); | ||
2890 | if (ret) { | ||
2891 | mlog_errno(ret); | ||
2892 | goto out; | ||
2893 | } | ||
2894 | |||
2895 | BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); | ||
2896 | |||
2897 | ret = ocfs2_rotate_tree_left(inode, handle, left_path, | ||
2898 | dealloc); | ||
2899 | /* | ||
2900 | * Error from this last rotate is not critical, so | ||
2901 | * print but don't bubble it up. | ||
2902 | */ | ||
2903 | if (ret) | ||
2904 | mlog_errno(ret); | ||
2905 | ret = 0; | ||
2906 | } else { | ||
2907 | /* | ||
2908 | * Merge a record to the left or right. | ||
2909 | * | ||
2910 | * 'contig_type' is relative to the existing record, | ||
2911 | * so for example, if we're "right contig", it's to | ||
2912 | * the record on the left (hence the left merge). | ||
2913 | */ | ||
2914 | if (ctxt->c_contig_type == CONTIG_RIGHT) { | ||
2915 | ret = ocfs2_merge_rec_left(inode, | ||
2916 | path_leaf_bh(left_path), | ||
2917 | handle, split_rec, el, | ||
2918 | split_index); | ||
2919 | if (ret) { | ||
2920 | mlog_errno(ret); | ||
2921 | goto out; | ||
2922 | } | ||
2923 | } else { | ||
2924 | ret = ocfs2_merge_rec_right(inode, | ||
2925 | path_leaf_bh(left_path), | ||
2926 | handle, split_rec, el, | ||
2927 | split_index); | ||
2928 | if (ret) { | ||
2929 | mlog_errno(ret); | ||
2930 | goto out; | ||
2931 | } | ||
2932 | } | ||
2933 | |||
2934 | if (ctxt->c_split_covers_rec) { | ||
2935 | /* | ||
2936 | * The merge may have left an empty extent in | ||
2937 | * our leaf. Try to rotate it away. | ||
2938 | */ | ||
2939 | ret = ocfs2_rotate_tree_left(inode, handle, left_path, | ||
2940 | dealloc); | ||
2941 | if (ret) | ||
2942 | mlog_errno(ret); | ||
2943 | ret = 0; | ||
2944 | } | ||
2945 | } | ||
2946 | |||
2947 | out: | ||
2948 | return ret; | ||
2949 | } | ||
2950 | |||
2951 | static void ocfs2_subtract_from_rec(struct super_block *sb, | ||
2952 | enum ocfs2_split_type split, | ||
2953 | struct ocfs2_extent_rec *rec, | ||
2954 | struct ocfs2_extent_rec *split_rec) | ||
2955 | { | ||
2956 | u64 len_blocks; | ||
2957 | |||
2958 | len_blocks = ocfs2_clusters_to_blocks(sb, | ||
2959 | le16_to_cpu(split_rec->e_leaf_clusters)); | ||
2960 | |||
2961 | if (split == SPLIT_LEFT) { | ||
2962 | /* | ||
2963 | * Region is on the left edge of the existing | ||
2964 | * record. | ||
2965 | */ | ||
2966 | le32_add_cpu(&rec->e_cpos, | ||
2967 | le16_to_cpu(split_rec->e_leaf_clusters)); | ||
2968 | le64_add_cpu(&rec->e_blkno, len_blocks); | ||
2969 | le16_add_cpu(&rec->e_leaf_clusters, | ||
2970 | -le16_to_cpu(split_rec->e_leaf_clusters)); | ||
2971 | } else { | ||
2972 | /* | ||
2973 | * Region is on the right edge of the existing | ||
2974 | * record. | ||
2975 | */ | ||
2976 | le16_add_cpu(&rec->e_leaf_clusters, | ||
2977 | -le16_to_cpu(split_rec->e_leaf_clusters)); | ||
2978 | } | ||
2979 | } | ||
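
Both split directions come down to the same arithmetic on the surviving record; a self-contained sketch with hypothetical types:

	struct srec { unsigned int cpos, clusters; unsigned long long blkno; };

	static void subtract_split_model(struct srec *rec, unsigned int split_clusters,
					 int split_is_left,
					 unsigned int blocks_per_cluster)
	{
		if (split_is_left) {
			/* Left-edge split: the survivor now starts after the
			 * split region. */
			rec->cpos += split_clusters;
			rec->blkno += (unsigned long long)split_clusters *
				      blocks_per_cluster;
		}
		/* Either way the survivor shrinks by the split length; a
		 * right-edge split needs only this truncation. */
		rec->clusters -= split_clusters;
	}
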
2980 | |||
1725 | /* | 2981 | /* |
1726 | * Do the final bits of extent record insertion at the target leaf | 2982 | * Do the final bits of extent record insertion at the target leaf |
1727 | * list. If this leaf is part of an allocation tree, it is assumed | 2983 | * list. If this leaf is part of an allocation tree, it is assumed |
@@ -1738,6 +2994,15 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, | |||
1738 | 2994 | ||
1739 | BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); | 2995 | BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); |
1740 | 2996 | ||
2997 | if (insert->ins_split != SPLIT_NONE) { | ||
2998 | i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos)); | ||
2999 | BUG_ON(i == -1); | ||
3000 | rec = &el->l_recs[i]; | ||
3001 | ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec, | ||
3002 | insert_rec); | ||
3003 | goto rotate; | ||
3004 | } | ||
3005 | |||
1741 | /* | 3006 | /* |
1742 | * Contiguous insert - either left or right. | 3007 | * Contiguous insert - either left or right. |
1743 | */ | 3008 | */ |
@@ -1792,6 +3057,7 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, | |||
1792 | return; | 3057 | return; |
1793 | } | 3058 | } |
1794 | 3059 | ||
3060 | rotate: | ||
1795 | /* | 3061 | /* |
1796 | * Ok, we have to rotate. | 3062 | * Ok, we have to rotate. |
1797 | * | 3063 | * |
@@ -1815,13 +3081,53 @@ static inline void ocfs2_update_dinode_clusters(struct inode *inode, | |||
1815 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 3081 | spin_unlock(&OCFS2_I(inode)->ip_lock); |
1816 | } | 3082 | } |
1817 | 3083 | ||
3084 | static void ocfs2_adjust_rightmost_records(struct inode *inode, | ||
3085 | handle_t *handle, | ||
3086 | struct ocfs2_path *path, | ||
3087 | struct ocfs2_extent_rec *insert_rec) | ||
3088 | { | ||
3089 | int ret, i, next_free; | ||
3090 | struct buffer_head *bh; | ||
3091 | struct ocfs2_extent_list *el; | ||
3092 | struct ocfs2_extent_rec *rec; | ||
3093 | |||
3094 | /* | ||
3095 | * Update everything except the leaf block. | ||
3096 | */ | ||
3097 | for (i = 0; i < path->p_tree_depth; i++) { | ||
3098 | bh = path->p_node[i].bh; | ||
3099 | el = path->p_node[i].el; | ||
3100 | |||
3101 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
3102 | if (next_free == 0) { | ||
3103 | ocfs2_error(inode->i_sb, | ||
3104 | "Dinode %llu has a bad extent list", | ||
3105 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
3106 | /* ocfs2_error() has already reported the corruption. */ | ||
3107 | return; | ||
3108 | } | ||
3109 | |||
3110 | rec = &el->l_recs[next_free - 1]; | ||
3111 | |||
3112 | rec->e_int_clusters = insert_rec->e_cpos; | ||
3113 | le32_add_cpu(&rec->e_int_clusters, | ||
3114 | le16_to_cpu(insert_rec->e_leaf_clusters)); | ||
3115 | le32_add_cpu(&rec->e_int_clusters, | ||
3116 | -le32_to_cpu(rec->e_cpos)); | ||
3117 | |||
3118 | ret = ocfs2_journal_dirty(handle, bh); | ||
3119 | if (ret) | ||
3120 | mlog_errno(ret); | ||
3121 | |||
3122 | } | ||
3123 | } | ||
3124 | |||
1818 | static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, | 3125 | static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, |
1819 | struct ocfs2_extent_rec *insert_rec, | 3126 | struct ocfs2_extent_rec *insert_rec, |
1820 | struct ocfs2_path *right_path, | 3127 | struct ocfs2_path *right_path, |
1821 | struct ocfs2_path **ret_left_path) | 3128 | struct ocfs2_path **ret_left_path) |
1822 | { | 3129 | { |
1823 | int ret, i, next_free; | 3130 | int ret, next_free; |
1824 | struct buffer_head *bh; | ||
1825 | struct ocfs2_extent_list *el; | 3131 | struct ocfs2_extent_list *el; |
1826 | struct ocfs2_path *left_path = NULL; | 3132 | struct ocfs2_path *left_path = NULL; |
1827 | 3133 | ||
@@ -1887,40 +3193,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, | |||
1887 | goto out; | 3193 | goto out; |
1888 | } | 3194 | } |
1889 | 3195 | ||
1890 | el = path_root_el(right_path); | 3196 | ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec); |
1891 | bh = path_root_bh(right_path); | ||
1892 | i = 0; | ||
1893 | while (1) { | ||
1894 | struct ocfs2_extent_rec *rec; | ||
1895 | |||
1896 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
1897 | if (next_free == 0) { | ||
1898 | ocfs2_error(inode->i_sb, | ||
1899 | "Dinode %llu has a bad extent list", | ||
1900 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
1901 | ret = -EIO; | ||
1902 | goto out; | ||
1903 | } | ||
1904 | |||
1905 | rec = &el->l_recs[next_free - 1]; | ||
1906 | |||
1907 | rec->e_int_clusters = insert_rec->e_cpos; | ||
1908 | le32_add_cpu(&rec->e_int_clusters, | ||
1909 | le16_to_cpu(insert_rec->e_leaf_clusters)); | ||
1910 | le32_add_cpu(&rec->e_int_clusters, | ||
1911 | -le32_to_cpu(rec->e_cpos)); | ||
1912 | |||
1913 | ret = ocfs2_journal_dirty(handle, bh); | ||
1914 | if (ret) | ||
1915 | mlog_errno(ret); | ||
1916 | |||
1917 | /* Don't touch the leaf node */ | ||
1918 | if (++i >= right_path->p_tree_depth) | ||
1919 | break; | ||
1920 | |||
1921 | bh = right_path->p_node[i].bh; | ||
1922 | el = right_path->p_node[i].el; | ||
1923 | } | ||
1924 | 3197 | ||
1925 | *ret_left_path = left_path; | 3198 | *ret_left_path = left_path; |
1926 | ret = 0; | 3199 | ret = 0; |
@@ -1931,6 +3204,83 @@ out: | |||
1931 | return ret; | 3204 | return ret; |
1932 | } | 3205 | } |
1933 | 3206 | ||
3207 | static void ocfs2_split_record(struct inode *inode, | ||
3208 | struct ocfs2_path *left_path, | ||
3209 | struct ocfs2_path *right_path, | ||
3210 | struct ocfs2_extent_rec *split_rec, | ||
3211 | enum ocfs2_split_type split) | ||
3212 | { | ||
3213 | int index; | ||
3214 | u32 cpos = le32_to_cpu(split_rec->e_cpos); | ||
3215 | struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; | ||
3216 | struct ocfs2_extent_rec *rec, *tmprec; | ||
3217 | |||
3218 | right_el = path_leaf_el(right_path); | ||
3219 | if (left_path) | ||
3220 | left_el = path_leaf_el(left_path); | ||
3221 | |||
3222 | el = right_el; | ||
3223 | insert_el = right_el; | ||
3224 | index = ocfs2_search_extent_list(el, cpos); | ||
3225 | if (index != -1) { | ||
3226 | if (index == 0 && left_path) { | ||
3227 | BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); | ||
3228 | |||
3229 | /* | ||
3230 | * This typically means that the record | ||
3231 | * started in the left path but moved to the | ||
3232 | * right as a result of rotation. We either | ||
3233 | * move the existing record to the left, or we | ||
3234 | * do the later insert there. | ||
3235 | * | ||
3236 | * In this case, the left path should always | ||
3237 | * exist as the rotate code will have passed | ||
3238 | * it back for a post-insert update. | ||
3239 | */ | ||
3240 | |||
3241 | if (split == SPLIT_LEFT) { | ||
3242 | /* | ||
3243 | * It's a left split. Since we know | ||
3244 | * that the rotate code gave us an | ||
3245 | * empty extent in the left path, we | ||
3246 | * can just do the insert there. | ||
3247 | */ | ||
3248 | insert_el = left_el; | ||
3249 | } else { | ||
3250 | /* | ||
3251 | * Right split - we have to move the | ||
3252 | * existing record over to the left | ||
3253 | * leaf. The insert will be into the | ||
3254 | * newly created empty extent in the | ||
3255 | * right leaf. | ||
3256 | */ | ||
3257 | tmprec = &right_el->l_recs[index]; | ||
3258 | ocfs2_rotate_leaf(left_el, tmprec); | ||
3259 | el = left_el; | ||
3260 | |||
3261 | memset(tmprec, 0, sizeof(*tmprec)); | ||
3262 | index = ocfs2_search_extent_list(left_el, cpos); | ||
3263 | BUG_ON(index == -1); | ||
3264 | } | ||
3265 | } | ||
3266 | } else { | ||
3267 | BUG_ON(!left_path); | ||
3268 | BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0])); | ||
3269 | /* | ||
3270 | * Left path is easy - we can just allow the insert to | ||
3271 | * happen. | ||
3272 | */ | ||
3273 | el = left_el; | ||
3274 | insert_el = left_el; | ||
3275 | index = ocfs2_search_extent_list(el, cpos); | ||
3276 | BUG_ON(index == -1); | ||
3277 | } | ||
3278 | |||
3279 | rec = &el->l_recs[index]; | ||
3280 | ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec); | ||
3281 | ocfs2_rotate_leaf(insert_el, split_rec); | ||
3282 | } | ||
3283 | |||
1934 | /* | 3284 | /* |
1935 | * This function only does inserts on an allocation b-tree. For dinode | 3285 | * This function only does inserts on an allocation b-tree. For dinode |
1936 | * lists, ocfs2_insert_at_leaf() is called directly. | 3286 | * lists, ocfs2_insert_at_leaf() is called directly. |
@@ -1948,7 +3298,6 @@ static int ocfs2_insert_path(struct inode *inode, | |||
1948 | { | 3298 | { |
1949 | int ret, subtree_index; | 3299 | int ret, subtree_index; |
1950 | struct buffer_head *leaf_bh = path_leaf_bh(right_path); | 3300 | struct buffer_head *leaf_bh = path_leaf_bh(right_path); |
1951 | struct ocfs2_extent_list *el; | ||
1952 | 3301 | ||
1953 | /* | 3302 | /* |
1954 | * Pass both paths to the journal. The majority of inserts | 3303 | * Pass both paths to the journal. The majority of inserts |
@@ -1984,9 +3333,18 @@ static int ocfs2_insert_path(struct inode *inode, | |||
1984 | } | 3333 | } |
1985 | } | 3334 | } |
1986 | 3335 | ||
1987 | el = path_leaf_el(right_path); | 3336 | if (insert->ins_split != SPLIT_NONE) { |
3337 | /* | ||
3338 | * We could call ocfs2_insert_at_leaf() for some types | ||
3339 | * of splits, but it's easier to just let one separate | ||
3340 | * function sort it all out. | ||
3341 | */ | ||
3342 | ocfs2_split_record(inode, left_path, right_path, | ||
3343 | insert_rec, insert->ins_split); | ||
3344 | } else | ||
3345 | ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path), | ||
3346 | insert, inode); | ||
1988 | 3347 | ||
1989 | ocfs2_insert_at_leaf(insert_rec, el, insert, inode); | ||
1990 | ret = ocfs2_journal_dirty(handle, leaf_bh); | 3348 | ret = ocfs2_journal_dirty(handle, leaf_bh); |
1991 | if (ret) | 3349 | if (ret) |
1992 | mlog_errno(ret); | 3350 | mlog_errno(ret); |
@@ -2075,7 +3433,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, | |||
2075 | * can wind up skipping both of these two special cases... | 3433 | * can wind up skipping both of these two special cases... |
2076 | */ | 3434 | */ |
2077 | if (rotate) { | 3435 | if (rotate) { |
2078 | ret = ocfs2_rotate_tree_right(inode, handle, | 3436 | ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split, |
2079 | le32_to_cpu(insert_rec->e_cpos), | 3437 | le32_to_cpu(insert_rec->e_cpos), |
2080 | right_path, &left_path); | 3438 | right_path, &left_path); |
2081 | if (ret) { | 3439 | if (ret) { |
@@ -2100,8 +3458,9 @@ static int ocfs2_do_insert_extent(struct inode *inode, | |||
2100 | } | 3458 | } |
2101 | 3459 | ||
2102 | out_update_clusters: | 3460 | out_update_clusters: |
2103 | ocfs2_update_dinode_clusters(inode, di, | 3461 | if (type->ins_split == SPLIT_NONE) |
2104 | le16_to_cpu(insert_rec->e_leaf_clusters)); | 3462 | ocfs2_update_dinode_clusters(inode, di, |
3463 | le16_to_cpu(insert_rec->e_leaf_clusters)); | ||
2105 | 3464 | ||
2106 | ret = ocfs2_journal_dirty(handle, di_bh); | 3465 | ret = ocfs2_journal_dirty(handle, di_bh); |
2107 | if (ret) | 3466 | if (ret) |
@@ -2114,6 +3473,44 @@ out: | |||
2114 | return ret; | 3473 | return ret; |
2115 | } | 3474 | } |
2116 | 3475 | ||
3476 | static enum ocfs2_contig_type | ||
3477 | ocfs2_figure_merge_contig_type(struct inode *inode, | ||
3478 | struct ocfs2_extent_list *el, int index, | ||
3479 | struct ocfs2_extent_rec *split_rec) | ||
3480 | { | ||
3481 | struct ocfs2_extent_rec *rec; | ||
3482 | enum ocfs2_contig_type ret = CONTIG_NONE; | ||
3483 | |||
3484 | /* | ||
3485 | * We're careful to check for an empty extent record here - | ||
3486 | * the merge code will know what to do if it sees one. | ||
3487 | */ | ||
3488 | |||
3489 | if (index > 0) { | ||
3490 | rec = &el->l_recs[index - 1]; | ||
3491 | if (index == 1 && ocfs2_is_empty_extent(rec)) { | ||
3492 | if (split_rec->e_cpos == el->l_recs[index].e_cpos) | ||
3493 | ret = CONTIG_RIGHT; | ||
3494 | } else { | ||
3495 | ret = ocfs2_extent_contig(inode, rec, split_rec); | ||
3496 | } | ||
3497 | } | ||
3498 | |||
3499 | if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) { | ||
3500 | enum ocfs2_contig_type contig_type; | ||
3501 | |||
3502 | rec = &el->l_recs[index + 1]; | ||
3503 | contig_type = ocfs2_extent_contig(inode, rec, split_rec); | ||
3504 | |||
3505 | if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) | ||
3506 | ret = CONTIG_LEFTRIGHT; | ||
3507 | else if (ret == CONTIG_NONE) | ||
3508 | ret = contig_type; | ||
3509 | } | ||
3510 | |||
3511 | return ret; | ||
3512 | } | ||
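
Contiguity classification boils down to two boundary equalities. A sketch (hypothetical names, deliberately ignoring the flag checks the real ocfs2_extent_contig() also performs); note the inverted naming, per the comment in ocfs2_try_to_merge_extent() above - the type is relative to the existing record:

	enum contig_model { C_NONE, C_LEFT, C_RIGHT, C_LEFTRIGHT };

	static enum contig_model classify_model(unsigned int prev_end,
						unsigned int split_cpos,
						unsigned int split_end,
						unsigned int next_cpos)
	{
		int touches_prev = (prev_end == split_cpos);
		int touches_next = (split_end == next_cpos);

		if (touches_prev && touches_next)
			return C_LEFTRIGHT;
		if (touches_prev)
			return C_RIGHT;	/* contiguous with the record on our left */
		if (touches_next)
			return C_LEFT;	/* contiguous with the record on our right */
		return C_NONE;
	}
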
3513 | |||
2117 | static void ocfs2_figure_contig_type(struct inode *inode, | 3514 | static void ocfs2_figure_contig_type(struct inode *inode, |
2118 | struct ocfs2_insert_type *insert, | 3515 | struct ocfs2_insert_type *insert, |
2119 | struct ocfs2_extent_list *el, | 3516 | struct ocfs2_extent_list *el, |
@@ -2205,6 +3602,8 @@ static int ocfs2_figure_insert_type(struct inode *inode, | |||
2205 | struct ocfs2_path *path = NULL; | 3602 | struct ocfs2_path *path = NULL; |
2206 | struct buffer_head *bh = NULL; | 3603 | struct buffer_head *bh = NULL; |
2207 | 3604 | ||
3605 | insert->ins_split = SPLIT_NONE; | ||
3606 | |||
2208 | el = &di->id2.i_list; | 3607 | el = &di->id2.i_list; |
2209 | insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); | 3608 | insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); |
2210 | 3609 | ||
@@ -2327,9 +3726,10 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, | |||
2327 | u32 cpos, | 3726 | u32 cpos, |
2328 | u64 start_blk, | 3727 | u64 start_blk, |
2329 | u32 new_clusters, | 3728 | u32 new_clusters, |
3729 | u8 flags, | ||
2330 | struct ocfs2_alloc_context *meta_ac) | 3730 | struct ocfs2_alloc_context *meta_ac) |
2331 | { | 3731 | { |
2332 | int status, shift; | 3732 | int status; |
2333 | struct buffer_head *last_eb_bh = NULL; | 3733 | struct buffer_head *last_eb_bh = NULL; |
2334 | struct buffer_head *bh = NULL; | 3734 | struct buffer_head *bh = NULL; |
2335 | struct ocfs2_insert_type insert = {0, }; | 3735 | struct ocfs2_insert_type insert = {0, }; |
@@ -2350,6 +3750,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, | |||
2350 | rec.e_cpos = cpu_to_le32(cpos); | 3750 | rec.e_cpos = cpu_to_le32(cpos); |
2351 | rec.e_blkno = cpu_to_le64(start_blk); | 3751 | rec.e_blkno = cpu_to_le64(start_blk); |
2352 | rec.e_leaf_clusters = cpu_to_le16(new_clusters); | 3752 | rec.e_leaf_clusters = cpu_to_le16(new_clusters); |
3753 | rec.e_flags = flags; | ||
2353 | 3754 | ||
2354 | status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, | 3755 | status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, |
2355 | &insert); | 3756 | &insert); |
@@ -2364,55 +3765,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, | |||
2364 | insert.ins_appending, insert.ins_contig, insert.ins_contig_index, | 3765 | insert.ins_appending, insert.ins_contig, insert.ins_contig_index, |
2365 | insert.ins_free_records, insert.ins_tree_depth); | 3766 | insert.ins_free_records, insert.ins_tree_depth); |
2366 | 3767 | ||
2367 | /* | 3768 | if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) { |
2368 | * Avoid growing the tree unless we're out of records and the | 3769 | status = ocfs2_grow_tree(inode, handle, fe_bh, |
2369 | * insert type requires one. | 3770 | &insert.ins_tree_depth, &last_eb_bh,
2370 | */ | 3771 | meta_ac); |
2371 | if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records) | 3772 | if (status) { |
2372 | goto out_add; | ||
2373 | |||
2374 | shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); | ||
2375 | if (shift < 0) { | ||
2376 | status = shift; | ||
2377 | mlog_errno(status); | ||
2378 | goto bail; | ||
2379 | } | ||
2380 | |||
2381 | /* We traveled all the way to the bottom of the allocation tree | ||
2382 | * and didn't find room for any more extents - we need to add | ||
2383 | * another tree level */ | ||
2384 | if (shift) { | ||
2385 | BUG_ON(bh); | ||
2386 | mlog(0, "need to shift tree depth " | ||
2387 | "(current = %d)\n", insert.ins_tree_depth); | ||
2388 | |||
2389 | /* ocfs2_shift_tree_depth will return us a buffer with | ||
2390 | * the new extent block (so we can pass that to | ||
2391 | * ocfs2_add_branch). */ | ||
2392 | status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh, | ||
2393 | meta_ac, &bh); | ||
2394 | if (status < 0) { | ||
2395 | mlog_errno(status); | 3773 | mlog_errno(status); |
2396 | goto bail; | 3774 | goto bail; |
2397 | } | 3775 | } |
2398 | insert.ins_tree_depth++; | ||
2399 | /* Special case: we have room now if we shifted from | ||
2400 | * tree_depth 0 */ | ||
2401 | if (insert.ins_tree_depth == 1) | ||
2402 | goto out_add; | ||
2403 | } | ||
2404 | |||
2405 | /* call ocfs2_add_branch to add the final part of the tree with | ||
2406 | * the new data. */ | ||
2407 | mlog(0, "add branch. bh = %p\n", bh); | ||
2408 | status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, | ||
2409 | meta_ac); | ||
2410 | if (status < 0) { | ||
2411 | mlog_errno(status); | ||
2412 | goto bail; | ||
2413 | } | 3776 | } |
2414 | 3777 | ||
2415 | out_add: | ||
2416 | /* Finally, we can add clusters. This might rotate the tree for us. */ | 3778 | /* Finally, we can add clusters. This might rotate the tree for us. */ |
2417 | status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); | 3779 | status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); |
2418 | if (status < 0) | 3780 | if (status < 0) |
@@ -2431,7 +3793,720 @@ bail: | |||
2431 | return status; | 3793 | return status; |
2432 | } | 3794 | } |
2433 | 3795 | ||
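With the new flags argument, ocfs2_insert_extent() lets callers later in this series insert unwritten extents directly. A hedged caller sketch, reusing only names from the signature above (journal credits, locking and the surrounding function are assumed):

	/* Insert new_clusters clusters at cpos, marked unwritten. */
	status = ocfs2_insert_extent(osb, handle, inode, fe_bh, cpos,
				     start_blk, new_clusters,
				     OCFS2_EXT_UNWRITTEN, meta_ac);
	if (status < 0)
		mlog_errno(status);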
2434 | static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) | 3796 | static void ocfs2_make_right_split_rec(struct super_block *sb, |
3797 | struct ocfs2_extent_rec *split_rec, | ||
3798 | u32 cpos, | ||
3799 | struct ocfs2_extent_rec *rec) | ||
3800 | { | ||
3801 | u32 rec_cpos = le32_to_cpu(rec->e_cpos); | ||
3802 | u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters); | ||
3803 | |||
3804 | memset(split_rec, 0, sizeof(struct ocfs2_extent_rec)); | ||
3805 | |||
3806 | split_rec->e_cpos = cpu_to_le32(cpos); | ||
3807 | split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos); | ||
3808 | |||
3809 | split_rec->e_blkno = rec->e_blkno; | ||
3810 | le64_add_cpu(&split_rec->e_blkno, | ||
3811 | ocfs2_clusters_to_blocks(sb, cpos - rec_cpos)); | ||
3812 | |||
3813 | split_rec->e_flags = rec->e_flags; | ||
3814 | } | ||
3815 | |||
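ocfs2_make_right_split_rec() carves the tail [cpos, rec_end) out of an existing record and advances the physical start by the clusters it skips. A standalone model of the arithmetic (the clusters-to-blocks shift of 3 is an illustrative assumption, e.g. 32KB clusters over 4KB blocks):

	#include <assert.h>
	#include <stdint.h>

	#define C2B_SHIFT 3	/* assumed clusters-to-blocks shift */

	struct xrec { uint32_t cpos, clusters; uint64_t blkno; };

	static struct xrec right_split(const struct xrec *rec, uint32_t cpos)
	{
		uint32_t rec_end = rec->cpos + rec->clusters;
		struct xrec split = { 0, 0, 0 };

		assert(cpos > rec->cpos && cpos < rec_end);

		split.cpos = cpos;
		split.clusters = rec_end - cpos;
		/* skip (cpos - rec->cpos) clusters of physical blocks too */
		split.blkno = rec->blkno +
			      ((uint64_t)(cpos - rec->cpos) << C2B_SHIFT);
		return split;
	}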
3816 | static int ocfs2_split_and_insert(struct inode *inode, | ||
3817 | handle_t *handle, | ||
3818 | struct ocfs2_path *path, | ||
3819 | struct buffer_head *di_bh, | ||
3820 | struct buffer_head **last_eb_bh, | ||
3821 | int split_index, | ||
3822 | struct ocfs2_extent_rec *orig_split_rec, | ||
3823 | struct ocfs2_alloc_context *meta_ac) | ||
3824 | { | ||
3825 | int ret = 0, depth; | ||
3826 | unsigned int insert_range, rec_range, do_leftright = 0; | ||
3827 | struct ocfs2_extent_rec tmprec; | ||
3828 | struct ocfs2_extent_list *rightmost_el; | ||
3829 | struct ocfs2_extent_rec rec; | ||
3830 | struct ocfs2_extent_rec split_rec = *orig_split_rec; | ||
3831 | struct ocfs2_insert_type insert; | ||
3832 | struct ocfs2_extent_block *eb; | ||
3833 | struct ocfs2_dinode *di; | ||
3834 | |||
3835 | leftright: | ||
3836 | /* | ||
3837 | * Store a copy of the record on the stack - it might move | ||
3838 | * around as the tree is manipulated below. | ||
3839 | */ | ||
3840 | rec = path_leaf_el(path)->l_recs[split_index]; | ||
3841 | |||
3842 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
3843 | rightmost_el = &di->id2.i_list; | ||
3844 | |||
3845 | depth = le16_to_cpu(rightmost_el->l_tree_depth); | ||
3846 | if (depth) { | ||
3847 | BUG_ON(!(*last_eb_bh)); | ||
3848 | eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; | ||
3849 | rightmost_el = &eb->h_list; | ||
3850 | } | ||
3851 | |||
3852 | if (le16_to_cpu(rightmost_el->l_next_free_rec) == | ||
3853 | le16_to_cpu(rightmost_el->l_count)) { | ||
3854 | int old_depth = depth; | ||
3855 | |||
3856 | ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh, | ||
3857 | meta_ac); | ||
3858 | if (ret) { | ||
3859 | mlog_errno(ret); | ||
3860 | goto out; | ||
3861 | } | ||
3862 | |||
3863 | if (old_depth != depth) { | ||
3864 | eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; | ||
3865 | rightmost_el = &eb->h_list; | ||
3866 | } | ||
3867 | } | ||
3868 | |||
3869 | memset(&insert, 0, sizeof(struct ocfs2_insert_type)); | ||
3870 | insert.ins_appending = APPEND_NONE; | ||
3871 | insert.ins_contig = CONTIG_NONE; | ||
3872 | insert.ins_free_records = le16_to_cpu(rightmost_el->l_count) | ||
3873 | - le16_to_cpu(rightmost_el->l_next_free_rec); | ||
3874 | insert.ins_tree_depth = depth; | ||
3875 | |||
3876 | insert_range = le32_to_cpu(split_rec.e_cpos) + | ||
3877 | le16_to_cpu(split_rec.e_leaf_clusters); | ||
3878 | rec_range = le32_to_cpu(rec.e_cpos) + | ||
3879 | le16_to_cpu(rec.e_leaf_clusters); | ||
3880 | |||
3881 | if (split_rec.e_cpos == rec.e_cpos) { | ||
3882 | insert.ins_split = SPLIT_LEFT; | ||
3883 | } else if (insert_range == rec_range) { | ||
3884 | insert.ins_split = SPLIT_RIGHT; | ||
3885 | } else { | ||
3886 | /* | ||
3887 | * Left/right split. We fake this as a right split | ||
3888 | * first and then make a second pass as a left split. | ||
3889 | */ | ||
3890 | insert.ins_split = SPLIT_RIGHT; | ||
3891 | |||
3892 | ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range, | ||
3893 | &rec); | ||
3894 | |||
3895 | split_rec = tmprec; | ||
3896 | |||
3897 | BUG_ON(do_leftright); | ||
3898 | do_leftright = 1; | ||
3899 | } | ||
3900 | |||
3901 | ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, | ||
3902 | &insert); | ||
3903 | if (ret) { | ||
3904 | mlog_errno(ret); | ||
3905 | goto out; | ||
3906 | } | ||
3907 | |||
3908 | if (do_leftright == 1) { | ||
3909 | u32 cpos; | ||
3910 | struct ocfs2_extent_list *el; | ||
3911 | |||
3912 | do_leftright++; | ||
3913 | split_rec = *orig_split_rec; | ||
3914 | |||
3915 | ocfs2_reinit_path(path, 1); | ||
3916 | |||
3917 | cpos = le32_to_cpu(split_rec.e_cpos); | ||
3918 | ret = ocfs2_find_path(inode, path, cpos); | ||
3919 | if (ret) { | ||
3920 | mlog_errno(ret); | ||
3921 | goto out; | ||
3922 | } | ||
3923 | |||
3924 | el = path_leaf_el(path); | ||
3925 | split_index = ocfs2_search_extent_list(el, cpos); | ||
3926 | goto leftright; | ||
3927 | } | ||
3928 | out: | ||
3929 | |||
3930 | return ret; | ||
3931 | } | ||
3932 | |||
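ocfs2_split_and_insert() performs at most one split per pass; an interior range is handled by faking a right split, then looping back through the leftright label for the left split. The decision reduces to this standalone model (hypothetical helper):

	enum split_type { SPLIT_NONE, SPLIT_LEFT, SPLIT_RIGHT };

	/*
	 * Pick the split for [split_cpos, split_end) inside an existing
	 * record [rec_cpos, rec_end). *second_pass is set when a left
	 * split must follow on the next iteration.
	 */
	static enum split_type pick_split(unsigned int rec_cpos,
					  unsigned int rec_end,
					  unsigned int split_cpos,
					  unsigned int split_end,
					  int *second_pass)
	{
		*second_pass = 0;

		if (split_cpos == rec_cpos)
			return SPLIT_LEFT;	/* shares the record's left edge */
		if (split_end == rec_end)
			return SPLIT_RIGHT;	/* shares the right edge */

		/* interior: split the tail off now, split again next pass */
		*second_pass = 1;
		return SPLIT_RIGHT;
	}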
3933 | /* | ||
3934 | * Mark part or all of the extent record at split_index in the leaf | ||
3935 | * pointed to by path as written. This removes the unwritten | ||
3936 | * extent flag. | ||
3937 | * | ||
3938 | * Care is taken to handle contiguousness so as to not grow the tree. | ||
3939 | * | ||
3940 | * meta_ac is not strictly necessary - we only truly need it if growth | ||
3941 | * of the tree is required. All other cases will degrade into a less | ||
3942 | * optimal tree layout. | ||
3943 | * | ||
3944 | * last_eb_bh should be the rightmost leaf block for any inode with a | ||
3945 | * btree. Since a split may grow the tree or a merge might shrink | ||
3945 | * it, the caller cannot trust the contents of that buffer after | ||
3945 | * this call. | ||
3946 | * | ||
3947 | * This code is optimized for readability - several passes might be | ||
3948 | * made over certain portions of the tree. All of those blocks will | ||
3949 | * have been brought into cache (and pinned via the journal), so the | ||
3950 | * extra overhead is not expressed in terms of disk reads. | ||
3951 | */ | ||
3952 | static int __ocfs2_mark_extent_written(struct inode *inode, | ||
3953 | struct buffer_head *di_bh, | ||
3954 | handle_t *handle, | ||
3955 | struct ocfs2_path *path, | ||
3956 | int split_index, | ||
3957 | struct ocfs2_extent_rec *split_rec, | ||
3958 | struct ocfs2_alloc_context *meta_ac, | ||
3959 | struct ocfs2_cached_dealloc_ctxt *dealloc) | ||
3960 | { | ||
3961 | int ret = 0; | ||
3962 | struct ocfs2_extent_list *el = path_leaf_el(path); | ||
3963 | struct buffer_head *eb_bh, *last_eb_bh = NULL; | ||
3964 | struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; | ||
3965 | struct ocfs2_merge_ctxt ctxt; | ||
3966 | struct ocfs2_extent_list *rightmost_el; | ||
3967 | |||
3968 | if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) { | ||
3969 | ret = -EIO; | ||
3970 | mlog_errno(ret); | ||
3971 | goto out; | ||
3972 | } | ||
3973 | |||
3974 | if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || | ||
3975 | ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < | ||
3976 | (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { | ||
3977 | ret = -EIO; | ||
3978 | mlog_errno(ret); | ||
3979 | goto out; | ||
3980 | } | ||
3981 | |||
3982 | eb_bh = path_leaf_bh(path); | ||
3983 | ret = ocfs2_journal_access(handle, inode, eb_bh, | ||
3984 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
3985 | if (ret) { | ||
3986 | mlog_errno(ret); | ||
3987 | goto out; | ||
3988 | } | ||
3989 | |||
3990 | ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el, | ||
3991 | split_index, | ||
3992 | split_rec); | ||
3993 | |||
3994 | /* | ||
3995 | * The core merge / split code wants to know how much room is | ||
3996 | * left in this inode's allocation tree, so we pass the | ||
3997 | * rightmost extent list. | ||
3998 | */ | ||
3999 | if (path->p_tree_depth) { | ||
4000 | struct ocfs2_extent_block *eb; | ||
4001 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | ||
4002 | |||
4003 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
4004 | le64_to_cpu(di->i_last_eb_blk), | ||
4005 | &last_eb_bh, OCFS2_BH_CACHED, inode); | ||
4006 | if (ret) { | ||
4007 | mlog_errno(ret); | ||
4008 | goto out; | ||
4009 | } | ||
4010 | |||
4011 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
4012 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
4013 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
4014 | ret = -EROFS; | ||
4015 | goto out; | ||
4016 | } | ||
4017 | |||
4018 | rightmost_el = &eb->h_list; | ||
4019 | } else | ||
4020 | rightmost_el = path_root_el(path); | ||
4021 | |||
4022 | ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec); | ||
4023 | if (ctxt.c_used_tail_recs > 0 && | ||
4024 | ocfs2_is_empty_extent(&rightmost_el->l_recs[0])) | ||
4025 | ctxt.c_used_tail_recs--; | ||
4026 | |||
4027 | if (rec->e_cpos == split_rec->e_cpos && | ||
4028 | rec->e_leaf_clusters == split_rec->e_leaf_clusters) | ||
4029 | ctxt.c_split_covers_rec = 1; | ||
4030 | else | ||
4031 | ctxt.c_split_covers_rec = 0; | ||
4032 | |||
4033 | ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]); | ||
4034 | |||
4035 | mlog(0, "index: %d, contig: %u, used_tail_recs: %u, " | ||
4036 | "has_empty: %u, split_covers: %u\n", split_index, | ||
4037 | ctxt.c_contig_type, ctxt.c_used_tail_recs, | ||
4038 | ctxt.c_has_empty_extent, ctxt.c_split_covers_rec); | ||
4039 | |||
4040 | if (ctxt.c_contig_type == CONTIG_NONE) { | ||
4041 | if (ctxt.c_split_covers_rec) | ||
4042 | el->l_recs[split_index] = *split_rec; | ||
4043 | else | ||
4044 | ret = ocfs2_split_and_insert(inode, handle, path, di_bh, | ||
4045 | &last_eb_bh, split_index, | ||
4046 | split_rec, meta_ac); | ||
4047 | if (ret) | ||
4048 | mlog_errno(ret); | ||
4049 | } else { | ||
4050 | ret = ocfs2_try_to_merge_extent(inode, handle, path, | ||
4051 | split_index, split_rec, | ||
4052 | dealloc, &ctxt); | ||
4053 | if (ret) | ||
4054 | mlog_errno(ret); | ||
4055 | } | ||
4056 | |||
4057 | ocfs2_journal_dirty(handle, eb_bh); | ||
4058 | |||
4059 | out: | ||
4060 | brelse(last_eb_bh); | ||
4061 | return ret; | ||
4062 | } | ||
4063 | |||
4064 | /* | ||
4065 | * Mark the already-existing extent at cpos as written for len clusters. | ||
4066 | * | ||
4067 | * If the existing extent is larger than the request, initiate a | ||
4068 | * split. An attempt will be made at merging with adjacent extents. | ||
4069 | * | ||
4070 | * The caller is responsible for passing down meta_ac if we'll need it. | ||
4071 | */ | ||
4072 | int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, | ||
4073 | handle_t *handle, u32 cpos, u32 len, u32 phys, | ||
4074 | struct ocfs2_alloc_context *meta_ac, | ||
4075 | struct ocfs2_cached_dealloc_ctxt *dealloc) | ||
4076 | { | ||
4077 | int ret, index; | ||
4078 | u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys); | ||
4079 | struct ocfs2_extent_rec split_rec; | ||
4080 | struct ocfs2_path *left_path = NULL; | ||
4081 | struct ocfs2_extent_list *el; | ||
4082 | |||
4083 | mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n", | ||
4084 | inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno); | ||
4085 | |||
4086 | if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { | ||
4087 | ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " | ||
4088 | "that are being written to, but the feature bit " | ||
4089 | "is not set in the super block.", | ||
4090 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
4091 | ret = -EROFS; | ||
4092 | goto out; | ||
4093 | } | ||
4094 | |||
4095 | /* | ||
4096 | * XXX: This should be fixed up so that we just re-insert the | ||
4097 | * next extent records. | ||
4098 | */ | ||
4099 | ocfs2_extent_map_trunc(inode, 0); | ||
4100 | |||
4101 | left_path = ocfs2_new_inode_path(di_bh); | ||
4102 | if (!left_path) { | ||
4103 | ret = -ENOMEM; | ||
4104 | mlog_errno(ret); | ||
4105 | goto out; | ||
4106 | } | ||
4107 | |||
4108 | ret = ocfs2_find_path(inode, left_path, cpos); | ||
4109 | if (ret) { | ||
4110 | mlog_errno(ret); | ||
4111 | goto out; | ||
4112 | } | ||
4113 | el = path_leaf_el(left_path); | ||
4114 | |||
4115 | index = ocfs2_search_extent_list(el, cpos); | ||
4116 | if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { | ||
4117 | ocfs2_error(inode->i_sb, | ||
4118 | "Inode %llu has an extent at cpos %u which can no " | ||
4119 | "longer be found.\n", | ||
4120 | (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); | ||
4121 | ret = -EROFS; | ||
4122 | goto out; | ||
4123 | } | ||
4124 | |||
4125 | memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); | ||
4126 | split_rec.e_cpos = cpu_to_le32(cpos); | ||
4127 | split_rec.e_leaf_clusters = cpu_to_le16(len); | ||
4128 | split_rec.e_blkno = cpu_to_le64(start_blkno); | ||
4129 | split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; | ||
4130 | split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; | ||
4131 | |||
4132 | ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path, | ||
4133 | index, &split_rec, meta_ac, dealloc); | ||
4134 | if (ret) | ||
4135 | mlog_errno(ret); | ||
4136 | |||
4137 | out: | ||
4138 | ocfs2_free_path(left_path); | ||
4139 | return ret; | ||
4140 | } | ||
4141 | |||
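A hedged sketch of the expected calling sequence for ocfs2_mark_extent_written(), using only symbols introduced in this patch; credit reservation, locking and error paths are elided, and real callers drop their cluster locks before running the deferred frees:

	struct ocfs2_cached_dealloc_ctxt dealloc;

	ocfs2_init_dealloc_ctxt(&dealloc);

	/* clear the unwritten flag on [cpos, cpos + len) at phys */
	ret = ocfs2_mark_extent_written(inode, di_bh, handle, cpos, len,
					phys, meta_ac, &dealloc);
	if (ret)
		mlog_errno(ret);

	ocfs2_commit_trans(osb, handle);

	/* extent blocks merged away above are freed here, without
	 * any journal handle or cluster lock held */
	ocfs2_run_deallocs(osb, &dealloc);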
4142 | static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, | ||
4143 | handle_t *handle, struct ocfs2_path *path, | ||
4144 | int index, u32 new_range, | ||
4145 | struct ocfs2_alloc_context *meta_ac) | ||
4146 | { | ||
4147 | int ret, depth, credits = handle->h_buffer_credits; | ||
4148 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | ||
4149 | struct buffer_head *last_eb_bh = NULL; | ||
4150 | struct ocfs2_extent_block *eb; | ||
4151 | struct ocfs2_extent_list *rightmost_el, *el; | ||
4152 | struct ocfs2_extent_rec split_rec; | ||
4153 | struct ocfs2_extent_rec *rec; | ||
4154 | struct ocfs2_insert_type insert; | ||
4155 | |||
4156 | /* | ||
4157 | * Set up the record to split before we grow the tree. | ||
4158 | */ | ||
4159 | el = path_leaf_el(path); | ||
4160 | rec = &el->l_recs[index]; | ||
4161 | ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec); | ||
4162 | |||
4163 | depth = path->p_tree_depth; | ||
4164 | if (depth > 0) { | ||
4165 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
4166 | le64_to_cpu(di->i_last_eb_blk), | ||
4167 | &last_eb_bh, OCFS2_BH_CACHED, inode); | ||
4168 | if (ret < 0) { | ||
4169 | mlog_errno(ret); | ||
4170 | goto out; | ||
4171 | } | ||
4172 | |||
4173 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | ||
4174 | rightmost_el = &eb->h_list; | ||
4175 | } else | ||
4176 | rightmost_el = path_leaf_el(path); | ||
4177 | |||
4178 | credits += path->p_tree_depth + ocfs2_extend_meta_needed(di); | ||
4179 | ret = ocfs2_extend_trans(handle, credits); | ||
4180 | if (ret) { | ||
4181 | mlog_errno(ret); | ||
4182 | goto out; | ||
4183 | } | ||
4184 | |||
4185 | if (le16_to_cpu(rightmost_el->l_next_free_rec) == | ||
4186 | le16_to_cpu(rightmost_el->l_count)) { | ||
4187 | int old_depth = depth; | ||
4188 | |||
4189 | ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh, | ||
4190 | meta_ac); | ||
4191 | if (ret) { | ||
4192 | mlog_errno(ret); | ||
4193 | goto out; | ||
4194 | } | ||
4195 | |||
4196 | if (old_depth != depth) { | ||
4197 | eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; | ||
4198 | rightmost_el = &eb->h_list; | ||
4199 | } | ||
4200 | } | ||
4201 | |||
4202 | memset(&insert, 0, sizeof(struct ocfs2_insert_type)); | ||
4203 | insert.ins_appending = APPEND_NONE; | ||
4204 | insert.ins_contig = CONTIG_NONE; | ||
4205 | insert.ins_split = SPLIT_RIGHT; | ||
4206 | insert.ins_free_records = le16_to_cpu(rightmost_el->l_count) | ||
4207 | - le16_to_cpu(rightmost_el->l_next_free_rec); | ||
4208 | insert.ins_tree_depth = depth; | ||
4209 | |||
4210 | ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert); | ||
4211 | if (ret) | ||
4212 | mlog_errno(ret); | ||
4213 | |||
4214 | out: | ||
4215 | brelse(last_eb_bh); | ||
4216 | return ret; | ||
4217 | } | ||
4218 | |||
4219 | static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, | ||
4220 | struct ocfs2_path *path, int index, | ||
4221 | struct ocfs2_cached_dealloc_ctxt *dealloc, | ||
4222 | u32 cpos, u32 len) | ||
4223 | { | ||
4224 | int ret; | ||
4225 | u32 left_cpos, rec_range, trunc_range; | ||
4226 | int wants_rotate = 0, is_rightmost_tree_rec = 0; | ||
4227 | struct super_block *sb = inode->i_sb; | ||
4228 | struct ocfs2_path *left_path = NULL; | ||
4229 | struct ocfs2_extent_list *el = path_leaf_el(path); | ||
4230 | struct ocfs2_extent_rec *rec; | ||
4231 | struct ocfs2_extent_block *eb; | ||
4232 | |||
4233 | if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { | ||
4234 | ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); | ||
4235 | if (ret) { | ||
4236 | mlog_errno(ret); | ||
4237 | goto out; | ||
4238 | } | ||
4239 | |||
4240 | index--; | ||
4241 | } | ||
4242 | |||
4243 | if (index == (le16_to_cpu(el->l_next_free_rec) - 1) && | ||
4244 | path->p_tree_depth) { | ||
4245 | /* | ||
4246 | * Check whether this is the rightmost tree record. If | ||
4247 | * we remove all of this record or part of its right | ||
4248 | * edge then an update of the record lengths above it | ||
4249 | * will be required. | ||
4250 | */ | ||
4251 | eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; | ||
4252 | if (eb->h_next_leaf_blk == 0) | ||
4253 | is_rightmost_tree_rec = 1; | ||
4254 | } | ||
4255 | |||
4256 | rec = &el->l_recs[index]; | ||
4257 | if (index == 0 && path->p_tree_depth && | ||
4258 | le32_to_cpu(rec->e_cpos) == cpos) { | ||
4259 | /* | ||
4260 | * Changing the leftmost offset (via partial or whole | ||
4261 | * record truncate) of an interior (or rightmost) path | ||
4262 | * means we have to update the subtree that is formed | ||
4263 | * by this leaf and the one to its left. | ||
4264 | * | ||
4265 | * There are two cases we can skip: | ||
4266 | * 1) Path is the leftmost one in our inode tree. | ||
4267 | * 2) The leaf is rightmost and will be empty after | ||
4268 | * we remove the extent record - the rotate code | ||
4269 | * knows how to update the newly formed edge. | ||
4270 | */ | ||
4271 | |||
4272 | ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, | ||
4273 | &left_cpos); | ||
4274 | if (ret) { | ||
4275 | mlog_errno(ret); | ||
4276 | goto out; | ||
4277 | } | ||
4278 | |||
4279 | if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { | ||
4280 | left_path = ocfs2_new_path(path_root_bh(path), | ||
4281 | path_root_el(path)); | ||
4282 | if (!left_path) { | ||
4283 | ret = -ENOMEM; | ||
4284 | mlog_errno(ret); | ||
4285 | goto out; | ||
4286 | } | ||
4287 | |||
4288 | ret = ocfs2_find_path(inode, left_path, left_cpos); | ||
4289 | if (ret) { | ||
4290 | mlog_errno(ret); | ||
4291 | goto out; | ||
4292 | } | ||
4293 | } | ||
4294 | } | ||
4295 | |||
4296 | ret = ocfs2_extend_rotate_transaction(handle, 0, | ||
4297 | handle->h_buffer_credits, | ||
4298 | path); | ||
4299 | if (ret) { | ||
4300 | mlog_errno(ret); | ||
4301 | goto out; | ||
4302 | } | ||
4303 | |||
4304 | ret = ocfs2_journal_access_path(inode, handle, path); | ||
4305 | if (ret) { | ||
4306 | mlog_errno(ret); | ||
4307 | goto out; | ||
4308 | } | ||
4309 | |||
4310 | ret = ocfs2_journal_access_path(inode, handle, left_path); | ||
4311 | if (ret) { | ||
4312 | mlog_errno(ret); | ||
4313 | goto out; | ||
4314 | } | ||
4315 | |||
4316 | rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); | ||
4317 | trunc_range = cpos + len; | ||
4318 | |||
4319 | if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) { | ||
4320 | int next_free; | ||
4321 | |||
4322 | memset(rec, 0, sizeof(*rec)); | ||
4323 | ocfs2_cleanup_merge(el, index); | ||
4324 | wants_rotate = 1; | ||
4325 | |||
4326 | next_free = le16_to_cpu(el->l_next_free_rec); | ||
4327 | if (is_rightmost_tree_rec && next_free > 1) { | ||
4328 | /* | ||
4329 | * We skip the edge update if this path will | ||
4330 | * be deleted by the rotate code. | ||
4331 | */ | ||
4332 | rec = &el->l_recs[next_free - 1]; | ||
4333 | ocfs2_adjust_rightmost_records(inode, handle, path, | ||
4334 | rec); | ||
4335 | } | ||
4336 | } else if (le32_to_cpu(rec->e_cpos) == cpos) { | ||
4337 | /* Remove leftmost portion of the record. */ | ||
4338 | le32_add_cpu(&rec->e_cpos, len); | ||
4339 | le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len)); | ||
4340 | le16_add_cpu(&rec->e_leaf_clusters, -len); | ||
4341 | } else if (rec_range == trunc_range) { | ||
4342 | /* Remove rightmost portion of the record */ | ||
4343 | le16_add_cpu(&rec->e_leaf_clusters, -len); | ||
4344 | if (is_rightmost_tree_rec) | ||
4345 | ocfs2_adjust_rightmost_records(inode, handle, path, rec); | ||
4346 | } else { | ||
4347 | /* Caller should have trapped this. */ | ||
4348 | mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) " | ||
4349 | "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
4350 | le32_to_cpu(rec->e_cpos), | ||
4351 | le16_to_cpu(rec->e_leaf_clusters), cpos, len); | ||
4352 | BUG(); | ||
4353 | } | ||
4354 | |||
4355 | if (left_path) { | ||
4356 | int subtree_index; | ||
4357 | |||
4358 | subtree_index = ocfs2_find_subtree_root(inode, left_path, path); | ||
4359 | ocfs2_complete_edge_insert(inode, handle, left_path, path, | ||
4360 | subtree_index); | ||
4361 | } | ||
4362 | |||
4363 | ocfs2_journal_dirty(handle, path_leaf_bh(path)); | ||
4364 | |||
4365 | ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); | ||
4366 | if (ret) { | ||
4367 | mlog_errno(ret); | ||
4368 | goto out; | ||
4369 | } | ||
4370 | |||
4371 | out: | ||
4372 | ocfs2_free_path(left_path); | ||
4373 | return ret; | ||
4374 | } | ||
4375 | |||
4376 | int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, | ||
4377 | u32 cpos, u32 len, handle_t *handle, | ||
4378 | struct ocfs2_alloc_context *meta_ac, | ||
4379 | struct ocfs2_cached_dealloc_ctxt *dealloc) | ||
4380 | { | ||
4381 | int ret, index; | ||
4382 | u32 rec_range, trunc_range; | ||
4383 | struct ocfs2_extent_rec *rec; | ||
4384 | struct ocfs2_extent_list *el; | ||
4385 | struct ocfs2_path *path; | ||
4386 | |||
4387 | ocfs2_extent_map_trunc(inode, 0); | ||
4388 | |||
4389 | path = ocfs2_new_inode_path(di_bh); | ||
4390 | if (!path) { | ||
4391 | ret = -ENOMEM; | ||
4392 | mlog_errno(ret); | ||
4393 | goto out; | ||
4394 | } | ||
4395 | |||
4396 | ret = ocfs2_find_path(inode, path, cpos); | ||
4397 | if (ret) { | ||
4398 | mlog_errno(ret); | ||
4399 | goto out; | ||
4400 | } | ||
4401 | |||
4402 | el = path_leaf_el(path); | ||
4403 | index = ocfs2_search_extent_list(el, cpos); | ||
4404 | if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { | ||
4405 | ocfs2_error(inode->i_sb, | ||
4406 | "Inode %llu has an extent at cpos %u which can no " | ||
4407 | "longer be found.\n", | ||
4408 | (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); | ||
4409 | ret = -EROFS; | ||
4410 | goto out; | ||
4411 | } | ||
4412 | |||
4413 | /* | ||
4414 | * We have 3 cases of extent removal: | ||
4415 | * 1) Range covers the entire extent rec | ||
4416 | * 2) Range begins or ends on one edge of the extent rec | ||
4417 | * 3) Range is in the middle of the extent rec (no shared edges) | ||
4418 | * | ||
4419 | * For case 1 we remove the extent rec and left rotate to | ||
4420 | * fill the hole. | ||
4421 | * | ||
4422 | * For case 2 we just shrink the existing extent rec, with a | ||
4423 | * tree update if the shrinking edge is also the edge of an | ||
4424 | * extent block. | ||
4425 | * | ||
4426 | * For case 3 we do a right split to turn the extent rec into | ||
4427 | * something case 2 can handle. | ||
4428 | */ | ||
4429 | rec = &el->l_recs[index]; | ||
4430 | rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); | ||
4431 | trunc_range = cpos + len; | ||
4432 | |||
4433 | BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); | ||
4434 | |||
4435 | mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d " | ||
4436 | "(cpos %u, len %u)\n", | ||
4437 | (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index, | ||
4438 | le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); | ||
4439 | |||
4440 | if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { | ||
4441 | ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, | ||
4442 | cpos, len); | ||
4443 | if (ret) { | ||
4444 | mlog_errno(ret); | ||
4445 | goto out; | ||
4446 | } | ||
4447 | } else { | ||
4448 | ret = ocfs2_split_tree(inode, di_bh, handle, path, index, | ||
4449 | trunc_range, meta_ac); | ||
4450 | if (ret) { | ||
4451 | mlog_errno(ret); | ||
4452 | goto out; | ||
4453 | } | ||
4454 | |||
4455 | /* | ||
4456 | * The split could have manipulated the tree enough to | ||
4457 | * move the record location, so we have to look for it again. | ||
4458 | */ | ||
4459 | ocfs2_reinit_path(path, 1); | ||
4460 | |||
4461 | ret = ocfs2_find_path(inode, path, cpos); | ||
4462 | if (ret) { | ||
4463 | mlog_errno(ret); | ||
4464 | goto out; | ||
4465 | } | ||
4466 | |||
4467 | el = path_leaf_el(path); | ||
4468 | index = ocfs2_search_extent_list(el, cpos); | ||
4469 | if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { | ||
4470 | ocfs2_error(inode->i_sb, | ||
4471 | "Inode %llu: split at cpos %u lost record.", | ||
4472 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
4473 | cpos); | ||
4474 | ret = -EROFS; | ||
4475 | goto out; | ||
4476 | } | ||
4477 | |||
4478 | /* | ||
4479 | * Double check our values here. If anything is fishy, | ||
4480 | * it's easier to catch it at the top level. | ||
4481 | */ | ||
4482 | rec = &el->l_recs[index]; | ||
4483 | rec_range = le32_to_cpu(rec->e_cpos) + | ||
4484 | ocfs2_rec_clusters(el, rec); | ||
4485 | if (rec_range != trunc_range) { | ||
4486 | ocfs2_error(inode->i_sb, | ||
4487 | "Inode %llu: error after split at cpos %u " | ||
4488 | "trunc len %u, existing record is (%u,%u)", | ||
4489 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
4490 | cpos, len, le32_to_cpu(rec->e_cpos), | ||
4491 | ocfs2_rec_clusters(el, rec)); | ||
4492 | ret = -EROFS; | ||
4493 | goto out; | ||
4494 | } | ||
4495 | |||
4496 | ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, | ||
4497 | cpos, len); | ||
4498 | if (ret) { | ||
4499 | mlog_errno(ret); | ||
4500 | goto out; | ||
4501 | } | ||
4502 | } | ||
4503 | |||
4504 | out: | ||
4505 | ocfs2_free_path(path); | ||
4506 | return ret; | ||
4507 | } | ||
4508 | |||
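The three removal cases in the comment above reduce to range comparisons. A standalone model of the dispatch (hypothetical names, mirroring ocfs2_remove_extent()):

	enum remove_case { WHOLE_RECORD, SHRINK_EDGE, SPLIT_FIRST };

	/* Which case does [cpos, cpos + len) hit inside
	 * [rec_cpos, rec_cpos + rec_len)? */
	static enum remove_case removal_case(unsigned int rec_cpos,
					     unsigned int rec_len,
					     unsigned int cpos,
					     unsigned int len)
	{
		unsigned int rec_end = rec_cpos + rec_len;
		unsigned int trunc_end = cpos + len;

		if (cpos == rec_cpos && trunc_end == rec_end)
			return WHOLE_RECORD;	/* case 1: delete, rotate left */
		if (cpos == rec_cpos || trunc_end == rec_end)
			return SHRINK_EDGE;	/* case 2: shrink in place */
		return SPLIT_FIRST;	/* case 3: right split, then case 2 */
	}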
4509 | int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) | ||
2435 | { | 4510 | { |
2436 | struct buffer_head *tl_bh = osb->osb_tl_bh; | 4511 | struct buffer_head *tl_bh = osb->osb_tl_bh; |
2437 | struct ocfs2_dinode *di; | 4512 | struct ocfs2_dinode *di; |
@@ -2464,10 +4539,10 @@ static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, | |||
2464 | return current_tail == new_start; | 4539 | return current_tail == new_start; |
2465 | } | 4540 | } |
2466 | 4541 | ||
2467 | static int ocfs2_truncate_log_append(struct ocfs2_super *osb, | 4542 | int ocfs2_truncate_log_append(struct ocfs2_super *osb, |
2468 | handle_t *handle, | 4543 | handle_t *handle, |
2469 | u64 start_blk, | 4544 | u64 start_blk, |
2470 | unsigned int num_clusters) | 4545 | unsigned int num_clusters) |
2471 | { | 4546 | { |
2472 | int status, index; | 4547 | int status, index; |
2473 | unsigned int start_cluster, tl_count; | 4548 | unsigned int start_cluster, tl_count; |
@@ -2623,7 +4698,7 @@ bail: | |||
2623 | } | 4698 | } |
2624 | 4699 | ||
2625 | /* Expects you to already be holding tl_inode->i_mutex */ | 4700 | /* Expects you to already be holding tl_inode->i_mutex */ |
2626 | static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) | 4701 | int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) |
2627 | { | 4702 | { |
2628 | int status; | 4703 | int status; |
2629 | unsigned int num_to_flush; | 4704 | unsigned int num_to_flush; |
@@ -2957,6 +5032,219 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) | |||
2957 | return status; | 5032 | return status; |
2958 | } | 5033 | } |
2959 | 5034 | ||
5035 | /* | ||
5036 | * Delayed de-allocation of suballocator blocks. | ||
5037 | * | ||
5038 | * Some sets of block de-allocations might involve multiple suballocator inodes. | ||
5039 | * | ||
5040 | * The locking for this can get extremely complicated, especially when | ||
5041 | * the suballocator inodes to delete from aren't known until deep | ||
5042 | * within an unrelated codepath. | ||
5043 | * | ||
5044 | * ocfs2_extent_block structures are a good example of this - an inode | ||
5045 | * btree could have been grown by any number of nodes each allocating | ||
5046 | * out of their own suballoc inode. | ||
5047 | * | ||
5048 | * These structures allow the delay of block de-allocation until a | ||
5049 | * later time, when locking of multiple cluster inodes won't cause | ||
5050 | * deadlock. | ||
5051 | */ | ||
5052 | |||
5053 | /* | ||
5054 | * Describes a single block free from a suballocator | ||
5055 | */ | ||
5056 | struct ocfs2_cached_block_free { | ||
5057 | struct ocfs2_cached_block_free *free_next; | ||
5058 | u64 free_blk; | ||
5059 | unsigned int free_bit; | ||
5060 | }; | ||
5061 | |||
5062 | struct ocfs2_per_slot_free_list { | ||
5063 | struct ocfs2_per_slot_free_list *f_next_suballocator; | ||
5064 | int f_inode_type; | ||
5065 | int f_slot; | ||
5066 | struct ocfs2_cached_block_free *f_first; | ||
5067 | }; | ||
5068 | |||
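These two structures form a two-level list: one node per (inode type, slot) pair, each carrying a singly-linked chain of pending block frees. A standalone model of the shape, with userspace types standing in for the kernel ones:

	#include <stdlib.h>

	struct block_free {
		struct block_free *next;
		unsigned long long blkno;
		unsigned int bit;
	};

	struct slot_free_list {
		struct slot_free_list *next;
		int inode_type, slot;
		struct block_free *first;
	};

	/* Push a pending free onto the list for its suballocator slot. */
	static int push_free(struct slot_free_list *sl,
			     unsigned long long blkno, unsigned int bit)
	{
		struct block_free *item = malloc(sizeof(*item));

		if (!item)
			return -1;

		item->blkno = blkno;
		item->bit = bit;
		item->next = sl->first;
		sl->first = item;
		return 0;
	}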
5069 | static int ocfs2_free_cached_items(struct ocfs2_super *osb, | ||
5070 | int sysfile_type, | ||
5071 | int slot, | ||
5072 | struct ocfs2_cached_block_free *head) | ||
5073 | { | ||
5074 | int ret; | ||
5075 | u64 bg_blkno; | ||
5076 | handle_t *handle; | ||
5077 | struct inode *inode; | ||
5078 | struct buffer_head *di_bh = NULL; | ||
5079 | struct ocfs2_cached_block_free *tmp; | ||
5080 | |||
5081 | inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot); | ||
5082 | if (!inode) { | ||
5083 | ret = -EINVAL; | ||
5084 | mlog_errno(ret); | ||
5085 | goto out; | ||
5086 | } | ||
5087 | |||
5088 | mutex_lock(&inode->i_mutex); | ||
5089 | |||
5090 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | ||
5091 | if (ret) { | ||
5092 | mlog_errno(ret); | ||
5093 | goto out_mutex; | ||
5094 | } | ||
5095 | |||
5096 | handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); | ||
5097 | if (IS_ERR(handle)) { | ||
5098 | ret = PTR_ERR(handle); | ||
5099 | mlog_errno(ret); | ||
5100 | goto out_unlock; | ||
5101 | } | ||
5102 | |||
5103 | while (head) { | ||
5104 | bg_blkno = ocfs2_which_suballoc_group(head->free_blk, | ||
5105 | head->free_bit); | ||
5106 | mlog(0, "Free bit: (bit %u, blkno %llu)\n", | ||
5107 | head->free_bit, (unsigned long long)head->free_blk); | ||
5108 | |||
5109 | ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, | ||
5110 | head->free_bit, bg_blkno, 1); | ||
5111 | if (ret) { | ||
5112 | mlog_errno(ret); | ||
5113 | goto out_journal; | ||
5114 | } | ||
5115 | |||
5116 | ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); | ||
5117 | if (ret) { | ||
5118 | mlog_errno(ret); | ||
5119 | goto out_journal; | ||
5120 | } | ||
5121 | |||
5122 | tmp = head; | ||
5123 | head = head->free_next; | ||
5124 | kfree(tmp); | ||
5125 | } | ||
5126 | |||
5127 | out_journal: | ||
5128 | ocfs2_commit_trans(osb, handle); | ||
5129 | |||
5130 | out_unlock: | ||
5131 | ocfs2_meta_unlock(inode, 1); | ||
5132 | brelse(di_bh); | ||
5133 | out_mutex: | ||
5134 | mutex_unlock(&inode->i_mutex); | ||
5135 | iput(inode); | ||
5136 | out: | ||
5137 | while (head) { | ||
5138 | /* Premature exit may have left some dangling items. */ | ||
5139 | tmp = head; | ||
5140 | head = head->free_next; | ||
5141 | kfree(tmp); | ||
5142 | } | ||
5143 | |||
5144 | return ret; | ||
5145 | } | ||
5146 | |||
5147 | int ocfs2_run_deallocs(struct ocfs2_super *osb, | ||
5148 | struct ocfs2_cached_dealloc_ctxt *ctxt) | ||
5149 | { | ||
5150 | int ret = 0, ret2; | ||
5151 | struct ocfs2_per_slot_free_list *fl; | ||
5152 | |||
5153 | if (!ctxt) | ||
5154 | return 0; | ||
5155 | |||
5156 | while (ctxt->c_first_suballocator) { | ||
5157 | fl = ctxt->c_first_suballocator; | ||
5158 | |||
5159 | if (fl->f_first) { | ||
5160 | mlog(0, "Free items: (type %u, slot %d)\n", | ||
5161 | fl->f_inode_type, fl->f_slot); | ||
5162 | ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type, | ||
5163 | fl->f_slot, fl->f_first); | ||
5164 | if (ret2) | ||
5165 | mlog_errno(ret2); | ||
5166 | if (!ret) | ||
5167 | ret = ret2; | ||
5168 | } | ||
5169 | |||
5170 | ctxt->c_first_suballocator = fl->f_next_suballocator; | ||
5171 | kfree(fl); | ||
5172 | } | ||
5173 | |||
5174 | return ret; | ||
5175 | } | ||
5176 | |||
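Note the ret/ret2 pattern above: a failure while freeing one slot's items is remembered, but the remaining lists are still walked so their memory is reclaimed. The idiom in isolation:

	/* First-error-wins aggregation, as in ocfs2_run_deallocs(). */
	static int drain_all(int (*free_one)(int), int nr)
	{
		int ret = 0, ret2, i;

		for (i = 0; i < nr; i++) {
			ret2 = free_one(i);
			if (ret2 && !ret)
				ret = ret2;	/* keep the first error, keep draining */
		}
		return ret;
	}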
5177 | static struct ocfs2_per_slot_free_list * | ||
5178 | ocfs2_find_per_slot_free_list(int type, | ||
5179 | int slot, | ||
5180 | struct ocfs2_cached_dealloc_ctxt *ctxt) | ||
5181 | { | ||
5182 | struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; | ||
5183 | |||
5184 | while (fl) { | ||
5185 | if (fl->f_inode_type == type && fl->f_slot == slot) | ||
5186 | return fl; | ||
5187 | |||
5188 | fl = fl->f_next_suballocator; | ||
5189 | } | ||
5190 | |||
5191 | fl = kmalloc(sizeof(*fl), GFP_NOFS); | ||
5192 | if (fl) { | ||
5193 | fl->f_inode_type = type; | ||
5194 | fl->f_slot = slot; | ||
5195 | fl->f_first = NULL; | ||
5196 | fl->f_next_suballocator = ctxt->c_first_suballocator; | ||
5197 | |||
5198 | ctxt->c_first_suballocator = fl; | ||
5199 | } | ||
5200 | return fl; | ||
5201 | } | ||
5202 | |||
5203 | static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, | ||
5204 | int type, int slot, u64 blkno, | ||
5205 | unsigned int bit) | ||
5206 | { | ||
5207 | int ret; | ||
5208 | struct ocfs2_per_slot_free_list *fl; | ||
5209 | struct ocfs2_cached_block_free *item; | ||
5210 | |||
5211 | fl = ocfs2_find_per_slot_free_list(type, slot, ctxt); | ||
5212 | if (fl == NULL) { | ||
5213 | ret = -ENOMEM; | ||
5214 | mlog_errno(ret); | ||
5215 | goto out; | ||
5216 | } | ||
5217 | |||
5218 | item = kmalloc(sizeof(*item), GFP_NOFS); | ||
5219 | if (item == NULL) { | ||
5220 | ret = -ENOMEM; | ||
5221 | mlog_errno(ret); | ||
5222 | goto out; | ||
5223 | } | ||
5224 | |||
5225 | mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", | ||
5226 | type, slot, bit, (unsigned long long)blkno); | ||
5227 | |||
5228 | item->free_blk = blkno; | ||
5229 | item->free_bit = bit; | ||
5230 | item->free_next = fl->f_first; | ||
5231 | |||
5232 | fl->f_first = item; | ||
5233 | |||
5234 | ret = 0; | ||
5235 | out: | ||
5236 | return ret; | ||
5237 | } | ||
5238 | |||
5239 | static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, | ||
5240 | struct ocfs2_extent_block *eb) | ||
5241 | { | ||
5242 | return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, | ||
5243 | le16_to_cpu(eb->h_suballoc_slot), | ||
5244 | le64_to_cpu(eb->h_blkno), | ||
5245 | le16_to_cpu(eb->h_suballoc_bit)); | ||
5246 | } | ||
5247 | |||
2960 | /* This function will figure out whether the currently last extent | 5248 | /* This function will figure out whether the currently last extent |
2961 | * block will be deleted, and if it will, what the new last extent | 5249 | * block will be deleted, and if it will, what the new last extent |
2962 | * block will be so we can update its h_next_leaf_blk field, as well | 5250 | * block will be so we can update its h_next_leaf_blk field, as well
@@ -3238,27 +5526,10 @@ delete: | |||
3238 | BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); | 5526 | BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); |
3239 | BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); | 5527 | BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); |
3240 | 5528 | ||
3241 | if (le16_to_cpu(eb->h_suballoc_slot) == 0) { | 5529 | ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb); |
3242 | /* | 5530 | /* An error here is not fatal. */ |
3243 | * This code only understands how to | 5531 | if (ret < 0) |
3244 | * lock the suballocator in slot 0, | 5532 | mlog_errno(ret); |
3245 | * which is fine because allocation is | ||
3246 | * only ever done out of that | ||
3247 | * suballocator too. A future version | ||
3248 | * might change that however, so avoid | ||
3249 | * a free if we don't know how to | ||
3250 | * handle it. This way an fs incompat | ||
3251 | * bit will not be necessary. | ||
3252 | */ | ||
3253 | ret = ocfs2_free_extent_block(handle, | ||
3254 | tc->tc_ext_alloc_inode, | ||
3255 | tc->tc_ext_alloc_bh, | ||
3256 | eb); | ||
3257 | |||
3258 | /* An error here is not fatal. */ | ||
3259 | if (ret < 0) | ||
3260 | mlog_errno(ret); | ||
3261 | } | ||
3262 | } else { | 5533 | } else { |
3263 | deleted_eb = 0; | 5534 | deleted_eb = 0; |
3264 | } | 5535 | } |
@@ -3397,9 +5668,9 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) | |||
3397 | return ocfs2_journal_dirty_data(handle, bh); | 5668 | return ocfs2_journal_dirty_data(handle, bh); |
3398 | } | 5669 | } |
3399 | 5670 | ||
3400 | static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, | 5671 | static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, |
3401 | struct page **pages, int numpages, | 5672 | loff_t end, struct page **pages, |
3402 | u64 phys, handle_t *handle) | 5673 | int numpages, u64 phys, handle_t *handle) |
3403 | { | 5674 | { |
3404 | int i, ret, partial = 0; | 5675 | int i, ret, partial = 0; |
3405 | void *kaddr; | 5676 | void *kaddr; |
@@ -3412,26 +5683,14 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, | |||
3412 | if (numpages == 0) | 5683 | if (numpages == 0) |
3413 | goto out; | 5684 | goto out; |
3414 | 5685 | ||
3415 | from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */ | 5686 | to = PAGE_CACHE_SIZE; |
3416 | if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) { | ||
3417 | /* | ||
3418 | * Since 'from' has been capped to a value below page | ||
3419 | * size, this calculation won't be able to overflow | ||
3420 | * 'to' | ||
3421 | */ | ||
3422 | to = ocfs2_align_bytes_to_clusters(sb, from); | ||
3423 | |||
3424 | /* | ||
3425 | * The truncate tail in this case should never contain | ||
3426 | * more than one page at maximum. The loop below also | ||
3427 | * assumes this. | ||
3428 | */ | ||
3429 | BUG_ON(numpages != 1); | ||
3430 | } | ||
3431 | |||
3432 | for(i = 0; i < numpages; i++) { | 5687 | for(i = 0; i < numpages; i++) { |
3433 | page = pages[i]; | 5688 | page = pages[i]; |
3434 | 5689 | ||
5690 | from = start & (PAGE_CACHE_SIZE - 1); | ||
5691 | if ((end >> PAGE_CACHE_SHIFT) == page->index) | ||
5692 | to = end & (PAGE_CACHE_SIZE - 1); | ||
5693 | |||
3435 | BUG_ON(from > PAGE_CACHE_SIZE); | 5694 | BUG_ON(from > PAGE_CACHE_SIZE); |
3436 | BUG_ON(to > PAGE_CACHE_SIZE); | 5695 | BUG_ON(to > PAGE_CACHE_SIZE); |
3437 | 5696 | ||
@@ -3468,10 +5727,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, | |||
3468 | 5727 | ||
3469 | flush_dcache_page(page); | 5728 | flush_dcache_page(page); |
3470 | 5729 | ||
3471 | /* | 5730 | start = (page->index + 1) << PAGE_CACHE_SHIFT; |
3472 | * Every page after the 1st one should be completely zero'd. | ||
3473 | */ | ||
3474 | from = 0; | ||
3475 | } | 5731 | } |
3476 | out: | 5732 | out: |
3477 | if (pages) { | 5733 | if (pages) { |
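The rewritten loop zeroes an arbitrary byte range: the first page starts at start's in-page offset, only the page containing end is clipped, and every later page is zeroed from its first byte because start is advanced to the next page boundary. A userspace model of the offset arithmetic (4096-byte pages assumed in place of PAGE_CACHE_SIZE):

	#include <stdio.h>

	#define PG_SIZE  4096UL
	#define PG_SHIFT 12

	/* Print the [from, to) zeroed on each page of [start, end). */
	static void zero_offsets(unsigned long start, unsigned long end)
	{
		unsigned long index = start >> PG_SHIFT;
		unsigned long last = (end - 1) >> PG_SHIFT;

		for (; index <= last; index++) {
			unsigned long from = start & (PG_SIZE - 1);
			unsigned long to = PG_SIZE;

			if (index == (end >> PG_SHIFT))
				to = end & (PG_SIZE - 1);

			printf("page %lu: zero [%lu, %lu)\n", index, from, to);

			/* later pages start zeroing at offset 0 */
			start = (index + 1) << PG_SHIFT;
		}
	}

	int main(void)
	{
		zero_offsets(5000, 13000);	/* spans three pages */
		return 0;
	}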
@@ -3484,24 +5740,26 @@ out: | |||
3484 | } | 5740 | } |
3485 | } | 5741 | } |
3486 | 5742 | ||
3487 | static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages, | 5743 | static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, |
3488 | int *num, u64 *phys) | 5744 | struct page **pages, int *num, u64 *phys) |
3489 | { | 5745 | { |
3490 | int i, numpages = 0, ret = 0; | 5746 | int i, numpages = 0, ret = 0; |
3491 | unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize; | ||
3492 | unsigned int ext_flags; | 5747 | unsigned int ext_flags; |
3493 | struct super_block *sb = inode->i_sb; | 5748 | struct super_block *sb = inode->i_sb; |
3494 | struct address_space *mapping = inode->i_mapping; | 5749 | struct address_space *mapping = inode->i_mapping; |
3495 | unsigned long index; | 5750 | unsigned long index; |
3496 | u64 next_cluster_bytes; | 5751 | loff_t last_page_bytes; |
3497 | 5752 | ||
3498 | BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); | 5753 | BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); |
5754 | BUG_ON(start > end); | ||
3499 | 5755 | ||
3500 | /* Cluster boundary, so we don't need to grab any pages. */ | 5756 | if (start == end) |
3501 | if ((isize & (csize - 1)) == 0) | ||
3502 | goto out; | 5757 | goto out; |
3503 | 5758 | ||
3504 | ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, | 5759 | BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != |
5760 | (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); | ||
5761 | |||
5762 | ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits, | ||
3505 | phys, NULL, &ext_flags); | 5763 | phys, NULL, &ext_flags); |
3506 | if (ret) { | 5764 | if (ret) { |
3507 | mlog_errno(ret); | 5765 | mlog_errno(ret); |
@@ -3517,8 +5775,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page * | |||
3517 | if (ext_flags & OCFS2_EXT_UNWRITTEN) | 5775 | if (ext_flags & OCFS2_EXT_UNWRITTEN) |
3518 | goto out; | 5776 | goto out; |
3519 | 5777 | ||
3520 | next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); | 5778 | last_page_bytes = PAGE_ALIGN(end); |
3521 | index = isize >> PAGE_CACHE_SHIFT; | 5779 | index = start >> PAGE_CACHE_SHIFT; |
3522 | do { | 5780 | do { |
3523 | pages[numpages] = grab_cache_page(mapping, index); | 5781 | pages[numpages] = grab_cache_page(mapping, index); |
3524 | if (!pages[numpages]) { | 5782 | if (!pages[numpages]) { |
@@ -3529,7 +5787,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page * | |||
3529 | 5787 | ||
3530 | numpages++; | 5788 | numpages++; |
3531 | index++; | 5789 | index++; |
3532 | } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); | 5790 | } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT)); |
3533 | 5791 | ||
3534 | out: | 5792 | out: |
3535 | if (ret != 0) { | 5793 | if (ret != 0) { |
@@ -3558,11 +5816,10 @@ out: | |||
3558 | * otherwise block_write_full_page() will skip writeout of pages past | 5816 | * otherwise block_write_full_page() will skip writeout of pages past |
3559 | * i_size. The new_i_size parameter is passed for this reason. | 5817 | * i_size. The new_i_size parameter is passed for this reason. |
3560 | */ | 5818 | */ |
3561 | int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, | 5819 | int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, |
3562 | u64 new_i_size) | 5820 | u64 range_start, u64 range_end) |
3563 | { | 5821 | { |
3564 | int ret, numpages; | 5822 | int ret, numpages; |
3565 | loff_t endbyte; | ||
3566 | struct page **pages = NULL; | 5823 | struct page **pages = NULL; |
3567 | u64 phys; | 5824 | u64 phys; |
3568 | 5825 | ||
@@ -3581,7 +5838,8 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, | |||
3581 | goto out; | 5838 | goto out; |
3582 | } | 5839 | } |
3583 | 5840 | ||
3584 | ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); | 5841 | ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages, |
5842 | &numpages, &phys); | ||
3585 | if (ret) { | 5843 | if (ret) { |
3586 | mlog_errno(ret); | 5844 | mlog_errno(ret); |
3587 | goto out; | 5845 | goto out; |
@@ -3590,17 +5848,16 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, | |||
3590 | if (numpages == 0) | 5848 | if (numpages == 0) |
3591 | goto out; | 5849 | goto out; |
3592 | 5850 | ||
3593 | ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, | 5851 | ocfs2_zero_cluster_pages(inode, range_start, range_end, pages, |
3594 | handle); | 5852 | numpages, phys, handle); |
3595 | 5853 | ||
3596 | /* | 5854 | /* |
3597 | * Initiate writeout of the pages we zero'd here. We don't | 5855 | * Initiate writeout of the pages we zero'd here. We don't |
3598 | * wait on them - the truncate_inode_pages() call later will | 5856 | * wait on them - the truncate_inode_pages() call later will |
3599 | * do that for us. | 5857 | * do that for us. |
3600 | */ | 5858 | */ |
3601 | endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); | 5859 | ret = do_sync_mapping_range(inode->i_mapping, range_start, |
3602 | ret = do_sync_mapping_range(inode->i_mapping, new_i_size, | 5860 | range_end - 1, SYNC_FILE_RANGE_WRITE); |
3603 | endbyte - 1, SYNC_FILE_RANGE_WRITE); | ||
3604 | if (ret) | 5861 | if (ret) |
3605 | mlog_errno(ret); | 5862 | mlog_errno(ret); |
3606 | 5863 | ||
@@ -3631,8 +5888,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, | |||
3631 | 5888 | ||
3632 | mlog_entry_void(); | 5889 | mlog_entry_void(); |
3633 | 5890 | ||
3634 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
3635 | |||
3636 | new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, | 5891 | new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, |
3637 | i_size_read(inode)); | 5892 | i_size_read(inode)); |
3638 | 5893 | ||
@@ -3754,7 +6009,6 @@ start: | |||
3754 | goto start; | 6009 | goto start; |
3755 | 6010 | ||
3756 | bail: | 6011 | bail: |
3757 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
3758 | 6012 | ||
3759 | ocfs2_schedule_truncate_log_flush(osb, 1); | 6013 | ocfs2_schedule_truncate_log_flush(osb, 1); |
3760 | 6014 | ||
@@ -3764,6 +6018,8 @@ bail: | |||
3764 | if (handle) | 6018 | if (handle) |
3765 | ocfs2_commit_trans(osb, handle); | 6019 | ocfs2_commit_trans(osb, handle); |
3766 | 6020 | ||
6021 | ocfs2_run_deallocs(osb, &tc->tc_dealloc); | ||
6022 | |||
3767 | ocfs2_free_path(path); | 6023 | ocfs2_free_path(path); |
3768 | 6024 | ||
3769 | /* This will drop the ext_alloc cluster lock for us */ | 6025 | /* This will drop the ext_alloc cluster lock for us */ |
@@ -3774,23 +6030,18 @@ bail: | |||
3774 | } | 6030 | } |
3775 | 6031 | ||
3776 | /* | 6032 | /* |
3777 | * Expects the inode to already be locked. This will figure out which | 6033 | * Expects the inode to already be locked. |
3778 | * inodes need to be locked and will put them on the returned truncate | ||
3779 | * context. | ||
3780 | */ | 6034 | */ |
3781 | int ocfs2_prepare_truncate(struct ocfs2_super *osb, | 6035 | int ocfs2_prepare_truncate(struct ocfs2_super *osb, |
3782 | struct inode *inode, | 6036 | struct inode *inode, |
3783 | struct buffer_head *fe_bh, | 6037 | struct buffer_head *fe_bh, |
3784 | struct ocfs2_truncate_context **tc) | 6038 | struct ocfs2_truncate_context **tc) |
3785 | { | 6039 | { |
3786 | int status, metadata_delete, i; | 6040 | int status; |
3787 | unsigned int new_i_clusters; | 6041 | unsigned int new_i_clusters; |
3788 | struct ocfs2_dinode *fe; | 6042 | struct ocfs2_dinode *fe; |
3789 | struct ocfs2_extent_block *eb; | 6043 | struct ocfs2_extent_block *eb; |
3790 | struct ocfs2_extent_list *el; | ||
3791 | struct buffer_head *last_eb_bh = NULL; | 6044 | struct buffer_head *last_eb_bh = NULL; |
3792 | struct inode *ext_alloc_inode = NULL; | ||
3793 | struct buffer_head *ext_alloc_bh = NULL; | ||
3794 | 6045 | ||
3795 | mlog_entry_void(); | 6046 | mlog_entry_void(); |
3796 | 6047 | ||
@@ -3810,12 +6061,9 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, | |||
3810 | mlog_errno(status); | 6061 | mlog_errno(status); |
3811 | goto bail; | 6062 | goto bail; |
3812 | } | 6063 | } |
6064 | ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); | ||
3813 | 6065 | ||
3814 | metadata_delete = 0; | ||
3815 | if (fe->id2.i_list.l_tree_depth) { | 6066 | if (fe->id2.i_list.l_tree_depth) { |
3816 | /* If we have a tree, then the truncate may result in | ||
3817 | * metadata deletes. Figure this out from the | ||
3818 | * rightmost leaf block.*/ | ||
3819 | status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), | 6067 | status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), |
3820 | &last_eb_bh, OCFS2_BH_CACHED, inode); | 6068 | &last_eb_bh, OCFS2_BH_CACHED, inode); |
3821 | if (status < 0) { | 6069 | if (status < 0) { |
@@ -3830,43 +6078,10 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, | |||
3830 | status = -EIO; | 6078 | status = -EIO; |
3831 | goto bail; | 6079 | goto bail; |
3832 | } | 6080 | } |
3833 | el = &(eb->h_list); | ||
3834 | |||
3835 | i = 0; | ||
3836 | if (ocfs2_is_empty_extent(&el->l_recs[0])) | ||
3837 | i = 1; | ||
3838 | /* | ||
3839 | * XXX: Should we check that next_free_rec contains | ||
3840 | * the extent? | ||
3841 | */ | ||
3842 | if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters) | ||
3843 | metadata_delete = 1; | ||
3844 | } | 6081 | } |
3845 | 6082 | ||
3846 | (*tc)->tc_last_eb_bh = last_eb_bh; | 6083 | (*tc)->tc_last_eb_bh = last_eb_bh; |
3847 | 6084 | ||
3848 | if (metadata_delete) { | ||
3849 | mlog(0, "Will have to delete metadata for this trunc. " | ||
3850 | "locking allocator.\n"); | ||
3851 | ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0); | ||
3852 | if (!ext_alloc_inode) { | ||
3853 | status = -ENOMEM; | ||
3854 | mlog_errno(status); | ||
3855 | goto bail; | ||
3856 | } | ||
3857 | |||
3858 | mutex_lock(&ext_alloc_inode->i_mutex); | ||
3859 | (*tc)->tc_ext_alloc_inode = ext_alloc_inode; | ||
3860 | |||
3861 | status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1); | ||
3862 | if (status < 0) { | ||
3863 | mlog_errno(status); | ||
3864 | goto bail; | ||
3865 | } | ||
3866 | (*tc)->tc_ext_alloc_bh = ext_alloc_bh; | ||
3867 | (*tc)->tc_ext_alloc_locked = 1; | ||
3868 | } | ||
3869 | |||
3870 | status = 0; | 6085 | status = 0; |
3871 | bail: | 6086 | bail: |
3872 | if (status < 0) { | 6087 | if (status < 0) { |
@@ -3880,16 +6095,13 @@ bail: | |||
3880 | 6095 | ||
3881 | static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) | 6096 | static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) |
3882 | { | 6097 | { |
3883 | if (tc->tc_ext_alloc_inode) { | 6098 | /* |
3884 | if (tc->tc_ext_alloc_locked) | 6099 | * The caller is responsible for completing deallocation |
3885 | ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); | 6100 | * before freeing the context. |
3886 | 6101 | */ | |
3887 | mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex); | 6102 | if (tc->tc_dealloc.c_first_suballocator != NULL) |
3888 | iput(tc->tc_ext_alloc_inode); | 6103 | mlog(ML_NOTICE, |
3889 | } | 6104 | "Truncate completion has non-empty dealloc context\n"); |
3890 | |||
3891 | if (tc->tc_ext_alloc_bh) | ||
3892 | brelse(tc->tc_ext_alloc_bh); | ||
3893 | 6105 | ||
3894 | if (tc->tc_last_eb_bh) | 6106 | if (tc->tc_last_eb_bh) |
3895 | brelse(tc->tc_last_eb_bh); | 6107 | brelse(tc->tc_last_eb_bh); |
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index fbcb5934a081..990df48ae8d3 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h | |||
@@ -34,7 +34,17 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, | |||
34 | u32 cpos, | 34 | u32 cpos, |
35 | u64 start_blk, | 35 | u64 start_blk, |
36 | u32 new_clusters, | 36 | u32 new_clusters, |
37 | u8 flags, | ||
37 | struct ocfs2_alloc_context *meta_ac); | 38 | struct ocfs2_alloc_context *meta_ac); |
39 | struct ocfs2_cached_dealloc_ctxt; | ||
40 | int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, | ||
41 | handle_t *handle, u32 cpos, u32 len, u32 phys, | ||
42 | struct ocfs2_alloc_context *meta_ac, | ||
43 | struct ocfs2_cached_dealloc_ctxt *dealloc); | ||
44 | int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, | ||
45 | u32 cpos, u32 len, handle_t *handle, | ||
46 | struct ocfs2_alloc_context *meta_ac, | ||
47 | struct ocfs2_cached_dealloc_ctxt *dealloc); | ||
38 | int ocfs2_num_free_extents(struct ocfs2_super *osb, | 48 | int ocfs2_num_free_extents(struct ocfs2_super *osb, |
39 | struct inode *inode, | 49 | struct inode *inode, |
40 | struct ocfs2_dinode *fe); | 50 | struct ocfs2_dinode *fe); |
@@ -62,17 +72,41 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, | |||
62 | struct ocfs2_dinode **tl_copy); | 72 | struct ocfs2_dinode **tl_copy); |
63 | int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, | 73 | int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, |
64 | struct ocfs2_dinode *tl_copy); | 74 | struct ocfs2_dinode *tl_copy); |
75 | int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb); | ||
76 | int ocfs2_truncate_log_append(struct ocfs2_super *osb, | ||
77 | handle_t *handle, | ||
78 | u64 start_blk, | ||
79 | unsigned int num_clusters); | ||
80 | int __ocfs2_flush_truncate_log(struct ocfs2_super *osb); | ||
81 | |||
82 | /* | ||
83 | * Process-local structure which describes the block unlinks done | ||
84 | * during an operation. This is populated via | ||
85 | * ocfs2_cache_block_dealloc(). | ||
86 | * | ||
87 | * ocfs2_run_deallocs() should be called after the potentially | ||
88 | * de-allocating routines. No journal handles should be open, and most | ||
89 | * locks should have been dropped. | ||
90 | */ | ||
91 | struct ocfs2_cached_dealloc_ctxt { | ||
92 | struct ocfs2_per_slot_free_list *c_first_suballocator; | ||
93 | }; | ||
94 | static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) | ||
95 | { | ||
96 | c->c_first_suballocator = NULL; | ||
97 | } | ||
98 | int ocfs2_run_deallocs(struct ocfs2_super *osb, | ||
99 | struct ocfs2_cached_dealloc_ctxt *ctxt); | ||
65 | 100 | ||
66 | struct ocfs2_truncate_context { | 101 | struct ocfs2_truncate_context { |
67 | struct inode *tc_ext_alloc_inode; | 102 | struct ocfs2_cached_dealloc_ctxt tc_dealloc; |
68 | struct buffer_head *tc_ext_alloc_bh; | ||
69 | int tc_ext_alloc_locked; /* is it cluster locked? */ | 103 | int tc_ext_alloc_locked; /* is it cluster locked? */ |
70 | /* these get destroyed once it's passed to ocfs2_commit_truncate. */ | 104 | /* these get destroyed once it's passed to ocfs2_commit_truncate. */ |
71 | struct buffer_head *tc_last_eb_bh; | 105 | struct buffer_head *tc_last_eb_bh; |
72 | }; | 106 | }; |
73 | 107 | ||
74 | int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, | 108 | int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, |
75 | u64 new_i_size); | 109 | u64 range_start, u64 range_end); |
76 | int ocfs2_prepare_truncate(struct ocfs2_super *osb, | 110 | int ocfs2_prepare_truncate(struct ocfs2_super *osb, |
77 | struct inode *inode, | 111 | struct inode *inode, |
78 | struct buffer_head *fe_bh, | 112 | struct buffer_head *fe_bh, |
@@ -84,6 +118,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, | |||
84 | 118 | ||
85 | int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, | 119 | int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, |
86 | u32 cpos, struct buffer_head **leaf_bh); | 120 | u32 cpos, struct buffer_head **leaf_bh); |
121 | int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); | ||
87 | 122 | ||
88 | /* | 123 | /* |
89 | * Helper function to look at the # of clusters in an extent record. | 124 | * Helper function to look at the # of clusters in an extent record. |
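The dealloc declarations in this header imply a strict calling order: initialize the context, let the btree code (ocfs2_mark_extent_written(), ocfs2_remove_extent()) cache block unlinks into it while a journal handle is open, and free the blocks only once the handle is committed. A minimal caller sketch, with a hypothetical helper name and most error handling trimmed:

/*
 * Hypothetical caller, not part of this patch: illustrates the
 * init -> cache -> commit -> run ordering for the dealloc context.
 */
static int example_remove_range(struct ocfs2_super *osb, struct inode *inode,
				struct buffer_head *di_bh, handle_t *handle,
				u32 cpos, u32 len,
				struct ocfs2_alloc_context *meta_ac)
{
	int ret;
	struct ocfs2_cached_dealloc_ctxt dealloc;

	ocfs2_init_dealloc_ctxt(&dealloc);

	/* The tree manipulation only caches the unlinked blocks... */
	ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle,
				  meta_ac, &dealloc);
	if (ret)
		mlog_errno(ret);

	/* ...the actual frees run after the handle has been closed. */
	ocfs2_commit_trans(osb, handle);
	ocfs2_run_deallocs(osb, &dealloc);

	return ret;
}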
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a480b09c79b9..84bf6e79de23 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -684,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | |||
684 | bh = bh->b_this_page, block_start += bsize) { | 684 | bh = bh->b_this_page, block_start += bsize) { |
685 | block_end = block_start + bsize; | 685 | block_end = block_start + bsize; |
686 | 686 | ||
687 | clear_buffer_new(bh); | ||
688 | |||
687 | /* | 689 | /* |
688 | * Ignore blocks outside of our i/o range - | 690 | * Ignore blocks outside of our i/o range - |
689 | * they may belong to unallocated clusters. | 691 | * they may belong to unallocated clusters. |
@@ -698,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | |||
698 | * For an allocating write with cluster size >= page | 700 | * For an allocating write with cluster size >= page |
699 | * size, we always write the entire page. | 701 | * size, we always write the entire page. |
700 | */ | 702 | */ |
701 | 703 | if (new) | |
702 | if (buffer_new(bh)) | 704 | set_buffer_new(bh); |
703 | clear_buffer_new(bh); | ||
704 | 705 | ||
705 | if (!buffer_mapped(bh)) { | 706 | if (!buffer_mapped(bh)) { |
706 | map_bh(bh, inode->i_sb, *p_blkno); | 707 | map_bh(bh, inode->i_sb, *p_blkno); |
@@ -711,7 +712,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | |||
711 | if (!buffer_uptodate(bh)) | 712 | if (!buffer_uptodate(bh)) |
712 | set_buffer_uptodate(bh); | 713 | set_buffer_uptodate(bh); |
713 | } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && | 714 | } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && |
714 | (block_start < from || block_end > to)) { | 715 | !buffer_new(bh) && |
716 | (block_start < from || block_end > to)) { | ||
715 | ll_rw_block(READ, 1, &bh); | 717 | ll_rw_block(READ, 1, &bh); |
716 | *wait_bh++=bh; | 718 | *wait_bh++=bh; |
717 | } | 719 | } |
@@ -738,18 +740,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | |||
738 | bh = head; | 740 | bh = head; |
739 | block_start = 0; | 741 | block_start = 0; |
740 | do { | 742 | do { |
741 | void *kaddr; | ||
742 | |||
743 | block_end = block_start + bsize; | 743 | block_end = block_start + bsize; |
744 | if (block_end <= from) | 744 | if (block_end <= from) |
745 | goto next_bh; | 745 | goto next_bh; |
746 | if (block_start >= to) | 746 | if (block_start >= to) |
747 | break; | 747 | break; |
748 | 748 | ||
749 | kaddr = kmap_atomic(page, KM_USER0); | 749 | zero_user_page(page, block_start, bh->b_size, KM_USER0); |
750 | memset(kaddr+block_start, 0, bh->b_size); | ||
751 | flush_dcache_page(page); | ||
752 | kunmap_atomic(kaddr, KM_USER0); | ||
753 | set_buffer_uptodate(bh); | 750 | set_buffer_uptodate(bh); |
754 | mark_buffer_dirty(bh); | 751 | mark_buffer_dirty(bh); |
755 | 752 | ||
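The hunk above replaces the open-coded kmap_atomic()/memset()/flush_dcache_page()/kunmap_atomic() sequence with zero_user_page(). For reference, in kernels of this vintage the helper is roughly equivalent to the following sketch (see include/linux/highmem.h for the authoritative definition):

static inline void zero_user_page(struct page *page, unsigned offset,
				  unsigned size, int kmap_type)
{
	void *kaddr = kmap_atomic(page, kmap_type);

	memset(kaddr + offset, 0, size);
	flush_dcache_page(page);
	kunmap_atomic(kaddr, kmap_type);
}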
@@ -761,217 +758,240 @@ next_bh: | |||
761 | return ret; | 758 | return ret; |
762 | } | 759 | } |
763 | 760 | ||
761 | #if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) | ||
762 | #define OCFS2_MAX_CTXT_PAGES 1 | ||
763 | #else | ||
764 | #define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) | ||
765 | #endif | ||
766 | |||
767 | #define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) | ||
768 | |||
764 | /* | 769 | /* |
765 | * This will copy user data from the buffer page in the splice | 770 | * Describe the state of a single cluster to be written to. |
766 | * context. | ||
767 | * | ||
768 | * For now, we ignore SPLICE_F_MOVE as that would require some extra | ||
769 | * communication out all the way to ocfs2_write(). | ||
770 | */ | 771 | */ |
771 | int ocfs2_map_and_write_splice_data(struct inode *inode, | 772 | struct ocfs2_write_cluster_desc { |
772 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | 773 | u32 c_cpos; |
773 | unsigned int *ret_from, unsigned int *ret_to) | 774 | u32 c_phys; |
775 | /* | ||
776 | * c_new gets its own field because c_phys is eventually | ||
777 | * filled in, so it can't double as the "new" flag. | ||
778 | */ | ||
779 | unsigned c_new; | ||
780 | unsigned c_unwritten; | ||
781 | }; | ||
782 | |||
783 | static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d) | ||
774 | { | 784 | { |
775 | int ret; | 785 | return d->c_new || d->c_unwritten; |
776 | unsigned int to, from, cluster_start, cluster_end; | 786 | } |
777 | char *src, *dst; | ||
778 | struct ocfs2_splice_write_priv *sp = wc->w_private; | ||
779 | struct pipe_buffer *buf = sp->s_buf; | ||
780 | unsigned long bytes, src_from; | ||
781 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
782 | 787 | ||
783 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | 788 | struct ocfs2_write_ctxt { |
784 | &cluster_end); | 789 | /* Logical cluster position / len of write */ |
790 | u32 w_cpos; | ||
791 | u32 w_clen; | ||
785 | 792 | ||
786 | from = sp->s_offset; | 793 | struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; |
787 | src_from = sp->s_buf_offset; | ||
788 | bytes = wc->w_count; | ||
789 | 794 | ||
790 | if (wc->w_large_pages) { | 795 | /* |
791 | /* | 796 | * This is true if page_size > cluster_size. |
792 | * For cluster size < page size, we have to | 797 | * |
793 | * calculate pos within the cluster and obey | 798 | * It triggers a set of special cases during write which might |
794 | * the rightmost boundary. | 799 | * have to deal with allocating writes to partial pages. |
795 | */ | 800 | */ |
796 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | 801 | unsigned int w_large_pages; |
797 | - (wc->w_pos & (osb->s_clustersize - 1)))); | 802 | |
798 | } | 803 | /* |
799 | to = from + bytes; | 804 | * Pages involved in this write. |
805 | * | ||
806 | * w_target_page is the page being written to by the user. | ||
807 | * | ||
808 | * w_pages is an array of pages which always contains | ||
809 | * w_target_page, and in the case of an allocating write with | ||
810 | * page_size < cluster size, it will contain zero'd and mapped | ||
811 | * pages adjacent to w_target_page which need to be written | ||
812 | * out in so that future reads from that region will get | ||
813 | * zero's. | ||
814 | */ | ||
815 | struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; | ||
816 | unsigned int w_num_pages; | ||
817 | struct page *w_target_page; | ||
800 | 818 | ||
801 | BUG_ON(from > PAGE_CACHE_SIZE); | 819 | /* |
802 | BUG_ON(to > PAGE_CACHE_SIZE); | 820 | * ocfs2_write_end() uses this to know what the real range to |
803 | BUG_ON(from < cluster_start); | 821 | * write in the target should be. |
804 | BUG_ON(to > cluster_end); | 822 | */ |
823 | unsigned int w_target_from; | ||
824 | unsigned int w_target_to; | ||
805 | 825 | ||
806 | if (wc->w_this_page_new) | 826 | /* |
807 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | 827 | * We could use journal_current_handle() but this is cleaner, |
808 | cluster_start, cluster_end, 1); | 828 | * IMHO -Mark |
809 | else | 829 | */ |
810 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | 830 | handle_t *w_handle; |
811 | from, to, 0); | 831 | |
812 | if (ret) { | 832 | struct buffer_head *w_di_bh; |
813 | mlog_errno(ret); | 833 | |
814 | goto out; | 834 | struct ocfs2_cached_dealloc_ctxt w_dealloc; |
835 | }; | ||
836 | |||
837 | static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) | ||
838 | { | ||
839 | int i; | ||
840 | |||
841 | for(i = 0; i < wc->w_num_pages; i++) { | ||
842 | if (wc->w_pages[i] == NULL) | ||
843 | continue; | ||
844 | |||
845 | unlock_page(wc->w_pages[i]); | ||
846 | mark_page_accessed(wc->w_pages[i]); | ||
847 | page_cache_release(wc->w_pages[i]); | ||
815 | } | 848 | } |
816 | 849 | ||
817 | src = buf->ops->map(sp->s_pipe, buf, 1); | 850 | brelse(wc->w_di_bh); |
818 | dst = kmap_atomic(wc->w_this_page, KM_USER1); | 851 | kfree(wc); |
819 | memcpy(dst + from, src + src_from, bytes); | 852 | } |
820 | kunmap_atomic(wc->w_this_page, KM_USER1); | 853 | |
821 | buf->ops->unmap(sp->s_pipe, buf, src); | 854 | static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, |
855 | struct ocfs2_super *osb, loff_t pos, | ||
856 | unsigned len, struct buffer_head *di_bh) | ||
857 | { | ||
858 | struct ocfs2_write_ctxt *wc; | ||
859 | |||
860 | wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS); | ||
861 | if (!wc) | ||
862 | return -ENOMEM; | ||
822 | 863 | ||
823 | wc->w_finished_copy = 1; | 864 | wc->w_cpos = pos >> osb->s_clustersize_bits; |
865 | wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len); | ||
866 | get_bh(di_bh); | ||
867 | wc->w_di_bh = di_bh; | ||
824 | 868 | ||
825 | *ret_from = from; | 869 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) |
826 | *ret_to = to; | 870 | wc->w_large_pages = 1; |
827 | out: | 871 | else |
872 | wc->w_large_pages = 0; | ||
873 | |||
874 | ocfs2_init_dealloc_ctxt(&wc->w_dealloc); | ||
875 | |||
876 | *wcp = wc; | ||
828 | 877 | ||
829 | return bytes ? (unsigned int)bytes : ret; | 878 | return 0; |
830 | } | 879 | } |
831 | 880 | ||
832 | /* | 881 | /* |
833 | * This will copy user data from the iovec in the buffered write | 882 | * If a page has any new buffers, zero them out here, and mark them uptodate |
834 | * context. | 883 | * and dirty so they'll be written out (in order to prevent uninitialised |
884 | * block data from leaking). And clear the new bit. | ||
835 | */ | 885 | */ |
836 | int ocfs2_map_and_write_user_data(struct inode *inode, | 886 | static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) |
837 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | ||
838 | unsigned int *ret_from, unsigned int *ret_to) | ||
839 | { | 887 | { |
840 | int ret; | 888 | unsigned int block_start, block_end; |
841 | unsigned int to, from, cluster_start, cluster_end; | 889 | struct buffer_head *head, *bh; |
842 | unsigned long bytes, src_from; | ||
843 | char *dst; | ||
844 | struct ocfs2_buffered_write_priv *bp = wc->w_private; | ||
845 | const struct iovec *cur_iov = bp->b_cur_iov; | ||
846 | char __user *buf; | ||
847 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
848 | 890 | ||
849 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | 891 | BUG_ON(!PageLocked(page)); |
850 | &cluster_end); | 892 | if (!page_has_buffers(page)) |
893 | return; | ||
851 | 894 | ||
852 | buf = cur_iov->iov_base + bp->b_cur_off; | 895 | bh = head = page_buffers(page); |
853 | src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; | 896 | block_start = 0; |
897 | do { | ||
898 | block_end = block_start + bh->b_size; | ||
854 | 899 | ||
855 | from = wc->w_pos & (PAGE_CACHE_SIZE - 1); | 900 | if (buffer_new(bh)) { |
901 | if (block_end > from && block_start < to) { | ||
902 | if (!PageUptodate(page)) { | ||
903 | unsigned start, end; | ||
856 | 904 | ||
857 | /* | 905 | start = max(from, block_start); |
858 | * This is a lot of comparisons, but it reads quite | 906 | end = min(to, block_end); |
859 | * easily, which is important here. | ||
860 | */ | ||
861 | /* Stay within the src page */ | ||
862 | bytes = PAGE_SIZE - src_from; | ||
863 | /* Stay within the vector */ | ||
864 | bytes = min(bytes, | ||
865 | (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); | ||
866 | /* Stay within count */ | ||
867 | bytes = min(bytes, (unsigned long)wc->w_count); | ||
868 | /* | ||
869 | * For clustersize > page size, just stay within | ||
870 | * target page, otherwise we have to calculate pos | ||
871 | * within the cluster and obey the rightmost | ||
872 | * boundary. | ||
873 | */ | ||
874 | if (wc->w_large_pages) { | ||
875 | /* | ||
876 | * For cluster size < page size, we have to | ||
877 | * calculate pos within the cluster and obey | ||
878 | * the rightmost boundary. | ||
879 | */ | ||
880 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | ||
881 | - (wc->w_pos & (osb->s_clustersize - 1)))); | ||
882 | } else { | ||
883 | /* | ||
884 | * cluster size > page size is the most common | ||
885 | * case - we just stay within the target page | ||
886 | * boundary. | ||
887 | */ | ||
888 | bytes = min(bytes, PAGE_CACHE_SIZE - from); | ||
889 | } | ||
890 | 907 | ||
891 | to = from + bytes; | 908 | zero_user_page(page, start, end - start, KM_USER0); |
909 | set_buffer_uptodate(bh); | ||
910 | } | ||
892 | 911 | ||
893 | BUG_ON(from > PAGE_CACHE_SIZE); | 912 | clear_buffer_new(bh); |
894 | BUG_ON(to > PAGE_CACHE_SIZE); | 913 | mark_buffer_dirty(bh); |
895 | BUG_ON(from < cluster_start); | 914 | } |
896 | BUG_ON(to > cluster_end); | 915 | } |
897 | 916 | ||
898 | if (wc->w_this_page_new) | 917 | block_start = block_end; |
899 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | 918 | bh = bh->b_this_page; |
900 | cluster_start, cluster_end, 1); | 919 | } while (bh != head); |
901 | else | 920 | } |
902 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
903 | from, to, 0); | ||
904 | if (ret) { | ||
905 | mlog_errno(ret); | ||
906 | goto out; | ||
907 | } | ||
908 | 921 | ||
909 | dst = kmap(wc->w_this_page); | 922 | /* |
910 | memcpy(dst + from, bp->b_src_buf + src_from, bytes); | 923 | * Only called on a failure during an allocating write, to write |
911 | kunmap(wc->w_this_page); | 924 | * zeros to the newly allocated region. |
925 | */ | ||
926 | static void ocfs2_write_failure(struct inode *inode, | ||
927 | struct ocfs2_write_ctxt *wc, | ||
928 | loff_t user_pos, unsigned user_len) | ||
929 | { | ||
930 | int i; | ||
931 | unsigned from, to; | ||
932 | struct page *tmppage; | ||
912 | 933 | ||
913 | /* | 934 | ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len); |
914 | * XXX: This is slow, but simple. The caller of | ||
915 | * ocfs2_buffered_write_cluster() is responsible for | ||
916 | * passing through the iovecs, so it's difficult to | ||
917 | * predict what our next step is in here after our | ||
918 | * initial write. A future version should be pushing | ||
919 | * that iovec manipulation further down. | ||
920 | * | ||
921 | * By setting this, we indicate that a copy from user | ||
922 | * data was done, and subsequent calls for this | ||
923 | * cluster will skip copying more data. | ||
924 | */ | ||
925 | wc->w_finished_copy = 1; | ||
926 | 935 | ||
927 | *ret_from = from; | 936 | if (wc->w_large_pages) { |
928 | *ret_to = to; | 937 | from = wc->w_target_from; |
929 | out: | 938 | to = wc->w_target_to; |
939 | } else { | ||
940 | from = 0; | ||
941 | to = PAGE_CACHE_SIZE; | ||
942 | } | ||
943 | |||
944 | for(i = 0; i < wc->w_num_pages; i++) { | ||
945 | tmppage = wc->w_pages[i]; | ||
930 | 946 | ||
931 | return bytes ? (unsigned int)bytes : ret; | 947 | if (ocfs2_should_order_data(inode)) |
948 | walk_page_buffers(wc->w_handle, page_buffers(tmppage), | ||
949 | from, to, NULL, | ||
950 | ocfs2_journal_dirty_data); | ||
951 | |||
952 | block_commit_write(tmppage, from, to); | ||
953 | } | ||
932 | } | 954 | } |
933 | 955 | ||
934 | /* | 956 | static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, |
935 | * Map, fill and write a page to disk. | 957 | struct ocfs2_write_ctxt *wc, |
936 | * | 958 | struct page *page, u32 cpos, |
937 | * The work of copying data is done via callback. Newly allocated | 959 | loff_t user_pos, unsigned user_len, |
938 | * pages which don't take user data will be zero'd (set 'new' to | 960 | int new) |
939 | * indicate an allocating write) | ||
940 | * | ||
941 | * Returns a negative error code or the number of bytes copied into | ||
942 | * the page. | ||
943 | */ | ||
944 | static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | ||
945 | u64 *p_blkno, struct page *page, | ||
946 | struct ocfs2_write_ctxt *wc, int new) | ||
947 | { | 961 | { |
948 | int ret, copied = 0; | 962 | int ret; |
949 | unsigned int from = 0, to = 0; | 963 | unsigned int map_from = 0, map_to = 0; |
950 | unsigned int cluster_start, cluster_end; | 964 | unsigned int cluster_start, cluster_end; |
951 | unsigned int zero_from = 0, zero_to = 0; | 965 | unsigned int user_data_from = 0, user_data_to = 0; |
952 | 966 | ||
953 | ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, | 967 | ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, |
954 | &cluster_start, &cluster_end); | 968 | &cluster_start, &cluster_end); |
955 | 969 | ||
956 | if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index | 970 | if (page == wc->w_target_page) { |
957 | && !wc->w_finished_copy) { | 971 | map_from = user_pos & (PAGE_CACHE_SIZE - 1); |
958 | 972 | map_to = map_from + user_len; | |
959 | wc->w_this_page = page; | 973 | |
960 | wc->w_this_page_new = new; | 974 | if (new) |
961 | ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); | 975 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, |
962 | if (ret < 0) { | 976 | cluster_start, cluster_end, |
977 | new); | ||
978 | else | ||
979 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, | ||
980 | map_from, map_to, new); | ||
981 | if (ret) { | ||
963 | mlog_errno(ret); | 982 | mlog_errno(ret); |
964 | goto out; | 983 | goto out; |
965 | } | 984 | } |
966 | 985 | ||
967 | copied = ret; | 986 | user_data_from = map_from; |
968 | 987 | user_data_to = map_to; | |
969 | zero_from = from; | ||
970 | zero_to = to; | ||
971 | if (new) { | 988 | if (new) { |
972 | from = cluster_start; | 989 | map_from = cluster_start; |
973 | to = cluster_end; | 990 | map_to = cluster_end; |
974 | } | 991 | } |
992 | |||
993 | wc->w_target_from = map_from; | ||
994 | wc->w_target_to = map_to; | ||
975 | } else { | 995 | } else { |
976 | /* | 996 | /* |
977 | * If we haven't allocated the new page yet, we | 997 | * If we haven't allocated the new page yet, we |
@@ -980,11 +1000,11 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | |||
980 | */ | 1000 | */ |
981 | BUG_ON(!new); | 1001 | BUG_ON(!new); |
982 | 1002 | ||
983 | from = cluster_start; | 1003 | map_from = cluster_start; |
984 | to = cluster_end; | 1004 | map_to = cluster_end; |
985 | 1005 | ||
986 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, | 1006 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, |
987 | cluster_start, cluster_end, 1); | 1007 | cluster_start, cluster_end, new); |
988 | if (ret) { | 1008 | if (ret) { |
989 | mlog_errno(ret); | 1009 | mlog_errno(ret); |
990 | goto out; | 1010 | goto out; |
@@ -1003,108 +1023,113 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | |||
1003 | */ | 1023 | */ |
1004 | if (new && !PageUptodate(page)) | 1024 | if (new && !PageUptodate(page)) |
1005 | ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), | 1025 | ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), |
1006 | wc->w_cpos, zero_from, zero_to); | 1026 | cpos, user_data_from, user_data_to); |
1007 | 1027 | ||
1008 | flush_dcache_page(page); | 1028 | flush_dcache_page(page); |
1009 | 1029 | ||
1010 | if (ocfs2_should_order_data(inode)) { | ||
1011 | ret = walk_page_buffers(handle, | ||
1012 | page_buffers(page), | ||
1013 | from, to, NULL, | ||
1014 | ocfs2_journal_dirty_data); | ||
1015 | if (ret < 0) | ||
1016 | mlog_errno(ret); | ||
1017 | } | ||
1018 | |||
1019 | /* | ||
1020 | * We don't use generic_commit_write() because we need to | ||
1021 | * handle our own i_size update. | ||
1022 | */ | ||
1023 | ret = block_commit_write(page, from, to); | ||
1024 | if (ret) | ||
1025 | mlog_errno(ret); | ||
1026 | out: | 1030 | out: |
1027 | 1031 | return ret; | |
1028 | return copied ? copied : ret; | ||
1029 | } | 1032 | } |
1030 | 1033 | ||
1031 | /* | 1034 | /* |
1032 | * Do the actual write of some data into an inode. Optionally allocate | 1035 | * This function will only grab one cluster's worth of pages. |
1033 | * in order to fulfill the write. | ||
1034 | * | ||
1035 | * cpos is the logical cluster offset within the file to write at | ||
1036 | * | ||
1037 | * 'phys' is the physical mapping of that offset. a 'phys' value of | ||
1038 | * zero indicates that allocation is required. In this case, data_ac | ||
1039 | * and meta_ac should be valid (meta_ac can be null if metadata | ||
1040 | * allocation isn't required). | ||
1041 | */ | 1036 | */ |
1042 | static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, | 1037 | static int ocfs2_grab_pages_for_write(struct address_space *mapping, |
1043 | struct buffer_head *di_bh, | 1038 | struct ocfs2_write_ctxt *wc, |
1044 | struct ocfs2_alloc_context *data_ac, | 1039 | u32 cpos, loff_t user_pos, int new, |
1045 | struct ocfs2_alloc_context *meta_ac, | 1040 | struct page *mmap_page) |
1046 | struct ocfs2_write_ctxt *wc) | ||
1047 | { | 1041 | { |
1048 | int ret, i, numpages = 1, new; | 1042 | int ret = 0, i; |
1049 | unsigned int copied = 0; | 1043 | unsigned long start, target_index, index; |
1050 | u32 tmp_pos; | ||
1051 | u64 v_blkno, p_blkno; | ||
1052 | struct address_space *mapping = file->f_mapping; | ||
1053 | struct inode *inode = mapping->host; | 1044 | struct inode *inode = mapping->host; |
1054 | unsigned long index, start; | ||
1055 | struct page **cpages; | ||
1056 | 1045 | ||
1057 | new = phys == 0 ? 1 : 0; | 1046 | target_index = user_pos >> PAGE_CACHE_SHIFT; |
1058 | 1047 | ||
1059 | /* | 1048 | /* |
1060 | * Figure out how many pages we'll be manipulating here. For | 1049 | * Figure out how many pages we'll be manipulating here. For |
1061 | * non allocating write, we just change the one | 1050 | * non allocating write, we just change the one |
1062 | * page. Otherwise, we'll need a whole cluster's worth. | 1051 | * page. Otherwise, we'll need a whole cluster's worth. |
1063 | */ | 1052 | */ |
1064 | if (new) | ||
1065 | numpages = ocfs2_pages_per_cluster(inode->i_sb); | ||
1066 | |||
1067 | cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); | ||
1068 | if (!cpages) { | ||
1069 | ret = -ENOMEM; | ||
1070 | mlog_errno(ret); | ||
1071 | return ret; | ||
1072 | } | ||
1073 | |||
1074 | /* | ||
1075 | * Fill our page array first. That way we've grabbed enough so | ||
1076 | * that we can zero and flush if we error after adding the | ||
1077 | * extent. | ||
1078 | */ | ||
1079 | if (new) { | 1053 | if (new) { |
1080 | start = ocfs2_align_clusters_to_page_index(inode->i_sb, | 1054 | wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); |
1081 | wc->w_cpos); | 1055 | start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); |
1082 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); | ||
1083 | } else { | 1056 | } else { |
1084 | start = wc->w_pos >> PAGE_CACHE_SHIFT; | 1057 | wc->w_num_pages = 1; |
1085 | v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; | 1058 | start = target_index; |
1086 | } | 1059 | } |
1087 | 1060 | ||
1088 | for(i = 0; i < numpages; i++) { | 1061 | for(i = 0; i < wc->w_num_pages; i++) { |
1089 | index = start + i; | 1062 | index = start + i; |
1090 | 1063 | ||
1091 | cpages[i] = find_or_create_page(mapping, index, GFP_NOFS); | 1064 | if (index == target_index && mmap_page) { |
1092 | if (!cpages[i]) { | 1065 | /* |
1093 | ret = -ENOMEM; | 1066 | * ocfs2_page_mkwrite() is a little different |
1094 | mlog_errno(ret); | 1067 | * and wants us to directly use the page |
1095 | goto out; | 1068 | * passed in. |
1069 | */ | ||
1070 | lock_page(mmap_page); | ||
1071 | |||
1072 | if (mmap_page->mapping != mapping) { | ||
1073 | unlock_page(mmap_page); | ||
1074 | /* | ||
1075 | * Sanity check - the locking in | ||
1076 | * ocfs2_page_mkwrite() should ensure | ||
1077 | * that this code doesn't trigger. | ||
1078 | */ | ||
1079 | ret = -EINVAL; | ||
1080 | mlog_errno(ret); | ||
1081 | goto out; | ||
1082 | } | ||
1083 | |||
1084 | page_cache_get(mmap_page); | ||
1085 | wc->w_pages[i] = mmap_page; | ||
1086 | } else { | ||
1087 | wc->w_pages[i] = find_or_create_page(mapping, index, | ||
1088 | GFP_NOFS); | ||
1089 | if (!wc->w_pages[i]) { | ||
1090 | ret = -ENOMEM; | ||
1091 | mlog_errno(ret); | ||
1092 | goto out; | ||
1093 | } | ||
1096 | } | 1094 | } |
1095 | |||
1096 | if (index == target_index) | ||
1097 | wc->w_target_page = wc->w_pages[i]; | ||
1097 | } | 1098 | } |
1099 | out: | ||
1100 | return ret; | ||
1101 | } | ||
1102 | |||
1103 | /* | ||
1104 | * Prepare a single cluster for writing into the file. | ||
1105 | */ | ||
1106 | static int ocfs2_write_cluster(struct address_space *mapping, | ||
1107 | u32 phys, unsigned int unwritten, | ||
1108 | struct ocfs2_alloc_context *data_ac, | ||
1109 | struct ocfs2_alloc_context *meta_ac, | ||
1110 | struct ocfs2_write_ctxt *wc, u32 cpos, | ||
1111 | loff_t user_pos, unsigned user_len) | ||
1112 | { | ||
1113 | int ret, i, new, should_zero = 0; | ||
1114 | u64 v_blkno, p_blkno; | ||
1115 | struct inode *inode = mapping->host; | ||
1116 | |||
1117 | new = phys == 0 ? 1 : 0; | ||
1118 | if (new || unwritten) | ||
1119 | should_zero = 1; | ||
1098 | 1120 | ||
1099 | if (new) { | 1121 | if (new) { |
1122 | u32 tmp_pos; | ||
1123 | |||
1100 | /* | 1124 | /* |
1101 | * This is safe to call with the page locks - it won't take | 1125 | * This is safe to call with the page locks - it won't take |
1102 | * any additional semaphores or cluster locks. | 1126 | * any additional semaphores or cluster locks. |
1103 | */ | 1127 | */ |
1104 | tmp_pos = wc->w_cpos; | 1128 | tmp_pos = cpos; |
1105 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, | 1129 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, |
1106 | &tmp_pos, 1, di_bh, handle, | 1130 | &tmp_pos, 1, 0, wc->w_di_bh, |
1107 | data_ac, meta_ac, NULL); | 1131 | wc->w_handle, data_ac, |
1132 | meta_ac, NULL); | ||
1108 | /* | 1133 | /* |
1109 | * This shouldn't happen because we must have already | 1134 | * This shouldn't happen because we must have already |
1110 | * calculated the correct meta data allocation required. The | 1135 | * calculated the correct meta data allocation required. The |
@@ -1121,159 +1146,433 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, | |||
1121 | mlog_errno(ret); | 1146 | mlog_errno(ret); |
1122 | goto out; | 1147 | goto out; |
1123 | } | 1148 | } |
1149 | } else if (unwritten) { | ||
1150 | ret = ocfs2_mark_extent_written(inode, wc->w_di_bh, | ||
1151 | wc->w_handle, cpos, 1, phys, | ||
1152 | meta_ac, &wc->w_dealloc); | ||
1153 | if (ret < 0) { | ||
1154 | mlog_errno(ret); | ||
1155 | goto out; | ||
1156 | } | ||
1124 | } | 1157 | } |
1125 | 1158 | ||
1159 | if (should_zero) | ||
1160 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); | ||
1161 | else | ||
1162 | v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; | ||
1163 | |||
1164 | /* | ||
1165 | * The only reason this should fail is due to an inability to | ||
1166 | * find the extent added. | ||
1167 | */ | ||
1126 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, | 1168 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, |
1127 | NULL); | 1169 | NULL); |
1128 | if (ret < 0) { | 1170 | if (ret < 0) { |
1129 | 1171 | ocfs2_error(inode->i_sb, "Corrupting extent for inode %llu, " |
1130 | /* | 1172 | "at logical block %llu", |
1131 | * XXX: Should we go readonly here? | 1173 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
1132 | */ | 1174 | (unsigned long long)v_blkno); |
1133 | |||
1134 | mlog_errno(ret); | ||
1135 | goto out; | 1175 | goto out; |
1136 | } | 1176 | } |
1137 | 1177 | ||
1138 | BUG_ON(p_blkno == 0); | 1178 | BUG_ON(p_blkno == 0); |
1139 | 1179 | ||
1140 | for(i = 0; i < numpages; i++) { | 1180 | for(i = 0; i < wc->w_num_pages; i++) { |
1141 | ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], | 1181 | int tmpret; |
1142 | wc, new); | 1182 | |
1143 | if (ret < 0) { | 1183 | tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, |
1144 | mlog_errno(ret); | 1184 | wc->w_pages[i], cpos, |
1145 | goto out; | 1185 | user_pos, user_len, |
1186 | should_zero); | ||
1187 | if (tmpret) { | ||
1188 | mlog_errno(tmpret); | ||
1189 | if (ret == 0) | ||
1190 | ret = tmpret; | ||
1146 | } | 1191 | } |
1147 | |||
1148 | copied += ret; | ||
1149 | } | 1192 | } |
1150 | 1193 | ||
1194 | /* | ||
1195 | * We only have cleanup to do in the case of an allocating write. | ||
1196 | */ | ||
1197 | if (ret && new) | ||
1198 | ocfs2_write_failure(inode, wc, user_pos, user_len); | ||
1199 | |||
1151 | out: | 1200 | out: |
1152 | for(i = 0; i < numpages; i++) { | 1201 | |
1153 | unlock_page(cpages[i]); | 1202 | return ret; |
1154 | mark_page_accessed(cpages[i]); | 1203 | } |
1155 | page_cache_release(cpages[i]); | 1204 | |
1205 | static int ocfs2_write_cluster_by_desc(struct address_space *mapping, | ||
1206 | struct ocfs2_alloc_context *data_ac, | ||
1207 | struct ocfs2_alloc_context *meta_ac, | ||
1208 | struct ocfs2_write_ctxt *wc, | ||
1209 | loff_t pos, unsigned len) | ||
1210 | { | ||
1211 | int ret, i; | ||
1212 | struct ocfs2_write_cluster_desc *desc; | ||
1213 | |||
1214 | for (i = 0; i < wc->w_clen; i++) { | ||
1215 | desc = &wc->w_desc[i]; | ||
1216 | |||
1217 | ret = ocfs2_write_cluster(mapping, desc->c_phys, | ||
1218 | desc->c_unwritten, data_ac, meta_ac, | ||
1219 | wc, desc->c_cpos, pos, len); | ||
1220 | if (ret) { | ||
1221 | mlog_errno(ret); | ||
1222 | goto out; | ||
1223 | } | ||
1156 | } | 1224 | } |
1157 | kfree(cpages); | ||
1158 | 1225 | ||
1159 | return copied ? copied : ret; | 1226 | ret = 0; |
1227 | out: | ||
1228 | return ret; | ||
1160 | } | 1229 | } |
1161 | 1230 | ||
1162 | static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, | 1231 | /* |
1163 | struct ocfs2_super *osb, loff_t pos, | 1232 | * ocfs2_write_end() wants to know which parts of the target page it |
1164 | size_t count, ocfs2_page_writer *cb, | 1233 | * should complete the write on. It's easiest to compute them ahead of |
1165 | void *cb_priv) | 1234 | * time when a more complete view of the write is available. |
1235 | */ | ||
1236 | static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, | ||
1237 | struct ocfs2_write_ctxt *wc, | ||
1238 | loff_t pos, unsigned len, int alloc) | ||
1166 | { | 1239 | { |
1167 | wc->w_count = count; | 1240 | struct ocfs2_write_cluster_desc *desc; |
1168 | wc->w_pos = pos; | ||
1169 | wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; | ||
1170 | wc->w_finished_copy = 0; | ||
1171 | 1241 | ||
1172 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) | 1242 | wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); |
1173 | wc->w_large_pages = 1; | 1243 | wc->w_target_to = wc->w_target_from + len; |
1174 | else | ||
1175 | wc->w_large_pages = 0; | ||
1176 | 1244 | ||
1177 | wc->w_write_data_page = cb; | 1245 | if (alloc == 0) |
1178 | wc->w_private = cb_priv; | 1246 | return; |
1247 | |||
1248 | /* | ||
1249 | * Allocating write - we may have different boundaries based | ||
1250 | * on page size and cluster size. | ||
1251 | * | ||
1252 | * NOTE: We can no longer compute one value from the other as | ||
1253 | * the actual write length and user provided length may be | ||
1254 | * different. | ||
1255 | */ | ||
1256 | |||
1257 | if (wc->w_large_pages) { | ||
1258 | /* | ||
1259 | * We only care about the 1st and last cluster within | ||
1260 | * our range and whether they should be zero'd or not. Either | ||
1261 | * value may be extended out to the start/end of a | ||
1262 | * newly allocated cluster. | ||
1263 | */ | ||
1264 | desc = &wc->w_desc[0]; | ||
1265 | if (ocfs2_should_zero_cluster(desc)) | ||
1266 | ocfs2_figure_cluster_boundaries(osb, | ||
1267 | desc->c_cpos, | ||
1268 | &wc->w_target_from, | ||
1269 | NULL); | ||
1270 | |||
1271 | desc = &wc->w_desc[wc->w_clen - 1]; | ||
1272 | if (ocfs2_should_zero_cluster(desc)) | ||
1273 | ocfs2_figure_cluster_boundaries(osb, | ||
1274 | desc->c_cpos, | ||
1275 | NULL, | ||
1276 | &wc->w_target_to); | ||
1277 | } else { | ||
1278 | wc->w_target_from = 0; | ||
1279 | wc->w_target_to = PAGE_CACHE_SIZE; | ||
1280 | } | ||
1179 | } | 1281 | } |
1180 | 1282 | ||
1181 | /* | 1283 | /* |
1182 | * Write a cluster to an inode. The cluster may not be allocated yet, | 1284 | * Populate each single-cluster write descriptor in the write context |
1183 | * in which case it will be. This only exists for buffered writes - | 1285 | * with information about the i/o to be done. |
1184 | * O_DIRECT takes a more "traditional" path through the kernel. | ||
1185 | * | ||
1186 | * The caller is responsible for incrementing pos, written counts, etc | ||
1187 | * | 1286 | * |
1188 | * For file systems that don't support sparse files, pre-allocation | 1287 | * Returns the number of clusters that will have to be allocated, as |
1189 | * and page zeroing up until cpos should be done prior to this | 1288 | * well as a worst case estimate of the number of extent records that |
1190 | * function call. | 1289 | * would have to be created during a write to an unwritten region. |
1191 | * | ||
1192 | * Callers should be holding i_sem, and the rw cluster lock. | ||
1193 | * | ||
1194 | * Returns the number of user bytes written, or less than zero for | ||
1195 | * error. | ||
1196 | */ | 1290 | */ |
1197 | ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | 1291 | static int ocfs2_populate_write_desc(struct inode *inode, |
1198 | size_t count, ocfs2_page_writer *actor, | 1292 | struct ocfs2_write_ctxt *wc, |
1199 | void *priv) | 1293 | unsigned int *clusters_to_alloc, |
1294 | unsigned int *extents_to_split) | ||
1295 | { | ||
1296 | int ret; | ||
1297 | struct ocfs2_write_cluster_desc *desc; | ||
1298 | unsigned int num_clusters = 0; | ||
1299 | unsigned int ext_flags = 0; | ||
1300 | u32 phys = 0; | ||
1301 | int i; | ||
1302 | |||
1303 | *clusters_to_alloc = 0; | ||
1304 | *extents_to_split = 0; | ||
1305 | |||
1306 | for (i = 0; i < wc->w_clen; i++) { | ||
1307 | desc = &wc->w_desc[i]; | ||
1308 | desc->c_cpos = wc->w_cpos + i; | ||
1309 | |||
1310 | if (num_clusters == 0) { | ||
1311 | /* | ||
1312 | * Need to look up the next extent record. | ||
1313 | */ | ||
1314 | ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, | ||
1315 | &num_clusters, &ext_flags); | ||
1316 | if (ret) { | ||
1317 | mlog_errno(ret); | ||
1318 | goto out; | ||
1319 | } | ||
1320 | |||
1321 | /* | ||
1322 | * Assume worst case - that we're writing in | ||
1323 | * the middle of the extent. | ||
1324 | * | ||
1325 | * We can assume that the write proceeds from | ||
1326 | * left to right, in which case the extent | ||
1327 | * insert code is smart enough to coalesce the | ||
1328 | * next splits into the previously created records. | ||
1329 | */ | ||
1330 | if (ext_flags & OCFS2_EXT_UNWRITTEN) | ||
1331 | *extents_to_split = *extents_to_split + 2; | ||
1332 | } else if (phys) { | ||
1333 | /* | ||
1334 | * Only increment phys if it doesn't describe | ||
1335 | * a hole. | ||
1336 | */ | ||
1337 | phys++; | ||
1338 | } | ||
1339 | |||
1340 | desc->c_phys = phys; | ||
1341 | if (phys == 0) { | ||
1342 | desc->c_new = 1; | ||
1343 | *clusters_to_alloc = *clusters_to_alloc + 1; | ||
1344 | } | ||
1345 | if (ext_flags & OCFS2_EXT_UNWRITTEN) | ||
1346 | desc->c_unwritten = 1; | ||
1347 | |||
1348 | num_clusters--; | ||
1349 | } | ||
1350 | |||
1351 | ret = 0; | ||
1352 | out: | ||
1353 | return ret; | ||
1354 | } | ||
1355 | |||
1356 | int ocfs2_write_begin_nolock(struct address_space *mapping, | ||
1357 | loff_t pos, unsigned len, unsigned flags, | ||
1358 | struct page **pagep, void **fsdata, | ||
1359 | struct buffer_head *di_bh, struct page *mmap_page) | ||
1200 | { | 1360 | { |
1201 | int ret, credits = OCFS2_INODE_UPDATE_CREDITS; | 1361 | int ret, credits = OCFS2_INODE_UPDATE_CREDITS; |
1202 | ssize_t written = 0; | 1362 | unsigned int clusters_to_alloc, extents_to_split; |
1203 | u32 phys; | 1363 | struct ocfs2_write_ctxt *wc; |
1204 | struct inode *inode = file->f_mapping->host; | 1364 | struct inode *inode = mapping->host; |
1205 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1365 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1206 | struct buffer_head *di_bh = NULL; | ||
1207 | struct ocfs2_dinode *di; | 1366 | struct ocfs2_dinode *di; |
1208 | struct ocfs2_alloc_context *data_ac = NULL; | 1367 | struct ocfs2_alloc_context *data_ac = NULL; |
1209 | struct ocfs2_alloc_context *meta_ac = NULL; | 1368 | struct ocfs2_alloc_context *meta_ac = NULL; |
1210 | handle_t *handle; | 1369 | handle_t *handle; |
1211 | struct ocfs2_write_ctxt wc; | ||
1212 | |||
1213 | ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); | ||
1214 | 1370 | ||
1215 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | 1371 | ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); |
1216 | if (ret) { | 1372 | if (ret) { |
1217 | mlog_errno(ret); | 1373 | mlog_errno(ret); |
1218 | goto out; | 1374 | return ret; |
1219 | } | 1375 | } |
1220 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
1221 | |||
1222 | /* | ||
1223 | * Take alloc sem here to prevent concurrent lookups. That way | ||
1224 | * the mapping, zeroing and tree manipulation within | ||
1225 | * ocfs2_write() will be safe against ->readpage(). This | ||
1226 | * should also serve to lock out allocation from a shared | ||
1227 | * writeable region. | ||
1228 | */ | ||
1229 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1230 | 1376 | ||
1231 | ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); | 1377 | ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, |
1378 | &extents_to_split); | ||
1232 | if (ret) { | 1379 | if (ret) { |
1233 | mlog_errno(ret); | 1380 | mlog_errno(ret); |
1234 | goto out_meta; | 1381 | goto out; |
1235 | } | 1382 | } |
1236 | 1383 | ||
1237 | /* phys == 0 means that allocation is required. */ | 1384 | di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; |
1238 | if (phys == 0) { | 1385 | |
1239 | ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); | 1386 | /* |
1387 | * We set w_target_from, w_target_to here so that | ||
1388 | * ocfs2_write_end() knows which range in the target page to | ||
1389 | * write out. An allocation requires that we write the entire | ||
1390 | * cluster range. | ||
1391 | */ | ||
1392 | if (clusters_to_alloc || extents_to_split) { | ||
1393 | /* | ||
1394 | * XXX: We are stretching the limits of | ||
1395 | * ocfs2_lock_allocators(). It greatly over-estimates | ||
1396 | * the work to be done. | ||
1397 | */ | ||
1398 | ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, | ||
1399 | extents_to_split, &data_ac, &meta_ac); | ||
1240 | if (ret) { | 1400 | if (ret) { |
1241 | mlog_errno(ret); | 1401 | mlog_errno(ret); |
1242 | goto out_meta; | 1402 | goto out; |
1243 | } | 1403 | } |
1244 | 1404 | ||
1245 | credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); | 1405 | credits = ocfs2_calc_extend_credits(inode->i_sb, di, |
1246 | } | 1406 | clusters_to_alloc); |
1247 | 1407 | ||
1248 | ret = ocfs2_data_lock(inode, 1); | ||
1249 | if (ret) { | ||
1250 | mlog_errno(ret); | ||
1251 | goto out_meta; | ||
1252 | } | 1408 | } |
1253 | 1409 | ||
1410 | ocfs2_set_target_boundaries(osb, wc, pos, len, | ||
1411 | clusters_to_alloc + extents_to_split); | ||
1412 | |||
1254 | handle = ocfs2_start_trans(osb, credits); | 1413 | handle = ocfs2_start_trans(osb, credits); |
1255 | if (IS_ERR(handle)) { | 1414 | if (IS_ERR(handle)) { |
1256 | ret = PTR_ERR(handle); | 1415 | ret = PTR_ERR(handle); |
1257 | mlog_errno(ret); | 1416 | mlog_errno(ret); |
1258 | goto out_data; | 1417 | goto out; |
1259 | } | 1418 | } |
1260 | 1419 | ||
1261 | written = ocfs2_write(file, phys, handle, di_bh, data_ac, | 1420 | wc->w_handle = handle; |
1262 | meta_ac, &wc); | 1421 | |
1263 | if (written < 0) { | 1422 | /* |
1264 | ret = written; | 1423 | * We don't want this to fail in ocfs2_write_end(), so do it |
1424 | * here. | ||
1425 | */ | ||
1426 | ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, | ||
1427 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1428 | if (ret) { | ||
1265 | mlog_errno(ret); | 1429 | mlog_errno(ret); |
1266 | goto out_commit; | 1430 | goto out_commit; |
1267 | } | 1431 | } |
1268 | 1432 | ||
1269 | ret = ocfs2_journal_access(handle, inode, di_bh, | 1433 | /* |
1270 | OCFS2_JOURNAL_ACCESS_WRITE); | 1434 | * Fill our page array first. That way we've grabbed enough so |
1435 | * that we can zero and flush if we error after adding the | ||
1436 | * extent. | ||
1437 | */ | ||
1438 | ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, | ||
1439 | clusters_to_alloc + extents_to_split, | ||
1440 | mmap_page); | ||
1271 | if (ret) { | 1441 | if (ret) { |
1272 | mlog_errno(ret); | 1442 | mlog_errno(ret); |
1273 | goto out_commit; | 1443 | goto out_commit; |
1274 | } | 1444 | } |
1275 | 1445 | ||
1276 | pos += written; | 1446 | ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, |
1447 | len); | ||
1448 | if (ret) { | ||
1449 | mlog_errno(ret); | ||
1450 | goto out_commit; | ||
1451 | } | ||
1452 | |||
1453 | if (data_ac) | ||
1454 | ocfs2_free_alloc_context(data_ac); | ||
1455 | if (meta_ac) | ||
1456 | ocfs2_free_alloc_context(meta_ac); | ||
1457 | |||
1458 | *pagep = wc->w_target_page; | ||
1459 | *fsdata = wc; | ||
1460 | return 0; | ||
1461 | out_commit: | ||
1462 | ocfs2_commit_trans(osb, handle); | ||
1463 | |||
1464 | out: | ||
1465 | ocfs2_free_write_ctxt(wc); | ||
1466 | |||
1467 | if (data_ac) | ||
1468 | ocfs2_free_alloc_context(data_ac); | ||
1469 | if (meta_ac) | ||
1470 | ocfs2_free_alloc_context(meta_ac); | ||
1471 | return ret; | ||
1472 | } | ||
1473 | |||
1474 | int ocfs2_write_begin(struct file *file, struct address_space *mapping, | ||
1475 | loff_t pos, unsigned len, unsigned flags, | ||
1476 | struct page **pagep, void **fsdata) | ||
1477 | { | ||
1478 | int ret; | ||
1479 | struct buffer_head *di_bh = NULL; | ||
1480 | struct inode *inode = mapping->host; | ||
1481 | |||
1482 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | ||
1483 | if (ret) { | ||
1484 | mlog_errno(ret); | ||
1485 | return ret; | ||
1486 | } | ||
1487 | |||
1488 | /* | ||
1489 | * Take alloc sem here to prevent concurrent lookups. That way | ||
1490 | * the mapping, zeroing and tree manipulation within | ||
1491 | * ocfs2_write() will be safe against ->readpage(). This | ||
1492 | * should also serve to lock out allocation from a shared | ||
1493 | * writeable region. | ||
1494 | */ | ||
1495 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1496 | |||
1497 | ret = ocfs2_data_lock(inode, 1); | ||
1498 | if (ret) { | ||
1499 | mlog_errno(ret); | ||
1500 | goto out_fail; | ||
1501 | } | ||
1502 | |||
1503 | ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, | ||
1504 | fsdata, di_bh, NULL); | ||
1505 | if (ret) { | ||
1506 | mlog_errno(ret); | ||
1507 | goto out_fail_data; | ||
1508 | } | ||
1509 | |||
1510 | brelse(di_bh); | ||
1511 | |||
1512 | return 0; | ||
1513 | |||
1514 | out_fail_data: | ||
1515 | ocfs2_data_unlock(inode, 1); | ||
1516 | out_fail: | ||
1517 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1518 | |||
1519 | brelse(di_bh); | ||
1520 | ocfs2_meta_unlock(inode, 1); | ||
1521 | |||
1522 | return ret; | ||
1523 | } | ||
1524 | |||
1525 | int ocfs2_write_end_nolock(struct address_space *mapping, | ||
1526 | loff_t pos, unsigned len, unsigned copied, | ||
1527 | struct page *page, void *fsdata) | ||
1528 | { | ||
1529 | int i; | ||
1530 | unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); | ||
1531 | struct inode *inode = mapping->host; | ||
1532 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1533 | struct ocfs2_write_ctxt *wc = fsdata; | ||
1534 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; | ||
1535 | handle_t *handle = wc->w_handle; | ||
1536 | struct page *tmppage; | ||
1537 | |||
1538 | if (unlikely(copied < len)) { | ||
1539 | if (!PageUptodate(wc->w_target_page)) | ||
1540 | copied = 0; | ||
1541 | |||
1542 | ocfs2_zero_new_buffers(wc->w_target_page, start+copied, | ||
1543 | start+len); | ||
1544 | } | ||
1545 | flush_dcache_page(wc->w_target_page); | ||
1546 | |||
1547 | for(i = 0; i < wc->w_num_pages; i++) { | ||
1548 | tmppage = wc->w_pages[i]; | ||
1549 | |||
1550 | if (tmppage == wc->w_target_page) { | ||
1551 | from = wc->w_target_from; | ||
1552 | to = wc->w_target_to; | ||
1553 | |||
1554 | BUG_ON(from > PAGE_CACHE_SIZE || | ||
1555 | to > PAGE_CACHE_SIZE || | ||
1556 | to < from); | ||
1557 | } else { | ||
1558 | /* | ||
1559 | * Pages adjacent to the target (if any) imply | ||
1560 | * a hole-filling write in which case we want | ||
1561 | * to flush their entire range. | ||
1562 | */ | ||
1563 | from = 0; | ||
1564 | to = PAGE_CACHE_SIZE; | ||
1565 | } | ||
1566 | |||
1567 | if (ocfs2_should_order_data(inode)) | ||
1568 | walk_page_buffers(wc->w_handle, page_buffers(tmppage), | ||
1569 | from, to, NULL, | ||
1570 | ocfs2_journal_dirty_data); | ||
1571 | |||
1572 | block_commit_write(tmppage, from, to); | ||
1573 | } | ||
1574 | |||
1575 | pos += copied; | ||
1277 | if (pos > inode->i_size) { | 1576 | if (pos > inode->i_size) { |
1278 | i_size_write(inode, pos); | 1577 | i_size_write(inode, pos); |
1279 | mark_inode_dirty(inode); | 1578 | mark_inode_dirty(inode); |
@@ -1283,29 +1582,31 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | |||
1283 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 1582 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
1284 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | 1583 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); |
1285 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | 1584 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); |
1585 | ocfs2_journal_dirty(handle, wc->w_di_bh); | ||
1286 | 1586 | ||
1287 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
1288 | if (ret) | ||
1289 | mlog_errno(ret); | ||
1290 | |||
1291 | out_commit: | ||
1292 | ocfs2_commit_trans(osb, handle); | 1587 | ocfs2_commit_trans(osb, handle); |
1293 | 1588 | ||
1294 | out_data: | 1589 | ocfs2_run_deallocs(osb, &wc->w_dealloc); |
1295 | ocfs2_data_unlock(inode, 1); | 1590 | |
1591 | ocfs2_free_write_ctxt(wc); | ||
1592 | |||
1593 | return copied; | ||
1594 | } | ||
1595 | |||
1596 | int ocfs2_write_end(struct file *file, struct address_space *mapping, | ||
1597 | loff_t pos, unsigned len, unsigned copied, | ||
1598 | struct page *page, void *fsdata) | ||
1599 | { | ||
1600 | int ret; | ||
1601 | struct inode *inode = mapping->host; | ||
1296 | 1602 | ||
1297 | out_meta: | 1603 | ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); |
1604 | |||
1605 | ocfs2_data_unlock(inode, 1); | ||
1298 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | 1606 | up_write(&OCFS2_I(inode)->ip_alloc_sem); |
1299 | ocfs2_meta_unlock(inode, 1); | 1607 | ocfs2_meta_unlock(inode, 1); |
1300 | 1608 | ||
1301 | out: | 1609 | return ret; |
1302 | brelse(di_bh); | ||
1303 | if (data_ac) | ||
1304 | ocfs2_free_alloc_context(data_ac); | ||
1305 | if (meta_ac) | ||
1306 | ocfs2_free_alloc_context(meta_ac); | ||
1307 | |||
1308 | return written ? written : ret; | ||
1309 | } | 1610 | } |
1310 | 1611 | ||
1311 | const struct address_space_operations ocfs2_aops = { | 1612 | const struct address_space_operations ocfs2_aops = { |
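Taken together, the aops.c rework turns a buffered write into a begin/copy/end protocol: ocfs2_write_begin() reserves space, starts the transaction and returns a locked, mapped target page; the caller copies user data into it; ocfs2_write_end() commits buffers, i_size and the handle. A rough sketch of the assumed single-page caller (hypothetical; the real file-write path copies atomically and handles short copies and retries):

/*
 * Hypothetical driver of the new write protocol. Assumes the write
 * fits in one page and ignores the atomic-copy/deadlock subtleties
 * that the real caller has to deal with.
 */
static int example_write_one_page(struct file *file, const char __user *buf,
				  unsigned count, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
	struct page *page;
	void *fsdata;
	unsigned copied;
	char *kaddr;
	int ret;

	ret = ocfs2_write_begin(file, mapping, pos, count, 0, &page, &fsdata);
	if (ret)
		return ret;

	/* The target page comes back locked and mapped to disk blocks. */
	kaddr = kmap(page);
	copied = count - copy_from_user(kaddr + offset, buf, count);
	kunmap(page);

	/* Commits the buffers, i_size update and transaction, and unlocks. */
	return ocfs2_write_end(file, mapping, pos, count, copied, page, fsdata);
}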
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 45821d479b5a..389579bd64e3 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h | |||
@@ -42,57 +42,22 @@ int walk_page_buffers( handle_t *handle, | |||
42 | int (*fn)( handle_t *handle, | 42 | int (*fn)( handle_t *handle, |
43 | struct buffer_head *bh)); | 43 | struct buffer_head *bh)); |
44 | 44 | ||
45 | struct ocfs2_write_ctxt; | 45 | int ocfs2_write_begin(struct file *file, struct address_space *mapping, |
46 | typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, | 46 | loff_t pos, unsigned len, unsigned flags, |
47 | u64 *, unsigned int *, unsigned int *); | 47 | struct page **pagep, void **fsdata); |
48 | 48 | ||
49 | ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | 49 | int ocfs2_write_end(struct file *file, struct address_space *mapping, |
50 | size_t count, ocfs2_page_writer *actor, | 50 | loff_t pos, unsigned len, unsigned copied, |
51 | void *priv); | 51 | struct page *page, void *fsdata); |
52 | 52 | ||
53 | struct ocfs2_write_ctxt { | 53 | int ocfs2_write_end_nolock(struct address_space *mapping, |
54 | size_t w_count; | 54 | loff_t pos, unsigned len, unsigned copied, |
55 | loff_t w_pos; | 55 | struct page *page, void *fsdata); |
56 | u32 w_cpos; | ||
57 | unsigned int w_finished_copy; | ||
58 | 56 | ||
59 | /* This is true if page_size > cluster_size */ | 57 | int ocfs2_write_begin_nolock(struct address_space *mapping, |
60 | unsigned int w_large_pages; | 58 | loff_t pos, unsigned len, unsigned flags, |
61 | 59 | struct page **pagep, void **fsdata, | |
62 | /* Filler callback and private data */ | 60 | struct buffer_head *di_bh, struct page *mmap_page); |
63 | ocfs2_page_writer *w_write_data_page; | ||
64 | void *w_private; | ||
65 | |||
66 | /* Only valid for the filler callback */ | ||
67 | struct page *w_this_page; | ||
68 | unsigned int w_this_page_new; | ||
69 | }; | ||
70 | |||
71 | struct ocfs2_buffered_write_priv { | ||
72 | char *b_src_buf; | ||
73 | const struct iovec *b_cur_iov; /* Current iovec */ | ||
74 | size_t b_cur_off; /* Offset in the | ||
75 | * current iovec */ | ||
76 | }; | ||
77 | int ocfs2_map_and_write_user_data(struct inode *inode, | ||
78 | struct ocfs2_write_ctxt *wc, | ||
79 | u64 *p_blkno, | ||
80 | unsigned int *ret_from, | ||
81 | unsigned int *ret_to); | ||
82 | |||
83 | struct ocfs2_splice_write_priv { | ||
84 | struct splice_desc *s_sd; | ||
85 | struct pipe_buffer *s_buf; | ||
86 | struct pipe_inode_info *s_pipe; | ||
87 | /* Neither offset value is ever larger than one page */ | ||
88 | unsigned int s_offset; | ||
89 | unsigned int s_buf_offset; | ||
90 | }; | ||
91 | int ocfs2_map_and_write_splice_data(struct inode *inode, | ||
92 | struct ocfs2_write_ctxt *wc, | ||
93 | u64 *p_blkno, | ||
94 | unsigned int *ret_from, | ||
95 | unsigned int *ret_to); | ||
96 | 61 | ||
97 | /* all ocfs2_dio_end_io()'s fault */ | 62 | /* all ocfs2_dio_end_io()'s fault */ |
98 | #define ocfs2_iocb_is_rw_locked(iocb) \ | 63 | #define ocfs2_iocb_is_rw_locked(iocb) \ |
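The _nolock variants are exported so that callers which already hold the meta lock and ip_alloc_sem in their own order, notably the shared-writeable mmap fault path, can reuse the same write machinery. A sketch of the assumed page_mkwrite-style caller (hypothetical; the real handler lives in fs/ocfs2/mmap.c and does more validation):

/*
 * Hypothetical page_mkwrite-style user of the nolock variants. The
 * caller is assumed to already hold the meta lock and ip_alloc_sem.
 */
static int example_mkwrite(struct inode *inode, struct buffer_head *di_bh,
			   struct page *page)
{
	int ret;
	loff_t pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
	unsigned len = PAGE_CACHE_SIZE;
	struct page *target;
	void *fsdata;

	/* Passing the faulting page as mmap_page makes write_begin use it. */
	ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len, 0,
				       &target, &fsdata, di_bh, page);
	if (ret)
		return ret;

	/* The data is already in the page; just complete the write. */
	return ocfs2_write_end_nolock(inode->i_mapping, pos, len, len,
				      target, fsdata);
}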
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 979113479c66..2bd7f788cf34 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -1335,6 +1335,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, | |||
1335 | ret = wait_event_interruptible(o2hb_steady_queue, | 1335 | ret = wait_event_interruptible(o2hb_steady_queue, |
1336 | atomic_read(®->hr_steady_iterations) == 0); | 1336 | atomic_read(®->hr_steady_iterations) == 0); |
1337 | if (ret) { | 1337 | if (ret) { |
1338 | /* We got interrupted (hello ptrace!). Clean up */ | ||
1338 | spin_lock(&o2hb_live_lock); | 1339 | spin_lock(&o2hb_live_lock); |
1339 | hb_task = reg->hr_task; | 1340 | hb_task = reg->hr_task; |
1340 | reg->hr_task = NULL; | 1341 | reg->hr_task = NULL; |
@@ -1345,7 +1346,16 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, | |||
1345 | goto out; | 1346 | goto out; |
1346 | } | 1347 | } |
1347 | 1348 | ||
1348 | ret = count; | 1349 | /* Ok, we were woken. Make sure it wasn't by drop_item() */ |
1350 | spin_lock(&o2hb_live_lock); | ||
1351 | hb_task = reg->hr_task; | ||
1352 | spin_unlock(&o2hb_live_lock); | ||
1353 | |||
1354 | if (hb_task) | ||
1355 | ret = count; | ||
1356 | else | ||
1357 | ret = -EIO; | ||
1358 | |||
1349 | out: | 1359 | out: |
1350 | if (filp) | 1360 | if (filp) |
1351 | fput(filp); | 1361 | fput(filp); |
@@ -1523,6 +1533,15 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, | |||
1523 | if (hb_task) | 1533 | if (hb_task) |
1524 | kthread_stop(hb_task); | 1534 | kthread_stop(hb_task); |
1525 | 1535 | ||
1536 | /* | ||
1537 | * If we're racing a dev_write(), we need to wake it up. It will | ||
1538 | * check reg->hr_task. | ||
1539 | */ | ||
1540 | if (atomic_read(®->hr_steady_iterations) != 0) { | ||
1541 | atomic_set(®->hr_steady_iterations, 0); | ||
1542 | wake_up(&o2hb_steady_queue); | ||
1543 | } | ||
1544 | |||
1526 | config_item_put(item); | 1545 | config_item_put(item); |
1527 | } | 1546 | } |
1528 | 1547 | ||
@@ -1665,7 +1684,67 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc, | |||
1665 | } | 1684 | } |
1666 | EXPORT_SYMBOL_GPL(o2hb_setup_callback); | 1685 | EXPORT_SYMBOL_GPL(o2hb_setup_callback); |
1667 | 1686 | ||
1668 | int o2hb_register_callback(struct o2hb_callback_func *hc) | 1687 | static struct o2hb_region *o2hb_find_region(const char *region_uuid) |
1688 | { | ||
1689 | struct o2hb_region *p, *reg = NULL; | ||
1690 | |||
1691 | assert_spin_locked(&o2hb_live_lock); | ||
1692 | |||
1693 | list_for_each_entry(p, &o2hb_all_regions, hr_all_item) { | ||
1694 | if (!strcmp(region_uuid, config_item_name(&p->hr_item))) { | ||
1695 | reg = p; | ||
1696 | break; | ||
1697 | } | ||
1698 | } | ||
1699 | |||
1700 | return reg; | ||
1701 | } | ||
1702 | |||
1703 | static int o2hb_region_get(const char *region_uuid) | ||
1704 | { | ||
1705 | int ret = 0; | ||
1706 | struct o2hb_region *reg; | ||
1707 | |||
1708 | spin_lock(&o2hb_live_lock); | ||
1709 | |||
1710 | reg = o2hb_find_region(region_uuid); | ||
1711 | if (!reg) | ||
1712 | ret = -ENOENT; | ||
1713 | spin_unlock(&o2hb_live_lock); | ||
1714 | |||
1715 | if (ret) | ||
1716 | goto out; | ||
1717 | |||
1718 | ret = o2nm_depend_this_node(); | ||
1719 | if (ret) | ||
1720 | goto out; | ||
1721 | |||
1722 | ret = o2nm_depend_item(®->hr_item); | ||
1723 | if (ret) | ||
1724 | o2nm_undepend_this_node(); | ||
1725 | |||
1726 | out: | ||
1727 | return ret; | ||
1728 | } | ||
1729 | |||
1730 | static void o2hb_region_put(const char *region_uuid) | ||
1731 | { | ||
1732 | struct o2hb_region *reg; | ||
1733 | |||
1734 | spin_lock(&o2hb_live_lock); | ||
1735 | |||
1736 | reg = o2hb_find_region(region_uuid); | ||
1737 | |||
1738 | spin_unlock(&o2hb_live_lock); | ||
1739 | |||
1740 | if (reg) { | ||
1741 | o2nm_undepend_item(®->hr_item); | ||
1742 | o2nm_undepend_this_node(); | ||
1743 | } | ||
1744 | } | ||
1745 | |||
1746 | int o2hb_register_callback(const char *region_uuid, | ||
1747 | struct o2hb_callback_func *hc) | ||
1669 | { | 1748 | { |
1670 | struct o2hb_callback_func *tmp; | 1749 | struct o2hb_callback_func *tmp; |
1671 | struct list_head *iter; | 1750 | struct list_head *iter; |
@@ -1681,6 +1760,12 @@ int o2hb_register_callback(struct o2hb_callback_func *hc) | |||
1681 | goto out; | 1760 | goto out; |
1682 | } | 1761 | } |
1683 | 1762 | ||
1763 | if (region_uuid) { | ||
1764 | ret = o2hb_region_get(region_uuid); | ||
1765 | if (ret) | ||
1766 | goto out; | ||
1767 | } | ||
1768 | |||
1684 | down_write(&o2hb_callback_sem); | 1769 | down_write(&o2hb_callback_sem); |
1685 | 1770 | ||
1686 | list_for_each(iter, &hbcall->list) { | 1771 | list_for_each(iter, &hbcall->list) { |
@@ -1702,16 +1787,21 @@ out: | |||
1702 | } | 1787 | } |
1703 | EXPORT_SYMBOL_GPL(o2hb_register_callback); | 1788 | EXPORT_SYMBOL_GPL(o2hb_register_callback); |
1704 | 1789 | ||
1705 | void o2hb_unregister_callback(struct o2hb_callback_func *hc) | 1790 | void o2hb_unregister_callback(const char *region_uuid, |
1791 | struct o2hb_callback_func *hc) | ||
1706 | { | 1792 | { |
1707 | BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); | 1793 | BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); |
1708 | 1794 | ||
1709 | mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", | 1795 | mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", |
1710 | __builtin_return_address(0), hc); | 1796 | __builtin_return_address(0), hc); |
1711 | 1797 | ||
1798 | /* XXX Can this happen _with_ a region reference? */ | ||
1712 | if (list_empty(&hc->hc_item)) | 1799 | if (list_empty(&hc->hc_item)) |
1713 | return; | 1800 | return; |
1714 | 1801 | ||
1802 | if (region_uuid) | ||
1803 | o2hb_region_put(region_uuid); | ||
1804 | |||
1715 | down_write(&o2hb_callback_sem); | 1805 | down_write(&o2hb_callback_sem); |
1716 | 1806 | ||
1717 | list_del_init(&hc->hc_item); | 1807 | list_del_init(&hc->hc_item); |
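With the new signatures, a heartbeat consumer names the region it depends on at registration time; o2hb_register_callback() then pins both the region item and the local node until the matching unregister. A sketch of the assumed calling pattern (hypothetical consumer code):

/* Hypothetical consumer: hold the region dependency while registered. */
static void example_node_down(struct o2nm_node *node, int node_num,
			      void *data)
{
	/* React to the dead node here. */
}

static struct o2hb_callback_func example_hc;

static int example_attach(const char *region_uuid)
{
	o2hb_setup_callback(&example_hc, O2HB_NODE_DOWN_CB,
			    example_node_down, NULL, 0);
	/* Takes a dependency on the region and on the local node. */
	return o2hb_register_callback(region_uuid, &example_hc);
}

static void example_detach(const char *region_uuid)
{
	/* Drops the dependencies taken at registration time. */
	o2hb_unregister_callback(region_uuid, &example_hc);
}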
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index cc6d40b39771..35397dd5ecdb 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h | |||
@@ -69,8 +69,10 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc, | |||
69 | o2hb_cb_func *func, | 69 | o2hb_cb_func *func, |
70 | void *data, | 70 | void *data, |
71 | int priority); | 71 | int priority); |
72 | int o2hb_register_callback(struct o2hb_callback_func *hc); | 72 | int o2hb_register_callback(const char *region_uuid, |
73 | void o2hb_unregister_callback(struct o2hb_callback_func *hc); | 73 | struct o2hb_callback_func *hc); |
74 | void o2hb_unregister_callback(const char *region_uuid, | ||
75 | struct o2hb_callback_func *hc); | ||
74 | void o2hb_fill_node_map(unsigned long *map, | 76 | void o2hb_fill_node_map(unsigned long *map, |
75 | unsigned bytes); | 77 | unsigned bytes); |
76 | void o2hb_init(void); | 78 | void o2hb_init(void); |
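The heartbeat callback API above now takes a region UUID. A minimal sketch of the new calling convention, assuming a hypothetical client (MY_HB_PRI, my_node_down() and my_attach() are invented; the o2hb_* calls and signatures come from the hunks above). A non-NULL UUID makes heartbeat pin the region and the local node through the new o2hb_region_get()/o2hb_region_put() pair, while NULL keeps the old global behaviour used by the o2net and dlm callers converted below:

    #define MY_HB_PRI       0x1     /* invented priority value */

    static struct o2hb_callback_func my_hb_down;

    static void my_node_down(struct o2nm_node *node, int node_num, void *data)
    {
            /* a node stopped heartbeating in the region we watch */
    }

    static int my_attach(const char *region_uuid)
    {
            int ret;

            o2hb_setup_callback(&my_hb_down, O2HB_NODE_DOWN_CB,
                                my_node_down, NULL, MY_HB_PRI);

            /* non-NULL uuid pins the region and the local node; fails
             * with -ENOENT if no region by that name is heartbeating */
            ret = o2hb_register_callback(region_uuid, &my_hb_down);
            if (ret)
                    return ret;

            /* ... use the cluster ... */

            o2hb_unregister_callback(region_uuid, &my_hb_down);
            return 0;
    }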
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 9f5ad0f01ce0..af2070da308b 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c | |||
@@ -900,6 +900,46 @@ static struct o2nm_cluster_group o2nm_cluster_group = { | |||
900 | }, | 900 | }, |
901 | }; | 901 | }; |
902 | 902 | ||
903 | int o2nm_depend_item(struct config_item *item) | ||
904 | { | ||
905 | return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item); | ||
906 | } | ||
907 | |||
908 | void o2nm_undepend_item(struct config_item *item) | ||
909 | { | ||
910 | configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item); | ||
911 | } | ||
912 | |||
913 | int o2nm_depend_this_node(void) | ||
914 | { | ||
915 | int ret = 0; | ||
916 | struct o2nm_node *local_node; | ||
917 | |||
918 | local_node = o2nm_get_node_by_num(o2nm_this_node()); | ||
919 | if (!local_node) { | ||
920 | ret = -EINVAL; | ||
921 | goto out; | ||
922 | } | ||
923 | |||
924 | ret = o2nm_depend_item(&local_node->nd_item); | ||
925 | o2nm_node_put(local_node); | ||
926 | |||
927 | out: | ||
928 | return ret; | ||
929 | } | ||
930 | |||
931 | void o2nm_undepend_this_node(void) | ||
932 | { | ||
933 | struct o2nm_node *local_node; | ||
934 | |||
935 | local_node = o2nm_get_node_by_num(o2nm_this_node()); | ||
936 | BUG_ON(!local_node); | ||
937 | |||
938 | o2nm_undepend_item(&local_node->nd_item); | ||
939 | o2nm_node_put(local_node); | ||
940 | } | ||
941 | |||
942 | |||
903 | static void __exit exit_o2nm(void) | 943 | static void __exit exit_o2nm(void) |
904 | { | 944 | { |
905 | if (ocfs2_table_header) | 945 | if (ocfs2_table_header) |
@@ -934,7 +974,7 @@ static int __init init_o2nm(void) | |||
934 | goto out_sysctl; | 974 | goto out_sysctl; |
935 | 975 | ||
936 | config_group_init(&o2nm_cluster_group.cs_subsys.su_group); | 976 | config_group_init(&o2nm_cluster_group.cs_subsys.su_group); |
937 | init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem); | 977 | mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex); |
938 | ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys); | 978 | ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys); |
939 | if (ret) { | 979 | if (ret) { |
940 | printk(KERN_ERR "nodemanager: Registration returned %d\n", ret); | 980 | printk(KERN_ERR "nodemanager: Registration returned %d\n", ret); |
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index 070522138ae2..7c860361b8dd 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h | |||
@@ -77,4 +77,9 @@ struct o2nm_node *o2nm_get_node_by_ip(__be32 addr); | |||
77 | void o2nm_node_get(struct o2nm_node *node); | 77 | void o2nm_node_get(struct o2nm_node *node); |
78 | void o2nm_node_put(struct o2nm_node *node); | 78 | void o2nm_node_put(struct o2nm_node *node); |
79 | 79 | ||
80 | int o2nm_depend_item(struct config_item *item); | ||
81 | void o2nm_undepend_item(struct config_item *item); | ||
82 | int o2nm_depend_this_node(void); | ||
83 | void o2nm_undepend_this_node(void); | ||
84 | |||
80 | #endif /* O2CLUSTER_NODEMANAGER_H */ | 85 | #endif /* O2CLUSTER_NODEMANAGER_H */ |
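These helpers let heartbeat pin configfs-backed objects while callbacks reference them. As the nodemanager.c hunk above shows, o2nm_depend_item() simply forwards to configfs_depend_item(); while the dependency is held, a userspace rmdir of the item's directory fails with -EBUSY. A sketch of the expected pairing, with invented function names:

    static int start_using_region(struct o2hb_region *reg)
    {
            /* pin the configfs item backing this region; userspace
             * rmdir of the region directory now returns -EBUSY */
            return o2nm_depend_item(&reg->hr_item);
    }

    static void stop_using_region(struct o2hb_region *reg)
    {
            /* drop the pin; the region can be torn down again */
            o2nm_undepend_item(&reg->hr_item);
    }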
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 0b229a9c7952..f0bdfd944c44 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
@@ -261,14 +261,12 @@ out: | |||
261 | 261 | ||
262 | static void o2net_complete_nodes_nsw(struct o2net_node *nn) | 262 | static void o2net_complete_nodes_nsw(struct o2net_node *nn) |
263 | { | 263 | { |
264 | struct list_head *iter, *tmp; | 264 | struct o2net_status_wait *nsw, *tmp; |
265 | unsigned int num_kills = 0; | 265 | unsigned int num_kills = 0; |
266 | struct o2net_status_wait *nsw; | ||
267 | 266 | ||
268 | assert_spin_locked(&nn->nn_lock); | 267 | assert_spin_locked(&nn->nn_lock); |
269 | 268 | ||
270 | list_for_each_safe(iter, tmp, &nn->nn_status_list) { | 269 | list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) { |
271 | nsw = list_entry(iter, struct o2net_status_wait, ns_node_item); | ||
272 | o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0); | 270 | o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0); |
273 | num_kills++; | 271 | num_kills++; |
274 | } | 272 | } |
@@ -764,13 +762,10 @@ EXPORT_SYMBOL_GPL(o2net_register_handler); | |||
764 | 762 | ||
765 | void o2net_unregister_handler_list(struct list_head *list) | 763 | void o2net_unregister_handler_list(struct list_head *list) |
766 | { | 764 | { |
767 | struct list_head *pos, *n; | 765 | struct o2net_msg_handler *nmh, *n; |
768 | struct o2net_msg_handler *nmh; | ||
769 | 766 | ||
770 | write_lock(&o2net_handler_lock); | 767 | write_lock(&o2net_handler_lock); |
771 | list_for_each_safe(pos, n, list) { | 768 | list_for_each_entry_safe(nmh, n, list, nh_unregister_item) { |
772 | nmh = list_entry(pos, struct o2net_msg_handler, | ||
773 | nh_unregister_item); | ||
774 | mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", | 769 | mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", |
775 | nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); | 770 | nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); |
776 | rb_erase(&nmh->nh_node, &o2net_handler_tree); | 771 | rb_erase(&nmh->nh_node, &o2net_handler_tree); |
@@ -1638,8 +1633,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, | |||
1638 | 1633 | ||
1639 | void o2net_unregister_hb_callbacks(void) | 1634 | void o2net_unregister_hb_callbacks(void) |
1640 | { | 1635 | { |
1641 | o2hb_unregister_callback(&o2net_hb_up); | 1636 | o2hb_unregister_callback(NULL, &o2net_hb_up); |
1642 | o2hb_unregister_callback(&o2net_hb_down); | 1637 | o2hb_unregister_callback(NULL, &o2net_hb_down); |
1643 | } | 1638 | } |
1644 | 1639 | ||
1645 | int o2net_register_hb_callbacks(void) | 1640 | int o2net_register_hb_callbacks(void) |
@@ -1651,9 +1646,9 @@ int o2net_register_hb_callbacks(void) | |||
1651 | o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB, | 1646 | o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB, |
1652 | o2net_hb_node_up_cb, NULL, O2NET_HB_PRI); | 1647 | o2net_hb_node_up_cb, NULL, O2NET_HB_PRI); |
1653 | 1648 | ||
1654 | ret = o2hb_register_callback(&o2net_hb_up); | 1649 | ret = o2hb_register_callback(NULL, &o2net_hb_up); |
1655 | if (ret == 0) | 1650 | if (ret == 0) |
1656 | ret = o2hb_register_callback(&o2net_hb_down); | 1651 | ret = o2hb_register_callback(NULL, &o2net_hb_down); |
1657 | 1652 | ||
1658 | if (ret) | 1653 | if (ret) |
1659 | o2net_unregister_hb_callbacks(); | 1654 | o2net_unregister_hb_callbacks(); |
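The two loop conversions in this file (and the many like them in the dlm files below) are mechanical: list_for_each{,_safe}() plus an explicit list_entry() collapses into list_for_each_entry{,_safe}(). A generic before-and-after with invented types, just to show the shape of the rewrite:

    #include <linux/list.h>
    #include <linux/slab.h>

    struct item {
            struct list_head link;
    };

    /* old style: iterate raw list_heads, then recover the container */
    static void drain_old(struct list_head *head)
    {
            struct list_head *pos, *n;
            struct item *it;

            list_for_each_safe(pos, n, head) {
                    it = list_entry(pos, struct item, link);
                    list_del(&it->link);
                    kfree(it);
            }
    }

    /* new style: the _entry_ iterator hides the list_entry() cast;
     * the _safe form caches the successor so the current entry can
     * be deleted or freed inside the loop body */
    static void drain_new(struct list_head *head)
    {
            struct item *it, *next;

            list_for_each_entry_safe(it, next, head, link) {
                    list_del(&it->link);
                    kfree(it);
            }
    }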
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index c441ef1f2bad..0d5fdde959c8 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
@@ -368,7 +368,7 @@ int ocfs2_do_extend_dir(struct super_block *sb, | |||
368 | u32 offset = OCFS2_I(dir)->ip_clusters; | 368 | u32 offset = OCFS2_I(dir)->ip_clusters; |
369 | 369 | ||
370 | status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, | 370 | status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, |
371 | 1, parent_fe_bh, handle, | 371 | 1, 0, parent_fe_bh, handle, |
372 | data_ac, meta_ac, NULL); | 372 | data_ac, meta_ac, NULL); |
373 | BUG_ON(status == -EAGAIN); | 373 | BUG_ON(status == -EAGAIN); |
374 | if (status < 0) { | 374 | if (status < 0) { |
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index d836b98dd99a..6954565b8ccb 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -1128,8 +1128,8 @@ bail: | |||
1128 | 1128 | ||
1129 | static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) | 1129 | static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) |
1130 | { | 1130 | { |
1131 | o2hb_unregister_callback(&dlm->dlm_hb_up); | 1131 | o2hb_unregister_callback(NULL, &dlm->dlm_hb_up); |
1132 | o2hb_unregister_callback(&dlm->dlm_hb_down); | 1132 | o2hb_unregister_callback(NULL, &dlm->dlm_hb_down); |
1133 | o2net_unregister_handler_list(&dlm->dlm_domain_handlers); | 1133 | o2net_unregister_handler_list(&dlm->dlm_domain_handlers); |
1134 | } | 1134 | } |
1135 | 1135 | ||
@@ -1141,13 +1141,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) | |||
1141 | 1141 | ||
1142 | o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, | 1142 | o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, |
1143 | dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); | 1143 | dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); |
1144 | status = o2hb_register_callback(&dlm->dlm_hb_down); | 1144 | status = o2hb_register_callback(NULL, &dlm->dlm_hb_down); |
1145 | if (status) | 1145 | if (status) |
1146 | goto bail; | 1146 | goto bail; |
1147 | 1147 | ||
1148 | o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, | 1148 | o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, |
1149 | dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); | 1149 | dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); |
1150 | status = o2hb_register_callback(&dlm->dlm_hb_up); | 1150 | status = o2hb_register_callback(NULL, &dlm->dlm_hb_up); |
1151 | if (status) | 1151 | if (status) |
1152 | goto bail; | 1152 | goto bail; |
1153 | 1153 | ||
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 6edffca99d98..65b2b9b92688 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c | |||
@@ -192,25 +192,20 @@ static void dlm_print_one_mle(struct dlm_master_list_entry *mle) | |||
192 | static void dlm_dump_mles(struct dlm_ctxt *dlm) | 192 | static void dlm_dump_mles(struct dlm_ctxt *dlm) |
193 | { | 193 | { |
194 | struct dlm_master_list_entry *mle; | 194 | struct dlm_master_list_entry *mle; |
195 | struct list_head *iter; | ||
196 | 195 | ||
197 | mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); | 196 | mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); |
198 | spin_lock(&dlm->master_lock); | 197 | spin_lock(&dlm->master_lock); |
199 | list_for_each(iter, &dlm->master_list) { | 198 | list_for_each_entry(mle, &dlm->master_list, list) |
200 | mle = list_entry(iter, struct dlm_master_list_entry, list); | ||
201 | dlm_print_one_mle(mle); | 199 | dlm_print_one_mle(mle); |
202 | } | ||
203 | spin_unlock(&dlm->master_lock); | 200 | spin_unlock(&dlm->master_lock); |
204 | } | 201 | } |
205 | 202 | ||
206 | int dlm_dump_all_mles(const char __user *data, unsigned int len) | 203 | int dlm_dump_all_mles(const char __user *data, unsigned int len) |
207 | { | 204 | { |
208 | struct list_head *iter; | ||
209 | struct dlm_ctxt *dlm; | 205 | struct dlm_ctxt *dlm; |
210 | 206 | ||
211 | spin_lock(&dlm_domain_lock); | 207 | spin_lock(&dlm_domain_lock); |
212 | list_for_each(iter, &dlm_domains) { | 208 | list_for_each_entry(dlm, &dlm_domains, list) { |
213 | dlm = list_entry (iter, struct dlm_ctxt, list); | ||
214 | mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name); | 209 | mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name); |
215 | dlm_dump_mles(dlm); | 210 | dlm_dump_mles(dlm); |
216 | } | 211 | } |
@@ -454,12 +449,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm, | |||
454 | char *name, unsigned int namelen) | 449 | char *name, unsigned int namelen) |
455 | { | 450 | { |
456 | struct dlm_master_list_entry *tmpmle; | 451 | struct dlm_master_list_entry *tmpmle; |
457 | struct list_head *iter; | ||
458 | 452 | ||
459 | assert_spin_locked(&dlm->master_lock); | 453 | assert_spin_locked(&dlm->master_lock); |
460 | 454 | ||
461 | list_for_each(iter, &dlm->master_list) { | 455 | list_for_each_entry(tmpmle, &dlm->master_list, list) { |
462 | tmpmle = list_entry(iter, struct dlm_master_list_entry, list); | ||
463 | if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) | 456 | if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) |
464 | continue; | 457 | continue; |
465 | dlm_get_mle(tmpmle); | 458 | dlm_get_mle(tmpmle); |
@@ -472,13 +465,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm, | |||
472 | void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) | 465 | void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) |
473 | { | 466 | { |
474 | struct dlm_master_list_entry *mle; | 467 | struct dlm_master_list_entry *mle; |
475 | struct list_head *iter; | ||
476 | 468 | ||
477 | assert_spin_locked(&dlm->spinlock); | 469 | assert_spin_locked(&dlm->spinlock); |
478 | 470 | ||
479 | list_for_each(iter, &dlm->mle_hb_events) { | 471 | list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) { |
480 | mle = list_entry(iter, struct dlm_master_list_entry, | ||
481 | hb_events); | ||
482 | if (node_up) | 472 | if (node_up) |
483 | dlm_mle_node_up(dlm, mle, NULL, idx); | 473 | dlm_mle_node_up(dlm, mle, NULL, idx); |
484 | else | 474 | else |
@@ -2434,7 +2424,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, | |||
2434 | int ret; | 2424 | int ret; |
2435 | int i; | 2425 | int i; |
2436 | int count = 0; | 2426 | int count = 0; |
2437 | struct list_head *queue, *iter; | 2427 | struct list_head *queue; |
2438 | struct dlm_lock *lock; | 2428 | struct dlm_lock *lock; |
2439 | 2429 | ||
2440 | assert_spin_locked(&res->spinlock); | 2430 | assert_spin_locked(&res->spinlock); |
@@ -2453,8 +2443,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, | |||
2453 | ret = 0; | 2443 | ret = 0; |
2454 | queue = &res->granted; | 2444 | queue = &res->granted; |
2455 | for (i = 0; i < 3; i++) { | 2445 | for (i = 0; i < 3; i++) { |
2456 | list_for_each(iter, queue) { | 2446 | list_for_each_entry(lock, queue, list) { |
2457 | lock = list_entry(iter, struct dlm_lock, list); | ||
2458 | ++count; | 2447 | ++count; |
2459 | if (lock->ml.node == dlm->node_num) { | 2448 | if (lock->ml.node == dlm->node_num) { |
2460 | mlog(0, "found a lock owned by this node still " | 2449 | mlog(0, "found a lock owned by this node still " |
@@ -2923,18 +2912,16 @@ again: | |||
2923 | static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, | 2912 | static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, |
2924 | struct dlm_lock_resource *res) | 2913 | struct dlm_lock_resource *res) |
2925 | { | 2914 | { |
2926 | struct list_head *iter, *iter2; | ||
2927 | struct list_head *queue = &res->granted; | 2915 | struct list_head *queue = &res->granted; |
2928 | int i, bit; | 2916 | int i, bit; |
2929 | struct dlm_lock *lock; | 2917 | struct dlm_lock *lock, *next; |
2930 | 2918 | ||
2931 | assert_spin_locked(&res->spinlock); | 2919 | assert_spin_locked(&res->spinlock); |
2932 | 2920 | ||
2933 | BUG_ON(res->owner == dlm->node_num); | 2921 | BUG_ON(res->owner == dlm->node_num); |
2934 | 2922 | ||
2935 | for (i=0; i<3; i++) { | 2923 | for (i=0; i<3; i++) { |
2936 | list_for_each_safe(iter, iter2, queue) { | 2924 | list_for_each_entry_safe(lock, next, queue, list) { |
2937 | lock = list_entry (iter, struct dlm_lock, list); | ||
2938 | if (lock->ml.node != dlm->node_num) { | 2925 | if (lock->ml.node != dlm->node_num) { |
2939 | mlog(0, "putting lock for node %u\n", | 2926 | mlog(0, "putting lock for node %u\n", |
2940 | lock->ml.node); | 2927 | lock->ml.node); |
@@ -2976,7 +2963,6 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, | |||
2976 | { | 2963 | { |
2977 | int i; | 2964 | int i; |
2978 | struct list_head *queue = &res->granted; | 2965 | struct list_head *queue = &res->granted; |
2979 | struct list_head *iter; | ||
2980 | struct dlm_lock *lock; | 2966 | struct dlm_lock *lock; |
2981 | int nodenum; | 2967 | int nodenum; |
2982 | 2968 | ||
@@ -2984,10 +2970,9 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, | |||
2984 | 2970 | ||
2985 | spin_lock(&res->spinlock); | 2971 | spin_lock(&res->spinlock); |
2986 | for (i=0; i<3; i++) { | 2972 | for (i=0; i<3; i++) { |
2987 | list_for_each(iter, queue) { | 2973 | list_for_each_entry(lock, queue, list) { |
2988 | /* up to the caller to make sure this node | 2974 | /* up to the caller to make sure this node |
2989 | * is alive */ | 2975 | * is alive */ |
2990 | lock = list_entry (iter, struct dlm_lock, list); | ||
2991 | if (lock->ml.node != dlm->node_num) { | 2976 | if (lock->ml.node != dlm->node_num) { |
2992 | spin_unlock(&res->spinlock); | 2977 | spin_unlock(&res->spinlock); |
2993 | return lock->ml.node; | 2978 | return lock->ml.node; |
@@ -3234,8 +3219,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, | |||
3234 | 3219 | ||
3235 | void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) | 3220 | void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) |
3236 | { | 3221 | { |
3237 | struct list_head *iter, *iter2; | 3222 | struct dlm_master_list_entry *mle, *next; |
3238 | struct dlm_master_list_entry *mle; | ||
3239 | struct dlm_lock_resource *res; | 3223 | struct dlm_lock_resource *res; |
3240 | unsigned int hash; | 3224 | unsigned int hash; |
3241 | 3225 | ||
@@ -3245,9 +3229,7 @@ top: | |||
3245 | 3229 | ||
3246 | /* clean the master list */ | 3230 | /* clean the master list */ |
3247 | spin_lock(&dlm->master_lock); | 3231 | spin_lock(&dlm->master_lock); |
3248 | list_for_each_safe(iter, iter2, &dlm->master_list) { | 3232 | list_for_each_entry_safe(mle, next, &dlm->master_list, list) { |
3249 | mle = list_entry(iter, struct dlm_master_list_entry, list); | ||
3250 | |||
3251 | BUG_ON(mle->type != DLM_MLE_BLOCK && | 3233 | BUG_ON(mle->type != DLM_MLE_BLOCK && |
3252 | mle->type != DLM_MLE_MASTER && | 3234 | mle->type != DLM_MLE_MASTER && |
3253 | mle->type != DLM_MLE_MIGRATION); | 3235 | mle->type != DLM_MLE_MIGRATION); |
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 671c4ed58ee2..a2c33160bfd6 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -158,8 +158,7 @@ void dlm_dispatch_work(struct work_struct *work) | |||
158 | struct dlm_ctxt *dlm = | 158 | struct dlm_ctxt *dlm = |
159 | container_of(work, struct dlm_ctxt, dispatched_work); | 159 | container_of(work, struct dlm_ctxt, dispatched_work); |
160 | LIST_HEAD(tmp_list); | 160 | LIST_HEAD(tmp_list); |
161 | struct list_head *iter, *iter2; | 161 | struct dlm_work_item *item, *next; |
162 | struct dlm_work_item *item; | ||
163 | dlm_workfunc_t *workfunc; | 162 | dlm_workfunc_t *workfunc; |
164 | int tot=0; | 163 | int tot=0; |
165 | 164 | ||
@@ -167,13 +166,12 @@ void dlm_dispatch_work(struct work_struct *work) | |||
167 | list_splice_init(&dlm->work_list, &tmp_list); | 166 | list_splice_init(&dlm->work_list, &tmp_list); |
168 | spin_unlock(&dlm->work_lock); | 167 | spin_unlock(&dlm->work_lock); |
169 | 168 | ||
170 | list_for_each_safe(iter, iter2, &tmp_list) { | 169 | list_for_each_entry(item, &tmp_list, list) { |
171 | tot++; | 170 | tot++; |
172 | } | 171 | } |
173 | mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); | 172 | mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); |
174 | 173 | ||
175 | list_for_each_safe(iter, iter2, &tmp_list) { | 174 | list_for_each_entry_safe(item, next, &tmp_list, list) { |
176 | item = list_entry(iter, struct dlm_work_item, list); | ||
177 | workfunc = item->func; | 175 | workfunc = item->func; |
178 | list_del_init(&item->list); | 176 | list_del_init(&item->list); |
179 | 177 | ||
@@ -549,7 +547,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
549 | { | 547 | { |
550 | int status = 0; | 548 | int status = 0; |
551 | struct dlm_reco_node_data *ndata; | 549 | struct dlm_reco_node_data *ndata; |
552 | struct list_head *iter; | ||
553 | int all_nodes_done; | 550 | int all_nodes_done; |
554 | int destroy = 0; | 551 | int destroy = 0; |
555 | int pass = 0; | 552 | int pass = 0; |
@@ -567,8 +564,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
567 | 564 | ||
568 | /* safe to access the node data list without a lock, since this | 565 | /* safe to access the node data list without a lock, since this |
569 | * process is the only one to change the list */ | 566 | * process is the only one to change the list */ |
570 | list_for_each(iter, &dlm->reco.node_data) { | 567 | list_for_each_entry(ndata, &dlm->reco.node_data, list) { |
571 | ndata = list_entry (iter, struct dlm_reco_node_data, list); | ||
572 | BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); | 568 | BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); |
573 | ndata->state = DLM_RECO_NODE_DATA_REQUESTING; | 569 | ndata->state = DLM_RECO_NODE_DATA_REQUESTING; |
574 | 570 | ||
@@ -655,9 +651,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
655 | * done, or if anyone died */ | 651 | * done, or if anyone died */ |
656 | all_nodes_done = 1; | 652 | all_nodes_done = 1; |
657 | spin_lock(&dlm_reco_state_lock); | 653 | spin_lock(&dlm_reco_state_lock); |
658 | list_for_each(iter, &dlm->reco.node_data) { | 654 | list_for_each_entry(ndata, &dlm->reco.node_data, list) { |
659 | ndata = list_entry (iter, struct dlm_reco_node_data, list); | ||
660 | |||
661 | mlog(0, "checking recovery state of node %u\n", | 655 | mlog(0, "checking recovery state of node %u\n", |
662 | ndata->node_num); | 656 | ndata->node_num); |
663 | switch (ndata->state) { | 657 | switch (ndata->state) { |
@@ -774,16 +768,14 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) | |||
774 | 768 | ||
775 | static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) | 769 | static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) |
776 | { | 770 | { |
777 | struct list_head *iter, *iter2; | 771 | struct dlm_reco_node_data *ndata, *next; |
778 | struct dlm_reco_node_data *ndata; | ||
779 | LIST_HEAD(tmplist); | 772 | LIST_HEAD(tmplist); |
780 | 773 | ||
781 | spin_lock(&dlm_reco_state_lock); | 774 | spin_lock(&dlm_reco_state_lock); |
782 | list_splice_init(&dlm->reco.node_data, &tmplist); | 775 | list_splice_init(&dlm->reco.node_data, &tmplist); |
783 | spin_unlock(&dlm_reco_state_lock); | 776 | spin_unlock(&dlm_reco_state_lock); |
784 | 777 | ||
785 | list_for_each_safe(iter, iter2, &tmplist) { | 778 | list_for_each_entry_safe(ndata, next, &tmplist, list) { |
786 | ndata = list_entry (iter, struct dlm_reco_node_data, list); | ||
787 | list_del_init(&ndata->list); | 779 | list_del_init(&ndata->list); |
788 | kfree(ndata); | 780 | kfree(ndata); |
789 | } | 781 | } |
@@ -876,7 +868,6 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | |||
876 | struct dlm_lock_resource *res; | 868 | struct dlm_lock_resource *res; |
877 | struct dlm_ctxt *dlm; | 869 | struct dlm_ctxt *dlm; |
878 | LIST_HEAD(resources); | 870 | LIST_HEAD(resources); |
879 | struct list_head *iter; | ||
880 | int ret; | 871 | int ret; |
881 | u8 dead_node, reco_master; | 872 | u8 dead_node, reco_master; |
882 | int skip_all_done = 0; | 873 | int skip_all_done = 0; |
@@ -920,8 +911,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | |||
920 | 911 | ||
921 | /* any errors returned will be due to the new_master dying, | 912 | /* any errors returned will be due to the new_master dying, |
922 | * the dlm_reco_thread should detect this */ | 913 | * the dlm_reco_thread should detect this */ |
923 | list_for_each(iter, &resources) { | 914 | list_for_each_entry(res, &resources, recovering) { |
924 | res = list_entry (iter, struct dlm_lock_resource, recovering); | ||
925 | ret = dlm_send_one_lockres(dlm, res, mres, reco_master, | 915 | ret = dlm_send_one_lockres(dlm, res, mres, reco_master, |
926 | DLM_MRES_RECOVERY); | 916 | DLM_MRES_RECOVERY); |
927 | if (ret < 0) { | 917 | if (ret < 0) { |
@@ -983,7 +973,6 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, | |||
983 | { | 973 | { |
984 | struct dlm_ctxt *dlm = data; | 974 | struct dlm_ctxt *dlm = data; |
985 | struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; | 975 | struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; |
986 | struct list_head *iter; | ||
987 | struct dlm_reco_node_data *ndata = NULL; | 976 | struct dlm_reco_node_data *ndata = NULL; |
988 | int ret = -EINVAL; | 977 | int ret = -EINVAL; |
989 | 978 | ||
@@ -1000,8 +989,7 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data, | |||
1000 | dlm->reco.dead_node, done->node_idx, dlm->node_num); | 989 | dlm->reco.dead_node, done->node_idx, dlm->node_num); |
1001 | 990 | ||
1002 | spin_lock(&dlm_reco_state_lock); | 991 | spin_lock(&dlm_reco_state_lock); |
1003 | list_for_each(iter, &dlm->reco.node_data) { | 992 | list_for_each_entry(ndata, &dlm->reco.node_data, list) { |
1004 | ndata = list_entry (iter, struct dlm_reco_node_data, list); | ||
1005 | if (ndata->node_num != done->node_idx) | 993 | if (ndata->node_num != done->node_idx) |
1006 | continue; | 994 | continue; |
1007 | 995 | ||
@@ -1049,13 +1037,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, | |||
1049 | struct list_head *list, | 1037 | struct list_head *list, |
1050 | u8 dead_node) | 1038 | u8 dead_node) |
1051 | { | 1039 | { |
1052 | struct dlm_lock_resource *res; | 1040 | struct dlm_lock_resource *res, *next; |
1053 | struct list_head *iter, *iter2; | ||
1054 | struct dlm_lock *lock; | 1041 | struct dlm_lock *lock; |
1055 | 1042 | ||
1056 | spin_lock(&dlm->spinlock); | 1043 | spin_lock(&dlm->spinlock); |
1057 | list_for_each_safe(iter, iter2, &dlm->reco.resources) { | 1044 | list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { |
1058 | res = list_entry (iter, struct dlm_lock_resource, recovering); | ||
1059 | /* always prune any $RECOVERY entries for dead nodes, | 1045 | /* always prune any $RECOVERY entries for dead nodes, |
1060 | * otherwise hangs can occur during later recovery */ | 1046 | * otherwise hangs can occur during later recovery */ |
1061 | if (dlm_is_recovery_lock(res->lockname.name, | 1047 | if (dlm_is_recovery_lock(res->lockname.name, |
@@ -1169,7 +1155,7 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, | |||
1169 | u8 flags, u8 master) | 1155 | u8 flags, u8 master) |
1170 | { | 1156 | { |
1171 | /* mres here is one full page */ | 1157 | /* mres here is one full page */ |
1172 | memset(mres, 0, PAGE_SIZE); | 1158 | clear_page(mres); |
1173 | mres->lockname_len = namelen; | 1159 | mres->lockname_len = namelen; |
1174 | memcpy(mres->lockname, lockname, namelen); | 1160 | memcpy(mres->lockname, lockname, namelen); |
1175 | mres->num_locks = 0; | 1161 | mres->num_locks = 0; |
@@ -1252,7 +1238,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | |||
1252 | struct dlm_migratable_lockres *mres, | 1238 | struct dlm_migratable_lockres *mres, |
1253 | u8 send_to, u8 flags) | 1239 | u8 send_to, u8 flags) |
1254 | { | 1240 | { |
1255 | struct list_head *queue, *iter; | 1241 | struct list_head *queue; |
1256 | int total_locks, i; | 1242 | int total_locks, i; |
1257 | u64 mig_cookie = 0; | 1243 | u64 mig_cookie = 0; |
1258 | struct dlm_lock *lock; | 1244 | struct dlm_lock *lock; |
@@ -1278,9 +1264,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | |||
1278 | total_locks = 0; | 1264 | total_locks = 0; |
1279 | for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { | 1265 | for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { |
1280 | queue = dlm_list_idx_to_ptr(res, i); | 1266 | queue = dlm_list_idx_to_ptr(res, i); |
1281 | list_for_each(iter, queue) { | 1267 | list_for_each_entry(lock, queue, list) { |
1282 | lock = list_entry (iter, struct dlm_lock, list); | ||
1283 | |||
1284 | /* add another lock. */ | 1268 | /* add another lock. */ |
1285 | total_locks++; | 1269 | total_locks++; |
1286 | if (!dlm_add_lock_to_array(lock, mres, i)) | 1270 | if (!dlm_add_lock_to_array(lock, mres, i)) |
@@ -1717,7 +1701,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, | |||
1717 | struct dlm_lockstatus *lksb = NULL; | 1701 | struct dlm_lockstatus *lksb = NULL; |
1718 | int ret = 0; | 1702 | int ret = 0; |
1719 | int i, j, bad; | 1703 | int i, j, bad; |
1720 | struct list_head *iter; | ||
1721 | struct dlm_lock *lock = NULL; | 1704 | struct dlm_lock *lock = NULL; |
1722 | u8 from = O2NM_MAX_NODES; | 1705 | u8 from = O2NM_MAX_NODES; |
1723 | unsigned int added = 0; | 1706 | unsigned int added = 0; |
@@ -1755,8 +1738,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, | |||
1755 | spin_lock(&res->spinlock); | 1738 | spin_lock(&res->spinlock); |
1756 | for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { | 1739 | for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { |
1757 | tmpq = dlm_list_idx_to_ptr(res, j); | 1740 | tmpq = dlm_list_idx_to_ptr(res, j); |
1758 | list_for_each(iter, tmpq) { | 1741 | list_for_each_entry(lock, tmpq, list) { |
1759 | lock = list_entry (iter, struct dlm_lock, list); | ||
1760 | if (lock->ml.cookie != ml->cookie) | 1742 | if (lock->ml.cookie != ml->cookie) |
1761 | lock = NULL; | 1743 | lock = NULL; |
1762 | else | 1744 | else |
@@ -1930,8 +1912,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, | |||
1930 | struct dlm_lock_resource *res) | 1912 | struct dlm_lock_resource *res) |
1931 | { | 1913 | { |
1932 | int i; | 1914 | int i; |
1933 | struct list_head *queue, *iter, *iter2; | 1915 | struct list_head *queue; |
1934 | struct dlm_lock *lock; | 1916 | struct dlm_lock *lock, *next; |
1935 | 1917 | ||
1936 | res->state |= DLM_LOCK_RES_RECOVERING; | 1918 | res->state |= DLM_LOCK_RES_RECOVERING; |
1937 | if (!list_empty(&res->recovering)) { | 1919 | if (!list_empty(&res->recovering)) { |
@@ -1947,8 +1929,7 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, | |||
1947 | /* find any pending locks and put them back on proper list */ | 1929 | /* find any pending locks and put them back on proper list */ |
1948 | for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { | 1930 | for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { |
1949 | queue = dlm_list_idx_to_ptr(res, i); | 1931 | queue = dlm_list_idx_to_ptr(res, i); |
1950 | list_for_each_safe(iter, iter2, queue) { | 1932 | list_for_each_entry_safe(lock, next, queue, list) { |
1951 | lock = list_entry (iter, struct dlm_lock, list); | ||
1952 | dlm_lock_get(lock); | 1933 | dlm_lock_get(lock); |
1953 | if (lock->convert_pending) { | 1934 | if (lock->convert_pending) { |
1954 | /* move converting lock back to granted */ | 1935 | /* move converting lock back to granted */ |
@@ -2013,18 +1994,15 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, | |||
2013 | u8 dead_node, u8 new_master) | 1994 | u8 dead_node, u8 new_master) |
2014 | { | 1995 | { |
2015 | int i; | 1996 | int i; |
2016 | struct list_head *iter, *iter2; | ||
2017 | struct hlist_node *hash_iter; | 1997 | struct hlist_node *hash_iter; |
2018 | struct hlist_head *bucket; | 1998 | struct hlist_head *bucket; |
2019 | 1999 | struct dlm_lock_resource *res, *next; | |
2020 | struct dlm_lock_resource *res; | ||
2021 | 2000 | ||
2022 | mlog_entry_void(); | 2001 | mlog_entry_void(); |
2023 | 2002 | ||
2024 | assert_spin_locked(&dlm->spinlock); | 2003 | assert_spin_locked(&dlm->spinlock); |
2025 | 2004 | ||
2026 | list_for_each_safe(iter, iter2, &dlm->reco.resources) { | 2005 | list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { |
2027 | res = list_entry (iter, struct dlm_lock_resource, recovering); | ||
2028 | if (res->owner == dead_node) { | 2006 | if (res->owner == dead_node) { |
2029 | list_del_init(&res->recovering); | 2007 | list_del_init(&res->recovering); |
2030 | spin_lock(&res->spinlock); | 2008 | spin_lock(&res->spinlock); |
@@ -2099,7 +2077,7 @@ static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local) | |||
2099 | static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, | 2077 | static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, |
2100 | struct dlm_lock_resource *res, u8 dead_node) | 2078 | struct dlm_lock_resource *res, u8 dead_node) |
2101 | { | 2079 | { |
2102 | struct list_head *iter, *queue; | 2080 | struct list_head *queue; |
2103 | struct dlm_lock *lock; | 2081 | struct dlm_lock *lock; |
2104 | int blank_lvb = 0, local = 0; | 2082 | int blank_lvb = 0, local = 0; |
2105 | int i; | 2083 | int i; |
@@ -2121,8 +2099,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, | |||
2121 | 2099 | ||
2122 | for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { | 2100 | for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { |
2123 | queue = dlm_list_idx_to_ptr(res, i); | 2101 | queue = dlm_list_idx_to_ptr(res, i); |
2124 | list_for_each(iter, queue) { | 2102 | list_for_each_entry(lock, queue, list) { |
2125 | lock = list_entry (iter, struct dlm_lock, list); | ||
2126 | if (lock->ml.node == search_node) { | 2103 | if (lock->ml.node == search_node) { |
2127 | if (dlm_lvb_needs_invalidation(lock, local)) { | 2104 | if (dlm_lvb_needs_invalidation(lock, local)) { |
2128 | /* zero the lksb lvb and lockres lvb */ | 2105 | /* zero the lksb lvb and lockres lvb */ |
@@ -2143,8 +2120,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, | |||
2143 | static void dlm_free_dead_locks(struct dlm_ctxt *dlm, | 2120 | static void dlm_free_dead_locks(struct dlm_ctxt *dlm, |
2144 | struct dlm_lock_resource *res, u8 dead_node) | 2121 | struct dlm_lock_resource *res, u8 dead_node) |
2145 | { | 2122 | { |
2146 | struct list_head *iter, *tmpiter; | 2123 | struct dlm_lock *lock, *next; |
2147 | struct dlm_lock *lock; | ||
2148 | unsigned int freed = 0; | 2124 | unsigned int freed = 0; |
2149 | 2125 | ||
2150 | /* this node is the lockres master: | 2126 | /* this node is the lockres master: |
@@ -2155,24 +2131,21 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, | |||
2155 | assert_spin_locked(&res->spinlock); | 2131 | assert_spin_locked(&res->spinlock); |
2156 | 2132 | ||
2157 | /* TODO: check pending_asts, pending_basts here */ | 2133 | /* TODO: check pending_asts, pending_basts here */ |
2158 | list_for_each_safe(iter, tmpiter, &res->granted) { | 2134 | list_for_each_entry_safe(lock, next, &res->granted, list) { |
2159 | lock = list_entry (iter, struct dlm_lock, list); | ||
2160 | if (lock->ml.node == dead_node) { | 2135 | if (lock->ml.node == dead_node) { |
2161 | list_del_init(&lock->list); | 2136 | list_del_init(&lock->list); |
2162 | dlm_lock_put(lock); | 2137 | dlm_lock_put(lock); |
2163 | freed++; | 2138 | freed++; |
2164 | } | 2139 | } |
2165 | } | 2140 | } |
2166 | list_for_each_safe(iter, tmpiter, &res->converting) { | 2141 | list_for_each_entry_safe(lock, next, &res->converting, list) { |
2167 | lock = list_entry (iter, struct dlm_lock, list); | ||
2168 | if (lock->ml.node == dead_node) { | 2142 | if (lock->ml.node == dead_node) { |
2169 | list_del_init(&lock->list); | 2143 | list_del_init(&lock->list); |
2170 | dlm_lock_put(lock); | 2144 | dlm_lock_put(lock); |
2171 | freed++; | 2145 | freed++; |
2172 | } | 2146 | } |
2173 | } | 2147 | } |
2174 | list_for_each_safe(iter, tmpiter, &res->blocked) { | 2148 | list_for_each_entry_safe(lock, next, &res->blocked, list) { |
2175 | lock = list_entry (iter, struct dlm_lock, list); | ||
2176 | if (lock->ml.node == dead_node) { | 2149 | if (lock->ml.node == dead_node) { |
2177 | list_del_init(&lock->list); | 2150 | list_del_init(&lock->list); |
2178 | dlm_lock_put(lock); | 2151 | dlm_lock_put(lock); |
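One hunk above replaces memset(mres, 0, PAGE_SIZE) with clear_page(mres). The two are interchangeable here only because, as the surviving comment says, mres is exactly one page; clear_page() assumes a page-aligned PAGE_SIZE buffer and lets each architecture use its fastest full-page zeroing path. A hedged illustration:

    static void *alloc_zeroed_page(void)
    {
            /* clear_page() requires a page-aligned, PAGE_SIZE buffer,
             * e.g. one straight from the page allocator */
            void *buf = (void *)__get_free_page(GFP_NOFS);

            if (buf)
                    clear_page(buf);        /* arch-optimized full-page zero */
            return buf;
    }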
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index d1bd305ef0d7..f71250ed166f 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -600,15 +600,13 @@ static inline int ocfs2_highest_compat_lock_level(int level) | |||
600 | static void lockres_set_flags(struct ocfs2_lock_res *lockres, | 600 | static void lockres_set_flags(struct ocfs2_lock_res *lockres, |
601 | unsigned long newflags) | 601 | unsigned long newflags) |
602 | { | 602 | { |
603 | struct list_head *pos, *tmp; | 603 | struct ocfs2_mask_waiter *mw, *tmp; |
604 | struct ocfs2_mask_waiter *mw; | ||
605 | 604 | ||
606 | assert_spin_locked(&lockres->l_lock); | 605 | assert_spin_locked(&lockres->l_lock); |
607 | 606 | ||
608 | lockres->l_flags = newflags; | 607 | lockres->l_flags = newflags; |
609 | 608 | ||
610 | list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) { | 609 | list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) { |
611 | mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item); | ||
612 | if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) | 610 | if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) |
613 | continue; | 611 | continue; |
614 | 612 | ||
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h index f226b2207628..ff257628af16 100644 --- a/fs/ocfs2/endian.h +++ b/fs/ocfs2/endian.h | |||
@@ -32,6 +32,11 @@ static inline void le32_add_cpu(__le32 *var, u32 val) | |||
32 | *var = cpu_to_le32(le32_to_cpu(*var) + val); | 32 | *var = cpu_to_le32(le32_to_cpu(*var) + val); |
33 | } | 33 | } |
34 | 34 | ||
35 | static inline void le64_add_cpu(__le64 *var, u64 val) | ||
36 | { | ||
37 | *var = cpu_to_le64(le64_to_cpu(*var) + val); | ||
38 | } | ||
39 | |||
35 | static inline void le32_and_cpu(__le32 *var, u32 val) | 40 | static inline void le32_and_cpu(__le32 *var, u32 val) |
36 | { | 41 | { |
37 | *var = cpu_to_le32(le32_to_cpu(*var) & val); | 42 | *var = cpu_to_le32(le32_to_cpu(*var) & val); |
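The new le64_add_cpu() completes the pattern of the 32-bit helper directly above it: convert the little-endian value to CPU order, add, and convert back. A usage sketch against an invented on-disk record:

    struct my_disk_rec {
            __le64 r_byte_count;    /* hypothetical on-disk field */
    };

    static void grow_count(struct my_disk_rec *rec, u64 delta)
    {
            /* expands to:
             * rec->r_byte_count =
             *         cpu_to_le64(le64_to_cpu(rec->r_byte_count) + delta);
             */
            le64_add_cpu(&rec->r_byte_count, delta);
    }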
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index ba2b2ab1c6e4..03c1d365c78b 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c | |||
@@ -109,17 +109,14 @@ static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos, | |||
109 | */ | 109 | */ |
110 | void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) | 110 | void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) |
111 | { | 111 | { |
112 | struct list_head *p, *n; | 112 | struct ocfs2_extent_map_item *emi, *n; |
113 | struct ocfs2_extent_map_item *emi; | ||
114 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 113 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
115 | struct ocfs2_extent_map *em = &oi->ip_extent_map; | 114 | struct ocfs2_extent_map *em = &oi->ip_extent_map; |
116 | LIST_HEAD(tmp_list); | 115 | LIST_HEAD(tmp_list); |
117 | unsigned int range; | 116 | unsigned int range; |
118 | 117 | ||
119 | spin_lock(&oi->ip_lock); | 118 | spin_lock(&oi->ip_lock); |
120 | list_for_each_safe(p, n, &em->em_list) { | 119 | list_for_each_entry_safe(emi, n, &em->em_list, ei_list) { |
121 | emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); | ||
122 | |||
123 | if (emi->ei_cpos >= cpos) { | 120 | if (emi->ei_cpos >= cpos) { |
124 | /* Full truncate of this record. */ | 121 | /* Full truncate of this record. */ |
125 | list_move(&emi->ei_list, &tmp_list); | 122 | list_move(&emi->ei_list, &tmp_list); |
@@ -136,8 +133,7 @@ void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) | |||
136 | } | 133 | } |
137 | spin_unlock(&oi->ip_lock); | 134 | spin_unlock(&oi->ip_lock); |
138 | 135 | ||
139 | list_for_each_safe(p, n, &tmp_list) { | 136 | list_for_each_entry_safe(emi, n, &tmp_list, ei_list) { |
140 | emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); | ||
141 | list_del(&emi->ei_list); | 137 | list_del(&emi->ei_list); |
142 | kfree(emi); | 138 | kfree(emi); |
143 | } | 139 | } |
@@ -377,37 +373,6 @@ out: | |||
377 | return ret; | 373 | return ret; |
378 | } | 374 | } |
379 | 375 | ||
380 | /* | ||
381 | * Return the index of the extent record which contains cluster #v_cluster. | ||
382 | * -1 is returned if it was not found. | ||
383 | * | ||
384 | * Should work fine on interior and exterior nodes. | ||
385 | */ | ||
386 | static int ocfs2_search_extent_list(struct ocfs2_extent_list *el, | ||
387 | u32 v_cluster) | ||
388 | { | ||
389 | int ret = -1; | ||
390 | int i; | ||
391 | struct ocfs2_extent_rec *rec; | ||
392 | u32 rec_end, rec_start, clusters; | ||
393 | |||
394 | for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { | ||
395 | rec = &el->l_recs[i]; | ||
396 | |||
397 | rec_start = le32_to_cpu(rec->e_cpos); | ||
398 | clusters = ocfs2_rec_clusters(el, rec); | ||
399 | |||
400 | rec_end = rec_start + clusters; | ||
401 | |||
402 | if (v_cluster >= rec_start && v_cluster < rec_end) { | ||
403 | ret = i; | ||
404 | break; | ||
405 | } | ||
406 | } | ||
407 | |||
408 | return ret; | ||
409 | } | ||
410 | |||
411 | int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, | 376 | int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, |
412 | u32 *p_cluster, u32 *num_clusters, | 377 | u32 *p_cluster, u32 *num_clusters, |
413 | unsigned int *extent_flags) | 378 | unsigned int *extent_flags) |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4979b6675717..f04c7aa834cb 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -263,6 +263,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, | |||
263 | int status; | 263 | int status; |
264 | handle_t *handle; | 264 | handle_t *handle; |
265 | struct ocfs2_dinode *di; | 265 | struct ocfs2_dinode *di; |
266 | u64 cluster_bytes; | ||
266 | 267 | ||
267 | mlog_entry_void(); | 268 | mlog_entry_void(); |
268 | 269 | ||
@@ -286,7 +287,9 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, | |||
286 | /* | 287 | /* |
287 | * Do this before setting i_size. | 288 | * Do this before setting i_size. |
288 | */ | 289 | */ |
289 | status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); | 290 | cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); |
291 | status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, | ||
292 | cluster_bytes); | ||
290 | if (status) { | 293 | if (status) { |
291 | mlog_errno(status); | 294 | mlog_errno(status); |
292 | goto out_commit; | 295 | goto out_commit; |
@@ -326,9 +329,6 @@ static int ocfs2_truncate_file(struct inode *inode, | |||
326 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 329 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
327 | (unsigned long long)new_i_size); | 330 | (unsigned long long)new_i_size); |
328 | 331 | ||
329 | unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); | ||
330 | truncate_inode_pages(inode->i_mapping, new_i_size); | ||
331 | |||
332 | fe = (struct ocfs2_dinode *) di_bh->b_data; | 332 | fe = (struct ocfs2_dinode *) di_bh->b_data; |
333 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 333 | if (!OCFS2_IS_VALID_DINODE(fe)) { |
334 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | 334 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); |
@@ -363,16 +363,23 @@ static int ocfs2_truncate_file(struct inode *inode, | |||
363 | if (new_i_size == le64_to_cpu(fe->i_size)) | 363 | if (new_i_size == le64_to_cpu(fe->i_size)) |
364 | goto bail; | 364 | goto bail; |
365 | 365 | ||
366 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
367 | |||
366 | /* This forces other nodes to sync and drop their pages. Do | 368 | /* This forces other nodes to sync and drop their pages. Do |
367 | * this even if we have a truncate without allocation change - | 369 | * this even if we have a truncate without allocation change - |
368 | * ocfs2 cluster sizes can be much greater than page size, so | 370 | * ocfs2 cluster sizes can be much greater than page size, so |
369 | * we have to truncate them anyway. */ | 371 | * we have to truncate them anyway. */ |
370 | status = ocfs2_data_lock(inode, 1); | 372 | status = ocfs2_data_lock(inode, 1); |
371 | if (status < 0) { | 373 | if (status < 0) { |
374 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
375 | |||
372 | mlog_errno(status); | 376 | mlog_errno(status); |
373 | goto bail; | 377 | goto bail; |
374 | } | 378 | } |
375 | 379 | ||
380 | unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); | ||
381 | truncate_inode_pages(inode->i_mapping, new_i_size); | ||
382 | |||
376 | /* alright, we're going to need to do a full blown alloc size | 383 | /* alright, we're going to need to do a full blown alloc size |
377 | * change. Orphan the inode so that recovery can complete the | 384 | * change. Orphan the inode so that recovery can complete the |
378 | * truncate if necessary. This does the task of marking | 385 | * truncate if necessary. This does the task of marking |
@@ -399,6 +406,8 @@ static int ocfs2_truncate_file(struct inode *inode, | |||
399 | bail_unlock_data: | 406 | bail_unlock_data: |
400 | ocfs2_data_unlock(inode, 1); | 407 | ocfs2_data_unlock(inode, 1); |
401 | 408 | ||
409 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
410 | |||
402 | bail: | 411 | bail: |
403 | 412 | ||
404 | mlog_exit(status); | 413 | mlog_exit(status); |
@@ -419,6 +428,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | |||
419 | struct inode *inode, | 428 | struct inode *inode, |
420 | u32 *logical_offset, | 429 | u32 *logical_offset, |
421 | u32 clusters_to_add, | 430 | u32 clusters_to_add, |
431 | int mark_unwritten, | ||
422 | struct buffer_head *fe_bh, | 432 | struct buffer_head *fe_bh, |
423 | handle_t *handle, | 433 | handle_t *handle, |
424 | struct ocfs2_alloc_context *data_ac, | 434 | struct ocfs2_alloc_context *data_ac, |
@@ -431,9 +441,13 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | |||
431 | enum ocfs2_alloc_restarted reason = RESTART_NONE; | 441 | enum ocfs2_alloc_restarted reason = RESTART_NONE; |
432 | u32 bit_off, num_bits; | 442 | u32 bit_off, num_bits; |
433 | u64 block; | 443 | u64 block; |
444 | u8 flags = 0; | ||
434 | 445 | ||
435 | BUG_ON(!clusters_to_add); | 446 | BUG_ON(!clusters_to_add); |
436 | 447 | ||
448 | if (mark_unwritten) | ||
449 | flags = OCFS2_EXT_UNWRITTEN; | ||
450 | |||
437 | free_extents = ocfs2_num_free_extents(osb, inode, fe); | 451 | free_extents = ocfs2_num_free_extents(osb, inode, fe); |
438 | if (free_extents < 0) { | 452 | if (free_extents < 0) { |
439 | status = free_extents; | 453 | status = free_extents; |
@@ -483,7 +497,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | |||
483 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); | 497 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); |
484 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, | 498 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, |
485 | *logical_offset, block, num_bits, | 499 | *logical_offset, block, num_bits, |
486 | meta_ac); | 500 | flags, meta_ac); |
487 | if (status < 0) { | 501 | if (status < 0) { |
488 | mlog_errno(status); | 502 | mlog_errno(status); |
489 | goto leave; | 503 | goto leave; |
@@ -516,25 +530,31 @@ leave: | |||
516 | * For a given allocation, determine which allocators will need to be | 530 | * For a given allocation, determine which allocators will need to be |
517 | * accessed, and lock them, reserving the appropriate number of bits. | 531 | * accessed, and lock them, reserving the appropriate number of bits. |
518 | * | 532 | * |
519 | * Called from ocfs2_extend_allocation() for file systems which don't | 533 | * Sparse file systems call this from ocfs2_write_begin_nolock() |
520 | * support holes, and from ocfs2_write() for file systems which | 534 | * and ocfs2_allocate_unwritten_extents(). |
521 | * understand sparse inodes. | 535 | * |
536 | * File systems which don't support holes call this from | ||
537 | * ocfs2_extend_allocation(). | ||
522 | */ | 538 | */ |
523 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, | 539 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, |
524 | u32 clusters_to_add, | 540 | u32 clusters_to_add, u32 extents_to_split, |
525 | struct ocfs2_alloc_context **data_ac, | 541 | struct ocfs2_alloc_context **data_ac, |
526 | struct ocfs2_alloc_context **meta_ac) | 542 | struct ocfs2_alloc_context **meta_ac) |
527 | { | 543 | { |
528 | int ret, num_free_extents; | 544 | int ret = 0, num_free_extents; |
545 | unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; | ||
529 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 546 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
530 | 547 | ||
531 | *meta_ac = NULL; | 548 | *meta_ac = NULL; |
532 | *data_ac = NULL; | 549 | if (data_ac) |
550 | *data_ac = NULL; | ||
551 | |||
552 | BUG_ON(clusters_to_add != 0 && data_ac == NULL); | ||
533 | 553 | ||
534 | mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " | 554 | mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " |
535 | "clusters_to_add = %u\n", | 555 | "clusters_to_add = %u, extents_to_split = %u\n", |
536 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), | 556 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), |
537 | le32_to_cpu(di->i_clusters), clusters_to_add); | 557 | le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split); |
538 | 558 | ||
539 | num_free_extents = ocfs2_num_free_extents(osb, inode, di); | 559 | num_free_extents = ocfs2_num_free_extents(osb, inode, di); |
540 | if (num_free_extents < 0) { | 560 | if (num_free_extents < 0) { |
@@ -552,9 +572,12 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, | |||
552 | * | 572 | * |
553 | * Most of the time we'll only be seeing this 1 cluster at a time | 573 | * Most of the time we'll only be seeing this 1 cluster at a time |
554 | * anyway. | 574 | * anyway. |
575 | * | ||
576 | * Always lock for any unwritten extents - we might want to | ||
577 | * add blocks during a split. | ||
555 | */ | 578 | */ |
556 | if (!num_free_extents || | 579 | if (!num_free_extents || |
557 | (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { | 580 | (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { |
558 | ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); | 581 | ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); |
559 | if (ret < 0) { | 582 | if (ret < 0) { |
560 | if (ret != -ENOSPC) | 583 | if (ret != -ENOSPC) |
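The worst-case sizing in the hunk above is worth spelling out. Each cluster added may need its own extent record, and rewriting the middle of an existing (say, unwritten) extent can split one record into three: the untouched left piece, the changed middle, and the untouched right piece, which is two extra records per split. That gives max_recs_needed = clusters_to_add + 2 * extents_to_split. For instance, a write into the interior of one unwritten extent that adds no clusters may need up to two free records before new metadata must be reserved. (The arithmetic is inferred from the expression; the patch comment itself only states the rule.)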
@@ -563,6 +586,9 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, | |||
563 | } | 586 | } |
564 | } | 587 | } |
565 | 588 | ||
589 | if (clusters_to_add == 0) | ||
590 | goto out; | ||
591 | |||
566 | ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); | 592 | ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); |
567 | if (ret < 0) { | 593 | if (ret < 0) { |
568 | if (ret != -ENOSPC) | 594 | if (ret != -ENOSPC) |
@@ -585,14 +611,13 @@ out: | |||
585 | return ret; | 611 | return ret; |
586 | } | 612 | } |
587 | 613 | ||
588 | static int ocfs2_extend_allocation(struct inode *inode, | 614 | static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, |
589 | u32 clusters_to_add) | 615 | u32 clusters_to_add, int mark_unwritten) |
590 | { | 616 | { |
591 | int status = 0; | 617 | int status = 0; |
592 | int restart_func = 0; | 618 | int restart_func = 0; |
593 | int drop_alloc_sem = 0; | ||
594 | int credits; | 619 | int credits; |
595 | u32 prev_clusters, logical_start; | 620 | u32 prev_clusters; |
596 | struct buffer_head *bh = NULL; | 621 | struct buffer_head *bh = NULL; |
597 | struct ocfs2_dinode *fe = NULL; | 622 | struct ocfs2_dinode *fe = NULL; |
598 | handle_t *handle = NULL; | 623 | handle_t *handle = NULL; |
@@ -607,7 +632,7 @@ static int ocfs2_extend_allocation(struct inode *inode, | |||
607 | * This function only exists for file systems which don't | 632 | * This function only exists for file systems which don't |
608 | * support holes. | 633 | * support holes. |
609 | */ | 634 | */ |
610 | BUG_ON(ocfs2_sparse_alloc(osb)); | 635 | BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); |
611 | 636 | ||
612 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, | 637 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, |
613 | OCFS2_BH_CACHED, inode); | 638 | OCFS2_BH_CACHED, inode); |
@@ -623,19 +648,10 @@ static int ocfs2_extend_allocation(struct inode *inode, | |||
623 | goto leave; | 648 | goto leave; |
624 | } | 649 | } |
625 | 650 | ||
626 | logical_start = OCFS2_I(inode)->ip_clusters; | ||
627 | |||
628 | restart_all: | 651 | restart_all: |
629 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); | 652 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); |
630 | 653 | ||
631 | /* blocks peope in read/write from reading our allocation | 654 | status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac, |
632 | * until we're done changing it. We depend on i_mutex to block | ||
633 | * other extend/truncate calls while we're here. Ordering wrt | ||
634 | * start_trans is important here -- always do it before! */ | ||
635 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
636 | drop_alloc_sem = 1; | ||
637 | |||
638 | status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac, | ||
639 | &meta_ac); | 655 | &meta_ac); |
640 | if (status) { | 656 | if (status) { |
641 | mlog_errno(status); | 657 | mlog_errno(status); |
@@ -668,6 +684,7 @@ restarted_transaction: | |||
668 | inode, | 684 | inode, |
669 | &logical_start, | 685 | &logical_start, |
670 | clusters_to_add, | 686 | clusters_to_add, |
687 | mark_unwritten, | ||
671 | bh, | 688 | bh, |
672 | handle, | 689 | handle, |
673 | data_ac, | 690 | data_ac, |
@@ -720,10 +737,6 @@ restarted_transaction: | |||
720 | OCFS2_I(inode)->ip_clusters, i_size_read(inode)); | 737 | OCFS2_I(inode)->ip_clusters, i_size_read(inode)); |
721 | 738 | ||
722 | leave: | 739 | leave: |
723 | if (drop_alloc_sem) { | ||
724 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
725 | drop_alloc_sem = 0; | ||
726 | } | ||
727 | if (handle) { | 740 | if (handle) { |
728 | ocfs2_commit_trans(osb, handle); | 741 | ocfs2_commit_trans(osb, handle); |
729 | handle = NULL; | 742 | handle = NULL; |
@@ -749,6 +762,25 @@ leave: | |||
749 | return status; | 762 | return status; |
750 | } | 763 | } |
751 | 764 | ||
765 | static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, | ||
766 | u32 clusters_to_add, int mark_unwritten) | ||
767 | { | ||
768 | int ret; | ||
769 | |||
770 | /* | ||
771 | * The alloc sem blocks people in read/write from reading our | ||
772 | * allocation until we're done changing it. We depend on | ||
773 | * i_mutex to block other extend/truncate calls while we're | ||
774 | * here. | ||
775 | */ | ||
776 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
777 | ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add, | ||
778 | mark_unwritten); | ||
779 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
780 | |||
781 | return ret; | ||
782 | } | ||
783 | |||
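The wrapper above preserves the ordering rule from the comment it replaces: ip_alloc_sem is still taken before any transaction is started, and i_mutex is still expected from the caller. Hoisting the semaphore out of the transaction-restart loop means callers that already hold it, presumably the sparse-write and unwritten-extent paths added elsewhere in this series, can use the bare __ocfs2_extend_allocation(), while ocfs2_extend_file() keeps the old locking through this wrapper.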
752 | /* Some parts of this taken from generic_cont_expand, which turned out | 784 | /* Some parts of this taken from generic_cont_expand, which turned out |
753 | * to be too fragile to do exactly what we need without us having to | 785 | * to be too fragile to do exactly what we need without us having to |
754 | * worry about recursive locking in ->prepare_write() and | 786 | * worry about recursive locking in ->prepare_write() and |
@@ -890,7 +922,9 @@ static int ocfs2_extend_file(struct inode *inode, | |||
890 | } | 922 | } |
891 | 923 | ||
892 | if (clusters_to_add) { | 924 | if (clusters_to_add) { |
893 | ret = ocfs2_extend_allocation(inode, clusters_to_add); | 925 | ret = ocfs2_extend_allocation(inode, |
926 | OCFS2_I(inode)->ip_clusters, | ||
927 | clusters_to_add, 0); | ||
894 | if (ret < 0) { | 928 | if (ret < 0) { |
895 | mlog_errno(ret); | 929 | mlog_errno(ret); |
896 | goto out_unlock; | 930 | goto out_unlock; |
@@ -995,6 +1029,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) | |||
995 | goto bail_unlock; | 1029 | goto bail_unlock; |
996 | } | 1030 | } |
997 | 1031 | ||
1032 | /* | ||
1033 | * This will intentionally not wind up calling vmtruncate(), | ||
1034 | * since all the work for a size change has been done above. | ||
1035 | * Otherwise, we could get into problems with truncate as | ||
1036 | * ip_alloc_sem is used there to protect against i_size | ||
1037 | * changes. | ||
1038 | */ | ||
998 | status = inode_setattr(inode, attr); | 1039 | status = inode_setattr(inode, attr); |
999 | if (status < 0) { | 1040 | if (status < 0) { |
1000 | mlog_errno(status); | 1041 | mlog_errno(status); |
@@ -1070,17 +1111,16 @@ out: | |||
1070 | return ret; | 1111 | return ret; |
1071 | } | 1112 | } |
1072 | 1113 | ||
1073 | static int ocfs2_write_remove_suid(struct inode *inode) | 1114 | static int __ocfs2_write_remove_suid(struct inode *inode, |
1115 | struct buffer_head *bh) | ||
1074 | { | 1116 | { |
1075 | int ret; | 1117 | int ret; |
1076 | struct buffer_head *bh = NULL; | ||
1077 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
1078 | handle_t *handle; | 1118 | handle_t *handle; |
1079 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1119 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1080 | struct ocfs2_dinode *di; | 1120 | struct ocfs2_dinode *di; |
1081 | 1121 | ||
1082 | mlog_entry("(Inode %llu, mode 0%o)\n", | 1122 | mlog_entry("(Inode %llu, mode 0%o)\n", |
1083 | (unsigned long long)oi->ip_blkno, inode->i_mode); | 1123 | (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode); |
1084 | 1124 | ||
1085 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 1125 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
1086 | if (handle == NULL) { | 1126 | if (handle == NULL) { |
@@ -1089,17 +1129,11 @@ static int ocfs2_write_remove_suid(struct inode *inode) | |||
1089 | goto out; | 1129 | goto out; |
1090 | } | 1130 | } |
1091 | 1131 | ||
1092 | ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); | ||
1093 | if (ret < 0) { | ||
1094 | mlog_errno(ret); | ||
1095 | goto out_trans; | ||
1096 | } | ||
1097 | |||
1098 | ret = ocfs2_journal_access(handle, inode, bh, | 1132 | ret = ocfs2_journal_access(handle, inode, bh, |
1099 | OCFS2_JOURNAL_ACCESS_WRITE); | 1133 | OCFS2_JOURNAL_ACCESS_WRITE); |
1100 | if (ret < 0) { | 1134 | if (ret < 0) { |
1101 | mlog_errno(ret); | 1135 | mlog_errno(ret); |
1102 | goto out_bh; | 1136 | goto out_trans; |
1103 | } | 1137 | } |
1104 | 1138 | ||
1105 | inode->i_mode &= ~S_ISUID; | 1139 | inode->i_mode &= ~S_ISUID; |
@@ -1112,8 +1146,7 @@ static int ocfs2_write_remove_suid(struct inode *inode) | |||
1112 | ret = ocfs2_journal_dirty(handle, bh); | 1146 | ret = ocfs2_journal_dirty(handle, bh); |
1113 | if (ret < 0) | 1147 | if (ret < 0) |
1114 | mlog_errno(ret); | 1148 | mlog_errno(ret); |
1115 | out_bh: | 1149 | |
1116 | brelse(bh); | ||
1117 | out_trans: | 1150 | out_trans: |
1118 | ocfs2_commit_trans(osb, handle); | 1151 | ocfs2_commit_trans(osb, handle); |
1119 | out: | 1152 | out: |
@@ -1159,6 +1192,460 @@ out: | |||
1159 | return ret; | 1192 | return ret; |
1160 | } | 1193 | } |
1161 | 1194 | ||
1195 | static int ocfs2_write_remove_suid(struct inode *inode) | ||
1196 | { | ||
1197 | int ret; | ||
1198 | struct buffer_head *bh = NULL; | ||
1199 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
1200 | |||
1201 | ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), | ||
1202 | oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); | ||
1203 | if (ret < 0) { | ||
1204 | mlog_errno(ret); | ||
1205 | goto out; | ||
1206 | } | ||
1207 | |||
1208 | ret = __ocfs2_write_remove_suid(inode, bh); | ||
1209 | out: | ||
1210 | brelse(bh); | ||
1211 | return ret; | ||
1212 | } | ||
1213 | |||
1214 | /* | ||
1215 | * Allocate enough extents to cover the region starting at byte offset | ||
1216 | * start for len bytes. Existing extents are skipped; any extents | ||
1217 | * added are marked as "unwritten". | ||
1218 | */ | ||
1219 | static int ocfs2_allocate_unwritten_extents(struct inode *inode, | ||
1220 | u64 start, u64 len) | ||
1221 | { | ||
1222 | int ret; | ||
1223 | u32 cpos, phys_cpos, clusters, alloc_size; | ||
1224 | |||
1225 | /* | ||
1226 | * We consider both start and len to be inclusive. | ||
1227 | */ | ||
1228 | cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; | ||
1229 | clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len); | ||
1230 | clusters -= cpos; | ||
1231 | |||
1232 | while (clusters) { | ||
1233 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, | ||
1234 | &alloc_size, NULL); | ||
1235 | if (ret) { | ||
1236 | mlog_errno(ret); | ||
1237 | goto out; | ||
1238 | } | ||
1239 | |||
1240 | /* | ||
1241 | * Hole or existing extent len can be arbitrary, so | ||
1242 | * cap it to our own allocation request. | ||
1243 | */ | ||
1244 | if (alloc_size > clusters) | ||
1245 | alloc_size = clusters; | ||
1246 | |||
1247 | if (phys_cpos) { | ||
1248 | /* | ||
1249 | * We already have an allocation at this | ||
1250 | * region so we can safely skip it. | ||
1251 | */ | ||
1252 | goto next; | ||
1253 | } | ||
1254 | |||
1255 | ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); | ||
1256 | if (ret) { | ||
1257 | if (ret != -ENOSPC) | ||
1258 | mlog_errno(ret); | ||
1259 | goto out; | ||
1260 | } | ||
1261 | |||
1262 | next: | ||
1263 | cpos += alloc_size; | ||
1264 | clusters -= alloc_size; | ||
1265 | } | ||
1266 | |||
1267 | ret = 0; | ||
1268 | out: | ||
1269 | return ret; | ||
1270 | } | ||
1271 | |||
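As a worked example of the cluster arithmetic above (illustrative numbers): with 64KB clusters (s_clustersize_bits = 16), a request of start = 100000 and len = 200000 gives cpos = 100000 >> 16 = 1, and ocfs2_clusters_for_bytes(sb, 300000) rounds up to 5, so clusters = 5 - 1 = 4. The loop then walks clusters 1 through 4, skipping any that already have a physical mapping and allocating unwritten extents to fill the holes.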
1272 | static int __ocfs2_remove_inode_range(struct inode *inode, | ||
1273 | struct buffer_head *di_bh, | ||
1274 | u32 cpos, u32 phys_cpos, u32 len, | ||
1275 | struct ocfs2_cached_dealloc_ctxt *dealloc) | ||
1276 | { | ||
1277 | int ret; | ||
1278 | u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); | ||
1279 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1280 | struct inode *tl_inode = osb->osb_tl_inode; | ||
1281 | handle_t *handle; | ||
1282 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
1283 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | ||
1284 | |||
1285 | ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac); | ||
1286 | if (ret) { | ||
1287 | mlog_errno(ret); | ||
1288 | return ret; | ||
1289 | } | ||
1290 | |||
1291 | mutex_lock(&tl_inode->i_mutex); | ||
1292 | |||
1293 | if (ocfs2_truncate_log_needs_flush(osb)) { | ||
1294 | ret = __ocfs2_flush_truncate_log(osb); | ||
1295 | if (ret < 0) { | ||
1296 | mlog_errno(ret); | ||
1297 | goto out; | ||
1298 | } | ||
1299 | } | ||
1300 | |||
1301 | handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); | ||
1302 | if (handle == NULL) { | ||
1303 | ret = -ENOMEM; | ||
1304 | mlog_errno(ret); | ||
1305 | goto out; | ||
1306 | } | ||
1307 | |||
1308 | ret = ocfs2_journal_access(handle, inode, di_bh, | ||
1309 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1310 | if (ret) { | ||
1311 | mlog_errno(ret); | ||
1312 | goto out; | ||
1313 | } | ||
1314 | |||
1315 | ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac, | ||
1316 | dealloc); | ||
1317 | if (ret) { | ||
1318 | mlog_errno(ret); | ||
1319 | goto out_commit; | ||
1320 | } | ||
1321 | |||
1322 | OCFS2_I(inode)->ip_clusters -= len; | ||
1323 | di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); | ||
1324 | |||
1325 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
1326 | if (ret) { | ||
1327 | mlog_errno(ret); | ||
1328 | goto out_commit; | ||
1329 | } | ||
1330 | |||
1331 | ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); | ||
1332 | if (ret) | ||
1333 | mlog_errno(ret); | ||
1334 | |||
1335 | out_commit: | ||
1336 | ocfs2_commit_trans(osb, handle); | ||
1337 | out: | ||
1338 | mutex_unlock(&tl_inode->i_mutex); | ||
1339 | |||
1340 | if (meta_ac) | ||
1341 | ocfs2_free_alloc_context(meta_ac); | ||
1342 | |||
1343 | return ret; | ||
1344 | } | ||
1345 | |||
1346 | /* | ||
1347 | * Truncate a byte range, avoiding pages within partial clusters. This | ||
1348 | * preserves those pages for the zeroing code to write to. | ||
1349 | */ | ||
1350 | static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, | ||
1351 | u64 byte_len) | ||
1352 | { | ||
1353 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1354 | loff_t start, end; | ||
1355 | struct address_space *mapping = inode->i_mapping; | ||
1356 | |||
1357 | start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); | ||
1358 | end = byte_start + byte_len; | ||
1359 | end = end & ~(osb->s_clustersize - 1); | ||
1360 | |||
1361 | if (start < end) { | ||
1362 | unmap_mapping_range(mapping, start, end - start, 0); | ||
1363 | truncate_inode_pages_range(mapping, start, end - 1); | ||
1364 | } | ||
1365 | } | ||
1366 | |||
1367 | static int ocfs2_zero_partial_clusters(struct inode *inode, | ||
1368 | u64 start, u64 len) | ||
1369 | { | ||
1370 | int ret = 0; | ||
1371 | u64 tmpend, end = start + len; | ||
1372 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1373 | unsigned int csize = osb->s_clustersize; | ||
1374 | handle_t *handle; | ||
1375 | |||
1376 | /* | ||
1377 | * The "start" and "end" values are NOT necessarily part of | ||
1378 | * the range whose allocation is being deleted. Rather, this | ||
1379 | * is what the user passed in with the request. We must zero | ||
1380 | * partial clusters here. There's no need to worry about | ||
1381 | * physical allocation - the zeroing code knows to skip holes. | ||
1382 | */ | ||
1383 | mlog(0, "byte start: %llu, end: %llu\n", | ||
1384 | (unsigned long long)start, (unsigned long long)end); | ||
1385 | |||
1386 | /* | ||
1387 | * If both edges are on a cluster boundary then there's no | ||
1388 | * zeroing required as the region is part of the allocation to | ||
1389 | * be truncated. | ||
1390 | */ | ||
1391 | if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) | ||
1392 | goto out; | ||
1393 | |||
1394 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | ||
1395 | if (handle == NULL) { | ||
1396 | ret = -ENOMEM; | ||
1397 | mlog_errno(ret); | ||
1398 | goto out; | ||
1399 | } | ||
1400 | |||
1401 | /* | ||
1402 | * We want to get the byte offset of the end of the 1st cluster. | ||
1403 | */ | ||
1404 | tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); | ||
1405 | if (tmpend > end) | ||
1406 | tmpend = end; | ||
1407 | |||
1408 | mlog(0, "1st range: start: %llu, tmpend: %llu\n", | ||
1409 | (unsigned long long)start, (unsigned long long)tmpend); | ||
1410 | |||
1411 | ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); | ||
1412 | if (ret) | ||
1413 | mlog_errno(ret); | ||
1414 | |||
1415 | if (tmpend < end) { | ||
1416 | /* | ||
1417 | * This may make start and end equal, but the zeroing | ||
1418 | * code will skip any work in that case so there's no | ||
1419 | * need to catch that case here. | ||
1420 | */ | ||
1421 | start = end & ~(osb->s_clustersize - 1); | ||
1422 | |||
1423 | mlog(0, "2nd range: start: %llu, end: %llu\n", | ||
1424 | (unsigned long long)start, (unsigned long long)end); | ||
1425 | |||
1426 | ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); | ||
1427 | if (ret) | ||
1428 | mlog_errno(ret); | ||
1429 | } | ||
1430 | |||
1431 | ocfs2_commit_trans(osb, handle); | ||
1432 | out: | ||
1433 | return ret; | ||
1434 | } | ||
1435 | |||
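A worked example of the two passes above (illustrative numbers, csize = 4096): punching start = 5000, len = 7000 gives end = 12000. Neither edge is cluster-aligned, so the first pass zeroes [5000, 8192), the tail of the cluster containing start. Since tmpend < end, the second pass zeroes [8192, 12000), the head of the cluster containing end. Whole clusters strictly inside the request are not zeroed here; deallocating them is the caller's job.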
1436 | static int ocfs2_remove_inode_range(struct inode *inode, | ||
1437 | struct buffer_head *di_bh, u64 byte_start, | ||
1438 | u64 byte_len) | ||
1439 | { | ||
1440 | int ret = 0; | ||
1441 | u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; | ||
1442 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1443 | struct ocfs2_cached_dealloc_ctxt dealloc; | ||
1444 | |||
1445 | ocfs2_init_dealloc_ctxt(&dealloc); | ||
1446 | |||
1447 | if (byte_len == 0) | ||
1448 | return 0; | ||
1449 | |||
1450 | trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); | ||
1451 | trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; | ||
1452 | if (trunc_len >= trunc_start) | ||
1453 | trunc_len -= trunc_start; | ||
1454 | else | ||
1455 | trunc_len = 0; | ||
1456 | |||
1457 | mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", | ||
1458 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
1459 | (unsigned long long)byte_start, | ||
1460 | (unsigned long long)byte_len, trunc_start, trunc_len); | ||
1461 | |||
1462 | ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); | ||
1463 | if (ret) { | ||
1464 | mlog_errno(ret); | ||
1465 | goto out; | ||
1466 | } | ||
1467 | |||
1468 | cpos = trunc_start; | ||
1469 | while (trunc_len) { | ||
1470 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, | ||
1471 | &alloc_size, NULL); | ||
1472 | if (ret) { | ||
1473 | mlog_errno(ret); | ||
1474 | goto out; | ||
1475 | } | ||
1476 | |||
1477 | if (alloc_size > trunc_len) | ||
1478 | alloc_size = trunc_len; | ||
1479 | |||
1480 | /* Only do work for non-holes */ | ||
1481 | if (phys_cpos != 0) { | ||
1482 | ret = __ocfs2_remove_inode_range(inode, di_bh, cpos, | ||
1483 | phys_cpos, alloc_size, | ||
1484 | &dealloc); | ||
1485 | if (ret) { | ||
1486 | mlog_errno(ret); | ||
1487 | goto out; | ||
1488 | } | ||
1489 | } | ||
1490 | |||
1491 | cpos += alloc_size; | ||
1492 | trunc_len -= alloc_size; | ||
1493 | } | ||
1494 | |||
1495 | ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); | ||
1496 | |||
1497 | out: | ||
1498 | ocfs2_schedule_truncate_log_flush(osb, 1); | ||
1499 | ocfs2_run_deallocs(osb, &dealloc); | ||
1500 | |||
1501 | return ret; | ||
1502 | } | ||
1503 | |||
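Continuing the example above (csize = 4096): byte_start = 5000, byte_len = 7000 gives trunc_start = ocfs2_clusters_for_bytes(sb, 5000) = 2 and (12000 >> 12) = 2, so trunc_len = 0 and only the edge zeroing runs. By contrast, byte_start = 4096, byte_len = 12288 gives trunc_start = 1 and (16384 >> 12) - 1 = 3, so clusters 1 through 3 are deallocated and, both edges being aligned, ocfs2_zero_partial_clusters() returns early.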
1504 | /* | ||
1505 | * Parts of this function taken from xfs_change_file_space() | ||
1506 | */ | ||
1507 | int ocfs2_change_file_space(struct file *file, unsigned int cmd, | ||
1508 | struct ocfs2_space_resv *sr) | ||
1509 | { | ||
1510 | int ret; | ||
1511 | s64 llen; | ||
1512 | struct inode *inode = file->f_path.dentry->d_inode; | ||
1513 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1514 | struct buffer_head *di_bh = NULL; | ||
1515 | handle_t *handle; | ||
1516 | unsigned long long max_off = ocfs2_max_file_offset(inode->i_sb->s_blocksize_bits); | ||
1517 | |||
1518 | if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && | ||
1519 | !ocfs2_writes_unwritten_extents(osb)) | ||
1520 | return -ENOTTY; | ||
1521 | else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && | ||
1522 | !ocfs2_sparse_alloc(osb)) | ||
1523 | return -ENOTTY; | ||
1524 | |||
1525 | if (!S_ISREG(inode->i_mode)) | ||
1526 | return -EINVAL; | ||
1527 | |||
1528 | if (!(file->f_mode & FMODE_WRITE)) | ||
1529 | return -EBADF; | ||
1530 | |||
1531 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) | ||
1532 | return -EROFS; | ||
1533 | |||
1534 | mutex_lock(&inode->i_mutex); | ||
1535 | |||
1536 | /* | ||
1537 | * This prevents concurrent writes on other nodes | ||
1538 | */ | ||
1539 | ret = ocfs2_rw_lock(inode, 1); | ||
1540 | if (ret) { | ||
1541 | mlog_errno(ret); | ||
1542 | goto out; | ||
1543 | } | ||
1544 | |||
1545 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | ||
1546 | if (ret) { | ||
1547 | mlog_errno(ret); | ||
1548 | goto out_rw_unlock; | ||
1549 | } | ||
1550 | |||
1551 | if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { | ||
1552 | ret = -EPERM; | ||
1553 | goto out_meta_unlock; | ||
1554 | } | ||
1555 | |||
1556 | switch (sr->l_whence) { | ||
1557 | case 0: /*SEEK_SET*/ | ||
1558 | break; | ||
1559 | case 1: /*SEEK_CUR*/ | ||
1560 | sr->l_start += file->f_pos; | ||
1561 | break; | ||
1562 | case 2: /*SEEK_END*/ | ||
1563 | sr->l_start += i_size_read(inode); | ||
1564 | break; | ||
1565 | default: | ||
1566 | ret = -EINVAL; | ||
1567 | goto out_meta_unlock; | ||
1568 | } | ||
1569 | sr->l_whence = 0; | ||
1570 | |||
1571 | llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len; | ||
1572 | |||
1573 | if (sr->l_start < 0 | ||
1574 | || sr->l_start > max_off | ||
1575 | || (sr->l_start + llen) < 0 | ||
1576 | || (sr->l_start + llen) > max_off) { | ||
1577 | ret = -EINVAL; | ||
1578 | goto out_meta_unlock; | ||
1579 | } | ||
1580 | |||
1581 | if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { | ||
1582 | if (sr->l_len <= 0) { | ||
1583 | ret = -EINVAL; | ||
1584 | goto out_meta_unlock; | ||
1585 | } | ||
1586 | } | ||
1587 | |||
1588 | if (should_remove_suid(file->f_path.dentry)) { | ||
1589 | ret = __ocfs2_write_remove_suid(inode, di_bh); | ||
1590 | if (ret) { | ||
1591 | mlog_errno(ret); | ||
1592 | goto out_meta_unlock; | ||
1593 | } | ||
1594 | } | ||
1595 | |||
1596 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1597 | switch (cmd) { | ||
1598 | case OCFS2_IOC_RESVSP: | ||
1599 | case OCFS2_IOC_RESVSP64: | ||
1600 | /* | ||
1601 | * This takes unsigned offsets, but the signed ones we | ||
1602 | * pass have been checked against overflow above. | ||
1603 | */ | ||
1604 | ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start, | ||
1605 | sr->l_len); | ||
1606 | break; | ||
1607 | case OCFS2_IOC_UNRESVSP: | ||
1608 | case OCFS2_IOC_UNRESVSP64: | ||
1609 | ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start, | ||
1610 | sr->l_len); | ||
1611 | break; | ||
1612 | default: | ||
1613 | ret = -EINVAL; | ||
1614 | } | ||
1615 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1616 | if (ret) { | ||
1617 | mlog_errno(ret); | ||
1618 | goto out_meta_unlock; | ||
1619 | } | ||
1620 | |||
1621 | /* | ||
1622 | * We update c/mtime for these changes | ||
1623 | */ | ||
1624 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | ||
1625 | if (IS_ERR(handle)) { | ||
1626 | ret = PTR_ERR(handle); | ||
1627 | mlog_errno(ret); | ||
1628 | goto out_meta_unlock; | ||
1629 | } | ||
1630 | |||
1631 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | ||
1632 | ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); | ||
1633 | if (ret < 0) | ||
1634 | mlog_errno(ret); | ||
1635 | |||
1636 | ocfs2_commit_trans(osb, handle); | ||
1637 | |||
1638 | out_meta_unlock: | ||
1639 | brelse(di_bh); | ||
1640 | ocfs2_meta_unlock(inode, 1); | ||
1641 | out_rw_unlock: | ||
1642 | ocfs2_rw_unlock(inode, 1); | ||
1643 | |||
1644 | mutex_unlock(&inode->i_mutex); | ||
1645 | out: | ||
1646 | return ret; | ||
1647 | } | ||
1648 | |||
1162 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, | 1649 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, |
1163 | loff_t *ppos, | 1650 | loff_t *ppos, |
1164 | size_t count, | 1651 | size_t count, |
@@ -1329,15 +1816,16 @@ ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | |||
1329 | *basep = base; | 1816 | *basep = base; |
1330 | } | 1817 | } |
1331 | 1818 | ||
1332 | static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, | 1819 | static struct page * ocfs2_get_write_source(char **ret_src_buf, |
1333 | const struct iovec *cur_iov, | 1820 | const struct iovec *cur_iov, |
1334 | size_t iov_offset) | 1821 | size_t iov_offset) |
1335 | { | 1822 | { |
1336 | int ret; | 1823 | int ret; |
1337 | char *buf; | 1824 | char *buf = cur_iov->iov_base + iov_offset; |
1338 | struct page *src_page = NULL; | 1825 | struct page *src_page = NULL; |
1826 | unsigned long off; | ||
1339 | 1827 | ||
1340 | buf = cur_iov->iov_base + iov_offset; | 1828 | off = (unsigned long)(buf) & ~PAGE_CACHE_MASK; |
1341 | 1829 | ||
1342 | if (!segment_eq(get_fs(), KERNEL_DS)) { | 1830 | if (!segment_eq(get_fs(), KERNEL_DS)) { |
1343 | /* | 1831 | /* |
@@ -1349,18 +1837,17 @@ static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp | |||
1349 | (unsigned long)buf & PAGE_CACHE_MASK, 1, | 1837 | (unsigned long)buf & PAGE_CACHE_MASK, 1, |
1350 | 0, 0, &src_page, NULL); | 1838 | 0, 0, &src_page, NULL); |
1351 | if (ret == 1) | 1839 | if (ret == 1) |
1352 | bp->b_src_buf = kmap(src_page); | 1840 | *ret_src_buf = kmap(src_page) + off; |
1353 | else | 1841 | else |
1354 | src_page = ERR_PTR(-EFAULT); | 1842 | src_page = ERR_PTR(-EFAULT); |
1355 | } else { | 1843 | } else { |
1356 | bp->b_src_buf = buf; | 1844 | *ret_src_buf = buf; |
1357 | } | 1845 | } |
1358 | 1846 | ||
1359 | return src_page; | 1847 | return src_page; |
1360 | } | 1848 | } |
1361 | 1849 | ||
1362 | static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, | 1850 | static void ocfs2_put_write_source(struct page *page) |
1363 | struct page *page) | ||
1364 | { | 1851 | { |
1365 | if (page) { | 1852 | if (page) { |
1366 | kunmap(page); | 1853 | kunmap(page); |
@@ -1376,10 +1863,12 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, | |||
1376 | { | 1863 | { |
1377 | int ret = 0; | 1864 | int ret = 0; |
1378 | ssize_t copied, total = 0; | 1865 | ssize_t copied, total = 0; |
1379 | size_t iov_offset = 0; | 1866 | size_t iov_offset = 0, bytes; |
1867 | loff_t pos; | ||
1380 | const struct iovec *cur_iov = iov; | 1868 | const struct iovec *cur_iov = iov; |
1381 | struct ocfs2_buffered_write_priv bp; | 1869 | struct page *user_page, *page; |
1382 | struct page *page; | 1870 | char *buf, *dst; |
1871 | void *fsdata; | ||
1383 | 1872 | ||
1384 | /* | 1873 | /* |
1385 | * handle partial DIO write. Adjust cur_iov if needed. | 1874 | * handle partial DIO write. Adjust cur_iov if needed. |
@@ -1387,21 +1876,38 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, | |||
1387 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); | 1876 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); |
1388 | 1877 | ||
1389 | do { | 1878 | do { |
1390 | bp.b_cur_off = iov_offset; | 1879 | pos = *ppos; |
1391 | bp.b_cur_iov = cur_iov; | ||
1392 | 1880 | ||
1393 | page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); | 1881 | user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset); |
1394 | if (IS_ERR(page)) { | 1882 | if (IS_ERR(user_page)) { |
1395 | ret = PTR_ERR(page); | 1883 | ret = PTR_ERR(user_page); |
1396 | goto out; | 1884 | goto out; |
1397 | } | 1885 | } |
1398 | 1886 | ||
1399 | copied = ocfs2_buffered_write_cluster(file, *ppos, count, | 1887 | /* Stay within our page boundaries */ |
1400 | ocfs2_map_and_write_user_data, | 1888 | bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)), |
1401 | &bp); | 1889 | (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK))); |
1890 | /* Stay within the vector boundary */ | ||
1891 | bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset); | ||
1892 | /* Stay within count */ | ||
1893 | bytes = min(bytes, count); | ||
1894 | |||
1895 | page = NULL; | ||
1896 | ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0, | ||
1897 | &page, &fsdata); | ||
1898 | if (ret) { | ||
1899 | mlog_errno(ret); | ||
1900 | goto out; | ||
1901 | } | ||
1402 | 1902 | ||
1403 | ocfs2_put_write_source(&bp, page); | 1903 | dst = kmap_atomic(page, KM_USER0); |
1904 | memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes); | ||
1905 | kunmap_atomic(dst, KM_USER0); | ||
1906 | flush_dcache_page(page); | ||
1907 | ocfs2_put_write_source(user_page); | ||
1404 | 1908 | ||
1909 | copied = ocfs2_write_end(file, file->f_mapping, pos, bytes, | ||
1910 | bytes, page, fsdata); | ||
1405 | if (copied < 0) { | 1911 | if (copied < 0) { |
1406 | mlog_errno(copied); | 1912 | mlog_errno(copied); |
1407 | ret = copied; | 1913 | ret = copied; |
@@ -1409,7 +1915,7 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, | |||
1409 | } | 1915 | } |
1410 | 1916 | ||
1411 | total += copied; | 1917 | total += copied; |
1412 | *ppos = *ppos + copied; | 1918 | *ppos = pos + copied; |
1413 | count -= copied; | 1919 | count -= copied; |
1414 | 1920 | ||
1415 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); | 1921 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); |
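The min() chain above bounds each pass of the loop by whichever limit is nearest: the destination pagecache page, the source user page, the current iovec segment, and the caller's count. As a worked example (illustrative numbers, PAGE_CACHE_SIZE = 4096): with pos = 4094, a source buffer 10 bytes into its page, 100 bytes left in the iovec and count = 50, the clamps yield min(2, 4086) = 2, then min(2, 100) = 2, then min(2, 50) = 2, so only two bytes are copied and the next iteration starts page-aligned at pos = 4096.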
@@ -1579,52 +2085,46 @@ static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, | |||
1579 | struct pipe_buffer *buf, | 2085 | struct pipe_buffer *buf, |
1580 | struct splice_desc *sd) | 2086 | struct splice_desc *sd) |
1581 | { | 2087 | { |
1582 | int ret, count, total = 0; | 2088 | int ret, count; |
1583 | ssize_t copied = 0; | 2089 | ssize_t copied = 0; |
1584 | struct ocfs2_splice_write_priv sp; | 2090 | struct file *file = sd->u.file; |
2091 | unsigned int offset; | ||
2092 | struct page *page = NULL; | ||
2093 | void *fsdata; | ||
2094 | char *src, *dst; | ||
1585 | 2095 | ||
1586 | ret = buf->ops->confirm(pipe, buf); | 2096 | ret = buf->ops->confirm(pipe, buf); |
1587 | if (ret) | 2097 | if (ret) |
1588 | goto out; | 2098 | goto out; |
1589 | 2099 | ||
1590 | sp.s_sd = sd; | 2100 | offset = sd->pos & ~PAGE_CACHE_MASK; |
1591 | sp.s_buf = buf; | ||
1592 | sp.s_pipe = pipe; | ||
1593 | sp.s_offset = sd->pos & ~PAGE_CACHE_MASK; | ||
1594 | sp.s_buf_offset = buf->offset; | ||
1595 | |||
1596 | count = sd->len; | 2101 | count = sd->len; |
1597 | if (count + sp.s_offset > PAGE_CACHE_SIZE) | 2102 | if (count + offset > PAGE_CACHE_SIZE) |
1598 | count = PAGE_CACHE_SIZE - sp.s_offset; | 2103 | count = PAGE_CACHE_SIZE - offset; |
1599 | 2104 | ||
1600 | do { | 2105 | ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0, |
1601 | /* | 2106 | &page, &fsdata); |
1602 | * splice wants us to copy up to one page at a | 2107 | if (ret) { |
1603 | * time. For pagesize > cluster size, this means we | 2108 | mlog_errno(ret); |
1604 | * might enter ocfs2_buffered_write_cluster() more | 2109 | goto out; |
1605 | * than once, so keep track of our progress here. | 2110 | } |
1606 | */ | ||
1607 | copied = ocfs2_buffered_write_cluster(sd->u.file, | ||
1608 | (loff_t)sd->pos + total, | ||
1609 | count, | ||
1610 | ocfs2_map_and_write_splice_data, | ||
1611 | &sp); | ||
1612 | if (copied < 0) { | ||
1613 | mlog_errno(copied); | ||
1614 | ret = copied; | ||
1615 | goto out; | ||
1616 | } | ||
1617 | 2111 | ||
1618 | count -= copied; | 2112 | src = buf->ops->map(pipe, buf, 1); |
1619 | sp.s_offset += copied; | 2113 | dst = kmap_atomic(page, KM_USER1); |
1620 | sp.s_buf_offset += copied; | 2114 | memcpy(dst + offset, src + buf->offset, count); |
1621 | total += copied; | 2115 | kunmap_atomic(dst, KM_USER1); |
1622 | } while (count); | 2116 | buf->ops->unmap(pipe, buf, src); |
1623 | 2117 | ||
1624 | ret = 0; | 2118 | copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count, |
2119 | page, fsdata); | ||
2120 | if (copied < 0) { | ||
2121 | mlog_errno(copied); | ||
2122 | ret = copied; | ||
2123 | goto out; | ||
2124 | } | ||
1625 | out: | 2125 | out: |
1626 | 2126 | ||
1627 | return total ? total : ret; | 2127 | return copied ? copied : ret; |
1628 | } | 2128 | } |
1629 | 2129 | ||
1630 | static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, | 2130 | static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, |
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index a4dd1fa1822b..36fe27f268ee 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h | |||
@@ -39,15 +39,16 @@ enum ocfs2_alloc_restarted { | |||
39 | }; | 39 | }; |
40 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | 40 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, |
41 | struct inode *inode, | 41 | struct inode *inode, |
42 | u32 *cluster_start, | 42 | u32 *logical_offset, |
43 | u32 clusters_to_add, | 43 | u32 clusters_to_add, |
44 | int mark_unwritten, | ||
44 | struct buffer_head *fe_bh, | 45 | struct buffer_head *fe_bh, |
45 | handle_t *handle, | 46 | handle_t *handle, |
46 | struct ocfs2_alloc_context *data_ac, | 47 | struct ocfs2_alloc_context *data_ac, |
47 | struct ocfs2_alloc_context *meta_ac, | 48 | struct ocfs2_alloc_context *meta_ac, |
48 | enum ocfs2_alloc_restarted *reason); | 49 | enum ocfs2_alloc_restarted *reason_ret); |
49 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, | 50 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, |
50 | u32 clusters_to_add, | 51 | u32 clusters_to_add, u32 extents_to_split, |
51 | struct ocfs2_alloc_context **data_ac, | 52 | struct ocfs2_alloc_context **data_ac, |
52 | struct ocfs2_alloc_context **meta_ac); | 53 | struct ocfs2_alloc_context **meta_ac); |
53 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); | 54 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); |
@@ -61,4 +62,7 @@ int ocfs2_should_update_atime(struct inode *inode, | |||
61 | int ocfs2_update_inode_atime(struct inode *inode, | 62 | int ocfs2_update_inode_atime(struct inode *inode, |
62 | struct buffer_head *bh); | 63 | struct buffer_head *bh); |
63 | 64 | ||
65 | int ocfs2_change_file_space(struct file *file, unsigned int cmd, | ||
66 | struct ocfs2_space_resv *sr); | ||
67 | |||
64 | #endif /* OCFS2_FILE_H */ | 68 | #endif /* OCFS2_FILE_H */ |
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index b25ef63781ba..352eb4a13f98 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c | |||
@@ -157,16 +157,16 @@ int ocfs2_register_hb_callbacks(struct ocfs2_super *osb) | |||
157 | if (ocfs2_mount_local(osb)) | 157 | if (ocfs2_mount_local(osb)) |
158 | return 0; | 158 | return 0; |
159 | 159 | ||
160 | status = o2hb_register_callback(&osb->osb_hb_down); | 160 | status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down); |
161 | if (status < 0) { | 161 | if (status < 0) { |
162 | mlog_errno(status); | 162 | mlog_errno(status); |
163 | goto bail; | 163 | goto bail; |
164 | } | 164 | } |
165 | 165 | ||
166 | status = o2hb_register_callback(&osb->osb_hb_up); | 166 | status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up); |
167 | if (status < 0) { | 167 | if (status < 0) { |
168 | mlog_errno(status); | 168 | mlog_errno(status); |
169 | o2hb_unregister_callback(&osb->osb_hb_down); | 169 | o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down); |
170 | } | 170 | } |
171 | 171 | ||
172 | bail: | 172 | bail: |
@@ -178,8 +178,8 @@ void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb) | |||
178 | if (ocfs2_mount_local(osb)) | 178 | if (ocfs2_mount_local(osb)) |
179 | return; | 179 | return; |
180 | 180 | ||
181 | o2hb_unregister_callback(&osb->osb_hb_down); | 181 | o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down); |
182 | o2hb_unregister_callback(&osb->osb_hb_up); | 182 | o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up); |
183 | } | 183 | } |
184 | 184 | ||
185 | void ocfs2_stop_heartbeat(struct ocfs2_super *osb) | 185 | void ocfs2_stop_heartbeat(struct ocfs2_super *osb) |
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index f3ad21ad9aed..bd68c3f2afbe 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include "ocfs2.h" | 14 | #include "ocfs2.h" |
15 | #include "alloc.h" | 15 | #include "alloc.h" |
16 | #include "dlmglue.h" | 16 | #include "dlmglue.h" |
17 | #include "file.h" | ||
17 | #include "inode.h" | 18 | #include "inode.h" |
18 | #include "journal.h" | 19 | #include "journal.h" |
19 | 20 | ||
@@ -115,6 +116,7 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp, | |||
115 | { | 116 | { |
116 | unsigned int flags; | 117 | unsigned int flags; |
117 | int status; | 118 | int status; |
119 | struct ocfs2_space_resv sr; | ||
118 | 120 | ||
119 | switch (cmd) { | 121 | switch (cmd) { |
120 | case OCFS2_IOC_GETFLAGS: | 122 | case OCFS2_IOC_GETFLAGS: |
@@ -130,6 +132,14 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp, | |||
130 | 132 | ||
131 | return ocfs2_set_inode_attr(inode, flags, | 133 | return ocfs2_set_inode_attr(inode, flags, |
132 | OCFS2_FL_MODIFIABLE); | 134 | OCFS2_FL_MODIFIABLE); |
135 | case OCFS2_IOC_RESVSP: | ||
136 | case OCFS2_IOC_RESVSP64: | ||
137 | case OCFS2_IOC_UNRESVSP: | ||
138 | case OCFS2_IOC_UNRESVSP64: | ||
139 | if (copy_from_user(&sr, (int __user *) arg, sizeof(sr))) | ||
140 | return -EFAULT; | ||
141 | |||
142 | return ocfs2_change_file_space(filp, cmd, &sr); | ||
133 | default: | 143 | default: |
134 | return -ENOTTY; | 144 | return -ENOTTY; |
135 | } | 145 | } |
@@ -148,6 +158,11 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) | |||
148 | case OCFS2_IOC32_SETFLAGS: | 158 | case OCFS2_IOC32_SETFLAGS: |
149 | cmd = OCFS2_IOC_SETFLAGS; | 159 | cmd = OCFS2_IOC_SETFLAGS; |
150 | break; | 160 | break; |
161 | case OCFS2_IOC_RESVSP: | ||
162 | case OCFS2_IOC_RESVSP64: | ||
163 | case OCFS2_IOC_UNRESVSP: | ||
164 | case OCFS2_IOC_UNRESVSP64: | ||
165 | break; | ||
151 | default: | 166 | default: |
152 | return -ENOIOCTLCMD; | 167 | return -ENOIOCTLCMD; |
153 | } | 168 | } |
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index dc1188081720..dbfb20bb27ea 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -722,8 +722,7 @@ void ocfs2_complete_recovery(struct work_struct *work) | |||
722 | container_of(work, struct ocfs2_journal, j_recovery_work); | 722 | container_of(work, struct ocfs2_journal, j_recovery_work); |
723 | struct ocfs2_super *osb = journal->j_osb; | 723 | struct ocfs2_super *osb = journal->j_osb; |
724 | struct ocfs2_dinode *la_dinode, *tl_dinode; | 724 | struct ocfs2_dinode *la_dinode, *tl_dinode; |
725 | struct ocfs2_la_recovery_item *item; | 725 | struct ocfs2_la_recovery_item *item, *n; |
726 | struct list_head *p, *n; | ||
727 | LIST_HEAD(tmp_la_list); | 726 | LIST_HEAD(tmp_la_list); |
728 | 727 | ||
729 | mlog_entry_void(); | 728 | mlog_entry_void(); |
@@ -734,8 +733,7 @@ void ocfs2_complete_recovery(struct work_struct *work) | |||
734 | list_splice_init(&journal->j_la_cleanups, &tmp_la_list); | 733 | list_splice_init(&journal->j_la_cleanups, &tmp_la_list); |
735 | spin_unlock(&journal->j_lock); | 734 | spin_unlock(&journal->j_lock); |
736 | 735 | ||
737 | list_for_each_safe(p, n, &tmp_la_list) { | 736 | list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) { |
738 | item = list_entry(p, struct ocfs2_la_recovery_item, lri_list); | ||
739 | list_del_init(&item->lri_list); | 737 | list_del_init(&item->lri_list); |
740 | 738 | ||
741 | mlog(0, "Complete recovery for slot %d\n", item->lri_slot); | 739 | mlog(0, "Complete recovery for slot %d\n", item->lri_slot); |
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 3db5de4506da..ce60aab013aa 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h | |||
@@ -289,6 +289,8 @@ int ocfs2_journal_dirty_data(handle_t *handle, | |||
289 | #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ | 289 | #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ |
290 | + OCFS2_TRUNCATE_LOG_UPDATE) | 290 | + OCFS2_TRUNCATE_LOG_UPDATE) |
291 | 291 | ||
292 | #define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS) | ||
293 | |||
292 | /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + | 294 | /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + |
293 | * bitmap block for the new bit) */ | 295 | * bitmap block for the new bit) */ |
294 | #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) | 296 | #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) |
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index af01158b39f5..d79aa12137d2 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c | |||
@@ -37,11 +37,29 @@ | |||
37 | 37 | ||
38 | #include "ocfs2.h" | 38 | #include "ocfs2.h" |
39 | 39 | ||
40 | #include "aops.h" | ||
40 | #include "dlmglue.h" | 41 | #include "dlmglue.h" |
41 | #include "file.h" | 42 | #include "file.h" |
42 | #include "inode.h" | 43 | #include "inode.h" |
43 | #include "mmap.h" | 44 | #include "mmap.h" |
44 | 45 | ||
46 | static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset) | ||
47 | { | ||
48 | /* The best way to deal with signals in the vm path is | ||
49 | * to block them upfront, rather than allowing the | ||
50 | * locking paths to return -ERESTARTSYS. */ | ||
51 | sigfillset(blocked); | ||
52 | |||
53 | /* We should technically never get a bad return value | ||
54 | * from sigprocmask */ | ||
55 | return sigprocmask(SIG_BLOCK, blocked, oldset); | ||
56 | } | ||
57 | |||
58 | static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset) | ||
59 | { | ||
60 | return sigprocmask(SIG_SETMASK, oldset, NULL); | ||
61 | } | ||
62 | |||
45 | static struct page *ocfs2_nopage(struct vm_area_struct * area, | 63 | static struct page *ocfs2_nopage(struct vm_area_struct * area, |
46 | unsigned long address, | 64 | unsigned long address, |
47 | int *type) | 65 | int *type) |
@@ -53,14 +71,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area, | |||
53 | mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, | 71 | mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, |
54 | type); | 72 | type); |
55 | 73 | ||
56 | /* The best way to deal with signals in this path is | 74 | ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); |
57 | * to block them upfront, rather than allowing the | ||
58 | * locking paths to return -ERESTARTSYS. */ | ||
59 | sigfillset(&blocked); | ||
60 | |||
61 | /* We should technically never get a bad ret return | ||
62 | * from sigprocmask */ | ||
63 | ret = sigprocmask(SIG_BLOCK, &blocked, &oldset); | ||
64 | if (ret < 0) { | 75 | if (ret < 0) { |
65 | mlog_errno(ret); | 76 | mlog_errno(ret); |
66 | goto out; | 77 | goto out; |
@@ -68,7 +79,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area, | |||
68 | 79 | ||
69 | page = filemap_nopage(area, address, type); | 80 | page = filemap_nopage(area, address, type); |
70 | 81 | ||
71 | ret = sigprocmask(SIG_SETMASK, &oldset, NULL); | 82 | ret = ocfs2_vm_op_unblock_sigs(&oldset); |
72 | if (ret < 0) | 83 | if (ret < 0) |
73 | mlog_errno(ret); | 84 | mlog_errno(ret); |
74 | out: | 85 | out: |
@@ -76,28 +87,136 @@ out: | |||
76 | return page; | 87 | return page; |
77 | } | 88 | } |
78 | 89 | ||
79 | static struct vm_operations_struct ocfs2_file_vm_ops = { | 90 | static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, |
80 | .nopage = ocfs2_nopage, | 91 | struct page *page) |
81 | }; | 92 | { |
93 | int ret; | ||
94 | struct address_space *mapping = inode->i_mapping; | ||
95 | loff_t pos = page->index << PAGE_CACHE_SHIFT; | ||
96 | unsigned int len = PAGE_CACHE_SIZE; | ||
97 | pgoff_t last_index; | ||
98 | struct page *locked_page = NULL; | ||
99 | void *fsdata; | ||
100 | loff_t size = i_size_read(inode); | ||
82 | 101 | ||
83 | int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) | 102 | /* |
103 | * Another node might have truncated while we were waiting on | ||
104 | * cluster locks. | ||
105 | */ | ||
106 | last_index = size >> PAGE_CACHE_SHIFT; | ||
107 | if (page->index > last_index) { | ||
108 | ret = -EINVAL; | ||
109 | goto out; | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * The i_size check above doesn't catch the case where another | ||
114 | * node truncated and then re-extended the file. We'll re-check the | ||
115 | * page mapping after taking the page lock inside of | ||
116 | * ocfs2_write_begin_nolock(). | ||
117 | */ | ||
118 | if (!PageUptodate(page) || page->mapping != inode->i_mapping) { | ||
119 | ret = -EINVAL; | ||
120 | goto out; | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * Call ocfs2_write_begin() and ocfs2_write_end() to take | ||
125 | * advantage of the allocation code there. We pass a write | ||
126 | * length of the whole page (chopped to i_size) to make sure | ||
127 | * the whole thing is allocated. | ||
128 | * | ||
129 | * Since we know the page is up to date, we don't have to | ||
130 | * worry about ocfs2_write_begin() skipping some buffer reads | ||
131 | * because the "write" would invalidate their data. | ||
132 | */ | ||
133 | if (page->index == last_index) | ||
134 | len = size & ~PAGE_CACHE_MASK; | ||
135 | |||
136 | ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, | ||
137 | &fsdata, di_bh, page); | ||
138 | if (ret) { | ||
139 | if (ret != -ENOSPC) | ||
140 | mlog_errno(ret); | ||
141 | goto out; | ||
142 | } | ||
143 | |||
144 | ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, | ||
145 | fsdata); | ||
146 | if (ret < 0) { | ||
147 | mlog_errno(ret); | ||
148 | goto out; | ||
149 | } | ||
150 | BUG_ON(ret != len); | ||
151 | ret = 0; | ||
152 | out: | ||
153 | return ret; | ||
154 | } | ||
155 | |||
156 | static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) | ||
84 | { | 157 | { |
85 | int ret = 0, lock_level = 0; | 158 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
86 | struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); | 159 | struct buffer_head *di_bh = NULL; |
160 | sigset_t blocked, oldset; | ||
161 | int ret, ret2; | ||
162 | |||
163 | ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); | ||
164 | if (ret < 0) { | ||
165 | mlog_errno(ret); | ||
166 | return ret; | ||
167 | } | ||
168 | |||
169 | /* | ||
170 | * The cluster locks taken will block a truncate from another | ||
171 | * node. Taking the data lock will also ensure that we don't | ||
172 | * attempt page truncation as part of a downconvert. | ||
173 | */ | ||
174 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | ||
175 | if (ret < 0) { | ||
176 | mlog_errno(ret); | ||
177 | goto out; | ||
178 | } | ||
87 | 179 | ||
88 | /* | 180 | /* |
89 | * Only support shared writeable mmap for local mounts which | 181 | * The alloc sem should be enough to serialize with |
90 | * don't know about holes. | 182 | * ocfs2_truncate_file() changing i_size as well as any thread |
183 | * modifying the inode btree. | ||
91 | */ | 184 | */ |
92 | if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && | 185 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
93 | ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && | 186 | |
94 | ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { | 187 | ret = ocfs2_data_lock(inode, 1); |
95 | mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); | 188 | if (ret < 0) { |
96 | /* This is -EINVAL because generic_file_readonly_mmap | 189 | mlog_errno(ret); |
97 | * returns it in a similar situation. */ | 190 | goto out_meta_unlock; |
98 | return -EINVAL; | ||
99 | } | 191 | } |
100 | 192 | ||
193 | ret = __ocfs2_page_mkwrite(inode, di_bh, page); | ||
194 | |||
195 | ocfs2_data_unlock(inode, 1); | ||
196 | |||
197 | out_meta_unlock: | ||
198 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
199 | |||
200 | brelse(di_bh); | ||
201 | ocfs2_meta_unlock(inode, 1); | ||
202 | |||
203 | out: | ||
204 | ret2 = ocfs2_vm_op_unblock_sigs(&oldset); | ||
205 | if (ret2 < 0) | ||
206 | mlog_errno(ret2); | ||
207 | |||
208 | return ret; | ||
209 | } | ||
210 | |||
211 | static struct vm_operations_struct ocfs2_file_vm_ops = { | ||
212 | .nopage = ocfs2_nopage, | ||
213 | .page_mkwrite = ocfs2_page_mkwrite, | ||
214 | }; | ||
215 | |||
216 | int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) | ||
217 | { | ||
218 | int ret = 0, lock_level = 0; | ||
219 | |||
101 | ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, | 220 | ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, |
102 | file->f_vfsmnt, &lock_level); | 221 | file->f_vfsmnt, &lock_level); |
103 | if (ret < 0) { | 222 | if (ret < 0) { |
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 36289e6295ce..d430fdab16e9 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
@@ -1674,7 +1674,7 @@ static int ocfs2_symlink(struct inode *dir, | |||
1674 | u32 offset = 0; | 1674 | u32 offset = 0; |
1675 | 1675 | ||
1676 | inode->i_op = &ocfs2_symlink_inode_operations; | 1676 | inode->i_op = &ocfs2_symlink_inode_operations; |
1677 | status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, | 1677 | status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0, |
1678 | new_fe_bh, | 1678 | new_fe_bh, |
1679 | handle, data_ac, NULL, | 1679 | handle, data_ac, NULL, |
1680 | NULL); | 1680 | NULL); |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index a860633e833f..5cc90a40b3c5 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -219,6 +219,7 @@ struct ocfs2_super | |||
219 | u16 max_slots; | 219 | u16 max_slots; |
220 | s16 node_num; | 220 | s16 node_num; |
221 | s16 slot_num; | 221 | s16 slot_num; |
222 | s16 preferred_slot; | ||
222 | int s_sectsize_bits; | 223 | int s_sectsize_bits; |
223 | int s_clustersize; | 224 | int s_clustersize; |
224 | int s_clustersize_bits; | 225 | int s_clustersize_bits; |
@@ -305,6 +306,19 @@ static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb) | |||
305 | return 0; | 306 | return 0; |
306 | } | 307 | } |
307 | 308 | ||
309 | static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) | ||
310 | { | ||
311 | /* | ||
312 | * Support for sparse files is a pre-requisite | ||
313 | */ | ||
314 | if (!ocfs2_sparse_alloc(osb)) | ||
315 | return 0; | ||
316 | |||
317 | if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN) | ||
318 | return 1; | ||
319 | return 0; | ||
320 | } | ||
321 | |||
308 | /* set / clear functions because cluster events can make these happen | 322 | /* set / clear functions because cluster events can make these happen |
309 | * in parallel so we want the transitions to be atomic. this also | 323 | * in parallel so we want the transitions to be atomic. this also |
310 | * means that any future flags osb_flags must be protected by spinlock | 324 | * means that any future flags osb_flags must be protected by spinlock |
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index f0d9eb08547a..82f8a75b207e 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h | |||
@@ -88,7 +88,7 @@ | |||
88 | #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB | 88 | #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB |
89 | #define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ | 89 | #define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ |
90 | | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) | 90 | | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) |
91 | #define OCFS2_FEATURE_RO_COMPAT_SUPP 0 | 91 | #define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN |
92 | 92 | ||
93 | /* | 93 | /* |
94 | * Heartbeat-only devices are missing journals and other files. The | 94 | * Heartbeat-only devices are missing journals and other files. The |
@@ -116,6 +116,11 @@ | |||
116 | */ | 116 | */ |
117 | #define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 | 117 | #define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 |
118 | 118 | ||
119 | /* | ||
120 | * Unwritten extents support. | ||
121 | */ | ||
122 | #define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 | ||
123 | |||
119 | /* The byte offset of the first backup block will be 1G. | 124 | /* The byte offset of the first backup block will be 1G. |
120 | * The following will be 4G, 16G, 64G, 256G and 1T. | 125 | * The following will be 4G, 16G, 64G, 256G and 1T. |
121 | */ | 126 | */ |
@@ -170,6 +175,32 @@ | |||
170 | #define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) | 175 | #define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) |
171 | 176 | ||
172 | /* | 177 | /* |
178 | * Space reservation / allocation / free ioctls and argument structure | ||
179 | * are designed to be compatible with XFS. | ||
180 | * | ||
181 | * ALLOCSP* and FREESP* are not and will never be supported, but are | ||
182 | * included here for completeness. | ||
183 | */ | ||
184 | struct ocfs2_space_resv { | ||
185 | __s16 l_type; | ||
186 | __s16 l_whence; | ||
187 | __s64 l_start; | ||
188 | __s64 l_len; /* len == 0 means until end of file */ | ||
189 | __s32 l_sysid; | ||
190 | __u32 l_pid; | ||
191 | __s32 l_pad[4]; /* reserve area */ | ||
192 | }; | ||
193 | |||
194 | #define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv) | ||
195 | #define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv) | ||
196 | #define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv) | ||
197 | #define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv) | ||
198 | #define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv) | ||
199 | #define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv) | ||
200 | #define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv) | ||
201 | #define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv) | ||
202 | |||
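Because the structure and ioctl numbers above mirror XFS, a reservation call from userspace looks just like its XFS counterpart. A minimal sketch, assuming struct ocfs2_space_resv and OCFS2_IOC_RESVSP64 from this header are visible to userspace; the helper name is illustrative:

	#include <string.h>
	#include <sys/ioctl.h>

	static int reserve_space(int fd, long long start, long long len)
	{
		struct ocfs2_space_resv sr;

		memset(&sr, 0, sizeof(sr));
		sr.l_whence = 0;	/* SEEK_SET: l_start is absolute */
		sr.l_start = start;
		sr.l_len = len;		/* must be > 0 for RESVSP, see the
					 * check in ocfs2_change_file_space() */

		return ioctl(fd, OCFS2_IOC_RESVSP64, &sr);
	}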
203 | /* | ||
173 | * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) | 204 | * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) |
174 | */ | 205 | */ |
175 | #define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ | 206 | #define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ |
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index d8b79067dc14..af4882b62cfa 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c | |||
@@ -121,17 +121,25 @@ static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | |||
121 | return ret; | 121 | return ret; |
122 | } | 122 | } |
123 | 123 | ||
124 | static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si) | 124 | static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred) |
125 | { | 125 | { |
126 | int i; | 126 | int i; |
127 | s16 ret = OCFS2_INVALID_SLOT; | 127 | s16 ret = OCFS2_INVALID_SLOT; |
128 | 128 | ||
129 | if (preferred >= 0 && preferred < si->si_num_slots) { | ||
130 | if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) { | ||
131 | ret = preferred; | ||
132 | goto out; | ||
133 | } | ||
134 | } | ||
135 | |||
129 | for(i = 0; i < si->si_num_slots; i++) { | 136 | for(i = 0; i < si->si_num_slots; i++) { |
130 | if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { | 137 | if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { |
131 | ret = (s16) i; | 138 | ret = (s16) i; |
132 | break; | 139 | break; |
133 | } | 140 | } |
134 | } | 141 | } |
142 | out: | ||
135 | return ret; | 143 | return ret; |
136 | } | 144 | } |
137 | 145 | ||
@@ -248,7 +256,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb) | |||
248 | if (slot == OCFS2_INVALID_SLOT) { | 256 | if (slot == OCFS2_INVALID_SLOT) { |
249 | /* if no slot yet, then just take 1st available | 257 | /* if no slot yet, then just take 1st available |
250 | * one. */ | 258 | * one. */ |
251 | slot = __ocfs2_find_empty_slot(si); | 259 | slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); |
252 | if (slot == OCFS2_INVALID_SLOT) { | 260 | if (slot == OCFS2_INVALID_SLOT) { |
253 | spin_unlock(&si->si_lock); | 261 | spin_unlock(&si->si_lock); |
254 | mlog(ML_ERROR, "no free slots available!\n"); | 262 | mlog(ML_ERROR, "no free slots available!\n"); |
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index e3437626d183..d9c5c9fcb30f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c | |||
@@ -98,14 +98,6 @@ static int ocfs2_relink_block_group(handle_t *handle, | |||
98 | u16 chain); | 98 | u16 chain); |
99 | static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, | 99 | static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, |
100 | u32 wanted); | 100 | u32 wanted); |
101 | static int ocfs2_free_suballoc_bits(handle_t *handle, | ||
102 | struct inode *alloc_inode, | ||
103 | struct buffer_head *alloc_bh, | ||
104 | unsigned int start_bit, | ||
105 | u64 bg_blkno, | ||
106 | unsigned int count); | ||
107 | static inline u64 ocfs2_which_suballoc_group(u64 block, | ||
108 | unsigned int bit); | ||
109 | static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, | 101 | static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, |
110 | u64 bg_blkno, | 102 | u64 bg_blkno, |
111 | u16 bg_bit_off); | 103 | u16 bg_bit_off); |
@@ -496,13 +488,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, | |||
496 | 488 | ||
497 | (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); | 489 | (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); |
498 | (*ac)->ac_which = OCFS2_AC_USE_META; | 490 | (*ac)->ac_which = OCFS2_AC_USE_META; |
499 | |||
500 | #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS | ||
501 | slot = 0; | ||
502 | #else | ||
503 | slot = osb->slot_num; | 491 | slot = osb->slot_num; |
504 | #endif | ||
505 | |||
506 | (*ac)->ac_group_search = ocfs2_block_group_search; | 492 | (*ac)->ac_group_search = ocfs2_block_group_search; |
507 | 493 | ||
508 | status = ocfs2_reserve_suballoc_bits(osb, (*ac), | 494 | status = ocfs2_reserve_suballoc_bits(osb, (*ac), |
@@ -1626,12 +1612,12 @@ bail: | |||
1626 | /* | 1612 | /* |
1627 | * expects the suballoc inode to already be locked. | 1613 | * expects the suballoc inode to already be locked. |
1628 | */ | 1614 | */ |
1629 | static int ocfs2_free_suballoc_bits(handle_t *handle, | 1615 | int ocfs2_free_suballoc_bits(handle_t *handle, |
1630 | struct inode *alloc_inode, | 1616 | struct inode *alloc_inode, |
1631 | struct buffer_head *alloc_bh, | 1617 | struct buffer_head *alloc_bh, |
1632 | unsigned int start_bit, | 1618 | unsigned int start_bit, |
1633 | u64 bg_blkno, | 1619 | u64 bg_blkno, |
1634 | unsigned int count) | 1620 | unsigned int count) |
1635 | { | 1621 | { |
1636 | int status = 0; | 1622 | int status = 0; |
1637 | u32 tmp_used; | 1623 | u32 tmp_used; |
@@ -1703,13 +1689,6 @@ bail: | |||
1703 | return status; | 1689 | return status; |
1704 | } | 1690 | } |
1705 | 1691 | ||
1706 | static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) | ||
1707 | { | ||
1708 | u64 group = block - (u64) bit; | ||
1709 | |||
1710 | return group; | ||
1711 | } | ||
1712 | |||
1713 | int ocfs2_free_dinode(handle_t *handle, | 1692 | int ocfs2_free_dinode(handle_t *handle, |
1714 | struct inode *inode_alloc_inode, | 1693 | struct inode *inode_alloc_inode, |
1715 | struct buffer_head *inode_alloc_bh, | 1694 | struct buffer_head *inode_alloc_bh, |
@@ -1723,19 +1702,6 @@ int ocfs2_free_dinode(handle_t *handle, | |||
1723 | inode_alloc_bh, bit, bg_blkno, 1); | 1702 | inode_alloc_bh, bit, bg_blkno, 1); |
1724 | } | 1703 | } |
1725 | 1704 | ||
1726 | int ocfs2_free_extent_block(handle_t *handle, | ||
1727 | struct inode *eb_alloc_inode, | ||
1728 | struct buffer_head *eb_alloc_bh, | ||
1729 | struct ocfs2_extent_block *eb) | ||
1730 | { | ||
1731 | u64 blk = le64_to_cpu(eb->h_blkno); | ||
1732 | u16 bit = le16_to_cpu(eb->h_suballoc_bit); | ||
1733 | u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); | ||
1734 | |||
1735 | return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh, | ||
1736 | bit, bg_blkno, 1); | ||
1737 | } | ||
1738 | |||
1739 | int ocfs2_free_clusters(handle_t *handle, | 1705 | int ocfs2_free_clusters(handle_t *handle, |
1740 | struct inode *bitmap_inode, | 1706 | struct inode *bitmap_inode, |
1741 | struct buffer_head *bitmap_bh, | 1707 | struct buffer_head *bitmap_bh, |
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 1a3c94cb9250..f212dc01a84b 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h | |||
@@ -86,20 +86,29 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb, | |||
86 | u32 *cluster_start, | 86 | u32 *cluster_start, |
87 | u32 *num_clusters); | 87 | u32 *num_clusters); |
88 | 88 | ||
89 | int ocfs2_free_suballoc_bits(handle_t *handle, | ||
90 | struct inode *alloc_inode, | ||
91 | struct buffer_head *alloc_bh, | ||
92 | unsigned int start_bit, | ||
93 | u64 bg_blkno, | ||
94 | unsigned int count); | ||
89 | int ocfs2_free_dinode(handle_t *handle, | 95 | int ocfs2_free_dinode(handle_t *handle, |
90 | struct inode *inode_alloc_inode, | 96 | struct inode *inode_alloc_inode, |
91 | struct buffer_head *inode_alloc_bh, | 97 | struct buffer_head *inode_alloc_bh, |
92 | struct ocfs2_dinode *di); | 98 | struct ocfs2_dinode *di); |
93 | int ocfs2_free_extent_block(handle_t *handle, | ||
94 | struct inode *eb_alloc_inode, | ||
95 | struct buffer_head *eb_alloc_bh, | ||
96 | struct ocfs2_extent_block *eb); | ||
97 | int ocfs2_free_clusters(handle_t *handle, | 99 | int ocfs2_free_clusters(handle_t *handle, |
98 | struct inode *bitmap_inode, | 100 | struct inode *bitmap_inode, |
99 | struct buffer_head *bitmap_bh, | 101 | struct buffer_head *bitmap_bh, |
100 | u64 start_blk, | 102 | u64 start_blk, |
101 | unsigned int num_clusters); | 103 | unsigned int num_clusters); |
102 | 104 | ||
105 | static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) | ||
106 | { | ||
107 | u64 group = block - (u64) bit; | ||
108 | |||
109 | return group; | ||
110 | } | ||
111 | |||
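The inline helper works because a suballocator group is addressed by its first block and each allocation within it by a bit offset, so block = group + bit. For example (illustrative numbers), an extent block at blkno 5000 with h_suballoc_bit = 13 belongs to the group descriptor at 5000 - 13 = 4987.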
103 | static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, | 112 | static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, |
104 | u64 bg_blkno) | 113 | u64 bg_blkno) |
105 | { | 114 | { |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 86b559c7dce9..3a5a1ed09ac9 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -82,7 +82,8 @@ MODULE_AUTHOR("Oracle"); | |||
82 | MODULE_LICENSE("GPL"); | 82 | MODULE_LICENSE("GPL"); |
83 | 83 | ||
84 | static int ocfs2_parse_options(struct super_block *sb, char *options, | 84 | static int ocfs2_parse_options(struct super_block *sb, char *options, |
85 | unsigned long *mount_opt, int is_remount); | 85 | unsigned long *mount_opt, s16 *slot, |
86 | int is_remount); | ||
86 | static void ocfs2_put_super(struct super_block *sb); | 87 | static void ocfs2_put_super(struct super_block *sb); |
87 | static int ocfs2_mount_volume(struct super_block *sb); | 88 | static int ocfs2_mount_volume(struct super_block *sb); |
88 | static int ocfs2_remount(struct super_block *sb, int *flags, char *data); | 89 | static int ocfs2_remount(struct super_block *sb, int *flags, char *data); |
@@ -114,8 +115,6 @@ static void ocfs2_write_super(struct super_block *sb); | |||
114 | static struct inode *ocfs2_alloc_inode(struct super_block *sb); | 115 | static struct inode *ocfs2_alloc_inode(struct super_block *sb); |
115 | static void ocfs2_destroy_inode(struct inode *inode); | 116 | static void ocfs2_destroy_inode(struct inode *inode); |
116 | 117 | ||
117 | static unsigned long long ocfs2_max_file_offset(unsigned int blockshift); | ||
118 | |||
119 | static const struct super_operations ocfs2_sops = { | 118 | static const struct super_operations ocfs2_sops = { |
120 | .statfs = ocfs2_statfs, | 119 | .statfs = ocfs2_statfs, |
121 | .alloc_inode = ocfs2_alloc_inode, | 120 | .alloc_inode = ocfs2_alloc_inode, |
@@ -140,6 +139,7 @@ enum { | |||
140 | Opt_data_ordered, | 139 | Opt_data_ordered, |
141 | Opt_data_writeback, | 140 | Opt_data_writeback, |
142 | Opt_atime_quantum, | 141 | Opt_atime_quantum, |
142 | Opt_slot, | ||
143 | Opt_err, | 143 | Opt_err, |
144 | }; | 144 | }; |
145 | 145 | ||
@@ -154,6 +154,7 @@ static match_table_t tokens = { | |||
154 | {Opt_data_ordered, "data=ordered"}, | 154 | {Opt_data_ordered, "data=ordered"}, |
155 | {Opt_data_writeback, "data=writeback"}, | 155 | {Opt_data_writeback, "data=writeback"}, |
156 | {Opt_atime_quantum, "atime_quantum=%u"}, | 156 | {Opt_atime_quantum, "atime_quantum=%u"}, |
157 | {Opt_slot, "preferred_slot=%u"}, | ||
157 | {Opt_err, NULL} | 158 | {Opt_err, NULL} |
158 | }; | 159 | }; |
159 | 160 | ||
@@ -318,7 +319,7 @@ static void ocfs2_destroy_inode(struct inode *inode) | |||
318 | /* From xfs_super.c:xfs_max_file_offset | 319 | /* From xfs_super.c:xfs_max_file_offset |
319 | * Copyright (c) 2000-2004 Silicon Graphics, Inc. | 320 | * Copyright (c) 2000-2004 Silicon Graphics, Inc. |
320 | */ | 321 | */ |
321 | static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) | 322 | unsigned long long ocfs2_max_file_offset(unsigned int blockshift) |
322 | { | 323 | { |
323 | unsigned int pagefactor = 1; | 324 | unsigned int pagefactor = 1; |
324 | unsigned int bitshift = BITS_PER_LONG - 1; | 325 | unsigned int bitshift = BITS_PER_LONG - 1; |
@@ -355,9 +356,10 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) | |||
355 | int incompat_features; | 356 | int incompat_features; |
356 | int ret = 0; | 357 | int ret = 0; |
357 | unsigned long parsed_options; | 358 | unsigned long parsed_options; |
359 | s16 slot; | ||
358 | struct ocfs2_super *osb = OCFS2_SB(sb); | 360 | struct ocfs2_super *osb = OCFS2_SB(sb); |
359 | 361 | ||
360 | if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { | 362 | if (!ocfs2_parse_options(sb, data, &parsed_options, &slot, 1)) { |
361 | ret = -EINVAL; | 363 | ret = -EINVAL; |
362 | goto out; | 364 | goto out; |
363 | } | 365 | } |
@@ -534,6 +536,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
534 | struct dentry *root; | 536 | struct dentry *root; |
535 | int status, sector_size; | 537 | int status, sector_size; |
536 | unsigned long parsed_opt; | 538 | unsigned long parsed_opt; |
539 | s16 slot; | ||
537 | struct inode *inode = NULL; | 540 | struct inode *inode = NULL; |
538 | struct ocfs2_super *osb = NULL; | 541 | struct ocfs2_super *osb = NULL; |
539 | struct buffer_head *bh = NULL; | 542 | struct buffer_head *bh = NULL; |
@@ -541,7 +544,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
541 | 544 | ||
542 | mlog_entry("%p, %p, %i", sb, data, silent); | 545 | mlog_entry("%p, %p, %i", sb, data, silent); |
543 | 546 | ||
544 | if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { | 547 | if (!ocfs2_parse_options(sb, data, &parsed_opt, &slot, 0)) { |
545 | status = -EINVAL; | 548 | status = -EINVAL; |
546 | goto read_super_error; | 549 | goto read_super_error; |
547 | } | 550 | } |
@@ -571,6 +574,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
571 | brelse(bh); | 574 | brelse(bh); |
572 | bh = NULL; | 575 | bh = NULL; |
573 | osb->s_mount_opt = parsed_opt; | 576 | osb->s_mount_opt = parsed_opt; |
577 | osb->preferred_slot = slot; | ||
574 | 578 | ||
575 | sb->s_magic = OCFS2_SUPER_MAGIC; | 579 | sb->s_magic = OCFS2_SUPER_MAGIC; |
576 | 580 | ||
@@ -713,6 +717,7 @@ static struct file_system_type ocfs2_fs_type = { | |||
713 | static int ocfs2_parse_options(struct super_block *sb, | 717 | static int ocfs2_parse_options(struct super_block *sb, |
714 | char *options, | 718 | char *options, |
715 | unsigned long *mount_opt, | 719 | unsigned long *mount_opt, |
720 | s16 *slot, | ||
716 | int is_remount) | 721 | int is_remount) |
717 | { | 722 | { |
718 | int status; | 723 | int status; |
@@ -722,6 +727,7 @@ static int ocfs2_parse_options(struct super_block *sb, | |||
722 | options ? options : "(none)"); | 727 | options ? options : "(none)"); |
723 | 728 | ||
724 | *mount_opt = 0; | 729 | *mount_opt = 0; |
730 | *slot = OCFS2_INVALID_SLOT; | ||
725 | 731 | ||
726 | if (!options) { | 732 | if (!options) { |
727 | status = 1; | 733 | status = 1; |
@@ -782,6 +788,15 @@ static int ocfs2_parse_options(struct super_block *sb, | |||
782 | else | 788 | else |
783 | osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; | 789 | osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; |
784 | break; | 790 | break; |
791 | case Opt_slot: | ||
792 | option = 0; | ||
793 | if (match_int(&args[0], &option)) { | ||
794 | status = 0; | ||
795 | goto bail; | ||
796 | } | ||
797 | if (option) | ||
798 | *slot = (s16)option; | ||
799 | break; | ||
785 | default: | 800 | default: |
786 | mlog(ML_ERROR, | 801 | mlog(ML_ERROR, |
787 | "Unrecognized mount option \"%s\" " | 802 | "Unrecognized mount option \"%s\" " |
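The preferred_slot=%u option threads a slot number from the mount command line (e.g. mount -o preferred_slot=2) through ocfs2_parse_options() into osb->preferred_slot. Per the Opt_slot case above, the slot starts at OCFS2_INVALID_SLOT and a parsed value of 0 is skipped, so 0 effectively means "no preference". A user-space sketch of just that parsing semantic (not kernel code; OCFS2_INVALID_SLOT stands in for the kernel constant):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define OCFS2_INVALID_SLOT (-1)

	static short parse_preferred_slot(const char *opts)
	{
		short slot = OCFS2_INVALID_SLOT;
		const char *p = strstr(opts, "preferred_slot=");

		if (p) {
			long v = strtol(p + strlen("preferred_slot="), NULL, 10);
			if (v)	/* 0 is skipped, as in the Opt_slot case */
				slot = (short)v;
		}
		return slot;
	}

	int main(void)
	{
		printf("%d\n", parse_preferred_slot("data=ordered,preferred_slot=2"));
		return 0;
	}
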
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 783f5270f2a1..3b9cb3d0b008 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h | |||
@@ -45,4 +45,6 @@ void __ocfs2_abort(struct super_block *sb, | |||
45 | 45 | ||
46 | #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) | 46 | #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) |
47 | 47 | ||
48 | unsigned long long ocfs2_max_file_offset(unsigned int blockshift); | ||
49 | |||
48 | #endif /* OCFS2_SUPER_H */ | 50 | #endif /* OCFS2_SUPER_H */ |
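Un-static'ing ocfs2_max_file_offset() and declaring it here lets code outside super.c query the volume's maximum file offset. The hunk shows only the first lines of the body (pagefactor = 1, bitshift = BITS_PER_LONG - 1); assuming it follows the XFS pattern its comment credits, the returned limit would be (pagefactor << bitshift) - 1, i.e. 2^63 - 1 on a 64-bit build:

	/* Assumed shape of the return value; the full body is not in
	 * this hunk, so treat this as a sketch, not the actual code. */
	unsigned long long max_offset(unsigned int pagefactor,
				      unsigned int bitshift)
	{
		return (((unsigned long long)pagefactor) << bitshift) - 1;
	}
	/* max_offset(1, BITS_PER_LONG - 1) == 2^63 - 1 on 64-bit */
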
diff --git a/include/linux/configfs.h b/include/linux/configfs.h index fef6f3d0a4a7..8c6967f3fb11 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h | |||
@@ -40,9 +40,9 @@ | |||
40 | #include <linux/types.h> | 40 | #include <linux/types.h> |
41 | #include <linux/list.h> | 41 | #include <linux/list.h> |
42 | #include <linux/kref.h> | 42 | #include <linux/kref.h> |
43 | #include <linux/mutex.h> | ||
43 | 44 | ||
44 | #include <asm/atomic.h> | 45 | #include <asm/atomic.h> |
45 | #include <asm/semaphore.h> | ||
46 | 46 | ||
47 | #define CONFIGFS_ITEM_NAME_LEN 20 | 47 | #define CONFIGFS_ITEM_NAME_LEN 20 |
48 | 48 | ||
@@ -75,7 +75,6 @@ extern void config_item_init(struct config_item *); | |||
75 | extern void config_item_init_type_name(struct config_item *item, | 75 | extern void config_item_init_type_name(struct config_item *item, |
76 | const char *name, | 76 | const char *name, |
77 | struct config_item_type *type); | 77 | struct config_item_type *type); |
78 | extern void config_item_cleanup(struct config_item *); | ||
79 | 78 | ||
80 | extern struct config_item * config_item_get(struct config_item *); | 79 | extern struct config_item * config_item_get(struct config_item *); |
81 | extern void config_item_put(struct config_item *); | 80 | extern void config_item_put(struct config_item *); |
@@ -87,12 +86,10 @@ struct config_item_type { | |||
87 | struct configfs_attribute **ct_attrs; | 86 | struct configfs_attribute **ct_attrs; |
88 | }; | 87 | }; |
89 | 88 | ||
90 | |||
91 | /** | 89 | /** |
92 | * group - a group of config_items of a specific type, belonging | 90 | * group - a group of config_items of a specific type, belonging |
93 | * to a specific subsystem. | 91 | * to a specific subsystem. |
94 | */ | 92 | */ |
95 | |||
96 | struct config_group { | 93 | struct config_group { |
97 | struct config_item cg_item; | 94 | struct config_item cg_item; |
98 | struct list_head cg_children; | 95 | struct list_head cg_children; |
@@ -100,13 +97,11 @@ struct config_group { | |||
100 | struct config_group **default_groups; | 97 | struct config_group **default_groups; |
101 | }; | 98 | }; |
102 | 99 | ||
103 | |||
104 | extern void config_group_init(struct config_group *group); | 100 | extern void config_group_init(struct config_group *group); |
105 | extern void config_group_init_type_name(struct config_group *group, | 101 | extern void config_group_init_type_name(struct config_group *group, |
106 | const char *name, | 102 | const char *name, |
107 | struct config_item_type *type); | 103 | struct config_item_type *type); |
108 | 104 | ||
109 | |||
110 | static inline struct config_group *to_config_group(struct config_item *item) | 105 | static inline struct config_group *to_config_group(struct config_item *item) |
111 | { | 106 | { |
112 | return item ? container_of(item,struct config_group,cg_item) : NULL; | 107 | return item ? container_of(item,struct config_group,cg_item) : NULL; |
@@ -122,7 +117,8 @@ static inline void config_group_put(struct config_group *group) | |||
122 | config_item_put(&group->cg_item); | 117 | config_item_put(&group->cg_item); |
123 | } | 118 | } |
124 | 119 | ||
125 | extern struct config_item *config_group_find_obj(struct config_group *, const char *); | 120 | extern struct config_item *config_group_find_item(struct config_group *, |
121 | const char *); | ||
126 | 122 | ||
127 | 123 | ||
128 | struct configfs_attribute { | 124 | struct configfs_attribute { |
@@ -131,6 +127,22 @@ struct configfs_attribute { | |||
131 | mode_t ca_mode; | 127 | mode_t ca_mode; |
132 | }; | 128 | }; |
133 | 129 | ||
130 | /* | ||
131 | * Users often need to create attribute structures for their configurable | ||
132 | * attributes, containing a configfs_attribute member and function pointers | ||
133 | * for the show() and store() operations on that attribute. They can use | ||
134 | * this macro (similar to sysfs' __ATTR) to make defining attributes easier. | ||
135 | */ | ||
136 | #define __CONFIGFS_ATTR(_name, _mode, _show, _store) \ | ||
137 | { \ | ||
138 | .attr = { \ | ||
139 | .ca_name = __stringify(_name), \ | ||
140 | .ca_mode = _mode, \ | ||
141 | .ca_owner = THIS_MODULE, \ | ||
142 | }, \ | ||
143 | .show = _show, \ | ||
144 | .store = _store, \ | ||
145 | } | ||
134 | 146 | ||
135 | /* | 147 | /* |
136 | * If allow_link() exists, the item can symlink(2) out to other | 148 | * If allow_link() exists, the item can symlink(2) out to other |
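As the comment above notes, __CONFIGFS_ATTR expects a wrapper struct with fields named attr, show and store. A hypothetical client attribute; myfs_item, its timeout field, and all myfs_* names are illustrative, not part of this patch:

	struct myfs_item {
		struct config_item item;
		int timeout;
	};

	struct myfs_attribute {
		struct configfs_attribute attr;
		ssize_t (*show)(struct myfs_item *item, char *page);
		ssize_t (*store)(struct myfs_item *item, const char *page,
				 size_t count);
	};

	static ssize_t myfs_timeout_show(struct myfs_item *item, char *page)
	{
		return sprintf(page, "%d\n", item->timeout);
	}

	static ssize_t myfs_timeout_store(struct myfs_item *item,
					  const char *page, size_t count)
	{
		item->timeout = simple_strtol(page, NULL, 10);
		return count;
	}

	static struct myfs_attribute myfs_attr_timeout =
		__CONFIGFS_ATTR(timeout, S_IRUGO | S_IWUSR,
				myfs_timeout_show, myfs_timeout_store);
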
@@ -157,12 +169,13 @@ struct configfs_group_operations { | |||
157 | struct config_item *(*make_item)(struct config_group *group, const char *name); | 169 | struct config_item *(*make_item)(struct config_group *group, const char *name); |
158 | struct config_group *(*make_group)(struct config_group *group, const char *name); | 170 | struct config_group *(*make_group)(struct config_group *group, const char *name); |
159 | int (*commit_item)(struct config_item *item); | 171 | int (*commit_item)(struct config_item *item); |
172 | void (*disconnect_notify)(struct config_group *group, struct config_item *item); | ||
160 | void (*drop_item)(struct config_group *group, struct config_item *item); | 173 | void (*drop_item)(struct config_group *group, struct config_item *item); |
161 | }; | 174 | }; |
162 | 175 | ||
163 | struct configfs_subsystem { | 176 | struct configfs_subsystem { |
164 | struct config_group su_group; | 177 | struct config_group su_group; |
165 | struct semaphore su_sem; | 178 | struct mutex su_mutex; |
166 | }; | 179 | }; |
167 | 180 | ||
168 | static inline struct configfs_subsystem *to_configfs_subsystem(struct config_group *group) | 181 | static inline struct configfs_subsystem *to_configfs_subsystem(struct config_group *group) |
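Two API changes land in this hunk: configfs_group_operations grows a disconnect_notify() callback next to drop_item(), and the subsystem lock changes from a semaphore (su_sem) to a mutex (su_mutex), matching the linux/mutex.h include swap earlier in this file. A hypothetical subsystem skeleton under the new API; all myfs_* names are illustrative, and myfs_make_item/myfs_group_type are assumed defined elsewhere:

	static void myfs_disconnect_notify(struct config_group *group,
					   struct config_item *item)
	{
		/* pre-teardown hook; like drop_item(), returns void */
	}

	static void myfs_drop_item(struct config_group *group,
				   struct config_item *item)
	{
		config_item_put(item);
	}

	static struct configfs_group_operations myfs_group_ops = {
		.make_item	   = myfs_make_item,
		.disconnect_notify = myfs_disconnect_notify,
		.drop_item	   = myfs_drop_item,
	};

	static struct configfs_subsystem myfs_subsys = {
		.su_group = {
			.cg_item = {
				.ci_namebuf = "myfs",
				.ci_type    = &myfs_group_type,
			},
		},
	};

	static int __init myfs_init(void)
	{
		config_group_init(&myfs_subsys.su_group);
		mutex_init(&myfs_subsys.su_mutex);  /* replaces the old semaphore init */
		return configfs_register_subsystem(&myfs_subsys);
	}
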
@@ -175,6 +188,11 @@ static inline struct configfs_subsystem *to_configfs_subsystem(struct config_gro | |||
175 | int configfs_register_subsystem(struct configfs_subsystem *subsys); | 188 | int configfs_register_subsystem(struct configfs_subsystem *subsys); |
176 | void configfs_unregister_subsystem(struct configfs_subsystem *subsys); | 189 | void configfs_unregister_subsystem(struct configfs_subsystem *subsys); |
177 | 190 | ||
191 | /* These functions can sleep and can alloc with GFP_KERNEL */ | ||
192 | /* WARNING: These cannot be called underneath configfs callbacks!! */ | ||
193 | int configfs_depend_item(struct configfs_subsystem *subsys, struct config_item *target); | ||
194 | void configfs_undepend_item(struct configfs_subsystem *subsys, struct config_item *target); | ||
195 | |||
178 | #endif /* __KERNEL__ */ | 196 | #endif /* __KERNEL__ */ |
179 | 197 | ||
180 | #endif /* _CONFIGFS_H_ */ | 198 | #endif /* _CONFIGFS_H_ */ |
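The new dependency API above lets kernel code pin a config_item so it cannot be removed via rmdir(2) while in use. A minimal sketch, assuming a configfs_subsystem my_subsys and a config_item *item obtained elsewhere, and honoring the warning that these calls must not be made from within configfs callbacks:

	int myfs_use_item(struct config_item *item)
	{
		int ret;

		ret = configfs_depend_item(&my_subsys, item);
		if (ret)
			return ret;	/* the item may already be going away */

		/* ... the item's resource is in active kernel use ... */

		configfs_undepend_item(&my_subsys, item);
		return 0;
	}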