aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-16 13:52:55 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-16 13:52:55 -0400
commitadd096909da63ef32d6766f6771c07c9f16c6ee5 (patch)
tree58594bcf68cbb6f777d5270d098ab8ca69cbaee3
parente245befce7af0a1e1347079ed62695b059594bd4 (diff)
parent54c57dc3b6578356c0a428c767d4bf080254a2ee (diff)
Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2
* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (32 commits) [PATCH] ocfs2: zero_user_page conversion ocfs2: Support xfs style space reservation ioctls ocfs2: support for removing file regions ocfs2: update truncate handling of partial clusters ocfs2: btree support for removal of arbirtrary extents ocfs2: Support creation of unwritten extents ocfs2: support writing of unwritten extents ocfs2: small cleanup of ocfs2_write_begin_nolock() ocfs2: btree changes for unwritten extents ocfs2: abstract btree growing calls ocfs2: use all extent block suballocators ocfs2: plug truncate into cached dealloc routines ocfs2: simplify deallocation locking ocfs2: harden buffer check during mapping of page blocks ocfs2: shared writeable mmap ocfs2: factor out write aops into nolock variants ocfs2: rework ocfs2_buffered_write_cluster() ocfs2: take ip_alloc_sem during entire truncate ocfs2: Add "preferred slot" mount option [KJ PATCH] Replacing memset(<addr>,0,PAGE_SIZE) with clear_page() in fs/ocfs2/dlm/dlmrecovery.c ...
-rw-r--r--Documentation/filesystems/configfs/configfs.txt57
-rw-r--r--Documentation/filesystems/configfs/configfs_example.c2
-rw-r--r--fs/configfs/configfs_internal.h7
-rw-r--r--fs/configfs/dir.c289
-rw-r--r--fs/configfs/file.c28
-rw-r--r--fs/configfs/item.c29
-rw-r--r--fs/dlm/config.c20
-rw-r--r--fs/ocfs2/alloc.c2676
-rw-r--r--fs/ocfs2/alloc.h43
-rw-r--r--fs/ocfs2/aops.c1015
-rw-r--r--fs/ocfs2/aops.h61
-rw-r--r--fs/ocfs2/cluster/heartbeat.c96
-rw-r--r--fs/ocfs2/cluster/heartbeat.h6
-rw-r--r--fs/ocfs2/cluster/nodemanager.c42
-rw-r--r--fs/ocfs2/cluster/nodemanager.h5
-rw-r--r--fs/ocfs2/cluster/tcp.c21
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c8
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c40
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c79
-rw-r--r--fs/ocfs2/dlmglue.c6
-rw-r--r--fs/ocfs2/endian.h5
-rw-r--r--fs/ocfs2/extent_map.c41
-rw-r--r--fs/ocfs2/file.c702
-rw-r--r--fs/ocfs2/file.h10
-rw-r--r--fs/ocfs2/heartbeat.c10
-rw-r--r--fs/ocfs2/ioctl.c15
-rw-r--r--fs/ocfs2/journal.c6
-rw-r--r--fs/ocfs2/journal.h2
-rw-r--r--fs/ocfs2/mmap.c167
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/ocfs2.h14
-rw-r--r--fs/ocfs2/ocfs2_fs.h33
-rw-r--r--fs/ocfs2/slot_map.c12
-rw-r--r--fs/ocfs2/suballoc.c46
-rw-r--r--fs/ocfs2/suballoc.h17
-rw-r--r--fs/ocfs2/super.c27
-rw-r--r--fs/ocfs2/super.h2
-rw-r--r--include/linux/configfs.h34
39 files changed, 4623 insertions, 1054 deletions
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
index b34cdb50eab4..d1b98257d000 100644
--- a/Documentation/filesystems/configfs/configfs.txt
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -238,6 +238,8 @@ config_item_type.
238 struct config_group *(*make_group)(struct config_group *group, 238 struct config_group *(*make_group)(struct config_group *group,
239 const char *name); 239 const char *name);
240 int (*commit_item)(struct config_item *item); 240 int (*commit_item)(struct config_item *item);
241 void (*disconnect_notify)(struct config_group *group,
242 struct config_item *item);
241 void (*drop_item)(struct config_group *group, 243 void (*drop_item)(struct config_group *group,
242 struct config_item *item); 244 struct config_item *item);
243 }; 245 };
@@ -268,6 +270,16 @@ the item in other threads, the memory is safe. It may take some time
268for the item to actually disappear from the subsystem's usage. But it 270for the item to actually disappear from the subsystem's usage. But it
269is gone from configfs. 271is gone from configfs.
270 272
273When drop_item() is called, the item's linkage has already been torn
274down. It no longer has a reference on its parent and has no place in
275the item hierarchy. If a client needs to do some cleanup before this
276teardown happens, the subsystem can implement the
277ct_group_ops->disconnect_notify() method. The method is called after
278configfs has removed the item from the filesystem view but before the
279item is removed from its parent group. Like drop_item(),
280disconnect_notify() is void and cannot fail. Client subsystems should
281not drop any references here, as they still must do it in drop_item().
282
271A config_group cannot be removed while it still has child items. This 283A config_group cannot be removed while it still has child items. This
272is implemented in the configfs rmdir(2) code. ->drop_item() will not be 284is implemented in the configfs rmdir(2) code. ->drop_item() will not be
273called, as the item has not been dropped. rmdir(2) will fail, as the 285called, as the item has not been dropped. rmdir(2) will fail, as the
@@ -280,18 +292,18 @@ tells configfs to make the subsystem appear in the file tree.
280 292
281 struct configfs_subsystem { 293 struct configfs_subsystem {
282 struct config_group su_group; 294 struct config_group su_group;
283 struct semaphore su_sem; 295 struct mutex su_mutex;
284 }; 296 };
285 297
286 int configfs_register_subsystem(struct configfs_subsystem *subsys); 298 int configfs_register_subsystem(struct configfs_subsystem *subsys);
287 void configfs_unregister_subsystem(struct configfs_subsystem *subsys); 299 void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
288 300
289 A subsystem consists of a toplevel config_group and a semaphore. 301 A subsystem consists of a toplevel config_group and a mutex.
290The group is where child config_items are created. For a subsystem, 302The group is where child config_items are created. For a subsystem,
291this group is usually defined statically. Before calling 303this group is usually defined statically. Before calling
292configfs_register_subsystem(), the subsystem must have initialized the 304configfs_register_subsystem(), the subsystem must have initialized the
293group via the usual group _init() functions, and it must also have 305group via the usual group _init() functions, and it must also have
294initialized the semaphore. 306initialized the mutex.
295 When the register call returns, the subsystem is live, and it 307 When the register call returns, the subsystem is live, and it
296will be visible via configfs. At that point, mkdir(2) can be called and 308will be visible via configfs. At that point, mkdir(2) can be called and
297the subsystem must be ready for it. 309the subsystem must be ready for it.
@@ -303,7 +315,7 @@ subsystem/group and the simple_child item in configfs_example.c It
303shows a trivial object displaying and storing an attribute, and a simple 315shows a trivial object displaying and storing an attribute, and a simple
304group creating and destroying these children. 316group creating and destroying these children.
305 317
306[Hierarchy Navigation and the Subsystem Semaphore] 318[Hierarchy Navigation and the Subsystem Mutex]
307 319
308There is an extra bonus that configfs provides. The config_groups and 320There is an extra bonus that configfs provides. The config_groups and
309config_items are arranged in a hierarchy due to the fact that they 321config_items are arranged in a hierarchy due to the fact that they
@@ -314,19 +326,19 @@ and config_item->ci_parent structure members.
314 326
315A subsystem can navigate the cg_children list and the ci_parent pointer 327A subsystem can navigate the cg_children list and the ci_parent pointer
316to see the tree created by the subsystem. This can race with configfs' 328to see the tree created by the subsystem. This can race with configfs'
317management of the hierarchy, so configfs uses the subsystem semaphore to 329management of the hierarchy, so configfs uses the subsystem mutex to
318protect modifications. Whenever a subsystem wants to navigate the 330protect modifications. Whenever a subsystem wants to navigate the
319hierarchy, it must do so under the protection of the subsystem 331hierarchy, it must do so under the protection of the subsystem
320semaphore. 332mutex.
321 333
322A subsystem will be prevented from acquiring the semaphore while a newly 334A subsystem will be prevented from acquiring the mutex while a newly
323allocated item has not been linked into this hierarchy. Similarly, it 335allocated item has not been linked into this hierarchy. Similarly, it
324will not be able to acquire the semaphore while a dropping item has not 336will not be able to acquire the mutex while a dropping item has not
325yet been unlinked. This means that an item's ci_parent pointer will 337yet been unlinked. This means that an item's ci_parent pointer will
326never be NULL while the item is in configfs, and that an item will only 338never be NULL while the item is in configfs, and that an item will only
327be in its parent's cg_children list for the same duration. This allows 339be in its parent's cg_children list for the same duration. This allows
328a subsystem to trust ci_parent and cg_children while they hold the 340a subsystem to trust ci_parent and cg_children while they hold the
329semaphore. 341mutex.
330 342
331[Item Aggregation Via symlink(2)] 343[Item Aggregation Via symlink(2)]
332 344
@@ -386,6 +398,33 @@ As a consequence of this, default_groups cannot be removed directly via
386rmdir(2). They also are not considered when rmdir(2) on the parent 398rmdir(2). They also are not considered when rmdir(2) on the parent
387group is checking for children. 399group is checking for children.
388 400
401[Dependant Subsystems]
402
403Sometimes other drivers depend on particular configfs items. For
404example, ocfs2 mounts depend on a heartbeat region item. If that
405region item is removed with rmdir(2), the ocfs2 mount must BUG or go
406readonly. Not happy.
407
408configfs provides two additional API calls: configfs_depend_item() and
409configfs_undepend_item(). A client driver can call
410configfs_depend_item() on an existing item to tell configfs that it is
411depended on. configfs will then return -EBUSY from rmdir(2) for that
412item. When the item is no longer depended on, the client driver calls
413configfs_undepend_item() on it.
414
415These APIs cannot be called underneath any configfs callbacks, as
416they will conflict. They can block and allocate. A client driver
417probably shouldn't be calling them of its own gumption. Rather it should
418be providing an API that external subsystems call.
419
420How does this work? Imagine the ocfs2 mount process. When it mounts,
421it asks for a heartbeat region item. This is done via a call into the
422heartbeat code. Inside the heartbeat code, the region item is looked
423up. Here, the heartbeat code calls configfs_depend_item(). If it
424succeeds, then heartbeat knows the region is safe to give to ocfs2.
425If it fails, it was being torn down anyway, and heartbeat can gracefully
426pass up an error.
427
389[Committable Items] 428[Committable Items]
390 429
391NOTE: Committable items are currently unimplemented. 430NOTE: Committable items are currently unimplemented.
diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c
index 2d6a14a463e0..e56d49264b39 100644
--- a/Documentation/filesystems/configfs/configfs_example.c
+++ b/Documentation/filesystems/configfs/configfs_example.c
@@ -453,7 +453,7 @@ static int __init configfs_example_init(void)
453 subsys = example_subsys[i]; 453 subsys = example_subsys[i];
454 454
455 config_group_init(&subsys->su_group); 455 config_group_init(&subsys->su_group);
456 init_MUTEX(&subsys->su_sem); 456 mutex_init(&subsys->su_mutex);
457 ret = configfs_register_subsystem(subsys); 457 ret = configfs_register_subsystem(subsys);
458 if (ret) { 458 if (ret) {
459 printk(KERN_ERR "Error %d while registering subsystem %s\n", 459 printk(KERN_ERR "Error %d while registering subsystem %s\n",
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 7b48c034b312..3b0185fdf9a4 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -29,10 +29,11 @@
29 29
30struct configfs_dirent { 30struct configfs_dirent {
31 atomic_t s_count; 31 atomic_t s_count;
32 int s_dependent_count;
32 struct list_head s_sibling; 33 struct list_head s_sibling;
33 struct list_head s_children; 34 struct list_head s_children;
34 struct list_head s_links; 35 struct list_head s_links;
35 void * s_element; 36 void * s_element;
36 int s_type; 37 int s_type;
37 umode_t s_mode; 38 umode_t s_mode;
38 struct dentry * s_dentry; 39 struct dentry * s_dentry;
@@ -41,8 +42,8 @@ struct configfs_dirent {
41 42
42#define CONFIGFS_ROOT 0x0001 43#define CONFIGFS_ROOT 0x0001
43#define CONFIGFS_DIR 0x0002 44#define CONFIGFS_DIR 0x0002
44#define CONFIGFS_ITEM_ATTR 0x0004 45#define CONFIGFS_ITEM_ATTR 0x0004
45#define CONFIGFS_ITEM_LINK 0x0020 46#define CONFIGFS_ITEM_LINK 0x0020
46#define CONFIGFS_USET_DIR 0x0040 47#define CONFIGFS_USET_DIR 0x0040
47#define CONFIGFS_USET_DEFAULT 0x0080 48#define CONFIGFS_USET_DEFAULT 0x0080
48#define CONFIGFS_USET_DROPPING 0x0100 49#define CONFIGFS_USET_DROPPING 0x0100
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5e6e37e58f36..2f436d4f1d6d 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -355,6 +355,10 @@ static int configfs_detach_prep(struct dentry *dentry)
355 /* Mark that we've taken i_mutex */ 355 /* Mark that we've taken i_mutex */
356 sd->s_type |= CONFIGFS_USET_DROPPING; 356 sd->s_type |= CONFIGFS_USET_DROPPING;
357 357
358 /*
359 * Yup, recursive. If there's a problem, blame
360 * deep nesting of default_groups
361 */
358 ret = configfs_detach_prep(sd->s_dentry); 362 ret = configfs_detach_prep(sd->s_dentry);
359 if (!ret) 363 if (!ret)
360 continue; 364 continue;
@@ -562,7 +566,7 @@ static int populate_groups(struct config_group *group)
562 566
563/* 567/*
564 * All of link_obj/unlink_obj/link_group/unlink_group require that 568 * All of link_obj/unlink_obj/link_group/unlink_group require that
565 * subsys->su_sem is held. 569 * subsys->su_mutex is held.
566 */ 570 */
567 571
568static void unlink_obj(struct config_item *item) 572static void unlink_obj(struct config_item *item)
@@ -714,6 +718,28 @@ static void configfs_detach_group(struct config_item *item)
714} 718}
715 719
716/* 720/*
721 * After the item has been detached from the filesystem view, we are
722 * ready to tear it out of the hierarchy. Notify the client before
723 * we do that so they can perform any cleanup that requires
724 * navigating the hierarchy. A client does not need to provide this
725 * callback. The subsystem mutex MUST be held by the caller, and
726 * references must be valid for both items. It also assumes the
727 * caller has validated ci_type.
728 */
729static void client_disconnect_notify(struct config_item *parent_item,
730 struct config_item *item)
731{
732 struct config_item_type *type;
733
734 type = parent_item->ci_type;
735 BUG_ON(!type);
736
737 if (type->ct_group_ops && type->ct_group_ops->disconnect_notify)
738 type->ct_group_ops->disconnect_notify(to_config_group(parent_item),
739 item);
740}
741
742/*
717 * Drop the initial reference from make_item()/make_group() 743 * Drop the initial reference from make_item()/make_group()
718 * This function assumes that reference is held on item 744 * This function assumes that reference is held on item
719 * and that item holds a valid reference to the parent. Also, it 745 * and that item holds a valid reference to the parent. Also, it
@@ -733,11 +759,244 @@ static void client_drop_item(struct config_item *parent_item,
733 */ 759 */
734 if (type->ct_group_ops && type->ct_group_ops->drop_item) 760 if (type->ct_group_ops && type->ct_group_ops->drop_item)
735 type->ct_group_ops->drop_item(to_config_group(parent_item), 761 type->ct_group_ops->drop_item(to_config_group(parent_item),
736 item); 762 item);
737 else 763 else
738 config_item_put(item); 764 config_item_put(item);
739} 765}
740 766
767#ifdef DEBUG
768static void configfs_dump_one(struct configfs_dirent *sd, int level)
769{
770 printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd));
771
772#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type);
773 type_print(CONFIGFS_ROOT);
774 type_print(CONFIGFS_DIR);
775 type_print(CONFIGFS_ITEM_ATTR);
776 type_print(CONFIGFS_ITEM_LINK);
777 type_print(CONFIGFS_USET_DIR);
778 type_print(CONFIGFS_USET_DEFAULT);
779 type_print(CONFIGFS_USET_DROPPING);
780#undef type_print
781}
782
783static int configfs_dump(struct configfs_dirent *sd, int level)
784{
785 struct configfs_dirent *child_sd;
786 int ret = 0;
787
788 configfs_dump_one(sd, level);
789
790 if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT)))
791 return 0;
792
793 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
794 ret = configfs_dump(child_sd, level + 2);
795 if (ret)
796 break;
797 }
798
799 return ret;
800}
801#endif
802
803
804/*
805 * configfs_depend_item() and configfs_undepend_item()
806 *
807 * WARNING: Do not call these from a configfs callback!
808 *
809 * This describes these functions and their helpers.
810 *
811 * Allow another kernel system to depend on a config_item. If this
812 * happens, the item cannot go away until the dependant can live without
813 * it. The idea is to give client modules as simple an interface as
814 * possible. When a system asks them to depend on an item, they just
815 * call configfs_depend_item(). If the item is live and the client
816 * driver is in good shape, we'll happily do the work for them.
817 *
818 * Why is the locking complex? Because configfs uses the VFS to handle
819 * all locking, but this function is called outside the normal
820 * VFS->configfs path. So it must take VFS locks to prevent the
821 * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is
822 * why you can't call these functions underneath configfs callbacks.
823 *
824 * Note, btw, that this can be called at *any* time, even when a configfs
825 * subsystem isn't registered, or when configfs is loading or unloading.
826 * Just like configfs_register_subsystem(). So we take the same
827 * precautions. We pin the filesystem. We lock each i_mutex _in_order_
828 * on our way down the tree. If we can find the target item in the
829 * configfs tree, it must be part of the subsystem tree as well, so we
830 * do not need the subsystem mutex. Holding the i_mutex chain locks
831 * out mkdir() and rmdir(), who might be racing us.
832 */
833
834/*
835 * configfs_depend_prep()
836 *
837 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
838 * attributes. This is similar to, but not the same as, configfs_detach_prep().
839 * Note that configfs_detach_prep() expects the parent to be locked when it
840 * is called, but we lock the parent *inside* configfs_depend_prep(). We
841 * do that so we can unlock it if we find nothing.
842 *
843 * Here we do a depth-first search of the dentry hierarchy looking for
844 * our object. We take i_mutex on each step of the way down. IT IS
845 * ESSENTIAL THAT i_mutex LOCKING IS ORDERED. If we come back up a branch,
846 * we'll drop the i_mutex.
847 *
848 * If the target is not found, -ENOENT is bubbled up and we have released
849 * all locks. If the target was found, the locks will be cleared by
850 * configfs_depend_rollback().
851 *
852 * This adds a requirement that all config_items be unique!
853 *
854 * This is recursive because the locking traversal is tricky. There isn't
855 * much on the stack, though, so folks that need this function - be careful
856 * about your stack! Patches will be accepted to make it iterative.
857 */
858static int configfs_depend_prep(struct dentry *origin,
859 struct config_item *target)
860{
861 struct configfs_dirent *child_sd, *sd = origin->d_fsdata;
862 int ret = 0;
863
864 BUG_ON(!origin || !sd);
865
866 /* Lock this guy on the way down */
867 mutex_lock(&sd->s_dentry->d_inode->i_mutex);
868 if (sd->s_element == target) /* Boo-yah */
869 goto out;
870
871 list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
872 if (child_sd->s_type & CONFIGFS_DIR) {
873 ret = configfs_depend_prep(child_sd->s_dentry,
874 target);
875 if (!ret)
876 goto out; /* Child path boo-yah */
877 }
878 }
879
880 /* We looped all our children and didn't find target */
881 mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
882 ret = -ENOENT;
883
884out:
885 return ret;
886}
887
888/*
889 * This is ONLY called if configfs_depend_prep() did its job. So we can
890 * trust the entire path from item back up to origin.
891 *
892 * We walk backwards from item, unlocking each i_mutex. We finish by
893 * unlocking origin.
894 */
895static void configfs_depend_rollback(struct dentry *origin,
896 struct config_item *item)
897{
898 struct dentry *dentry = item->ci_dentry;
899
900 while (dentry != origin) {
901 mutex_unlock(&dentry->d_inode->i_mutex);
902 dentry = dentry->d_parent;
903 }
904
905 mutex_unlock(&origin->d_inode->i_mutex);
906}
907
908int configfs_depend_item(struct configfs_subsystem *subsys,
909 struct config_item *target)
910{
911 int ret;
912 struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
913 struct config_item *s_item = &subsys->su_group.cg_item;
914
915 /*
916 * Pin the configfs filesystem. This means we can safely access
917 * the root of the configfs filesystem.
918 */
919 ret = configfs_pin_fs();
920 if (ret)
921 return ret;
922
923 /*
924 * Next, lock the root directory. We're going to check that the
925 * subsystem is really registered, and so we need to lock out
926 * configfs_[un]register_subsystem().
927 */
928 mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
929
930 root_sd = configfs_sb->s_root->d_fsdata;
931
932 list_for_each_entry(p, &root_sd->s_children, s_sibling) {
933 if (p->s_type & CONFIGFS_DIR) {
934 if (p->s_element == s_item) {
935 subsys_sd = p;
936 break;
937 }
938 }
939 }
940
941 if (!subsys_sd) {
942 ret = -ENOENT;
943 goto out_unlock_fs;
944 }
945
946 /* Ok, now we can trust subsys/s_item */
947
948 /* Scan the tree, locking i_mutex recursively, return 0 if found */
949 ret = configfs_depend_prep(subsys_sd->s_dentry, target);
950 if (ret)
951 goto out_unlock_fs;
952
953 /* We hold all i_mutexes from the subsystem down to the target */
954 p = target->ci_dentry->d_fsdata;
955 p->s_dependent_count += 1;
956
957 configfs_depend_rollback(subsys_sd->s_dentry, target);
958
959out_unlock_fs:
960 mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
961
962 /*
963 * If we succeeded, the fs is pinned via other methods. If not,
964 * we're done with it anyway. So release_fs() is always right.
965 */
966 configfs_release_fs();
967
968 return ret;
969}
970EXPORT_SYMBOL(configfs_depend_item);
971
972/*
973 * Release the dependent linkage. This is much simpler than
974 * configfs_depend_item() because we know that the client driver is
975 * pinned, thus the subsystem is pinned, and therefore configfs is pinned.
976 */
977void configfs_undepend_item(struct configfs_subsystem *subsys,
978 struct config_item *target)
979{
980 struct configfs_dirent *sd;
981
982 /*
983 * Since we can trust everything is pinned, we just need i_mutex
984 * on the item.
985 */
986 mutex_lock(&target->ci_dentry->d_inode->i_mutex);
987
988 sd = target->ci_dentry->d_fsdata;
989 BUG_ON(sd->s_dependent_count < 1);
990
991 sd->s_dependent_count -= 1;
992
993 /*
994 * After this unlock, we cannot trust the item to stay alive!
995 * DO NOT REFERENCE item after this unlock.
996 */
997 mutex_unlock(&target->ci_dentry->d_inode->i_mutex);
998}
999EXPORT_SYMBOL(configfs_undepend_item);
741 1000
742static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 1001static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
743{ 1002{
@@ -783,7 +1042,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
783 1042
784 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); 1043 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
785 1044
786 down(&subsys->su_sem); 1045 mutex_lock(&subsys->su_mutex);
787 group = NULL; 1046 group = NULL;
788 item = NULL; 1047 item = NULL;
789 if (type->ct_group_ops->make_group) { 1048 if (type->ct_group_ops->make_group) {
@@ -797,7 +1056,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
797 if (item) 1056 if (item)
798 link_obj(parent_item, item); 1057 link_obj(parent_item, item);
799 } 1058 }
800 up(&subsys->su_sem); 1059 mutex_unlock(&subsys->su_mutex);
801 1060
802 kfree(name); 1061 kfree(name);
803 if (!item) { 1062 if (!item) {
@@ -841,13 +1100,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
841out_unlink: 1100out_unlink:
842 if (ret) { 1101 if (ret) {
843 /* Tear down everything we built up */ 1102 /* Tear down everything we built up */
844 down(&subsys->su_sem); 1103 mutex_lock(&subsys->su_mutex);
1104
1105 client_disconnect_notify(parent_item, item);
845 if (group) 1106 if (group)
846 unlink_group(group); 1107 unlink_group(group);
847 else 1108 else
848 unlink_obj(item); 1109 unlink_obj(item);
849 client_drop_item(parent_item, item); 1110 client_drop_item(parent_item, item);
850 up(&subsys->su_sem); 1111
1112 mutex_unlock(&subsys->su_mutex);
851 1113
852 if (module_got) 1114 if (module_got)
853 module_put(owner); 1115 module_put(owner);
@@ -881,6 +1143,13 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
881 if (sd->s_type & CONFIGFS_USET_DEFAULT) 1143 if (sd->s_type & CONFIGFS_USET_DEFAULT)
882 return -EPERM; 1144 return -EPERM;
883 1145
1146 /*
1147 * Here's where we check for dependents. We're protected by
1148 * i_mutex.
1149 */
1150 if (sd->s_dependent_count)
1151 return -EBUSY;
1152
884 /* Get a working ref until we have the child */ 1153 /* Get a working ref until we have the child */
885 parent_item = configfs_get_config_item(dentry->d_parent); 1154 parent_item = configfs_get_config_item(dentry->d_parent);
886 subsys = to_config_group(parent_item)->cg_subsys; 1155 subsys = to_config_group(parent_item)->cg_subsys;
@@ -910,17 +1179,19 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
910 if (sd->s_type & CONFIGFS_USET_DIR) { 1179 if (sd->s_type & CONFIGFS_USET_DIR) {
911 configfs_detach_group(item); 1180 configfs_detach_group(item);
912 1181
913 down(&subsys->su_sem); 1182 mutex_lock(&subsys->su_mutex);
1183 client_disconnect_notify(parent_item, item);
914 unlink_group(to_config_group(item)); 1184 unlink_group(to_config_group(item));
915 } else { 1185 } else {
916 configfs_detach_item(item); 1186 configfs_detach_item(item);
917 1187
918 down(&subsys->su_sem); 1188 mutex_lock(&subsys->su_mutex);
1189 client_disconnect_notify(parent_item, item);
919 unlink_obj(item); 1190 unlink_obj(item);
920 } 1191 }
921 1192
922 client_drop_item(parent_item, item); 1193 client_drop_item(parent_item, item);
923 up(&subsys->su_sem); 1194 mutex_unlock(&subsys->su_mutex);
924 1195
925 /* Drop our reference from above */ 1196 /* Drop our reference from above */
926 config_item_put(item); 1197 config_item_put(item);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 3527c7c6def8..a3658f9a082c 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -27,19 +27,26 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/mutex.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/semaphore.h>
32 32
33#include <linux/configfs.h> 33#include <linux/configfs.h>
34#include "configfs_internal.h" 34#include "configfs_internal.h"
35 35
36/*
37 * A simple attribute can only be 4096 characters. Why 4k? Because the
38 * original code limited it to PAGE_SIZE. That's a bad idea, though,
39 * because an attribute of 16k on ia64 won't work on x86. So we limit to
40 * 4k, our minimum common page size.
41 */
42#define SIMPLE_ATTR_SIZE 4096
36 43
37struct configfs_buffer { 44struct configfs_buffer {
38 size_t count; 45 size_t count;
39 loff_t pos; 46 loff_t pos;
40 char * page; 47 char * page;
41 struct configfs_item_operations * ops; 48 struct configfs_item_operations * ops;
42 struct semaphore sem; 49 struct mutex mutex;
43 int needs_read_fill; 50 int needs_read_fill;
44}; 51};
45 52
@@ -69,7 +76,7 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
69 76
70 count = ops->show_attribute(item,attr,buffer->page); 77 count = ops->show_attribute(item,attr,buffer->page);
71 buffer->needs_read_fill = 0; 78 buffer->needs_read_fill = 0;
72 BUG_ON(count > (ssize_t)PAGE_SIZE); 79 BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
73 if (count >= 0) 80 if (count >= 0)
74 buffer->count = count; 81 buffer->count = count;
75 else 82 else
@@ -102,7 +109,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
102 struct configfs_buffer * buffer = file->private_data; 109 struct configfs_buffer * buffer = file->private_data;
103 ssize_t retval = 0; 110 ssize_t retval = 0;
104 111
105 down(&buffer->sem); 112 mutex_lock(&buffer->mutex);
106 if (buffer->needs_read_fill) { 113 if (buffer->needs_read_fill) {
107 if ((retval = fill_read_buffer(file->f_path.dentry,buffer))) 114 if ((retval = fill_read_buffer(file->f_path.dentry,buffer)))
108 goto out; 115 goto out;
@@ -112,7 +119,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
112 retval = simple_read_from_buffer(buf, count, ppos, buffer->page, 119 retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
113 buffer->count); 120 buffer->count);
114out: 121out:
115 up(&buffer->sem); 122 mutex_unlock(&buffer->mutex);
116 return retval; 123 return retval;
117} 124}
118 125
@@ -137,8 +144,8 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size
137 if (!buffer->page) 144 if (!buffer->page)
138 return -ENOMEM; 145 return -ENOMEM;
139 146
140 if (count >= PAGE_SIZE) 147 if (count >= SIMPLE_ATTR_SIZE)
141 count = PAGE_SIZE - 1; 148 count = SIMPLE_ATTR_SIZE - 1;
142 error = copy_from_user(buffer->page,buf,count); 149 error = copy_from_user(buffer->page,buf,count);
143 buffer->needs_read_fill = 1; 150 buffer->needs_read_fill = 1;
144 /* if buf is assumed to contain a string, terminate it by \0, 151 /* if buf is assumed to contain a string, terminate it by \0,
@@ -193,13 +200,13 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
193 struct configfs_buffer * buffer = file->private_data; 200 struct configfs_buffer * buffer = file->private_data;
194 ssize_t len; 201 ssize_t len;
195 202
196 down(&buffer->sem); 203 mutex_lock(&buffer->mutex);
197 len = fill_write_buffer(buffer, buf, count); 204 len = fill_write_buffer(buffer, buf, count);
198 if (len > 0) 205 if (len > 0)
199 len = flush_write_buffer(file->f_path.dentry, buffer, count); 206 len = flush_write_buffer(file->f_path.dentry, buffer, count);
200 if (len > 0) 207 if (len > 0)
201 *ppos += len; 208 *ppos += len;
202 up(&buffer->sem); 209 mutex_unlock(&buffer->mutex);
203 return len; 210 return len;
204} 211}
205 212
@@ -253,7 +260,7 @@ static int check_perm(struct inode * inode, struct file * file)
253 error = -ENOMEM; 260 error = -ENOMEM;
254 goto Enomem; 261 goto Enomem;
255 } 262 }
256 init_MUTEX(&buffer->sem); 263 mutex_init(&buffer->mutex);
257 buffer->needs_read_fill = 1; 264 buffer->needs_read_fill = 1;
258 buffer->ops = ops; 265 buffer->ops = ops;
259 file->private_data = buffer; 266 file->private_data = buffer;
@@ -292,6 +299,7 @@ static int configfs_release(struct inode * inode, struct file * filp)
292 if (buffer) { 299 if (buffer) {
293 if (buffer->page) 300 if (buffer->page)
294 free_page((unsigned long)buffer->page); 301 free_page((unsigned long)buffer->page);
302 mutex_destroy(&buffer->mutex);
295 kfree(buffer); 303 kfree(buffer);
296 } 304 }
297 return 0; 305 return 0;
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 24421209f854..76dc4c3e5d51 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -62,7 +62,6 @@ void config_item_init(struct config_item * item)
62 * dynamically allocated string that @item->ci_name points to. 62 * dynamically allocated string that @item->ci_name points to.
63 * Otherwise, use the static @item->ci_namebuf array. 63 * Otherwise, use the static @item->ci_namebuf array.
64 */ 64 */
65
66int config_item_set_name(struct config_item * item, const char * fmt, ...) 65int config_item_set_name(struct config_item * item, const char * fmt, ...)
67{ 66{
68 int error = 0; 67 int error = 0;
@@ -139,12 +138,7 @@ struct config_item * config_item_get(struct config_item * item)
139 return item; 138 return item;
140} 139}
141 140
142/** 141static void config_item_cleanup(struct config_item * item)
143 * config_item_cleanup - free config_item resources.
144 * @item: item.
145 */
146
147void config_item_cleanup(struct config_item * item)
148{ 142{
149 struct config_item_type * t = item->ci_type; 143 struct config_item_type * t = item->ci_type;
150 struct config_group * s = item->ci_group; 144 struct config_group * s = item->ci_group;
@@ -179,39 +173,35 @@ void config_item_put(struct config_item * item)
179 kref_put(&item->ci_kref, config_item_release); 173 kref_put(&item->ci_kref, config_item_release);
180} 174}
181 175
182
183/** 176/**
184 * config_group_init - initialize a group for use 177 * config_group_init - initialize a group for use
185 * @group: group 178 * @group: group
186 */ 179 */
187
188void config_group_init(struct config_group *group) 180void config_group_init(struct config_group *group)
189{ 181{
190 config_item_init(&group->cg_item); 182 config_item_init(&group->cg_item);
191 INIT_LIST_HEAD(&group->cg_children); 183 INIT_LIST_HEAD(&group->cg_children);
192} 184}
193 185
194
195/** 186/**
196 * config_group_find_obj - search for item in group. 187 * config_group_find_item - search for item in group.
197 * @group: group we're looking in. 188 * @group: group we're looking in.
198 * @name: item's name. 189 * @name: item's name.
199 * 190 *
200 * Lock group via @group->cg_subsys, and iterate over @group->cg_list, 191 * Iterate over @group->cg_children, looking for a matching config_item.
201 * looking for a matching config_item. If matching item is found 192 * If matching item is found take a reference and return the item.
202 * take a reference and return the item. 193 * Caller must have locked group via @group->cg_subsys->su_mutex.
203 */ 194 */
204 195struct config_item *config_group_find_item(struct config_group *group,
205struct config_item * config_group_find_obj(struct config_group * group, const char * name) 196 const char *name)
206{ 197{
207 struct list_head * entry; 198 struct list_head * entry;
208 struct config_item * ret = NULL; 199 struct config_item * ret = NULL;
209 200
210 /* XXX LOCKING! */
211 list_for_each(entry,&group->cg_children) { 201 list_for_each(entry,&group->cg_children) {
212 struct config_item * item = to_item(entry); 202 struct config_item * item = to_item(entry);
213 if (config_item_name(item) && 203 if (config_item_name(item) &&
214 !strcmp(config_item_name(item), name)) { 204 !strcmp(config_item_name(item), name)) {
215 ret = config_item_get(item); 205 ret = config_item_get(item);
216 break; 206 break;
217 } 207 }
@@ -219,9 +209,8 @@ struct config_item * config_group_find_obj(struct config_group * group, const ch
219 return ret; 209 return ret;
220} 210}
221 211
222
223EXPORT_SYMBOL(config_item_init); 212EXPORT_SYMBOL(config_item_init);
224EXPORT_SYMBOL(config_group_init); 213EXPORT_SYMBOL(config_group_init);
225EXPORT_SYMBOL(config_item_get); 214EXPORT_SYMBOL(config_item_get);
226EXPORT_SYMBOL(config_item_put); 215EXPORT_SYMBOL(config_item_put);
227EXPORT_SYMBOL(config_group_find_obj); 216EXPORT_SYMBOL(config_group_find_item);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 5069b2cb5a1f..2f8e3c81bc19 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -133,14 +133,6 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
133 return len; 133 return len;
134} 134}
135 135
136#define __CONFIGFS_ATTR(_name,_mode,_read,_write) { \
137 .attr = { .ca_name = __stringify(_name), \
138 .ca_mode = _mode, \
139 .ca_owner = THIS_MODULE }, \
140 .show = _read, \
141 .store = _write, \
142}
143
144#define CLUSTER_ATTR(name, check_zero) \ 136#define CLUSTER_ATTR(name, check_zero) \
145static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \ 137static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \
146{ \ 138{ \
@@ -615,7 +607,7 @@ static struct clusters clusters_root = {
615int dlm_config_init(void) 607int dlm_config_init(void)
616{ 608{
617 config_group_init(&clusters_root.subsys.su_group); 609 config_group_init(&clusters_root.subsys.su_group);
618 init_MUTEX(&clusters_root.subsys.su_sem); 610 mutex_init(&clusters_root.subsys.su_mutex);
619 return configfs_register_subsystem(&clusters_root.subsys); 611 return configfs_register_subsystem(&clusters_root.subsys);
620} 612}
621 613
@@ -759,9 +751,9 @@ static struct space *get_space(char *name)
759 if (!space_list) 751 if (!space_list)
760 return NULL; 752 return NULL;
761 753
762 down(&space_list->cg_subsys->su_sem); 754 mutex_lock(&space_list->cg_subsys->su_mutex);
763 i = config_group_find_obj(space_list, name); 755 i = config_group_find_item(space_list, name);
764 up(&space_list->cg_subsys->su_sem); 756 mutex_unlock(&space_list->cg_subsys->su_mutex);
765 757
766 return to_space(i); 758 return to_space(i);
767} 759}
@@ -780,7 +772,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
780 if (!comm_list) 772 if (!comm_list)
781 return NULL; 773 return NULL;
782 774
783 down(&clusters_root.subsys.su_sem); 775 mutex_lock(&clusters_root.subsys.su_mutex);
784 776
785 list_for_each_entry(i, &comm_list->cg_children, ci_entry) { 777 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
786 cm = to_comm(i); 778 cm = to_comm(i);
@@ -800,7 +792,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
800 break; 792 break;
801 } 793 }
802 } 794 }
803 up(&clusters_root.subsys.su_sem); 795 mutex_unlock(&clusters_root.subsys.su_mutex);
804 796
805 if (!found) 797 if (!found)
806 cm = NULL; 798 cm = NULL;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19712a7d145f..f5e11f4fa952 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -50,6 +50,8 @@
50#include "buffer_head_io.h" 50#include "buffer_head_io.h"
51 51
52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
53static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
54 struct ocfs2_extent_block *eb);
53 55
54/* 56/*
55 * Structures which describe a path through a btree, and functions to 57 * Structures which describe a path through a btree, and functions to
@@ -117,6 +119,31 @@ static void ocfs2_free_path(struct ocfs2_path *path)
117} 119}
118 120
119/* 121/*
122 * Copy all the elements of src into dest. After this call, src could be freed
123 * without affecting dest.
124 *
125 * Both paths should have the same root. Any non-root elements of dest
126 * will be freed.
127 */
128static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
129{
130 int i;
131
132 BUG_ON(path_root_bh(dest) != path_root_bh(src));
133 BUG_ON(path_root_el(dest) != path_root_el(src));
134
135 ocfs2_reinit_path(dest, 1);
136
137 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
138 dest->p_node[i].bh = src->p_node[i].bh;
139 dest->p_node[i].el = src->p_node[i].el;
140
141 if (dest->p_node[i].bh)
142 get_bh(dest->p_node[i].bh);
143 }
144}
145
146/*
120 * Make the *dest path the same as src and re-initialize src path to 147 * Make the *dest path the same as src and re-initialize src path to
121 * have a root only. 148 * have a root only.
122 */ 149 */
@@ -212,10 +239,41 @@ out:
212 return ret; 239 return ret;
213} 240}
214 241
242/*
243 * Return the index of the extent record which contains cluster #v_cluster.
244 * -1 is returned if it was not found.
245 *
246 * Should work fine on interior and exterior nodes.
247 */
248int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
249{
250 int ret = -1;
251 int i;
252 struct ocfs2_extent_rec *rec;
253 u32 rec_end, rec_start, clusters;
254
255 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
256 rec = &el->l_recs[i];
257
258 rec_start = le32_to_cpu(rec->e_cpos);
259 clusters = ocfs2_rec_clusters(el, rec);
260
261 rec_end = rec_start + clusters;
262
263 if (v_cluster >= rec_start && v_cluster < rec_end) {
264 ret = i;
265 break;
266 }
267 }
268
269 return ret;
270}
271
215enum ocfs2_contig_type { 272enum ocfs2_contig_type {
216 CONTIG_NONE = 0, 273 CONTIG_NONE = 0,
217 CONTIG_LEFT, 274 CONTIG_LEFT,
218 CONTIG_RIGHT 275 CONTIG_RIGHT,
276 CONTIG_LEFTRIGHT,
219}; 277};
220 278
221 279
@@ -253,6 +311,14 @@ static enum ocfs2_contig_type
253{ 311{
254 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 312 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
255 313
314 /*
315 * Refuse to coalesce extent records with different flag
316 * fields - we don't want to mix unwritten extents with user
317 * data.
318 */
319 if (ext->e_flags != insert_rec->e_flags)
320 return CONTIG_NONE;
321
256 if (ocfs2_extents_adjacent(ext, insert_rec) && 322 if (ocfs2_extents_adjacent(ext, insert_rec) &&
257 ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 323 ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
258 return CONTIG_RIGHT; 324 return CONTIG_RIGHT;
@@ -277,7 +343,14 @@ enum ocfs2_append_type {
277 APPEND_TAIL, 343 APPEND_TAIL,
278}; 344};
279 345
346enum ocfs2_split_type {
347 SPLIT_NONE = 0,
348 SPLIT_LEFT,
349 SPLIT_RIGHT,
350};
351
280struct ocfs2_insert_type { 352struct ocfs2_insert_type {
353 enum ocfs2_split_type ins_split;
281 enum ocfs2_append_type ins_appending; 354 enum ocfs2_append_type ins_appending;
282 enum ocfs2_contig_type ins_contig; 355 enum ocfs2_contig_type ins_contig;
283 int ins_contig_index; 356 int ins_contig_index;
@@ -285,6 +358,13 @@ struct ocfs2_insert_type {
285 int ins_tree_depth; 358 int ins_tree_depth;
286}; 359};
287 360
361struct ocfs2_merge_ctxt {
362 enum ocfs2_contig_type c_contig_type;
363 int c_has_empty_extent;
364 int c_split_covers_rec;
365 int c_used_tail_recs;
366};
367
288/* 368/*
289 * How many free extents have we got before we need more meta data? 369 * How many free extents have we got before we need more meta data?
290 */ 370 */
@@ -384,13 +464,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
384 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 464 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
385 eb->h_blkno = cpu_to_le64(first_blkno); 465 eb->h_blkno = cpu_to_le64(first_blkno);
386 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 466 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
387
388#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
389 /* we always use slot zero's suballocator */
390 eb->h_suballoc_slot = 0;
391#else
392 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 467 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
393#endif
394 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 468 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
395 eb->h_list.l_count = 469 eb->h_list.l_count =
396 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 470 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -461,7 +535,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
461 struct inode *inode, 535 struct inode *inode,
462 struct buffer_head *fe_bh, 536 struct buffer_head *fe_bh,
463 struct buffer_head *eb_bh, 537 struct buffer_head *eb_bh,
464 struct buffer_head *last_eb_bh, 538 struct buffer_head **last_eb_bh,
465 struct ocfs2_alloc_context *meta_ac) 539 struct ocfs2_alloc_context *meta_ac)
466{ 540{
467 int status, new_blocks, i; 541 int status, new_blocks, i;
@@ -476,7 +550,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
476 550
477 mlog_entry_void(); 551 mlog_entry_void();
478 552
479 BUG_ON(!last_eb_bh); 553 BUG_ON(!last_eb_bh || !*last_eb_bh);
480 554
481 fe = (struct ocfs2_dinode *) fe_bh->b_data; 555 fe = (struct ocfs2_dinode *) fe_bh->b_data;
482 556
@@ -507,7 +581,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
507 goto bail; 581 goto bail;
508 } 582 }
509 583
510 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; 584 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
511 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); 585 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
512 586
513 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 587 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -568,7 +642,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
568 * journal_dirty erroring as it won't unless we've aborted the 642 * journal_dirty erroring as it won't unless we've aborted the
569 * handle (in which case we would never be here) so reserving 643 * handle (in which case we would never be here) so reserving
570 * the write with journal_access is all we need to do. */ 644 * the write with journal_access is all we need to do. */
571 status = ocfs2_journal_access(handle, inode, last_eb_bh, 645 status = ocfs2_journal_access(handle, inode, *last_eb_bh,
572 OCFS2_JOURNAL_ACCESS_WRITE); 646 OCFS2_JOURNAL_ACCESS_WRITE);
573 if (status < 0) { 647 if (status < 0) {
574 mlog_errno(status); 648 mlog_errno(status);
@@ -601,10 +675,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
601 * next_leaf on the previously last-extent-block. */ 675 * next_leaf on the previously last-extent-block. */
602 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); 676 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
603 677
604 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 678 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
605 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 679 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
606 680
607 status = ocfs2_journal_dirty(handle, last_eb_bh); 681 status = ocfs2_journal_dirty(handle, *last_eb_bh);
608 if (status < 0) 682 if (status < 0)
609 mlog_errno(status); 683 mlog_errno(status);
610 status = ocfs2_journal_dirty(handle, fe_bh); 684 status = ocfs2_journal_dirty(handle, fe_bh);
@@ -616,6 +690,14 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
616 mlog_errno(status); 690 mlog_errno(status);
617 } 691 }
618 692
693 /*
694 * Some callers want to track the rightmost leaf so pass it
695 * back here.
696 */
697 brelse(*last_eb_bh);
698 get_bh(new_eb_bhs[0]);
699 *last_eb_bh = new_eb_bhs[0];
700
619 status = 0; 701 status = 0;
620bail: 702bail:
621 if (new_eb_bhs) { 703 if (new_eb_bhs) {
@@ -829,6 +911,87 @@ bail:
829} 911}
830 912
831/* 913/*
914 * Grow a b-tree so that it has more records.
915 *
916 * We might shift the tree depth in which case existing paths should
917 * be considered invalid.
918 *
919 * Tree depth after the grow is returned via *final_depth.
920 *
921 * *last_eb_bh will be updated by ocfs2_add_branch().
922 */
923static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
924 struct buffer_head *di_bh, int *final_depth,
925 struct buffer_head **last_eb_bh,
926 struct ocfs2_alloc_context *meta_ac)
927{
928 int ret, shift;
929 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
930 int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
931 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
932 struct buffer_head *bh = NULL;
933
934 BUG_ON(meta_ac == NULL);
935
936 shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
937 if (shift < 0) {
938 ret = shift;
939 mlog_errno(ret);
940 goto out;
941 }
942
943 /* We traveled all the way to the bottom of the allocation tree
944 * and didn't find room for any more extents - we need to add
945 * another tree level */
946 if (shift) {
947 BUG_ON(bh);
948 mlog(0, "need to shift tree depth (current = %d)\n", depth);
949
950 /* ocfs2_shift_tree_depth will return us a buffer with
951 * the new extent block (so we can pass that to
952 * ocfs2_add_branch). */
953 ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
954 meta_ac, &bh);
955 if (ret < 0) {
956 mlog_errno(ret);
957 goto out;
958 }
959 depth++;
960 if (depth == 1) {
961 /*
962 * Special case: we have room now if we shifted from
963 * tree_depth 0, so no more work needs to be done.
964 *
965 * We won't be calling add_branch, so pass
966 * back *last_eb_bh as the new leaf. At depth
967 * zero, it should always be null so there's
968 * no reason to brelse.
969 */
970 BUG_ON(*last_eb_bh);
971 get_bh(bh);
972 *last_eb_bh = bh;
973 goto out;
974 }
975 }
976
977 /* call ocfs2_add_branch to add the final part of the tree with
978 * the new data. */
979 mlog(0, "add branch. bh = %p\n", bh);
980 ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
981 meta_ac);
982 if (ret < 0) {
983 mlog_errno(ret);
984 goto out;
985 }
986
987out:
988 if (final_depth)
989 *final_depth = depth;
990 brelse(bh);
991 return ret;
992}
993
994/*
832 * This is only valid for leaf nodes, which are the only ones that can 995 * This is only valid for leaf nodes, which are the only ones that can
833 * have empty extents anyway. 996 * have empty extents anyway.
834 */ 997 */
@@ -934,6 +1097,22 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
934 1097
935} 1098}
936 1099
1100static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1101{
1102 int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1103
1104 BUG_ON(num_recs == 0);
1105
1106 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1107 num_recs--;
1108 size = num_recs * sizeof(struct ocfs2_extent_rec);
1109 memmove(&el->l_recs[0], &el->l_recs[1], size);
1110 memset(&el->l_recs[num_recs], 0,
1111 sizeof(struct ocfs2_extent_rec));
1112 el->l_next_free_rec = cpu_to_le16(num_recs);
1113 }
1114}
1115
937/* 1116/*
938 * Create an empty extent record. 1117 * Create an empty extent record.
939 * 1118 *
@@ -1211,6 +1390,10 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1211 * immediately to their right. 1390 * immediately to their right.
1212 */ 1391 */
1213 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); 1392 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1393 if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1394 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1395 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1396 }
1214 left_clusters -= le32_to_cpu(left_rec->e_cpos); 1397 left_clusters -= le32_to_cpu(left_rec->e_cpos);
1215 left_rec->e_int_clusters = cpu_to_le32(left_clusters); 1398 left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1216 1399
@@ -1531,10 +1714,16 @@ out:
1531 return ret; 1714 return ret;
1532} 1715}
1533 1716
1717/*
1718 * Extend the transaction by enough credits to complete the rotation,
1719 * and still leave at least the original number of credits allocated
1720 * to this transaction.
1721 */
1534static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, 1722static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1723 int op_credits,
1535 struct ocfs2_path *path) 1724 struct ocfs2_path *path)
1536{ 1725{
1537 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; 1726 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1538 1727
1539 if (handle->h_buffer_credits < credits) 1728 if (handle->h_buffer_credits < credits)
1540 return ocfs2_extend_trans(handle, credits); 1729 return ocfs2_extend_trans(handle, credits);
@@ -1568,6 +1757,29 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1568 return 0; 1757 return 0;
1569} 1758}
1570 1759
1760static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1761{
1762 int next_free = le16_to_cpu(el->l_next_free_rec);
1763 unsigned int range;
1764 struct ocfs2_extent_rec *rec;
1765
1766 if (next_free == 0)
1767 return 0;
1768
1769 rec = &el->l_recs[0];
1770 if (ocfs2_is_empty_extent(rec)) {
1771 /* Empty list. */
1772 if (next_free == 1)
1773 return 0;
1774 rec = &el->l_recs[1];
1775 }
1776
1777 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1778 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1779 return 1;
1780 return 0;
1781}
1782
1571/* 1783/*
1572 * Rotate all the records in a btree right one record, starting at insert_cpos. 1784 * Rotate all the records in a btree right one record, starting at insert_cpos.
1573 * 1785 *
@@ -1586,11 +1798,12 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1586 */ 1798 */
1587static int ocfs2_rotate_tree_right(struct inode *inode, 1799static int ocfs2_rotate_tree_right(struct inode *inode,
1588 handle_t *handle, 1800 handle_t *handle,
1801 enum ocfs2_split_type split,
1589 u32 insert_cpos, 1802 u32 insert_cpos,
1590 struct ocfs2_path *right_path, 1803 struct ocfs2_path *right_path,
1591 struct ocfs2_path **ret_left_path) 1804 struct ocfs2_path **ret_left_path)
1592{ 1805{
1593 int ret, start; 1806 int ret, start, orig_credits = handle->h_buffer_credits;
1594 u32 cpos; 1807 u32 cpos;
1595 struct ocfs2_path *left_path = NULL; 1808 struct ocfs2_path *left_path = NULL;
1596 1809
@@ -1657,9 +1870,9 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1657 (unsigned long long) 1870 (unsigned long long)
1658 path_leaf_bh(left_path)->b_blocknr); 1871 path_leaf_bh(left_path)->b_blocknr);
1659 1872
1660 if (ocfs2_rotate_requires_path_adjustment(left_path, 1873 if (split == SPLIT_NONE &&
1874 ocfs2_rotate_requires_path_adjustment(left_path,
1661 insert_cpos)) { 1875 insert_cpos)) {
1662 mlog(0, "Path adjustment required\n");
1663 1876
1664 /* 1877 /*
1665 * We've rotated the tree as much as we 1878 * We've rotated the tree as much as we
@@ -1687,7 +1900,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1687 right_path->p_tree_depth); 1900 right_path->p_tree_depth);
1688 1901
1689 ret = ocfs2_extend_rotate_transaction(handle, start, 1902 ret = ocfs2_extend_rotate_transaction(handle, start,
1690 right_path); 1903 orig_credits, right_path);
1691 if (ret) { 1904 if (ret) {
1692 mlog_errno(ret); 1905 mlog_errno(ret);
1693 goto out; 1906 goto out;
@@ -1700,6 +1913,24 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
1700 goto out; 1913 goto out;
1701 } 1914 }
1702 1915
1916 if (split != SPLIT_NONE &&
1917 ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
1918 insert_cpos)) {
1919 /*
1920 * A rotate moves the rightmost left leaf
1921 * record over to the leftmost right leaf
1922 * slot. If we're doing an extent split
1923 * instead of a real insert, then we have to
1924 * check that the extent to be split wasn't
1925 * just moved over. If it was, then we can
1926 * exit here, passing left_path back -
1927 * ocfs2_split_extent() is smart enough to
1928 * search both leaves.
1929 */
1930 *ret_left_path = left_path;
1931 goto out_ret_path;
1932 }
1933
1703 /* 1934 /*
1704 * There is no need to re-read the next right path 1935 * There is no need to re-read the next right path
1705 * as we know that it'll be our current left 1936 * as we know that it'll be our current left
@@ -1722,6 +1953,1031 @@ out_ret_path:
1722 return ret; 1953 return ret;
1723} 1954}
1724 1955
1956static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
1957 struct ocfs2_path *path)
1958{
1959 int i, idx;
1960 struct ocfs2_extent_rec *rec;
1961 struct ocfs2_extent_list *el;
1962 struct ocfs2_extent_block *eb;
1963 u32 range;
1964
1965 /* Path should always be rightmost. */
1966 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
1967 BUG_ON(eb->h_next_leaf_blk != 0ULL);
1968
1969 el = &eb->h_list;
1970 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1971 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1972 rec = &el->l_recs[idx];
1973 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1974
1975 for (i = 0; i < path->p_tree_depth; i++) {
1976 el = path->p_node[i].el;
1977 idx = le16_to_cpu(el->l_next_free_rec) - 1;
1978 rec = &el->l_recs[idx];
1979
1980 rec->e_int_clusters = cpu_to_le32(range);
1981 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
1982
1983 ocfs2_journal_dirty(handle, path->p_node[i].bh);
1984 }
1985}
1986
1987static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
1988 struct ocfs2_cached_dealloc_ctxt *dealloc,
1989 struct ocfs2_path *path, int unlink_start)
1990{
1991 int ret, i;
1992 struct ocfs2_extent_block *eb;
1993 struct ocfs2_extent_list *el;
1994 struct buffer_head *bh;
1995
1996 for(i = unlink_start; i < path_num_items(path); i++) {
1997 bh = path->p_node[i].bh;
1998
1999 eb = (struct ocfs2_extent_block *)bh->b_data;
2000 /*
2001 * Not all nodes might have had their final count
2002 * decremented by the caller - handle this here.
2003 */
2004 el = &eb->h_list;
2005 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2006 mlog(ML_ERROR,
2007 "Inode %llu, attempted to remove extent block "
2008 "%llu with %u records\n",
2009 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2010 (unsigned long long)le64_to_cpu(eb->h_blkno),
2011 le16_to_cpu(el->l_next_free_rec));
2012
2013 ocfs2_journal_dirty(handle, bh);
2014 ocfs2_remove_from_cache(inode, bh);
2015 continue;
2016 }
2017
2018 el->l_next_free_rec = 0;
2019 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2020
2021 ocfs2_journal_dirty(handle, bh);
2022
2023 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2024 if (ret)
2025 mlog_errno(ret);
2026
2027 ocfs2_remove_from_cache(inode, bh);
2028 }
2029}
2030
2031static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2032 struct ocfs2_path *left_path,
2033 struct ocfs2_path *right_path,
2034 int subtree_index,
2035 struct ocfs2_cached_dealloc_ctxt *dealloc)
2036{
2037 int i;
2038 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2039 struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2040 struct ocfs2_extent_list *el;
2041 struct ocfs2_extent_block *eb;
2042
2043 el = path_leaf_el(left_path);
2044
2045 eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2046
2047 for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2048 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2049 break;
2050
2051 BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2052
2053 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2054 le16_add_cpu(&root_el->l_next_free_rec, -1);
2055
2056 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2057 eb->h_next_leaf_blk = 0;
2058
2059 ocfs2_journal_dirty(handle, root_bh);
2060 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2061
2062 ocfs2_unlink_path(inode, handle, dealloc, right_path,
2063 subtree_index + 1);
2064}
2065
2066static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2067 struct ocfs2_path *left_path,
2068 struct ocfs2_path *right_path,
2069 int subtree_index,
2070 struct ocfs2_cached_dealloc_ctxt *dealloc,
2071 int *deleted)
2072{
2073 int ret, i, del_right_subtree = 0, right_has_empty = 0;
2074 struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
2075 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2076 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2077 struct ocfs2_extent_block *eb;
2078
2079 *deleted = 0;
2080
2081 right_leaf_el = path_leaf_el(right_path);
2082 left_leaf_el = path_leaf_el(left_path);
2083 root_bh = left_path->p_node[subtree_index].bh;
2084 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2085
2086 if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2087 return 0;
2088
2089 eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2090 if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2091 /*
2092 * It's legal for us to proceed if the right leaf is
2093 * the rightmost one and it has an empty extent. There
2094 * are two cases to handle - whether the leaf will be
2095 * empty after removal or not. If the leaf isn't empty
2096 * then just remove the empty extent up front. The
2097 * next block will handle empty leaves by flagging
2098 * them for unlink.
2099 *
2100 * Non rightmost leaves will throw -EAGAIN and the
2101 * caller can manually move the subtree and retry.
2102 */
2103
2104 if (eb->h_next_leaf_blk != 0ULL)
2105 return -EAGAIN;
2106
2107 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2108 ret = ocfs2_journal_access(handle, inode,
2109 path_leaf_bh(right_path),
2110 OCFS2_JOURNAL_ACCESS_WRITE);
2111 if (ret) {
2112 mlog_errno(ret);
2113 goto out;
2114 }
2115
2116 ocfs2_remove_empty_extent(right_leaf_el);
2117 } else
2118 right_has_empty = 1;
2119 }
2120
2121 if (eb->h_next_leaf_blk == 0ULL &&
2122 le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2123 /*
2124 * We have to update i_last_eb_blk during the meta
2125 * data delete.
2126 */
2127 ret = ocfs2_journal_access(handle, inode, di_bh,
2128 OCFS2_JOURNAL_ACCESS_WRITE);
2129 if (ret) {
2130 mlog_errno(ret);
2131 goto out;
2132 }
2133
2134 del_right_subtree = 1;
2135 }
2136
2137 /*
2138 * Getting here with an empty extent in the right path implies
2139 * that it's the rightmost path and will be deleted.
2140 */
2141 BUG_ON(right_has_empty && !del_right_subtree);
2142
2143 ret = ocfs2_journal_access(handle, inode, root_bh,
2144 OCFS2_JOURNAL_ACCESS_WRITE);
2145 if (ret) {
2146 mlog_errno(ret);
2147 goto out;
2148 }
2149
2150 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2151 ret = ocfs2_journal_access(handle, inode,
2152 right_path->p_node[i].bh,
2153 OCFS2_JOURNAL_ACCESS_WRITE);
2154 if (ret) {
2155 mlog_errno(ret);
2156 goto out;
2157 }
2158
2159 ret = ocfs2_journal_access(handle, inode,
2160 left_path->p_node[i].bh,
2161 OCFS2_JOURNAL_ACCESS_WRITE);
2162 if (ret) {
2163 mlog_errno(ret);
2164 goto out;
2165 }
2166 }
2167
2168 if (!right_has_empty) {
2169 /*
2170 * Only do this if we're moving a real
2171 * record. Otherwise, the action is delayed until
2172 * after removal of the right path in which case we
2173 * can do a simple shift to remove the empty extent.
2174 */
2175 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2176 memset(&right_leaf_el->l_recs[0], 0,
2177 sizeof(struct ocfs2_extent_rec));
2178 }
2179 if (eb->h_next_leaf_blk == 0ULL) {
2180 /*
2181 * Move recs over to get rid of empty extent, decrease
2182 * next_free. This is allowed to remove the last
2183 * extent in our leaf (setting l_next_free_rec to
2184 * zero) - the delete code below won't care.
2185 */
2186 ocfs2_remove_empty_extent(right_leaf_el);
2187 }
2188
2189 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2190 if (ret)
2191 mlog_errno(ret);
2192 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2193 if (ret)
2194 mlog_errno(ret);
2195
2196 if (del_right_subtree) {
2197 ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2198 subtree_index, dealloc);
2199 ocfs2_update_edge_lengths(inode, handle, left_path);
2200
2201 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2202 di->i_last_eb_blk = eb->h_blkno;
2203
2204 /*
2205 * Removal of the extent in the left leaf was skipped
2206 * above so we could delete the right path
2207 * 1st.
2208 */
2209 if (right_has_empty)
2210 ocfs2_remove_empty_extent(left_leaf_el);
2211
2212 ret = ocfs2_journal_dirty(handle, di_bh);
2213 if (ret)
2214 mlog_errno(ret);
2215
2216 *deleted = 1;
2217 } else
2218 ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2219 subtree_index);
2220
2221out:
2222 return ret;
2223}
2224
2225/*
2226 * Given a full path, determine what cpos value would return us a path
2227 * containing the leaf immediately to the right of the current one.
2228 *
2229 * Will return zero if the path passed in is already the rightmost path.
2230 *
2231 * This looks similar, but is subtly different to
2232 * ocfs2_find_cpos_for_left_leaf().
2233 */
2234static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2235 struct ocfs2_path *path, u32 *cpos)
2236{
2237 int i, j, ret = 0;
2238 u64 blkno;
2239 struct ocfs2_extent_list *el;
2240
2241 *cpos = 0;
2242
2243 if (path->p_tree_depth == 0)
2244 return 0;
2245
2246 blkno = path_leaf_bh(path)->b_blocknr;
2247
2248 /* Start at the tree node just above the leaf and work our way up. */
2249 i = path->p_tree_depth - 1;
2250 while (i >= 0) {
2251 int next_free;
2252
2253 el = path->p_node[i].el;
2254
2255 /*
2256 * Find the extent record just after the one in our
2257 * path.
2258 */
2259 next_free = le16_to_cpu(el->l_next_free_rec);
2260 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2261 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2262 if (j == (next_free - 1)) {
2263 if (i == 0) {
2264 /*
2265 * We've determined that the
2266 * path specified is already
2267 * the rightmost one - return a
2268 * cpos of zero.
2269 */
2270 goto out;
2271 }
2272 /*
2273 * The rightmost record points to our
2274 * leaf - we need to travel up the
2275 * tree one level.
2276 */
2277 goto next_node;
2278 }
2279
2280 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2281 goto out;
2282 }
2283 }
2284
2285 /*
2286 * If we got here, we never found a valid node where
2287 * the tree indicated one should be.
2288 */
2289 ocfs2_error(sb,
2290 "Invalid extent tree at extent block %llu\n",
2291 (unsigned long long)blkno);
2292 ret = -EROFS;
2293 goto out;
2294
2295next_node:
2296 blkno = path->p_node[i].bh->b_blocknr;
2297 i--;
2298 }
2299
2300out:
2301 return ret;
2302}
2303
2304static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2305 handle_t *handle,
2306 struct buffer_head *bh,
2307 struct ocfs2_extent_list *el)
2308{
2309 int ret;
2310
2311 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2312 return 0;
2313
2314 ret = ocfs2_journal_access(handle, inode, bh,
2315 OCFS2_JOURNAL_ACCESS_WRITE);
2316 if (ret) {
2317 mlog_errno(ret);
2318 goto out;
2319 }
2320
2321 ocfs2_remove_empty_extent(el);
2322
2323 ret = ocfs2_journal_dirty(handle, bh);
2324 if (ret)
2325 mlog_errno(ret);
2326
2327out:
2328 return ret;
2329}
2330
/*
 * One pass of left rotation, starting at 'path' and moving rightwards
 * one subtree at a time.
 *
 * The leaf of 'path' must begin with an empty extent (enforced with
 * BUG_ON). 'orig_credits' is handed to ocfs2_extend_rotate_transaction()
 * before every subtree rotation.
 *
 * Returns 0 when the pass finished, a negative error on failure, or
 * -EAGAIN when ocfs2_rotate_subtree_left() asked for a restart. In the
 * -EAGAIN case the right-hand path is handed to the caller through
 * *empty_extent_path and is not freed here; in every other case
 * *empty_extent_path is NULL. The temporary paths allocated here are
 * always released before returning.
 */
static int __ocfs2_rotate_tree_left(struct inode *inode,
				    handle_t *handle, int orig_credits,
				    struct ocfs2_path *path,
				    struct ocfs2_cached_dealloc_ctxt *dealloc,
				    struct ocfs2_path **empty_extent_path)
{
	int ret, subtree_root, deleted;
	u32 right_cpos;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_path *right_path = NULL;

	BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));

	*empty_extent_path = NULL;

	/* A zero right_cpos means 'path' is already the rightmost path. */
	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
					     &right_cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	left_path = ocfs2_new_path(path_root_bh(path),
				   path_root_el(path));
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* Work on a private copy so the caller's path is not consumed. */
	ocfs2_cp_path(left_path, path);

	right_path = ocfs2_new_path(path_root_bh(path),
				    path_root_el(path));
	if (!right_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	while (right_cpos) {
		ret = ocfs2_find_path(inode, right_path, right_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		subtree_root = ocfs2_find_subtree_root(inode, left_path,
						       right_path);

		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
		     subtree_root,
		     (unsigned long long)
		     right_path->p_node[subtree_root].bh->b_blocknr,
		     right_path->p_tree_depth);

		ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
						      orig_credits, left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
						right_path, subtree_root,
						dealloc, &deleted);
		if (ret == -EAGAIN) {
			/*
			 * The rotation has to temporarily stop due to
			 * the right subtree having an empty
			 * extent. Pass it back to the caller for a
			 * fixup.
			 *
			 * Ownership of right_path moves to the caller;
			 * clearing the local pointer keeps the cleanup
			 * at 'out' from freeing it.
			 */
			*empty_extent_path = right_path;
			right_path = NULL;
			goto out;
		}
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * The subtree rotate might have removed records on
		 * the rightmost edge. If so, then rotation is
		 * complete.
		 */
		if (deleted)
			break;

		/* The old right path becomes the left path of the next step. */
		ocfs2_mv_path(left_path, right_path);

		ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
						     &right_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	ocfs2_free_path(right_path);
	ocfs2_free_path(left_path);

	return ret;
}
2437
2438static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2439 struct ocfs2_path *path,
2440 struct ocfs2_cached_dealloc_ctxt *dealloc)
2441{
2442 int ret, subtree_index;
2443 u32 cpos;
2444 struct ocfs2_path *left_path = NULL;
2445 struct ocfs2_dinode *di;
2446 struct ocfs2_extent_block *eb;
2447 struct ocfs2_extent_list *el;
2448
2449 /*
2450 * XXX: This code assumes that the root is an inode, which is
2451 * true for now but may change as tree code gets generic.
2452 */
2453 di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
2454 if (!OCFS2_IS_VALID_DINODE(di)) {
2455 ret = -EIO;
2456 ocfs2_error(inode->i_sb,
2457 "Inode %llu has invalid path root",
2458 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2459 goto out;
2460 }
2461
2462 /*
2463 * There's two ways we handle this depending on
2464 * whether path is the only existing one.
2465 */
2466 ret = ocfs2_extend_rotate_transaction(handle, 0,
2467 handle->h_buffer_credits,
2468 path);
2469 if (ret) {
2470 mlog_errno(ret);
2471 goto out;
2472 }
2473
2474 ret = ocfs2_journal_access_path(inode, handle, path);
2475 if (ret) {
2476 mlog_errno(ret);
2477 goto out;
2478 }
2479
2480 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2481 if (ret) {
2482 mlog_errno(ret);
2483 goto out;
2484 }
2485
2486 if (cpos) {
2487 /*
2488 * We have a path to the left of this one - it needs
2489 * an update too.
2490 */
2491 left_path = ocfs2_new_path(path_root_bh(path),
2492 path_root_el(path));
2493 if (!left_path) {
2494 ret = -ENOMEM;
2495 mlog_errno(ret);
2496 goto out;
2497 }
2498
2499 ret = ocfs2_find_path(inode, left_path, cpos);
2500 if (ret) {
2501 mlog_errno(ret);
2502 goto out;
2503 }
2504
2505 ret = ocfs2_journal_access_path(inode, handle, left_path);
2506 if (ret) {
2507 mlog_errno(ret);
2508 goto out;
2509 }
2510
2511 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2512
2513 ocfs2_unlink_subtree(inode, handle, left_path, path,
2514 subtree_index, dealloc);
2515 ocfs2_update_edge_lengths(inode, handle, left_path);
2516
2517 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2518 di->i_last_eb_blk = eb->h_blkno;
2519 } else {
2520 /*
2521 * 'path' is also the leftmost path which
2522 * means it must be the only one. This gets
2523 * handled differently because we want to
2524 * revert the inode back to having extents
2525 * in-line.
2526 */
2527 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2528
2529 el = &di->id2.i_list;
2530 el->l_tree_depth = 0;
2531 el->l_next_free_rec = 0;
2532 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2533
2534 di->i_last_eb_blk = 0;
2535 }
2536
2537 ocfs2_journal_dirty(handle, path_root_bh(path));
2538
2539out:
2540 ocfs2_free_path(left_path);
2541 return ret;
2542}
2543
/*
 * Left rotation of btree records.
 *
 * In many ways, this is (unsurprisingly) the opposite of right
 * rotation. We start at some non-rightmost path containing an empty
 * extent in the leaf block. The code works its way to the rightmost
 * path by rotating records to the left in every subtree.
 *
 * This is used by any code which reduces the number of extent records
 * in a leaf. After removal, an empty record should be placed in the
 * leftmost list position.
 *
 * This won't handle a length update of the rightmost path records if
 * the rightmost tree leaf record is removed so the caller is
 * responsible for detecting and correcting that.
 *
 * Returns 0 immediately when the leaf of 'path' does not start with an
 * empty extent. Otherwise returns 0 on success or a negative error
 * code; -EIO is returned if the rightmost extent block claims to hold
 * no records at all. Blocks freed along the way are handed to
 * 'dealloc' by the helpers called here.
 */
static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
				  struct ocfs2_path *path,
				  struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	/* Remember the starting credits; every rotation pass gets them. */
	int ret, orig_credits = handle->h_buffer_credits;
	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;

	el = path_leaf_el(path);
	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
		return 0;

	if (path->p_tree_depth == 0) {
		/*
		 * This label is also the target for a rightmost leaf
		 * that keeps at least one record after the rotation.
		 */
rightmost_no_delete:
		/*
		 * In-inode extents. This is trivially handled, so do
		 * it up front.
		 */
		ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
						       path_leaf_bh(path),
						       path_leaf_el(path));
		if (ret)
			mlog_errno(ret);
		goto out;
	}

	/*
	 * Handle rightmost branch now. There's several cases:
	 *  1) simple rotation leaving records in there. That's trivial.
	 *  2) rotation requiring a branch delete - there's no more
	 *     records left. Two cases of this:
	 *     a) There are branches to the left.
	 *     b) This is also the leftmost (the only) branch.
	 *
	 *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
	 *  2a) we need the left branch so that we can update it with the unlink
	 *  2b) we need to bring the inode back to inline extents.
	 */

	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
	el = &eb->h_list;
	/* No next leaf block: this leaf is the rightmost one. */
	if (eb->h_next_leaf_blk == 0) {
		/*
		 * This gets a bit tricky if we're going to delete the
		 * rightmost path. Get the other cases out of the way
		 * 1st.
		 */
		if (le16_to_cpu(el->l_next_free_rec) > 1)
			goto rightmost_no_delete;

		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ret = -EIO;
			ocfs2_error(inode->i_sb,
				    "Inode %llu has empty extent block at %llu",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
				    (unsigned long long)le64_to_cpu(eb->h_blkno));
			goto out;
		}

		/*
		 * XXX: The caller can not trust "path" any more after
		 * this as it will have been deleted. What do we do?
		 *
		 * In theory the rotate-for-merge code will never get
		 * here because it'll always ask for a rotate in a
		 * nonempty list.
		 */

		/* Only the empty extent is left - remove the whole path. */
		ret = ocfs2_remove_rightmost_path(inode, handle, path,
						  dealloc);
		if (ret)
			mlog_errno(ret);
		goto out;
	}

	/*
	 * Now we can loop, remembering the path we get from -EAGAIN
	 * and restarting from there.
	 */
try_rotate:
	ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
				       dealloc, &restart_path);
	if (ret && ret != -EAGAIN) {
		mlog_errno(ret);
		goto out;
	}

	while (ret == -EAGAIN) {
		/*
		 * restart_path was handed to us by the callee. Rotate
		 * from it, and free it once that pass has returned.
		 */
		tmp_path = restart_path;
		restart_path = NULL;

		ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
					       tmp_path, dealloc,
					       &restart_path);
		if (ret && ret != -EAGAIN) {
			mlog_errno(ret);
			goto out;
		}

		ocfs2_free_path(tmp_path);
		tmp_path = NULL;

		/* The fixup pass finished - retry from the original path. */
		if (ret == 0)
			goto try_rotate;
	}

out:
	ocfs2_free_path(tmp_path);
	ocfs2_free_path(restart_path);
	return ret;
}
2672
2673static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2674 int index)
2675{
2676 struct ocfs2_extent_rec *rec = &el->l_recs[index];
2677 unsigned int size;
2678
2679 if (rec->e_leaf_clusters == 0) {
2680 /*
2681 * We consumed all of the merged-from record. An empty
2682 * extent cannot exist anywhere but the 1st array
2683 * position, so move things over if the merged-from
2684 * record doesn't occupy that position.
2685 *
2686 * This creates a new empty extent so the caller
2687 * should be smart enough to have removed any existing
2688 * ones.
2689 */
2690 if (index > 0) {
2691 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2692 size = index * sizeof(struct ocfs2_extent_rec);
2693 memmove(&el->l_recs[1], &el->l_recs[0], size);
2694 }
2695
2696 /*
2697 * Always memset - the caller doesn't check whether it
2698 * created an empty extent, so there could be junk in
2699 * the other fields.
2700 */
2701 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2702 }
2703}
2704
2705/*
2706 * Remove split_rec clusters from the record at index and merge them
2707 * onto the beginning of the record at index + 1.
2708 */
2709static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
2710 handle_t *handle,
2711 struct ocfs2_extent_rec *split_rec,
2712 struct ocfs2_extent_list *el, int index)
2713{
2714 int ret;
2715 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2716 struct ocfs2_extent_rec *left_rec;
2717 struct ocfs2_extent_rec *right_rec;
2718
2719 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
2720
2721 left_rec = &el->l_recs[index];
2722 right_rec = &el->l_recs[index + 1];
2723
2724 ret = ocfs2_journal_access(handle, inode, bh,
2725 OCFS2_JOURNAL_ACCESS_WRITE);
2726 if (ret) {
2727 mlog_errno(ret);
2728 goto out;
2729 }
2730
2731 le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
2732
2733 le32_add_cpu(&right_rec->e_cpos, -split_clusters);
2734 le64_add_cpu(&right_rec->e_blkno,
2735 -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2736 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
2737
2738 ocfs2_cleanup_merge(el, index);
2739
2740 ret = ocfs2_journal_dirty(handle, bh);
2741 if (ret)
2742 mlog_errno(ret);
2743
2744out:
2745 return ret;
2746}
2747
2748/*
2749 * Remove split_rec clusters from the record at index and merge them
2750 * onto the tail of the record at index - 1.
2751 */
2752static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
2753 handle_t *handle,
2754 struct ocfs2_extent_rec *split_rec,
2755 struct ocfs2_extent_list *el, int index)
2756{
2757 int ret, has_empty_extent = 0;
2758 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2759 struct ocfs2_extent_rec *left_rec;
2760 struct ocfs2_extent_rec *right_rec;
2761
2762 BUG_ON(index <= 0);
2763
2764 left_rec = &el->l_recs[index - 1];
2765 right_rec = &el->l_recs[index];
2766 if (ocfs2_is_empty_extent(&el->l_recs[0]))
2767 has_empty_extent = 1;
2768
2769 ret = ocfs2_journal_access(handle, inode, bh,
2770 OCFS2_JOURNAL_ACCESS_WRITE);
2771 if (ret) {
2772 mlog_errno(ret);
2773 goto out;
2774 }
2775
2776 if (has_empty_extent && index == 1) {
2777 /*
2778 * The easy case - we can just plop the record right in.
2779 */
2780 *left_rec = *split_rec;
2781
2782 has_empty_extent = 0;
2783 } else {
2784 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
2785 }
2786
2787 le32_add_cpu(&right_rec->e_cpos, split_clusters);
2788 le64_add_cpu(&right_rec->e_blkno,
2789 ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
2790 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
2791
2792 ocfs2_cleanup_merge(el, index);
2793
2794 ret = ocfs2_journal_dirty(handle, bh);
2795 if (ret)
2796 mlog_errno(ret);
2797
2798out:
2799 return ret;
2800}
2801
2802static int ocfs2_try_to_merge_extent(struct inode *inode,
2803 handle_t *handle,
2804 struct ocfs2_path *left_path,
2805 int split_index,
2806 struct ocfs2_extent_rec *split_rec,
2807 struct ocfs2_cached_dealloc_ctxt *dealloc,
2808 struct ocfs2_merge_ctxt *ctxt)
2809
2810{
2811 int ret = 0, delete_tail_recs = 0;
2812 struct ocfs2_extent_list *el = path_leaf_el(left_path);
2813 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
2814
2815 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
2816
2817 if (ctxt->c_split_covers_rec) {
2818 delete_tail_recs++;
2819
2820 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT ||
2821 ctxt->c_has_empty_extent)
2822 delete_tail_recs++;
2823
2824 if (ctxt->c_has_empty_extent) {
2825 /*
2826 * The merge code will need to create an empty
2827 * extent to take the place of the newly
2828 * emptied slot. Remove any pre-existing empty
2829 * extents - having more than one in a leaf is
2830 * illegal.
2831 */
2832 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2833 dealloc);
2834 if (ret) {
2835 mlog_errno(ret);
2836 goto out;
2837 }
2838 split_index--;
2839 rec = &el->l_recs[split_index];
2840 }
2841 }
2842
2843 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
2844 /*
2845 * Left-right contig implies this.
2846 */
2847 BUG_ON(!ctxt->c_split_covers_rec);
2848 BUG_ON(split_index == 0);
2849
2850 /*
2851 * Since the leftright insert always covers the entire
2852 * extent, this call will delete the insert record
2853 * entirely, resulting in an empty extent record added to
2854 * the extent block.
2855 *
2856 * Since the adding of an empty extent shifts
2857 * everything back to the right, there's no need to
2858 * update split_index here.
2859 */
2860 ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
2861 handle, split_rec, el, split_index);
2862 if (ret) {
2863 mlog_errno(ret);
2864 goto out;
2865 }
2866
2867 /*
2868 * We can only get this from logic error above.
2869 */
2870 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2871
2872 /*
2873 * The left merge left us with an empty extent, remove
2874 * it.
2875 */
2876 ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
2877 if (ret) {
2878 mlog_errno(ret);
2879 goto out;
2880 }
2881 split_index--;
2882 rec = &el->l_recs[split_index];
2883
2884 /*
2885 * Note that we don't pass split_rec here on purpose -
2886 * we've merged it into the left side.
2887 */
2888 ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
2889 handle, rec, el, split_index);
2890 if (ret) {
2891 mlog_errno(ret);
2892 goto out;
2893 }
2894
2895 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2896
2897 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2898 dealloc);
2899 /*
2900 * Error from this last rotate is not critical, so
2901 * print but don't bubble it up.
2902 */
2903 if (ret)
2904 mlog_errno(ret);
2905 ret = 0;
2906 } else {
2907 /*
2908 * Merge a record to the left or right.
2909 *
2910 * 'contig_type' is relative to the existing record,
2911 * so for example, if we're "right contig", it's to
2912 * the record on the left (hence the left merge).
2913 */
2914 if (ctxt->c_contig_type == CONTIG_RIGHT) {
2915 ret = ocfs2_merge_rec_left(inode,
2916 path_leaf_bh(left_path),
2917 handle, split_rec, el,
2918 split_index);
2919 if (ret) {
2920 mlog_errno(ret);
2921 goto out;
2922 }
2923 } else {
2924 ret = ocfs2_merge_rec_right(inode,
2925 path_leaf_bh(left_path),
2926 handle, split_rec, el,
2927 split_index);
2928 if (ret) {
2929 mlog_errno(ret);
2930 goto out;
2931 }
2932 }
2933
2934 if (ctxt->c_split_covers_rec) {
2935 /*
2936 * The merge may have left an empty extent in
2937 * our leaf. Try to rotate it away.
2938 */
2939 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2940 dealloc);
2941 if (ret)
2942 mlog_errno(ret);
2943 ret = 0;
2944 }
2945 }
2946
2947out:
2948 return ret;
2949}
2950
2951static void ocfs2_subtract_from_rec(struct super_block *sb,
2952 enum ocfs2_split_type split,
2953 struct ocfs2_extent_rec *rec,
2954 struct ocfs2_extent_rec *split_rec)
2955{
2956 u64 len_blocks;
2957
2958 len_blocks = ocfs2_clusters_to_blocks(sb,
2959 le16_to_cpu(split_rec->e_leaf_clusters));
2960
2961 if (split == SPLIT_LEFT) {
2962 /*
2963 * Region is on the left edge of the existing
2964 * record.
2965 */
2966 le32_add_cpu(&rec->e_cpos,
2967 le16_to_cpu(split_rec->e_leaf_clusters));
2968 le64_add_cpu(&rec->e_blkno, len_blocks);
2969 le16_add_cpu(&rec->e_leaf_clusters,
2970 -le16_to_cpu(split_rec->e_leaf_clusters));
2971 } else {
2972 /*
2973 * Region is on the right edge of the existing
2974 * record.
2975 */
2976 le16_add_cpu(&rec->e_leaf_clusters,
2977 -le16_to_cpu(split_rec->e_leaf_clusters));
2978 }
2979}
2980
1725/* 2981/*
1726 * Do the final bits of extent record insertion at the target leaf 2982 * Do the final bits of extent record insertion at the target leaf
1727 * list. If this leaf is part of an allocation tree, it is assumed 2983 * list. If this leaf is part of an allocation tree, it is assumed
@@ -1738,6 +2994,15 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1738 2994
1739 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 2995 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1740 2996
2997 if (insert->ins_split != SPLIT_NONE) {
2998 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
2999 BUG_ON(i == -1);
3000 rec = &el->l_recs[i];
3001 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3002 insert_rec);
3003 goto rotate;
3004 }
3005
1741 /* 3006 /*
1742 * Contiguous insert - either left or right. 3007 * Contiguous insert - either left or right.
1743 */ 3008 */
@@ -1792,6 +3057,7 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1792 return; 3057 return;
1793 } 3058 }
1794 3059
3060rotate:
1795 /* 3061 /*
1796 * Ok, we have to rotate. 3062 * Ok, we have to rotate.
1797 * 3063 *
@@ -1815,13 +3081,53 @@ static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1815 spin_unlock(&OCFS2_I(inode)->ip_lock); 3081 spin_unlock(&OCFS2_I(inode)->ip_lock);
1816} 3082}
1817 3083
3084static void ocfs2_adjust_rightmost_records(struct inode *inode,
3085 handle_t *handle,
3086 struct ocfs2_path *path,
3087 struct ocfs2_extent_rec *insert_rec)
3088{
3089 int ret, i, next_free;
3090 struct buffer_head *bh;
3091 struct ocfs2_extent_list *el;
3092 struct ocfs2_extent_rec *rec;
3093
3094 /*
3095 * Update everything except the leaf block.
3096 */
3097 for (i = 0; i < path->p_tree_depth; i++) {
3098 bh = path->p_node[i].bh;
3099 el = path->p_node[i].el;
3100
3101 next_free = le16_to_cpu(el->l_next_free_rec);
3102 if (next_free == 0) {
3103 ocfs2_error(inode->i_sb,
3104 "Dinode %llu has a bad extent list",
3105 (unsigned long long)OCFS2_I(inode)->ip_blkno);
3106 ret = -EIO;
3107 return;
3108 }
3109
3110 rec = &el->l_recs[next_free - 1];
3111
3112 rec->e_int_clusters = insert_rec->e_cpos;
3113 le32_add_cpu(&rec->e_int_clusters,
3114 le16_to_cpu(insert_rec->e_leaf_clusters));
3115 le32_add_cpu(&rec->e_int_clusters,
3116 -le32_to_cpu(rec->e_cpos));
3117
3118 ret = ocfs2_journal_dirty(handle, bh);
3119 if (ret)
3120 mlog_errno(ret);
3121
3122 }
3123}
3124
1818static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, 3125static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1819 struct ocfs2_extent_rec *insert_rec, 3126 struct ocfs2_extent_rec *insert_rec,
1820 struct ocfs2_path *right_path, 3127 struct ocfs2_path *right_path,
1821 struct ocfs2_path **ret_left_path) 3128 struct ocfs2_path **ret_left_path)
1822{ 3129{
1823 int ret, i, next_free; 3130 int ret, next_free;
1824 struct buffer_head *bh;
1825 struct ocfs2_extent_list *el; 3131 struct ocfs2_extent_list *el;
1826 struct ocfs2_path *left_path = NULL; 3132 struct ocfs2_path *left_path = NULL;
1827 3133
@@ -1887,40 +3193,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1887 goto out; 3193 goto out;
1888 } 3194 }
1889 3195
1890 el = path_root_el(right_path); 3196 ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
1891 bh = path_root_bh(right_path);
1892 i = 0;
1893 while (1) {
1894 struct ocfs2_extent_rec *rec;
1895
1896 next_free = le16_to_cpu(el->l_next_free_rec);
1897 if (next_free == 0) {
1898 ocfs2_error(inode->i_sb,
1899 "Dinode %llu has a bad extent list",
1900 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1901 ret = -EIO;
1902 goto out;
1903 }
1904
1905 rec = &el->l_recs[next_free - 1];
1906
1907 rec->e_int_clusters = insert_rec->e_cpos;
1908 le32_add_cpu(&rec->e_int_clusters,
1909 le16_to_cpu(insert_rec->e_leaf_clusters));
1910 le32_add_cpu(&rec->e_int_clusters,
1911 -le32_to_cpu(rec->e_cpos));
1912
1913 ret = ocfs2_journal_dirty(handle, bh);
1914 if (ret)
1915 mlog_errno(ret);
1916
1917 /* Don't touch the leaf node */
1918 if (++i >= right_path->p_tree_depth)
1919 break;
1920
1921 bh = right_path->p_node[i].bh;
1922 el = right_path->p_node[i].el;
1923 }
1924 3197
1925 *ret_left_path = left_path; 3198 *ret_left_path = left_path;
1926 ret = 0; 3199 ret = 0;
@@ -1931,6 +3204,83 @@ out:
1931 return ret; 3204 return ret;
1932} 3205}
1933 3206
3207static void ocfs2_split_record(struct inode *inode,
3208 struct ocfs2_path *left_path,
3209 struct ocfs2_path *right_path,
3210 struct ocfs2_extent_rec *split_rec,
3211 enum ocfs2_split_type split)
3212{
3213 int index;
3214 u32 cpos = le32_to_cpu(split_rec->e_cpos);
3215 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3216 struct ocfs2_extent_rec *rec, *tmprec;
3217
3218 right_el = path_leaf_el(right_path);;
3219 if (left_path)
3220 left_el = path_leaf_el(left_path);
3221
3222 el = right_el;
3223 insert_el = right_el;
3224 index = ocfs2_search_extent_list(el, cpos);
3225 if (index != -1) {
3226 if (index == 0 && left_path) {
3227 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3228
3229 /*
3230 * This typically means that the record
3231 * started in the left path but moved to the
3232 * right as a result of rotation. We either
3233 * move the existing record to the left, or we
3234 * do the later insert there.
3235 *
3236 * In this case, the left path should always
3237 * exist as the rotate code will have passed
3238 * it back for a post-insert update.
3239 */
3240
3241 if (split == SPLIT_LEFT) {
3242 /*
3243 * It's a left split. Since we know
3244 * that the rotate code gave us an
3245 * empty extent in the left path, we
3246 * can just do the insert there.
3247 */
3248 insert_el = left_el;
3249 } else {
3250 /*
3251 * Right split - we have to move the
3252 * existing record over to the left
3253 * leaf. The insert will be into the
3254 * newly created empty extent in the
3255 * right leaf.
3256 */
3257 tmprec = &right_el->l_recs[index];
3258 ocfs2_rotate_leaf(left_el, tmprec);
3259 el = left_el;
3260
3261 memset(tmprec, 0, sizeof(*tmprec));
3262 index = ocfs2_search_extent_list(left_el, cpos);
3263 BUG_ON(index == -1);
3264 }
3265 }
3266 } else {
3267 BUG_ON(!left_path);
3268 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3269 /*
3270 * Left path is easy - we can just allow the insert to
3271 * happen.
3272 */
3273 el = left_el;
3274 insert_el = left_el;
3275 index = ocfs2_search_extent_list(el, cpos);
3276 BUG_ON(index == -1);
3277 }
3278
3279 rec = &el->l_recs[index];
3280 ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3281 ocfs2_rotate_leaf(insert_el, split_rec);
3282}
3283
1934/* 3284/*
1935 * This function only does inserts on an allocation b-tree. For dinode 3285 * This function only does inserts on an allocation b-tree. For dinode
1936 * lists, ocfs2_insert_at_leaf() is called directly. 3286 * lists, ocfs2_insert_at_leaf() is called directly.
@@ -1948,7 +3298,6 @@ static int ocfs2_insert_path(struct inode *inode,
1948{ 3298{
1949 int ret, subtree_index; 3299 int ret, subtree_index;
1950 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 3300 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1951 struct ocfs2_extent_list *el;
1952 3301
1953 /* 3302 /*
1954 * Pass both paths to the journal. The majority of inserts 3303 * Pass both paths to the journal. The majority of inserts
@@ -1984,9 +3333,18 @@ static int ocfs2_insert_path(struct inode *inode,
1984 } 3333 }
1985 } 3334 }
1986 3335
1987 el = path_leaf_el(right_path); 3336 if (insert->ins_split != SPLIT_NONE) {
3337 /*
3338 * We could call ocfs2_insert_at_leaf() for some types
3339 * of splits, but it's easier to just let one separate
3340 * function sort it all out.
3341 */
3342 ocfs2_split_record(inode, left_path, right_path,
3343 insert_rec, insert->ins_split);
3344 } else
3345 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3346 insert, inode);
1988 3347
1989 ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1990 ret = ocfs2_journal_dirty(handle, leaf_bh); 3348 ret = ocfs2_journal_dirty(handle, leaf_bh);
1991 if (ret) 3349 if (ret)
1992 mlog_errno(ret); 3350 mlog_errno(ret);
@@ -2075,7 +3433,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2075 * can wind up skipping both of these two special cases... 3433 * can wind up skipping both of these two special cases...
2076 */ 3434 */
2077 if (rotate) { 3435 if (rotate) {
2078 ret = ocfs2_rotate_tree_right(inode, handle, 3436 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
2079 le32_to_cpu(insert_rec->e_cpos), 3437 le32_to_cpu(insert_rec->e_cpos),
2080 right_path, &left_path); 3438 right_path, &left_path);
2081 if (ret) { 3439 if (ret) {
@@ -2100,8 +3458,9 @@ static int ocfs2_do_insert_extent(struct inode *inode,
2100 } 3458 }
2101 3459
2102out_update_clusters: 3460out_update_clusters:
2103 ocfs2_update_dinode_clusters(inode, di, 3461 if (type->ins_split == SPLIT_NONE)
2104 le16_to_cpu(insert_rec->e_leaf_clusters)); 3462 ocfs2_update_dinode_clusters(inode, di,
3463 le16_to_cpu(insert_rec->e_leaf_clusters));
2105 3464
2106 ret = ocfs2_journal_dirty(handle, di_bh); 3465 ret = ocfs2_journal_dirty(handle, di_bh);
2107 if (ret) 3466 if (ret)
@@ -2114,6 +3473,44 @@ out:
2114 return ret; 3473 return ret;
2115} 3474}
2116 3475
3476static enum ocfs2_contig_type
3477ocfs2_figure_merge_contig_type(struct inode *inode,
3478 struct ocfs2_extent_list *el, int index,
3479 struct ocfs2_extent_rec *split_rec)
3480{
3481 struct ocfs2_extent_rec *rec;
3482 enum ocfs2_contig_type ret = CONTIG_NONE;
3483
3484 /*
3485 * We're careful to check for an empty extent record here -
3486 * the merge code will know what to do if it sees one.
3487 */
3488
3489 if (index > 0) {
3490 rec = &el->l_recs[index - 1];
3491 if (index == 1 && ocfs2_is_empty_extent(rec)) {
3492 if (split_rec->e_cpos == el->l_recs[index].e_cpos)
3493 ret = CONTIG_RIGHT;
3494 } else {
3495 ret = ocfs2_extent_contig(inode, rec, split_rec);
3496 }
3497 }
3498
3499 if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
3500 enum ocfs2_contig_type contig_type;
3501
3502 rec = &el->l_recs[index + 1];
3503 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
3504
3505 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
3506 ret = CONTIG_LEFTRIGHT;
3507 else if (ret == CONTIG_NONE)
3508 ret = contig_type;
3509 }
3510
3511 return ret;
3512}
3513
2117static void ocfs2_figure_contig_type(struct inode *inode, 3514static void ocfs2_figure_contig_type(struct inode *inode,
2118 struct ocfs2_insert_type *insert, 3515 struct ocfs2_insert_type *insert,
2119 struct ocfs2_extent_list *el, 3516 struct ocfs2_extent_list *el,
@@ -2205,6 +3602,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
2205 struct ocfs2_path *path = NULL; 3602 struct ocfs2_path *path = NULL;
2206 struct buffer_head *bh = NULL; 3603 struct buffer_head *bh = NULL;
2207 3604
3605 insert->ins_split = SPLIT_NONE;
3606
2208 el = &di->id2.i_list; 3607 el = &di->id2.i_list;
2209 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 3608 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2210 3609
@@ -2327,9 +3726,10 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2327 u32 cpos, 3726 u32 cpos,
2328 u64 start_blk, 3727 u64 start_blk,
2329 u32 new_clusters, 3728 u32 new_clusters,
3729 u8 flags,
2330 struct ocfs2_alloc_context *meta_ac) 3730 struct ocfs2_alloc_context *meta_ac)
2331{ 3731{
2332 int status, shift; 3732 int status;
2333 struct buffer_head *last_eb_bh = NULL; 3733 struct buffer_head *last_eb_bh = NULL;
2334 struct buffer_head *bh = NULL; 3734 struct buffer_head *bh = NULL;
2335 struct ocfs2_insert_type insert = {0, }; 3735 struct ocfs2_insert_type insert = {0, };
@@ -2350,6 +3750,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2350 rec.e_cpos = cpu_to_le32(cpos); 3750 rec.e_cpos = cpu_to_le32(cpos);
2351 rec.e_blkno = cpu_to_le64(start_blk); 3751 rec.e_blkno = cpu_to_le64(start_blk);
2352 rec.e_leaf_clusters = cpu_to_le16(new_clusters); 3752 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
3753 rec.e_flags = flags;
2353 3754
2354 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, 3755 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2355 &insert); 3756 &insert);
@@ -2364,55 +3765,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
2364 insert.ins_appending, insert.ins_contig, insert.ins_contig_index, 3765 insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
2365 insert.ins_free_records, insert.ins_tree_depth); 3766 insert.ins_free_records, insert.ins_tree_depth);
2366 3767
2367 /* 3768 if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) {
2368 * Avoid growing the tree unless we're out of records and the 3769 status = ocfs2_grow_tree(inode, handle, fe_bh,
2369 * insert type requres one. 3770 &insert.ins_tree_depth, &last_eb_bh,
2370 */ 3771 meta_ac);
2371 if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records) 3772 if (status) {
2372 goto out_add;
2373
2374 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
2375 if (shift < 0) {
2376 status = shift;
2377 mlog_errno(status);
2378 goto bail;
2379 }
2380
2381 /* We traveled all the way to the bottom of the allocation tree
2382 * and didn't find room for any more extents - we need to add
2383 * another tree level */
2384 if (shift) {
2385 BUG_ON(bh);
2386 mlog(0, "need to shift tree depth "
2387 "(current = %d)\n", insert.ins_tree_depth);
2388
2389 /* ocfs2_shift_tree_depth will return us a buffer with
2390 * the new extent block (so we can pass that to
2391 * ocfs2_add_branch). */
2392 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
2393 meta_ac, &bh);
2394 if (status < 0) {
2395 mlog_errno(status); 3773 mlog_errno(status);
2396 goto bail; 3774 goto bail;
2397 } 3775 }
2398 insert.ins_tree_depth++;
2399 /* Special case: we have room now if we shifted from
2400 * tree_depth 0 */
2401 if (insert.ins_tree_depth == 1)
2402 goto out_add;
2403 }
2404
2405 /* call ocfs2_add_branch to add the final part of the tree with
2406 * the new data. */
2407 mlog(0, "add branch. bh = %p\n", bh);
2408 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
2409 meta_ac);
2410 if (status < 0) {
2411 mlog_errno(status);
2412 goto bail;
2413 } 3776 }
2414 3777
2415out_add:
2416 /* Finally, we can add clusters. This might rotate the tree for us. */ 3778 /* Finally, we can add clusters. This might rotate the tree for us. */
2417 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); 3779 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
2418 if (status < 0) 3780 if (status < 0)
@@ -2431,7 +3793,720 @@ bail:
2431 return status; 3793 return status;
2432} 3794}
2433 3795
2434static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 3796static void ocfs2_make_right_split_rec(struct super_block *sb,
3797 struct ocfs2_extent_rec *split_rec,
3798 u32 cpos,
3799 struct ocfs2_extent_rec *rec)
3800{
3801 u32 rec_cpos = le32_to_cpu(rec->e_cpos);
3802 u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
3803
3804 memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
3805
3806 split_rec->e_cpos = cpu_to_le32(cpos);
3807 split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
3808
3809 split_rec->e_blkno = rec->e_blkno;
3810 le64_add_cpu(&split_rec->e_blkno,
3811 ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
3812
3813 split_rec->e_flags = rec->e_flags;
3814}
3815
3816static int ocfs2_split_and_insert(struct inode *inode,
3817 handle_t *handle,
3818 struct ocfs2_path *path,
3819 struct buffer_head *di_bh,
3820 struct buffer_head **last_eb_bh,
3821 int split_index,
3822 struct ocfs2_extent_rec *orig_split_rec,
3823 struct ocfs2_alloc_context *meta_ac)
3824{
3825 int ret = 0, depth;
3826 unsigned int insert_range, rec_range, do_leftright = 0;
3827 struct ocfs2_extent_rec tmprec;
3828 struct ocfs2_extent_list *rightmost_el;
3829 struct ocfs2_extent_rec rec;
3830 struct ocfs2_extent_rec split_rec = *orig_split_rec;
3831 struct ocfs2_insert_type insert;
3832 struct ocfs2_extent_block *eb;
3833 struct ocfs2_dinode *di;
3834
3835leftright:
3836 /*
3837 * Store a copy of the record on the stack - it might move
3838 * around as the tree is manipulated below.
3839 */
3840 rec = path_leaf_el(path)->l_recs[split_index];
3841
3842 di = (struct ocfs2_dinode *)di_bh->b_data;
3843 rightmost_el = &di->id2.i_list;
3844
3845 depth = le16_to_cpu(rightmost_el->l_tree_depth);
3846 if (depth) {
3847 BUG_ON(!(*last_eb_bh));
3848 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
3849 rightmost_el = &eb->h_list;
3850 }
3851
3852 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
3853 le16_to_cpu(rightmost_el->l_count)) {
3854 int old_depth = depth;
3855
3856 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
3857 meta_ac);
3858 if (ret) {
3859 mlog_errno(ret);
3860 goto out;
3861 }
3862
3863 if (old_depth != depth) {
3864 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
3865 rightmost_el = &eb->h_list;
3866 }
3867 }
3868
3869 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
3870 insert.ins_appending = APPEND_NONE;
3871 insert.ins_contig = CONTIG_NONE;
3872 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
3873 - le16_to_cpu(rightmost_el->l_next_free_rec);
3874 insert.ins_tree_depth = depth;
3875
3876 insert_range = le32_to_cpu(split_rec.e_cpos) +
3877 le16_to_cpu(split_rec.e_leaf_clusters);
3878 rec_range = le32_to_cpu(rec.e_cpos) +
3879 le16_to_cpu(rec.e_leaf_clusters);
3880
3881 if (split_rec.e_cpos == rec.e_cpos) {
3882 insert.ins_split = SPLIT_LEFT;
3883 } else if (insert_range == rec_range) {
3884 insert.ins_split = SPLIT_RIGHT;
3885 } else {
3886 /*
3887 * Left/right split. We fake this as a right split
3888 * first and then make a second pass as a left split.
3889 */
3890 insert.ins_split = SPLIT_RIGHT;
3891
3892 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
3893 &rec);
3894
3895 split_rec = tmprec;
3896
3897 BUG_ON(do_leftright);
3898 do_leftright = 1;
3899 }
3900
3901 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
3902 &insert);
3903 if (ret) {
3904 mlog_errno(ret);
3905 goto out;
3906 }
3907
3908 if (do_leftright == 1) {
3909 u32 cpos;
3910 struct ocfs2_extent_list *el;
3911
3912 do_leftright++;
3913 split_rec = *orig_split_rec;
3914
3915 ocfs2_reinit_path(path, 1);
3916
3917 cpos = le32_to_cpu(split_rec.e_cpos);
3918 ret = ocfs2_find_path(inode, path, cpos);
3919 if (ret) {
3920 mlog_errno(ret);
3921 goto out;
3922 }
3923
3924 el = path_leaf_el(path);
3925 split_index = ocfs2_search_extent_list(el, cpos);
3926 goto leftright;
3927 }
3928out:
3929
3930 return ret;
3931}
3932
3933/*
3934 * Mark part or all of the extent record at split_index in the leaf
3935 * pointed to by path as written. This removes the unwritten
3936 * extent flag.
3937 *
3938 * Care is taken to handle contiguousness so as to not grow the tree.
3939 *
3940 * meta_ac is not strictly necessary - we only truly need it if growth
3941 * of the tree is required. All other cases will degrade into a less
3942 * optimal tree layout.
3943 *
3944 * last_eb_bh should be the rightmost leaf block for any inode with a
3945 * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
3946 *
3947 * This code is optimized for readability - several passes might be
3948 * made over certain portions of the tree. All of those blocks will
3949 * have been brought into cache (and pinned via the journal), so the
3950 * extra overhead is not expressed in terms of disk reads.
3951 */
3952static int __ocfs2_mark_extent_written(struct inode *inode,
3953 struct buffer_head *di_bh,
3954 handle_t *handle,
3955 struct ocfs2_path *path,
3956 int split_index,
3957 struct ocfs2_extent_rec *split_rec,
3958 struct ocfs2_alloc_context *meta_ac,
3959 struct ocfs2_cached_dealloc_ctxt *dealloc)
3960{
3961 int ret = 0;
3962 struct ocfs2_extent_list *el = path_leaf_el(path);
3963 struct buffer_head *eb_bh, *last_eb_bh = NULL;
3964 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3965 struct ocfs2_merge_ctxt ctxt;
3966 struct ocfs2_extent_list *rightmost_el;
3967
3968 if (!rec->e_flags & OCFS2_EXT_UNWRITTEN) {
3969 ret = -EIO;
3970 mlog_errno(ret);
3971 goto out;
3972 }
3973
3974 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
3975 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
3976 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
3977 ret = -EIO;
3978 mlog_errno(ret);
3979 goto out;
3980 }
3981
3982 eb_bh = path_leaf_bh(path);
3983 ret = ocfs2_journal_access(handle, inode, eb_bh,
3984 OCFS2_JOURNAL_ACCESS_WRITE);
3985 if (ret) {
3986 mlog_errno(ret);
3987 goto out;
3988 }
3989
3990 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
3991 split_index,
3992 split_rec);
3993
3994 /*
3995 * The core merge / split code wants to know how much room is
3996 * left in this inodes allocation tree, so we pass the
3997 * rightmost extent list.
3998 */
3999 if (path->p_tree_depth) {
4000 struct ocfs2_extent_block *eb;
4001 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4002
4003 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4004 le64_to_cpu(di->i_last_eb_blk),
4005 &last_eb_bh, OCFS2_BH_CACHED, inode);
4006 if (ret) {
4007 mlog_exit(ret);
4008 goto out;
4009 }
4010
4011 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4012 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4013 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4014 ret = -EROFS;
4015 goto out;
4016 }
4017
4018 rightmost_el = &eb->h_list;
4019 } else
4020 rightmost_el = path_root_el(path);
4021
4022 ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec);
4023 if (ctxt.c_used_tail_recs > 0 &&
4024 ocfs2_is_empty_extent(&rightmost_el->l_recs[0]))
4025 ctxt.c_used_tail_recs--;
4026
4027 if (rec->e_cpos == split_rec->e_cpos &&
4028 rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4029 ctxt.c_split_covers_rec = 1;
4030 else
4031 ctxt.c_split_covers_rec = 0;
4032
4033 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4034
4035 mlog(0, "index: %d, contig: %u, used_tail_recs: %u, "
4036 "has_empty: %u, split_covers: %u\n", split_index,
4037 ctxt.c_contig_type, ctxt.c_used_tail_recs,
4038 ctxt.c_has_empty_extent, ctxt.c_split_covers_rec);
4039
4040 if (ctxt.c_contig_type == CONTIG_NONE) {
4041 if (ctxt.c_split_covers_rec)
4042 el->l_recs[split_index] = *split_rec;
4043 else
4044 ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
4045 &last_eb_bh, split_index,
4046 split_rec, meta_ac);
4047 if (ret)
4048 mlog_errno(ret);
4049 } else {
4050 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4051 split_index, split_rec,
4052 dealloc, &ctxt);
4053 if (ret)
4054 mlog_errno(ret);
4055 }
4056
4057 ocfs2_journal_dirty(handle, eb_bh);
4058
4059out:
4060 brelse(last_eb_bh);
4061 return ret;
4062}
4063
4064/*
4065 * Mark the already-existing extent at cpos as written for len clusters.
4066 *
4067 * If the existing extent is larger than the request, initiate a
4068 * split. An attempt will be made at merging with adjacent extents.
4069 *
4070 * The caller is responsible for passing down meta_ac if we'll need it.
4071 */
4072int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4073 handle_t *handle, u32 cpos, u32 len, u32 phys,
4074 struct ocfs2_alloc_context *meta_ac,
4075 struct ocfs2_cached_dealloc_ctxt *dealloc)
4076{
4077 int ret, index;
4078 u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4079 struct ocfs2_extent_rec split_rec;
4080 struct ocfs2_path *left_path = NULL;
4081 struct ocfs2_extent_list *el;
4082
4083 mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4084 inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4085
4086 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4087 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4088 "that are being written to, but the feature bit "
4089 "is not set in the super block.",
4090 (unsigned long long)OCFS2_I(inode)->ip_blkno);
4091 ret = -EROFS;
4092 goto out;
4093 }
4094
4095 /*
4096 * XXX: This should be fixed up so that we just re-insert the
4097 * next extent records.
4098 */
4099 ocfs2_extent_map_trunc(inode, 0);
4100
4101 left_path = ocfs2_new_inode_path(di_bh);
4102 if (!left_path) {
4103 ret = -ENOMEM;
4104 mlog_errno(ret);
4105 goto out;
4106 }
4107
4108 ret = ocfs2_find_path(inode, left_path, cpos);
4109 if (ret) {
4110 mlog_errno(ret);
4111 goto out;
4112 }
4113 el = path_leaf_el(left_path);
4114
4115 index = ocfs2_search_extent_list(el, cpos);
4116 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4117 ocfs2_error(inode->i_sb,
4118 "Inode %llu has an extent at cpos %u which can no "
4119 "longer be found.\n",
4120 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4121 ret = -EROFS;
4122 goto out;
4123 }
4124
4125 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4126 split_rec.e_cpos = cpu_to_le32(cpos);
4127 split_rec.e_leaf_clusters = cpu_to_le16(len);
4128 split_rec.e_blkno = cpu_to_le64(start_blkno);
4129 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4130 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4131
4132 ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
4133 index, &split_rec, meta_ac, dealloc);
4134 if (ret)
4135 mlog_errno(ret);
4136
4137out:
4138 ocfs2_free_path(left_path);
4139 return ret;
4140}
4141
4142static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4143 handle_t *handle, struct ocfs2_path *path,
4144 int index, u32 new_range,
4145 struct ocfs2_alloc_context *meta_ac)
4146{
4147 int ret, depth, credits = handle->h_buffer_credits;
4148 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4149 struct buffer_head *last_eb_bh = NULL;
4150 struct ocfs2_extent_block *eb;
4151 struct ocfs2_extent_list *rightmost_el, *el;
4152 struct ocfs2_extent_rec split_rec;
4153 struct ocfs2_extent_rec *rec;
4154 struct ocfs2_insert_type insert;
4155
4156 /*
4157 * Setup the record to split before we grow the tree.
4158 */
4159 el = path_leaf_el(path);
4160 rec = &el->l_recs[index];
4161 ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
4162
4163 depth = path->p_tree_depth;
4164 if (depth > 0) {
4165 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4166 le64_to_cpu(di->i_last_eb_blk),
4167 &last_eb_bh, OCFS2_BH_CACHED, inode);
4168 if (ret < 0) {
4169 mlog_errno(ret);
4170 goto out;
4171 }
4172
4173 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4174 rightmost_el = &eb->h_list;
4175 } else
4176 rightmost_el = path_leaf_el(path);
4177
4178 credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
4179 ret = ocfs2_extend_trans(handle, credits);
4180 if (ret) {
4181 mlog_errno(ret);
4182 goto out;
4183 }
4184
4185 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4186 le16_to_cpu(rightmost_el->l_count)) {
4187 int old_depth = depth;
4188
4189 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
4190 meta_ac);
4191 if (ret) {
4192 mlog_errno(ret);
4193 goto out;
4194 }
4195
4196 if (old_depth != depth) {
4197 eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
4198 rightmost_el = &eb->h_list;
4199 }
4200 }
4201
4202 memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4203 insert.ins_appending = APPEND_NONE;
4204 insert.ins_contig = CONTIG_NONE;
4205 insert.ins_split = SPLIT_RIGHT;
4206 insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
4207 - le16_to_cpu(rightmost_el->l_next_free_rec);
4208 insert.ins_tree_depth = depth;
4209
4210 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
4211 if (ret)
4212 mlog_errno(ret);
4213
4214out:
4215 brelse(last_eb_bh);
4216 return ret;
4217}
4218
4219static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4220 struct ocfs2_path *path, int index,
4221 struct ocfs2_cached_dealloc_ctxt *dealloc,
4222 u32 cpos, u32 len)
4223{
4224 int ret;
4225 u32 left_cpos, rec_range, trunc_range;
4226 int wants_rotate = 0, is_rightmost_tree_rec = 0;
4227 struct super_block *sb = inode->i_sb;
4228 struct ocfs2_path *left_path = NULL;
4229 struct ocfs2_extent_list *el = path_leaf_el(path);
4230 struct ocfs2_extent_rec *rec;
4231 struct ocfs2_extent_block *eb;
4232
4233 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4234 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4235 if (ret) {
4236 mlog_errno(ret);
4237 goto out;
4238 }
4239
4240 index--;
4241 }
4242
4243 if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
4244 path->p_tree_depth) {
4245 /*
4246 * Check whether this is the rightmost tree record. If
4247 * we remove all of this record or part of its right
4248 * edge then an update of the record lengths above it
4249 * will be required.
4250 */
4251 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
4252 if (eb->h_next_leaf_blk == 0)
4253 is_rightmost_tree_rec = 1;
4254 }
4255
4256 rec = &el->l_recs[index];
4257 if (index == 0 && path->p_tree_depth &&
4258 le32_to_cpu(rec->e_cpos) == cpos) {
4259 /*
4260 * Changing the leftmost offset (via partial or whole
4261 * record truncate) of an interior (or rightmost) path
4262 * means we have to update the subtree that is formed
4263 * by this leaf and the one to it's left.
4264 *
4265 * There are two cases we can skip:
4266 * 1) Path is the leftmost one in our inode tree.
4267 * 2) The leaf is rightmost and will be empty after
4268 * we remove the extent record - the rotate code
4269 * knows how to update the newly formed edge.
4270 */
4271
4272 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
4273 &left_cpos);
4274 if (ret) {
4275 mlog_errno(ret);
4276 goto out;
4277 }
4278
4279 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
4280 left_path = ocfs2_new_path(path_root_bh(path),
4281 path_root_el(path));
4282 if (!left_path) {
4283 ret = -ENOMEM;
4284 mlog_errno(ret);
4285 goto out;
4286 }
4287
4288 ret = ocfs2_find_path(inode, left_path, left_cpos);
4289 if (ret) {
4290 mlog_errno(ret);
4291 goto out;
4292 }
4293 }
4294 }
4295
4296 ret = ocfs2_extend_rotate_transaction(handle, 0,
4297 handle->h_buffer_credits,
4298 path);
4299 if (ret) {
4300 mlog_errno(ret);
4301 goto out;
4302 }
4303
4304 ret = ocfs2_journal_access_path(inode, handle, path);
4305 if (ret) {
4306 mlog_errno(ret);
4307 goto out;
4308 }
4309
4310 ret = ocfs2_journal_access_path(inode, handle, left_path);
4311 if (ret) {
4312 mlog_errno(ret);
4313 goto out;
4314 }
4315
4316 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4317 trunc_range = cpos + len;
4318
4319 if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
4320 int next_free;
4321
4322 memset(rec, 0, sizeof(*rec));
4323 ocfs2_cleanup_merge(el, index);
4324 wants_rotate = 1;
4325
4326 next_free = le16_to_cpu(el->l_next_free_rec);
4327 if (is_rightmost_tree_rec && next_free > 1) {
4328 /*
4329 * We skip the edge update if this path will
4330 * be deleted by the rotate code.
4331 */
4332 rec = &el->l_recs[next_free - 1];
4333 ocfs2_adjust_rightmost_records(inode, handle, path,
4334 rec);
4335 }
4336 } else if (le32_to_cpu(rec->e_cpos) == cpos) {
4337 /* Remove leftmost portion of the record. */
4338 le32_add_cpu(&rec->e_cpos, len);
4339 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
4340 le16_add_cpu(&rec->e_leaf_clusters, -len);
4341 } else if (rec_range == trunc_range) {
4342 /* Remove rightmost portion of the record */
4343 le16_add_cpu(&rec->e_leaf_clusters, -len);
4344 if (is_rightmost_tree_rec)
4345 ocfs2_adjust_rightmost_records(inode, handle, path, rec);
4346 } else {
4347 /* Caller should have trapped this. */
4348 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
4349 "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
4350 le32_to_cpu(rec->e_cpos),
4351 le16_to_cpu(rec->e_leaf_clusters), cpos, len);
4352 BUG();
4353 }
4354
4355 if (left_path) {
4356 int subtree_index;
4357
4358 subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
4359 ocfs2_complete_edge_insert(inode, handle, left_path, path,
4360 subtree_index);
4361 }
4362
4363 ocfs2_journal_dirty(handle, path_leaf_bh(path));
4364
4365 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
4366 if (ret) {
4367 mlog_errno(ret);
4368 goto out;
4369 }
4370
4371out:
4372 ocfs2_free_path(left_path);
4373 return ret;
4374}
4375
4376int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4377 u32 cpos, u32 len, handle_t *handle,
4378 struct ocfs2_alloc_context *meta_ac,
4379 struct ocfs2_cached_dealloc_ctxt *dealloc)
4380{
4381 int ret, index;
4382 u32 rec_range, trunc_range;
4383 struct ocfs2_extent_rec *rec;
4384 struct ocfs2_extent_list *el;
4385 struct ocfs2_path *path;
4386
4387 ocfs2_extent_map_trunc(inode, 0);
4388
4389 path = ocfs2_new_inode_path(di_bh);
4390 if (!path) {
4391 ret = -ENOMEM;
4392 mlog_errno(ret);
4393 goto out;
4394 }
4395
4396 ret = ocfs2_find_path(inode, path, cpos);
4397 if (ret) {
4398 mlog_errno(ret);
4399 goto out;
4400 }
4401
4402 el = path_leaf_el(path);
4403 index = ocfs2_search_extent_list(el, cpos);
4404 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4405 ocfs2_error(inode->i_sb,
4406 "Inode %llu has an extent at cpos %u which can no "
4407 "longer be found.\n",
4408 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4409 ret = -EROFS;
4410 goto out;
4411 }
4412
4413 /*
4414 * We have 3 cases of extent removal:
4415 * 1) Range covers the entire extent rec
4416 * 2) Range begins or ends on one edge of the extent rec
4417 * 3) Range is in the middle of the extent rec (no shared edges)
4418 *
4419 * For case 1 we remove the extent rec and left rotate to
4420 * fill the hole.
4421 *
4422 * For case 2 we just shrink the existing extent rec, with a
4423 * tree update if the shrinking edge is also the edge of an
4424 * extent block.
4425 *
4426 * For case 3 we do a right split to turn the extent rec into
4427 * something case 2 can handle.
4428 */
4429 rec = &el->l_recs[index];
4430 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4431 trunc_range = cpos + len;
4432
4433 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
4434
4435 mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
4436 "(cpos %u, len %u)\n",
4437 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
4438 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
4439
4440 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
4441 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4442 cpos, len);
4443 if (ret) {
4444 mlog_errno(ret);
4445 goto out;
4446 }
4447 } else {
4448 ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
4449 trunc_range, meta_ac);
4450 if (ret) {
4451 mlog_errno(ret);
4452 goto out;
4453 }
4454
4455 /*
4456 * The split could have manipulated the tree enough to
4457 * move the record location, so we have to look for it again.
4458 */
4459 ocfs2_reinit_path(path, 1);
4460
4461 ret = ocfs2_find_path(inode, path, cpos);
4462 if (ret) {
4463 mlog_errno(ret);
4464 goto out;
4465 }
4466
4467 el = path_leaf_el(path);
4468 index = ocfs2_search_extent_list(el, cpos);
4469 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4470 ocfs2_error(inode->i_sb,
4471 "Inode %llu: split at cpos %u lost record.",
4472 (unsigned long long)OCFS2_I(inode)->ip_blkno,
4473 cpos);
4474 ret = -EROFS;
4475 goto out;
4476 }
4477
4478 /*
4479 * Double check our values here. If anything is fishy,
4480 * it's easier to catch it at the top level.
4481 */
4482 rec = &el->l_recs[index];
4483 rec_range = le32_to_cpu(rec->e_cpos) +
4484 ocfs2_rec_clusters(el, rec);
4485 if (rec_range != trunc_range) {
4486 ocfs2_error(inode->i_sb,
4487 "Inode %llu: error after split at cpos %u"
4488 "trunc len %u, existing record is (%u,%u)",
4489 (unsigned long long)OCFS2_I(inode)->ip_blkno,
4490 cpos, len, le32_to_cpu(rec->e_cpos),
4491 ocfs2_rec_clusters(el, rec));
4492 ret = -EROFS;
4493 goto out;
4494 }
4495
4496 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4497 cpos, len);
4498 if (ret) {
4499 mlog_errno(ret);
4500 goto out;
4501 }
4502 }
4503
4504out:
4505 ocfs2_free_path(path);
4506 return ret;
4507}
4508
4509int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
2435{ 4510{
2436 struct buffer_head *tl_bh = osb->osb_tl_bh; 4511 struct buffer_head *tl_bh = osb->osb_tl_bh;
2437 struct ocfs2_dinode *di; 4512 struct ocfs2_dinode *di;
@@ -2464,10 +4539,10 @@ static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
2464 return current_tail == new_start; 4539 return current_tail == new_start;
2465} 4540}
2466 4541
2467static int ocfs2_truncate_log_append(struct ocfs2_super *osb, 4542int ocfs2_truncate_log_append(struct ocfs2_super *osb,
2468 handle_t *handle, 4543 handle_t *handle,
2469 u64 start_blk, 4544 u64 start_blk,
2470 unsigned int num_clusters) 4545 unsigned int num_clusters)
2471{ 4546{
2472 int status, index; 4547 int status, index;
2473 unsigned int start_cluster, tl_count; 4548 unsigned int start_cluster, tl_count;
@@ -2623,7 +4698,7 @@ bail:
2623} 4698}
2624 4699
2625/* Expects you to already be holding tl_inode->i_mutex */ 4700/* Expects you to already be holding tl_inode->i_mutex */
2626static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) 4701int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2627{ 4702{
2628 int status; 4703 int status;
2629 unsigned int num_to_flush; 4704 unsigned int num_to_flush;
@@ -2957,6 +5032,219 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
2957 return status; 5032 return status;
2958} 5033}
2959 5034
5035/*
5036 * Delayed de-allocation of suballocator blocks.
5037 *
5038 * Some sets of block de-allocations might involve multiple suballocator inodes.
5039 *
5040 * The locking for this can get extremely complicated, especially when
5041 * the suballocator inodes to delete from aren't known until deep
5042 * within an unrelated codepath.
5043 *
5044 * ocfs2_extent_block structures are a good example of this - an inode
5045 * btree could have been grown by any number of nodes each allocating
5046 * out of their own suballoc inode.
5047 *
5048 * These structures allow the delay of block de-allocation until a
5049 * later time, when locking of multiple cluster inodes won't cause
5050 * deadlock.
5051 */
5052
5053/*
5054 * Describes a single block free from a suballocator
5055 */
5056struct ocfs2_cached_block_free {
5057 struct ocfs2_cached_block_free *free_next;
5058 u64 free_blk;
5059 unsigned int free_bit;
5060};
5061
/*
 * Deferred frees for one suballocator, identified by the pair
 * (f_inode_type, f_slot).  These lists are themselves chained through
 * f_next_suballocator.
 */
struct ocfs2_per_slot_free_list {
	/* Next per-suballocator list, or NULL. */
	struct ocfs2_per_slot_free_list		*f_next_suballocator;
	/* System file type used to look up the suballocator inode. */
	int					f_inode_type;
	/* Slot number used to look up the suballocator inode. */
	int					f_slot;
	/* Head of the list of blocks waiting to be freed. */
	struct ocfs2_cached_block_free		*f_first;
};
5068
5069static int ocfs2_free_cached_items(struct ocfs2_super *osb,
5070 int sysfile_type,
5071 int slot,
5072 struct ocfs2_cached_block_free *head)
5073{
5074 int ret;
5075 u64 bg_blkno;
5076 handle_t *handle;
5077 struct inode *inode;
5078 struct buffer_head *di_bh = NULL;
5079 struct ocfs2_cached_block_free *tmp;
5080
5081 inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
5082 if (!inode) {
5083 ret = -EINVAL;
5084 mlog_errno(ret);
5085 goto out;
5086 }
5087
5088 mutex_lock(&inode->i_mutex);
5089
5090 ret = ocfs2_meta_lock(inode, &di_bh, 1);
5091 if (ret) {
5092 mlog_errno(ret);
5093 goto out_mutex;
5094 }
5095
5096 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
5097 if (IS_ERR(handle)) {
5098 ret = PTR_ERR(handle);
5099 mlog_errno(ret);
5100 goto out_unlock;
5101 }
5102
5103 while (head) {
5104 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
5105 head->free_bit);
5106 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
5107 head->free_bit, (unsigned long long)head->free_blk);
5108
5109 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
5110 head->free_bit, bg_blkno, 1);
5111 if (ret) {
5112 mlog_errno(ret);
5113 goto out_journal;
5114 }
5115
5116 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
5117 if (ret) {
5118 mlog_errno(ret);
5119 goto out_journal;
5120 }
5121
5122 tmp = head;
5123 head = head->free_next;
5124 kfree(tmp);
5125 }
5126
5127out_journal:
5128 ocfs2_commit_trans(osb, handle);
5129
5130out_unlock:
5131 ocfs2_meta_unlock(inode, 1);
5132 brelse(di_bh);
5133out_mutex:
5134 mutex_unlock(&inode->i_mutex);
5135 iput(inode);
5136out:
5137 while(head) {
5138 /* Premature exit may have left some dangling items. */
5139 tmp = head;
5140 head = head->free_next;
5141 kfree(tmp);
5142 }
5143
5144 return ret;
5145}
5146
5147int ocfs2_run_deallocs(struct ocfs2_super *osb,
5148 struct ocfs2_cached_dealloc_ctxt *ctxt)
5149{
5150 int ret = 0, ret2;
5151 struct ocfs2_per_slot_free_list *fl;
5152
5153 if (!ctxt)
5154 return 0;
5155
5156 while (ctxt->c_first_suballocator) {
5157 fl = ctxt->c_first_suballocator;
5158
5159 if (fl->f_first) {
5160 mlog(0, "Free items: (type %u, slot %d)\n",
5161 fl->f_inode_type, fl->f_slot);
5162 ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5163 fl->f_slot, fl->f_first);
5164 if (ret2)
5165 mlog_errno(ret2);
5166 if (!ret)
5167 ret = ret2;
5168 }
5169
5170 ctxt->c_first_suballocator = fl->f_next_suballocator;
5171 kfree(fl);
5172 }
5173
5174 return ret;
5175}
5176
5177static struct ocfs2_per_slot_free_list *
5178ocfs2_find_per_slot_free_list(int type,
5179 int slot,
5180 struct ocfs2_cached_dealloc_ctxt *ctxt)
5181{
5182 struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5183
5184 while (fl) {
5185 if (fl->f_inode_type == type && fl->f_slot == slot)
5186 return fl;
5187
5188 fl = fl->f_next_suballocator;
5189 }
5190
5191 fl = kmalloc(sizeof(*fl), GFP_NOFS);
5192 if (fl) {
5193 fl->f_inode_type = type;
5194 fl->f_slot = slot;
5195 fl->f_first = NULL;
5196 fl->f_next_suballocator = ctxt->c_first_suballocator;
5197
5198 ctxt->c_first_suballocator = fl;
5199 }
5200 return fl;
5201}
5202
5203static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5204 int type, int slot, u64 blkno,
5205 unsigned int bit)
5206{
5207 int ret;
5208 struct ocfs2_per_slot_free_list *fl;
5209 struct ocfs2_cached_block_free *item;
5210
5211 fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5212 if (fl == NULL) {
5213 ret = -ENOMEM;
5214 mlog_errno(ret);
5215 goto out;
5216 }
5217
5218 item = kmalloc(sizeof(*item), GFP_NOFS);
5219 if (item == NULL) {
5220 ret = -ENOMEM;
5221 mlog_errno(ret);
5222 goto out;
5223 }
5224
5225 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5226 type, slot, bit, (unsigned long long)blkno);
5227
5228 item->free_blk = blkno;
5229 item->free_bit = bit;
5230 item->free_next = fl->f_first;
5231
5232 fl->f_first = item;
5233
5234 ret = 0;
5235out:
5236 return ret;
5237}
5238
5239static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5240 struct ocfs2_extent_block *eb)
5241{
5242 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5243 le16_to_cpu(eb->h_suballoc_slot),
5244 le64_to_cpu(eb->h_blkno),
5245 le16_to_cpu(eb->h_suballoc_bit));
5246}
5247
2960/* This function will figure out whether the currently last extent 5248/* This function will figure out whether the currently last extent
2961 * block will be deleted, and if it will, what the new last extent 5249 * block will be deleted, and if it will, what the new last extent
2962 * block will be so we can update his h_next_leaf_blk field, as well 5250 * block will be so we can update his h_next_leaf_blk field, as well
@@ -3238,27 +5526,10 @@ delete:
3238 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); 5526 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
3239 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); 5527 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
3240 5528
3241 if (le16_to_cpu(eb->h_suballoc_slot) == 0) { 5529 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
3242 /* 5530 /* An error here is not fatal. */
3243 * This code only understands how to 5531 if (ret < 0)
3244 * lock the suballocator in slot 0, 5532 mlog_errno(ret);
3245 * which is fine because allocation is
3246 * only ever done out of that
3247 * suballocator too. A future version
3248 * might change that however, so avoid
3249 * a free if we don't know how to
3250 * handle it. This way an fs incompat
3251 * bit will not be necessary.
3252 */
3253 ret = ocfs2_free_extent_block(handle,
3254 tc->tc_ext_alloc_inode,
3255 tc->tc_ext_alloc_bh,
3256 eb);
3257
3258 /* An error here is not fatal. */
3259 if (ret < 0)
3260 mlog_errno(ret);
3261 }
3262 } else { 5533 } else {
3263 deleted_eb = 0; 5534 deleted_eb = 0;
3264 } 5535 }
@@ -3397,9 +5668,9 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
3397 return ocfs2_journal_dirty_data(handle, bh); 5668 return ocfs2_journal_dirty_data(handle, bh);
3398} 5669}
3399 5670
3400static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, 5671static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
3401 struct page **pages, int numpages, 5672 loff_t end, struct page **pages,
3402 u64 phys, handle_t *handle) 5673 int numpages, u64 phys, handle_t *handle)
3403{ 5674{
3404 int i, ret, partial = 0; 5675 int i, ret, partial = 0;
3405 void *kaddr; 5676 void *kaddr;
@@ -3412,26 +5683,14 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3412 if (numpages == 0) 5683 if (numpages == 0)
3413 goto out; 5684 goto out;
3414 5685
3415 from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */ 5686 to = PAGE_CACHE_SIZE;
3416 if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
3417 /*
3418 * Since 'from' has been capped to a value below page
3419 * size, this calculation won't be able to overflow
3420 * 'to'
3421 */
3422 to = ocfs2_align_bytes_to_clusters(sb, from);
3423
3424 /*
3425 * The truncate tail in this case should never contain
3426 * more than one page at maximum. The loop below also
3427 * assumes this.
3428 */
3429 BUG_ON(numpages != 1);
3430 }
3431
3432 for(i = 0; i < numpages; i++) { 5687 for(i = 0; i < numpages; i++) {
3433 page = pages[i]; 5688 page = pages[i];
3434 5689
5690 from = start & (PAGE_CACHE_SIZE - 1);
5691 if ((end >> PAGE_CACHE_SHIFT) == page->index)
5692 to = end & (PAGE_CACHE_SIZE - 1);
5693
3435 BUG_ON(from > PAGE_CACHE_SIZE); 5694 BUG_ON(from > PAGE_CACHE_SIZE);
3436 BUG_ON(to > PAGE_CACHE_SIZE); 5695 BUG_ON(to > PAGE_CACHE_SIZE);
3437 5696
@@ -3468,10 +5727,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
3468 5727
3469 flush_dcache_page(page); 5728 flush_dcache_page(page);
3470 5729
3471 /* 5730 start = (page->index + 1) << PAGE_CACHE_SHIFT;
3472 * Every page after the 1st one should be completely zero'd.
3473 */
3474 from = 0;
3475 } 5731 }
3476out: 5732out:
3477 if (pages) { 5733 if (pages) {
@@ -3484,24 +5740,26 @@ out:
3484 } 5740 }
3485} 5741}
3486 5742
3487static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages, 5743static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
3488 int *num, u64 *phys) 5744 struct page **pages, int *num, u64 *phys)
3489{ 5745{
3490 int i, numpages = 0, ret = 0; 5746 int i, numpages = 0, ret = 0;
3491 unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
3492 unsigned int ext_flags; 5747 unsigned int ext_flags;
3493 struct super_block *sb = inode->i_sb; 5748 struct super_block *sb = inode->i_sb;
3494 struct address_space *mapping = inode->i_mapping; 5749 struct address_space *mapping = inode->i_mapping;
3495 unsigned long index; 5750 unsigned long index;
3496 u64 next_cluster_bytes; 5751 loff_t last_page_bytes;
3497 5752
3498 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 5753 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
5754 BUG_ON(start > end);
3499 5755
3500 /* Cluster boundary, so we don't need to grab any pages. */ 5756 if (start == end)
3501 if ((isize & (csize - 1)) == 0)
3502 goto out; 5757 goto out;
3503 5758
3504 ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, 5759 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
5760 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
5761
5762 ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits,
3505 phys, NULL, &ext_flags); 5763 phys, NULL, &ext_flags);
3506 if (ret) { 5764 if (ret) {
3507 mlog_errno(ret); 5765 mlog_errno(ret);
@@ -3517,8 +5775,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
3517 if (ext_flags & OCFS2_EXT_UNWRITTEN) 5775 if (ext_flags & OCFS2_EXT_UNWRITTEN)
3518 goto out; 5776 goto out;
3519 5777
3520 next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); 5778 last_page_bytes = PAGE_ALIGN(end);
3521 index = isize >> PAGE_CACHE_SHIFT; 5779 index = start >> PAGE_CACHE_SHIFT;
3522 do { 5780 do {
3523 pages[numpages] = grab_cache_page(mapping, index); 5781 pages[numpages] = grab_cache_page(mapping, index);
3524 if (!pages[numpages]) { 5782 if (!pages[numpages]) {
@@ -3529,7 +5787,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
3529 5787
3530 numpages++; 5788 numpages++;
3531 index++; 5789 index++;
3532 } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); 5790 } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
3533 5791
3534out: 5792out:
3535 if (ret != 0) { 5793 if (ret != 0) {
@@ -3558,11 +5816,10 @@ out:
3558 * otherwise block_write_full_page() will skip writeout of pages past 5816 * otherwise block_write_full_page() will skip writeout of pages past
3559 * i_size. The new_i_size parameter is passed for this reason. 5817 * i_size. The new_i_size parameter is passed for this reason.
3560 */ 5818 */
3561int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 5819int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
3562 u64 new_i_size) 5820 u64 range_start, u64 range_end)
3563{ 5821{
3564 int ret, numpages; 5822 int ret, numpages;
3565 loff_t endbyte;
3566 struct page **pages = NULL; 5823 struct page **pages = NULL;
3567 u64 phys; 5824 u64 phys;
3568 5825
@@ -3581,7 +5838,8 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3581 goto out; 5838 goto out;
3582 } 5839 }
3583 5840
3584 ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); 5841 ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
5842 &numpages, &phys);
3585 if (ret) { 5843 if (ret) {
3586 mlog_errno(ret); 5844 mlog_errno(ret);
3587 goto out; 5845 goto out;
@@ -3590,17 +5848,16 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
3590 if (numpages == 0) 5848 if (numpages == 0)
3591 goto out; 5849 goto out;
3592 5850
3593 ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, 5851 ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
3594 handle); 5852 numpages, phys, handle);
3595 5853
3596 /* 5854 /*
3597 * Initiate writeout of the pages we zero'd here. We don't 5855 * Initiate writeout of the pages we zero'd here. We don't
3598 * wait on them - the truncate_inode_pages() call later will 5856 * wait on them - the truncate_inode_pages() call later will
3599 * do that for us. 5857 * do that for us.
3600 */ 5858 */
3601 endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 5859 ret = do_sync_mapping_range(inode->i_mapping, range_start,
3602 ret = do_sync_mapping_range(inode->i_mapping, new_i_size, 5860 range_end - 1, SYNC_FILE_RANGE_WRITE);
3603 endbyte - 1, SYNC_FILE_RANGE_WRITE);
3604 if (ret) 5861 if (ret)
3605 mlog_errno(ret); 5862 mlog_errno(ret);
3606 5863
@@ -3631,8 +5888,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
3631 5888
3632 mlog_entry_void(); 5889 mlog_entry_void();
3633 5890
3634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
3635
3636 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 5891 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
3637 i_size_read(inode)); 5892 i_size_read(inode));
3638 5893
@@ -3754,7 +6009,6 @@ start:
3754 goto start; 6009 goto start;
3755 6010
3756bail: 6011bail:
3757 up_write(&OCFS2_I(inode)->ip_alloc_sem);
3758 6012
3759 ocfs2_schedule_truncate_log_flush(osb, 1); 6013 ocfs2_schedule_truncate_log_flush(osb, 1);
3760 6014
@@ -3764,6 +6018,8 @@ bail:
3764 if (handle) 6018 if (handle)
3765 ocfs2_commit_trans(osb, handle); 6019 ocfs2_commit_trans(osb, handle);
3766 6020
6021 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
6022
3767 ocfs2_free_path(path); 6023 ocfs2_free_path(path);
3768 6024
3769 /* This will drop the ext_alloc cluster lock for us */ 6025 /* This will drop the ext_alloc cluster lock for us */
@@ -3774,23 +6030,18 @@ bail:
3774} 6030}
3775 6031
3776/* 6032/*
3777 * Expects the inode to already be locked. This will figure out which 6033 * Expects the inode to already be locked.
3778 * inodes need to be locked and will put them on the returned truncate
3779 * context.
3780 */ 6034 */
3781int ocfs2_prepare_truncate(struct ocfs2_super *osb, 6035int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3782 struct inode *inode, 6036 struct inode *inode,
3783 struct buffer_head *fe_bh, 6037 struct buffer_head *fe_bh,
3784 struct ocfs2_truncate_context **tc) 6038 struct ocfs2_truncate_context **tc)
3785{ 6039{
3786 int status, metadata_delete, i; 6040 int status;
3787 unsigned int new_i_clusters; 6041 unsigned int new_i_clusters;
3788 struct ocfs2_dinode *fe; 6042 struct ocfs2_dinode *fe;
3789 struct ocfs2_extent_block *eb; 6043 struct ocfs2_extent_block *eb;
3790 struct ocfs2_extent_list *el;
3791 struct buffer_head *last_eb_bh = NULL; 6044 struct buffer_head *last_eb_bh = NULL;
3792 struct inode *ext_alloc_inode = NULL;
3793 struct buffer_head *ext_alloc_bh = NULL;
3794 6045
3795 mlog_entry_void(); 6046 mlog_entry_void();
3796 6047
@@ -3810,12 +6061,9 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3810 mlog_errno(status); 6061 mlog_errno(status);
3811 goto bail; 6062 goto bail;
3812 } 6063 }
6064 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
3813 6065
3814 metadata_delete = 0;
3815 if (fe->id2.i_list.l_tree_depth) { 6066 if (fe->id2.i_list.l_tree_depth) {
3816 /* If we have a tree, then the truncate may result in
3817 * metadata deletes. Figure this out from the
3818 * rightmost leaf block.*/
3819 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 6067 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
3820 &last_eb_bh, OCFS2_BH_CACHED, inode); 6068 &last_eb_bh, OCFS2_BH_CACHED, inode);
3821 if (status < 0) { 6069 if (status < 0) {
@@ -3830,43 +6078,10 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3830 status = -EIO; 6078 status = -EIO;
3831 goto bail; 6079 goto bail;
3832 } 6080 }
3833 el = &(eb->h_list);
3834
3835 i = 0;
3836 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3837 i = 1;
3838 /*
3839 * XXX: Should we check that next_free_rec contains
3840 * the extent?
3841 */
3842 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
3843 metadata_delete = 1;
3844 } 6081 }
3845 6082
3846 (*tc)->tc_last_eb_bh = last_eb_bh; 6083 (*tc)->tc_last_eb_bh = last_eb_bh;
3847 6084
3848 if (metadata_delete) {
3849 mlog(0, "Will have to delete metadata for this trunc. "
3850 "locking allocator.\n");
3851 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
3852 if (!ext_alloc_inode) {
3853 status = -ENOMEM;
3854 mlog_errno(status);
3855 goto bail;
3856 }
3857
3858 mutex_lock(&ext_alloc_inode->i_mutex);
3859 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
3860
3861 status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
3862 if (status < 0) {
3863 mlog_errno(status);
3864 goto bail;
3865 }
3866 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
3867 (*tc)->tc_ext_alloc_locked = 1;
3868 }
3869
3870 status = 0; 6085 status = 0;
3871bail: 6086bail:
3872 if (status < 0) { 6087 if (status < 0) {
@@ -3880,16 +6095,13 @@ bail:
3880 6095
3881static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) 6096static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
3882{ 6097{
3883 if (tc->tc_ext_alloc_inode) { 6098 /*
3884 if (tc->tc_ext_alloc_locked) 6099 * The caller is responsible for completing deallocation
3885 ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); 6100 * before freeing the context.
3886 6101 */
3887 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex); 6102 if (tc->tc_dealloc.c_first_suballocator != NULL)
3888 iput(tc->tc_ext_alloc_inode); 6103 mlog(ML_NOTICE,
3889 } 6104 "Truncate completion has non-empty dealloc context\n");
3890
3891 if (tc->tc_ext_alloc_bh)
3892 brelse(tc->tc_ext_alloc_bh);
3893 6105
3894 if (tc->tc_last_eb_bh) 6106 if (tc->tc_last_eb_bh)
3895 brelse(tc->tc_last_eb_bh); 6107 brelse(tc->tc_last_eb_bh);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index fbcb5934a081..990df48ae8d3 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -34,7 +34,17 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
34 u32 cpos, 34 u32 cpos,
35 u64 start_blk, 35 u64 start_blk,
36 u32 new_clusters, 36 u32 new_clusters,
37 u8 flags,
37 struct ocfs2_alloc_context *meta_ac); 38 struct ocfs2_alloc_context *meta_ac);
39struct ocfs2_cached_dealloc_ctxt;
40int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
41 handle_t *handle, u32 cpos, u32 len, u32 phys,
42 struct ocfs2_alloc_context *meta_ac,
43 struct ocfs2_cached_dealloc_ctxt *dealloc);
44int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
45 u32 cpos, u32 len, handle_t *handle,
46 struct ocfs2_alloc_context *meta_ac,
47 struct ocfs2_cached_dealloc_ctxt *dealloc);
38int ocfs2_num_free_extents(struct ocfs2_super *osb, 48int ocfs2_num_free_extents(struct ocfs2_super *osb,
39 struct inode *inode, 49 struct inode *inode,
40 struct ocfs2_dinode *fe); 50 struct ocfs2_dinode *fe);
@@ -62,17 +72,41 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
62 struct ocfs2_dinode **tl_copy); 72 struct ocfs2_dinode **tl_copy);
63int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, 73int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
64 struct ocfs2_dinode *tl_copy); 74 struct ocfs2_dinode *tl_copy);
75int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb);
76int ocfs2_truncate_log_append(struct ocfs2_super *osb,
77 handle_t *handle,
78 u64 start_blk,
79 unsigned int num_clusters);
80int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
81
82/*
83 * Process local structure which describes the block unlinks done
84 * during an operation. This is populated via
85 * ocfs2_cache_block_dealloc().
86 *
87 * ocfs2_run_deallocs() should be called after the potentially
88 * de-allocating routines. No journal handles should be open, and most
89 * locks should have been dropped.
90 */
91struct ocfs2_cached_dealloc_ctxt {
92 struct ocfs2_per_slot_free_list *c_first_suballocator;
93};
94static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
95{
96 c->c_first_suballocator = NULL;
97}
98int ocfs2_run_deallocs(struct ocfs2_super *osb,
99 struct ocfs2_cached_dealloc_ctxt *ctxt);
65 100
66struct ocfs2_truncate_context { 101struct ocfs2_truncate_context {
67 struct inode *tc_ext_alloc_inode; 102 struct ocfs2_cached_dealloc_ctxt tc_dealloc;
68 struct buffer_head *tc_ext_alloc_bh;
69 int tc_ext_alloc_locked; /* is it cluster locked? */ 103 int tc_ext_alloc_locked; /* is it cluster locked? */
70 /* these get destroyed once it's passed to ocfs2_commit_truncate. */ 104 /* these get destroyed once it's passed to ocfs2_commit_truncate. */
71 struct buffer_head *tc_last_eb_bh; 105 struct buffer_head *tc_last_eb_bh;
72}; 106};
73 107
74int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 108int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
75 u64 new_i_size); 109 u64 range_start, u64 range_end);
76int ocfs2_prepare_truncate(struct ocfs2_super *osb, 110int ocfs2_prepare_truncate(struct ocfs2_super *osb,
77 struct inode *inode, 111 struct inode *inode,
78 struct buffer_head *fe_bh, 112 struct buffer_head *fe_bh,
@@ -84,6 +118,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
84 118
85int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 119int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
86 u32 cpos, struct buffer_head **leaf_bh); 120 u32 cpos, struct buffer_head **leaf_bh);
121int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
87 122
88/* 123/*
89 * Helper function to look at the # of clusters in an extent record. 124 * Helper function to look at the # of clusters in an extent record.
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a480b09c79b9..84bf6e79de23 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -684,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
684 bh = bh->b_this_page, block_start += bsize) { 684 bh = bh->b_this_page, block_start += bsize) {
685 block_end = block_start + bsize; 685 block_end = block_start + bsize;
686 686
687 clear_buffer_new(bh);
688
687 /* 689 /*
688 * Ignore blocks outside of our i/o range - 690 * Ignore blocks outside of our i/o range -
689 * they may belong to unallocated clusters. 691 * they may belong to unallocated clusters.
@@ -698,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
698 * For an allocating write with cluster size >= page 700 * For an allocating write with cluster size >= page
699 * size, we always write the entire page. 701 * size, we always write the entire page.
700 */ 702 */
701 703 if (new)
702 if (buffer_new(bh)) 704 set_buffer_new(bh);
703 clear_buffer_new(bh);
704 705
705 if (!buffer_mapped(bh)) { 706 if (!buffer_mapped(bh)) {
706 map_bh(bh, inode->i_sb, *p_blkno); 707 map_bh(bh, inode->i_sb, *p_blkno);
@@ -711,7 +712,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
711 if (!buffer_uptodate(bh)) 712 if (!buffer_uptodate(bh))
712 set_buffer_uptodate(bh); 713 set_buffer_uptodate(bh);
713 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && 714 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
714 (block_start < from || block_end > to)) { 715 !buffer_new(bh) &&
716 (block_start < from || block_end > to)) {
715 ll_rw_block(READ, 1, &bh); 717 ll_rw_block(READ, 1, &bh);
716 *wait_bh++=bh; 718 *wait_bh++=bh;
717 } 719 }
@@ -738,18 +740,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
738 bh = head; 740 bh = head;
739 block_start = 0; 741 block_start = 0;
740 do { 742 do {
741 void *kaddr;
742
743 block_end = block_start + bsize; 743 block_end = block_start + bsize;
744 if (block_end <= from) 744 if (block_end <= from)
745 goto next_bh; 745 goto next_bh;
746 if (block_start >= to) 746 if (block_start >= to)
747 break; 747 break;
748 748
749 kaddr = kmap_atomic(page, KM_USER0); 749 zero_user_page(page, block_start, bh->b_size, KM_USER0);
750 memset(kaddr+block_start, 0, bh->b_size);
751 flush_dcache_page(page);
752 kunmap_atomic(kaddr, KM_USER0);
753 set_buffer_uptodate(bh); 750 set_buffer_uptodate(bh);
754 mark_buffer_dirty(bh); 751 mark_buffer_dirty(bh);
755 752
@@ -761,217 +758,240 @@ next_bh:
761 return ret; 758 return ret;
762} 759}
763 760
761#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
762#define OCFS2_MAX_CTXT_PAGES 1
763#else
764#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
765#endif
766
767#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
768
764/* 769/*
765 * This will copy user data from the buffer page in the splice 770 * Describe the state of a single cluster to be written to.
766 * context.
767 *
768 * For now, we ignore SPLICE_F_MOVE as that would require some extra
769 * communication out all the way to ocfs2_write().
770 */ 771 */
771int ocfs2_map_and_write_splice_data(struct inode *inode, 772struct ocfs2_write_cluster_desc {
772 struct ocfs2_write_ctxt *wc, u64 *p_blkno, 773 u32 c_cpos;
773 unsigned int *ret_from, unsigned int *ret_to) 774 u32 c_phys;
775 /*
776 * Give this a unique field because c_phys eventually gets
777 * filled.
778 */
779 unsigned c_new;
780 unsigned c_unwritten;
781};
782
783static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
774{ 784{
775 int ret; 785 return d->c_new || d->c_unwritten;
776 unsigned int to, from, cluster_start, cluster_end; 786}
777 char *src, *dst;
778 struct ocfs2_splice_write_priv *sp = wc->w_private;
779 struct pipe_buffer *buf = sp->s_buf;
780 unsigned long bytes, src_from;
781 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
782 787
783 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 788struct ocfs2_write_ctxt {
784 &cluster_end); 789 /* Logical cluster position / len of write */
790 u32 w_cpos;
791 u32 w_clen;
785 792
786 from = sp->s_offset; 793 struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
787 src_from = sp->s_buf_offset;
788 bytes = wc->w_count;
789 794
790 if (wc->w_large_pages) { 795 /*
791 /* 796 * This is true if page_size > cluster_size.
792 * For cluster size < page size, we have to 797 *
793 * calculate pos within the cluster and obey 798 * It triggers a set of special cases during write which might
794 * the rightmost boundary. 799 * have to deal with allocating writes to partial pages.
795 */ 800 */
796 bytes = min(bytes, (unsigned long)(osb->s_clustersize 801 unsigned int w_large_pages;
797 - (wc->w_pos & (osb->s_clustersize - 1)))); 802
798 } 803 /*
799 to = from + bytes; 804 * Pages involved in this write.
805 *
806 * w_target_page is the page being written to by the user.
807 *
808 * w_pages is an array of pages which always contains
809 * w_target_page, and in the case of an allocating write with
810 * page_size < cluster size, it will contain zero'd and mapped
811 * pages adjacent to w_target_page which need to be written
812 * out in so that future reads from that region will get
813 * zero's.
814 */
815 struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
816 unsigned int w_num_pages;
817 struct page *w_target_page;
800 818
801 BUG_ON(from > PAGE_CACHE_SIZE); 819 /*
802 BUG_ON(to > PAGE_CACHE_SIZE); 820 * ocfs2_write_end() uses this to know what the real range to
803 BUG_ON(from < cluster_start); 821 * write in the target should be.
804 BUG_ON(to > cluster_end); 822 */
823 unsigned int w_target_from;
824 unsigned int w_target_to;
805 825
806 if (wc->w_this_page_new) 826 /*
807 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 827 * We could use journal_current_handle() but this is cleaner,
808 cluster_start, cluster_end, 1); 828 * IMHO -Mark
809 else 829 */
810 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 830 handle_t *w_handle;
811 from, to, 0); 831
812 if (ret) { 832 struct buffer_head *w_di_bh;
813 mlog_errno(ret); 833
814 goto out; 834 struct ocfs2_cached_dealloc_ctxt w_dealloc;
835};
836
837static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
838{
839 int i;
840
841 for(i = 0; i < wc->w_num_pages; i++) {
842 if (wc->w_pages[i] == NULL)
843 continue;
844
845 unlock_page(wc->w_pages[i]);
846 mark_page_accessed(wc->w_pages[i]);
847 page_cache_release(wc->w_pages[i]);
815 } 848 }
816 849
817 src = buf->ops->map(sp->s_pipe, buf, 1); 850 brelse(wc->w_di_bh);
818 dst = kmap_atomic(wc->w_this_page, KM_USER1); 851 kfree(wc);
819 memcpy(dst + from, src + src_from, bytes); 852}
820 kunmap_atomic(wc->w_this_page, KM_USER1); 853
821 buf->ops->unmap(sp->s_pipe, buf, src); 854static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
855 struct ocfs2_super *osb, loff_t pos,
856 unsigned len, struct buffer_head *di_bh)
857{
858 struct ocfs2_write_ctxt *wc;
859
860 wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
861 if (!wc)
862 return -ENOMEM;
822 863
823 wc->w_finished_copy = 1; 864 wc->w_cpos = pos >> osb->s_clustersize_bits;
865 wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len);
866 get_bh(di_bh);
867 wc->w_di_bh = di_bh;
824 868
825 *ret_from = from; 869 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
826 *ret_to = to; 870 wc->w_large_pages = 1;
827out: 871 else
872 wc->w_large_pages = 0;
873
874 ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
875
876 *wcp = wc;
828 877
829 return bytes ? (unsigned int)bytes : ret; 878 return 0;
830} 879}
831 880
832/* 881/*
833 * This will copy user data from the iovec in the buffered write 882 * If a page has any new buffers, zero them out here, and mark them uptodate
834 * context. 883 * and dirty so they'll be written out (in order to prevent uninitialised
884 * block data from leaking). And clear the new bit.
835 */ 885 */
836int ocfs2_map_and_write_user_data(struct inode *inode, 886static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
837 struct ocfs2_write_ctxt *wc, u64 *p_blkno,
838 unsigned int *ret_from, unsigned int *ret_to)
839{ 887{
840 int ret; 888 unsigned int block_start, block_end;
841 unsigned int to, from, cluster_start, cluster_end; 889 struct buffer_head *head, *bh;
842 unsigned long bytes, src_from;
843 char *dst;
844 struct ocfs2_buffered_write_priv *bp = wc->w_private;
845 const struct iovec *cur_iov = bp->b_cur_iov;
846 char __user *buf;
847 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
848 890
849 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 891 BUG_ON(!PageLocked(page));
850 &cluster_end); 892 if (!page_has_buffers(page))
893 return;
851 894
852 buf = cur_iov->iov_base + bp->b_cur_off; 895 bh = head = page_buffers(page);
853 src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; 896 block_start = 0;
897 do {
898 block_end = block_start + bh->b_size;
854 899
855 from = wc->w_pos & (PAGE_CACHE_SIZE - 1); 900 if (buffer_new(bh)) {
901 if (block_end > from && block_start < to) {
902 if (!PageUptodate(page)) {
903 unsigned start, end;
856 904
857 /* 905 start = max(from, block_start);
858 * This is a lot of comparisons, but it reads quite 906 end = min(to, block_end);
859 * easily, which is important here.
860 */
861 /* Stay within the src page */
862 bytes = PAGE_SIZE - src_from;
863 /* Stay within the vector */
864 bytes = min(bytes,
865 (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
866 /* Stay within count */
867 bytes = min(bytes, (unsigned long)wc->w_count);
868 /*
869 * For clustersize > page size, just stay within
870 * target page, otherwise we have to calculate pos
871 * within the cluster and obey the rightmost
872 * boundary.
873 */
874 if (wc->w_large_pages) {
875 /*
876 * For cluster size < page size, we have to
877 * calculate pos within the cluster and obey
878 * the rightmost boundary.
879 */
880 bytes = min(bytes, (unsigned long)(osb->s_clustersize
881 - (wc->w_pos & (osb->s_clustersize - 1))));
882 } else {
883 /*
884 * cluster size > page size is the most common
885 * case - we just stay within the target page
886 * boundary.
887 */
888 bytes = min(bytes, PAGE_CACHE_SIZE - from);
889 }
890 907
891 to = from + bytes; 908 zero_user_page(page, start, end - start, KM_USER0);
909 set_buffer_uptodate(bh);
910 }
892 911
893 BUG_ON(from > PAGE_CACHE_SIZE); 912 clear_buffer_new(bh);
894 BUG_ON(to > PAGE_CACHE_SIZE); 913 mark_buffer_dirty(bh);
895 BUG_ON(from < cluster_start); 914 }
896 BUG_ON(to > cluster_end); 915 }
897 916
898 if (wc->w_this_page_new) 917 block_start = block_end;
899 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 918 bh = bh->b_this_page;
900 cluster_start, cluster_end, 1); 919 } while (bh != head);
901 else 920}
902 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
903 from, to, 0);
904 if (ret) {
905 mlog_errno(ret);
906 goto out;
907 }
908 921
909 dst = kmap(wc->w_this_page); 922/*
910 memcpy(dst + from, bp->b_src_buf + src_from, bytes); 923 * Only called when we have a failure during allocating write to write
911 kunmap(wc->w_this_page); 924 * zero's to the newly allocated region.
925 */
926static void ocfs2_write_failure(struct inode *inode,
927 struct ocfs2_write_ctxt *wc,
928 loff_t user_pos, unsigned user_len)
929{
930 int i;
931 unsigned from, to;
932 struct page *tmppage;
912 933
913 /* 934 ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len);
914 * XXX: This is slow, but simple. The caller of
915 * ocfs2_buffered_write_cluster() is responsible for
916 * passing through the iovecs, so it's difficult to
917 * predict what our next step is in here after our
918 * initial write. A future version should be pushing
919 * that iovec manipulation further down.
920 *
921 * By setting this, we indicate that a copy from user
922 * data was done, and subsequent calls for this
923 * cluster will skip copying more data.
924 */
925 wc->w_finished_copy = 1;
926 935
927 *ret_from = from; 936 if (wc->w_large_pages) {
928 *ret_to = to; 937 from = wc->w_target_from;
929out: 938 to = wc->w_target_to;
939 } else {
940 from = 0;
941 to = PAGE_CACHE_SIZE;
942 }
943
944 for(i = 0; i < wc->w_num_pages; i++) {
945 tmppage = wc->w_pages[i];
930 946
931 return bytes ? (unsigned int)bytes : ret; 947 if (ocfs2_should_order_data(inode))
948 walk_page_buffers(wc->w_handle, page_buffers(tmppage),
949 from, to, NULL,
950 ocfs2_journal_dirty_data);
951
952 block_commit_write(tmppage, from, to);
953 }
932} 954}
933 955
934/* 956static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
935 * Map, fill and write a page to disk. 957 struct ocfs2_write_ctxt *wc,
936 * 958 struct page *page, u32 cpos,
937 * The work of copying data is done via callback. Newly allocated 959 loff_t user_pos, unsigned user_len,
938 * pages which don't take user data will be zero'd (set 'new' to 960 int new)
939 * indicate an allocating write)
940 *
941 * Returns a negative error code or the number of bytes copied into
942 * the page.
943 */
944static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
945 u64 *p_blkno, struct page *page,
946 struct ocfs2_write_ctxt *wc, int new)
947{ 961{
948 int ret, copied = 0; 962 int ret;
949 unsigned int from = 0, to = 0; 963 unsigned int map_from = 0, map_to = 0;
950 unsigned int cluster_start, cluster_end; 964 unsigned int cluster_start, cluster_end;
951 unsigned int zero_from = 0, zero_to = 0; 965 unsigned int user_data_from = 0, user_data_to = 0;
952 966
953 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, 967 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
954 &cluster_start, &cluster_end); 968 &cluster_start, &cluster_end);
955 969
956 if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index 970 if (page == wc->w_target_page) {
957 && !wc->w_finished_copy) { 971 map_from = user_pos & (PAGE_CACHE_SIZE - 1);
958 972 map_to = map_from + user_len;
959 wc->w_this_page = page; 973
960 wc->w_this_page_new = new; 974 if (new)
961 ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); 975 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
962 if (ret < 0) { 976 cluster_start, cluster_end,
977 new);
978 else
979 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
980 map_from, map_to, new);
981 if (ret) {
963 mlog_errno(ret); 982 mlog_errno(ret);
964 goto out; 983 goto out;
965 } 984 }
966 985
967 copied = ret; 986 user_data_from = map_from;
968 987 user_data_to = map_to;
969 zero_from = from;
970 zero_to = to;
971 if (new) { 988 if (new) {
972 from = cluster_start; 989 map_from = cluster_start;
973 to = cluster_end; 990 map_to = cluster_end;
974 } 991 }
992
993 wc->w_target_from = map_from;
994 wc->w_target_to = map_to;
975 } else { 995 } else {
976 /* 996 /*
977 * If we haven't allocated the new page yet, we 997 * If we haven't allocated the new page yet, we
@@ -980,11 +1000,11 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
980 */ 1000 */
981 BUG_ON(!new); 1001 BUG_ON(!new);
982 1002
983 from = cluster_start; 1003 map_from = cluster_start;
984 to = cluster_end; 1004 map_to = cluster_end;
985 1005
986 ret = ocfs2_map_page_blocks(page, p_blkno, inode, 1006 ret = ocfs2_map_page_blocks(page, p_blkno, inode,
987 cluster_start, cluster_end, 1); 1007 cluster_start, cluster_end, new);
988 if (ret) { 1008 if (ret) {
989 mlog_errno(ret); 1009 mlog_errno(ret);
990 goto out; 1010 goto out;
@@ -1003,108 +1023,113 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
1003 */ 1023 */
1004 if (new && !PageUptodate(page)) 1024 if (new && !PageUptodate(page))
1005 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), 1025 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1006 wc->w_cpos, zero_from, zero_to); 1026 cpos, user_data_from, user_data_to);
1007 1027
1008 flush_dcache_page(page); 1028 flush_dcache_page(page);
1009 1029
1010 if (ocfs2_should_order_data(inode)) {
1011 ret = walk_page_buffers(handle,
1012 page_buffers(page),
1013 from, to, NULL,
1014 ocfs2_journal_dirty_data);
1015 if (ret < 0)
1016 mlog_errno(ret);
1017 }
1018
1019 /*
1020 * We don't use generic_commit_write() because we need to
1021 * handle our own i_size update.
1022 */
1023 ret = block_commit_write(page, from, to);
1024 if (ret)
1025 mlog_errno(ret);
1026out: 1030out:
1027 1031 return ret;
1028 return copied ? copied : ret;
1029} 1032}
1030 1033
1031/* 1034/*
1032 * Do the actual write of some data into an inode. Optionally allocate 1035 * This function will only grab one clusters worth of pages.
1033 * in order to fulfill the write.
1034 *
1035 * cpos is the logical cluster offset within the file to write at
1036 *
1037 * 'phys' is the physical mapping of that offset. a 'phys' value of
1038 * zero indicates that allocation is required. In this case, data_ac
1039 * and meta_ac should be valid (meta_ac can be null if metadata
1040 * allocation isn't required).
1041 */ 1036 */
1042static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, 1037static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1043 struct buffer_head *di_bh, 1038 struct ocfs2_write_ctxt *wc,
1044 struct ocfs2_alloc_context *data_ac, 1039 u32 cpos, loff_t user_pos, int new,
1045 struct ocfs2_alloc_context *meta_ac, 1040 struct page *mmap_page)
1046 struct ocfs2_write_ctxt *wc)
1047{ 1041{
1048 int ret, i, numpages = 1, new; 1042 int ret = 0, i;
1049 unsigned int copied = 0; 1043 unsigned long start, target_index, index;
1050 u32 tmp_pos;
1051 u64 v_blkno, p_blkno;
1052 struct address_space *mapping = file->f_mapping;
1053 struct inode *inode = mapping->host; 1044 struct inode *inode = mapping->host;
1054 unsigned long index, start;
1055 struct page **cpages;
1056 1045
1057 new = phys == 0 ? 1 : 0; 1046 target_index = user_pos >> PAGE_CACHE_SHIFT;
1058 1047
1059 /* 1048 /*
1060 * Figure out how many pages we'll be manipulating here. For 1049 * Figure out how many pages we'll be manipulating here. For
1061 * non allocating write, we just change the one 1050 * non allocating write, we just change the one
1062 * page. Otherwise, we'll need a whole clusters worth. 1051 * page. Otherwise, we'll need a whole clusters worth.
1063 */ 1052 */
1064 if (new)
1065 numpages = ocfs2_pages_per_cluster(inode->i_sb);
1066
1067 cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
1068 if (!cpages) {
1069 ret = -ENOMEM;
1070 mlog_errno(ret);
1071 return ret;
1072 }
1073
1074 /*
1075 * Fill our page array first. That way we've grabbed enough so
1076 * that we can zero and flush if we error after adding the
1077 * extent.
1078 */
1079 if (new) { 1053 if (new) {
1080 start = ocfs2_align_clusters_to_page_index(inode->i_sb, 1054 wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
1081 wc->w_cpos); 1055 start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
1082 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
1083 } else { 1056 } else {
1084 start = wc->w_pos >> PAGE_CACHE_SHIFT; 1057 wc->w_num_pages = 1;
1085 v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; 1058 start = target_index;
1086 } 1059 }
1087 1060
1088 for(i = 0; i < numpages; i++) { 1061 for(i = 0; i < wc->w_num_pages; i++) {
1089 index = start + i; 1062 index = start + i;
1090 1063
1091 cpages[i] = find_or_create_page(mapping, index, GFP_NOFS); 1064 if (index == target_index && mmap_page) {
1092 if (!cpages[i]) { 1065 /*
1093 ret = -ENOMEM; 1066 * ocfs2_pagemkwrite() is a little different
1094 mlog_errno(ret); 1067 * and wants us to directly use the page
1095 goto out; 1068 * passed in.
1069 */
1070 lock_page(mmap_page);
1071
1072 if (mmap_page->mapping != mapping) {
1073 unlock_page(mmap_page);
1074 /*
1075 * Sanity check - the locking in
1076 * ocfs2_pagemkwrite() should ensure
1077 * that this code doesn't trigger.
1078 */
1079 ret = -EINVAL;
1080 mlog_errno(ret);
1081 goto out;
1082 }
1083
1084 page_cache_get(mmap_page);
1085 wc->w_pages[i] = mmap_page;
1086 } else {
1087 wc->w_pages[i] = find_or_create_page(mapping, index,
1088 GFP_NOFS);
1089 if (!wc->w_pages[i]) {
1090 ret = -ENOMEM;
1091 mlog_errno(ret);
1092 goto out;
1093 }
1096 } 1094 }
1095
1096 if (index == target_index)
1097 wc->w_target_page = wc->w_pages[i];
1097 } 1098 }
1099out:
1100 return ret;
1101}
1102
1103/*
1104 * Prepare a single cluster for write one cluster into the file.
1105 */
1106static int ocfs2_write_cluster(struct address_space *mapping,
1107 u32 phys, unsigned int unwritten,
1108 struct ocfs2_alloc_context *data_ac,
1109 struct ocfs2_alloc_context *meta_ac,
1110 struct ocfs2_write_ctxt *wc, u32 cpos,
1111 loff_t user_pos, unsigned user_len)
1112{
1113 int ret, i, new, should_zero = 0;
1114 u64 v_blkno, p_blkno;
1115 struct inode *inode = mapping->host;
1116
1117 new = phys == 0 ? 1 : 0;
1118 if (new || unwritten)
1119 should_zero = 1;
1098 1120
1099 if (new) { 1121 if (new) {
1122 u32 tmp_pos;
1123
1100 /* 1124 /*
1101 * This is safe to call with the page locks - it won't take 1125 * This is safe to call with the page locks - it won't take
1102 * any additional semaphores or cluster locks. 1126 * any additional semaphores or cluster locks.
1103 */ 1127 */
1104 tmp_pos = wc->w_cpos; 1128 tmp_pos = cpos;
1105 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, 1129 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
1106 &tmp_pos, 1, di_bh, handle, 1130 &tmp_pos, 1, 0, wc->w_di_bh,
1107 data_ac, meta_ac, NULL); 1131 wc->w_handle, data_ac,
1132 meta_ac, NULL);
1108 /* 1133 /*
1109 * This shouldn't happen because we must have already 1134 * This shouldn't happen because we must have already
1110 * calculated the correct meta data allocation required. The 1135 * calculated the correct meta data allocation required. The
@@ -1121,159 +1146,433 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
1121 mlog_errno(ret); 1146 mlog_errno(ret);
1122 goto out; 1147 goto out;
1123 } 1148 }
1149 } else if (unwritten) {
1150 ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
1151 wc->w_handle, cpos, 1, phys,
1152 meta_ac, &wc->w_dealloc);
1153 if (ret < 0) {
1154 mlog_errno(ret);
1155 goto out;
1156 }
1124 } 1157 }
1125 1158
1159 if (should_zero)
1160 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
1161 else
1162 v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1163
1164 /*
1165 * The only reason this should fail is due to an inability to
1166 * find the extent added.
1167 */
1126 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1168 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1127 NULL); 1169 NULL);
1128 if (ret < 0) { 1170 if (ret < 0) {
1129 1171 ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, "
1130 /* 1172 "at logical block %llu",
1131 * XXX: Should we go readonly here? 1173 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1132 */ 1174 (unsigned long long)v_blkno);
1133
1134 mlog_errno(ret);
1135 goto out; 1175 goto out;
1136 } 1176 }
1137 1177
1138 BUG_ON(p_blkno == 0); 1178 BUG_ON(p_blkno == 0);
1139 1179
1140 for(i = 0; i < numpages; i++) { 1180 for(i = 0; i < wc->w_num_pages; i++) {
1141 ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], 1181 int tmpret;
1142 wc, new); 1182
1143 if (ret < 0) { 1183 tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
1144 mlog_errno(ret); 1184 wc->w_pages[i], cpos,
1145 goto out; 1185 user_pos, user_len,
1186 should_zero);
1187 if (tmpret) {
1188 mlog_errno(tmpret);
1189 if (ret == 0)
1190 tmpret = ret;
1146 } 1191 }
1147
1148 copied += ret;
1149 } 1192 }
1150 1193
1194 /*
1195 * We only have cleanup to do in case of allocating write.
1196 */
1197 if (ret && new)
1198 ocfs2_write_failure(inode, wc, user_pos, user_len);
1199
1151out: 1200out:
1152 for(i = 0; i < numpages; i++) { 1201
1153 unlock_page(cpages[i]); 1202 return ret;
1154 mark_page_accessed(cpages[i]); 1203}
1155 page_cache_release(cpages[i]); 1204
1205static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
1206 struct ocfs2_alloc_context *data_ac,
1207 struct ocfs2_alloc_context *meta_ac,
1208 struct ocfs2_write_ctxt *wc,
1209 loff_t pos, unsigned len)
1210{
1211 int ret, i;
1212 struct ocfs2_write_cluster_desc *desc;
1213
1214 for (i = 0; i < wc->w_clen; i++) {
1215 desc = &wc->w_desc[i];
1216
1217 ret = ocfs2_write_cluster(mapping, desc->c_phys,
1218 desc->c_unwritten, data_ac, meta_ac,
1219 wc, desc->c_cpos, pos, len);
1220 if (ret) {
1221 mlog_errno(ret);
1222 goto out;
1223 }
1156 } 1224 }
1157 kfree(cpages);
1158 1225
1159 return copied ? copied : ret; 1226 ret = 0;
1227out:
1228 return ret;
1160} 1229}
1161 1230
1162static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, 1231/*
1163 struct ocfs2_super *osb, loff_t pos, 1232 * ocfs2_write_end() wants to know which parts of the target page it
1164 size_t count, ocfs2_page_writer *cb, 1233 * should complete the write on. It's easiest to compute them ahead of
1165 void *cb_priv) 1234 * time when a more complete view of the write is available.
1235 */
1236static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1237 struct ocfs2_write_ctxt *wc,
1238 loff_t pos, unsigned len, int alloc)
1166{ 1239{
1167 wc->w_count = count; 1240 struct ocfs2_write_cluster_desc *desc;
1168 wc->w_pos = pos;
1169 wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
1170 wc->w_finished_copy = 0;
1171 1241
1172 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 1242 wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
1173 wc->w_large_pages = 1; 1243 wc->w_target_to = wc->w_target_from + len;
1174 else
1175 wc->w_large_pages = 0;
1176 1244
1177 wc->w_write_data_page = cb; 1245 if (alloc == 0)
1178 wc->w_private = cb_priv; 1246 return;
1247
1248 /*
1249 * Allocating write - we may have different boundaries based
1250 * on page size and cluster size.
1251 *
1252 * NOTE: We can no longer compute one value from the other as
1253 * the actual write length and user provided length may be
1254 * different.
1255 */
1256
1257 if (wc->w_large_pages) {
1258 /*
1259 * We only care about the 1st and last cluster within
1260 * our range and whether they should be zero'd or not. Either
1261 * value may be extended out to the start/end of a
1262 * newly allocated cluster.
1263 */
1264 desc = &wc->w_desc[0];
1265 if (ocfs2_should_zero_cluster(desc))
1266 ocfs2_figure_cluster_boundaries(osb,
1267 desc->c_cpos,
1268 &wc->w_target_from,
1269 NULL);
1270
1271 desc = &wc->w_desc[wc->w_clen - 1];
1272 if (ocfs2_should_zero_cluster(desc))
1273 ocfs2_figure_cluster_boundaries(osb,
1274 desc->c_cpos,
1275 NULL,
1276 &wc->w_target_to);
1277 } else {
1278 wc->w_target_from = 0;
1279 wc->w_target_to = PAGE_CACHE_SIZE;
1280 }
1179} 1281}
1180 1282
1181/* 1283/*
1182 * Write a cluster to an inode. The cluster may not be allocated yet, 1284 * Populate each single-cluster write descriptor in the write context
1183 * in which case it will be. This only exists for buffered writes - 1285 * with information about the i/o to be done.
1184 * O_DIRECT takes a more "traditional" path through the kernel.
1185 *
1186 * The caller is responsible for incrementing pos, written counts, etc
1187 * 1286 *
1188 * For file systems that don't support sparse files, pre-allocation 1287 * Returns the number of clusters that will have to be allocated, as
1189 * and page zeroing up until cpos should be done prior to this 1288 * well as a worst case estimate of the number of extent records that
1190 * function call. 1289 * would have to be created during a write to an unwritten region.
1191 *
1192 * Callers should be holding i_sem, and the rw cluster lock.
1193 *
1194 * Returns the number of user bytes written, or less than zero for
1195 * error.
1196 */ 1290 */
1197ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, 1291static int ocfs2_populate_write_desc(struct inode *inode,
1198 size_t count, ocfs2_page_writer *actor, 1292 struct ocfs2_write_ctxt *wc,
1199 void *priv) 1293 unsigned int *clusters_to_alloc,
1294 unsigned int *extents_to_split)
1295{
1296 int ret;
1297 struct ocfs2_write_cluster_desc *desc;
1298 unsigned int num_clusters = 0;
1299 unsigned int ext_flags = 0;
1300 u32 phys = 0;
1301 int i;
1302
1303 *clusters_to_alloc = 0;
1304 *extents_to_split = 0;
1305
1306 for (i = 0; i < wc->w_clen; i++) {
1307 desc = &wc->w_desc[i];
1308 desc->c_cpos = wc->w_cpos + i;
1309
1310 if (num_clusters == 0) {
1311 /*
1312 * Need to look up the next extent record.
1313 */
1314 ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
1315 &num_clusters, &ext_flags);
1316 if (ret) {
1317 mlog_errno(ret);
1318 goto out;
1319 }
1320
1321 /*
1322 * Assume worst case - that we're writing in
1323 * the middle of the extent.
1324 *
1325 * We can assume that the write proceeds from
1326 * left to right, in which case the extent
1327 * insert code is smart enough to coalesce the
1328 * next splits into the previous records created.
1329 */
1330 if (ext_flags & OCFS2_EXT_UNWRITTEN)
1331 *extents_to_split = *extents_to_split + 2;
1332 } else if (phys) {
1333 /*
1334 * Only increment phys if it doesn't describe
1335 * a hole.
1336 */
1337 phys++;
1338 }
1339
1340 desc->c_phys = phys;
1341 if (phys == 0) {
1342 desc->c_new = 1;
1343 *clusters_to_alloc = *clusters_to_alloc + 1;
1344 }
1345 if (ext_flags & OCFS2_EXT_UNWRITTEN)
1346 desc->c_unwritten = 1;
1347
1348 num_clusters--;
1349 }
1350
1351 ret = 0;
1352out:
1353 return ret;
1354}
1355
1356int ocfs2_write_begin_nolock(struct address_space *mapping,
1357 loff_t pos, unsigned len, unsigned flags,
1358 struct page **pagep, void **fsdata,
1359 struct buffer_head *di_bh, struct page *mmap_page)
1200{ 1360{
1201 int ret, credits = OCFS2_INODE_UPDATE_CREDITS; 1361 int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
1202 ssize_t written = 0; 1362 unsigned int clusters_to_alloc, extents_to_split;
1203 u32 phys; 1363 struct ocfs2_write_ctxt *wc;
1204 struct inode *inode = file->f_mapping->host; 1364 struct inode *inode = mapping->host;
1205 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1365 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1206 struct buffer_head *di_bh = NULL;
1207 struct ocfs2_dinode *di; 1366 struct ocfs2_dinode *di;
1208 struct ocfs2_alloc_context *data_ac = NULL; 1367 struct ocfs2_alloc_context *data_ac = NULL;
1209 struct ocfs2_alloc_context *meta_ac = NULL; 1368 struct ocfs2_alloc_context *meta_ac = NULL;
1210 handle_t *handle; 1369 handle_t *handle;
1211 struct ocfs2_write_ctxt wc;
1212
1213 ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
1214 1370
1215 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1371 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1216 if (ret) { 1372 if (ret) {
1217 mlog_errno(ret); 1373 mlog_errno(ret);
1218 goto out; 1374 return ret;
1219 } 1375 }
1220 di = (struct ocfs2_dinode *)di_bh->b_data;
1221
1222 /*
1223 * Take alloc sem here to prevent concurrent lookups. That way
1224 * the mapping, zeroing and tree manipulation within
1225 * ocfs2_write() will be safe against ->readpage(). This
1226 * should also serve to lock out allocation from a shared
1227 * writeable region.
1228 */
1229 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1230 1376
1231 ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); 1377 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
1378 &extents_to_split);
1232 if (ret) { 1379 if (ret) {
1233 mlog_errno(ret); 1380 mlog_errno(ret);
1234 goto out_meta; 1381 goto out;
1235 } 1382 }
1236 1383
1237 /* phys == 0 means that allocation is required. */ 1384 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1238 if (phys == 0) { 1385
1239 ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); 1386 /*
1387 * We set w_target_from, w_target_to here so that
1388 * ocfs2_write_end() knows which range in the target page to
1389 * write out. An allocation requires that we write the entire
1390 * cluster range.
1391 */
1392 if (clusters_to_alloc || extents_to_split) {
1393 /*
1394 * XXX: We are stretching the limits of
1395 * ocfs2_lock_allocators(). It greatly over-estimates
1396 * the work to be done.
1397 */
1398 ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
1399 extents_to_split, &data_ac, &meta_ac);
1240 if (ret) { 1400 if (ret) {
1241 mlog_errno(ret); 1401 mlog_errno(ret);
1242 goto out_meta; 1402 goto out;
1243 } 1403 }
1244 1404
1245 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); 1405 credits = ocfs2_calc_extend_credits(inode->i_sb, di,
1246 } 1406 clusters_to_alloc);
1247 1407
1248 ret = ocfs2_data_lock(inode, 1);
1249 if (ret) {
1250 mlog_errno(ret);
1251 goto out_meta;
1252 } 1408 }
1253 1409
1410 ocfs2_set_target_boundaries(osb, wc, pos, len,
1411 clusters_to_alloc + extents_to_split);
1412
1254 handle = ocfs2_start_trans(osb, credits); 1413 handle = ocfs2_start_trans(osb, credits);
1255 if (IS_ERR(handle)) { 1414 if (IS_ERR(handle)) {
1256 ret = PTR_ERR(handle); 1415 ret = PTR_ERR(handle);
1257 mlog_errno(ret); 1416 mlog_errno(ret);
1258 goto out_data; 1417 goto out;
1259 } 1418 }
1260 1419
1261 written = ocfs2_write(file, phys, handle, di_bh, data_ac, 1420 wc->w_handle = handle;
1262 meta_ac, &wc); 1421
1263 if (written < 0) { 1422 /*
1264 ret = written; 1423 * We don't want this to fail in ocfs2_write_end(), so do it
1424 * here.
1425 */
1426 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
1427 OCFS2_JOURNAL_ACCESS_WRITE);
1428 if (ret) {
1265 mlog_errno(ret); 1429 mlog_errno(ret);
1266 goto out_commit; 1430 goto out_commit;
1267 } 1431 }
1268 1432
1269 ret = ocfs2_journal_access(handle, inode, di_bh, 1433 /*
1270 OCFS2_JOURNAL_ACCESS_WRITE); 1434 * Fill our page array first. That way we've grabbed enough so
1435 * that we can zero and flush if we error after adding the
1436 * extent.
1437 */
1438 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
1439 clusters_to_alloc + extents_to_split,
1440 mmap_page);
1271 if (ret) { 1441 if (ret) {
1272 mlog_errno(ret); 1442 mlog_errno(ret);
1273 goto out_commit; 1443 goto out_commit;
1274 } 1444 }
1275 1445
1276 pos += written; 1446 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1447 len);
1448 if (ret) {
1449 mlog_errno(ret);
1450 goto out_commit;
1451 }
1452
1453 if (data_ac)
1454 ocfs2_free_alloc_context(data_ac);
1455 if (meta_ac)
1456 ocfs2_free_alloc_context(meta_ac);
1457
1458 *pagep = wc->w_target_page;
1459 *fsdata = wc;
1460 return 0;
1461out_commit:
1462 ocfs2_commit_trans(osb, handle);
1463
1464out:
1465 ocfs2_free_write_ctxt(wc);
1466
1467 if (data_ac)
1468 ocfs2_free_alloc_context(data_ac);
1469 if (meta_ac)
1470 ocfs2_free_alloc_context(meta_ac);
1471 return ret;
1472}
1473
1474int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1475 loff_t pos, unsigned len, unsigned flags,
1476 struct page **pagep, void **fsdata)
1477{
1478 int ret;
1479 struct buffer_head *di_bh = NULL;
1480 struct inode *inode = mapping->host;
1481
1482 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1483 if (ret) {
1484 mlog_errno(ret);
1485 return ret;
1486 }
1487
1488 /*
1489 * Take alloc sem here to prevent concurrent lookups. That way
1490 * the mapping, zeroing and tree manipulation within
1491 * ocfs2_write() will be safe against ->readpage(). This
1492 * should also serve to lock out allocation from a shared
1493 * writeable region.
1494 */
1495 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1496
1497 ret = ocfs2_data_lock(inode, 1);
1498 if (ret) {
1499 mlog_errno(ret);
1500 goto out_fail;
1501 }
1502
1503 ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
1504 fsdata, di_bh, NULL);
1505 if (ret) {
1506 mlog_errno(ret);
1507 goto out_fail_data;
1508 }
1509
1510 brelse(di_bh);
1511
1512 return 0;
1513
1514out_fail_data:
1515 ocfs2_data_unlock(inode, 1);
1516out_fail:
1517 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1518
1519 brelse(di_bh);
1520 ocfs2_meta_unlock(inode, 1);
1521
1522 return ret;
1523}
1524
1525int ocfs2_write_end_nolock(struct address_space *mapping,
1526 loff_t pos, unsigned len, unsigned copied,
1527 struct page *page, void *fsdata)
1528{
1529 int i;
1530 unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
1531 struct inode *inode = mapping->host;
1532 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1533 struct ocfs2_write_ctxt *wc = fsdata;
1534 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1535 handle_t *handle = wc->w_handle;
1536 struct page *tmppage;
1537
1538 if (unlikely(copied < len)) {
1539 if (!PageUptodate(wc->w_target_page))
1540 copied = 0;
1541
1542 ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
1543 start+len);
1544 }
1545 flush_dcache_page(wc->w_target_page);
1546
1547 for(i = 0; i < wc->w_num_pages; i++) {
1548 tmppage = wc->w_pages[i];
1549
1550 if (tmppage == wc->w_target_page) {
1551 from = wc->w_target_from;
1552 to = wc->w_target_to;
1553
1554 BUG_ON(from > PAGE_CACHE_SIZE ||
1555 to > PAGE_CACHE_SIZE ||
1556 to < from);
1557 } else {
1558 /*
1559 * Pages adjacent to the target (if any) imply
1560 * a hole-filling write in which case we want
1561 * to flush their entire range.
1562 */
1563 from = 0;
1564 to = PAGE_CACHE_SIZE;
1565 }
1566
1567 if (ocfs2_should_order_data(inode))
1568 walk_page_buffers(wc->w_handle, page_buffers(tmppage),
1569 from, to, NULL,
1570 ocfs2_journal_dirty_data);
1571
1572 block_commit_write(tmppage, from, to);
1573 }
1574
1575 pos += copied;
1277 if (pos > inode->i_size) { 1576 if (pos > inode->i_size) {
1278 i_size_write(inode, pos); 1577 i_size_write(inode, pos);
1279 mark_inode_dirty(inode); 1578 mark_inode_dirty(inode);
@@ -1283,29 +1582,31 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
1283 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1582 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1284 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 1583 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
1285 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1584 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1585 ocfs2_journal_dirty(handle, wc->w_di_bh);
1286 1586
1287 ret = ocfs2_journal_dirty(handle, di_bh);
1288 if (ret)
1289 mlog_errno(ret);
1290
1291out_commit:
1292 ocfs2_commit_trans(osb, handle); 1587 ocfs2_commit_trans(osb, handle);
1293 1588
1294out_data: 1589 ocfs2_run_deallocs(osb, &wc->w_dealloc);
1295 ocfs2_data_unlock(inode, 1); 1590
1591 ocfs2_free_write_ctxt(wc);
1592
1593 return copied;
1594}
1595
1596int ocfs2_write_end(struct file *file, struct address_space *mapping,
1597 loff_t pos, unsigned len, unsigned copied,
1598 struct page *page, void *fsdata)
1599{
1600 int ret;
1601 struct inode *inode = mapping->host;
1296 1602
1297out_meta: 1603 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
1604
1605 ocfs2_data_unlock(inode, 1);
1298 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1606 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1299 ocfs2_meta_unlock(inode, 1); 1607 ocfs2_meta_unlock(inode, 1);
1300 1608
1301out: 1609 return ret;
1302 brelse(di_bh);
1303 if (data_ac)
1304 ocfs2_free_alloc_context(data_ac);
1305 if (meta_ac)
1306 ocfs2_free_alloc_context(meta_ac);
1307
1308 return written ? written : ret;
1309} 1610}
1310 1611
1311const struct address_space_operations ocfs2_aops = { 1612const struct address_space_operations ocfs2_aops = {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 45821d479b5a..389579bd64e3 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -42,57 +42,22 @@ int walk_page_buffers( handle_t *handle,
42 int (*fn)( handle_t *handle, 42 int (*fn)( handle_t *handle,
43 struct buffer_head *bh)); 43 struct buffer_head *bh));
44 44
45struct ocfs2_write_ctxt; 45int ocfs2_write_begin(struct file *file, struct address_space *mapping,
46typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, 46 loff_t pos, unsigned len, unsigned flags,
47 u64 *, unsigned int *, unsigned int *); 47 struct page **pagep, void **fsdata);
48 48
49ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, 49int ocfs2_write_end(struct file *file, struct address_space *mapping,
50 size_t count, ocfs2_page_writer *actor, 50 loff_t pos, unsigned len, unsigned copied,
51 void *priv); 51 struct page *page, void *fsdata);
52 52
53struct ocfs2_write_ctxt { 53int ocfs2_write_end_nolock(struct address_space *mapping,
54 size_t w_count; 54 loff_t pos, unsigned len, unsigned copied,
55 loff_t w_pos; 55 struct page *page, void *fsdata);
56 u32 w_cpos;
57 unsigned int w_finished_copy;
58 56
59 /* This is true if page_size > cluster_size */ 57int ocfs2_write_begin_nolock(struct address_space *mapping,
60 unsigned int w_large_pages; 58 loff_t pos, unsigned len, unsigned flags,
61 59 struct page **pagep, void **fsdata,
62 /* Filler callback and private data */ 60 struct buffer_head *di_bh, struct page *mmap_page);
63 ocfs2_page_writer *w_write_data_page;
64 void *w_private;
65
66 /* Only valid for the filler callback */
67 struct page *w_this_page;
68 unsigned int w_this_page_new;
69};
70
71struct ocfs2_buffered_write_priv {
72 char *b_src_buf;
73 const struct iovec *b_cur_iov; /* Current iovec */
74 size_t b_cur_off; /* Offset in the
75 * current iovec */
76};
77int ocfs2_map_and_write_user_data(struct inode *inode,
78 struct ocfs2_write_ctxt *wc,
79 u64 *p_blkno,
80 unsigned int *ret_from,
81 unsigned int *ret_to);
82
83struct ocfs2_splice_write_priv {
84 struct splice_desc *s_sd;
85 struct pipe_buffer *s_buf;
86 struct pipe_inode_info *s_pipe;
87 /* Neither offset value is ever larger than one page */
88 unsigned int s_offset;
89 unsigned int s_buf_offset;
90};
91int ocfs2_map_and_write_splice_data(struct inode *inode,
92 struct ocfs2_write_ctxt *wc,
93 u64 *p_blkno,
94 unsigned int *ret_from,
95 unsigned int *ret_to);
96 61
97/* all ocfs2_dio_end_io()'s fault */ 62/* all ocfs2_dio_end_io()'s fault */
98#define ocfs2_iocb_is_rw_locked(iocb) \ 63#define ocfs2_iocb_is_rw_locked(iocb) \
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 979113479c66..2bd7f788cf34 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1335,6 +1335,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1335 ret = wait_event_interruptible(o2hb_steady_queue, 1335 ret = wait_event_interruptible(o2hb_steady_queue,
1336 atomic_read(&reg->hr_steady_iterations) == 0); 1336 atomic_read(&reg->hr_steady_iterations) == 0);
1337 if (ret) { 1337 if (ret) {
1338 /* We got interrupted (hello ptrace!). Clean up */
1338 spin_lock(&o2hb_live_lock); 1339 spin_lock(&o2hb_live_lock);
1339 hb_task = reg->hr_task; 1340 hb_task = reg->hr_task;
1340 reg->hr_task = NULL; 1341 reg->hr_task = NULL;
@@ -1345,7 +1346,16 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1345 goto out; 1346 goto out;
1346 } 1347 }
1347 1348
1348 ret = count; 1349 /* Ok, we were woken. Make sure it wasn't by drop_item() */
1350 spin_lock(&o2hb_live_lock);
1351 hb_task = reg->hr_task;
1352 spin_unlock(&o2hb_live_lock);
1353
1354 if (hb_task)
1355 ret = count;
1356 else
1357 ret = -EIO;
1358
1349out: 1359out:
1350 if (filp) 1360 if (filp)
1351 fput(filp); 1361 fput(filp);
@@ -1523,6 +1533,15 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1523 if (hb_task) 1533 if (hb_task)
1524 kthread_stop(hb_task); 1534 kthread_stop(hb_task);
1525 1535
1536 /*
1537 * If we're racing a dev_write(), we need to wake them. They will
1538 * check reg->hr_task
1539 */
1540 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1541 atomic_set(&reg->hr_steady_iterations, 0);
1542 wake_up(&o2hb_steady_queue);
1543 }
1544
1526 config_item_put(item); 1545 config_item_put(item);
1527} 1546}
1528 1547
@@ -1665,7 +1684,67 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
1665} 1684}
1666EXPORT_SYMBOL_GPL(o2hb_setup_callback); 1685EXPORT_SYMBOL_GPL(o2hb_setup_callback);
1667 1686
1668int o2hb_register_callback(struct o2hb_callback_func *hc) 1687static struct o2hb_region *o2hb_find_region(const char *region_uuid)
1688{
1689 struct o2hb_region *p, *reg = NULL;
1690
1691 assert_spin_locked(&o2hb_live_lock);
1692
1693 list_for_each_entry(p, &o2hb_all_regions, hr_all_item) {
1694 if (!strcmp(region_uuid, config_item_name(&p->hr_item))) {
1695 reg = p;
1696 break;
1697 }
1698 }
1699
1700 return reg;
1701}
1702
1703static int o2hb_region_get(const char *region_uuid)
1704{
1705 int ret = 0;
1706 struct o2hb_region *reg;
1707
1708 spin_lock(&o2hb_live_lock);
1709
1710 reg = o2hb_find_region(region_uuid);
1711 if (!reg)
1712 ret = -ENOENT;
1713 spin_unlock(&o2hb_live_lock);
1714
1715 if (ret)
1716 goto out;
1717
1718 ret = o2nm_depend_this_node();
1719 if (ret)
1720 goto out;
1721
1722 ret = o2nm_depend_item(&reg->hr_item);
1723 if (ret)
1724 o2nm_undepend_this_node();
1725
1726out:
1727 return ret;
1728}
1729
1730static void o2hb_region_put(const char *region_uuid)
1731{
1732 struct o2hb_region *reg;
1733
1734 spin_lock(&o2hb_live_lock);
1735
1736 reg = o2hb_find_region(region_uuid);
1737
1738 spin_unlock(&o2hb_live_lock);
1739
1740 if (reg) {
1741 o2nm_undepend_item(&reg->hr_item);
1742 o2nm_undepend_this_node();
1743 }
1744}
1745
1746int o2hb_register_callback(const char *region_uuid,
1747 struct o2hb_callback_func *hc)
1669{ 1748{
1670 struct o2hb_callback_func *tmp; 1749 struct o2hb_callback_func *tmp;
1671 struct list_head *iter; 1750 struct list_head *iter;
@@ -1681,6 +1760,12 @@ int o2hb_register_callback(struct o2hb_callback_func *hc)
1681 goto out; 1760 goto out;
1682 } 1761 }
1683 1762
1763 if (region_uuid) {
1764 ret = o2hb_region_get(region_uuid);
1765 if (ret)
1766 goto out;
1767 }
1768
1684 down_write(&o2hb_callback_sem); 1769 down_write(&o2hb_callback_sem);
1685 1770
1686 list_for_each(iter, &hbcall->list) { 1771 list_for_each(iter, &hbcall->list) {
@@ -1702,16 +1787,21 @@ out:
1702} 1787}
1703EXPORT_SYMBOL_GPL(o2hb_register_callback); 1788EXPORT_SYMBOL_GPL(o2hb_register_callback);
1704 1789
1705void o2hb_unregister_callback(struct o2hb_callback_func *hc) 1790void o2hb_unregister_callback(const char *region_uuid,
1791 struct o2hb_callback_func *hc)
1706{ 1792{
1707 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); 1793 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
1708 1794
1709 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", 1795 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
1710 __builtin_return_address(0), hc); 1796 __builtin_return_address(0), hc);
1711 1797
1798 /* XXX Can this happen _with_ a region reference? */
1712 if (list_empty(&hc->hc_item)) 1799 if (list_empty(&hc->hc_item))
1713 return; 1800 return;
1714 1801
1802 if (region_uuid)
1803 o2hb_region_put(region_uuid);
1804
1715 down_write(&o2hb_callback_sem); 1805 down_write(&o2hb_callback_sem);
1716 1806
1717 list_del_init(&hc->hc_item); 1807 list_del_init(&hc->hc_item);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index cc6d40b39771..35397dd5ecdb 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -69,8 +69,10 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
69 o2hb_cb_func *func, 69 o2hb_cb_func *func,
70 void *data, 70 void *data,
71 int priority); 71 int priority);
72int o2hb_register_callback(struct o2hb_callback_func *hc); 72int o2hb_register_callback(const char *region_uuid,
73void o2hb_unregister_callback(struct o2hb_callback_func *hc); 73 struct o2hb_callback_func *hc);
74void o2hb_unregister_callback(const char *region_uuid,
75 struct o2hb_callback_func *hc);
74void o2hb_fill_node_map(unsigned long *map, 76void o2hb_fill_node_map(unsigned long *map,
75 unsigned bytes); 77 unsigned bytes);
76void o2hb_init(void); 78void o2hb_init(void);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 9f5ad0f01ce0..af2070da308b 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -900,6 +900,46 @@ static struct o2nm_cluster_group o2nm_cluster_group = {
900 }, 900 },
901}; 901};
902 902
903int o2nm_depend_item(struct config_item *item)
904{
905 return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item);
906}
907
908void o2nm_undepend_item(struct config_item *item)
909{
910 configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
911}
912
913int o2nm_depend_this_node(void)
914{
915 int ret = 0;
916 struct o2nm_node *local_node;
917
918 local_node = o2nm_get_node_by_num(o2nm_this_node());
919 if (!local_node) {
920 ret = -EINVAL;
921 goto out;
922 }
923
924 ret = o2nm_depend_item(&local_node->nd_item);
925 o2nm_node_put(local_node);
926
927out:
928 return ret;
929}
930
931void o2nm_undepend_this_node(void)
932{
933 struct o2nm_node *local_node;
934
935 local_node = o2nm_get_node_by_num(o2nm_this_node());
936 BUG_ON(!local_node);
937
938 o2nm_undepend_item(&local_node->nd_item);
939 o2nm_node_put(local_node);
940}
941
942
903static void __exit exit_o2nm(void) 943static void __exit exit_o2nm(void)
904{ 944{
905 if (ocfs2_table_header) 945 if (ocfs2_table_header)
@@ -934,7 +974,7 @@ static int __init init_o2nm(void)
934 goto out_sysctl; 974 goto out_sysctl;
935 975
936 config_group_init(&o2nm_cluster_group.cs_subsys.su_group); 976 config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
937 init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem); 977 mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex);
938 ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys); 978 ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
939 if (ret) { 979 if (ret) {
940 printk(KERN_ERR "nodemanager: Registration returned %d\n", ret); 980 printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index 070522138ae2..7c860361b8dd 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -77,4 +77,9 @@ struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
77void o2nm_node_get(struct o2nm_node *node); 77void o2nm_node_get(struct o2nm_node *node);
78void o2nm_node_put(struct o2nm_node *node); 78void o2nm_node_put(struct o2nm_node *node);
79 79
80int o2nm_depend_item(struct config_item *item);
81void o2nm_undepend_item(struct config_item *item);
82int o2nm_depend_this_node(void);
83void o2nm_undepend_this_node(void);
84
80#endif /* O2CLUSTER_NODEMANAGER_H */ 85#endif /* O2CLUSTER_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0b229a9c7952..f0bdfd944c44 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -261,14 +261,12 @@ out:
261 261
262static void o2net_complete_nodes_nsw(struct o2net_node *nn) 262static void o2net_complete_nodes_nsw(struct o2net_node *nn)
263{ 263{
264 struct list_head *iter, *tmp; 264 struct o2net_status_wait *nsw, *tmp;
265 unsigned int num_kills = 0; 265 unsigned int num_kills = 0;
266 struct o2net_status_wait *nsw;
267 266
268 assert_spin_locked(&nn->nn_lock); 267 assert_spin_locked(&nn->nn_lock);
269 268
270 list_for_each_safe(iter, tmp, &nn->nn_status_list) { 269 list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) {
271 nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
272 o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0); 270 o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
273 num_kills++; 271 num_kills++;
274 } 272 }
@@ -764,13 +762,10 @@ EXPORT_SYMBOL_GPL(o2net_register_handler);
764 762
765void o2net_unregister_handler_list(struct list_head *list) 763void o2net_unregister_handler_list(struct list_head *list)
766{ 764{
767 struct list_head *pos, *n; 765 struct o2net_msg_handler *nmh, *n;
768 struct o2net_msg_handler *nmh;
769 766
770 write_lock(&o2net_handler_lock); 767 write_lock(&o2net_handler_lock);
771 list_for_each_safe(pos, n, list) { 768 list_for_each_entry_safe(nmh, n, list, nh_unregister_item) {
772 nmh = list_entry(pos, struct o2net_msg_handler,
773 nh_unregister_item);
774 mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", 769 mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
775 nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); 770 nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
776 rb_erase(&nmh->nh_node, &o2net_handler_tree); 771 rb_erase(&nmh->nh_node, &o2net_handler_tree);
@@ -1638,8 +1633,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1638 1633
1639void o2net_unregister_hb_callbacks(void) 1634void o2net_unregister_hb_callbacks(void)
1640{ 1635{
1641 o2hb_unregister_callback(&o2net_hb_up); 1636 o2hb_unregister_callback(NULL, &o2net_hb_up);
1642 o2hb_unregister_callback(&o2net_hb_down); 1637 o2hb_unregister_callback(NULL, &o2net_hb_down);
1643} 1638}
1644 1639
1645int o2net_register_hb_callbacks(void) 1640int o2net_register_hb_callbacks(void)
@@ -1651,9 +1646,9 @@ int o2net_register_hb_callbacks(void)
1651 o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB, 1646 o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
1652 o2net_hb_node_up_cb, NULL, O2NET_HB_PRI); 1647 o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
1653 1648
1654 ret = o2hb_register_callback(&o2net_hb_up); 1649 ret = o2hb_register_callback(NULL, &o2net_hb_up);
1655 if (ret == 0) 1650 if (ret == 0)
1656 ret = o2hb_register_callback(&o2net_hb_down); 1651 ret = o2hb_register_callback(NULL, &o2net_hb_down);
1657 1652
1658 if (ret) 1653 if (ret)
1659 o2net_unregister_hb_callbacks(); 1654 o2net_unregister_hb_callbacks();
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c441ef1f2bad..0d5fdde959c8 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -368,7 +368,7 @@ int ocfs2_do_extend_dir(struct super_block *sb,
368 u32 offset = OCFS2_I(dir)->ip_clusters; 368 u32 offset = OCFS2_I(dir)->ip_clusters;
369 369
370 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, 370 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
371 1, parent_fe_bh, handle, 371 1, 0, parent_fe_bh, handle,
372 data_ac, meta_ac, NULL); 372 data_ac, meta_ac, NULL);
373 BUG_ON(status == -EAGAIN); 373 BUG_ON(status == -EAGAIN);
374 if (status < 0) { 374 if (status < 0) {
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d836b98dd99a..6954565b8ccb 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1128,8 +1128,8 @@ bail:
1128 1128
1129static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) 1129static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
1130{ 1130{
1131 o2hb_unregister_callback(&dlm->dlm_hb_up); 1131 o2hb_unregister_callback(NULL, &dlm->dlm_hb_up);
1132 o2hb_unregister_callback(&dlm->dlm_hb_down); 1132 o2hb_unregister_callback(NULL, &dlm->dlm_hb_down);
1133 o2net_unregister_handler_list(&dlm->dlm_domain_handlers); 1133 o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
1134} 1134}
1135 1135
@@ -1141,13 +1141,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
1141 1141
1142 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, 1142 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
1143 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); 1143 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
1144 status = o2hb_register_callback(&dlm->dlm_hb_down); 1144 status = o2hb_register_callback(NULL, &dlm->dlm_hb_down);
1145 if (status) 1145 if (status)
1146 goto bail; 1146 goto bail;
1147 1147
1148 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, 1148 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
1149 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); 1149 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
1150 status = o2hb_register_callback(&dlm->dlm_hb_up); 1150 status = o2hb_register_callback(NULL, &dlm->dlm_hb_up);
1151 if (status) 1151 if (status)
1152 goto bail; 1152 goto bail;
1153 1153
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 6edffca99d98..65b2b9b92688 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -192,25 +192,20 @@ static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
192static void dlm_dump_mles(struct dlm_ctxt *dlm) 192static void dlm_dump_mles(struct dlm_ctxt *dlm)
193{ 193{
194 struct dlm_master_list_entry *mle; 194 struct dlm_master_list_entry *mle;
195 struct list_head *iter;
196 195
197 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); 196 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
198 spin_lock(&dlm->master_lock); 197 spin_lock(&dlm->master_lock);
199 list_for_each(iter, &dlm->master_list) { 198 list_for_each_entry(mle, &dlm->master_list, list)
200 mle = list_entry(iter, struct dlm_master_list_entry, list);
201 dlm_print_one_mle(mle); 199 dlm_print_one_mle(mle);
202 }
203 spin_unlock(&dlm->master_lock); 200 spin_unlock(&dlm->master_lock);
204} 201}
205 202
206int dlm_dump_all_mles(const char __user *data, unsigned int len) 203int dlm_dump_all_mles(const char __user *data, unsigned int len)
207{ 204{
208 struct list_head *iter;
209 struct dlm_ctxt *dlm; 205 struct dlm_ctxt *dlm;
210 206
211 spin_lock(&dlm_domain_lock); 207 spin_lock(&dlm_domain_lock);
212 list_for_each(iter, &dlm_domains) { 208 list_for_each_entry(dlm, &dlm_domains, list) {
213 dlm = list_entry (iter, struct dlm_ctxt, list);
214 mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name); 209 mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
215 dlm_dump_mles(dlm); 210 dlm_dump_mles(dlm);
216 } 211 }
@@ -454,12 +449,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
454 char *name, unsigned int namelen) 449 char *name, unsigned int namelen)
455{ 450{
456 struct dlm_master_list_entry *tmpmle; 451 struct dlm_master_list_entry *tmpmle;
457 struct list_head *iter;
458 452
459 assert_spin_locked(&dlm->master_lock); 453 assert_spin_locked(&dlm->master_lock);
460 454
461 list_for_each(iter, &dlm->master_list) { 455 list_for_each_entry(tmpmle, &dlm->master_list, list) {
462 tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
463 if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) 456 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
464 continue; 457 continue;
465 dlm_get_mle(tmpmle); 458 dlm_get_mle(tmpmle);
@@ -472,13 +465,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
472void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) 465void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
473{ 466{
474 struct dlm_master_list_entry *mle; 467 struct dlm_master_list_entry *mle;
475 struct list_head *iter;
476 468
477 assert_spin_locked(&dlm->spinlock); 469 assert_spin_locked(&dlm->spinlock);
478 470
479 list_for_each(iter, &dlm->mle_hb_events) { 471 list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
480 mle = list_entry(iter, struct dlm_master_list_entry,
481 hb_events);
482 if (node_up) 472 if (node_up)
483 dlm_mle_node_up(dlm, mle, NULL, idx); 473 dlm_mle_node_up(dlm, mle, NULL, idx);
484 else 474 else
@@ -2434,7 +2424,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2434 int ret; 2424 int ret;
2435 int i; 2425 int i;
2436 int count = 0; 2426 int count = 0;
2437 struct list_head *queue, *iter; 2427 struct list_head *queue;
2438 struct dlm_lock *lock; 2428 struct dlm_lock *lock;
2439 2429
2440 assert_spin_locked(&res->spinlock); 2430 assert_spin_locked(&res->spinlock);
@@ -2453,8 +2443,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2453 ret = 0; 2443 ret = 0;
2454 queue = &res->granted; 2444 queue = &res->granted;
2455 for (i = 0; i < 3; i++) { 2445 for (i = 0; i < 3; i++) {
2456 list_for_each(iter, queue) { 2446 list_for_each_entry(lock, queue, list) {
2457 lock = list_entry(iter, struct dlm_lock, list);
2458 ++count; 2447 ++count;
2459 if (lock->ml.node == dlm->node_num) { 2448 if (lock->ml.node == dlm->node_num) {
2460 mlog(0, "found a lock owned by this node still " 2449 mlog(0, "found a lock owned by this node still "
@@ -2923,18 +2912,16 @@ again:
2923static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, 2912static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2924 struct dlm_lock_resource *res) 2913 struct dlm_lock_resource *res)
2925{ 2914{
2926 struct list_head *iter, *iter2;
2927 struct list_head *queue = &res->granted; 2915 struct list_head *queue = &res->granted;
2928 int i, bit; 2916 int i, bit;
2929 struct dlm_lock *lock; 2917 struct dlm_lock *lock, *next;
2930 2918
2931 assert_spin_locked(&res->spinlock); 2919 assert_spin_locked(&res->spinlock);
2932 2920
2933 BUG_ON(res->owner == dlm->node_num); 2921 BUG_ON(res->owner == dlm->node_num);
2934 2922
2935 for (i=0; i<3; i++) { 2923 for (i=0; i<3; i++) {
2936 list_for_each_safe(iter, iter2, queue) { 2924 list_for_each_entry_safe(lock, next, queue, list) {
2937 lock = list_entry (iter, struct dlm_lock, list);
2938 if (lock->ml.node != dlm->node_num) { 2925 if (lock->ml.node != dlm->node_num) {
2939 mlog(0, "putting lock for node %u\n", 2926 mlog(0, "putting lock for node %u\n",
2940 lock->ml.node); 2927 lock->ml.node);
@@ -2976,7 +2963,6 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2976{ 2963{
2977 int i; 2964 int i;
2978 struct list_head *queue = &res->granted; 2965 struct list_head *queue = &res->granted;
2979 struct list_head *iter;
2980 struct dlm_lock *lock; 2966 struct dlm_lock *lock;
2981 int nodenum; 2967 int nodenum;
2982 2968
@@ -2984,10 +2970,9 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2984 2970
2985 spin_lock(&res->spinlock); 2971 spin_lock(&res->spinlock);
2986 for (i=0; i<3; i++) { 2972 for (i=0; i<3; i++) {
2987 list_for_each(iter, queue) { 2973 list_for_each_entry(lock, queue, list) {
2988 /* up to the caller to make sure this node 2974 /* up to the caller to make sure this node
2989 * is alive */ 2975 * is alive */
2990 lock = list_entry (iter, struct dlm_lock, list);
2991 if (lock->ml.node != dlm->node_num) { 2976 if (lock->ml.node != dlm->node_num) {
2992 spin_unlock(&res->spinlock); 2977 spin_unlock(&res->spinlock);
2993 return lock->ml.node; 2978 return lock->ml.node;
@@ -3234,8 +3219,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3234 3219
3235void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) 3220void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3236{ 3221{
3237 struct list_head *iter, *iter2; 3222 struct dlm_master_list_entry *mle, *next;
3238 struct dlm_master_list_entry *mle;
3239 struct dlm_lock_resource *res; 3223 struct dlm_lock_resource *res;
3240 unsigned int hash; 3224 unsigned int hash;
3241 3225
@@ -3245,9 +3229,7 @@ top:
3245 3229
3246 /* clean the master list */ 3230 /* clean the master list */
3247 spin_lock(&dlm->master_lock); 3231 spin_lock(&dlm->master_lock);
3248 list_for_each_safe(iter, iter2, &dlm->master_list) { 3232 list_for_each_entry_safe(mle, next, &dlm->master_list, list) {
3249 mle = list_entry(iter, struct dlm_master_list_entry, list);
3250
3251 BUG_ON(mle->type != DLM_MLE_BLOCK && 3233 BUG_ON(mle->type != DLM_MLE_BLOCK &&
3252 mle->type != DLM_MLE_MASTER && 3234 mle->type != DLM_MLE_MASTER &&
3253 mle->type != DLM_MLE_MIGRATION); 3235 mle->type != DLM_MLE_MIGRATION);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 671c4ed58ee2..a2c33160bfd6 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -158,8 +158,7 @@ void dlm_dispatch_work(struct work_struct *work)
158 struct dlm_ctxt *dlm = 158 struct dlm_ctxt *dlm =
159 container_of(work, struct dlm_ctxt, dispatched_work); 159 container_of(work, struct dlm_ctxt, dispatched_work);
160 LIST_HEAD(tmp_list); 160 LIST_HEAD(tmp_list);
161 struct list_head *iter, *iter2; 161 struct dlm_work_item *item, *next;
162 struct dlm_work_item *item;
163 dlm_workfunc_t *workfunc; 162 dlm_workfunc_t *workfunc;
164 int tot=0; 163 int tot=0;
165 164
@@ -167,13 +166,12 @@ void dlm_dispatch_work(struct work_struct *work)
167 list_splice_init(&dlm->work_list, &tmp_list); 166 list_splice_init(&dlm->work_list, &tmp_list);
168 spin_unlock(&dlm->work_lock); 167 spin_unlock(&dlm->work_lock);
169 168
170 list_for_each_safe(iter, iter2, &tmp_list) { 169 list_for_each_entry(item, &tmp_list, list) {
171 tot++; 170 tot++;
172 } 171 }
173 mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); 172 mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
174 173
175 list_for_each_safe(iter, iter2, &tmp_list) { 174 list_for_each_entry_safe(item, next, &tmp_list, list) {
176 item = list_entry(iter, struct dlm_work_item, list);
177 workfunc = item->func; 175 workfunc = item->func;
178 list_del_init(&item->list); 176 list_del_init(&item->list);
179 177
@@ -549,7 +547,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
549{ 547{
550 int status = 0; 548 int status = 0;
551 struct dlm_reco_node_data *ndata; 549 struct dlm_reco_node_data *ndata;
552 struct list_head *iter;
553 int all_nodes_done; 550 int all_nodes_done;
554 int destroy = 0; 551 int destroy = 0;
555 int pass = 0; 552 int pass = 0;
@@ -567,8 +564,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
567 564
568 /* safe to access the node data list without a lock, since this 565 /* safe to access the node data list without a lock, since this
569 * process is the only one to change the list */ 566 * process is the only one to change the list */
570 list_for_each(iter, &dlm->reco.node_data) { 567 list_for_each_entry(ndata, &dlm->reco.node_data, list) {
571 ndata = list_entry (iter, struct dlm_reco_node_data, list);
572 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); 568 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
573 ndata->state = DLM_RECO_NODE_DATA_REQUESTING; 569 ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
574 570
@@ -655,9 +651,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
655 * done, or if anyone died */ 651 * done, or if anyone died */
656 all_nodes_done = 1; 652 all_nodes_done = 1;
657 spin_lock(&dlm_reco_state_lock); 653 spin_lock(&dlm_reco_state_lock);
658 list_for_each(iter, &dlm->reco.node_data) { 654 list_for_each_entry(ndata, &dlm->reco.node_data, list) {
659 ndata = list_entry (iter, struct dlm_reco_node_data, list);
660
661 mlog(0, "checking recovery state of node %u\n", 655 mlog(0, "checking recovery state of node %u\n",
662 ndata->node_num); 656 ndata->node_num);
663 switch (ndata->state) { 657 switch (ndata->state) {
@@ -774,16 +768,14 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
774 768
775static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) 769static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
776{ 770{
777 struct list_head *iter, *iter2; 771 struct dlm_reco_node_data *ndata, *next;
778 struct dlm_reco_node_data *ndata;
779 LIST_HEAD(tmplist); 772 LIST_HEAD(tmplist);
780 773
781 spin_lock(&dlm_reco_state_lock); 774 spin_lock(&dlm_reco_state_lock);
782 list_splice_init(&dlm->reco.node_data, &tmplist); 775 list_splice_init(&dlm->reco.node_data, &tmplist);
783 spin_unlock(&dlm_reco_state_lock); 776 spin_unlock(&dlm_reco_state_lock);
784 777
785 list_for_each_safe(iter, iter2, &tmplist) { 778 list_for_each_entry_safe(ndata, next, &tmplist, list) {
786 ndata = list_entry (iter, struct dlm_reco_node_data, list);
787 list_del_init(&ndata->list); 779 list_del_init(&ndata->list);
788 kfree(ndata); 780 kfree(ndata);
789 } 781 }
@@ -876,7 +868,6 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
876 struct dlm_lock_resource *res; 868 struct dlm_lock_resource *res;
877 struct dlm_ctxt *dlm; 869 struct dlm_ctxt *dlm;
878 LIST_HEAD(resources); 870 LIST_HEAD(resources);
879 struct list_head *iter;
880 int ret; 871 int ret;
881 u8 dead_node, reco_master; 872 u8 dead_node, reco_master;
882 int skip_all_done = 0; 873 int skip_all_done = 0;
@@ -920,8 +911,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
920 911
921 /* any errors returned will be due to the new_master dying, 912 /* any errors returned will be due to the new_master dying,
922 * the dlm_reco_thread should detect this */ 913 * the dlm_reco_thread should detect this */
923 list_for_each(iter, &resources) { 914 list_for_each_entry(res, &resources, recovering) {
924 res = list_entry (iter, struct dlm_lock_resource, recovering);
925 ret = dlm_send_one_lockres(dlm, res, mres, reco_master, 915 ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
926 DLM_MRES_RECOVERY); 916 DLM_MRES_RECOVERY);
927 if (ret < 0) { 917 if (ret < 0) {
@@ -983,7 +973,6 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
983{ 973{
984 struct dlm_ctxt *dlm = data; 974 struct dlm_ctxt *dlm = data;
985 struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; 975 struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
986 struct list_head *iter;
987 struct dlm_reco_node_data *ndata = NULL; 976 struct dlm_reco_node_data *ndata = NULL;
988 int ret = -EINVAL; 977 int ret = -EINVAL;
989 978
@@ -1000,8 +989,7 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
1000 dlm->reco.dead_node, done->node_idx, dlm->node_num); 989 dlm->reco.dead_node, done->node_idx, dlm->node_num);
1001 990
1002 spin_lock(&dlm_reco_state_lock); 991 spin_lock(&dlm_reco_state_lock);
1003 list_for_each(iter, &dlm->reco.node_data) { 992 list_for_each_entry(ndata, &dlm->reco.node_data, list) {
1004 ndata = list_entry (iter, struct dlm_reco_node_data, list);
1005 if (ndata->node_num != done->node_idx) 993 if (ndata->node_num != done->node_idx)
1006 continue; 994 continue;
1007 995
@@ -1049,13 +1037,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
1049 struct list_head *list, 1037 struct list_head *list,
1050 u8 dead_node) 1038 u8 dead_node)
1051{ 1039{
1052 struct dlm_lock_resource *res; 1040 struct dlm_lock_resource *res, *next;
1053 struct list_head *iter, *iter2;
1054 struct dlm_lock *lock; 1041 struct dlm_lock *lock;
1055 1042
1056 spin_lock(&dlm->spinlock); 1043 spin_lock(&dlm->spinlock);
1057 list_for_each_safe(iter, iter2, &dlm->reco.resources) { 1044 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
1058 res = list_entry (iter, struct dlm_lock_resource, recovering);
1059 /* always prune any $RECOVERY entries for dead nodes, 1045 /* always prune any $RECOVERY entries for dead nodes,
1060 * otherwise hangs can occur during later recovery */ 1046 * otherwise hangs can occur during later recovery */
1061 if (dlm_is_recovery_lock(res->lockname.name, 1047 if (dlm_is_recovery_lock(res->lockname.name,
@@ -1169,7 +1155,7 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
1169 u8 flags, u8 master) 1155 u8 flags, u8 master)
1170{ 1156{
1171 /* mres here is one full page */ 1157 /* mres here is one full page */
1172 memset(mres, 0, PAGE_SIZE); 1158 clear_page(mres);
1173 mres->lockname_len = namelen; 1159 mres->lockname_len = namelen;
1174 memcpy(mres->lockname, lockname, namelen); 1160 memcpy(mres->lockname, lockname, namelen);
1175 mres->num_locks = 0; 1161 mres->num_locks = 0;
@@ -1252,7 +1238,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1252 struct dlm_migratable_lockres *mres, 1238 struct dlm_migratable_lockres *mres,
1253 u8 send_to, u8 flags) 1239 u8 send_to, u8 flags)
1254{ 1240{
1255 struct list_head *queue, *iter; 1241 struct list_head *queue;
1256 int total_locks, i; 1242 int total_locks, i;
1257 u64 mig_cookie = 0; 1243 u64 mig_cookie = 0;
1258 struct dlm_lock *lock; 1244 struct dlm_lock *lock;
@@ -1278,9 +1264,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1278 total_locks = 0; 1264 total_locks = 0;
1279 for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { 1265 for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
1280 queue = dlm_list_idx_to_ptr(res, i); 1266 queue = dlm_list_idx_to_ptr(res, i);
1281 list_for_each(iter, queue) { 1267 list_for_each_entry(lock, queue, list) {
1282 lock = list_entry (iter, struct dlm_lock, list);
1283
1284 /* add another lock. */ 1268 /* add another lock. */
1285 total_locks++; 1269 total_locks++;
1286 if (!dlm_add_lock_to_array(lock, mres, i)) 1270 if (!dlm_add_lock_to_array(lock, mres, i))
@@ -1717,7 +1701,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1717 struct dlm_lockstatus *lksb = NULL; 1701 struct dlm_lockstatus *lksb = NULL;
1718 int ret = 0; 1702 int ret = 0;
1719 int i, j, bad; 1703 int i, j, bad;
1720 struct list_head *iter;
1721 struct dlm_lock *lock = NULL; 1704 struct dlm_lock *lock = NULL;
1722 u8 from = O2NM_MAX_NODES; 1705 u8 from = O2NM_MAX_NODES;
1723 unsigned int added = 0; 1706 unsigned int added = 0;
@@ -1755,8 +1738,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1755 spin_lock(&res->spinlock); 1738 spin_lock(&res->spinlock);
1756 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { 1739 for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
1757 tmpq = dlm_list_idx_to_ptr(res, j); 1740 tmpq = dlm_list_idx_to_ptr(res, j);
1758 list_for_each(iter, tmpq) { 1741 list_for_each_entry(lock, tmpq, list) {
1759 lock = list_entry (iter, struct dlm_lock, list);
1760 if (lock->ml.cookie != ml->cookie) 1742 if (lock->ml.cookie != ml->cookie)
1761 lock = NULL; 1743 lock = NULL;
1762 else 1744 else
@@ -1930,8 +1912,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1930 struct dlm_lock_resource *res) 1912 struct dlm_lock_resource *res)
1931{ 1913{
1932 int i; 1914 int i;
1933 struct list_head *queue, *iter, *iter2; 1915 struct list_head *queue;
1934 struct dlm_lock *lock; 1916 struct dlm_lock *lock, *next;
1935 1917
1936 res->state |= DLM_LOCK_RES_RECOVERING; 1918 res->state |= DLM_LOCK_RES_RECOVERING;
1937 if (!list_empty(&res->recovering)) { 1919 if (!list_empty(&res->recovering)) {
@@ -1947,8 +1929,7 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1947 /* find any pending locks and put them back on proper list */ 1929 /* find any pending locks and put them back on proper list */
1948 for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { 1930 for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
1949 queue = dlm_list_idx_to_ptr(res, i); 1931 queue = dlm_list_idx_to_ptr(res, i);
1950 list_for_each_safe(iter, iter2, queue) { 1932 list_for_each_entry_safe(lock, next, queue, list) {
1951 lock = list_entry (iter, struct dlm_lock, list);
1952 dlm_lock_get(lock); 1933 dlm_lock_get(lock);
1953 if (lock->convert_pending) { 1934 if (lock->convert_pending) {
1954 /* move converting lock back to granted */ 1935 /* move converting lock back to granted */
@@ -2013,18 +1994,15 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2013 u8 dead_node, u8 new_master) 1994 u8 dead_node, u8 new_master)
2014{ 1995{
2015 int i; 1996 int i;
2016 struct list_head *iter, *iter2;
2017 struct hlist_node *hash_iter; 1997 struct hlist_node *hash_iter;
2018 struct hlist_head *bucket; 1998 struct hlist_head *bucket;
2019 1999 struct dlm_lock_resource *res, *next;
2020 struct dlm_lock_resource *res;
2021 2000
2022 mlog_entry_void(); 2001 mlog_entry_void();
2023 2002
2024 assert_spin_locked(&dlm->spinlock); 2003 assert_spin_locked(&dlm->spinlock);
2025 2004
2026 list_for_each_safe(iter, iter2, &dlm->reco.resources) { 2005 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
2027 res = list_entry (iter, struct dlm_lock_resource, recovering);
2028 if (res->owner == dead_node) { 2006 if (res->owner == dead_node) {
2029 list_del_init(&res->recovering); 2007 list_del_init(&res->recovering);
2030 spin_lock(&res->spinlock); 2008 spin_lock(&res->spinlock);
@@ -2099,7 +2077,7 @@ static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
2099static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, 2077static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2100 struct dlm_lock_resource *res, u8 dead_node) 2078 struct dlm_lock_resource *res, u8 dead_node)
2101{ 2079{
2102 struct list_head *iter, *queue; 2080 struct list_head *queue;
2103 struct dlm_lock *lock; 2081 struct dlm_lock *lock;
2104 int blank_lvb = 0, local = 0; 2082 int blank_lvb = 0, local = 0;
2105 int i; 2083 int i;
@@ -2121,8 +2099,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2121 2099
2122 for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { 2100 for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
2123 queue = dlm_list_idx_to_ptr(res, i); 2101 queue = dlm_list_idx_to_ptr(res, i);
2124 list_for_each(iter, queue) { 2102 list_for_each_entry(lock, queue, list) {
2125 lock = list_entry (iter, struct dlm_lock, list);
2126 if (lock->ml.node == search_node) { 2103 if (lock->ml.node == search_node) {
2127 if (dlm_lvb_needs_invalidation(lock, local)) { 2104 if (dlm_lvb_needs_invalidation(lock, local)) {
2128 /* zero the lksb lvb and lockres lvb */ 2105 /* zero the lksb lvb and lockres lvb */
@@ -2143,8 +2120,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2143static void dlm_free_dead_locks(struct dlm_ctxt *dlm, 2120static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2144 struct dlm_lock_resource *res, u8 dead_node) 2121 struct dlm_lock_resource *res, u8 dead_node)
2145{ 2122{
2146 struct list_head *iter, *tmpiter; 2123 struct dlm_lock *lock, *next;
2147 struct dlm_lock *lock;
2148 unsigned int freed = 0; 2124 unsigned int freed = 0;
2149 2125
2150 /* this node is the lockres master: 2126 /* this node is the lockres master:
@@ -2155,24 +2131,21 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2155 assert_spin_locked(&res->spinlock); 2131 assert_spin_locked(&res->spinlock);
2156 2132
2157 /* TODO: check pending_asts, pending_basts here */ 2133 /* TODO: check pending_asts, pending_basts here */
2158 list_for_each_safe(iter, tmpiter, &res->granted) { 2134 list_for_each_entry_safe(lock, next, &res->granted, list) {
2159 lock = list_entry (iter, struct dlm_lock, list);
2160 if (lock->ml.node == dead_node) { 2135 if (lock->ml.node == dead_node) {
2161 list_del_init(&lock->list); 2136 list_del_init(&lock->list);
2162 dlm_lock_put(lock); 2137 dlm_lock_put(lock);
2163 freed++; 2138 freed++;
2164 } 2139 }
2165 } 2140 }
2166 list_for_each_safe(iter, tmpiter, &res->converting) { 2141 list_for_each_entry_safe(lock, next, &res->converting, list) {
2167 lock = list_entry (iter, struct dlm_lock, list);
2168 if (lock->ml.node == dead_node) { 2142 if (lock->ml.node == dead_node) {
2169 list_del_init(&lock->list); 2143 list_del_init(&lock->list);
2170 dlm_lock_put(lock); 2144 dlm_lock_put(lock);
2171 freed++; 2145 freed++;
2172 } 2146 }
2173 } 2147 }
2174 list_for_each_safe(iter, tmpiter, &res->blocked) { 2148 list_for_each_entry_safe(lock, next, &res->blocked, list) {
2175 lock = list_entry (iter, struct dlm_lock, list);
2176 if (lock->ml.node == dead_node) { 2149 if (lock->ml.node == dead_node) {
2177 list_del_init(&lock->list); 2150 list_del_init(&lock->list);
2178 dlm_lock_put(lock); 2151 dlm_lock_put(lock);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index d1bd305ef0d7..f71250ed166f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -600,15 +600,13 @@ static inline int ocfs2_highest_compat_lock_level(int level)
600static void lockres_set_flags(struct ocfs2_lock_res *lockres, 600static void lockres_set_flags(struct ocfs2_lock_res *lockres,
601 unsigned long newflags) 601 unsigned long newflags)
602{ 602{
603 struct list_head *pos, *tmp; 603 struct ocfs2_mask_waiter *mw, *tmp;
604 struct ocfs2_mask_waiter *mw;
605 604
606 assert_spin_locked(&lockres->l_lock); 605 assert_spin_locked(&lockres->l_lock);
607 606
608 lockres->l_flags = newflags; 607 lockres->l_flags = newflags;
609 608
610 list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) { 609 list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) {
611 mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
612 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) 610 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
613 continue; 611 continue;
614 612
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index f226b2207628..ff257628af16 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -32,6 +32,11 @@ static inline void le32_add_cpu(__le32 *var, u32 val)
32 *var = cpu_to_le32(le32_to_cpu(*var) + val); 32 *var = cpu_to_le32(le32_to_cpu(*var) + val);
33} 33}
34 34
35static inline void le64_add_cpu(__le64 *var, u64 val)
36{
37 *var = cpu_to_le64(le64_to_cpu(*var) + val);
38}
39
35static inline void le32_and_cpu(__le32 *var, u32 val) 40static inline void le32_and_cpu(__le32 *var, u32 val)
36{ 41{
37 *var = cpu_to_le32(le32_to_cpu(*var) & val); 42 *var = cpu_to_le32(le32_to_cpu(*var) & val);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index ba2b2ab1c6e4..03c1d365c78b 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -109,17 +109,14 @@ static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
109 */ 109 */
110void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) 110void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
111{ 111{
112 struct list_head *p, *n; 112 struct ocfs2_extent_map_item *emi, *n;
113 struct ocfs2_extent_map_item *emi;
114 struct ocfs2_inode_info *oi = OCFS2_I(inode); 113 struct ocfs2_inode_info *oi = OCFS2_I(inode);
115 struct ocfs2_extent_map *em = &oi->ip_extent_map; 114 struct ocfs2_extent_map *em = &oi->ip_extent_map;
116 LIST_HEAD(tmp_list); 115 LIST_HEAD(tmp_list);
117 unsigned int range; 116 unsigned int range;
118 117
119 spin_lock(&oi->ip_lock); 118 spin_lock(&oi->ip_lock);
120 list_for_each_safe(p, n, &em->em_list) { 119 list_for_each_entry_safe(emi, n, &em->em_list, ei_list) {
121 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
122
123 if (emi->ei_cpos >= cpos) { 120 if (emi->ei_cpos >= cpos) {
124 /* Full truncate of this record. */ 121 /* Full truncate of this record. */
125 list_move(&emi->ei_list, &tmp_list); 122 list_move(&emi->ei_list, &tmp_list);
@@ -136,8 +133,7 @@ void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
136 } 133 }
137 spin_unlock(&oi->ip_lock); 134 spin_unlock(&oi->ip_lock);
138 135
139 list_for_each_safe(p, n, &tmp_list) { 136 list_for_each_entry_safe(emi, n, &tmp_list, ei_list) {
140 emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
141 list_del(&emi->ei_list); 137 list_del(&emi->ei_list);
142 kfree(emi); 138 kfree(emi);
143 } 139 }
@@ -377,37 +373,6 @@ out:
377 return ret; 373 return ret;
378} 374}
379 375
380/*
381 * Return the index of the extent record which contains cluster #v_cluster.
382 * -1 is returned if it was not found.
383 *
384 * Should work fine on interior and exterior nodes.
385 */
386static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
387 u32 v_cluster)
388{
389 int ret = -1;
390 int i;
391 struct ocfs2_extent_rec *rec;
392 u32 rec_end, rec_start, clusters;
393
394 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
395 rec = &el->l_recs[i];
396
397 rec_start = le32_to_cpu(rec->e_cpos);
398 clusters = ocfs2_rec_clusters(el, rec);
399
400 rec_end = rec_start + clusters;
401
402 if (v_cluster >= rec_start && v_cluster < rec_end) {
403 ret = i;
404 break;
405 }
406 }
407
408 return ret;
409}
410
411int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, 376int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
412 u32 *p_cluster, u32 *num_clusters, 377 u32 *p_cluster, u32 *num_clusters,
413 unsigned int *extent_flags) 378 unsigned int *extent_flags)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4979b6675717..f04c7aa834cb 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -263,6 +263,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
263 int status; 263 int status;
264 handle_t *handle; 264 handle_t *handle;
265 struct ocfs2_dinode *di; 265 struct ocfs2_dinode *di;
266 u64 cluster_bytes;
266 267
267 mlog_entry_void(); 268 mlog_entry_void();
268 269
@@ -286,7 +287,9 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
286 /* 287 /*
287 * Do this before setting i_size. 288 * Do this before setting i_size.
288 */ 289 */
289 status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); 290 cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
291 status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
292 cluster_bytes);
290 if (status) { 293 if (status) {
291 mlog_errno(status); 294 mlog_errno(status);
292 goto out_commit; 295 goto out_commit;
@@ -326,9 +329,6 @@ static int ocfs2_truncate_file(struct inode *inode,
326 (unsigned long long)OCFS2_I(inode)->ip_blkno, 329 (unsigned long long)OCFS2_I(inode)->ip_blkno,
327 (unsigned long long)new_i_size); 330 (unsigned long long)new_i_size);
328 331
329 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
330 truncate_inode_pages(inode->i_mapping, new_i_size);
331
332 fe = (struct ocfs2_dinode *) di_bh->b_data; 332 fe = (struct ocfs2_dinode *) di_bh->b_data;
333 if (!OCFS2_IS_VALID_DINODE(fe)) { 333 if (!OCFS2_IS_VALID_DINODE(fe)) {
334 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 334 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
@@ -363,16 +363,23 @@ static int ocfs2_truncate_file(struct inode *inode,
363 if (new_i_size == le64_to_cpu(fe->i_size)) 363 if (new_i_size == le64_to_cpu(fe->i_size))
364 goto bail; 364 goto bail;
365 365
366 down_write(&OCFS2_I(inode)->ip_alloc_sem);
367
366 /* This forces other nodes to sync and drop their pages. Do 368 /* This forces other nodes to sync and drop their pages. Do
367 * this even if we have a truncate without allocation change - 369 * this even if we have a truncate without allocation change -
368 * ocfs2 cluster sizes can be much greater than page size, so 370 * ocfs2 cluster sizes can be much greater than page size, so
369 * we have to truncate them anyway. */ 371 * we have to truncate them anyway. */
370 status = ocfs2_data_lock(inode, 1); 372 status = ocfs2_data_lock(inode, 1);
371 if (status < 0) { 373 if (status < 0) {
374 up_write(&OCFS2_I(inode)->ip_alloc_sem);
375
372 mlog_errno(status); 376 mlog_errno(status);
373 goto bail; 377 goto bail;
374 } 378 }
375 379
380 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
381 truncate_inode_pages(inode->i_mapping, new_i_size);
382
376 /* alright, we're going to need to do a full blown alloc size 383 /* alright, we're going to need to do a full blown alloc size
377 * change. Orphan the inode so that recovery can complete the 384 * change. Orphan the inode so that recovery can complete the
378 * truncate if necessary. This does the task of marking 385 * truncate if necessary. This does the task of marking
@@ -399,6 +406,8 @@ static int ocfs2_truncate_file(struct inode *inode,
399bail_unlock_data: 406bail_unlock_data:
400 ocfs2_data_unlock(inode, 1); 407 ocfs2_data_unlock(inode, 1);
401 408
409 up_write(&OCFS2_I(inode)->ip_alloc_sem);
410
402bail: 411bail:
403 412
404 mlog_exit(status); 413 mlog_exit(status);
@@ -419,6 +428,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
419 struct inode *inode, 428 struct inode *inode,
420 u32 *logical_offset, 429 u32 *logical_offset,
421 u32 clusters_to_add, 430 u32 clusters_to_add,
431 int mark_unwritten,
422 struct buffer_head *fe_bh, 432 struct buffer_head *fe_bh,
423 handle_t *handle, 433 handle_t *handle,
424 struct ocfs2_alloc_context *data_ac, 434 struct ocfs2_alloc_context *data_ac,
@@ -431,9 +441,13 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
431 enum ocfs2_alloc_restarted reason = RESTART_NONE; 441 enum ocfs2_alloc_restarted reason = RESTART_NONE;
432 u32 bit_off, num_bits; 442 u32 bit_off, num_bits;
433 u64 block; 443 u64 block;
444 u8 flags = 0;
434 445
435 BUG_ON(!clusters_to_add); 446 BUG_ON(!clusters_to_add);
436 447
448 if (mark_unwritten)
449 flags = OCFS2_EXT_UNWRITTEN;
450
437 free_extents = ocfs2_num_free_extents(osb, inode, fe); 451 free_extents = ocfs2_num_free_extents(osb, inode, fe);
438 if (free_extents < 0) { 452 if (free_extents < 0) {
439 status = free_extents; 453 status = free_extents;
@@ -483,7 +497,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
483 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 497 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
484 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, 498 status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
485 *logical_offset, block, num_bits, 499 *logical_offset, block, num_bits,
486 meta_ac); 500 flags, meta_ac);
487 if (status < 0) { 501 if (status < 0) {
488 mlog_errno(status); 502 mlog_errno(status);
489 goto leave; 503 goto leave;
@@ -516,25 +530,31 @@ leave:
516 * For a given allocation, determine which allocators will need to be 530 * For a given allocation, determine which allocators will need to be
517 * accessed, and lock them, reserving the appropriate number of bits. 531 * accessed, and lock them, reserving the appropriate number of bits.
518 * 532 *
519 * Called from ocfs2_extend_allocation() for file systems which don't 533 * Sparse file systems call this from ocfs2_write_begin_nolock()
520 * support holes, and from ocfs2_write() for file systems which 534 * and ocfs2_allocate_unwritten_extents().
521 * understand sparse inodes. 535 *
536 * File systems which don't support holes call this from
537 * ocfs2_extend_allocation().
522 */ 538 */
523int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 539int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
524 u32 clusters_to_add, 540 u32 clusters_to_add, u32 extents_to_split,
525 struct ocfs2_alloc_context **data_ac, 541 struct ocfs2_alloc_context **data_ac,
526 struct ocfs2_alloc_context **meta_ac) 542 struct ocfs2_alloc_context **meta_ac)
527{ 543{
528 int ret, num_free_extents; 544 int ret = 0, num_free_extents;
545 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
529 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 546 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
530 547
531 *meta_ac = NULL; 548 *meta_ac = NULL;
532 *data_ac = NULL; 549 if (data_ac)
550 *data_ac = NULL;
551
552 BUG_ON(clusters_to_add != 0 && data_ac == NULL);
533 553
534 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " 554 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
535 "clusters_to_add = %u\n", 555 "clusters_to_add = %u, extents_to_split = %u\n",
536 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 556 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
537 le32_to_cpu(di->i_clusters), clusters_to_add); 557 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
538 558
539 num_free_extents = ocfs2_num_free_extents(osb, inode, di); 559 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
540 if (num_free_extents < 0) { 560 if (num_free_extents < 0) {
@@ -552,9 +572,12 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
552 * 572 *
553 * Most of the time we'll only be seeing this 1 cluster at a time 573 * Most of the time we'll only be seeing this 1 cluster at a time
554 * anyway. 574 * anyway.
575 *
576 * Always lock for any unwritten extents - we might want to
577 * add blocks during a split.
555 */ 578 */
556 if (!num_free_extents || 579 if (!num_free_extents ||
557 (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { 580 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
558 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); 581 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
559 if (ret < 0) { 582 if (ret < 0) {
560 if (ret != -ENOSPC) 583 if (ret != -ENOSPC)
@@ -563,6 +586,9 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
563 } 586 }
564 } 587 }
565 588
589 if (clusters_to_add == 0)
590 goto out;
591
566 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); 592 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
567 if (ret < 0) { 593 if (ret < 0) {
568 if (ret != -ENOSPC) 594 if (ret != -ENOSPC)
@@ -585,14 +611,13 @@ out:
585 return ret; 611 return ret;
586} 612}
587 613
588static int ocfs2_extend_allocation(struct inode *inode, 614static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
589 u32 clusters_to_add) 615 u32 clusters_to_add, int mark_unwritten)
590{ 616{
591 int status = 0; 617 int status = 0;
592 int restart_func = 0; 618 int restart_func = 0;
593 int drop_alloc_sem = 0;
594 int credits; 619 int credits;
595 u32 prev_clusters, logical_start; 620 u32 prev_clusters;
596 struct buffer_head *bh = NULL; 621 struct buffer_head *bh = NULL;
597 struct ocfs2_dinode *fe = NULL; 622 struct ocfs2_dinode *fe = NULL;
598 handle_t *handle = NULL; 623 handle_t *handle = NULL;
@@ -607,7 +632,7 @@ static int ocfs2_extend_allocation(struct inode *inode,
607 * This function only exists for file systems which don't 632 * This function only exists for file systems which don't
608 * support holes. 633 * support holes.
609 */ 634 */
610 BUG_ON(ocfs2_sparse_alloc(osb)); 635 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
611 636
612 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 637 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
613 OCFS2_BH_CACHED, inode); 638 OCFS2_BH_CACHED, inode);
@@ -623,19 +648,10 @@ static int ocfs2_extend_allocation(struct inode *inode,
623 goto leave; 648 goto leave;
624 } 649 }
625 650
626 logical_start = OCFS2_I(inode)->ip_clusters;
627
628restart_all: 651restart_all:
629 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 652 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
630 653
631 /* blocks peope in read/write from reading our allocation 654 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
632 * until we're done changing it. We depend on i_mutex to block
633 * other extend/truncate calls while we're here. Ordering wrt
634 * start_trans is important here -- always do it before! */
635 down_write(&OCFS2_I(inode)->ip_alloc_sem);
636 drop_alloc_sem = 1;
637
638 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
639 &meta_ac); 655 &meta_ac);
640 if (status) { 656 if (status) {
641 mlog_errno(status); 657 mlog_errno(status);
@@ -668,6 +684,7 @@ restarted_transaction:
668 inode, 684 inode,
669 &logical_start, 685 &logical_start,
670 clusters_to_add, 686 clusters_to_add,
687 mark_unwritten,
671 bh, 688 bh,
672 handle, 689 handle,
673 data_ac, 690 data_ac,
@@ -720,10 +737,6 @@ restarted_transaction:
720 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 737 OCFS2_I(inode)->ip_clusters, i_size_read(inode));
721 738
722leave: 739leave:
723 if (drop_alloc_sem) {
724 up_write(&OCFS2_I(inode)->ip_alloc_sem);
725 drop_alloc_sem = 0;
726 }
727 if (handle) { 740 if (handle) {
728 ocfs2_commit_trans(osb, handle); 741 ocfs2_commit_trans(osb, handle);
729 handle = NULL; 742 handle = NULL;
@@ -749,6 +762,25 @@ leave:
749 return status; 762 return status;
750} 763}
751 764
765static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
766 u32 clusters_to_add, int mark_unwritten)
767{
768 int ret;
769
770 /*
771 * The alloc sem blocks peope in read/write from reading our
772 * allocation until we're done changing it. We depend on
773 * i_mutex to block other extend/truncate calls while we're
774 * here.
775 */
776 down_write(&OCFS2_I(inode)->ip_alloc_sem);
777 ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add,
778 mark_unwritten);
779 up_write(&OCFS2_I(inode)->ip_alloc_sem);
780
781 return ret;
782}
783
752/* Some parts of this taken from generic_cont_expand, which turned out 784/* Some parts of this taken from generic_cont_expand, which turned out
753 * to be too fragile to do exactly what we need without us having to 785 * to be too fragile to do exactly what we need without us having to
754 * worry about recursive locking in ->prepare_write() and 786 * worry about recursive locking in ->prepare_write() and
@@ -890,7 +922,9 @@ static int ocfs2_extend_file(struct inode *inode,
890 } 922 }
891 923
892 if (clusters_to_add) { 924 if (clusters_to_add) {
893 ret = ocfs2_extend_allocation(inode, clusters_to_add); 925 ret = ocfs2_extend_allocation(inode,
926 OCFS2_I(inode)->ip_clusters,
927 clusters_to_add, 0);
894 if (ret < 0) { 928 if (ret < 0) {
895 mlog_errno(ret); 929 mlog_errno(ret);
896 goto out_unlock; 930 goto out_unlock;
@@ -995,6 +1029,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
995 goto bail_unlock; 1029 goto bail_unlock;
996 } 1030 }
997 1031
1032 /*
1033 * This will intentionally not wind up calling vmtruncate(),
1034 * since all the work for a size change has been done above.
1035 * Otherwise, we could get into problems with truncate as
1036 * ip_alloc_sem is used there to protect against i_size
1037 * changes.
1038 */
998 status = inode_setattr(inode, attr); 1039 status = inode_setattr(inode, attr);
999 if (status < 0) { 1040 if (status < 0) {
1000 mlog_errno(status); 1041 mlog_errno(status);
@@ -1070,17 +1111,16 @@ out:
1070 return ret; 1111 return ret;
1071} 1112}
1072 1113
1073static int ocfs2_write_remove_suid(struct inode *inode) 1114static int __ocfs2_write_remove_suid(struct inode *inode,
1115 struct buffer_head *bh)
1074{ 1116{
1075 int ret; 1117 int ret;
1076 struct buffer_head *bh = NULL;
1077 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1078 handle_t *handle; 1118 handle_t *handle;
1079 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1119 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1080 struct ocfs2_dinode *di; 1120 struct ocfs2_dinode *di;
1081 1121
1082 mlog_entry("(Inode %llu, mode 0%o)\n", 1122 mlog_entry("(Inode %llu, mode 0%o)\n",
1083 (unsigned long long)oi->ip_blkno, inode->i_mode); 1123 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
1084 1124
1085 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1125 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1086 if (handle == NULL) { 1126 if (handle == NULL) {
@@ -1089,17 +1129,11 @@ static int ocfs2_write_remove_suid(struct inode *inode)
1089 goto out; 1129 goto out;
1090 } 1130 }
1091 1131
1092 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
1093 if (ret < 0) {
1094 mlog_errno(ret);
1095 goto out_trans;
1096 }
1097
1098 ret = ocfs2_journal_access(handle, inode, bh, 1132 ret = ocfs2_journal_access(handle, inode, bh,
1099 OCFS2_JOURNAL_ACCESS_WRITE); 1133 OCFS2_JOURNAL_ACCESS_WRITE);
1100 if (ret < 0) { 1134 if (ret < 0) {
1101 mlog_errno(ret); 1135 mlog_errno(ret);
1102 goto out_bh; 1136 goto out_trans;
1103 } 1137 }
1104 1138
1105 inode->i_mode &= ~S_ISUID; 1139 inode->i_mode &= ~S_ISUID;
@@ -1112,8 +1146,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
1112 ret = ocfs2_journal_dirty(handle, bh); 1146 ret = ocfs2_journal_dirty(handle, bh);
1113 if (ret < 0) 1147 if (ret < 0)
1114 mlog_errno(ret); 1148 mlog_errno(ret);
1115out_bh: 1149
1116 brelse(bh);
1117out_trans: 1150out_trans:
1118 ocfs2_commit_trans(osb, handle); 1151 ocfs2_commit_trans(osb, handle);
1119out: 1152out:
@@ -1159,6 +1192,460 @@ out:
1159 return ret; 1192 return ret;
1160} 1193}
1161 1194
1195static int ocfs2_write_remove_suid(struct inode *inode)
1196{
1197 int ret;
1198 struct buffer_head *bh = NULL;
1199 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1200
1201 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
1202 oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
1203 if (ret < 0) {
1204 mlog_errno(ret);
1205 goto out;
1206 }
1207
1208 ret = __ocfs2_write_remove_suid(inode, bh);
1209out:
1210 brelse(bh);
1211 return ret;
1212}
1213
1214/*
1215 * Allocate enough extents to cover the region starting at byte offset
1216 * start for len bytes. Existing extents are skipped, any extents
1217 * added are marked as "unwritten".
1218 */
1219static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1220 u64 start, u64 len)
1221{
1222 int ret;
1223 u32 cpos, phys_cpos, clusters, alloc_size;
1224
1225 /*
1226 * We consider both start and len to be inclusive.
1227 */
1228 cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1229 clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
1230 clusters -= cpos;
1231
1232 while (clusters) {
1233 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1234 &alloc_size, NULL);
1235 if (ret) {
1236 mlog_errno(ret);
1237 goto out;
1238 }
1239
1240 /*
1241 * Hole or existing extent len can be arbitrary, so
1242 * cap it to our own allocation request.
1243 */
1244 if (alloc_size > clusters)
1245 alloc_size = clusters;
1246
1247 if (phys_cpos) {
1248 /*
1249 * We already have an allocation at this
1250 * region so we can safely skip it.
1251 */
1252 goto next;
1253 }
1254
1255 ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
1256 if (ret) {
1257 if (ret != -ENOSPC)
1258 mlog_errno(ret);
1259 goto out;
1260 }
1261
1262next:
1263 cpos += alloc_size;
1264 clusters -= alloc_size;
1265 }
1266
1267 ret = 0;
1268out:
1269 return ret;
1270}
1271
1272static int __ocfs2_remove_inode_range(struct inode *inode,
1273 struct buffer_head *di_bh,
1274 u32 cpos, u32 phys_cpos, u32 len,
1275 struct ocfs2_cached_dealloc_ctxt *dealloc)
1276{
1277 int ret;
1278 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
1279 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1280 struct inode *tl_inode = osb->osb_tl_inode;
1281 handle_t *handle;
1282 struct ocfs2_alloc_context *meta_ac = NULL;
1283 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1284
1285 ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
1286 if (ret) {
1287 mlog_errno(ret);
1288 return ret;
1289 }
1290
1291 mutex_lock(&tl_inode->i_mutex);
1292
1293 if (ocfs2_truncate_log_needs_flush(osb)) {
1294 ret = __ocfs2_flush_truncate_log(osb);
1295 if (ret < 0) {
1296 mlog_errno(ret);
1297 goto out;
1298 }
1299 }
1300
1301 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
1302 if (handle == NULL) {
1303 ret = -ENOMEM;
1304 mlog_errno(ret);
1305 goto out;
1306 }
1307
1308 ret = ocfs2_journal_access(handle, inode, di_bh,
1309 OCFS2_JOURNAL_ACCESS_WRITE);
1310 if (ret) {
1311 mlog_errno(ret);
1312 goto out;
1313 }
1314
1315 ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
1316 dealloc);
1317 if (ret) {
1318 mlog_errno(ret);
1319 goto out_commit;
1320 }
1321
1322 OCFS2_I(inode)->ip_clusters -= len;
1323 di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1324
1325 ret = ocfs2_journal_dirty(handle, di_bh);
1326 if (ret) {
1327 mlog_errno(ret);
1328 goto out_commit;
1329 }
1330
1331 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
1332 if (ret)
1333 mlog_errno(ret);
1334
1335out_commit:
1336 ocfs2_commit_trans(osb, handle);
1337out:
1338 mutex_unlock(&tl_inode->i_mutex);
1339
1340 if (meta_ac)
1341 ocfs2_free_alloc_context(meta_ac);
1342
1343 return ret;
1344}
1345
1346/*
1347 * Truncate a byte range, avoiding pages within partial clusters. This
1348 * preserves those pages for the zeroing code to write to.
1349 */
1350static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1351 u64 byte_len)
1352{
1353 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1354 loff_t start, end;
1355 struct address_space *mapping = inode->i_mapping;
1356
1357 start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1358 end = byte_start + byte_len;
1359 end = end & ~(osb->s_clustersize - 1);
1360
1361 if (start < end) {
1362 unmap_mapping_range(mapping, start, end - start, 0);
1363 truncate_inode_pages_range(mapping, start, end - 1);
1364 }
1365}
1366
1367static int ocfs2_zero_partial_clusters(struct inode *inode,
1368 u64 start, u64 len)
1369{
1370 int ret = 0;
1371 u64 tmpend, end = start + len;
1372 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1373 unsigned int csize = osb->s_clustersize;
1374 handle_t *handle;
1375
1376 /*
1377 * The "start" and "end" values are NOT necessarily part of
1378 * the range whose allocation is being deleted. Rather, this
1379 * is what the user passed in with the request. We must zero
1380 * partial clusters here. There's no need to worry about
1381 * physical allocation - the zeroing code knows to skip holes.
1382 */
1383 mlog(0, "byte start: %llu, end: %llu\n",
1384 (unsigned long long)start, (unsigned long long)end);
1385
1386 /*
1387 * If both edges are on a cluster boundary then there's no
1388 * zeroing required as the region is part of the allocation to
1389 * be truncated.
1390 */
1391 if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1392 goto out;
1393
1394 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1395 if (handle == NULL) {
1396 ret = -ENOMEM;
1397 mlog_errno(ret);
1398 goto out;
1399 }
1400
1401 /*
1402 * We want to get the byte offset of the end of the 1st cluster.
1403 */
1404 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
1405 if (tmpend > end)
1406 tmpend = end;
1407
1408 mlog(0, "1st range: start: %llu, tmpend: %llu\n",
1409 (unsigned long long)start, (unsigned long long)tmpend);
1410
1411 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1412 if (ret)
1413 mlog_errno(ret);
1414
1415 if (tmpend < end) {
1416 /*
1417 * This may make start and end equal, but the zeroing
1418 * code will skip any work in that case so there's no
1419 * need to catch it up here.
1420 */
1421 start = end & ~(osb->s_clustersize - 1);
1422
1423 mlog(0, "2nd range: start: %llu, end: %llu\n",
1424 (unsigned long long)start, (unsigned long long)end);
1425
1426 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1427 if (ret)
1428 mlog_errno(ret);
1429 }
1430
1431 ocfs2_commit_trans(osb, handle);
1432out:
1433 return ret;
1434}
1435
1436static int ocfs2_remove_inode_range(struct inode *inode,
1437 struct buffer_head *di_bh, u64 byte_start,
1438 u64 byte_len)
1439{
1440 int ret = 0;
1441 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
1442 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1443 struct ocfs2_cached_dealloc_ctxt dealloc;
1444
1445 ocfs2_init_dealloc_ctxt(&dealloc);
1446
1447 if (byte_len == 0)
1448 return 0;
1449
1450 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1451 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
1452 if (trunc_len >= trunc_start)
1453 trunc_len -= trunc_start;
1454 else
1455 trunc_len = 0;
1456
1457 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
1458 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1459 (unsigned long long)byte_start,
1460 (unsigned long long)byte_len, trunc_start, trunc_len);
1461
1462 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1463 if (ret) {
1464 mlog_errno(ret);
1465 goto out;
1466 }
1467
1468 cpos = trunc_start;
1469 while (trunc_len) {
1470 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1471 &alloc_size, NULL);
1472 if (ret) {
1473 mlog_errno(ret);
1474 goto out;
1475 }
1476
1477 if (alloc_size > trunc_len)
1478 alloc_size = trunc_len;
1479
1480 /* Only do work for non-holes */
1481 if (phys_cpos != 0) {
1482 ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
1483 phys_cpos, alloc_size,
1484 &dealloc);
1485 if (ret) {
1486 mlog_errno(ret);
1487 goto out;
1488 }
1489 }
1490
1491 cpos += alloc_size;
1492 trunc_len -= alloc_size;
1493 }
1494
1495 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1496
1497out:
1498 ocfs2_schedule_truncate_log_flush(osb, 1);
1499 ocfs2_run_deallocs(osb, &dealloc);
1500
1501 return ret;
1502}
1503
1504/*
1505 * Parts of this function taken from xfs_change_file_space()
1506 */
1507int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1508 struct ocfs2_space_resv *sr)
1509{
1510 int ret;
1511 s64 llen;
1512 struct inode *inode = file->f_path.dentry->d_inode;
1513 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1514 struct buffer_head *di_bh = NULL;
1515 handle_t *handle;
1516 unsigned long long max_off = ocfs2_max_file_offset(inode->i_sb->s_blocksize_bits);
1517
1518 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1519 !ocfs2_writes_unwritten_extents(osb))
1520 return -ENOTTY;
1521 else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
1522 !ocfs2_sparse_alloc(osb))
1523 return -ENOTTY;
1524
1525 if (!S_ISREG(inode->i_mode))
1526 return -EINVAL;
1527
1528 if (!(file->f_mode & FMODE_WRITE))
1529 return -EBADF;
1530
1531 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1532 return -EROFS;
1533
1534 mutex_lock(&inode->i_mutex);
1535
1536 /*
1537 * This prevents concurrent writes on other nodes
1538 */
1539 ret = ocfs2_rw_lock(inode, 1);
1540 if (ret) {
1541 mlog_errno(ret);
1542 goto out;
1543 }
1544
1545 ret = ocfs2_meta_lock(inode, &di_bh, 1);
1546 if (ret) {
1547 mlog_errno(ret);
1548 goto out_rw_unlock;
1549 }
1550
1551 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1552 ret = -EPERM;
1553 goto out_meta_unlock;
1554 }
1555
1556 switch (sr->l_whence) {
1557 case 0: /*SEEK_SET*/
1558 break;
1559 case 1: /*SEEK_CUR*/
1560 sr->l_start += file->f_pos;
1561 break;
1562 case 2: /*SEEK_END*/
1563 sr->l_start += i_size_read(inode);
1564 break;
1565 default:
1566 ret = -EINVAL;
1567 goto out_meta_unlock;
1568 }
1569 sr->l_whence = 0;
1570
1571 llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1572
1573 if (sr->l_start < 0
1574 || sr->l_start > max_off
1575 || (sr->l_start + llen) < 0
1576 || (sr->l_start + llen) > max_off) {
1577 ret = -EINVAL;
1578 goto out_meta_unlock;
1579 }
1580
1581 if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1582 if (sr->l_len <= 0) {
1583 ret = -EINVAL;
1584 goto out_meta_unlock;
1585 }
1586 }
1587
1588 if (should_remove_suid(file->f_path.dentry)) {
1589 ret = __ocfs2_write_remove_suid(inode, di_bh);
1590 if (ret) {
1591 mlog_errno(ret);
1592 goto out_meta_unlock;
1593 }
1594 }
1595
1596 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1597 switch (cmd) {
1598 case OCFS2_IOC_RESVSP:
1599 case OCFS2_IOC_RESVSP64:
1600 /*
1601 * This takes unsigned offsets, but the signed ones we
1602 * pass have been checked against overflow above.
1603 */
1604 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
1605 sr->l_len);
1606 break;
1607 case OCFS2_IOC_UNRESVSP:
1608 case OCFS2_IOC_UNRESVSP64:
1609 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
1610 sr->l_len);
1611 break;
1612 default:
1613 ret = -EINVAL;
1614 }
1615 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1616 if (ret) {
1617 mlog_errno(ret);
1618 goto out_meta_unlock;
1619 }
1620
1621 /*
1622 * We update c/mtime for these changes
1623 */
1624 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1625 if (IS_ERR(handle)) {
1626 ret = PTR_ERR(handle);
1627 mlog_errno(ret);
1628 goto out_meta_unlock;
1629 }
1630
1631 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1632 ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
1633 if (ret < 0)
1634 mlog_errno(ret);
1635
1636 ocfs2_commit_trans(osb, handle);
1637
1638out_meta_unlock:
1639 brelse(di_bh);
1640 ocfs2_meta_unlock(inode, 1);
1641out_rw_unlock:
1642 ocfs2_rw_unlock(inode, 1);
1643
1644 mutex_unlock(&inode->i_mutex);
1645out:
1646 return ret;
1647}
1648
1162static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1649static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1163 loff_t *ppos, 1650 loff_t *ppos,
1164 size_t count, 1651 size_t count,
@@ -1329,15 +1816,16 @@ ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1329 *basep = base; 1816 *basep = base;
1330} 1817}
1331 1818
1332static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, 1819static struct page * ocfs2_get_write_source(char **ret_src_buf,
1333 const struct iovec *cur_iov, 1820 const struct iovec *cur_iov,
1334 size_t iov_offset) 1821 size_t iov_offset)
1335{ 1822{
1336 int ret; 1823 int ret;
1337 char *buf; 1824 char *buf = cur_iov->iov_base + iov_offset;
1338 struct page *src_page = NULL; 1825 struct page *src_page = NULL;
1826 unsigned long off;
1339 1827
1340 buf = cur_iov->iov_base + iov_offset; 1828 off = (unsigned long)(buf) & ~PAGE_CACHE_MASK;
1341 1829
1342 if (!segment_eq(get_fs(), KERNEL_DS)) { 1830 if (!segment_eq(get_fs(), KERNEL_DS)) {
1343 /* 1831 /*
@@ -1349,18 +1837,17 @@ static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp
1349 (unsigned long)buf & PAGE_CACHE_MASK, 1, 1837 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1350 0, 0, &src_page, NULL); 1838 0, 0, &src_page, NULL);
1351 if (ret == 1) 1839 if (ret == 1)
1352 bp->b_src_buf = kmap(src_page); 1840 *ret_src_buf = kmap(src_page) + off;
1353 else 1841 else
1354 src_page = ERR_PTR(-EFAULT); 1842 src_page = ERR_PTR(-EFAULT);
1355 } else { 1843 } else {
1356 bp->b_src_buf = buf; 1844 *ret_src_buf = buf;
1357 } 1845 }
1358 1846
1359 return src_page; 1847 return src_page;
1360} 1848}
1361 1849
1362static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, 1850static void ocfs2_put_write_source(struct page *page)
1363 struct page *page)
1364{ 1851{
1365 if (page) { 1852 if (page) {
1366 kunmap(page); 1853 kunmap(page);
@@ -1376,10 +1863,12 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1376{ 1863{
1377 int ret = 0; 1864 int ret = 0;
1378 ssize_t copied, total = 0; 1865 ssize_t copied, total = 0;
1379 size_t iov_offset = 0; 1866 size_t iov_offset = 0, bytes;
1867 loff_t pos;
1380 const struct iovec *cur_iov = iov; 1868 const struct iovec *cur_iov = iov;
1381 struct ocfs2_buffered_write_priv bp; 1869 struct page *user_page, *page;
1382 struct page *page; 1870 char *buf, *dst;
1871 void *fsdata;
1383 1872
1384 /* 1873 /*
1385 * handle partial DIO write. Adjust cur_iov if needed. 1874 * handle partial DIO write. Adjust cur_iov if needed.
@@ -1387,21 +1876,38 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1387 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); 1876 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1388 1877
1389 do { 1878 do {
1390 bp.b_cur_off = iov_offset; 1879 pos = *ppos;
1391 bp.b_cur_iov = cur_iov;
1392 1880
1393 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); 1881 user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset);
1394 if (IS_ERR(page)) { 1882 if (IS_ERR(user_page)) {
1395 ret = PTR_ERR(page); 1883 ret = PTR_ERR(user_page);
1396 goto out; 1884 goto out;
1397 } 1885 }
1398 1886
1399 copied = ocfs2_buffered_write_cluster(file, *ppos, count, 1887 /* Stay within our page boundaries */
1400 ocfs2_map_and_write_user_data, 1888 bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)),
1401 &bp); 1889 (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK)));
1890 /* Stay within the vector boundary */
1891 bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset);
1892 /* Stay within count */
1893 bytes = min(bytes, count);
1894
1895 page = NULL;
1896 ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0,
1897 &page, &fsdata);
1898 if (ret) {
1899 mlog_errno(ret);
1900 goto out;
1901 }
1402 1902
1403 ocfs2_put_write_source(&bp, page); 1903 dst = kmap_atomic(page, KM_USER0);
1904 memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes);
1905 kunmap_atomic(dst, KM_USER0);
1906 flush_dcache_page(page);
1907 ocfs2_put_write_source(user_page);
1404 1908
1909 copied = ocfs2_write_end(file, file->f_mapping, pos, bytes,
1910 bytes, page, fsdata);
1405 if (copied < 0) { 1911 if (copied < 0) {
1406 mlog_errno(copied); 1912 mlog_errno(copied);
1407 ret = copied; 1913 ret = copied;
@@ -1409,7 +1915,7 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1409 } 1915 }
1410 1916
1411 total += copied; 1917 total += copied;
1412 *ppos = *ppos + copied; 1918 *ppos = pos + copied;
1413 count -= copied; 1919 count -= copied;
1414 1920
1415 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); 1921 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
@@ -1579,52 +2085,46 @@ static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
1579 struct pipe_buffer *buf, 2085 struct pipe_buffer *buf,
1580 struct splice_desc *sd) 2086 struct splice_desc *sd)
1581{ 2087{
1582 int ret, count, total = 0; 2088 int ret, count;
1583 ssize_t copied = 0; 2089 ssize_t copied = 0;
1584 struct ocfs2_splice_write_priv sp; 2090 struct file *file = sd->u.file;
2091 unsigned int offset;
2092 struct page *page = NULL;
2093 void *fsdata;
2094 char *src, *dst;
1585 2095
1586 ret = buf->ops->confirm(pipe, buf); 2096 ret = buf->ops->confirm(pipe, buf);
1587 if (ret) 2097 if (ret)
1588 goto out; 2098 goto out;
1589 2099
1590 sp.s_sd = sd; 2100 offset = sd->pos & ~PAGE_CACHE_MASK;
1591 sp.s_buf = buf;
1592 sp.s_pipe = pipe;
1593 sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
1594 sp.s_buf_offset = buf->offset;
1595
1596 count = sd->len; 2101 count = sd->len;
1597 if (count + sp.s_offset > PAGE_CACHE_SIZE) 2102 if (count + offset > PAGE_CACHE_SIZE)
1598 count = PAGE_CACHE_SIZE - sp.s_offset; 2103 count = PAGE_CACHE_SIZE - offset;
1599 2104
1600 do { 2105 ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0,
1601 /* 2106 &page, &fsdata);
1602 * splice wants us to copy up to one page at a 2107 if (ret) {
1603 * time. For pagesize > cluster size, this means we 2108 mlog_errno(ret);
1604 * might enter ocfs2_buffered_write_cluster() more 2109 goto out;
1605 * than once, so keep track of our progress here. 2110 }
1606 */
1607 copied = ocfs2_buffered_write_cluster(sd->u.file,
1608 (loff_t)sd->pos + total,
1609 count,
1610 ocfs2_map_and_write_splice_data,
1611 &sp);
1612 if (copied < 0) {
1613 mlog_errno(copied);
1614 ret = copied;
1615 goto out;
1616 }
1617 2111
1618 count -= copied; 2112 src = buf->ops->map(pipe, buf, 1);
1619 sp.s_offset += copied; 2113 dst = kmap_atomic(page, KM_USER1);
1620 sp.s_buf_offset += copied; 2114 memcpy(dst + offset, src + buf->offset, count);
1621 total += copied; 2115 kunmap_atomic(page, KM_USER1);
1622 } while (count); 2116 buf->ops->unmap(pipe, buf, src);
1623 2117
1624 ret = 0; 2118 copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count,
2119 page, fsdata);
2120 if (copied < 0) {
2121 mlog_errno(copied);
2122 ret = copied;
2123 goto out;
2124 }
1625out: 2125out:
1626 2126
1627 return total ? total : ret; 2127 return copied ? copied : ret;
1628} 2128}
1629 2129
1630static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, 2130static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index a4dd1fa1822b..36fe27f268ee 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,15 +39,16 @@ enum ocfs2_alloc_restarted {
39}; 39};
40int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 40int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
41 struct inode *inode, 41 struct inode *inode,
42 u32 *cluster_start, 42 u32 *logical_offset,
43 u32 clusters_to_add, 43 u32 clusters_to_add,
44 int mark_unwritten,
44 struct buffer_head *fe_bh, 45 struct buffer_head *fe_bh,
45 handle_t *handle, 46 handle_t *handle,
46 struct ocfs2_alloc_context *data_ac, 47 struct ocfs2_alloc_context *data_ac,
47 struct ocfs2_alloc_context *meta_ac, 48 struct ocfs2_alloc_context *meta_ac,
48 enum ocfs2_alloc_restarted *reason); 49 enum ocfs2_alloc_restarted *reason_ret);
49int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 50int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
50 u32 clusters_to_add, 51 u32 clusters_to_add, u32 extents_to_split,
51 struct ocfs2_alloc_context **data_ac, 52 struct ocfs2_alloc_context **data_ac,
52 struct ocfs2_alloc_context **meta_ac); 53 struct ocfs2_alloc_context **meta_ac);
53int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 54int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
@@ -61,4 +62,7 @@ int ocfs2_should_update_atime(struct inode *inode,
61int ocfs2_update_inode_atime(struct inode *inode, 62int ocfs2_update_inode_atime(struct inode *inode,
62 struct buffer_head *bh); 63 struct buffer_head *bh);
63 64
65int ocfs2_change_file_space(struct file *file, unsigned int cmd,
66 struct ocfs2_space_resv *sr);
67
64#endif /* OCFS2_FILE_H */ 68#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index b25ef63781ba..352eb4a13f98 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -157,16 +157,16 @@ int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
157 if (ocfs2_mount_local(osb)) 157 if (ocfs2_mount_local(osb))
158 return 0; 158 return 0;
159 159
160 status = o2hb_register_callback(&osb->osb_hb_down); 160 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
161 if (status < 0) { 161 if (status < 0) {
162 mlog_errno(status); 162 mlog_errno(status);
163 goto bail; 163 goto bail;
164 } 164 }
165 165
166 status = o2hb_register_callback(&osb->osb_hb_up); 166 status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
167 if (status < 0) { 167 if (status < 0) {
168 mlog_errno(status); 168 mlog_errno(status);
169 o2hb_unregister_callback(&osb->osb_hb_down); 169 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
170 } 170 }
171 171
172bail: 172bail:
@@ -178,8 +178,8 @@ void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
178 if (ocfs2_mount_local(osb)) 178 if (ocfs2_mount_local(osb))
179 return; 179 return;
180 180
181 o2hb_unregister_callback(&osb->osb_hb_down); 181 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
182 o2hb_unregister_callback(&osb->osb_hb_up); 182 o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
183} 183}
184 184
185void ocfs2_stop_heartbeat(struct ocfs2_super *osb) 185void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index f3ad21ad9aed..bd68c3f2afbe 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -14,6 +14,7 @@
14#include "ocfs2.h" 14#include "ocfs2.h"
15#include "alloc.h" 15#include "alloc.h"
16#include "dlmglue.h" 16#include "dlmglue.h"
17#include "file.h"
17#include "inode.h" 18#include "inode.h"
18#include "journal.h" 19#include "journal.h"
19 20
@@ -115,6 +116,7 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
115{ 116{
116 unsigned int flags; 117 unsigned int flags;
117 int status; 118 int status;
119 struct ocfs2_space_resv sr;
118 120
119 switch (cmd) { 121 switch (cmd) {
120 case OCFS2_IOC_GETFLAGS: 122 case OCFS2_IOC_GETFLAGS:
@@ -130,6 +132,14 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
130 132
131 return ocfs2_set_inode_attr(inode, flags, 133 return ocfs2_set_inode_attr(inode, flags,
132 OCFS2_FL_MODIFIABLE); 134 OCFS2_FL_MODIFIABLE);
135 case OCFS2_IOC_RESVSP:
136 case OCFS2_IOC_RESVSP64:
137 case OCFS2_IOC_UNRESVSP:
138 case OCFS2_IOC_UNRESVSP64:
139 if (copy_from_user(&sr, (int __user *) arg, sizeof(sr)))
140 return -EFAULT;
141
142 return ocfs2_change_file_space(filp, cmd, &sr);
133 default: 143 default:
134 return -ENOTTY; 144 return -ENOTTY;
135 } 145 }
@@ -148,6 +158,11 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
148 case OCFS2_IOC32_SETFLAGS: 158 case OCFS2_IOC32_SETFLAGS:
149 cmd = OCFS2_IOC_SETFLAGS; 159 cmd = OCFS2_IOC_SETFLAGS;
150 break; 160 break;
161 case OCFS2_IOC_RESVSP:
162 case OCFS2_IOC_RESVSP64:
163 case OCFS2_IOC_UNRESVSP:
164 case OCFS2_IOC_UNRESVSP64:
165 break;
151 default: 166 default:
152 return -ENOIOCTLCMD; 167 return -ENOIOCTLCMD;
153 } 168 }
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index dc1188081720..dbfb20bb27ea 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -722,8 +722,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
722 container_of(work, struct ocfs2_journal, j_recovery_work); 722 container_of(work, struct ocfs2_journal, j_recovery_work);
723 struct ocfs2_super *osb = journal->j_osb; 723 struct ocfs2_super *osb = journal->j_osb;
724 struct ocfs2_dinode *la_dinode, *tl_dinode; 724 struct ocfs2_dinode *la_dinode, *tl_dinode;
725 struct ocfs2_la_recovery_item *item; 725 struct ocfs2_la_recovery_item *item, *n;
726 struct list_head *p, *n;
727 LIST_HEAD(tmp_la_list); 726 LIST_HEAD(tmp_la_list);
728 727
729 mlog_entry_void(); 728 mlog_entry_void();
@@ -734,8 +733,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
734 list_splice_init(&journal->j_la_cleanups, &tmp_la_list); 733 list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
735 spin_unlock(&journal->j_lock); 734 spin_unlock(&journal->j_lock);
736 735
737 list_for_each_safe(p, n, &tmp_la_list) { 736 list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
738 item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
739 list_del_init(&item->lri_list); 737 list_del_init(&item->lri_list);
740 738
741 mlog(0, "Complete recovery for slot %d\n", item->lri_slot); 739 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3db5de4506da..ce60aab013aa 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -289,6 +289,8 @@ int ocfs2_journal_dirty_data(handle_t *handle,
289#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ 289#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
290 + OCFS2_TRUNCATE_LOG_UPDATE) 290 + OCFS2_TRUNCATE_LOG_UPDATE)
291 291
292#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS)
293
292/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 294/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
293 * bitmap block for the new bit) */ 295 * bitmap block for the new bit) */
294#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 296#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af01158b39f5..d79aa12137d2 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -37,11 +37,29 @@
37 37
38#include "ocfs2.h" 38#include "ocfs2.h"
39 39
40#include "aops.h"
40#include "dlmglue.h" 41#include "dlmglue.h"
41#include "file.h" 42#include "file.h"
42#include "inode.h" 43#include "inode.h"
43#include "mmap.h" 44#include "mmap.h"
44 45
46static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
47{
48 /* The best way to deal with signals in the vm path is
49 * to block them upfront, rather than allowing the
50 * locking paths to return -ERESTARTSYS. */
51 sigfillset(blocked);
52
53 /* We should technically never get a bad return value
54 * from sigprocmask */
55 return sigprocmask(SIG_BLOCK, blocked, oldset);
56}
57
58static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
59{
60 return sigprocmask(SIG_SETMASK, oldset, NULL);
61}
62
45static struct page *ocfs2_nopage(struct vm_area_struct * area, 63static struct page *ocfs2_nopage(struct vm_area_struct * area,
46 unsigned long address, 64 unsigned long address,
47 int *type) 65 int *type)
@@ -53,14 +71,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
53 mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, 71 mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
54 type); 72 type);
55 73
56 /* The best way to deal with signals in this path is 74 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
57 * to block them upfront, rather than allowing the
58 * locking paths to return -ERESTARTSYS. */
59 sigfillset(&blocked);
60
61 /* We should technically never get a bad ret return
62 * from sigprocmask */
63 ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
64 if (ret < 0) { 75 if (ret < 0) {
65 mlog_errno(ret); 76 mlog_errno(ret);
66 goto out; 77 goto out;
@@ -68,7 +79,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
68 79
69 page = filemap_nopage(area, address, type); 80 page = filemap_nopage(area, address, type);
70 81
71 ret = sigprocmask(SIG_SETMASK, &oldset, NULL); 82 ret = ocfs2_vm_op_unblock_sigs(&oldset);
72 if (ret < 0) 83 if (ret < 0)
73 mlog_errno(ret); 84 mlog_errno(ret);
74out: 85out:
@@ -76,28 +87,136 @@ out:
76 return page; 87 return page;
77} 88}
78 89
79static struct vm_operations_struct ocfs2_file_vm_ops = { 90static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
80 .nopage = ocfs2_nopage, 91 struct page *page)
81}; 92{
93 int ret;
94 struct address_space *mapping = inode->i_mapping;
95 loff_t pos = page->index << PAGE_CACHE_SHIFT;
96 unsigned int len = PAGE_CACHE_SIZE;
97 pgoff_t last_index;
98 struct page *locked_page = NULL;
99 void *fsdata;
100 loff_t size = i_size_read(inode);
82 101
83int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) 102 /*
103 * Another node might have truncated while we were waiting on
104 * cluster locks.
105 */
106 last_index = size >> PAGE_CACHE_SHIFT;
107 if (page->index > last_index) {
108 ret = -EINVAL;
109 goto out;
110 }
111
112 /*
113 * The i_size check above doesn't catch the case where nodes
114 * truncated and then re-extended the file. We'll re-check the
115 * page mapping after taking the page lock inside of
116 * ocfs2_write_begin_nolock().
117 */
118 if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
119 ret = -EINVAL;
120 goto out;
121 }
122
123 /*
124 * Call ocfs2_write_begin() and ocfs2_write_end() to take
125 * advantage of the allocation code there. We pass a write
126 * length of the whole page (chopped to i_size) to make sure
127 * the whole thing is allocated.
128 *
129 * Since we know the page is up to date, we don't have to
130 * worry about ocfs2_write_begin() skipping some buffer reads
131 * because the "write" would invalidate their data.
132 */
133 if (page->index == last_index)
134 len = size & ~PAGE_CACHE_MASK;
135
136 ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
137 &fsdata, di_bh, page);
138 if (ret) {
139 if (ret != -ENOSPC)
140 mlog_errno(ret);
141 goto out;
142 }
143
144 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
145 fsdata);
146 if (ret < 0) {
147 mlog_errno(ret);
148 goto out;
149 }
150 BUG_ON(ret != len);
151 ret = 0;
152out:
153 return ret;
154}
155
156static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
84{ 157{
85 int ret = 0, lock_level = 0; 158 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
86 struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); 159 struct buffer_head *di_bh = NULL;
160 sigset_t blocked, oldset;
161 int ret, ret2;
162
163 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
164 if (ret < 0) {
165 mlog_errno(ret);
166 return ret;
167 }
168
169 /*
170 * The cluster locks taken will block a truncate from another
171 * node. Taking the data lock will also ensure that we don't
172 * attempt page truncation as part of a downconvert.
173 */
174 ret = ocfs2_meta_lock(inode, &di_bh, 1);
175 if (ret < 0) {
176 mlog_errno(ret);
177 goto out;
178 }
87 179
88 /* 180 /*
89 * Only support shared writeable mmap for local mounts which 181 * The alloc sem should be enough to serialize with
90 * don't know about holes. 182 * ocfs2_truncate_file() changing i_size as well as any thread
183 * modifying the inode btree.
91 */ 184 */
92 if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && 185 down_write(&OCFS2_I(inode)->ip_alloc_sem);
93 ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && 186
94 ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { 187 ret = ocfs2_data_lock(inode, 1);
95 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); 188 if (ret < 0) {
96 /* This is -EINVAL because generic_file_readonly_mmap 189 mlog_errno(ret);
97 * returns it in a similar situation. */ 190 goto out_meta_unlock;
98 return -EINVAL;
99 } 191 }
100 192
193 ret = __ocfs2_page_mkwrite(inode, di_bh, page);
194
195 ocfs2_data_unlock(inode, 1);
196
197out_meta_unlock:
198 up_write(&OCFS2_I(inode)->ip_alloc_sem);
199
200 brelse(di_bh);
201 ocfs2_meta_unlock(inode, 1);
202
203out:
204 ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
205 if (ret2 < 0)
206 mlog_errno(ret2);
207
208 return ret;
209}
210
211static struct vm_operations_struct ocfs2_file_vm_ops = {
212 .nopage = ocfs2_nopage,
213 .page_mkwrite = ocfs2_page_mkwrite,
214};
215
216int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
217{
218 int ret = 0, lock_level = 0;
219
101 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, 220 ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
102 file->f_vfsmnt, &lock_level); 221 file->f_vfsmnt, &lock_level);
103 if (ret < 0) { 222 if (ret < 0) {
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 36289e6295ce..d430fdab16e9 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1674,7 +1674,7 @@ static int ocfs2_symlink(struct inode *dir,
1674 u32 offset = 0; 1674 u32 offset = 0;
1675 1675
1676 inode->i_op = &ocfs2_symlink_inode_operations; 1676 inode->i_op = &ocfs2_symlink_inode_operations;
1677 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 1677 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0,
1678 new_fe_bh, 1678 new_fe_bh,
1679 handle, data_ac, NULL, 1679 handle, data_ac, NULL,
1680 NULL); 1680 NULL);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index a860633e833f..5cc90a40b3c5 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -219,6 +219,7 @@ struct ocfs2_super
219 u16 max_slots; 219 u16 max_slots;
220 s16 node_num; 220 s16 node_num;
221 s16 slot_num; 221 s16 slot_num;
222 s16 preferred_slot;
222 int s_sectsize_bits; 223 int s_sectsize_bits;
223 int s_clustersize; 224 int s_clustersize;
224 int s_clustersize_bits; 225 int s_clustersize_bits;
@@ -305,6 +306,19 @@ static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
305 return 0; 306 return 0;
306} 307}
307 308
309static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
310{
311 /*
312 * Support for sparse files is a pre-requisite
313 */
314 if (!ocfs2_sparse_alloc(osb))
315 return 0;
316
317 if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN)
318 return 1;
319 return 0;
320}
321
308/* set / clear functions because cluster events can make these happen 322/* set / clear functions because cluster events can make these happen
309 * in parallel so we want the transitions to be atomic. this also 323 * in parallel so we want the transitions to be atomic. this also
310 * means that any future flags osb_flags must be protected by spinlock 324 * means that any future flags osb_flags must be protected by spinlock
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index f0d9eb08547a..82f8a75b207e 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,7 @@
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) 90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
91#define OCFS2_FEATURE_RO_COMPAT_SUPP 0 91#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
92 92
93/* 93/*
94 * Heartbeat-only devices are missing journals and other files. The 94 * Heartbeat-only devices are missing journals and other files. The
@@ -116,6 +116,11 @@
116 */ 116 */
117#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 117#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001
118 118
119/*
120 * Unwritten extents support.
121 */
122#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
123
119/* The byte offset of the first backup block will be 1G. 124/* The byte offset of the first backup block will be 1G.
120 * The following will be 4G, 16G, 64G, 256G and 1T. 125 * The following will be 4G, 16G, 64G, 256G and 1T.
121 */ 126 */
@@ -170,6 +175,32 @@
170#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int) 175#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
171 176
172/* 177/*
178 * Space reservation / allocation / free ioctls and argument structure
179 * are designed to be compatible with XFS.
180 *
181 * ALLOCSP* and FREESP* are not and will never be supported, but are
182 * included here for completeness.
183 */
184struct ocfs2_space_resv {
185 __s16 l_type;
186 __s16 l_whence;
187 __s64 l_start;
188 __s64 l_len; /* len == 0 means until end of file */
189 __s32 l_sysid;
190 __u32 l_pid;
191 __s32 l_pad[4]; /* reserve area */
192};
193
194#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
195#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
196#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
197#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
198#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
199#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
200#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
201#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
202
203/*
173 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 204 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
174 */ 205 */
175#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ 206#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d8b79067dc14..af4882b62cfa 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -121,17 +121,25 @@ static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
121 return ret; 121 return ret;
122} 122}
123 123
124static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si) 124static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred)
125{ 125{
126 int i; 126 int i;
127 s16 ret = OCFS2_INVALID_SLOT; 127 s16 ret = OCFS2_INVALID_SLOT;
128 128
129 if (preferred >= 0 && preferred < si->si_num_slots) {
130 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) {
131 ret = preferred;
132 goto out;
133 }
134 }
135
129 for(i = 0; i < si->si_num_slots; i++) { 136 for(i = 0; i < si->si_num_slots; i++) {
130 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { 137 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
131 ret = (s16) i; 138 ret = (s16) i;
132 break; 139 break;
133 } 140 }
134 } 141 }
142out:
135 return ret; 143 return ret;
136} 144}
137 145
@@ -248,7 +256,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
248 if (slot == OCFS2_INVALID_SLOT) { 256 if (slot == OCFS2_INVALID_SLOT) {
249 /* if no slot yet, then just take 1st available 257 /* if no slot yet, then just take 1st available
250 * one. */ 258 * one. */
251 slot = __ocfs2_find_empty_slot(si); 259 slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
252 if (slot == OCFS2_INVALID_SLOT) { 260 if (slot == OCFS2_INVALID_SLOT) {
253 spin_unlock(&si->si_lock); 261 spin_unlock(&si->si_lock);
254 mlog(ML_ERROR, "no free slots available!\n"); 262 mlog(ML_ERROR, "no free slots available!\n");
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index e3437626d183..d9c5c9fcb30f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -98,14 +98,6 @@ static int ocfs2_relink_block_group(handle_t *handle,
98 u16 chain); 98 u16 chain);
99static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, 99static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
100 u32 wanted); 100 u32 wanted);
101static int ocfs2_free_suballoc_bits(handle_t *handle,
102 struct inode *alloc_inode,
103 struct buffer_head *alloc_bh,
104 unsigned int start_bit,
105 u64 bg_blkno,
106 unsigned int count);
107static inline u64 ocfs2_which_suballoc_group(u64 block,
108 unsigned int bit);
109static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, 101static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
110 u64 bg_blkno, 102 u64 bg_blkno,
111 u16 bg_bit_off); 103 u16 bg_bit_off);
@@ -496,13 +488,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
496 488
497 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); 489 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
498 (*ac)->ac_which = OCFS2_AC_USE_META; 490 (*ac)->ac_which = OCFS2_AC_USE_META;
499
500#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
501 slot = 0;
502#else
503 slot = osb->slot_num; 491 slot = osb->slot_num;
504#endif
505
506 (*ac)->ac_group_search = ocfs2_block_group_search; 492 (*ac)->ac_group_search = ocfs2_block_group_search;
507 493
508 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 494 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
@@ -1626,12 +1612,12 @@ bail:
1626/* 1612/*
1627 * expects the suballoc inode to already be locked. 1613 * expects the suballoc inode to already be locked.
1628 */ 1614 */
1629static int ocfs2_free_suballoc_bits(handle_t *handle, 1615int ocfs2_free_suballoc_bits(handle_t *handle,
1630 struct inode *alloc_inode, 1616 struct inode *alloc_inode,
1631 struct buffer_head *alloc_bh, 1617 struct buffer_head *alloc_bh,
1632 unsigned int start_bit, 1618 unsigned int start_bit,
1633 u64 bg_blkno, 1619 u64 bg_blkno,
1634 unsigned int count) 1620 unsigned int count)
1635{ 1621{
1636 int status = 0; 1622 int status = 0;
1637 u32 tmp_used; 1623 u32 tmp_used;
@@ -1703,13 +1689,6 @@ bail:
1703 return status; 1689 return status;
1704} 1690}
1705 1691
1706static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1707{
1708 u64 group = block - (u64) bit;
1709
1710 return group;
1711}
1712
1713int ocfs2_free_dinode(handle_t *handle, 1692int ocfs2_free_dinode(handle_t *handle,
1714 struct inode *inode_alloc_inode, 1693 struct inode *inode_alloc_inode,
1715 struct buffer_head *inode_alloc_bh, 1694 struct buffer_head *inode_alloc_bh,
@@ -1723,19 +1702,6 @@ int ocfs2_free_dinode(handle_t *handle,
1723 inode_alloc_bh, bit, bg_blkno, 1); 1702 inode_alloc_bh, bit, bg_blkno, 1);
1724} 1703}
1725 1704
1726int ocfs2_free_extent_block(handle_t *handle,
1727 struct inode *eb_alloc_inode,
1728 struct buffer_head *eb_alloc_bh,
1729 struct ocfs2_extent_block *eb)
1730{
1731 u64 blk = le64_to_cpu(eb->h_blkno);
1732 u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1733 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1734
1735 return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1736 bit, bg_blkno, 1);
1737}
1738
1739int ocfs2_free_clusters(handle_t *handle, 1705int ocfs2_free_clusters(handle_t *handle,
1740 struct inode *bitmap_inode, 1706 struct inode *bitmap_inode,
1741 struct buffer_head *bitmap_bh, 1707 struct buffer_head *bitmap_bh,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 1a3c94cb9250..f212dc01a84b 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -86,20 +86,29 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
86 u32 *cluster_start, 86 u32 *cluster_start,
87 u32 *num_clusters); 87 u32 *num_clusters);
88 88
89int ocfs2_free_suballoc_bits(handle_t *handle,
90 struct inode *alloc_inode,
91 struct buffer_head *alloc_bh,
92 unsigned int start_bit,
93 u64 bg_blkno,
94 unsigned int count);
89int ocfs2_free_dinode(handle_t *handle, 95int ocfs2_free_dinode(handle_t *handle,
90 struct inode *inode_alloc_inode, 96 struct inode *inode_alloc_inode,
91 struct buffer_head *inode_alloc_bh, 97 struct buffer_head *inode_alloc_bh,
92 struct ocfs2_dinode *di); 98 struct ocfs2_dinode *di);
93int ocfs2_free_extent_block(handle_t *handle,
94 struct inode *eb_alloc_inode,
95 struct buffer_head *eb_alloc_bh,
96 struct ocfs2_extent_block *eb);
97int ocfs2_free_clusters(handle_t *handle, 99int ocfs2_free_clusters(handle_t *handle,
98 struct inode *bitmap_inode, 100 struct inode *bitmap_inode,
99 struct buffer_head *bitmap_bh, 101 struct buffer_head *bitmap_bh,
100 u64 start_blk, 102 u64 start_blk,
101 unsigned int num_clusters); 103 unsigned int num_clusters);
102 104
105static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
106{
107 u64 group = block - (u64) bit;
108
109 return group;
110}
111
103static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, 112static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
104 u64 bg_blkno) 113 u64 bg_blkno)
105{ 114{
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 86b559c7dce9..3a5a1ed09ac9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -82,7 +82,8 @@ MODULE_AUTHOR("Oracle");
82MODULE_LICENSE("GPL"); 82MODULE_LICENSE("GPL");
83 83
84static int ocfs2_parse_options(struct super_block *sb, char *options, 84static int ocfs2_parse_options(struct super_block *sb, char *options,
85 unsigned long *mount_opt, int is_remount); 85 unsigned long *mount_opt, s16 *slot,
86 int is_remount);
86static void ocfs2_put_super(struct super_block *sb); 87static void ocfs2_put_super(struct super_block *sb);
87static int ocfs2_mount_volume(struct super_block *sb); 88static int ocfs2_mount_volume(struct super_block *sb);
88static int ocfs2_remount(struct super_block *sb, int *flags, char *data); 89static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
@@ -114,8 +115,6 @@ static void ocfs2_write_super(struct super_block *sb);
114static struct inode *ocfs2_alloc_inode(struct super_block *sb); 115static struct inode *ocfs2_alloc_inode(struct super_block *sb);
115static void ocfs2_destroy_inode(struct inode *inode); 116static void ocfs2_destroy_inode(struct inode *inode);
116 117
117static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
118
119static const struct super_operations ocfs2_sops = { 118static const struct super_operations ocfs2_sops = {
120 .statfs = ocfs2_statfs, 119 .statfs = ocfs2_statfs,
121 .alloc_inode = ocfs2_alloc_inode, 120 .alloc_inode = ocfs2_alloc_inode,
@@ -140,6 +139,7 @@ enum {
140 Opt_data_ordered, 139 Opt_data_ordered,
141 Opt_data_writeback, 140 Opt_data_writeback,
142 Opt_atime_quantum, 141 Opt_atime_quantum,
142 Opt_slot,
143 Opt_err, 143 Opt_err,
144}; 144};
145 145
@@ -154,6 +154,7 @@ static match_table_t tokens = {
154 {Opt_data_ordered, "data=ordered"}, 154 {Opt_data_ordered, "data=ordered"},
155 {Opt_data_writeback, "data=writeback"}, 155 {Opt_data_writeback, "data=writeback"},
156 {Opt_atime_quantum, "atime_quantum=%u"}, 156 {Opt_atime_quantum, "atime_quantum=%u"},
157 {Opt_slot, "preferred_slot=%u"},
157 {Opt_err, NULL} 158 {Opt_err, NULL}
158}; 159};
159 160
@@ -318,7 +319,7 @@ static void ocfs2_destroy_inode(struct inode *inode)
318/* From xfs_super.c:xfs_max_file_offset 319/* From xfs_super.c:xfs_max_file_offset
319 * Copyright (c) 2000-2004 Silicon Graphics, Inc. 320 * Copyright (c) 2000-2004 Silicon Graphics, Inc.
320 */ 321 */
321static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) 322unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
322{ 323{
323 unsigned int pagefactor = 1; 324 unsigned int pagefactor = 1;
324 unsigned int bitshift = BITS_PER_LONG - 1; 325 unsigned int bitshift = BITS_PER_LONG - 1;
@@ -355,9 +356,10 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
355 int incompat_features; 356 int incompat_features;
356 int ret = 0; 357 int ret = 0;
357 unsigned long parsed_options; 358 unsigned long parsed_options;
359 s16 slot;
358 struct ocfs2_super *osb = OCFS2_SB(sb); 360 struct ocfs2_super *osb = OCFS2_SB(sb);
359 361
360 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { 362 if (!ocfs2_parse_options(sb, data, &parsed_options, &slot, 1)) {
361 ret = -EINVAL; 363 ret = -EINVAL;
362 goto out; 364 goto out;
363 } 365 }
@@ -534,6 +536,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
534 struct dentry *root; 536 struct dentry *root;
535 int status, sector_size; 537 int status, sector_size;
536 unsigned long parsed_opt; 538 unsigned long parsed_opt;
539 s16 slot;
537 struct inode *inode = NULL; 540 struct inode *inode = NULL;
538 struct ocfs2_super *osb = NULL; 541 struct ocfs2_super *osb = NULL;
539 struct buffer_head *bh = NULL; 542 struct buffer_head *bh = NULL;
@@ -541,7 +544,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
541 544
542 mlog_entry("%p, %p, %i", sb, data, silent); 545 mlog_entry("%p, %p, %i", sb, data, silent);
543 546
544 if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { 547 if (!ocfs2_parse_options(sb, data, &parsed_opt, &slot, 0)) {
545 status = -EINVAL; 548 status = -EINVAL;
546 goto read_super_error; 549 goto read_super_error;
547 } 550 }
@@ -571,6 +574,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
571 brelse(bh); 574 brelse(bh);
572 bh = NULL; 575 bh = NULL;
573 osb->s_mount_opt = parsed_opt; 576 osb->s_mount_opt = parsed_opt;
577 osb->preferred_slot = slot;
574 578
575 sb->s_magic = OCFS2_SUPER_MAGIC; 579 sb->s_magic = OCFS2_SUPER_MAGIC;
576 580
@@ -713,6 +717,7 @@ static struct file_system_type ocfs2_fs_type = {
713static int ocfs2_parse_options(struct super_block *sb, 717static int ocfs2_parse_options(struct super_block *sb,
714 char *options, 718 char *options,
715 unsigned long *mount_opt, 719 unsigned long *mount_opt,
720 s16 *slot,
716 int is_remount) 721 int is_remount)
717{ 722{
718 int status; 723 int status;
@@ -722,6 +727,7 @@ static int ocfs2_parse_options(struct super_block *sb,
722 options ? options : "(none)"); 727 options ? options : "(none)");
723 728
724 *mount_opt = 0; 729 *mount_opt = 0;
730 *slot = OCFS2_INVALID_SLOT;
725 731
726 if (!options) { 732 if (!options) {
727 status = 1; 733 status = 1;
@@ -782,6 +788,15 @@ static int ocfs2_parse_options(struct super_block *sb,
782 else 788 else
783 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 789 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
784 break; 790 break;
791 case Opt_slot:
792 option = 0;
793 if (match_int(&args[0], &option)) {
794 status = 0;
795 goto bail;
796 }
797 if (option)
798 *slot = (s16)option;
799 break;
785 default: 800 default:
786 mlog(ML_ERROR, 801 mlog(ML_ERROR,
787 "Unrecognized mount option \"%s\" " 802 "Unrecognized mount option \"%s\" "
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a1..3b9cb3d0b008 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,6 @@ void __ocfs2_abort(struct super_block *sb,
45 45
46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
47 47
48unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
49
48#endif /* OCFS2_SUPER_H */ 50#endif /* OCFS2_SUPER_H */
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index fef6f3d0a4a7..8c6967f3fb11 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -40,9 +40,9 @@
40#include <linux/types.h> 40#include <linux/types.h>
41#include <linux/list.h> 41#include <linux/list.h>
42#include <linux/kref.h> 42#include <linux/kref.h>
43#include <linux/mutex.h>
43 44
44#include <asm/atomic.h> 45#include <asm/atomic.h>
45#include <asm/semaphore.h>
46 46
47#define CONFIGFS_ITEM_NAME_LEN 20 47#define CONFIGFS_ITEM_NAME_LEN 20
48 48
@@ -75,7 +75,6 @@ extern void config_item_init(struct config_item *);
75extern void config_item_init_type_name(struct config_item *item, 75extern void config_item_init_type_name(struct config_item *item,
76 const char *name, 76 const char *name,
77 struct config_item_type *type); 77 struct config_item_type *type);
78extern void config_item_cleanup(struct config_item *);
79 78
80extern struct config_item * config_item_get(struct config_item *); 79extern struct config_item * config_item_get(struct config_item *);
81extern void config_item_put(struct config_item *); 80extern void config_item_put(struct config_item *);
@@ -87,12 +86,10 @@ struct config_item_type {
87 struct configfs_attribute **ct_attrs; 86 struct configfs_attribute **ct_attrs;
88}; 87};
89 88
90
91/** 89/**
92 * group - a group of config_items of a specific type, belonging 90 * group - a group of config_items of a specific type, belonging
93 * to a specific subsystem. 91 * to a specific subsystem.
94 */ 92 */
95
96struct config_group { 93struct config_group {
97 struct config_item cg_item; 94 struct config_item cg_item;
98 struct list_head cg_children; 95 struct list_head cg_children;
@@ -100,13 +97,11 @@ struct config_group {
100 struct config_group **default_groups; 97 struct config_group **default_groups;
101}; 98};
102 99
103
104extern void config_group_init(struct config_group *group); 100extern void config_group_init(struct config_group *group);
105extern void config_group_init_type_name(struct config_group *group, 101extern void config_group_init_type_name(struct config_group *group,
106 const char *name, 102 const char *name,
107 struct config_item_type *type); 103 struct config_item_type *type);
108 104
109
110static inline struct config_group *to_config_group(struct config_item *item) 105static inline struct config_group *to_config_group(struct config_item *item)
111{ 106{
112 return item ? container_of(item,struct config_group,cg_item) : NULL; 107 return item ? container_of(item,struct config_group,cg_item) : NULL;
@@ -122,7 +117,8 @@ static inline void config_group_put(struct config_group *group)
122 config_item_put(&group->cg_item); 117 config_item_put(&group->cg_item);
123} 118}
124 119
125extern struct config_item *config_group_find_obj(struct config_group *, const char *); 120extern struct config_item *config_group_find_item(struct config_group *,
121 const char *);
126 122
127 123
128struct configfs_attribute { 124struct configfs_attribute {
@@ -131,6 +127,22 @@ struct configfs_attribute {
131 mode_t ca_mode; 127 mode_t ca_mode;
132}; 128};
133 129
130/*
131 * Users often need to create attribute structures for their configurable
132 * attributes, containing a configfs_attribute member and function pointers
133 * for the show() and store() operations on that attribute. They can use
134 * this macro (similar to sysfs' __ATTR) to make defining attributes easier.
135 */
136#define __CONFIGFS_ATTR(_name, _mode, _show, _store) \
137{ \
138 .attr = { \
139 .ca_name = __stringify(_name), \
140 .ca_mode = _mode, \
141 .ca_owner = THIS_MODULE, \
142 }, \
143 .show = _show, \
144 .store = _store, \
145}
134 146
135/* 147/*
136 * If allow_link() exists, the item can symlink(2) out to other 148 * If allow_link() exists, the item can symlink(2) out to other
@@ -157,12 +169,13 @@ struct configfs_group_operations {
157 struct config_item *(*make_item)(struct config_group *group, const char *name); 169 struct config_item *(*make_item)(struct config_group *group, const char *name);
158 struct config_group *(*make_group)(struct config_group *group, const char *name); 170 struct config_group *(*make_group)(struct config_group *group, const char *name);
159 int (*commit_item)(struct config_item *item); 171 int (*commit_item)(struct config_item *item);
172 void (*disconnect_notify)(struct config_group *group, struct config_item *item);
160 void (*drop_item)(struct config_group *group, struct config_item *item); 173 void (*drop_item)(struct config_group *group, struct config_item *item);
161}; 174};
162 175
163struct configfs_subsystem { 176struct configfs_subsystem {
164 struct config_group su_group; 177 struct config_group su_group;
165 struct semaphore su_sem; 178 struct mutex su_mutex;
166}; 179};
167 180
168static inline struct configfs_subsystem *to_configfs_subsystem(struct config_group *group) 181static inline struct configfs_subsystem *to_configfs_subsystem(struct config_group *group)
@@ -175,6 +188,11 @@ static inline struct configfs_subsystem *to_configfs_subsystem(struct config_gro
175int configfs_register_subsystem(struct configfs_subsystem *subsys); 188int configfs_register_subsystem(struct configfs_subsystem *subsys);
176void configfs_unregister_subsystem(struct configfs_subsystem *subsys); 189void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
177 190
191/* These functions can sleep and can alloc with GFP_KERNEL */
192/* WARNING: These cannot be called underneath configfs callbacks!! */
193int configfs_depend_item(struct configfs_subsystem *subsys, struct config_item *target);
194void configfs_undepend_item(struct configfs_subsystem *subsys, struct config_item *target);
195
178#endif /* __KERNEL__ */ 196#endif /* __KERNEL__ */
179 197
180#endif /* _CONFIGFS_H_ */ 198#endif /* _CONFIGFS_H_ */