Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2

* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: [PATCH] ocfs2: Release mutex in error handling code [PATCH] ocfs2: Fix oops when racing files truncates with writes into an mmap region [PATCH 2/2] ocfs2: Fix race between mount and recovery [PATCH 1/2] ocfs2: Add counter in struct ocfs2_dinode to track journal replays [PATCH] configfs: Convenience macros for attribute definition. [PATCH] configfs: Pin configfs subsystems separately from new config_items. [PATCH] configfs: Fix open directory making rmdir() fail [PATCH] configfs: Lock new directory inodes before removing on cleanup after failure [PATCH] configfs: Prevent userspace from creating new entries under attaching directories [PATCH] configfs: Fix failing symlink() making rmdir() fail [PATCH] configfs: Fix symlink() to a removing item [PATCH] configfs: Include linux/err.h in linux/configfs.h
author: Linus Torvalds <torvalds@linux-foundation.org> 2008-08-01 14:54:05 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2008-08-01 14:54:05 -0400
commit: 63a16f90167850010864a9e8ebb71d216983090f (patch)
tree: c4b284cc596421ac8100e0ad3b2f56ead4563d2d
parent: 5adf2b03d97111c8955495ba11e8b7db27df8695 (diff)
parent: c259ae52e204d42f8b2d484c85517a4c367030e1 (diff)
14 files changed, 904 insertions, 112 deletions
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
index 44c97e6accb2..fabcb0e00f25 100644
--- a/Documentation/filesystems/configfs/configfs.txt
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -311,9 +311,20 @@ the subsystem must be ready for it.
 [An Example]
 The best example of these basic concepts is the simple_children
-subsystem/group and the simple_child item in configfs_example.c  It
+subsystem/group and the simple_child item in configfs_example_explicit.c
-shows a trivial object displaying and storing an attribute, and a simple
+and configfs_example_macros.c.  It shows a trivial object displaying and
-group creating and destroying these children.
+storing an attribute, and a simple group creating and destroying these
+children.
+The only difference between configfs_example_explicit.c and
+configfs_example_macros.c is how the attributes of the childless item
+are defined.  The childless item has extended attributes, each with
+their own show()/store() operation.  This follows a convention commonly
+used in sysfs.  configfs_example_explicit.c creates these attributes
+by explicitly defining the structures involved.  Conversely
+configfs_example_macros.c uses some convenience macros from configfs.h
+to define the attributes.  These macros are similar to their sysfs
+counterparts.
 [Hierarchy Navigation and the Subsystem Mutex]
diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example_explicit.c
index 039648791701..d428cc9f07f3 100644
--- a/Documentation/filesystems/configfs/configfs_example.c
+++ b/Documentation/filesystems/configfs/configfs_example_explicit.c
@@ -1,8 +1,10 @@
 /*
 * vim: noexpandtab ts=8 sts=0 sw=8:
 *
- * configfs_example.c - This file is a demonstration module containing
+ * configfs_example_explicit.c - This file is a demonstration module
- *      a number of configfs subsystems.
+ *      containing a number of configfs subsystems.  It explicitly defines
+ *      each structure without using the helper macros defined in
+ *      configfs.h.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
@@ -281,7 +283,6 @@ static struct config_item *simple_children_make_item(struct config_group *group,
        if (!simple_child)
                return ERR_PTR(-ENOMEM);
        config_item_init_type_name(&simple_child->item, name,
                                   &simple_child_type);
@@ -302,8 +303,8 @@ static struct configfs_attribute *simple_children_attrs[] = {
 };
 static ssize_t simple_children_attr_show(struct config_item *item,
-                                         struct configfs_attribute *attr,
+                                         struct configfs_attribute *attr,
-                                         char *page)
+                                         char *page)
 {
        return sprintf(page,
 "[02-simple-children]\n"
@@ -318,7 +319,7 @@ static void simple_children_release(struct config_item *item)
 }
 static struct configfs_item_operations simple_children_item_ops = {
-        .release        = simple_children_release,
+        .release        = simple_children_release,
        .show_attribute = simple_children_attr_show,
 };
@@ -368,7 +369,6 @@ static struct config_group *group_children_make_group(struct config_group *group
        if (!simple_children)
                return ERR_PTR(-ENOMEM);
        config_group_init_type_name(&simple_children->group, name,
                                    &simple_children_type);
@@ -387,8 +387,8 @@ static struct configfs_attribute *group_children_attrs[] = {
 };
 static ssize_t group_children_attr_show(struct config_item *item,
-                                        struct configfs_attribute *attr,
+                                        struct configfs_attribute *attr,
-                                        char *page)
+                                        char *page)
 {
        return sprintf(page,
 "[03-group-children]\n"
diff --git a/Documentation/filesystems/configfs/configfs_example_macros.c b/Documentation/filesystems/configfs/configfs_example_macros.c
new file mode 100644
index 000000000000..d8e30a0378aa
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs_example_macros.c
@@ -0,0 +1,448 @@
+/*
+ * vim: noexpandtab ts=8 sts=0 sw=8:
+ *
+ * configfs_example_macros.c - This file is a demonstration module
+ *      containing a number of configfs subsystems.  It uses the helper
+ *      macros defined by configfs.h
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ *      sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/configfs.h>
+/*
+ * 01-childless
+ *
+ * This first example is a childless subsystem.  It cannot create
+ * any config_items.  It just has attributes.
+ *
+ * Note that we are enclosing the configfs_subsystem inside a container.
+ * This is not necessary if a subsystem has no attributes directly
+ * on the subsystem.  See the next example, 02-simple-children, for
+ * such a subsystem.
+ */
+struct childless {
+        struct configfs_subsystem subsys;       /* embedded subsystem; to_childless() maps back from it */
+        int showme;                             /* printed, then incremented, by childless_showme_read() */
+        int storeme;                            /* read by childless_storeme_read(), set by childless_storeme_write() */
+};
+/*
+ * Map a config_item back to the struct childless whose embedded
+ * subsystem owns it.  A NULL item yields NULL.
+ */
+static inline struct childless *to_childless(struct config_item *item)
+{
+        struct configfs_subsystem *su;
+        if (!item)
+                return NULL;
+        su = to_configfs_subsystem(to_config_group(item));
+        return container_of(su, struct childless, subsys);
+}
+CONFIGFS_ATTR_STRUCT(childless);
+/*
+ * Helpers that define a struct childless_attribute named
+ * childless_attr_<name>.  Neither macro ends in a semicolon: the caller
+ * supplies it, so each use reads like an ordinary declaration and does
+ * not expand to an empty ";;" declaration at file scope.
+ */
+#define CHILDLESS_ATTR(_name, _mode, _show, _store)     \
+struct childless_attribute childless_attr_##_name = __CONFIGFS_ATTR(_name, _mode, _show, _store)
+#define CHILDLESS_ATTR_RO(_name, _show) \
+struct childless_attribute childless_attr_##_name = __CONFIGFS_ATTR_RO(_name, _show)
+/*
+ * Print the current value of showme into page, then increment it, so
+ * each read reports a value one higher than the previous read.
+ * Returns whatever sprintf() reports as the length written.
+ */
+static ssize_t childless_showme_read(struct childless *childless,
+                                     char *page)
+{
+        return sprintf(page, "%d\n", childless->showme++);
+}
+/* Format the stored integer, followed by a newline, into page. */
+static ssize_t childless_storeme_read(struct childless *childless,
+                                      char *page)
+{
+        ssize_t len = sprintf(page, "%d\n", childless->storeme);
+        return len;
+}
+/*
+ * Parse a decimal number from page and save it in storeme.
+ *
+ * Returns count on success, -EINVAL if anything other than an optional
+ * newline follows the parsed digits, or -ERANGE if the value is larger
+ * than INT_MAX.
+ */
+static ssize_t childless_storeme_write(struct childless *childless,
+                                       const char *page,
+                                       size_t count)
+{
+        char *end = (char *) page;
+        unsigned long value = simple_strtoul(end, &end, 10);
+        if (!end)
+                return -EINVAL;
+        if (*end != '\0' && *end != '\n')
+                return -EINVAL;
+        if (value > INT_MAX)
+                return -ERANGE;
+        childless->storeme = value;
+        return count;
+}
+/* Copy the fixed description of the 01-childless subsystem into page. */
+static ssize_t childless_description_read(struct childless *childless,
+                                          char *page)
+{
+        static const char description[] =
+"[01-childless]\n"
+"\n"
+"The childless subsystem is the simplest possible subsystem in\n"
+"configfs.  It does not support the creation of child config_items.\n"
+"It only has a few attributes.  In fact, it isn't much different\n"
+"than a directory in /proc.\n";
+        return sprintf(page, "%s", description);
+}
+CHILDLESS_ATTR_RO(showme, childless_showme_read);      /* defines childless_attr_showme */
+CHILDLESS_ATTR(storeme, S_IRUGO | S_IWUSR, childless_storeme_read,
+               childless_storeme_write);                /* defines childless_attr_storeme */
+CHILDLESS_ATTR_RO(description, childless_description_read);    /* defines childless_attr_description */
+static struct configfs_attribute *childless_attrs[] = { /* NULL-terminated; used as childless_type.ct_attrs */
+        &childless_attr_showme.attr,
+        &childless_attr_storeme.attr,
+        &childless_attr_description.attr,
+        NULL,
+};
+CONFIGFS_ATTR_OPS(childless);   /* presumably generates childless_attr_show()/childless_attr_store() used below -- confirm in configfs.h */
+static struct configfs_item_operations childless_item_ops = {  /* attribute I/O only; no .release is set here */
+        .show_attribute         = childless_attr_show,
+        .store_attribute        = childless_attr_store,
+};
+static struct config_item_type childless_type = {      /* item type for the 01-childless subsystem's own item */
+        .ct_item_ops    = &childless_item_ops,
+        .ct_attrs       = childless_attrs,
+        .ct_owner       = THIS_MODULE,
+};
+static struct childless childless_subsys = {   /* single instance; its .subsys is listed in example_subsys[] */
+        .subsys = {
+                .su_group = {
+                        .cg_item = {
+                                .ci_namebuf = "01-childless",
+                                .ci_type = &childless_type,
+                        },
+                },
+        },
+};
+/* ----------------------------------------------------------------- */
+/*
+ * 02-simple-children
+ *
+ * This example merely has a simple one-attribute child.  Note that
+ * there is no extra attribute structure, as the child's attribute is
+ * known from the get-go.  Also, there is no container for the
+ * subsystem, as it has no attributes of its own.
+ */
+struct simple_child {
+        struct config_item item;        /* embedded item; to_simple_child() maps back from it */
+        int storeme;                    /* shown by simple_child_attr_show(), set by simple_child_attr_store() */
+};
+/* Recover the enclosing struct simple_child from its config_item; NULL maps to NULL. */
+static inline struct simple_child *to_simple_child(struct config_item *item)
+{
+        if (!item)
+                return NULL;
+        return container_of(item, struct simple_child, item);
+}
+static struct configfs_attribute simple_child_attr_storeme = { /* the only attribute of a simple_child item */
+        .ca_owner = THIS_MODULE,
+        .ca_name = "storeme",
+        .ca_mode = S_IRUGO | S_IWUSR,
+};
+static struct configfs_attribute *simple_child_attrs[] = {     /* NULL-terminated; used as simple_child_type.ct_attrs */
+        &simple_child_attr_storeme,
+        NULL,
+};
+/*
+ * Show handler for simple_child items: prints storeme followed by a
+ * newline.  attr is not examined; simple_child_attrs[] lists only one
+ * attribute.
+ */
+static ssize_t simple_child_attr_show(struct config_item *item,
+                                      struct configfs_attribute *attr,
+                                      char *page)
+{
+        return sprintf(page, "%d\n", to_simple_child(item)->storeme);
+}
+/*
+ * Store handler for simple_child items: parse a decimal number from page
+ * and save it in storeme.
+ *
+ * Returns count on success, -EINVAL if anything other than an optional
+ * newline follows the parsed digits, or -ERANGE if the value is larger
+ * than INT_MAX.
+ */
+static ssize_t simple_child_attr_store(struct config_item *item,
+                                       struct configfs_attribute *attr,
+                                       const char *page, size_t count)
+{
+        struct simple_child *child = to_simple_child(item);
+        char *end = (char *) page;
+        unsigned long value = simple_strtoul(end, &end, 10);
+        if (!end)
+                return -EINVAL;
+        if (*end != '\0' && *end != '\n')
+                return -EINVAL;
+        if (value > INT_MAX)
+                return -ERANGE;
+        child->storeme = value;
+        return count;
+}
+/* Release callback: frees the simple_child that embeds item. */
+static void simple_child_release(struct config_item *item)
+{
+        struct simple_child *child = to_simple_child(item);
+        kfree(child);
+}
+static struct configfs_item_operations simple_child_item_ops = {       /* callbacks for one simple_child item */
+        .release                = simple_child_release,
+        .show_attribute         = simple_child_attr_show,
+        .store_attribute        = simple_child_attr_store,
+};
+static struct config_item_type simple_child_type = {    /* type passed to config_item_init_type_name() in simple_children_make_item() */
+        .ct_item_ops    = &simple_child_item_ops,
+        .ct_attrs       = simple_child_attrs,
+        .ct_owner       = THIS_MODULE,
+};
+struct simple_children {
+        struct config_group group;      /* sole member; to_simple_children() maps back from it */
+};
+/* Recover the struct simple_children that embeds item's config_group; NULL maps to NULL. */
+static inline struct simple_children *to_simple_children(struct config_item *item)
+{
+        struct config_group *grp;
+        if (!item)
+                return NULL;
+        grp = to_config_group(item);
+        return container_of(grp, struct simple_children, group);
+}
+/*
+ * ->make_item() for a simple_children group: allocate a zeroed
+ * simple_child, initialise its item with name and simple_child_type, and
+ * hand back the embedded item.  Returns ERR_PTR(-ENOMEM) when the
+ * allocation fails.
+ */
+static struct config_item *simple_children_make_item(struct config_group *group, const char *name)
+{
+        struct simple_child *child = kzalloc(sizeof(*child), GFP_KERNEL);
+        if (child == NULL)
+                return ERR_PTR(-ENOMEM);
+        config_item_init_type_name(&child->item, name, &simple_child_type);
+        child->storeme = 0;
+        return &child->item;
+}
+static struct configfs_attribute simple_children_attr_description = {  /* read-only "description" attribute */
+        .ca_owner = THIS_MODULE,
+        .ca_name = "description",
+        .ca_mode = S_IRUGO,
+};
+static struct configfs_attribute *simple_children_attrs[] = {  /* NULL-terminated; used as simple_children_type.ct_attrs */
+        &simple_children_attr_description,
+        NULL,
+};
+/*
+ * Show handler for a simple_children group: copies the fixed description
+ * text into page.  Neither item nor attr is examined.
+ */
+static ssize_t simple_children_attr_show(struct config_item *item,
+                                         struct configfs_attribute *attr,
+                                         char *page)
+{
+        static const char description[] =
+"[02-simple-children]\n"
+"\n"
+"This subsystem allows the creation of child config_items.  These\n"
+"items have only one attribute that is readable and writeable.\n";
+        return sprintf(page, "%s", description);
+}
+/* Release callback: frees the simple_children that embeds item's group. */
+static void simple_children_release(struct config_item *item)
+{
+        struct simple_children *children = to_simple_children(item);
+        kfree(children);
+}
+static struct configfs_item_operations simple_children_item_ops = {    /* callbacks for a simple_children group's own item */
+        .release        = simple_children_release,
+        .show_attribute = simple_children_attr_show,
+};
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations simple_children_group_ops = {
+        .make_item      = simple_children_make_item,
+};
+static struct config_item_type simple_children_type = {        /* used by simple_children_subsys and by group_children_make_group() */
+        .ct_item_ops    = &simple_children_item_ops,
+        .ct_group_ops   = &simple_children_group_ops,
+        .ct_attrs       = simple_children_attrs,
+        .ct_owner       = THIS_MODULE,
+};
+static struct configfs_subsystem simple_children_subsys = {    /* bare subsystem, no container struct; listed in example_subsys[] */
+        .su_group = {
+                .cg_item = {
+                        .ci_namebuf = "02-simple-children",
+                        .ci_type = &simple_children_type,
+                },
+        },
+};
+/* ----------------------------------------------------------------- */
+/*
+ * 03-group-children
+ *
+ * This example reuses the simple_children group from above.  However,
+ * the simple_children group is not the subsystem itself, it is a
+ * child of the subsystem.  Creation of a group in the subsystem creates
+ * a new simple_children group.  That group can then have simple_child
+ * children of its own.
+ */
+/*
+ * ->make_group() for 03-group-children: allocate a zeroed
+ * simple_children, initialise its group with name and
+ * simple_children_type, and hand back the embedded group.  Returns
+ * ERR_PTR(-ENOMEM) when the allocation fails.
+ */
+static struct config_group *group_children_make_group(struct config_group *group, const char *name)
+{
+        struct simple_children *children = kzalloc(sizeof(*children), GFP_KERNEL);
+        if (children == NULL)
+                return ERR_PTR(-ENOMEM);
+        config_group_init_type_name(&children->group, name,
+                                    &simple_children_type);
+        return &children->group;
+}
+static struct configfs_attribute group_children_attr_description = {   /* read-only "description" attribute */
+        .ca_owner = THIS_MODULE,
+        .ca_name = "description",
+        .ca_mode = S_IRUGO,
+};
+static struct configfs_attribute *group_children_attrs[] = {   /* NULL-terminated; used as group_children_type.ct_attrs */
+        &group_children_attr_description,
+        NULL,
+};
+/*
+ * Show handler for the 03-group-children subsystem: copies the fixed
+ * description text into page.  Neither item nor attr is examined.
+ */
+static ssize_t group_children_attr_show(struct config_item *item,
+                                        struct configfs_attribute *attr,
+                                        char *page)
+{
+        static const char description[] =
+"[03-group-children]\n"
+"\n"
+"This subsystem allows the creation of child config_groups.  These\n"
+"groups are like the subsystem simple-children.\n";
+        return sprintf(page, "%s", description);
+}
+static struct configfs_item_operations group_children_item_ops = {     /* only a show handler; no .release or .store_attribute */
+        .show_attribute = group_children_attr_show,
+};
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations group_children_group_ops = {
+        .make_group     = group_children_make_group,
+};
+static struct config_item_type group_children_type = { /* item type for the 03-group-children subsystem's own item */
+        .ct_item_ops    = &group_children_item_ops,
+        .ct_group_ops   = &group_children_group_ops,
+        .ct_attrs       = group_children_attrs,
+        .ct_owner       = THIS_MODULE,
+};
+static struct configfs_subsystem group_children_subsys = {     /* listed in example_subsys[] */
+        .su_group = {
+                .cg_item = {
+                        .ci_namebuf = "03-group-children",
+                        .ci_type = &group_children_type,
+                },
+        },
+};
+/* ----------------------------------------------------------------- */
+/*
+ * We're now done with our subsystem definitions.
+ * For convenience in this module, here's a list of them all.  It
+ * allows the init function to easily register them.  Most modules
+ * will only have one subsystem, and will only call register_subsystem
+ * on it directly.
+ */
+static struct configfs_subsystem *example_subsys[] = { /* NULL-terminated; walked by configfs_example_init() and configfs_example_exit() */
+        &childless_subsys.subsys,
+        &simple_children_subsys,
+        &group_children_subsys,
+        NULL,
+};
+/*
+ * Register every subsystem listed in example_subsys[].
+ *
+ * Each subsystem's group and mutex are initialised before it is
+ * registered.  If a registration fails, the error is logged and only
+ * the subsystems registered before it are unregistered; the one that
+ * failed was never registered, so it is left alone.
+ *
+ * Returns 0 on success, or the error from configfs_register_subsystem().
+ */
+static int __init configfs_example_init(void)
+{
+        int ret;
+        int i;
+        struct configfs_subsystem *subsys;
+        for (i = 0; example_subsys[i]; i++) {
+                subsys = example_subsys[i];
+                config_group_init(&subsys->su_group);
+                mutex_init(&subsys->su_mutex);
+                ret = configfs_register_subsystem(subsys);
+                if (ret) {
+                        printk(KERN_ERR "Error %d while registering subsystem %s\n",
+                               ret,
+                               subsys->su_group.cg_item.ci_namebuf);
+                        goto out_unregister;
+                }
+        }
+        return 0;
+out_unregister:
+        /* Entry i failed to register; roll back entries i-1 .. 0 only. */
+        for (i--; i >= 0; i--)
+                configfs_unregister_subsystem(example_subsys[i]);
+        return ret;
+}
+/* Unregister every subsystem in example_subsys[], in array order. */
+static void __exit configfs_example_exit(void)
+{
+        struct configfs_subsystem **cursor = example_subsys;
+        while (*cursor) {
+                configfs_unregister_subsystem(*cursor);
+                cursor++;
+        }
+}
+module_init(configfs_example_init);    /* registers the three example subsystems */
+module_exit(configfs_example_exit);    /* unregisters them again */
+MODULE_LICENSE("GPL");
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index da015c12e3ea..762d287123ca 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -49,8 +49,10 @@ struct configfs_dirent {
 #define CONFIGFS_USET_DEFAULT   0x0080
 #define CONFIGFS_USET_DROPPING  0x0100
 #define CONFIGFS_USET_IN_MKDIR  0x0200
+#define CONFIGFS_USET_CREATING  0x0400
 #define CONFIGFS_NOT_PINNED     (CONFIGFS_ITEM_ATTR)
+extern struct mutex configfs_symlink_mutex;
 extern spinlock_t configfs_dirent_lock;
 extern struct vfsmount * configfs_mount;
@@ -66,6 +68,7 @@ extern void configfs_inode_exit(void);
 extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
 extern int configfs_make_dirent(struct configfs_dirent *,
                                struct dentry *, void *, umode_t, int);
+extern int configfs_dirent_is_ready(struct configfs_dirent *);
 extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int);
 extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 179589be063a..7a8db78a91d2 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -185,7 +185,7 @@ static int create_dir(struct config_item * k, struct dentry * p,
        error = configfs_dirent_exists(p->d_fsdata, d->d_name.name);
        if (!error)
                error = configfs_make_dirent(p->d_fsdata, d, k, mode,
-                                             CONFIGFS_DIR);
+                                             CONFIGFS_DIR | CONFIGFS_USET_CREATING);
        if (!error) {
                error = configfs_create(d, mode, init_dir);
                if (!error) {
@@ -209,6 +209,9 @@ static int create_dir(struct config_item * k, struct dentry * p,
 *      configfs_create_dir - create a directory for an config_item.
 *      @item:          config_itemwe're creating directory for.
 *      @dentry:        config_item's dentry.
+ *
+ *      Note: user-created entries won't be allowed under this new directory
+ *      until it is validated by configfs_dir_set_ready()
 */
 static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
@@ -231,6 +234,44 @@ static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
        return error;
 }
+/*
+ * Allow userspace to create new entries under a new directory created with
+ * configfs_create_dir(), and under all of its child directories recursively.
+ * @sd          configfs_dirent of the new directory to validate
+ *
+ * Caller must hold configfs_dirent_lock.
+ */
+static void configfs_dir_set_ready(struct configfs_dirent *sd)
+{
+        struct configfs_dirent *child;
+        /* Clear the flag on this directory, then on flagged descendants. */
+        sd->s_type &= ~CONFIGFS_USET_CREATING;
+        list_for_each_entry(child, &sd->s_children, s_sibling) {
+                if (!(child->s_type & CONFIGFS_USET_CREATING))
+                        continue;
+                configfs_dir_set_ready(child);
+        }
+}
+/*
+ * Check that a directory does not belong to a directory hierarchy being
+ * attached and not validated yet.
+ * @sd          configfs_dirent of the directory to check
+ *
+ * @return      non-zero iff the directory was validated
+ *
+ * Note: takes configfs_dirent_lock, so the result may change from false to true
+ * in two consecutive calls, but never from true to false.
+ */
+int configfs_dirent_is_ready(struct configfs_dirent *sd)
+{
+        int creating;
+        /* Sample the CREATING flag while holding configfs_dirent_lock. */
+        spin_lock(&configfs_dirent_lock);
+        creating = sd->s_type & CONFIGFS_USET_CREATING;
+        spin_unlock(&configfs_dirent_lock);
+        return !creating;
+}
 int configfs_create_link(struct configfs_symlink *sl,
                         struct dentry *parent,
                         struct dentry *dentry)
@@ -283,6 +324,8 @@ static void remove_dir(struct dentry * d)
 * The only thing special about this is that we remove any files in
 * the directory before we remove the directory, and we've inlined
 * what used to be configfs_rmdir() below, instead of calling separately.
+ *
+ * Caller holds the mutex of the item's inode
 */
 static void configfs_remove_dir(struct config_item * item)
@@ -330,7 +373,19 @@ static struct dentry * configfs_lookup(struct inode *dir,
        struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
        struct configfs_dirent * sd;
        int found = 0;
-        int err = 0;
+        int err;
+        /*
+         * Fake invisibility if dir belongs to a group/default groups hierarchy
+         * being attached
+         *
+         * This forbids userspace to read/write attributes of items which may
+         * not complete their initialization, since the dentries of the
+         * attributes won't be instantiated.
+         */
+        err = -ENOENT;
+        if (!configfs_dirent_is_ready(parent_sd))
+                goto out;
        list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
                if (sd->s_type & CONFIGFS_NOT_PINNED) {
@@ -353,6 +408,7 @@ static struct dentry * configfs_lookup(struct inode *dir,
                return simple_lookup(dir, dentry, nd);
        }
+out:
        return ERR_PTR(err);
 }
@@ -370,13 +426,17 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex
        struct configfs_dirent *sd;
        int ret;
+        /* Mark that we're trying to drop the group */
+        parent_sd->s_type |= CONFIGFS_USET_DROPPING;
        ret = -EBUSY;
        if (!list_empty(&parent_sd->s_links))
                goto out;
        ret = 0;
        list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
-                if (sd->s_type & CONFIGFS_NOT_PINNED)
+                if (!sd->s_element ||
+                    (sd->s_type & CONFIGFS_NOT_PINNED))
                        continue;
                if (sd->s_type & CONFIGFS_USET_DEFAULT) {
                        /* Abort if racing with mkdir() */
@@ -385,8 +445,6 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex
                                        *wait_mutex = &sd->s_dentry->d_inode->i_mutex;
                                return -EAGAIN;
                        }
-                        /* Mark that we're trying to drop the group */
-                        sd->s_type |= CONFIGFS_USET_DROPPING;
                        /*
                         * Yup, recursive.  If there's a problem, blame
@@ -414,12 +472,11 @@ static void configfs_detach_rollback(struct dentry *dentry)
        struct configfs_dirent *parent_sd = dentry->d_fsdata;
        struct configfs_dirent *sd;
-        list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+        parent_sd->s_type &= ~CONFIGFS_USET_DROPPING;
-                if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+        list_for_each_entry(sd, &parent_sd->s_children, s_sibling)
+                if (sd->s_type & CONFIGFS_USET_DEFAULT)
                        configfs_detach_rollback(sd->s_dentry);
-                        sd->s_type &= ~CONFIGFS_USET_DROPPING;
-                }
-        }
 }
 static void detach_attrs(struct config_item * item)
@@ -558,36 +615,21 @@ static int create_default_group(struct config_group *parent_group,
 static int populate_groups(struct config_group *group)
 {
        struct config_group *new_group;
-        struct dentry *dentry = group->cg_item.ci_dentry;
        int ret = 0;
        int i;
        if (group->default_groups) {
-                /*
-                 * FYI, we're faking mkdir here
-                 * I'm not sure we need this semaphore, as we're called
-                 * from our parent's mkdir.  That holds our parent's
-                 * i_mutex, so afaik lookup cannot continue through our
-                 * parent to find us, let alone mess with our tree.
-                 * That said, taking our i_mutex is closer to mkdir
-                 * emulation, and shouldn't hurt.
-                 */
-                mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
                for (i = 0; group->default_groups[i]; i++) {
                        new_group = group->default_groups[i];
                        ret = create_default_group(group, new_group);
-                        if (ret)
+                        if (ret) {
+                                detach_groups(group);
                                break;
+                        }
                }
-                mutex_unlock(&dentry->d_inode->i_mutex);
        }
-        if (ret)
-                detach_groups(group);
        return ret;
 }
@@ -702,7 +744,15 @@ static int configfs_attach_item(struct config_item *parent_item,
        if (!ret) {
                ret = populate_attrs(item);
                if (ret) {
+                        /*
+                         * We are going to remove an inode and its dentry but
+                         * the VFS may already have hit and used them. Thus,
+                         * we must lock them as rmdir() would.
+                         */
+                        mutex_lock(&dentry->d_inode->i_mutex);
                        configfs_remove_dir(item);
+                        dentry->d_inode->i_flags |= S_DEAD;
+                        mutex_unlock(&dentry->d_inode->i_mutex);
                        d_delete(dentry);
                }
        }
@@ -710,6 +760,7 @@ static int configfs_attach_item(struct config_item *parent_item,
        return ret;
 }
+/* Caller holds the mutex of the item's inode */
 static void configfs_detach_item(struct config_item *item)
 {
        detach_attrs(item);
@@ -728,16 +779,30 @@ static int configfs_attach_group(struct config_item *parent_item,
                sd = dentry->d_fsdata;
                sd->s_type |= CONFIGFS_USET_DIR;
+                /*
+                 * FYI, we're faking mkdir in populate_groups()
+                 * We must lock the group's inode to avoid races with the VFS
+                 * which can already hit the inode and try to add/remove entries
+                 * under it.
+                 *
+                 * We must also lock the inode to remove it safely in case of
+                 * error, as rmdir() would.
+                 */
+                mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
                ret = populate_groups(to_config_group(item));
                if (ret) {
                        configfs_detach_item(item);
-                        d_delete(dentry);
+                        dentry->d_inode->i_flags |= S_DEAD;
                }
+                mutex_unlock(&dentry->d_inode->i_mutex);
+                if (ret)
+                        d_delete(dentry);
        }
        return ret;
 }
+/* Caller holds the mutex of the group's inode */
 static void configfs_detach_group(struct config_item *item)
 {
        detach_groups(to_config_group(item));
@@ -1035,7 +1100,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct configfs_subsystem *subsys;
        struct configfs_dirent *sd;
        struct config_item_type *type;
-        struct module *owner = NULL;
+        struct module *subsys_owner = NULL, *new_item_owner = NULL;
        char *name;
        if (dentry->d_parent == configfs_sb->s_root) {
@@ -1044,6 +1109,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        }
        sd = dentry->d_parent->d_fsdata;
+        /*
+         * Fake invisibility if dir belongs to a group/default groups hierarchy
+         * being attached
+         */
+        if (!configfs_dirent_is_ready(sd)) {
+                ret = -ENOENT;
+                goto out;
+        }
        if (!(sd->s_type & CONFIGFS_USET_DIR)) {
                ret = -EPERM;
                goto out;
@@ -1062,10 +1137,25 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
                goto out_put;
        }
+        /*
+         * The subsystem may belong to a different module than the item
+         * being created.  We don't want to safely pin the new item but
+         * fail to pin the subsystem it sits under.
+         */
+        if (!subsys->su_group.cg_item.ci_type) {
+                ret = -EINVAL;
+                goto out_put;
+        }
+        subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner;
+        if (!try_module_get(subsys_owner)) {
+                ret = -EINVAL;
+                goto out_put;
+        }
        name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
        if (!name) {
                ret = -ENOMEM;
-                goto out_put;
+                goto out_subsys_put;
        }
        snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
@@ -1094,10 +1184,10 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        kfree(name);
        if (ret) {
                /*
-                 * If item == NULL, then link_obj() was never called.
+                 * If ret != 0, then link_obj() was never called.
                 * There are no extra references to clean up.
                 */
-                goto out_put;
+                goto out_subsys_put;
        }
        /*
@@ -1111,8 +1201,8 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
                goto out_unlink;
        }
-        owner = type->ct_owner;
+        new_item_owner = type->ct_owner;
-        if (!try_module_get(owner)) {
+        if (!try_module_get(new_item_owner)) {
                ret = -EINVAL;
                goto out_unlink;
        }
@@ -1142,6 +1232,8 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        spin_lock(&configfs_dirent_lock);
        sd->s_type &= ~CONFIGFS_USET_IN_MKDIR;
+        if (!ret)
+                configfs_dir_set_ready(dentry->d_fsdata);
        spin_unlock(&configfs_dirent_lock);
 out_unlink:
@@ -1159,9 +1251,13 @@ out_unlink:
                mutex_unlock(&subsys->su_mutex);
                if (module_got)
-                        module_put(owner);
+                        module_put(new_item_owner);
        }
+out_subsys_put:
+        if (ret)
+                module_put(subsys_owner);
 out_put:
        /*
         * link_obj()/link_group() took a reference from child->parent,
@@ -1180,7 +1276,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
        struct config_item *item;
        struct configfs_subsystem *subsys;
        struct configfs_dirent *sd;
-        struct module *owner = NULL;
+        struct module *subsys_owner = NULL, *dead_item_owner = NULL;
        int ret;
        if (dentry->d_parent == configfs_sb->s_root)
@@ -1207,6 +1303,15 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
                return -EINVAL;
        }
+        /* configfs_mkdir() shouldn't have allowed this */
+        BUG_ON(!subsys->su_group.cg_item.ci_type);
+        subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner;
+        /*
+         * Ensure that no racing symlink() will make detach_prep() fail while
+         * the new link is temporarily attached
+         */
+        mutex_lock(&configfs_symlink_mutex);
        spin_lock(&configfs_dirent_lock);
        do {
                struct mutex *wait_mutex;
@@ -1215,6 +1320,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
                if (ret) {
                        configfs_detach_rollback(dentry);
                        spin_unlock(&configfs_dirent_lock);
+                        mutex_unlock(&configfs_symlink_mutex);
                        if (ret != -EAGAIN) {
                                config_item_put(parent_item);
                                return ret;
@@ -1224,10 +1330,12 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
                        mutex_lock(wait_mutex);
                        mutex_unlock(wait_mutex);
+                        mutex_lock(&configfs_symlink_mutex);
                        spin_lock(&configfs_dirent_lock);
                }
        } while (ret == -EAGAIN);
        spin_unlock(&configfs_dirent_lock);
+        mutex_unlock(&configfs_symlink_mutex);
        /* Get a working ref for the duration of this function */
        item = configfs_get_config_item(dentry);
@@ -1236,7 +1344,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
        config_item_put(parent_item);
        if (item->ci_type)
-                owner = item->ci_type->ct_owner;
+                dead_item_owner = item->ci_type->ct_owner;
        if (sd->s_type & CONFIGFS_USET_DIR) {
                configfs_detach_group(item);
@@ -1258,7 +1366,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
        /* Drop our reference from above */
        config_item_put(item);
-        module_put(owner);
+        module_put(dead_item_owner);
+        module_put(subsys_owner);
        return 0;
 }
@@ -1314,13 +1423,24 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
 {
        struct dentry * dentry = file->f_path.dentry;
        struct configfs_dirent * parent_sd = dentry->d_fsdata;
+        int err;
        mutex_lock(&dentry->d_inode->i_mutex);
-        file->private_data = configfs_new_dirent(parent_sd, NULL);
+        /*
+         * Fake invisibility if dir belongs to a group/default groups hierarchy
+         * being attached
+         */
+        err = -ENOENT;
+        if (configfs_dirent_is_ready(parent_sd)) {
+                file->private_data = configfs_new_dirent(parent_sd, NULL);
+                if (IS_ERR(file->private_data))
+                        err = PTR_ERR(file->private_data);
+                else
+                        err = 0;
+        }
        mutex_unlock(&dentry->d_inode->i_mutex);
-        return IS_ERR(file->private_data) ? PTR_ERR(file->private_data) : 0;
+        return err;
 }
 static int configfs_dir_close(struct inode *inode, struct file *file)
@@ -1491,6 +1611,10 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
                if (err) {
                        d_delete(dentry);
                        dput(dentry);
+                } else {
+                        spin_lock(&configfs_dirent_lock);
+                        configfs_dir_set_ready(dentry->d_fsdata);
+                        spin_unlock(&configfs_dirent_lock);
                }
        }
@@ -1517,11 +1641,13 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
        mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
                          I_MUTEX_PARENT);
        mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+        mutex_lock(&configfs_symlink_mutex);
        spin_lock(&configfs_dirent_lock);
        if (configfs_detach_prep(dentry, NULL)) {
                printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
        }
        spin_unlock(&configfs_dirent_lock);
+        mutex_unlock(&configfs_symlink_mutex);
        configfs_detach_group(&group->cg_item);
        dentry->d_inode->i_flags |= S_DEAD;
        mutex_unlock(&dentry->d_inode->i_mutex);
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 0004d18c40ac..bf74973b0492 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -31,6 +31,9 @@
 #include <linux/configfs.h>
 #include "configfs_internal.h"
+/* Protects attachments of new symlinks */
+DEFINE_MUTEX(configfs_symlink_mutex);
 static int item_depth(struct config_item * item)
 {
        struct config_item * p = item;
@@ -73,11 +76,20 @@ static int create_link(struct config_item *parent_item,
        struct configfs_symlink *sl;
        int ret;
+        ret = -ENOENT;
+        if (!configfs_dirent_is_ready(target_sd))
+                goto out;
        ret = -ENOMEM;
        sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
        if (sl) {
                sl->sl_target = config_item_get(item);
                spin_lock(&configfs_dirent_lock);
+                if (target_sd->s_type & CONFIGFS_USET_DROPPING) {
+                        spin_unlock(&configfs_dirent_lock);
+                        config_item_put(item);
+                        kfree(sl);
+                        return -ENOENT;
+                }
                list_add(&sl->sl_list, &target_sd->s_links);
                spin_unlock(&configfs_dirent_lock);
                ret = configfs_create_link(sl, parent_item->ci_dentry,
@@ -91,6 +103,7 @@ static int create_link(struct config_item *parent_item,
                }
        }
+out:
        return ret;
 }
@@ -120,6 +133,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
 {
        int ret;
        struct nameidata nd;
+        struct configfs_dirent *sd;
        struct config_item *parent_item;
        struct config_item *target_item;
        struct config_item_type *type;
@@ -128,9 +142,19 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
        if (dentry->d_parent == configfs_sb->s_root)
                goto out;
+        sd = dentry->d_parent->d_fsdata;
+        /*
+         * Fake invisibility if dir belongs to a group/default groups hierarchy
+         * being attached
+         */
+        ret = -ENOENT;
+        if (!configfs_dirent_is_ready(sd))
+                goto out;
        parent_item = configfs_get_config_item(dentry->d_parent);
        type = parent_item->ci_type;
+        ret = -EPERM;
        if (!type || !type->ct_item_ops ||
            !type->ct_item_ops->allow_link)
                goto out_put;
@@ -141,7 +165,9 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
        ret = type->ct_item_ops->allow_link(parent_item, target_item);
        if (!ret) {
+                mutex_lock(&configfs_symlink_mutex);
                ret = create_link(parent_item, target_item, dentry);
+                mutex_unlock(&configfs_symlink_mutex);
                if (ret && type->ct_item_ops->drop_link)
                        type->ct_item_ops->drop_link(parent_item,
                                                     target_item);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1db080135c6d..506c24fb5078 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1073,12 +1073,15 @@ static void ocfs2_write_failure(struct inode *inode,
        for(i = 0; i < wc->w_num_pages; i++) {
                tmppage = wc->w_pages[i];
-                if (ocfs2_should_order_data(inode))
+                if (page_has_buffers(tmppage)) {
-                        walk_page_buffers(wc->w_handle, page_buffers(tmppage),
+                        if (ocfs2_should_order_data(inode))
-                                          from, to, NULL,
+                                walk_page_buffers(wc->w_handle,
-                                          ocfs2_journal_dirty_data);
+                                                  page_buffers(tmppage),
+                                                  from, to, NULL,
-                block_commit_write(tmppage, from, to);
+                                                  ocfs2_journal_dirty_data);
+                        block_commit_write(tmppage, from, to);
+                }
        }
 }
@@ -1901,12 +1904,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
                        to = PAGE_CACHE_SIZE;
                }
-                if (ocfs2_should_order_data(inode))
+                if (page_has_buffers(tmppage)) {
-                        walk_page_buffers(wc->w_handle, page_buffers(tmppage),
+                        if (ocfs2_should_order_data(inode))
-                                          from, to, NULL,
+                                walk_page_buffers(wc->w_handle,
-                                          ocfs2_journal_dirty_data);
+                                                  page_buffers(tmppage),
+                                                  from, to, NULL,
-                block_commit_write(tmppage, from, to);
+                                                  ocfs2_journal_dirty_data);
+                        block_commit_write(tmppage, from, to);
+                }
        }
 out_write_size:
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index be2dd95d3a1d..ec2ed15c3daa 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1766,8 +1766,8 @@ out_inode_unlock:
 out_rw_unlock:
        ocfs2_rw_unlock(inode, 1);
-        mutex_unlock(&inode->i_mutex);
 out:
+        mutex_unlock(&inode->i_mutex);
        return ret;
 }
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a8c19cb3cfdd..7a37240f7a31 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -57,7 +57,7 @@ static int __ocfs2_recovery_thread(void *arg);
 static int ocfs2_commit_cache(struct ocfs2_super *osb);
 static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
-                                      int dirty);
+                                      int dirty, int replayed);
 static int ocfs2_trylock_journal(struct ocfs2_super *osb,
                                 int slot_num);
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
@@ -562,8 +562,18 @@ done:
        return status;
 }
+/*
+ * Add one to the recovery generation stored in the journal dinode @di.
+ * The field is kept little-endian in the dinode, hence le32_add_cpu().
+ * Only the in-memory copy of the dinode is modified here; this helper does
+ * not write the block out.
+ */
+static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di)
+{
+        le32_add_cpu(&(di->id1.journal1.ij_recovery_generation), 1);
+}
+/*
+ * Return the recovery generation recorded in the journal dinode @di,
+ * converted from its little-endian storage format to CPU byte order.
+ */
+static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di)
+{
+        return le32_to_cpu(di->id1.journal1.ij_recovery_generation);
+}
 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
-                                      int dirty)
+                                      int dirty, int replayed)
 {
        int status;
        unsigned int flags;
@@ -593,6 +603,9 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
                flags &= ~OCFS2_JOURNAL_DIRTY_FL;
        fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+        if (replayed)
+                ocfs2_bump_recovery_generation(fe);
        status = ocfs2_write_block(osb, bh, journal->j_inode);
        if (status < 0)
                mlog_errno(status);
@@ -667,7 +680,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
                 * Do not toggle if flush was unsuccessful otherwise
                 * will leave dirty metadata in a "clean" journal
                 */
-                status = ocfs2_journal_toggle_dirty(osb, 0);
+                status = ocfs2_journal_toggle_dirty(osb, 0, 0);
                if (status < 0)
                        mlog_errno(status);
        }
@@ -710,7 +723,7 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
        }
 }
-int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
+int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
 {
        int status = 0;
        struct ocfs2_super *osb;
@@ -729,7 +742,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
        ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
-        status = ocfs2_journal_toggle_dirty(osb, 1);
+        status = ocfs2_journal_toggle_dirty(osb, 1, replayed);
        if (status < 0) {
                mlog_errno(status);
                goto done;
@@ -771,7 +784,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
                goto bail;
        }
-        status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
+        status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0);
        if (status < 0)
                mlog_errno(status);
@@ -1034,6 +1047,12 @@ restart:
        spin_unlock(&osb->osb_lock);
        mlog(0, "All nodes recovered\n");
+        /* Refresh all journal recovery generations from disk */
+        status = ocfs2_check_journals_nolocks(osb);
+        status = (status == -EROFS) ? 0 : status;
+        if (status < 0)
+                mlog_errno(status);
        ocfs2_super_unlock(osb, 1);
        /* We always run recovery on our own orphan dir - the dead
@@ -1096,6 +1115,42 @@ out:
        mlog_exit_void();
 }
+/*
+ * Look up the journal system inode for slot @slot_num and read its dinode
+ * block into *@bh.
+ *
+ * @osb:       the mounted volume
+ * @slot_num:  slot whose journal inode is wanted; must be < osb->max_slots
+ * @bh:        passed through to ocfs2_read_block() to receive the block;
+ *             this function never releases it
+ * @ret_inode: optional; when non-NULL and the read succeeds, receives the
+ *             inode and the reference obtained by the lookup
+ *
+ * Returns 0 on success, -EACCES if the inode could not be obtained or is
+ * bad, or the negative status returned by ocfs2_read_block().
+ */
+static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
+                                    int slot_num,
+                                    struct buffer_head **bh,
+                                    struct inode **ret_inode)
+{
+        /* Error reported when the inode lookup below fails */
+        int status = -EACCES;
+        struct inode *inode = NULL;
+        BUG_ON(slot_num >= osb->max_slots);
+        inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+                                            slot_num);
+        if (!inode || is_bad_inode(inode)) {
+                mlog_errno(status);
+                goto bail;
+        }
+        SET_INODE_JOURNAL(inode);
+        status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode);
+        if (status < 0) {
+                mlog_errno(status);
+                goto bail;
+        }
+        status = 0;
+bail:
+        /*
+         * Hand the inode to the caller only on success and only if it was
+         * asked for; in every other case (including a bad inode) drop the
+         * reference here.
+         */
+        if (inode) {
+                if (status || !ret_inode)
+                        iput(inode);
+                else
+                        *ret_inode = inode;
+        }
+        return status;
+}
 /* Does the actual journal replay and marks the journal inode as
 * clean. Will only replay if the journal inode is marked dirty. */
 static int ocfs2_replay_journal(struct ocfs2_super *osb,
@@ -1109,22 +1164,36 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
        struct ocfs2_dinode *fe;
        journal_t *journal = NULL;
        struct buffer_head *bh = NULL;
+        u32 slot_reco_gen;
-        inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+        status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode);
-                                            slot_num);
+        if (status) {
-        if (inode == NULL) {
-                status = -EACCES;
                mlog_errno(status);
                goto done;
        }
-        if (is_bad_inode(inode)) {
-                status = -EACCES;
+        fe = (struct ocfs2_dinode *)bh->b_data;
-                iput(inode);
+        slot_reco_gen = ocfs2_get_recovery_generation(fe);
-                inode = NULL;
+        brelse(bh);
-                mlog_errno(status);
+        bh = NULL;
+        /*
+         * As the fs recovery is asynchronous, there is a small chance that
+         * another node mounted (and recovered) the slot before the recovery
+         * thread could get the lock. To handle that, we dirty read the journal
+         * inode for that slot to get the recovery generation. If it is
+         * different than what we expected, the slot has been recovered.
+         * If not, it needs recovery.
+         */
+        if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
+                mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num,
+                     osb->slot_recovery_generations[slot_num], slot_reco_gen);
+                osb->slot_recovery_generations[slot_num] = slot_reco_gen;
+                status = -EBUSY;
                goto done;
        }
-        SET_INODE_JOURNAL(inode);
+        /* Continue with recovery as the journal has not yet been recovered */
        status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
        if (status < 0) {
@@ -1138,9 +1207,12 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
        fe = (struct ocfs2_dinode *) bh->b_data;
        flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+        slot_reco_gen = ocfs2_get_recovery_generation(fe);
        if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
                mlog(0, "No recovery required for node %d\n", node_num);
+                /* Refresh recovery generation for the slot */
+                osb->slot_recovery_generations[slot_num] = slot_reco_gen;
                goto done;
        }
@@ -1188,6 +1260,11 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
        flags &= ~OCFS2_JOURNAL_DIRTY_FL;
        fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+        /* Increment recovery generation to indicate successful recovery */
+        ocfs2_bump_recovery_generation(fe);
+        osb->slot_recovery_generations[slot_num] =
+                                        ocfs2_get_recovery_generation(fe);
        status = ocfs2_write_block(osb, bh, inode);
        if (status < 0)
                mlog_errno(status);
@@ -1252,6 +1329,13 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
        status = ocfs2_replay_journal(osb, node_num, slot_num);
        if (status < 0) {
+                if (status == -EBUSY) {
+                        mlog(0, "Skipping recovery for slot %u (node %u) "
+                             "as another node has recovered it\n", slot_num,
+                             node_num);
+                        status = 0;
+                        goto done;
+                }
                mlog_errno(status);
                goto done;
        }
@@ -1334,12 +1418,29 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 {
        unsigned int node_num;
        int status, i;
+        struct buffer_head *bh = NULL;
+        struct ocfs2_dinode *di;
        /* This is called with the super block cluster lock, so we
         * know that the slot map can't change underneath us. */
        spin_lock(&osb->osb_lock);
        for (i = 0; i < osb->max_slots; i++) {
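+                /*
+                 * Read journal inode to get the recovery generation.
+                 * NOTE(review): this runs with osb->osb_lock (a spinlock)
+                 * held, while ocfs2_read_journal_inode() ends up in
+                 * ocfs2_read_block(), which presumably can sleep -- confirm,
+                 * and if so read the journal inodes outside the lock.
+                 */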
+                status = ocfs2_read_journal_inode(osb, i, &bh, NULL);
+                if (status) {
+                        mlog_errno(status);
+                        goto bail;
+                }
+                di = (struct ocfs2_dinode *)bh->b_data;
+                osb->slot_recovery_generations[i] =
+                                        ocfs2_get_recovery_generation(di);
+                brelse(bh);
+                bh = NULL;
+                mlog(0, "Slot %u recovery generation is %u\n", i,
+                     osb->slot_recovery_generations[i]);
                if (i == osb->slot_num)
                        continue;
@@ -1603,49 +1704,41 @@ static int ocfs2_commit_thread(void *arg)
        return 0;
 }
-/* Look for a dirty journal without taking any cluster locks. Used for
+/* Reads all the journal inodes without taking any cluster locks. Used
- * hard readonly access to determine whether the file system journals
+ * for hard readonly access to determine whether any journal requires
- * require recovery. */
+ * recovery. Also used to refresh the recovery generation numbers after
+ * a journal has been recovered by another node.
+ */
 int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
 {
        int ret = 0;
        unsigned int slot;
-        struct buffer_head *di_bh;
+        struct buffer_head *di_bh = NULL;
        struct ocfs2_dinode *di;
-        struct inode *journal = NULL;
+        int journal_dirty = 0;
        for(slot = 0; slot < osb->max_slots; slot++) {
-                journal = ocfs2_get_system_file_inode(osb,
+                ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL);
-                                                      JOURNAL_SYSTEM_INODE,
+                if (ret) {
-                                                      slot);
-                if (!journal || is_bad_inode(journal)) {
-                        ret = -EACCES;
-                        mlog_errno(ret);
-                        goto out;
-                }
-                di_bh = NULL;
-                ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
-                                       0, journal);
-                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
                }
                di = (struct ocfs2_dinode *) di_bh->b_data;
+                osb->slot_recovery_generations[slot] =
+                                        ocfs2_get_recovery_generation(di);
                if (le32_to_cpu(di->id1.journal1.ij_flags) &
                    OCFS2_JOURNAL_DIRTY_FL)
-                        ret = -EROFS;
+                        journal_dirty = 1;
                brelse(di_bh);
-                if (ret)
+                di_bh = NULL;
-                        break;
        }
 out:
-        if (journal)
+        if (journal_dirty)
-                iput(journal);
+                ret = -EROFS;
        return ret;
 }
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index db82be2532ed..2178ebffa05f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -161,7 +161,8 @@ int    ocfs2_journal_init(struct ocfs2_journal *journal,
 void   ocfs2_journal_shutdown(struct ocfs2_super *osb);
 int    ocfs2_journal_wipe(struct ocfs2_journal *journal,
                          int full);
-int    ocfs2_journal_load(struct ocfs2_journal *journal, int local);
+int    ocfs2_journal_load(struct ocfs2_journal *journal, int local,
+                          int replayed);
 int    ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
 void   ocfs2_recovery_thread(struct ocfs2_super *osb,
                             int node_num);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1cb814be8ef1..7f625f2b1117 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -204,6 +204,8 @@ struct ocfs2_super
        struct ocfs2_slot_info *slot_info;
+        u32 *slot_recovery_generations;
        spinlock_t node_map_lock;
        u64 root_blkno;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 3f1945177629..4f619850ccf7 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -660,7 +660,10 @@ struct ocfs2_dinode {
                struct {                /* Info for journal system
                                           inodes */
                        __le32 ij_flags;        /* Mounted, version, etc. */
-                        __le32 ij_pad;
+                        __le32 ij_recovery_generation; /* Incremented when the
+                                                          journal is recovered
+                                                          after an unclean
+                                                          shutdown */
                } journal1;
        } id1;                          /* Inode type dependant 1 */
 /*C0*/  union {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2560b33889aa..88255d3f52b4 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1442,6 +1442,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
        }
        mlog(0, "max_slots for this device: %u\n", osb->max_slots);
+        osb->slot_recovery_generations =
+                kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
+                        GFP_KERNEL);
+        if (!osb->slot_recovery_generations) {
+                status = -ENOMEM;
+                mlog_errno(status);
+                goto bail;
+        }
        init_waitqueue_head(&osb->osb_wipe_event);
        osb->osb_orphan_wipes = kcalloc(osb->max_slots,
                                        sizeof(*osb->osb_orphan_wipes),
@@ -1703,7 +1712,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
        local = ocfs2_mount_local(osb);
        /* will play back anything left in the journal. */
-        status = ocfs2_journal_load(osb->journal, local);
+        status = ocfs2_journal_load(osb->journal, local, dirty);
        if (status < 0) {
                mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);
                goto finally;
@@ -1768,6 +1777,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
        ocfs2_free_slot_info(osb);
        kfree(osb->osb_orphan_wipes);
+        kfree(osb->slot_recovery_generations);
        /* FIXME
         * This belongs in journal shutdown, but because we have to
         * allocate osb->journal at the start of ocfs2_initalize_osb(),
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index d62c19ff041c..7f627775c947 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -40,6 +40,7 @@
 #include <linux/list.h>
 #include <linux/kref.h>
 #include <linux/mutex.h>
+#include <linux/err.h>
 #include <asm/atomic.h>
@@ -129,8 +130,25 @@ struct configfs_attribute {
 /*
 * Users often need to create attribute structures for their configurable
 * attributes, containing a configfs_attribute member and function pointers
- * for the show() and store() operations on that attribute. They can use
+ * for the show() and store() operations on that attribute. If they don't
- * this macro (similar to sysfs' __ATTR) to make defining attributes easier.
+ * need anything else on the extended attribute structure, they can use
+ * this macro to define it.  The argument _item is the name of the
+ * config_item structure.
+ */
+#define CONFIGFS_ATTR_STRUCT(_item)                                     \
+struct _item##_attribute {                                              \
+        struct configfs_attribute attr;                                 \
+        ssize_t (*show)(struct _item *, char *);                        \
+        ssize_t (*store)(struct _item *, const char *, size_t);         \
+}
+/*
+ * With the extended attribute structure, users can use this macro
+ * (similar to sysfs' __ATTR) to make defining attributes easier.
+ * An example:
+ * #define MYITEM_ATTR(_name, _mode, _show, _store)     \
+ * struct myitem_attribute myitem_attr_##_name =        \
+ *         __CONFIGFS_ATTR(_name, _mode, _show, _store)
 */
 #define __CONFIGFS_ATTR(_name, _mode, _show, _store)                    \
 {                                                                       \
@@ -142,6 +160,52 @@ struct configfs_attribute {
        .show   = _show,                                                \
        .store  = _store,                                               \
 }
+/*
+ * Here is a readonly version, only requiring a show() operation.  The
+ * attribute name is the stringified _name, the mode is fixed at 0444, and
+ * the store member is not initialized by this macro.
+ */
+#define __CONFIGFS_ATTR_RO(_name, _show)                                \
+{                                                                       \
+        .attr   = {                                                     \
+                        .ca_name = __stringify(_name),                  \
+                        .ca_mode = 0444,                                \
+                        .ca_owner = THIS_MODULE,                        \
+        },                                                              \
+        .show   = _show,                                                \
+}
+/*
+ * With these extended attributes, the simple show_attribute() and
+ * store_attribute() operations need to call the show() and store() of the
+ * attributes.  This is a common pattern, so we provide a macro to define
+ * them.  The argument _item is the name of the config_item structure.
+ * This macro expects the attributes to be named "struct <name>_attribute"
+ * and the function to_<name>() to exist.
+ *
+ * Two static functions are defined: <name>_attr_show() and
+ * <name>_attr_store().  Each converts the config_item with to_<name>(),
+ * recovers the extended attribute with container_of(), and forwards to
+ * its show() or store() member.  When the member is NULL,
+ * <name>_attr_show() returns 0 and <name>_attr_store() returns -EINVAL.
+ */
+#define CONFIGFS_ATTR_OPS(_item)                                        \
+static ssize_t _item##_attr_show(struct config_item *item,              \
+                                 struct configfs_attribute *attr,       \
+                                 char *page)                            \
+{                                                                       \
+        struct _item *_item = to_##_item(item);                         \
+        struct _item##_attribute *_item##_attr =                        \
+                container_of(attr, struct _item##_attribute, attr);     \
+        ssize_t ret = 0;                                                \
+                                                                        \
+        if (_item##_attr->show)                                         \
+                ret = _item##_attr->show(_item, page);                  \
+        return ret;                                                     \
+}                                                                       \
+static ssize_t _item##_attr_store(struct config_item *item,             \
+                                  struct configfs_attribute *attr,      \
+                                  const char *page, size_t count)       \
+{                                                                       \
+        struct _item *_item = to_##_item(item);                         \
+        struct _item##_attribute *_item##_attr =                        \
+                container_of(attr, struct _item##_attribute, attr);     \
+        ssize_t ret = -EINVAL;                                          \
+                                                                        \
+        if (_item##_attr->store)                                        \
+                ret = _item##_attr->store(_item, page, count);          \
+        return ret;                                                     \
+}
 /*
 * If allow_link() exists, the item can symlink(2) out to other
author	Linus Torvalds <torvalds@linux-foundation.org>	2008-08-01 14:54:05 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-08-01 14:54:05 -0400
commit	63a16f90167850010864a9e8ebb71d216983090f (patch)
tree	c4b284cc596421ac8100e0ad3b2f56ead4563d2d
parent	5adf2b03d97111c8955495ba11e8b7db27df8695 (diff)
parent	c259ae52e204d42f8b2d484c85517a4c367030e1 (diff)