aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/obsolete/o2cb11
-rw-r--r--Documentation/ABI/stable/o2cb10
-rw-r--r--Documentation/ABI/testing/sysfs-ocfs289
-rw-r--r--Documentation/feature-removal-schedule.txt10
-rw-r--r--MAINTAINERS1
-rw-r--r--fs/Kconfig26
-rw-r--r--fs/ocfs2/Makefile14
-rw-r--r--fs/ocfs2/alloc.c465
-rw-r--r--fs/ocfs2/aops.c6
-rw-r--r--fs/ocfs2/cluster/Makefile2
-rw-r--r--fs/ocfs2/cluster/netdebug.c441
-rw-r--r--fs/ocfs2/cluster/nodemanager.c5
-rw-r--r--fs/ocfs2/cluster/sys.c9
-rw-r--r--fs/ocfs2/cluster/tcp.c164
-rw-r--r--fs/ocfs2/cluster/tcp.h32
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h26
-rw-r--r--fs/ocfs2/dlm/Makefile2
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h49
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c911
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h86
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c70
-rw-r--r--fs/ocfs2/dlm/dlmlock.c22
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c200
-rw-r--r--fs/ocfs2/dlmglue.c645
-rw-r--r--fs/ocfs2/dlmglue.h5
-rw-r--r--fs/ocfs2/file.c4
-rw-r--r--fs/ocfs2/heartbeat.c184
-rw-r--r--fs/ocfs2/heartbeat.h17
-rw-r--r--fs/ocfs2/ioctl.c13
-rw-r--r--fs/ocfs2/ioctl.h3
-rw-r--r--fs/ocfs2/journal.c211
-rw-r--r--fs/ocfs2/journal.h4
-rw-r--r--fs/ocfs2/localalloc.c4
-rw-r--r--fs/ocfs2/namei.c4
-rw-r--r--fs/ocfs2/ocfs2.h77
-rw-r--r--fs/ocfs2/ocfs2_fs.h79
-rw-r--r--fs/ocfs2/ocfs2_lockid.h2
-rw-r--r--fs/ocfs2/slot_map.c454
-rw-r--r--fs/ocfs2/slot_map.h32
-rw-r--r--fs/ocfs2/stack_o2cb.c420
-rw-r--r--fs/ocfs2/stack_user.c883
-rw-r--r--fs/ocfs2/stackglue.c568
-rw-r--r--fs/ocfs2/stackglue.h261
-rw-r--r--fs/ocfs2/suballoc.c103
-rw-r--r--fs/ocfs2/suballoc.h1
-rw-r--r--fs/ocfs2/super.c208
-rw-r--r--fs/sysfs/symlink.c9
47 files changed, 5800 insertions, 1042 deletions
diff --git a/Documentation/ABI/obsolete/o2cb b/Documentation/ABI/obsolete/o2cb
new file mode 100644
index 000000000000..9c49d8e6c0cc
--- /dev/null
+++ b/Documentation/ABI/obsolete/o2cb
@@ -0,0 +1,11 @@
1What: /sys/o2cb symlink
2Date: Dec 2005
3KernelVersion: 2.6.16
4Contact: ocfs2-devel@oss.oracle.com
5Description: This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will
6 be removed when new versions of ocfs2-tools which know to look
7 in /sys/fs/o2cb are sufficiently prevalent. Don't code new
8 software to look here; it should try /sys/fs/o2cb instead.
9 See Documentation/ABI/stable/o2cb for more information on usage.
10Users: ocfs2-tools. It's sufficient to mail proposed changes to
11 ocfs2-devel@oss.oracle.com.
diff --git a/Documentation/ABI/stable/o2cb b/Documentation/ABI/stable/o2cb
new file mode 100644
index 000000000000..5eb1545e0b8d
--- /dev/null
+++ b/Documentation/ABI/stable/o2cb
@@ -0,0 +1,10 @@
1What: /sys/fs/o2cb/ (was /sys/o2cb)
2Date: Dec 2005
3KernelVersion: 2.6.16
4Contact: ocfs2-devel@oss.oracle.com
5Description: Ocfs2-tools looks at 'interface-revision' for versioning
6 information. Each logmask/ file controls a set of debug prints
7 and can be written into with the strings "allow", "deny", or
8 "off". Reading the file returns the current state.
9Users: ocfs2-tools. It's sufficient to mail proposed changes to
10 ocfs2-devel@oss.oracle.com.
diff --git a/Documentation/ABI/testing/sysfs-ocfs2 b/Documentation/ABI/testing/sysfs-ocfs2
new file mode 100644
index 000000000000..b7cc516a8a8a
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-ocfs2
@@ -0,0 +1,89 @@
1What: /sys/fs/ocfs2/
2Date: April 2008
3Contact: ocfs2-devel@oss.oracle.com
4Description:
5 The /sys/fs/ocfs2 directory contains knobs used by the
6 ocfs2-tools to interact with the filesystem.
7
8What: /sys/fs/ocfs2/max_locking_protocol
9Date: April 2008
10Contact: ocfs2-devel@oss.oracle.com
11Description:
12 The /sys/fs/ocfs2/max_locking_protocol file displays the version
13 of the ocfs2 locking protocol supported by the filesystem. This version
14 covers how ocfs2 uses distributed locking between cluster
15 nodes.
16
17 The protocol version has a major and minor number. Two
18 cluster nodes can interoperate if they have an identical
19 major number and an overlapping minor number - thus,
20 a node with version 1.10 can interoperate with a node
21 sporting version 1.8, as long as both use the 1.8 protocol.
22
23 Reading from this file returns a single line, the major
24 number and minor number joined by a period, e.g. "1.10".
25
26 This file is read-only. The value is compiled into the
27 driver.
28
29What: /sys/fs/ocfs2/loaded_cluster_plugins
30Date: April 2008
31Contact: ocfs2-devel@oss.oracle.com
32Description:
33 The /sys/fs/ocfs2/loaded_cluster_plugins file describes
34 the available plugins to support ocfs2 cluster operation.
35 A cluster plugin is required to use ocfs2 in a cluster.
36 There are currently two available plugins:
37
38 * 'o2cb' - The classic o2cb cluster stack that ocfs2 has
39 used since its inception.
40 * 'user' - A plugin supporting userspace cluster software
41 in conjunction with fs/dlm.
42
43 Reading from this file returns the names of all loaded
44 plugins, one per line.
45
46 This file is read-only. Its contents may change as
47 plugins are loaded or removed.
48
49What: /sys/fs/ocfs2/active_cluster_plugin
50Date: April 2008
51Contact: ocfs2-devel@oss.oracle.com
52Description:
53 The /sys/fs/ocfs2/active_cluster_plugin file displays which
54 cluster plugin is currently in use by the filesystem.
55 The active plugin will appear in the loaded_cluster_plugins
56 file as well. Only one plugin can be used at a time.
57
58 Reading from this file returns the name of the active plugin
59 on a single line.
60
61 This file is read-only. Which plugin is active depends on
62 the cluster stack in use. The contents may change
63 when all filesystems are unmounted and the cluster stack
64 is changed.
65
66What: /sys/fs/ocfs2/cluster_stack
67Date: April 2008
68Contact: ocfs2-devel@oss.oracle.com
69Description:
70 The /sys/fs/ocfs2/cluster_stack file contains the name
71 of the current ocfs2 cluster stack. This value is set by
72 userspace tools when bringing the cluster stack online.
73
74 Cluster stack names are 4 characters in length.
75
76 When the 'o2cb' cluster stack is used, the 'o2cb' cluster
77 plugin is active. All other cluster stacks use the 'user'
78 cluster plugin.
79
80 Reading from this file returns the name of the current
81 cluster stack on a single line.
82
83 Writing a new stack name to this file changes the current
84 cluster stack unless there are mounted ocfs2 filesystems.
85 If there are mounted filesystems, attempts to change the
86 stack return an error.
87
88Users:
89 ocfs2-tools <ocfs2-tools-devel@oss.oracle.com>
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 164c89394cff..4b70622a8a91 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -318,3 +318,13 @@ Why: Not used in-tree. The current out-of-tree users used it to
318 code / infrastructure should be in the kernel and not in some 318 code / infrastructure should be in the kernel and not in some
319 out-of-tree driver. 319 out-of-tree driver.
320Who: Thomas Gleixner <tglx@linutronix.de> 320Who: Thomas Gleixner <tglx@linutronix.de>
321
322---------------------------
323
324What: /sys/o2cb symlink
325When: January 2010
326Why: /sys/fs/o2cb is the proper location for this information - /sys/o2cb
327 exists as a symlink for backwards compatibility for old versions of
328 ocfs2-tools. 2 years should be sufficient time to phase in new versions
329 which know to look in /sys/fs/o2cb.
330Who: ocfs2-devel@oss.oracle.com
diff --git a/MAINTAINERS b/MAINTAINERS
index 3eceebb48c92..974ee8ddb12c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2952,6 +2952,7 @@ P: Joel Becker
2952M: joel.becker@oracle.com 2952M: joel.becker@oracle.com
2953L: ocfs2-devel@oss.oracle.com 2953L: ocfs2-devel@oss.oracle.com
2954W: http://oss.oracle.com/projects/ocfs2/ 2954W: http://oss.oracle.com/projects/ocfs2/
2955T: git git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2.git
2955S: Supported 2956S: Supported
2956 2957
2957OMNIKEY CARDMAN 4000 DRIVER 2958OMNIKEY CARDMAN 4000 DRIVER
diff --git a/fs/Kconfig b/fs/Kconfig
index c509123bea49..028ae38ecc52 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -444,6 +444,32 @@ config OCFS2_FS
444 For more information on OCFS2, see the file 444 For more information on OCFS2, see the file
445 <file:Documentation/filesystems/ocfs2.txt>. 445 <file:Documentation/filesystems/ocfs2.txt>.
446 446
447config OCFS2_FS_O2CB
448 tristate "O2CB Kernelspace Clustering"
449 depends on OCFS2_FS
450 default y
451 help
452 OCFS2 includes a simple kernelspace clustering package, the OCFS2
453 Cluster Base. It only requires a very small userspace component
454 to configure it. This comes with the standard ocfs2-tools package.
455 O2CB is limited to maintaining a cluster for OCFS2 file systems.
456 It cannot manage any other cluster applications.
457
458 It is always safe to say Y here, as the clustering method is
459 run-time selectable.
460
461config OCFS2_FS_USERSPACE_CLUSTER
462 tristate "OCFS2 Userspace Clustering"
463 depends on OCFS2_FS && DLM
464 default y
465 help
466 This option will allow OCFS2 to use userspace clustering services
467 in conjunction with the DLM in fs/dlm. If you are using a
468 userspace cluster manager, say Y here.
469
470 It is safe to say Y, as the clustering method is run-time
471 selectable.
472
447config OCFS2_DEBUG_MASKLOG 473config OCFS2_DEBUG_MASKLOG
448 bool "OCFS2 logging support" 474 bool "OCFS2 logging support"
449 depends on OCFS2_FS 475 depends on OCFS2_FS
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 4d4ce48bb42c..f6956de56fdb 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -2,7 +2,12 @@ EXTRA_CFLAGS += -Ifs/ocfs2
2 2
3EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES 3EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
4 4
5obj-$(CONFIG_OCFS2_FS) += ocfs2.o 5obj-$(CONFIG_OCFS2_FS) += \
6 ocfs2.o \
7 ocfs2_stackglue.o
8
9obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o
10obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
6 11
7ocfs2-objs := \ 12ocfs2-objs := \
8 alloc.o \ 13 alloc.o \
@@ -31,5 +36,10 @@ ocfs2-objs := \
31 uptodate.o \ 36 uptodate.o \
32 ver.o 37 ver.o
33 38
39ocfs2_stackglue-objs := stackglue.o
40ocfs2_stack_o2cb-objs := stack_o2cb.o
41ocfs2_stack_user-objs := stack_user.o
42
43# cluster/ is always needed when OCFS2_FS is enabled, for masklog support
34obj-$(CONFIG_OCFS2_FS) += cluster/ 44obj-$(CONFIG_OCFS2_FS) += cluster/
35obj-$(CONFIG_OCFS2_FS) += dlm/ 45obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 447206eb5c2e..41f84c92094f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1029,8 +1029,7 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1029 BUG_ON(!next_free); 1029 BUG_ON(!next_free);
1030 1030
1031 /* The tree code before us didn't allow enough room in the leaf. */ 1031 /* The tree code before us didn't allow enough room in the leaf. */
1032 if (el->l_next_free_rec == el->l_count && !has_empty) 1032 BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1033 BUG();
1034 1033
1035 /* 1034 /*
1036 * The easiest way to approach this is to just remove the 1035 * The easiest way to approach this is to just remove the
@@ -1450,6 +1449,8 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1450 * - When our insert into the right path leaf is at the leftmost edge 1449 * - When our insert into the right path leaf is at the leftmost edge
1451 * and requires an update of the path immediately to it's left. This 1450 * and requires an update of the path immediately to it's left. This
1452 * can occur at the end of some types of rotation and appending inserts. 1451 * can occur at the end of some types of rotation and appending inserts.
1452 * - When we've adjusted the last extent record in the left path leaf and the
1453 * 1st extent record in the right path leaf during cross extent block merge.
1453 */ 1454 */
1454static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, 1455static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1455 struct ocfs2_path *left_path, 1456 struct ocfs2_path *left_path,
@@ -2712,24 +2713,147 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2712 } 2713 }
2713} 2714}
2714 2715
2716static int ocfs2_get_right_path(struct inode *inode,
2717 struct ocfs2_path *left_path,
2718 struct ocfs2_path **ret_right_path)
2719{
2720 int ret;
2721 u32 right_cpos;
2722 struct ocfs2_path *right_path = NULL;
2723 struct ocfs2_extent_list *left_el;
2724
2725 *ret_right_path = NULL;
2726
2727 /* This function shouldn't be called for non-trees. */
2728 BUG_ON(left_path->p_tree_depth == 0);
2729
2730 left_el = path_leaf_el(left_path);
2731 BUG_ON(left_el->l_next_free_rec != left_el->l_count);
2732
2733 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2734 &right_cpos);
2735 if (ret) {
2736 mlog_errno(ret);
2737 goto out;
2738 }
2739
2740 /* This function shouldn't be called for the rightmost leaf. */
2741 BUG_ON(right_cpos == 0);
2742
2743 right_path = ocfs2_new_path(path_root_bh(left_path),
2744 path_root_el(left_path));
2745 if (!right_path) {
2746 ret = -ENOMEM;
2747 mlog_errno(ret);
2748 goto out;
2749 }
2750
2751 ret = ocfs2_find_path(inode, right_path, right_cpos);
2752 if (ret) {
2753 mlog_errno(ret);
2754 goto out;
2755 }
2756
2757 *ret_right_path = right_path;
2758out:
2759 if (ret)
2760 ocfs2_free_path(right_path);
2761 return ret;
2762}
2763
2715/* 2764/*
2716 * Remove split_rec clusters from the record at index and merge them 2765 * Remove split_rec clusters from the record at index and merge them
2717 * onto the beginning of the record at index + 1. 2766 * onto the beginning of the record "next" to it.
2767 * For index < l_count - 1, the "next" means the extent rec at index + 1.
2768 * For index == l_count - 1, the "next" means the 1st extent rec of the
2769 * next extent block.
2718 */ 2770 */
2719static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh, 2771static int ocfs2_merge_rec_right(struct inode *inode,
2720 handle_t *handle, 2772 struct ocfs2_path *left_path,
2721 struct ocfs2_extent_rec *split_rec, 2773 handle_t *handle,
2722 struct ocfs2_extent_list *el, int index) 2774 struct ocfs2_extent_rec *split_rec,
2775 int index)
2723{ 2776{
2724 int ret; 2777 int ret, next_free, i;
2725 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); 2778 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2726 struct ocfs2_extent_rec *left_rec; 2779 struct ocfs2_extent_rec *left_rec;
2727 struct ocfs2_extent_rec *right_rec; 2780 struct ocfs2_extent_rec *right_rec;
2781 struct ocfs2_extent_list *right_el;
2782 struct ocfs2_path *right_path = NULL;
2783 int subtree_index = 0;
2784 struct ocfs2_extent_list *el = path_leaf_el(left_path);
2785 struct buffer_head *bh = path_leaf_bh(left_path);
2786 struct buffer_head *root_bh = NULL;
2728 2787
2729 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); 2788 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
2730
2731 left_rec = &el->l_recs[index]; 2789 left_rec = &el->l_recs[index];
2732 right_rec = &el->l_recs[index + 1]; 2790
2791 if (index == le16_to_cpu(el->l_next_free_rec - 1) &&
2792 le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
2793 /* This is a cross extent block merge. */
2794 ret = ocfs2_get_right_path(inode, left_path, &right_path);
2795 if (ret) {
2796 mlog_errno(ret);
2797 goto out;
2798 }
2799
2800 right_el = path_leaf_el(right_path);
2801 next_free = le16_to_cpu(right_el->l_next_free_rec);
2802 BUG_ON(next_free <= 0);
2803 right_rec = &right_el->l_recs[0];
2804 if (ocfs2_is_empty_extent(right_rec)) {
2805 BUG_ON(le16_to_cpu(next_free) <= 1);
2806 right_rec = &right_el->l_recs[1];
2807 }
2808
2809 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
2810 le16_to_cpu(left_rec->e_leaf_clusters) !=
2811 le32_to_cpu(right_rec->e_cpos));
2812
2813 subtree_index = ocfs2_find_subtree_root(inode,
2814 left_path, right_path);
2815
2816 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
2817 handle->h_buffer_credits,
2818 right_path);
2819 if (ret) {
2820 mlog_errno(ret);
2821 goto out;
2822 }
2823
2824 root_bh = left_path->p_node[subtree_index].bh;
2825 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2826
2827 ret = ocfs2_journal_access(handle, inode, root_bh,
2828 OCFS2_JOURNAL_ACCESS_WRITE);
2829 if (ret) {
2830 mlog_errno(ret);
2831 goto out;
2832 }
2833
2834 for (i = subtree_index + 1;
2835 i < path_num_items(right_path); i++) {
2836 ret = ocfs2_journal_access(handle, inode,
2837 right_path->p_node[i].bh,
2838 OCFS2_JOURNAL_ACCESS_WRITE);
2839 if (ret) {
2840 mlog_errno(ret);
2841 goto out;
2842 }
2843
2844 ret = ocfs2_journal_access(handle, inode,
2845 left_path->p_node[i].bh,
2846 OCFS2_JOURNAL_ACCESS_WRITE);
2847 if (ret) {
2848 mlog_errno(ret);
2849 goto out;
2850 }
2851 }
2852
2853 } else {
2854 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
2855 right_rec = &el->l_recs[index + 1];
2856 }
2733 2857
2734 ret = ocfs2_journal_access(handle, inode, bh, 2858 ret = ocfs2_journal_access(handle, inode, bh,
2735 OCFS2_JOURNAL_ACCESS_WRITE); 2859 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2751,30 +2875,156 @@ static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
2751 if (ret) 2875 if (ret)
2752 mlog_errno(ret); 2876 mlog_errno(ret);
2753 2877
2878 if (right_path) {
2879 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2880 if (ret)
2881 mlog_errno(ret);
2882
2883 ocfs2_complete_edge_insert(inode, handle, left_path,
2884 right_path, subtree_index);
2885 }
2886out:
2887 if (right_path)
2888 ocfs2_free_path(right_path);
2889 return ret;
2890}
2891
2892static int ocfs2_get_left_path(struct inode *inode,
2893 struct ocfs2_path *right_path,
2894 struct ocfs2_path **ret_left_path)
2895{
2896 int ret;
2897 u32 left_cpos;
2898 struct ocfs2_path *left_path = NULL;
2899
2900 *ret_left_path = NULL;
2901
2902 /* This function shouldn't be called for non-trees. */
2903 BUG_ON(right_path->p_tree_depth == 0);
2904
2905 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
2906 right_path, &left_cpos);
2907 if (ret) {
2908 mlog_errno(ret);
2909 goto out;
2910 }
2911
2912 /* This function shouldn't be called for the leftmost leaf. */
2913 BUG_ON(left_cpos == 0);
2914
2915 left_path = ocfs2_new_path(path_root_bh(right_path),
2916 path_root_el(right_path));
2917 if (!left_path) {
2918 ret = -ENOMEM;
2919 mlog_errno(ret);
2920 goto out;
2921 }
2922
2923 ret = ocfs2_find_path(inode, left_path, left_cpos);
2924 if (ret) {
2925 mlog_errno(ret);
2926 goto out;
2927 }
2928
2929 *ret_left_path = left_path;
2754out: 2930out:
2931 if (ret)
2932 ocfs2_free_path(left_path);
2755 return ret; 2933 return ret;
2756} 2934}
2757 2935
2758/* 2936/*
2759 * Remove split_rec clusters from the record at index and merge them 2937 * Remove split_rec clusters from the record at index and merge them
2760 * onto the tail of the record at index - 1. 2938 * onto the tail of the record "before" it.
2939 * For index > 0, the "before" means the extent rec at index - 1.
2940 *
2941 * For index == 0, the "before" means the last record of the previous
2942 * extent block. In this case we may also need to
2943 * remove the rightmost leaf extent block in the right_path and change
2944 * the right path to indicate the new rightmost path.
2761 */ 2945 */
2762static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, 2946static int ocfs2_merge_rec_left(struct inode *inode,
2947 struct ocfs2_path *right_path,
2763 handle_t *handle, 2948 handle_t *handle,
2764 struct ocfs2_extent_rec *split_rec, 2949 struct ocfs2_extent_rec *split_rec,
2765 struct ocfs2_extent_list *el, int index) 2950 struct ocfs2_cached_dealloc_ctxt *dealloc,
2951 int index)
2766{ 2952{
2767 int ret, has_empty_extent = 0; 2953 int ret, i, subtree_index = 0, has_empty_extent = 0;
2768 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); 2954 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2769 struct ocfs2_extent_rec *left_rec; 2955 struct ocfs2_extent_rec *left_rec;
2770 struct ocfs2_extent_rec *right_rec; 2956 struct ocfs2_extent_rec *right_rec;
2957 struct ocfs2_extent_list *el = path_leaf_el(right_path);
2958 struct buffer_head *bh = path_leaf_bh(right_path);
2959 struct buffer_head *root_bh = NULL;
2960 struct ocfs2_path *left_path = NULL;
2961 struct ocfs2_extent_list *left_el;
2771 2962
2772 BUG_ON(index <= 0); 2963 BUG_ON(index < 0);
2773 2964
2774 left_rec = &el->l_recs[index - 1];
2775 right_rec = &el->l_recs[index]; 2965 right_rec = &el->l_recs[index];
2776 if (ocfs2_is_empty_extent(&el->l_recs[0])) 2966 if (index == 0) {
2777 has_empty_extent = 1; 2967 /* we meet with a cross extent block merge. */
2968 ret = ocfs2_get_left_path(inode, right_path, &left_path);
2969 if (ret) {
2970 mlog_errno(ret);
2971 goto out;
2972 }
2973
2974 left_el = path_leaf_el(left_path);
2975 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
2976 le16_to_cpu(left_el->l_count));
2977
2978 left_rec = &left_el->l_recs[
2979 le16_to_cpu(left_el->l_next_free_rec) - 1];
2980 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
2981 le16_to_cpu(left_rec->e_leaf_clusters) !=
2982 le32_to_cpu(split_rec->e_cpos));
2983
2984 subtree_index = ocfs2_find_subtree_root(inode,
2985 left_path, right_path);
2986
2987 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
2988 handle->h_buffer_credits,
2989 left_path);
2990 if (ret) {
2991 mlog_errno(ret);
2992 goto out;
2993 }
2994
2995 root_bh = left_path->p_node[subtree_index].bh;
2996 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2997
2998 ret = ocfs2_journal_access(handle, inode, root_bh,
2999 OCFS2_JOURNAL_ACCESS_WRITE);
3000 if (ret) {
3001 mlog_errno(ret);
3002 goto out;
3003 }
3004
3005 for (i = subtree_index + 1;
3006 i < path_num_items(right_path); i++) {
3007 ret = ocfs2_journal_access(handle, inode,
3008 right_path->p_node[i].bh,
3009 OCFS2_JOURNAL_ACCESS_WRITE);
3010 if (ret) {
3011 mlog_errno(ret);
3012 goto out;
3013 }
3014
3015 ret = ocfs2_journal_access(handle, inode,
3016 left_path->p_node[i].bh,
3017 OCFS2_JOURNAL_ACCESS_WRITE);
3018 if (ret) {
3019 mlog_errno(ret);
3020 goto out;
3021 }
3022 }
3023 } else {
3024 left_rec = &el->l_recs[index - 1];
3025 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3026 has_empty_extent = 1;
3027 }
2778 3028
2779 ret = ocfs2_journal_access(handle, inode, bh, 3029 ret = ocfs2_journal_access(handle, inode, bh,
2780 OCFS2_JOURNAL_ACCESS_WRITE); 3030 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2790,9 +3040,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
2790 *left_rec = *split_rec; 3040 *left_rec = *split_rec;
2791 3041
2792 has_empty_extent = 0; 3042 has_empty_extent = 0;
2793 } else { 3043 } else
2794 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); 3044 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
2795 }
2796 3045
2797 le32_add_cpu(&right_rec->e_cpos, split_clusters); 3046 le32_add_cpu(&right_rec->e_cpos, split_clusters);
2798 le64_add_cpu(&right_rec->e_blkno, 3047 le64_add_cpu(&right_rec->e_blkno,
@@ -2805,13 +3054,44 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
2805 if (ret) 3054 if (ret)
2806 mlog_errno(ret); 3055 mlog_errno(ret);
2807 3056
3057 if (left_path) {
3058 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3059 if (ret)
3060 mlog_errno(ret);
3061
3062 /*
3063 * In the situation that the right_rec is empty and the extent
3064 * block is empty also, ocfs2_complete_edge_insert can't handle
3065 * it and we need to delete the right extent block.
3066 */
3067 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3068 le16_to_cpu(el->l_next_free_rec) == 1) {
3069
3070 ret = ocfs2_remove_rightmost_path(inode, handle,
3071 right_path, dealloc);
3072 if (ret) {
3073 mlog_errno(ret);
3074 goto out;
3075 }
3076
3077 /* Now the rightmost extent block has been deleted.
3078 * So we use the new rightmost path.
3079 */
3080 ocfs2_mv_path(right_path, left_path);
3081 left_path = NULL;
3082 } else
3083 ocfs2_complete_edge_insert(inode, handle, left_path,
3084 right_path, subtree_index);
3085 }
2808out: 3086out:
3087 if (left_path)
3088 ocfs2_free_path(left_path);
2809 return ret; 3089 return ret;
2810} 3090}
2811 3091
2812static int ocfs2_try_to_merge_extent(struct inode *inode, 3092static int ocfs2_try_to_merge_extent(struct inode *inode,
2813 handle_t *handle, 3093 handle_t *handle,
2814 struct ocfs2_path *left_path, 3094 struct ocfs2_path *path,
2815 int split_index, 3095 int split_index,
2816 struct ocfs2_extent_rec *split_rec, 3096 struct ocfs2_extent_rec *split_rec,
2817 struct ocfs2_cached_dealloc_ctxt *dealloc, 3097 struct ocfs2_cached_dealloc_ctxt *dealloc,
@@ -2819,7 +3099,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2819 3099
2820{ 3100{
2821 int ret = 0; 3101 int ret = 0;
2822 struct ocfs2_extent_list *el = path_leaf_el(left_path); 3102 struct ocfs2_extent_list *el = path_leaf_el(path);
2823 struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; 3103 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
2824 3104
2825 BUG_ON(ctxt->c_contig_type == CONTIG_NONE); 3105 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
@@ -2832,7 +3112,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2832 * extents - having more than one in a leaf is 3112 * extents - having more than one in a leaf is
2833 * illegal. 3113 * illegal.
2834 */ 3114 */
2835 ret = ocfs2_rotate_tree_left(inode, handle, left_path, 3115 ret = ocfs2_rotate_tree_left(inode, handle, path,
2836 dealloc); 3116 dealloc);
2837 if (ret) { 3117 if (ret) {
2838 mlog_errno(ret); 3118 mlog_errno(ret);
@@ -2847,7 +3127,6 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2847 * Left-right contig implies this. 3127 * Left-right contig implies this.
2848 */ 3128 */
2849 BUG_ON(!ctxt->c_split_covers_rec); 3129 BUG_ON(!ctxt->c_split_covers_rec);
2850 BUG_ON(split_index == 0);
2851 3130
2852 /* 3131 /*
2853 * Since the leftright insert always covers the entire 3132 * Since the leftright insert always covers the entire
@@ -2858,9 +3137,14 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2858 * Since the adding of an empty extent shifts 3137 * Since the adding of an empty extent shifts
2859 * everything back to the right, there's no need to 3138 * everything back to the right, there's no need to
2860 * update split_index here. 3139 * update split_index here.
3140 *
3141 * When the split_index is zero, we need to merge it to the
3142 * prevoius extent block. It is more efficient and easier
3143 * if we do merge_right first and merge_left later.
2861 */ 3144 */
2862 ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path), 3145 ret = ocfs2_merge_rec_right(inode, path,
2863 handle, split_rec, el, split_index); 3146 handle, split_rec,
3147 split_index);
2864 if (ret) { 3148 if (ret) {
2865 mlog_errno(ret); 3149 mlog_errno(ret);
2866 goto out; 3150 goto out;
@@ -2871,32 +3155,30 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2871 */ 3155 */
2872 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3156 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2873 3157
2874 /* 3158 /* The merge left us with an empty extent, remove it. */
2875 * The left merge left us with an empty extent, remove 3159 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
2876 * it.
2877 */
2878 ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
2879 if (ret) { 3160 if (ret) {
2880 mlog_errno(ret); 3161 mlog_errno(ret);
2881 goto out; 3162 goto out;
2882 } 3163 }
2883 split_index--; 3164
2884 rec = &el->l_recs[split_index]; 3165 rec = &el->l_recs[split_index];
2885 3166
2886 /* 3167 /*
2887 * Note that we don't pass split_rec here on purpose - 3168 * Note that we don't pass split_rec here on purpose -
2888 * we've merged it into the left side. 3169 * we've merged it into the rec already.
2889 */ 3170 */
2890 ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path), 3171 ret = ocfs2_merge_rec_left(inode, path,
2891 handle, rec, el, split_index); 3172 handle, rec,
3173 dealloc,
3174 split_index);
3175
2892 if (ret) { 3176 if (ret) {
2893 mlog_errno(ret); 3177 mlog_errno(ret);
2894 goto out; 3178 goto out;
2895 } 3179 }
2896 3180
2897 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3181 ret = ocfs2_rotate_tree_left(inode, handle, path,
2898
2899 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2900 dealloc); 3182 dealloc);
2901 /* 3183 /*
2902 * Error from this last rotate is not critical, so 3184 * Error from this last rotate is not critical, so
@@ -2915,8 +3197,9 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2915 */ 3197 */
2916 if (ctxt->c_contig_type == CONTIG_RIGHT) { 3198 if (ctxt->c_contig_type == CONTIG_RIGHT) {
2917 ret = ocfs2_merge_rec_left(inode, 3199 ret = ocfs2_merge_rec_left(inode,
2918 path_leaf_bh(left_path), 3200 path,
2919 handle, split_rec, el, 3201 handle, split_rec,
3202 dealloc,
2920 split_index); 3203 split_index);
2921 if (ret) { 3204 if (ret) {
2922 mlog_errno(ret); 3205 mlog_errno(ret);
@@ -2924,8 +3207,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2924 } 3207 }
2925 } else { 3208 } else {
2926 ret = ocfs2_merge_rec_right(inode, 3209 ret = ocfs2_merge_rec_right(inode,
2927 path_leaf_bh(left_path), 3210 path,
2928 handle, split_rec, el, 3211 handle, split_rec,
2929 split_index); 3212 split_index);
2930 if (ret) { 3213 if (ret) {
2931 mlog_errno(ret); 3214 mlog_errno(ret);
@@ -2938,7 +3221,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2938 * The merge may have left an empty extent in 3221 * The merge may have left an empty extent in
2939 * our leaf. Try to rotate it away. 3222 * our leaf. Try to rotate it away.
2940 */ 3223 */
2941 ret = ocfs2_rotate_tree_left(inode, handle, left_path, 3224 ret = ocfs2_rotate_tree_left(inode, handle, path,
2942 dealloc); 3225 dealloc);
2943 if (ret) 3226 if (ret)
2944 mlog_errno(ret); 3227 mlog_errno(ret);
@@ -3498,20 +3781,57 @@ out:
3498} 3781}
3499 3782
3500static enum ocfs2_contig_type 3783static enum ocfs2_contig_type
3501ocfs2_figure_merge_contig_type(struct inode *inode, 3784ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
3502 struct ocfs2_extent_list *el, int index, 3785 struct ocfs2_extent_list *el, int index,
3503 struct ocfs2_extent_rec *split_rec) 3786 struct ocfs2_extent_rec *split_rec)
3504{ 3787{
3505 struct ocfs2_extent_rec *rec; 3788 int status;
3506 enum ocfs2_contig_type ret = CONTIG_NONE; 3789 enum ocfs2_contig_type ret = CONTIG_NONE;
3790 u32 left_cpos, right_cpos;
3791 struct ocfs2_extent_rec *rec = NULL;
3792 struct ocfs2_extent_list *new_el;
3793 struct ocfs2_path *left_path = NULL, *right_path = NULL;
3794 struct buffer_head *bh;
3795 struct ocfs2_extent_block *eb;
3796
3797 if (index > 0) {
3798 rec = &el->l_recs[index - 1];
3799 } else if (path->p_tree_depth > 0) {
3800 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3801 path, &left_cpos);
3802 if (status)
3803 goto out;
3804
3805 if (left_cpos != 0) {
3806 left_path = ocfs2_new_path(path_root_bh(path),
3807 path_root_el(path));
3808 if (!left_path)
3809 goto out;
3810
3811 status = ocfs2_find_path(inode, left_path, left_cpos);
3812 if (status)
3813 goto out;
3814
3815 new_el = path_leaf_el(left_path);
3816
3817 if (le16_to_cpu(new_el->l_next_free_rec) !=
3818 le16_to_cpu(new_el->l_count)) {
3819 bh = path_leaf_bh(left_path);
3820 eb = (struct ocfs2_extent_block *)bh->b_data;
3821 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
3822 eb);
3823 goto out;
3824 }
3825 rec = &new_el->l_recs[
3826 le16_to_cpu(new_el->l_next_free_rec) - 1];
3827 }
3828 }
3507 3829
3508 /* 3830 /*
3509 * We're careful to check for an empty extent record here - 3831 * We're careful to check for an empty extent record here -
3510 * the merge code will know what to do if it sees one. 3832 * the merge code will know what to do if it sees one.
3511 */ 3833 */
3512 3834 if (rec) {
3513 if (index > 0) {
3514 rec = &el->l_recs[index - 1];
3515 if (index == 1 && ocfs2_is_empty_extent(rec)) { 3835 if (index == 1 && ocfs2_is_empty_extent(rec)) {
3516 if (split_rec->e_cpos == el->l_recs[index].e_cpos) 3836 if (split_rec->e_cpos == el->l_recs[index].e_cpos)
3517 ret = CONTIG_RIGHT; 3837 ret = CONTIG_RIGHT;
@@ -3520,10 +3840,45 @@ ocfs2_figure_merge_contig_type(struct inode *inode,
3520 } 3840 }
3521 } 3841 }
3522 3842
3523 if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) { 3843 rec = NULL;
3844 if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
3845 rec = &el->l_recs[index + 1];
3846 else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
3847 path->p_tree_depth > 0) {
3848 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
3849 path, &right_cpos);
3850 if (status)
3851 goto out;
3852
3853 if (right_cpos == 0)
3854 goto out;
3855
3856 right_path = ocfs2_new_path(path_root_bh(path),
3857 path_root_el(path));
3858 if (!right_path)
3859 goto out;
3860
3861 status = ocfs2_find_path(inode, right_path, right_cpos);
3862 if (status)
3863 goto out;
3864
3865 new_el = path_leaf_el(right_path);
3866 rec = &new_el->l_recs[0];
3867 if (ocfs2_is_empty_extent(rec)) {
3868 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
3869 bh = path_leaf_bh(right_path);
3870 eb = (struct ocfs2_extent_block *)bh->b_data;
3871 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
3872 eb);
3873 goto out;
3874 }
3875 rec = &new_el->l_recs[1];
3876 }
3877 }
3878
3879 if (rec) {
3524 enum ocfs2_contig_type contig_type; 3880 enum ocfs2_contig_type contig_type;
3525 3881
3526 rec = &el->l_recs[index + 1];
3527 contig_type = ocfs2_extent_contig(inode, rec, split_rec); 3882 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
3528 3883
3529 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) 3884 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
@@ -3532,6 +3887,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode,
3532 ret = contig_type; 3887 ret = contig_type;
3533 } 3888 }
3534 3889
3890out:
3891 if (left_path)
3892 ocfs2_free_path(left_path);
3893 if (right_path)
3894 ocfs2_free_path(right_path);
3895
3535 return ret; 3896 return ret;
3536} 3897}
3537 3898
@@ -3994,7 +4355,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
3994 goto out; 4355 goto out;
3995 } 4356 }
3996 4357
3997 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el, 4358 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
3998 split_index, 4359 split_index,
3999 split_rec); 4360 split_rec);
4000 4361
@@ -4788,6 +5149,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
4788 status = ocfs2_flush_truncate_log(osb); 5149 status = ocfs2_flush_truncate_log(osb);
4789 if (status < 0) 5150 if (status < 0)
4790 mlog_errno(status); 5151 mlog_errno(status);
5152 else
5153 ocfs2_init_inode_steal_slot(osb);
4791 5154
4792 mlog_exit(status); 5155 mlog_exit(status);
4793} 5156}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 90383ed61005..17964c0505a9 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -467,11 +467,11 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
467 unsigned to) 467 unsigned to)
468{ 468{
469 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 469 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
470 handle_t *handle = NULL; 470 handle_t *handle;
471 int ret = 0; 471 int ret = 0;
472 472
473 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 473 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
474 if (!handle) { 474 if (IS_ERR(handle)) {
475 ret = -ENOMEM; 475 ret = -ENOMEM;
476 mlog_errno(ret); 476 mlog_errno(ret);
477 goto out; 477 goto out;
@@ -487,7 +487,7 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
487 } 487 }
488out: 488out:
489 if (ret) { 489 if (ret) {
490 if (handle) 490 if (!IS_ERR(handle))
491 ocfs2_commit_trans(osb, handle); 491 ocfs2_commit_trans(osb, handle);
492 handle = ERR_PTR(ret); 492 handle = ERR_PTR(ret);
493 } 493 }
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index cdd162f13650..bc8c5e7d8608 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
1obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o 1obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
2 2
3ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ 3ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
4 quorum.o tcp.o ver.o 4 quorum.o tcp.o netdebug.o ver.o
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
new file mode 100644
index 000000000000..7bf3c0ea7bd9
--- /dev/null
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -0,0 +1,441 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * netdebug.c
5 *
6 * debug functionality for o2net
7 *
8 * Copyright (C) 2005, 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27#ifdef CONFIG_DEBUG_FS
28
29#include <linux/module.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/idr.h>
33#include <linux/kref.h>
34#include <linux/seq_file.h>
35#include <linux/debugfs.h>
36
37#include <linux/uaccess.h>
38
39#include "tcp.h"
40#include "nodemanager.h"
41#define MLOG_MASK_PREFIX ML_TCP
42#include "masklog.h"
43
44#include "tcp_internal.h"
45
/* Names of the debugfs directory and of the two files created inside it. */
46#define O2NET_DEBUG_DIR "o2net"
47#define SC_DEBUG_NAME "sock_containers"
48#define NST_DEBUG_NAME "send_tracking"
49
/* Dentries created by o2net_debugfs_init() and removed by o2net_debugfs_exit(). */
50static struct dentry *o2net_dentry;
51static struct dentry *sc_dentry;
52static struct dentry *nst_dentry;
53
/* Taken around every access to the two lists below in this file. */
54static DEFINE_SPINLOCK(o2net_debug_lock);
55
/*
 * Lists of tracked objects.  Besides real entries, each list also holds one
 * "dummy" cursor entry per open debugfs file (see the *_fop_open functions).
 */
56static LIST_HEAD(sock_containers);
57static LIST_HEAD(send_tracking);
58
/*
 * Link @nst onto the send_tracking list (inserted directly after the list
 * head) while holding o2net_debug_lock.
 */
59void o2net_debug_add_nst(struct o2net_send_tracking *nst)
60{
61 spin_lock(&o2net_debug_lock);
62 list_add(&nst->st_net_debug_item, &send_tracking);
63 spin_unlock(&o2net_debug_lock);
64}
65
/*
 * Unlink @nst from the send_tracking list under o2net_debug_lock.
 *
 * The list_empty() test makes this a no-op when the entry's list node is
 * not currently linked, so it is safe to call on an nst that was
 * initialised but never added, or that was already removed with
 * list_del_init().
 */
66void o2net_debug_del_nst(struct o2net_send_tracking *nst)
67{
68 spin_lock(&o2net_debug_lock);
69 if (!list_empty(&nst->st_net_debug_item))
70 list_del_init(&nst->st_net_debug_item);
71 spin_unlock(&o2net_debug_lock);
72}
73
/*
 * Return the first "real" send-tracking entry that follows @nst_start on the
 * send_tracking list, or NULL if the list head is reached first (or the walk
 * wraps back to @nst_start without finding one).
 *
 * Entries whose st_task is NULL are skipped; nst_fop_open() sets st_task to
 * NULL on the per-open dummy cursor entries, which is how they are told
 * apart from real entries.
 *
 * The caller must hold o2net_debug_lock.
 */
74static struct o2net_send_tracking
75 *next_nst(struct o2net_send_tracking *nst_start)
76{
77 struct o2net_send_tracking *nst, *ret = NULL;
78
79 assert_spin_locked(&o2net_debug_lock);
80
81 list_for_each_entry(nst, &nst_start->st_net_debug_item,
82 st_net_debug_item) {
83 /* discover the head of the list */
84 if (&nst->st_net_debug_item == &send_tracking)
85 break;
86
87 /* use st_task to detect real nsts in the list */
88 if (nst->st_task != NULL) {
89 ret = nst;
90 break;
91 }
92 }
93
94 return ret;
95}
96
/*
 * seq_file ->start callback for the send_tracking file.
 *
 * Returns the first real entry located after this open file's dummy cursor
 * (seq->private), or NULL when there is none.  @pos is not consulted; the
 * position in the list is carried by the dummy entry itself.  The lock is
 * dropped before returning.
 */
97static void *nst_seq_start(struct seq_file *seq, loff_t *pos)
98{
99 struct o2net_send_tracking *nst, *dummy_nst = seq->private;
100
101 spin_lock(&o2net_debug_lock);
102 nst = next_nst(dummy_nst);
103 spin_unlock(&o2net_debug_lock);
104
105 return nst;
106}
107
/*
 * seq_file ->next callback for the send_tracking file.
 *
 * Looks up the real entry following the dummy cursor, unlinks the cursor and,
 * if an entry was found, re-links the cursor directly after that entry so
 * the following lookup starts beyond it.  When nothing is found the cursor
 * is left unlinked.  @v and @pos are not used or updated here.
 *
 * Returns the entry the cursor was moved past, or NULL when the end of the
 * list was reached.
 */
108static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
109{
110 struct o2net_send_tracking *nst, *dummy_nst = seq->private;
111
112 spin_lock(&o2net_debug_lock);
113 nst = next_nst(dummy_nst);
114 list_del_init(&dummy_nst->st_net_debug_item);
115 if (nst)
116 list_add(&dummy_nst->st_net_debug_item,
117 &nst->st_net_debug_item);
118 spin_unlock(&o2net_debug_lock);
119
120 return nst; /* unused, just needs to be null when done */
121}
122
/*
 * seq_file ->show callback for the send_tracking file.
 *
 * @v is ignored: the entry to print is looked up again, under
 * o2net_debug_lock, as the first real entry after the dummy cursor.  If no
 * such entry exists nothing is printed.  The whole record is formatted while
 * the lock is held.  Always returns 0.
 */
123static int nst_seq_show(struct seq_file *seq, void *v)
124{
125 struct o2net_send_tracking *nst, *dummy_nst = seq->private;
126
127 spin_lock(&o2net_debug_lock);
128 nst = next_nst(dummy_nst);
129
130 if (nst != NULL) {
131 /* get_task_comm isn't exported. oh well. */
132 seq_printf(seq, "%p:\n"
133 " pid: %lu\n"
134 " tgid: %lu\n"
135 " process name: %s\n"
136 " node: %u\n"
137 " sc: %p\n"
138 " message id: %d\n"
139 " message type: %u\n"
140 " message key: 0x%08x\n"
141 " sock acquiry: %lu.%lu\n"
142 " send start: %lu.%lu\n"
143 " wait start: %lu.%lu\n",
144 nst, (unsigned long)nst->st_task->pid,
145 (unsigned long)nst->st_task->tgid,
146 nst->st_task->comm, nst->st_node,
147 nst->st_sc, nst->st_id, nst->st_msg_type,
148 nst->st_msg_key,
149 nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec,
150 nst->st_send_time.tv_sec, nst->st_send_time.tv_usec,
151 nst->st_status_time.tv_sec,
152 nst->st_status_time.tv_usec);
153 }
154
155 spin_unlock(&o2net_debug_lock);
156
157 return 0;
158}
159
/*
 * seq_file ->stop callback for the send_tracking file.  Intentionally empty:
 * the other callbacks release o2net_debug_lock before they return, so there
 * is nothing left to undo here.
 */
160static void nst_seq_stop(struct seq_file *seq, void *v)
161{
162}
163
/* seq_file iterator callbacks for the send_tracking file; passed to seq_open(). */
164static struct seq_operations nst_seq_ops = {
165 .start = nst_seq_start,
166 .next = nst_seq_next,
167 .stop = nst_seq_stop,
168 .show = nst_seq_show,
169};
170
/*
 * ->open for the send_tracking file.
 *
 * Allocates a dummy o2net_send_tracking to act as this open file's cursor,
 * marks it as a dummy by setting st_task to NULL (only that field is
 * initialised; the memory comes from kmalloc), opens the seq_file, stores
 * the dummy in seq->private and links it onto the send_tracking list.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the error
 * returned by seq_open().  On success dummy_nst is set to NULL before the
 * common exit so that the kfree() there only frees it on the error paths.
 */
171static int nst_fop_open(struct inode *inode, struct file *file)
172{
173 struct o2net_send_tracking *dummy_nst;
174 struct seq_file *seq;
175 int ret;
176
177 dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL);
178 if (dummy_nst == NULL) {
179 ret = -ENOMEM;
180 goto out;
181 }
182 dummy_nst->st_task = NULL;
183
184 ret = seq_open(file, &nst_seq_ops);
185 if (ret)
186 goto out;
187
188 seq = file->private_data;
189 seq->private = dummy_nst;
190 o2net_debug_add_nst(dummy_nst);
191
192 dummy_nst = NULL;
193
194out:
195 kfree(dummy_nst);
196 return ret;
197}
198
/*
 * ->release for the send_tracking file.
 *
 * Unlinks this open file's dummy cursor from the send_tracking list (a no-op
 * if iteration already left it unlinked) and then hands off to
 * seq_release_private().
 * NOTE(review): seq_release_private() is expected to free seq->private,
 * i.e. the dummy allocated in nst_fop_open() -- confirm against seq_file.
 */
199static int nst_fop_release(struct inode *inode, struct file *file)
200{
201 struct seq_file *seq = file->private_data;
202 struct o2net_send_tracking *dummy_nst = seq->private;
203
204 o2net_debug_del_nst(dummy_nst);
205 return seq_release_private(inode, file);
206}
207
/*
 * File operations for the send_tracking debugfs file: custom open/release
 * manage the per-open dummy cursor, read and llseek use the generic
 * seq_file helpers.
 */
208static struct file_operations nst_seq_fops = {
209 .open = nst_fop_open,
210 .read = seq_read,
211 .llseek = seq_lseek,
212 .release = nst_fop_release,
213};
214
/*
 * Link @sc onto the sock_containers list (inserted directly after the list
 * head) while holding o2net_debug_lock.
 */
215void o2net_debug_add_sc(struct o2net_sock_container *sc)
216{
217 spin_lock(&o2net_debug_lock);
218 list_add(&sc->sc_net_debug_item, &sock_containers);
219 spin_unlock(&o2net_debug_lock);
220}
221
/*
 * Unlink @sc from the sock_containers list under o2net_debug_lock.
 *
 * Unlike o2net_debug_del_nst() there is no list_empty() check here; the
 * entry is removed and re-initialised unconditionally.
 */
222void o2net_debug_del_sc(struct o2net_sock_container *sc)
223{
224 spin_lock(&o2net_debug_lock);
225 list_del_init(&sc->sc_net_debug_item);
226 spin_unlock(&o2net_debug_lock);
227}
228
/*
 * Return the first "real" sock container that follows @sc_start on the
 * sock_containers list, or NULL if the list head is reached first (or the
 * walk wraps back to @sc_start without finding one).
 *
 * Entries whose sc_page is NULL are skipped; sc_fop_open() sets sc_page to
 * NULL on the per-open dummy cursor entries, which is how they are told
 * apart from real entries.
 *
 * The caller must hold o2net_debug_lock.
 */
229static struct o2net_sock_container
230 *next_sc(struct o2net_sock_container *sc_start)
231{
232 struct o2net_sock_container *sc, *ret = NULL;
233
234 assert_spin_locked(&o2net_debug_lock);
235
236 list_for_each_entry(sc, &sc_start->sc_net_debug_item,
237 sc_net_debug_item) {
238 /* discover the head of the list miscast as a sc */
239 if (&sc->sc_net_debug_item == &sock_containers)
240 break;
241
242 /* use sc_page to detect real scs in the list */
243 if (sc->sc_page != NULL) {
244 ret = sc;
245 break;
246 }
247 }
248
249 return ret;
250}
251
/*
 * seq_file ->start callback for the sock_containers file.
 *
 * Returns the first real sock container located after this open file's
 * dummy cursor (seq->private), or NULL when there is none.  @pos is not
 * consulted; the position in the list is carried by the dummy entry itself.
 * The lock is dropped before returning.
 */
252static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
253{
254 struct o2net_sock_container *sc, *dummy_sc = seq->private;
255
256 spin_lock(&o2net_debug_lock);
257 sc = next_sc(dummy_sc);
258 spin_unlock(&o2net_debug_lock);
259
260 return sc;
261}
262
/*
 * seq_file ->next callback for the sock_containers file.
 *
 * Looks up the real sock container following the dummy cursor, unlinks the
 * cursor and, if a container was found, re-links the cursor directly after
 * it so the following lookup starts beyond it.  When nothing is found the
 * cursor is left unlinked.  @v and @pos are not used or updated here.
 *
 * Returns the container the cursor was moved past, or NULL when the end of
 * the list was reached.
 */
263static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
264{
265 struct o2net_sock_container *sc, *dummy_sc = seq->private;
266
267 spin_lock(&o2net_debug_lock);
268 sc = next_sc(dummy_sc);
269 list_del_init(&dummy_sc->sc_net_debug_item);
270 if (sc)
271 list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item);
272 spin_unlock(&o2net_debug_lock);
273
274 return sc; /* unused, just needs to be null when done */
275}
276
/*
 * Expands to two comma-separated expressions, TV.tv_sec and TV.tv_usec, for
 * use as a pair of arguments to a "%lu.%lu" format.  The argument is not
 * parenthesised, so pass a simple lvalue.
 */
277#define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec
278
/*
 * seq_file ->show callback for the sock_containers file.
 *
 * @v is ignored: the container to print is looked up again, under
 * o2net_debug_lock, as the first real entry after the dummy cursor.  If no
 * such entry exists nothing is printed.  Socket addresses and ports are read
 * only when sc->sc_sock is set; otherwise they are printed as zero.  Always
 * returns 0.
 *
 * NOTE(review): sc->sc_node is dereferenced unconditionally below.  Confirm
 * that a container can never be on the sock_containers list with sc_node
 * already cleared (check the ordering in the sc release path in tcp.c).
 */
279static int sc_seq_show(struct seq_file *seq, void *v)
280{
281 struct o2net_sock_container *sc, *dummy_sc = seq->private;
282
283 spin_lock(&o2net_debug_lock);
284 sc = next_sc(dummy_sc);
285
286 if (sc != NULL) {
287 struct inet_sock *inet = NULL;
288
289 __be32 saddr = 0, daddr = 0;
290 __be16 sport = 0, dport = 0;
291
292 if (sc->sc_sock) {
293 inet = inet_sk(sc->sc_sock->sk);
294 /* the stack's structs aren't sparse endian clean */
295 saddr = (__force __be32)inet->saddr;
296 daddr = (__force __be32)inet->daddr;
297 sport = (__force __be16)inet->sport;
298 dport = (__force __be16)inet->dport;
299 }
300
301 /* XXX sigh, inet-> doesn't have sparse annotation so any
302 * use of it here generates a warning with -Wbitwise */
303 seq_printf(seq, "%p:\n"
304 " krefs: %d\n"
305 " sock: %u.%u.%u.%u:%u -> "
306 "%u.%u.%u.%u:%u\n"
307 " remote node: %s\n"
308 " page off: %zu\n"
309 " handshake ok: %u\n"
310 " timer: %lu.%lu\n"
311 " data ready: %lu.%lu\n"
312 " advance start: %lu.%lu\n"
313 " advance stop: %lu.%lu\n"
314 " func start: %lu.%lu\n"
315 " func stop: %lu.%lu\n"
316 " func key: %u\n"
317 " func type: %u\n",
318 sc,
319 atomic_read(&sc->sc_kref.refcount),
320 NIPQUAD(saddr), inet ? ntohs(sport) : 0,
321 NIPQUAD(daddr), inet ? ntohs(dport) : 0,
322 sc->sc_node->nd_name,
323 sc->sc_page_off,
324 sc->sc_handshake_ok,
325 TV_SEC_USEC(sc->sc_tv_timer),
326 TV_SEC_USEC(sc->sc_tv_data_ready),
327 TV_SEC_USEC(sc->sc_tv_advance_start),
328 TV_SEC_USEC(sc->sc_tv_advance_stop),
329 TV_SEC_USEC(sc->sc_tv_func_start),
330 TV_SEC_USEC(sc->sc_tv_func_stop),
331 sc->sc_msg_key,
332 sc->sc_msg_type);
333 }
334
335
336 spin_unlock(&o2net_debug_lock);
337
338 return 0;
339}
340
/*
 * seq_file ->stop callback for the sock_containers file.  Intentionally
 * empty: the other callbacks release o2net_debug_lock before they return,
 * so there is nothing left to undo here.
 */
341static void sc_seq_stop(struct seq_file *seq, void *v)
342{
343}
344
/* seq_file iterator callbacks for the sock_containers file; passed to seq_open(). */
345static struct seq_operations sc_seq_ops = {
346 .start = sc_seq_start,
347 .next = sc_seq_next,
348 .stop = sc_seq_stop,
349 .show = sc_seq_show,
350};
351
/*
 * ->open for the sock_containers file.
 *
 * Allocates a dummy o2net_sock_container to act as this open file's cursor,
 * marks it as a dummy by setting sc_page to NULL (only that field is
 * initialised; the memory comes from kmalloc), opens the seq_file, stores
 * the dummy in seq->private and links it onto the sock_containers list.
 * next_sc() never returns entries with a NULL sc_page, so the dummy's other
 * fields are not read by the show callback.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the error
 * returned by seq_open().  On success dummy_sc is set to NULL before the
 * common exit so that the kfree() there only frees it on the error paths.
 */
352static int sc_fop_open(struct inode *inode, struct file *file)
353{
354 struct o2net_sock_container *dummy_sc;
355 struct seq_file *seq;
356 int ret;
357
358 dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL);
359 if (dummy_sc == NULL) {
360 ret = -ENOMEM;
361 goto out;
362 }
363 dummy_sc->sc_page = NULL;
364
365 ret = seq_open(file, &sc_seq_ops);
366 if (ret)
367 goto out;
368
369 seq = file->private_data;
370 seq->private = dummy_sc;
371 o2net_debug_add_sc(dummy_sc);
372
373 dummy_sc = NULL;
374
375out:
376 kfree(dummy_sc);
377 return ret;
378}
379
/*
 * ->release for the sock_containers file.
 *
 * Unlinks this open file's dummy cursor from the sock_containers list and
 * then hands off to seq_release_private().
 * NOTE(review): seq_release_private() is expected to free seq->private,
 * i.e. the dummy allocated in sc_fop_open() -- confirm against seq_file.
 */
380static int sc_fop_release(struct inode *inode, struct file *file)
381{
382 struct seq_file *seq = file->private_data;
383 struct o2net_sock_container *dummy_sc = seq->private;
384
385 o2net_debug_del_sc(dummy_sc);
386 return seq_release_private(inode, file);
387}
388
/*
 * File operations for the sock_containers debugfs file: custom open/release
 * manage the per-open dummy cursor, read and llseek use the generic
 * seq_file helpers.
 */
389static struct file_operations sc_seq_fops = {
390 .open = sc_fop_open,
391 .read = seq_read,
392 .llseek = seq_lseek,
393 .release = sc_fop_release,
394};
395
/*
 * Create the o2net debugfs hierarchy: a directory named O2NET_DEBUG_DIR with
 * a NULL parent, containing the NST_DEBUG_NAME and SC_DEBUG_NAME files.
 * Both files are created with mode S_IFREG|S_IRUSR.
 *
 * Returns 0 on success.  If any creation call returns NULL, -ENOMEM is
 * logged, whatever was created so far is removed, and -ENOMEM is returned.
 *
 * NOTE(review): the bail path removes the dentries but does not reset the
 * static pointers to NULL; confirm o2net_debugfs_exit() is never called
 * after a failed init, otherwise the same dentries would be removed twice.
 */
396int o2net_debugfs_init(void)
397{
398 o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
399 if (!o2net_dentry) {
400 mlog_errno(-ENOMEM);
401 goto bail;
402 }
403
404 nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR,
405 o2net_dentry, NULL,
406 &nst_seq_fops);
407 if (!nst_dentry) {
408 mlog_errno(-ENOMEM);
409 goto bail;
410 }
411
412 sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR,
413 o2net_dentry, NULL,
414 &sc_seq_fops);
415 if (!sc_dentry) {
416 mlog_errno(-ENOMEM);
417 goto bail;
418 }
419
420 return 0;
421bail:
422 if (sc_dentry)
423 debugfs_remove(sc_dentry);
424 if (nst_dentry)
425 debugfs_remove(nst_dentry);
426 if (o2net_dentry)
427 debugfs_remove(o2net_dentry);
428 return -ENOMEM;
429}
430
/*
 * Tear down the o2net debugfs hierarchy: remove the two files first and the
 * containing directory last.  Each dentry is only removed if its pointer is
 * non-NULL.
 */
431void o2net_debugfs_exit(void)
432{
433 if (sc_dentry)
434 debugfs_remove(sc_dentry);
435 if (nst_dentry)
436 debugfs_remove(nst_dentry);
437 if (o2net_dentry)
438 debugfs_remove(o2net_dentry);
439}
440
441#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 709fba25bf7e..cf9401e8cd0b 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -959,7 +959,10 @@ static int __init init_o2nm(void)
959 cluster_print_version(); 959 cluster_print_version();
960 960
961 o2hb_init(); 961 o2hb_init();
962 o2net_init(); 962
963 ret = o2net_init();
964 if (ret)
965 goto out;
963 966
964 ocfs2_table_header = register_sysctl_table(ocfs2_root_table); 967 ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
965 if (!ocfs2_table_header) { 968 if (!ocfs2_table_header) {
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 0c095ce7723d..98429fd68499 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -57,6 +57,7 @@ static struct kset *o2cb_kset;
57void o2cb_sys_shutdown(void) 57void o2cb_sys_shutdown(void)
58{ 58{
59 mlog_sys_shutdown(); 59 mlog_sys_shutdown();
60 sysfs_remove_link(NULL, "o2cb");
60 kset_unregister(o2cb_kset); 61 kset_unregister(o2cb_kset);
61} 62}
62 63
@@ -68,6 +69,14 @@ int o2cb_sys_init(void)
68 if (!o2cb_kset) 69 if (!o2cb_kset)
69 return -ENOMEM; 70 return -ENOMEM;
70 71
72 /*
73 * Create this symlink for backwards compatibility with old
74 * versions of ocfs2-tools which look for things in /sys/o2cb.
75 */
76 ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
77 if (ret)
78 goto error;
79
71 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); 80 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
72 if (ret) 81 if (ret)
73 goto error; 82 goto error;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index b8057c51b205..1e44ad14881a 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -142,23 +142,65 @@ static void o2net_idle_timer(unsigned long data);
142static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); 142static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
143static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); 143static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
144 144
145/* 145static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
146 * FIXME: These should use to_o2nm_cluster_from_node(), but we end up 146 u32 msgkey, struct task_struct *task, u8 node)
147 * losing our parent link to the cluster during shutdown. This can be 147{
148 * solved by adding a pre-removal callback to configfs, or passing 148#ifdef CONFIG_DEBUG_FS
149 * around the cluster with the node. -jeffm 149 INIT_LIST_HEAD(&nst->st_net_debug_item);
150 */ 150 nst->st_task = task;
151static inline int o2net_reconnect_delay(struct o2nm_node *node) 151 nst->st_msg_type = msgtype;
152 nst->st_msg_key = msgkey;
153 nst->st_node = node;
154#endif
155}
156
157static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
158{
159#ifdef CONFIG_DEBUG_FS
160 do_gettimeofday(&nst->st_sock_time);
161#endif
162}
163
164static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
165{
166#ifdef CONFIG_DEBUG_FS
167 do_gettimeofday(&nst->st_send_time);
168#endif
169}
170
171static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
172{
173#ifdef CONFIG_DEBUG_FS
174 do_gettimeofday(&nst->st_status_time);
175#endif
176}
177
178static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
179 struct o2net_sock_container *sc)
180{
181#ifdef CONFIG_DEBUG_FS
182 nst->st_sc = sc;
183#endif
184}
185
186static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
187{
188#ifdef CONFIG_DEBUG_FS
189 nst->st_id = msg_id;
190#endif
191}
192
193static inline int o2net_reconnect_delay(void)
152{ 194{
153 return o2nm_single_cluster->cl_reconnect_delay_ms; 195 return o2nm_single_cluster->cl_reconnect_delay_ms;
154} 196}
155 197
156static inline int o2net_keepalive_delay(struct o2nm_node *node) 198static inline int o2net_keepalive_delay(void)
157{ 199{
158 return o2nm_single_cluster->cl_keepalive_delay_ms; 200 return o2nm_single_cluster->cl_keepalive_delay_ms;
159} 201}
160 202
161static inline int o2net_idle_timeout(struct o2nm_node *node) 203static inline int o2net_idle_timeout(void)
162{ 204{
163 return o2nm_single_cluster->cl_idle_timeout_ms; 205 return o2nm_single_cluster->cl_idle_timeout_ms;
164} 206}
@@ -296,6 +338,7 @@ static void sc_kref_release(struct kref *kref)
296 o2nm_node_put(sc->sc_node); 338 o2nm_node_put(sc->sc_node);
297 sc->sc_node = NULL; 339 sc->sc_node = NULL;
298 340
341 o2net_debug_del_sc(sc);
299 kfree(sc); 342 kfree(sc);
300} 343}
301 344
@@ -336,6 +379,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
336 379
337 ret = sc; 380 ret = sc;
338 sc->sc_page = page; 381 sc->sc_page = page;
382 o2net_debug_add_sc(sc);
339 sc = NULL; 383 sc = NULL;
340 page = NULL; 384 page = NULL;
341 385
@@ -399,8 +443,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
399 mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); 443 mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
400 mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); 444 mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
401 445
402 /* we won't reconnect after our valid conn goes away for
403 * this hb iteration.. here so it shows up in the logs */
404 if (was_valid && !valid && err == 0) 446 if (was_valid && !valid && err == 0)
405 err = -ENOTCONN; 447 err = -ENOTCONN;
406 448
@@ -430,11 +472,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
430 472
431 if (!was_valid && valid) { 473 if (!was_valid && valid) {
432 o2quo_conn_up(o2net_num_from_nn(nn)); 474 o2quo_conn_up(o2net_num_from_nn(nn));
433 /* this is a bit of a hack. we only try reconnecting
434 * when heartbeating starts until we get a connection.
435 * if that connection then dies we don't try reconnecting.
436 * the only way to start connecting again is to down
437 * heartbeat and bring it back up. */
438 cancel_delayed_work(&nn->nn_connect_expired); 475 cancel_delayed_work(&nn->nn_connect_expired);
439 printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", 476 printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
440 o2nm_this_node() > sc->sc_node->nd_num ? 477 o2nm_this_node() > sc->sc_node->nd_num ?
@@ -451,12 +488,24 @@ static void o2net_set_nn_state(struct o2net_node *nn,
451 /* delay if we're withing a RECONNECT_DELAY of the 488 /* delay if we're withing a RECONNECT_DELAY of the
452 * last attempt */ 489 * last attempt */
453 delay = (nn->nn_last_connect_attempt + 490 delay = (nn->nn_last_connect_attempt +
454 msecs_to_jiffies(o2net_reconnect_delay(NULL))) 491 msecs_to_jiffies(o2net_reconnect_delay()))
455 - jiffies; 492 - jiffies;
456 if (delay > msecs_to_jiffies(o2net_reconnect_delay(NULL))) 493 if (delay > msecs_to_jiffies(o2net_reconnect_delay()))
457 delay = 0; 494 delay = 0;
458 mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); 495 mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
459 queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); 496 queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
497
498 /*
499 * Delay the expired work after idle timeout.
500 *
501 * We might have lots of failed connection attempts that run
502 * through here but we only cancel the connect_expired work when
503 * a connection attempt succeeds. So only the first enqueue of
504 * the connect_expired work will do anything. The rest will see
505 * that it's already queued and do nothing.
506 */
507 delay += msecs_to_jiffies(o2net_idle_timeout());
508 queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay);
460 } 509 }
461 510
462 /* keep track of the nn's sc ref for the caller */ 511 /* keep track of the nn's sc ref for the caller */
@@ -914,6 +963,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
914 struct o2net_status_wait nsw = { 963 struct o2net_status_wait nsw = {
915 .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), 964 .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
916 }; 965 };
966 struct o2net_send_tracking nst;
967
968 o2net_init_nst(&nst, msg_type, key, current, target_node);
917 969
918 if (o2net_wq == NULL) { 970 if (o2net_wq == NULL) {
919 mlog(0, "attempt to tx without o2netd running\n"); 971 mlog(0, "attempt to tx without o2netd running\n");
@@ -939,6 +991,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
939 goto out; 991 goto out;
940 } 992 }
941 993
994 o2net_debug_add_nst(&nst);
995
996 o2net_set_nst_sock_time(&nst);
997
942 ret = wait_event_interruptible(nn->nn_sc_wq, 998 ret = wait_event_interruptible(nn->nn_sc_wq,
943 o2net_tx_can_proceed(nn, &sc, &error)); 999 o2net_tx_can_proceed(nn, &sc, &error));
944 if (!ret && error) 1000 if (!ret && error)
@@ -946,6 +1002,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
946 if (ret) 1002 if (ret)
947 goto out; 1003 goto out;
948 1004
1005 o2net_set_nst_sock_container(&nst, sc);
1006
949 veclen = caller_veclen + 1; 1007 veclen = caller_veclen + 1;
950 vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); 1008 vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
951 if (vec == NULL) { 1009 if (vec == NULL) {
@@ -972,6 +1030,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
972 goto out; 1030 goto out;
973 1031
974 msg->msg_num = cpu_to_be32(nsw.ns_id); 1032 msg->msg_num = cpu_to_be32(nsw.ns_id);
1033 o2net_set_nst_msg_id(&nst, nsw.ns_id);
1034
1035 o2net_set_nst_send_time(&nst);
975 1036
976 /* finally, convert the message header to network byte-order 1037 /* finally, convert the message header to network byte-order
977 * and send */ 1038 * and send */
@@ -986,6 +1047,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
986 } 1047 }
987 1048
988 /* wait on other node's handler */ 1049 /* wait on other node's handler */
1050 o2net_set_nst_status_time(&nst);
989 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); 1051 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
990 1052
991 /* Note that we avoid overwriting the callers status return 1053 /* Note that we avoid overwriting the callers status return
@@ -998,6 +1060,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
998 mlog(0, "woken, returning system status %d, user status %d\n", 1060 mlog(0, "woken, returning system status %d, user status %d\n",
999 ret, nsw.ns_status); 1061 ret, nsw.ns_status);
1000out: 1062out:
1063 o2net_debug_del_nst(&nst); /* must be before dropping sc and node */
1001 if (sc) 1064 if (sc)
1002 sc_put(sc); 1065 sc_put(sc);
1003 if (vec) 1066 if (vec)
@@ -1154,23 +1217,23 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
1154 * but isn't. This can ultimately cause corruption. 1217 * but isn't. This can ultimately cause corruption.
1155 */ 1218 */
1156 if (be32_to_cpu(hand->o2net_idle_timeout_ms) != 1219 if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
1157 o2net_idle_timeout(sc->sc_node)) { 1220 o2net_idle_timeout()) {
1158 mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " 1221 mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
1159 "%u ms, but we use %u ms locally. disconnecting\n", 1222 "%u ms, but we use %u ms locally. disconnecting\n",
1160 SC_NODEF_ARGS(sc), 1223 SC_NODEF_ARGS(sc),
1161 be32_to_cpu(hand->o2net_idle_timeout_ms), 1224 be32_to_cpu(hand->o2net_idle_timeout_ms),
1162 o2net_idle_timeout(sc->sc_node)); 1225 o2net_idle_timeout());
1163 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1226 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1164 return -1; 1227 return -1;
1165 } 1228 }
1166 1229
1167 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != 1230 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
1168 o2net_keepalive_delay(sc->sc_node)) { 1231 o2net_keepalive_delay()) {
1169 mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " 1232 mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
1170 "%u ms, but we use %u ms locally. disconnecting\n", 1233 "%u ms, but we use %u ms locally. disconnecting\n",
1171 SC_NODEF_ARGS(sc), 1234 SC_NODEF_ARGS(sc),
1172 be32_to_cpu(hand->o2net_keepalive_delay_ms), 1235 be32_to_cpu(hand->o2net_keepalive_delay_ms),
1173 o2net_keepalive_delay(sc->sc_node)); 1236 o2net_keepalive_delay());
1174 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1237 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1175 return -1; 1238 return -1;
1176 } 1239 }
@@ -1193,6 +1256,7 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
1193 * shut down already */ 1256 * shut down already */
1194 if (nn->nn_sc == sc) { 1257 if (nn->nn_sc == sc) {
1195 o2net_sc_reset_idle_timer(sc); 1258 o2net_sc_reset_idle_timer(sc);
1259 atomic_set(&nn->nn_timeout, 0);
1196 o2net_set_nn_state(nn, sc, 1, 0); 1260 o2net_set_nn_state(nn, sc, 1, 0);
1197 } 1261 }
1198 spin_unlock(&nn->nn_lock); 1262 spin_unlock(&nn->nn_lock);
@@ -1347,12 +1411,11 @@ static void o2net_initialize_handshake(void)
1347{ 1411{
1348 o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( 1412 o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
1349 O2HB_MAX_WRITE_TIMEOUT_MS); 1413 O2HB_MAX_WRITE_TIMEOUT_MS);
1350 o2net_hand->o2net_idle_timeout_ms = cpu_to_be32( 1414 o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout());
1351 o2net_idle_timeout(NULL));
1352 o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( 1415 o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32(
1353 o2net_keepalive_delay(NULL)); 1416 o2net_keepalive_delay());
1354 o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( 1417 o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32(
1355 o2net_reconnect_delay(NULL)); 1418 o2net_reconnect_delay());
1356} 1419}
1357 1420
1358/* ------------------------------------------------------------ */ 1421/* ------------------------------------------------------------ */
@@ -1391,14 +1454,15 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
1391static void o2net_idle_timer(unsigned long data) 1454static void o2net_idle_timer(unsigned long data)
1392{ 1455{
1393 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1456 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1457 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1394 struct timeval now; 1458 struct timeval now;
1395 1459
1396 do_gettimeofday(&now); 1460 do_gettimeofday(&now);
1397 1461
1398 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1462 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
1399 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1463 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
1400 o2net_idle_timeout(sc->sc_node) / 1000, 1464 o2net_idle_timeout() / 1000,
1401 o2net_idle_timeout(sc->sc_node) % 1000); 1465 o2net_idle_timeout() % 1000);
1402 mlog(ML_NOTICE, "here are some times that might help debug the " 1466 mlog(ML_NOTICE, "here are some times that might help debug the "
1403 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " 1467 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
1404 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", 1468 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
@@ -1413,6 +1477,12 @@ static void o2net_idle_timer(unsigned long data)
1413 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, 1477 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
1414 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); 1478 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
1415 1479
1480 /*
1481 * Initialize the nn_timeout so that the next connection attempt
1482 * will continue in o2net_start_connect.
1483 */
1484 atomic_set(&nn->nn_timeout, 1);
1485
1416 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 1486 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
1417} 1487}
1418 1488
@@ -1420,10 +1490,10 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
1420{ 1490{
1421 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); 1491 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
1422 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, 1492 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
1423 msecs_to_jiffies(o2net_keepalive_delay(sc->sc_node))); 1493 msecs_to_jiffies(o2net_keepalive_delay()));
1424 do_gettimeofday(&sc->sc_tv_timer); 1494 do_gettimeofday(&sc->sc_tv_timer);
1425 mod_timer(&sc->sc_idle_timeout, 1495 mod_timer(&sc->sc_idle_timeout,
1426 jiffies + msecs_to_jiffies(o2net_idle_timeout(sc->sc_node))); 1496 jiffies + msecs_to_jiffies(o2net_idle_timeout()));
1427} 1497}
1428 1498
1429static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) 1499static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
@@ -1447,6 +1517,7 @@ static void o2net_start_connect(struct work_struct *work)
1447 struct socket *sock = NULL; 1517 struct socket *sock = NULL;
1448 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; 1518 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
1449 int ret = 0, stop; 1519 int ret = 0, stop;
1520 unsigned int timeout;
1450 1521
1451 /* if we're greater we initiate tx, otherwise we accept */ 1522 /* if we're greater we initiate tx, otherwise we accept */
1452 if (o2nm_this_node() <= o2net_num_from_nn(nn)) 1523 if (o2nm_this_node() <= o2net_num_from_nn(nn))
@@ -1466,8 +1537,17 @@ static void o2net_start_connect(struct work_struct *work)
1466 } 1537 }
1467 1538
1468 spin_lock(&nn->nn_lock); 1539 spin_lock(&nn->nn_lock);
1469 /* see if we already have one pending or have given up */ 1540 /*
1470 stop = (nn->nn_sc || nn->nn_persistent_error); 1541 * see if we already have one pending or have given up.
1542 * For nn_timeout, it is set when we close the connection
1543 * because of the idle time out. So it means that we have
1544 * at least connected to that node successfully once,
1545 * now try to connect to it again.
1546 */
1547 timeout = atomic_read(&nn->nn_timeout);
1548 stop = (nn->nn_sc ||
1549 (nn->nn_persistent_error &&
1550 (nn->nn_persistent_error != -ENOTCONN || timeout == 0)));
1471 spin_unlock(&nn->nn_lock); 1551 spin_unlock(&nn->nn_lock);
1472 if (stop) 1552 if (stop)
1473 goto out; 1553 goto out;
@@ -1555,8 +1635,8 @@ static void o2net_connect_expired(struct work_struct *work)
1555 mlog(ML_ERROR, "no connection established with node %u after " 1635 mlog(ML_ERROR, "no connection established with node %u after "
1556 "%u.%u seconds, giving up and returning errors.\n", 1636 "%u.%u seconds, giving up and returning errors.\n",
1557 o2net_num_from_nn(nn), 1637 o2net_num_from_nn(nn),
1558 o2net_idle_timeout(NULL) / 1000, 1638 o2net_idle_timeout() / 1000,
1559 o2net_idle_timeout(NULL) % 1000); 1639 o2net_idle_timeout() % 1000);
1560 1640
1561 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); 1641 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
1562 } 1642 }
@@ -1579,6 +1659,7 @@ void o2net_disconnect_node(struct o2nm_node *node)
1579 1659
1580 /* don't reconnect until it's heartbeating again */ 1660 /* don't reconnect until it's heartbeating again */
1581 spin_lock(&nn->nn_lock); 1661 spin_lock(&nn->nn_lock);
1662 atomic_set(&nn->nn_timeout, 0);
1582 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); 1663 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
1583 spin_unlock(&nn->nn_lock); 1664 spin_unlock(&nn->nn_lock);
1584 1665
@@ -1610,20 +1691,15 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1610 1691
1611 /* ensure an immediate connect attempt */ 1692 /* ensure an immediate connect attempt */
1612 nn->nn_last_connect_attempt = jiffies - 1693 nn->nn_last_connect_attempt = jiffies -
1613 (msecs_to_jiffies(o2net_reconnect_delay(node)) + 1); 1694 (msecs_to_jiffies(o2net_reconnect_delay()) + 1);
1614 1695
1615 if (node_num != o2nm_this_node()) { 1696 if (node_num != o2nm_this_node()) {
1616 /* heartbeat doesn't work unless a local node number is
1617 * configured and doing so brings up the o2net_wq, so we can
1618 * use it.. */
1619 queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
1620 msecs_to_jiffies(o2net_idle_timeout(node)));
1621
1622 /* believe it or not, accept and node hearbeating testing 1697 /* believe it or not, accept and node hearbeating testing
1623 * can succeed for this node before we got here.. so 1698 * can succeed for this node before we got here.. so
1624 * only use set_nn_state to clear the persistent error 1699 * only use set_nn_state to clear the persistent error
1625 * if that hasn't already happened */ 1700 * if that hasn't already happened */
1626 spin_lock(&nn->nn_lock); 1701 spin_lock(&nn->nn_lock);
1702 atomic_set(&nn->nn_timeout, 0);
1627 if (nn->nn_persistent_error) 1703 if (nn->nn_persistent_error)
1628 o2net_set_nn_state(nn, NULL, 0, 0); 1704 o2net_set_nn_state(nn, NULL, 0, 0);
1629 spin_unlock(&nn->nn_lock); 1705 spin_unlock(&nn->nn_lock);
@@ -1747,6 +1823,7 @@ static int o2net_accept_one(struct socket *sock)
1747 new_sock = NULL; 1823 new_sock = NULL;
1748 1824
1749 spin_lock(&nn->nn_lock); 1825 spin_lock(&nn->nn_lock);
1826 atomic_set(&nn->nn_timeout, 0);
1750 o2net_set_nn_state(nn, sc, 0, 0); 1827 o2net_set_nn_state(nn, sc, 0, 0);
1751 spin_unlock(&nn->nn_lock); 1828 spin_unlock(&nn->nn_lock);
1752 1829
@@ -1922,6 +1999,9 @@ int o2net_init(void)
1922 1999
1923 o2quo_init(); 2000 o2quo_init();
1924 2001
2002 if (o2net_debugfs_init())
2003 return -ENOMEM;
2004
1925 o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); 2005 o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
1926 o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); 2006 o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
1927 o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); 2007 o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
@@ -1941,6 +2021,7 @@ int o2net_init(void)
1941 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { 2021 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
1942 struct o2net_node *nn = o2net_nn_from_num(i); 2022 struct o2net_node *nn = o2net_nn_from_num(i);
1943 2023
2024 atomic_set(&nn->nn_timeout, 0);
1944 spin_lock_init(&nn->nn_lock); 2025 spin_lock_init(&nn->nn_lock);
1945 INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect); 2026 INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect);
1946 INIT_DELAYED_WORK(&nn->nn_connect_expired, 2027 INIT_DELAYED_WORK(&nn->nn_connect_expired,
@@ -1962,4 +2043,5 @@ void o2net_exit(void)
1962 kfree(o2net_hand); 2043 kfree(o2net_hand);
1963 kfree(o2net_keep_req); 2044 kfree(o2net_keep_req);
1964 kfree(o2net_keep_resp); 2045 kfree(o2net_keep_resp);
2046 o2net_debugfs_exit();
1965} 2047}
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index f36f66aab3dd..a705d5d19036 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -117,4 +117,36 @@ int o2net_num_connected_peers(void);
117int o2net_init(void); 117int o2net_init(void);
118void o2net_exit(void); 118void o2net_exit(void);
119 119
120struct o2net_send_tracking;
121struct o2net_sock_container;
122
123#ifdef CONFIG_DEBUG_FS
124int o2net_debugfs_init(void);
125void o2net_debugfs_exit(void);
126void o2net_debug_add_nst(struct o2net_send_tracking *nst);
127void o2net_debug_del_nst(struct o2net_send_tracking *nst);
128void o2net_debug_add_sc(struct o2net_sock_container *sc);
129void o2net_debug_del_sc(struct o2net_sock_container *sc);
130#else
131static int o2net_debugfs_init(void)
132{
133 return 0;
134}
135static void o2net_debugfs_exit(void)
136{
137}
138static void o2net_debug_add_nst(struct o2net_send_tracking *nst)
139{
140}
141static void o2net_debug_del_nst(struct o2net_send_tracking *nst)
142{
143}
144static void o2net_debug_add_sc(struct o2net_sock_container *sc)
145{
146}
147static void o2net_debug_del_sc(struct o2net_sock_container *sc)
148{
149}
150#endif /* CONFIG_DEBUG_FS */
151
120#endif /* O2CLUSTER_TCP_H */ 152#endif /* O2CLUSTER_TCP_H */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index d25b9af28500..8d58cfe410b1 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -95,6 +95,8 @@ struct o2net_node {
95 unsigned nn_sc_valid:1; 95 unsigned nn_sc_valid:1;
96 /* if this is set tx just returns it */ 96 /* if this is set tx just returns it */
97 int nn_persistent_error; 97 int nn_persistent_error;
98 /* It is only set to 1 after the idle time out. */
99 atomic_t nn_timeout;
98 100
99 /* threads waiting for an sc to arrive wait on the wq for generation 101 /* threads waiting for an sc to arrive wait on the wq for generation
100 * to increase. it is increased when a connecting socket succeeds 102 * to increase. it is increased when a connecting socket succeeds
@@ -164,7 +166,9 @@ struct o2net_sock_container {
164 /* original handlers for the sockets */ 166 /* original handlers for the sockets */
165 void (*sc_state_change)(struct sock *sk); 167 void (*sc_state_change)(struct sock *sk);
166 void (*sc_data_ready)(struct sock *sk, int bytes); 168 void (*sc_data_ready)(struct sock *sk, int bytes);
167 169#ifdef CONFIG_DEBUG_FS
170 struct list_head sc_net_debug_item;
171#endif
168 struct timeval sc_tv_timer; 172 struct timeval sc_tv_timer;
169 struct timeval sc_tv_data_ready; 173 struct timeval sc_tv_data_ready;
170 struct timeval sc_tv_advance_start; 174 struct timeval sc_tv_advance_start;
@@ -206,4 +210,24 @@ struct o2net_status_wait {
206 struct list_head ns_node_item; 210 struct list_head ns_node_item;
207}; 211};
208 212
213#ifdef CONFIG_DEBUG_FS
214/* just for state dumps */
215struct o2net_send_tracking {
216 struct list_head st_net_debug_item;
217 struct task_struct *st_task;
218 struct o2net_sock_container *st_sc;
219 u32 st_id;
220 u32 st_msg_type;
221 u32 st_msg_key;
222 u8 st_node;
223 struct timeval st_sock_time;
224 struct timeval st_send_time;
225 struct timeval st_status_time;
226};
227#else
228struct o2net_send_tracking {
229 u32 dummy;
230};
231#endif /* CONFIG_DEBUG_FS */
232
209#endif /* O2CLUSTER_TCP_INTERNAL_H */ 233#endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index ce3f7c29d270..190361375700 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,6 +1,6 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1EXTRA_CFLAGS += -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o
4 4
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ 5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o 6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index dc8ea666efdb..d5a86fb81a49 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -49,6 +49,41 @@
49/* Intended to make it easier for us to switch out hash functions */ 49/* Intended to make it easier for us to switch out hash functions */
50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) 50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
51 51
52enum dlm_mle_type {
53 DLM_MLE_BLOCK,
54 DLM_MLE_MASTER,
55 DLM_MLE_MIGRATION
56};
57
58struct dlm_lock_name {
59 u8 len;
60 u8 name[DLM_LOCKID_NAME_MAX];
61};
62
63struct dlm_master_list_entry {
64 struct list_head list;
65 struct list_head hb_events;
66 struct dlm_ctxt *dlm;
67 spinlock_t spinlock;
68 wait_queue_head_t wq;
69 atomic_t woken;
70 struct kref mle_refs;
71 int inuse;
72 unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
73 unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
74 unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
75 unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
76 u8 master;
77 u8 new_master;
78 enum dlm_mle_type type;
79 struct o2hb_callback_func mle_hb_up;
80 struct o2hb_callback_func mle_hb_down;
81 union {
82 struct dlm_lock_resource *res;
83 struct dlm_lock_name name;
84 } u;
85};
86
52enum dlm_ast_type { 87enum dlm_ast_type {
53 DLM_AST = 0, 88 DLM_AST = 0,
54 DLM_BAST, 89 DLM_BAST,
@@ -101,6 +136,7 @@ struct dlm_ctxt
101 struct list_head purge_list; 136 struct list_head purge_list;
102 struct list_head pending_asts; 137 struct list_head pending_asts;
103 struct list_head pending_basts; 138 struct list_head pending_basts;
139 struct list_head tracking_list;
104 unsigned int purge_count; 140 unsigned int purge_count;
105 spinlock_t spinlock; 141 spinlock_t spinlock;
106 spinlock_t ast_lock; 142 spinlock_t ast_lock;
@@ -122,6 +158,9 @@ struct dlm_ctxt
122 atomic_t remote_resources; 158 atomic_t remote_resources;
123 atomic_t unknown_resources; 159 atomic_t unknown_resources;
124 160
161 struct dlm_debug_ctxt *dlm_debug_ctxt;
162 struct dentry *dlm_debugfs_subroot;
163
125 /* NOTE: Next three are protected by dlm_domain_lock */ 164 /* NOTE: Next three are protected by dlm_domain_lock */
126 struct kref dlm_refs; 165 struct kref dlm_refs;
127 enum dlm_ctxt_state dlm_state; 166 enum dlm_ctxt_state dlm_state;
@@ -270,6 +309,9 @@ struct dlm_lock_resource
270 struct list_head dirty; 309 struct list_head dirty;
271 struct list_head recovering; // dlm_recovery_ctxt.resources list 310 struct list_head recovering; // dlm_recovery_ctxt.resources list
272 311
312 /* Added during init and removed during release */
313 struct list_head tracking; /* dlm->tracking_list */
314
273 /* unused lock resources have their last_used stamped and are 315 /* unused lock resources have their last_used stamped and are
274 * put on a list for the dlm thread to run. */ 316 * put on a list for the dlm thread to run. */
275 unsigned long last_used; 317 unsigned long last_used;
@@ -963,9 +1005,16 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
963 DLM_LOCK_RES_MIGRATING)); 1005 DLM_LOCK_RES_MIGRATING));
964} 1006}
965 1007
1008/* create/destroy slab caches */
1009int dlm_init_master_caches(void);
1010void dlm_destroy_master_caches(void);
1011
1012int dlm_init_lock_cache(void);
1013void dlm_destroy_lock_cache(void);
966 1014
967int dlm_init_mle_cache(void); 1015int dlm_init_mle_cache(void);
968void dlm_destroy_mle_cache(void); 1016void dlm_destroy_mle_cache(void);
1017
969void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); 1018void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
970int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, 1019int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
971 struct dlm_lock_resource *res); 1020 struct dlm_lock_resource *res);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 64239b37e5d4..5f6d858770a2 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * debug functionality for the dlm 6 * debug functionality for the dlm
7 * 7 *
8 * Copyright (C) 2004 Oracle. All rights reserved. 8 * Copyright (C) 2004, 2008 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
@@ -30,6 +30,7 @@
30#include <linux/utsname.h> 30#include <linux/utsname.h>
31#include <linux/sysctl.h> 31#include <linux/sysctl.h>
32#include <linux/spinlock.h> 32#include <linux/spinlock.h>
33#include <linux/debugfs.h>
33 34
34#include "cluster/heartbeat.h" 35#include "cluster/heartbeat.h"
35#include "cluster/nodemanager.h" 36#include "cluster/nodemanager.h"
@@ -37,17 +38,16 @@
37 38
38#include "dlmapi.h" 39#include "dlmapi.h"
39#include "dlmcommon.h" 40#include "dlmcommon.h"
40
41#include "dlmdomain.h" 41#include "dlmdomain.h"
42#include "dlmdebug.h"
42 43
43#define MLOG_MASK_PREFIX ML_DLM 44#define MLOG_MASK_PREFIX ML_DLM
44#include "cluster/masklog.h" 45#include "cluster/masklog.h"
45 46
47int stringify_lockname(const char *lockname, int locklen, char *buf, int len);
48
46void dlm_print_one_lock_resource(struct dlm_lock_resource *res) 49void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
47{ 50{
48 mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
49 res->lockname.len, res->lockname.name,
50 res->owner, res->state);
51 spin_lock(&res->spinlock); 51 spin_lock(&res->spinlock);
52 __dlm_print_one_lock_resource(res); 52 __dlm_print_one_lock_resource(res);
53 spin_unlock(&res->spinlock); 53 spin_unlock(&res->spinlock);
@@ -58,7 +58,7 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
58 int bit; 58 int bit;
59 assert_spin_locked(&res->spinlock); 59 assert_spin_locked(&res->spinlock);
60 60
61 mlog(ML_NOTICE, " refmap nodes: [ "); 61 printk(" refmap nodes: [ ");
62 bit = 0; 62 bit = 0;
63 while (1) { 63 while (1) {
64 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); 64 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
@@ -70,63 +70,66 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
70 printk("], inflight=%u\n", res->inflight_locks); 70 printk("], inflight=%u\n", res->inflight_locks);
71} 71}
72 72
73static void __dlm_print_lock(struct dlm_lock *lock)
74{
75 spin_lock(&lock->spinlock);
76
77 printk(" type=%d, conv=%d, node=%u, cookie=%u:%llu, "
78 "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), "
79 "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n",
80 lock->ml.type, lock->ml.convert_type, lock->ml.node,
81 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
82 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
83 atomic_read(&lock->lock_refs.refcount),
84 (list_empty(&lock->ast_list) ? 'y' : 'n'),
85 (lock->ast_pending ? 'y' : 'n'),
86 (list_empty(&lock->bast_list) ? 'y' : 'n'),
87 (lock->bast_pending ? 'y' : 'n'),
88 (lock->convert_pending ? 'y' : 'n'),
89 (lock->lock_pending ? 'y' : 'n'),
90 (lock->cancel_pending ? 'y' : 'n'),
91 (lock->unlock_pending ? 'y' : 'n'));
92
93 spin_unlock(&lock->spinlock);
94}
95
73void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) 96void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
74{ 97{
75 struct list_head *iter2; 98 struct list_head *iter2;
76 struct dlm_lock *lock; 99 struct dlm_lock *lock;
100 char buf[DLM_LOCKID_NAME_MAX];
77 101
78 assert_spin_locked(&res->spinlock); 102 assert_spin_locked(&res->spinlock);
79 103
80 mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n", 104 stringify_lockname(res->lockname.name, res->lockname.len,
81 res->lockname.len, res->lockname.name, 105 buf, sizeof(buf) - 1);
82 res->owner, res->state); 106 printk("lockres: %s, owner=%u, state=%u\n",
83 mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n", 107 buf, res->owner, res->state);
84 res->last_used, list_empty(&res->purge) ? "no" : "yes"); 108 printk(" last used: %lu, refcnt: %u, on purge list: %s\n",
109 res->last_used, atomic_read(&res->refs.refcount),
110 list_empty(&res->purge) ? "no" : "yes");
111 printk(" on dirty list: %s, on reco list: %s, "
112 "migrating pending: %s\n",
113 list_empty(&res->dirty) ? "no" : "yes",
114 list_empty(&res->recovering) ? "no" : "yes",
115 res->migration_pending ? "yes" : "no");
116 printk(" inflight locks: %d, asts reserved: %d\n",
117 res->inflight_locks, atomic_read(&res->asts_reserved));
85 dlm_print_lockres_refmap(res); 118 dlm_print_lockres_refmap(res);
86 mlog(ML_NOTICE, " granted queue: \n"); 119 printk(" granted queue:\n");
87 list_for_each(iter2, &res->granted) { 120 list_for_each(iter2, &res->granted) {
88 lock = list_entry(iter2, struct dlm_lock, list); 121 lock = list_entry(iter2, struct dlm_lock, list);
89 spin_lock(&lock->spinlock); 122 __dlm_print_lock(lock);
90 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
91 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
92 lock->ml.type, lock->ml.convert_type, lock->ml.node,
93 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
94 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
95 list_empty(&lock->ast_list) ? 'y' : 'n',
96 lock->ast_pending ? 'y' : 'n',
97 list_empty(&lock->bast_list) ? 'y' : 'n',
98 lock->bast_pending ? 'y' : 'n');
99 spin_unlock(&lock->spinlock);
100 } 123 }
101 mlog(ML_NOTICE, " converting queue: \n"); 124 printk(" converting queue:\n");
102 list_for_each(iter2, &res->converting) { 125 list_for_each(iter2, &res->converting) {
103 lock = list_entry(iter2, struct dlm_lock, list); 126 lock = list_entry(iter2, struct dlm_lock, list);
104 spin_lock(&lock->spinlock); 127 __dlm_print_lock(lock);
105 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
106 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
107 lock->ml.type, lock->ml.convert_type, lock->ml.node,
108 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
109 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
110 list_empty(&lock->ast_list) ? 'y' : 'n',
111 lock->ast_pending ? 'y' : 'n',
112 list_empty(&lock->bast_list) ? 'y' : 'n',
113 lock->bast_pending ? 'y' : 'n');
114 spin_unlock(&lock->spinlock);
115 } 128 }
116 mlog(ML_NOTICE, " blocked queue: \n"); 129 printk(" blocked queue:\n");
117 list_for_each(iter2, &res->blocked) { 130 list_for_each(iter2, &res->blocked) {
118 lock = list_entry(iter2, struct dlm_lock, list); 131 lock = list_entry(iter2, struct dlm_lock, list);
119 spin_lock(&lock->spinlock); 132 __dlm_print_lock(lock);
120 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
121 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
122 lock->ml.type, lock->ml.convert_type, lock->ml.node,
123 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
124 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
125 list_empty(&lock->ast_list) ? 'y' : 'n',
126 lock->ast_pending ? 'y' : 'n',
127 list_empty(&lock->bast_list) ? 'y' : 'n',
128 lock->bast_pending ? 'y' : 'n');
129 spin_unlock(&lock->spinlock);
130 } 133 }
131} 134}
132 135
@@ -136,31 +139,6 @@ void dlm_print_one_lock(struct dlm_lock *lockid)
136} 139}
137EXPORT_SYMBOL_GPL(dlm_print_one_lock); 140EXPORT_SYMBOL_GPL(dlm_print_one_lock);
138 141
139#if 0
140void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
141{
142 struct dlm_lock_resource *res;
143 struct hlist_node *iter;
144 struct hlist_head *bucket;
145 int i;
146
147 mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
148 dlm->name, dlm->node_num, dlm->key);
149 if (!dlm || !dlm->name) {
150 mlog(ML_ERROR, "dlm=%p\n", dlm);
151 return;
152 }
153
154 spin_lock(&dlm->spinlock);
155 for (i=0; i<DLM_HASH_BUCKETS; i++) {
156 bucket = dlm_lockres_hash(dlm, i);
157 hlist_for_each_entry(res, iter, bucket, hash_node)
158 dlm_print_one_lock_resource(res);
159 }
160 spin_unlock(&dlm->spinlock);
161}
162#endif /* 0 */
163
164static const char *dlm_errnames[] = { 142static const char *dlm_errnames[] = {
165 [DLM_NORMAL] = "DLM_NORMAL", 143 [DLM_NORMAL] = "DLM_NORMAL",
166 [DLM_GRANTED] = "DLM_GRANTED", 144 [DLM_GRANTED] = "DLM_GRANTED",
@@ -266,3 +244,792 @@ const char *dlm_errname(enum dlm_status err)
266 return dlm_errnames[err]; 244 return dlm_errnames[err];
267} 245}
268EXPORT_SYMBOL_GPL(dlm_errname); 246EXPORT_SYMBOL_GPL(dlm_errname);
247
248/* NOTE: This function converts a lockname into a string. It uses knowledge
249 * of the format of the lockname that should be outside the purview of the dlm.
250 * We are adding only to make dlm debugging slightly easier.
251 *
252 * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h.
253 */
254int stringify_lockname(const char *lockname, int locklen, char *buf, int len)
255{
256 int out = 0;
257 __be64 inode_blkno_be;
258
259#define OCFS2_DENTRY_LOCK_INO_START 18
260 if (*lockname == 'N') {
261 memcpy((__be64 *)&inode_blkno_be,
262 (char *)&lockname[OCFS2_DENTRY_LOCK_INO_START],
263 sizeof(__be64));
264 out += snprintf(buf + out, len - out, "%.*s%08x",
265 OCFS2_DENTRY_LOCK_INO_START - 1, lockname,
266 (unsigned int)be64_to_cpu(inode_blkno_be));
267 } else
268 out += snprintf(buf + out, len - out, "%.*s",
269 locklen, lockname);
270 return out;
271}
272
273static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
274 char *buf, int len)
275{
276 int out = 0;
277 int i = -1;
278
279 while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes)
280 out += snprintf(buf + out, len - out, "%d ", i);
281
282 return out;
283}
284
285static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
286{
287 int out = 0;
288 unsigned int namelen;
289 const char *name;
290 char *mle_type;
291
292 if (mle->type != DLM_MLE_MASTER) {
293 namelen = mle->u.name.len;
294 name = mle->u.name.name;
295 } else {
296 namelen = mle->u.res->lockname.len;
297 name = mle->u.res->lockname.name;
298 }
299
300 if (mle->type == DLM_MLE_BLOCK)
301 mle_type = "BLK";
302 else if (mle->type == DLM_MLE_MASTER)
303 mle_type = "MAS";
304 else
305 mle_type = "MIG";
306
307 out += stringify_lockname(name, namelen, buf + out, len - out);
308 out += snprintf(buf + out, len - out,
309 "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
310 mle_type, mle->master, mle->new_master,
311 !list_empty(&mle->hb_events),
312 !!mle->inuse,
313 atomic_read(&mle->mle_refs.refcount));
314
315 out += snprintf(buf + out, len - out, "Maybe=");
316 out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
317 buf + out, len - out);
318 out += snprintf(buf + out, len - out, "\n");
319
320 out += snprintf(buf + out, len - out, "Vote=");
321 out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES,
322 buf + out, len - out);
323 out += snprintf(buf + out, len - out, "\n");
324
325 out += snprintf(buf + out, len - out, "Response=");
326 out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES,
327 buf + out, len - out);
328 out += snprintf(buf + out, len - out, "\n");
329
330 out += snprintf(buf + out, len - out, "Node=");
331 out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES,
332 buf + out, len - out);
333 out += snprintf(buf + out, len - out, "\n");
334
335 out += snprintf(buf + out, len - out, "\n");
336
337 return out;
338}
339
340void dlm_print_one_mle(struct dlm_master_list_entry *mle)
341{
342 char *buf;
343
344 buf = (char *) get_zeroed_page(GFP_NOFS);
345 if (buf) {
346 dump_mle(mle, buf, PAGE_SIZE - 1);
347 free_page((unsigned long)buf);
348 }
349}
350
351#ifdef CONFIG_DEBUG_FS
352
353static struct dentry *dlm_debugfs_root = NULL;
354
355#define DLM_DEBUGFS_DIR "o2dlm"
356#define DLM_DEBUGFS_DLM_STATE "dlm_state"
357#define DLM_DEBUGFS_LOCKING_STATE "locking_state"
358#define DLM_DEBUGFS_MLE_STATE "mle_state"
359#define DLM_DEBUGFS_PURGE_LIST "purge_list"
360
361/* begin - utils funcs */
362static void dlm_debug_free(struct kref *kref)
363{
364 struct dlm_debug_ctxt *dc;
365
366 dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt);
367
368 kfree(dc);
369}
370
371void dlm_debug_put(struct dlm_debug_ctxt *dc)
372{
373 if (dc)
374 kref_put(&dc->debug_refcnt, dlm_debug_free);
375}
376
377static void dlm_debug_get(struct dlm_debug_ctxt *dc)
378{
379 kref_get(&dc->debug_refcnt);
380}
381
382static struct debug_buffer *debug_buffer_allocate(void)
383{
384 struct debug_buffer *db = NULL;
385
386 db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
387 if (!db)
388 goto bail;
389
390 db->len = PAGE_SIZE;
391 db->buf = kmalloc(db->len, GFP_KERNEL);
392 if (!db->buf)
393 goto bail;
394
395 return db;
396bail:
397 kfree(db);
398 return NULL;
399}
400
401static ssize_t debug_buffer_read(struct file *file, char __user *buf,
402 size_t nbytes, loff_t *ppos)
403{
404 struct debug_buffer *db = file->private_data;
405
406 return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
407}
408
409static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
410{
411 struct debug_buffer *db = file->private_data;
412 loff_t new = -1;
413
414 switch (whence) {
415 case 0:
416 new = off;
417 break;
418 case 1:
419 new = file->f_pos + off;
420 break;
421 }
422
423 if (new < 0 || new > db->len)
424 return -EINVAL;
425
426 return (file->f_pos = new);
427}
428
429static int debug_buffer_release(struct inode *inode, struct file *file)
430{
431 struct debug_buffer *db = (struct debug_buffer *)file->private_data;
432
433 if (db)
434 kfree(db->buf);
435 kfree(db);
436
437 return 0;
438}
439/* end - util funcs */
440
441/* begin - purge list funcs */
442static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
443{
444 struct dlm_lock_resource *res;
445 int out = 0;
446 unsigned long total = 0;
447
448 out += snprintf(db->buf + out, db->len - out,
449 "Dumping Purgelist for Domain: %s\n", dlm->name);
450
451 spin_lock(&dlm->spinlock);
452 list_for_each_entry(res, &dlm->purge_list, purge) {
453 ++total;
454 if (db->len - out < 100)
455 continue;
456 spin_lock(&res->spinlock);
457 out += stringify_lockname(res->lockname.name,
458 res->lockname.len,
459 db->buf + out, db->len - out);
460 out += snprintf(db->buf + out, db->len - out, "\t%ld\n",
461 (jiffies - res->last_used)/HZ);
462 spin_unlock(&res->spinlock);
463 }
464 spin_unlock(&dlm->spinlock);
465
466 out += snprintf(db->buf + out, db->len - out,
467 "Total on list: %ld\n", total);
468
469 return out;
470}
471
472static int debug_purgelist_open(struct inode *inode, struct file *file)
473{
474 struct dlm_ctxt *dlm = inode->i_private;
475 struct debug_buffer *db;
476
477 db = debug_buffer_allocate();
478 if (!db)
479 goto bail;
480
481 db->len = debug_purgelist_print(dlm, db);
482
483 file->private_data = db;
484
485 return 0;
486bail:
487 return -ENOMEM;
488}
489
490static struct file_operations debug_purgelist_fops = {
491 .open = debug_purgelist_open,
492 .release = debug_buffer_release,
493 .read = debug_buffer_read,
494 .llseek = debug_buffer_llseek,
495};
496/* end - purge list funcs */
497
498/* begin - debug mle funcs */
499static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
500{
501 struct dlm_master_list_entry *mle;
502 int out = 0;
503 unsigned long total = 0;
504
505 out += snprintf(db->buf + out, db->len - out,
506 "Dumping MLEs for Domain: %s\n", dlm->name);
507
508 spin_lock(&dlm->master_lock);
509 list_for_each_entry(mle, &dlm->master_list, list) {
510 ++total;
511 if (db->len - out < 200)
512 continue;
513 out += dump_mle(mle, db->buf + out, db->len - out);
514 }
515 spin_unlock(&dlm->master_lock);
516
517 out += snprintf(db->buf + out, db->len - out,
518 "Total on list: %ld\n", total);
519 return out;
520}
521
522static int debug_mle_open(struct inode *inode, struct file *file)
523{
524 struct dlm_ctxt *dlm = inode->i_private;
525 struct debug_buffer *db;
526
527 db = debug_buffer_allocate();
528 if (!db)
529 goto bail;
530
531 db->len = debug_mle_print(dlm, db);
532
533 file->private_data = db;
534
535 return 0;
536bail:
537 return -ENOMEM;
538}
539
540static struct file_operations debug_mle_fops = {
541 .open = debug_mle_open,
542 .release = debug_buffer_release,
543 .read = debug_buffer_read,
544 .llseek = debug_buffer_llseek,
545};
546
547/* end - debug mle funcs */
548
549/* begin - debug lockres funcs */
550static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
551{
552 int out;
553
554#define DEBUG_LOCK_VERSION 1
555 spin_lock(&lock->spinlock);
556 out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
557 "%d,%d,%d,%d\n",
558 DEBUG_LOCK_VERSION,
559 list_type, lock->ml.type, lock->ml.convert_type,
560 lock->ml.node,
561 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
562 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
563 !list_empty(&lock->ast_list),
564 !list_empty(&lock->bast_list),
565 lock->ast_pending, lock->bast_pending,
566 lock->convert_pending, lock->lock_pending,
567 lock->cancel_pending, lock->unlock_pending,
568 atomic_read(&lock->lock_refs.refcount));
569 spin_unlock(&lock->spinlock);
570
571 return out;
572}
573
574static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
575{
576 struct dlm_lock *lock;
577 int i;
578 int out = 0;
579
580 out += snprintf(buf + out, len - out, "NAME:");
581 out += stringify_lockname(res->lockname.name, res->lockname.len,
582 buf + out, len - out);
583 out += snprintf(buf + out, len - out, "\n");
584
585#define DEBUG_LRES_VERSION 1
586 out += snprintf(buf + out, len - out,
587 "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n",
588 DEBUG_LRES_VERSION,
589 res->owner, res->state, res->last_used,
590 !list_empty(&res->purge),
591 !list_empty(&res->dirty),
592 !list_empty(&res->recovering),
593 res->inflight_locks, res->migration_pending,
594 atomic_read(&res->asts_reserved),
595 atomic_read(&res->refs.refcount));
596
597 /* refmap */
598 out += snprintf(buf + out, len - out, "RMAP:");
599 out += stringify_nodemap(res->refmap, O2NM_MAX_NODES,
600 buf + out, len - out);
601 out += snprintf(buf + out, len - out, "\n");
602
603 /* lvb */
604 out += snprintf(buf + out, len - out, "LVBX:");
605 for (i = 0; i < DLM_LVB_LEN; i++)
606 out += snprintf(buf + out, len - out,
607 "%02x", (unsigned char)res->lvb[i]);
608 out += snprintf(buf + out, len - out, "\n");
609
610 /* granted */
611 list_for_each_entry(lock, &res->granted, list)
612 out += dump_lock(lock, 0, buf + out, len - out);
613
614 /* converting */
615 list_for_each_entry(lock, &res->converting, list)
616 out += dump_lock(lock, 1, buf + out, len - out);
617
618 /* blocked */
619 list_for_each_entry(lock, &res->blocked, list)
620 out += dump_lock(lock, 2, buf + out, len - out);
621
622 out += snprintf(buf + out, len - out, "\n");
623
624 return out;
625}
626
627static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
628{
629 struct debug_lockres *dl = m->private;
630 struct dlm_ctxt *dlm = dl->dl_ctxt;
631 struct dlm_lock_resource *res = NULL;
632
633 spin_lock(&dlm->spinlock);
634
635 if (dl->dl_res) {
636 list_for_each_entry(res, &dl->dl_res->tracking, tracking) {
637 if (dl->dl_res) {
638 dlm_lockres_put(dl->dl_res);
639 dl->dl_res = NULL;
640 }
641 if (&res->tracking == &dlm->tracking_list) {
642 mlog(0, "End of list found, %p\n", res);
643 dl = NULL;
644 break;
645 }
646 dlm_lockres_get(res);
647 dl->dl_res = res;
648 break;
649 }
650 } else {
651 if (!list_empty(&dlm->tracking_list)) {
652 list_for_each_entry(res, &dlm->tracking_list, tracking)
653 break;
654 dlm_lockres_get(res);
655 dl->dl_res = res;
656 } else
657 dl = NULL;
658 }
659
660 if (dl) {
661 spin_lock(&dl->dl_res->spinlock);
662 dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
663 spin_unlock(&dl->dl_res->spinlock);
664 }
665
666 spin_unlock(&dlm->spinlock);
667
668 return dl;
669}
670
/*
 * seq_file ->stop: nothing to undo, since lockres_seq_start() releases
 * every lock it takes before returning.
 */
static void lockres_seq_stop(struct seq_file *m, void *v)
{
	return;
}
674
675static void *lockres_seq_next(struct seq_file *m, void *v, loff_t *pos)
676{
677 return NULL;
678}
679
680static int lockres_seq_show(struct seq_file *s, void *v)
681{
682 struct debug_lockres *dl = (struct debug_lockres *)v;
683
684 seq_printf(s, "%s", dl->dl_buf);
685
686 return 0;
687}
688
689static struct seq_operations debug_lockres_ops = {
690 .start = lockres_seq_start,
691 .stop = lockres_seq_stop,
692 .next = lockres_seq_next,
693 .show = lockres_seq_show,
694};
695
696static int debug_lockres_open(struct inode *inode, struct file *file)
697{
698 struct dlm_ctxt *dlm = inode->i_private;
699 int ret = -ENOMEM;
700 struct seq_file *seq;
701 struct debug_lockres *dl = NULL;
702
703 dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL);
704 if (!dl) {
705 mlog_errno(ret);
706 goto bail;
707 }
708
709 dl->dl_len = PAGE_SIZE;
710 dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL);
711 if (!dl->dl_buf) {
712 mlog_errno(ret);
713 goto bail;
714 }
715
716 ret = seq_open(file, &debug_lockres_ops);
717 if (ret) {
718 mlog_errno(ret);
719 goto bail;
720 }
721
722 seq = (struct seq_file *) file->private_data;
723 seq->private = dl;
724
725 dlm_grab(dlm);
726 dl->dl_ctxt = dlm;
727
728 return 0;
729bail:
730 if (dl)
731 kfree(dl->dl_buf);
732 kfree(dl);
733 return ret;
734}
735
736static int debug_lockres_release(struct inode *inode, struct file *file)
737{
738 struct seq_file *seq = (struct seq_file *)file->private_data;
739 struct debug_lockres *dl = (struct debug_lockres *)seq->private;
740
741 if (dl->dl_res)
742 dlm_lockres_put(dl->dl_res);
743 dlm_put(dl->dl_ctxt);
744 kfree(dl->dl_buf);
745 return seq_release_private(inode, file);
746}
747
748static struct file_operations debug_lockres_fops = {
749 .open = debug_lockres_open,
750 .release = debug_lockres_release,
751 .read = seq_read,
752 .llseek = seq_lseek,
753};
754/* end - debug lockres funcs */
755
756/* begin - debug state funcs */
757static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
758{
759 int out = 0;
760 struct dlm_reco_node_data *node;
761 char *state;
762 int lres, rres, ures, tres;
763
764 lres = atomic_read(&dlm->local_resources);
765 rres = atomic_read(&dlm->remote_resources);
766 ures = atomic_read(&dlm->unknown_resources);
767 tres = lres + rres + ures;
768
769 spin_lock(&dlm->spinlock);
770
771 switch (dlm->dlm_state) {
772 case DLM_CTXT_NEW:
773 state = "NEW"; break;
774 case DLM_CTXT_JOINED:
775 state = "JOINED"; break;
776 case DLM_CTXT_IN_SHUTDOWN:
777 state = "SHUTDOWN"; break;
778 case DLM_CTXT_LEAVING:
779 state = "LEAVING"; break;
780 default:
781 state = "UNKNOWN"; break;
782 }
783
784 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */
785 out += snprintf(db->buf + out, db->len - out,
786 "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key);
787
788 /* Thread Pid: xxx Node: xxx State: xxxxx */
789 out += snprintf(db->buf + out, db->len - out,
790 "Thread Pid: %d Node: %d State: %s\n",
791 dlm->dlm_thread_task->pid, dlm->node_num, state);
792
793 /* Number of Joins: xxx Joining Node: xxx */
794 out += snprintf(db->buf + out, db->len - out,
795 "Number of Joins: %d Joining Node: %d\n",
796 dlm->num_joins, dlm->joining_node);
797
798 /* Domain Map: xx xx xx */
799 out += snprintf(db->buf + out, db->len - out, "Domain Map: ");
800 out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
801 db->buf + out, db->len - out);
802 out += snprintf(db->buf + out, db->len - out, "\n");
803
804 /* Live Map: xx xx xx */
805 out += snprintf(db->buf + out, db->len - out, "Live Map: ");
806 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
807 db->buf + out, db->len - out);
808 out += snprintf(db->buf + out, db->len - out, "\n");
809
810 /* Mastered Resources Total: xxx Locally: xxx Remotely: ... */
811 out += snprintf(db->buf + out, db->len - out,
812 "Mastered Resources Total: %d Locally: %d "
813 "Remotely: %d Unknown: %d\n",
814 tres, lres, rres, ures);
815
816 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
817 out += snprintf(db->buf + out, db->len - out,
818 "Lists: Dirty=%s Purge=%s PendingASTs=%s "
819 "PendingBASTs=%s Master=%s\n",
820 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
821 (list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
822 (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
823 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"),
824 (list_empty(&dlm->master_list) ? "Empty" : "InUse"));
825
826 /* Purge Count: xxx Refs: xxx */
827 out += snprintf(db->buf + out, db->len - out,
828 "Purge Count: %d Refs: %d\n", dlm->purge_count,
829 atomic_read(&dlm->dlm_refs.refcount));
830
831 /* Dead Node: xxx */
832 out += snprintf(db->buf + out, db->len - out,
833 "Dead Node: %d\n", dlm->reco.dead_node);
834
835 /* What about DLM_RECO_STATE_FINALIZE? */
836 if (dlm->reco.state == DLM_RECO_STATE_ACTIVE)
837 state = "ACTIVE";
838 else
839 state = "INACTIVE";
840
841 /* Recovery Pid: xxxx Master: xxx State: xxxx */
842 out += snprintf(db->buf + out, db->len - out,
843 "Recovery Pid: %d Master: %d State: %s\n",
844 dlm->dlm_reco_thread_task->pid,
845 dlm->reco.new_master, state);
846
847 /* Recovery Map: xx xx */
848 out += snprintf(db->buf + out, db->len - out, "Recovery Map: ");
849 out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
850 db->buf + out, db->len - out);
851 out += snprintf(db->buf + out, db->len - out, "\n");
852
853 /* Recovery Node State: */
854 out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n");
855 list_for_each_entry(node, &dlm->reco.node_data, list) {
856 switch (node->state) {
857 case DLM_RECO_NODE_DATA_INIT:
858 state = "INIT";
859 break;
860 case DLM_RECO_NODE_DATA_REQUESTING:
861 state = "REQUESTING";
862 break;
863 case DLM_RECO_NODE_DATA_DEAD:
864 state = "DEAD";
865 break;
866 case DLM_RECO_NODE_DATA_RECEIVING:
867 state = "RECEIVING";
868 break;
869 case DLM_RECO_NODE_DATA_REQUESTED:
870 state = "REQUESTED";
871 break;
872 case DLM_RECO_NODE_DATA_DONE:
873 state = "DONE";
874 break;
875 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
876 state = "FINALIZE-SENT";
877 break;
878 default:
879 state = "BAD";
880 break;
881 }
882 out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n",
883 node->node_num, state);
884 }
885
886 spin_unlock(&dlm->spinlock);
887
888 return out;
889}
890
891static int debug_state_open(struct inode *inode, struct file *file)
892{
893 struct dlm_ctxt *dlm = inode->i_private;
894 struct debug_buffer *db = NULL;
895
896 db = debug_buffer_allocate();
897 if (!db)
898 goto bail;
899
900 db->len = debug_state_print(dlm, db);
901
902 file->private_data = db;
903
904 return 0;
905bail:
906 return -ENOMEM;
907}
908
909static struct file_operations debug_state_fops = {
910 .open = debug_state_open,
911 .release = debug_buffer_release,
912 .read = debug_buffer_read,
913 .llseek = debug_buffer_llseek,
914};
915/* end - debug state funcs */
916
917/* files in subroot */
918int dlm_debug_init(struct dlm_ctxt *dlm)
919{
920 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
921
922 /* for dumping dlm_ctxt */
923 dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE,
924 S_IFREG|S_IRUSR,
925 dlm->dlm_debugfs_subroot,
926 dlm, &debug_state_fops);
927 if (!dc->debug_state_dentry) {
928 mlog_errno(-ENOMEM);
929 goto bail;
930 }
931
932 /* for dumping lockres */
933 dc->debug_lockres_dentry =
934 debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE,
935 S_IFREG|S_IRUSR,
936 dlm->dlm_debugfs_subroot,
937 dlm, &debug_lockres_fops);
938 if (!dc->debug_lockres_dentry) {
939 mlog_errno(-ENOMEM);
940 goto bail;
941 }
942
943 /* for dumping mles */
944 dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE,
945 S_IFREG|S_IRUSR,
946 dlm->dlm_debugfs_subroot,
947 dlm, &debug_mle_fops);
948 if (!dc->debug_mle_dentry) {
949 mlog_errno(-ENOMEM);
950 goto bail;
951 }
952
953 /* for dumping lockres on the purge list */
954 dc->debug_purgelist_dentry =
955 debugfs_create_file(DLM_DEBUGFS_PURGE_LIST,
956 S_IFREG|S_IRUSR,
957 dlm->dlm_debugfs_subroot,
958 dlm, &debug_purgelist_fops);
959 if (!dc->debug_purgelist_dentry) {
960 mlog_errno(-ENOMEM);
961 goto bail;
962 }
963
964 dlm_debug_get(dc);
965 return 0;
966
967bail:
968 dlm_debug_shutdown(dlm);
969 return -ENOMEM;
970}
971
972void dlm_debug_shutdown(struct dlm_ctxt *dlm)
973{
974 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
975
976 if (dc) {
977 if (dc->debug_purgelist_dentry)
978 debugfs_remove(dc->debug_purgelist_dentry);
979 if (dc->debug_mle_dentry)
980 debugfs_remove(dc->debug_mle_dentry);
981 if (dc->debug_lockres_dentry)
982 debugfs_remove(dc->debug_lockres_dentry);
983 if (dc->debug_state_dentry)
984 debugfs_remove(dc->debug_state_dentry);
985 dlm_debug_put(dc);
986 }
987}
988
989/* subroot - domain dir */
990int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
991{
992 dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name,
993 dlm_debugfs_root);
994 if (!dlm->dlm_debugfs_subroot) {
995 mlog_errno(-ENOMEM);
996 goto bail;
997 }
998
999 dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt),
1000 GFP_KERNEL);
1001 if (!dlm->dlm_debug_ctxt) {
1002 mlog_errno(-ENOMEM);
1003 goto bail;
1004 }
1005 kref_init(&dlm->dlm_debug_ctxt->debug_refcnt);
1006
1007 return 0;
1008bail:
1009 dlm_destroy_debugfs_subroot(dlm);
1010 return -ENOMEM;
1011}
1012
1013void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
1014{
1015 if (dlm->dlm_debugfs_subroot)
1016 debugfs_remove(dlm->dlm_debugfs_subroot);
1017}
1018
1019/* debugfs root */
1020int dlm_create_debugfs_root(void)
1021{
1022 dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL);
1023 if (!dlm_debugfs_root) {
1024 mlog_errno(-ENOMEM);
1025 return -ENOMEM;
1026 }
1027 return 0;
1028}
1029
1030void dlm_destroy_debugfs_root(void)
1031{
1032 if (dlm_debugfs_root)
1033 debugfs_remove(dlm_debugfs_root);
1034}
1035#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
new file mode 100644
index 000000000000..d34a62a3a625
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -0,0 +1,86 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdebug.h
5 *
6 * Copyright (C) 2008 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 02111-1307, USA.
22 *
23 */
24
25#ifndef DLMDEBUG_H
26#define DLMDEBUG_H
27
28void dlm_print_one_mle(struct dlm_master_list_entry *mle);
29
30#ifdef CONFIG_DEBUG_FS
31
32struct dlm_debug_ctxt {
33 struct kref debug_refcnt;
34 struct dentry *debug_state_dentry;
35 struct dentry *debug_lockres_dentry;
36 struct dentry *debug_mle_dentry;
37 struct dentry *debug_purgelist_dentry;
38};
39
/* A text buffer and its length, attached to an open debugfs file. */
struct debug_buffer {
	int len;	/* length in bytes */
	char *buf;	/* text */
};
44
/* Per-open state of the lock resource debugfs file. */
struct debug_lockres {
	int dl_len;				/* size of dl_buf in bytes */
	char *dl_buf;				/* rendered text of dl_res */
	struct dlm_ctxt *dl_ctxt;		/* domain being dumped */
	struct dlm_lock_resource *dl_res;	/* current position, or NULL */
};
51
52int dlm_debug_init(struct dlm_ctxt *dlm);
53void dlm_debug_shutdown(struct dlm_ctxt *dlm);
54
55int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
56void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
57
58int dlm_create_debugfs_root(void);
59void dlm_destroy_debugfs_root(void);
60
61#else
62
/*
 * No-op stand-ins used when CONFIG_DEBUG_FS is disabled.  The "create"
 * and "init" variants report success so callers need no #ifdefs.
 *
 * They are "static inline" rather than plain "static": a plain static
 * function defined in a header is instantiated in every file that includes
 * it and triggers "defined but not used" warnings wherever it is not
 * called.
 */
static inline int dlm_debug_init(struct dlm_ctxt *dlm)
{
	return 0;
}
static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
{
}
static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
{
	return 0;
}
static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
{
}
static inline int dlm_create_debugfs_root(void)
{
	return 0;
}
static inline void dlm_destroy_debugfs_root(void)
{
}
84
85#endif /* CONFIG_DEBUG_FS */
86#endif /* DLMDEBUG_H */
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 0879d86113e3..63f8125824e8 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -33,6 +33,7 @@
33#include <linux/spinlock.h> 33#include <linux/spinlock.h>
34#include <linux/delay.h> 34#include <linux/delay.h>
35#include <linux/err.h> 35#include <linux/err.h>
36#include <linux/debugfs.h>
36 37
37#include "cluster/heartbeat.h" 38#include "cluster/heartbeat.h"
38#include "cluster/nodemanager.h" 39#include "cluster/nodemanager.h"
@@ -40,8 +41,8 @@
40 41
41#include "dlmapi.h" 42#include "dlmapi.h"
42#include "dlmcommon.h" 43#include "dlmcommon.h"
43
44#include "dlmdomain.h" 44#include "dlmdomain.h"
45#include "dlmdebug.h"
45 46
46#include "dlmver.h" 47#include "dlmver.h"
47 48
@@ -298,6 +299,8 @@ static int dlm_wait_on_domain_helper(const char *domain)
298 299
299static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) 300static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
300{ 301{
302 dlm_destroy_debugfs_subroot(dlm);
303
301 if (dlm->lockres_hash) 304 if (dlm->lockres_hash)
302 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); 305 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
303 306
@@ -395,6 +398,7 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
395static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) 398static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
396{ 399{
397 dlm_unregister_domain_handlers(dlm); 400 dlm_unregister_domain_handlers(dlm);
401 dlm_debug_shutdown(dlm);
398 dlm_complete_thread(dlm); 402 dlm_complete_thread(dlm);
399 dlm_complete_recovery_thread(dlm); 403 dlm_complete_recovery_thread(dlm);
400 dlm_destroy_dlm_worker(dlm); 404 dlm_destroy_dlm_worker(dlm);
@@ -644,6 +648,7 @@ int dlm_shutting_down(struct dlm_ctxt *dlm)
644void dlm_unregister_domain(struct dlm_ctxt *dlm) 648void dlm_unregister_domain(struct dlm_ctxt *dlm)
645{ 649{
646 int leave = 0; 650 int leave = 0;
651 struct dlm_lock_resource *res;
647 652
648 spin_lock(&dlm_domain_lock); 653 spin_lock(&dlm_domain_lock);
649 BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); 654 BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
@@ -673,6 +678,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
673 msleep(500); 678 msleep(500);
674 mlog(0, "%s: more migration to do\n", dlm->name); 679 mlog(0, "%s: more migration to do\n", dlm->name);
675 } 680 }
681
682 /* This list should be empty. If not, print remaining lockres */
683 if (!list_empty(&dlm->tracking_list)) {
684 mlog(ML_ERROR, "Following lockres' are still on the "
685 "tracking list:\n");
686 list_for_each_entry(res, &dlm->tracking_list, tracking)
687 dlm_print_one_lock_resource(res);
688 }
689
676 dlm_mark_domain_leaving(dlm); 690 dlm_mark_domain_leaving(dlm);
677 dlm_leave_domain(dlm); 691 dlm_leave_domain(dlm);
678 dlm_complete_dlm_shutdown(dlm); 692 dlm_complete_dlm_shutdown(dlm);
@@ -1405,6 +1419,12 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
1405 goto bail; 1419 goto bail;
1406 } 1420 }
1407 1421
1422 status = dlm_debug_init(dlm);
1423 if (status < 0) {
1424 mlog_errno(status);
1425 goto bail;
1426 }
1427
1408 status = dlm_launch_thread(dlm); 1428 status = dlm_launch_thread(dlm);
1409 if (status < 0) { 1429 if (status < 0) {
1410 mlog_errno(status); 1430 mlog_errno(status);
@@ -1472,6 +1492,7 @@ bail:
1472 1492
1473 if (status) { 1493 if (status) {
1474 dlm_unregister_domain_handlers(dlm); 1494 dlm_unregister_domain_handlers(dlm);
1495 dlm_debug_shutdown(dlm);
1475 dlm_complete_thread(dlm); 1496 dlm_complete_thread(dlm);
1476 dlm_complete_recovery_thread(dlm); 1497 dlm_complete_recovery_thread(dlm);
1477 dlm_destroy_dlm_worker(dlm); 1498 dlm_destroy_dlm_worker(dlm);
@@ -1484,6 +1505,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1484 u32 key) 1505 u32 key)
1485{ 1506{
1486 int i; 1507 int i;
1508 int ret;
1487 struct dlm_ctxt *dlm = NULL; 1509 struct dlm_ctxt *dlm = NULL;
1488 1510
1489 dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); 1511 dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
@@ -1516,6 +1538,15 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1516 dlm->key = key; 1538 dlm->key = key;
1517 dlm->node_num = o2nm_this_node(); 1539 dlm->node_num = o2nm_this_node();
1518 1540
1541 ret = dlm_create_debugfs_subroot(dlm);
1542 if (ret < 0) {
1543 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
1544 kfree(dlm->name);
1545 kfree(dlm);
1546 dlm = NULL;
1547 goto leave;
1548 }
1549
1519 spin_lock_init(&dlm->spinlock); 1550 spin_lock_init(&dlm->spinlock);
1520 spin_lock_init(&dlm->master_lock); 1551 spin_lock_init(&dlm->master_lock);
1521 spin_lock_init(&dlm->ast_lock); 1552 spin_lock_init(&dlm->ast_lock);
@@ -1526,6 +1557,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1526 INIT_LIST_HEAD(&dlm->reco.node_data); 1557 INIT_LIST_HEAD(&dlm->reco.node_data);
1527 INIT_LIST_HEAD(&dlm->purge_list); 1558 INIT_LIST_HEAD(&dlm->purge_list);
1528 INIT_LIST_HEAD(&dlm->dlm_domain_handlers); 1559 INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
1560 INIT_LIST_HEAD(&dlm->tracking_list);
1529 dlm->reco.state = 0; 1561 dlm->reco.state = 0;
1530 1562
1531 INIT_LIST_HEAD(&dlm->pending_asts); 1563 INIT_LIST_HEAD(&dlm->pending_asts);
@@ -1816,21 +1848,49 @@ static int __init dlm_init(void)
1816 dlm_print_version(); 1848 dlm_print_version();
1817 1849
1818 status = dlm_init_mle_cache(); 1850 status = dlm_init_mle_cache();
1819 if (status) 1851 if (status) {
1820 return -1; 1852 mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
1853 goto error;
1854 }
1855
1856 status = dlm_init_master_caches();
1857 if (status) {
1858 mlog(ML_ERROR, "Could not create o2dlm_lockres and "
1859 "o2dlm_lockname slabcaches\n");
1860 goto error;
1861 }
1862
1863 status = dlm_init_lock_cache();
1864 if (status) {
1865 mlog(ML_ERROR, "Count not create o2dlm_lock slabcache\n");
1866 goto error;
1867 }
1821 1868
1822 status = dlm_register_net_handlers(); 1869 status = dlm_register_net_handlers();
1823 if (status) { 1870 if (status) {
1824 dlm_destroy_mle_cache(); 1871 mlog(ML_ERROR, "Unable to register network handlers\n");
1825 return -1; 1872 goto error;
1826 } 1873 }
1827 1874
1875 status = dlm_create_debugfs_root();
1876 if (status)
1877 goto error;
1878
1828 return 0; 1879 return 0;
1880error:
1881 dlm_unregister_net_handlers();
1882 dlm_destroy_lock_cache();
1883 dlm_destroy_master_caches();
1884 dlm_destroy_mle_cache();
1885 return -1;
1829} 1886}
1830 1887
1831static void __exit dlm_exit (void) 1888static void __exit dlm_exit (void)
1832{ 1889{
1890 dlm_destroy_debugfs_root();
1833 dlm_unregister_net_handlers(); 1891 dlm_unregister_net_handlers();
1892 dlm_destroy_lock_cache();
1893 dlm_destroy_master_caches();
1834 dlm_destroy_mle_cache(); 1894 dlm_destroy_mle_cache();
1835} 1895}
1836 1896
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 52578d907d9a..83a9f2972ac8 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -53,6 +53,8 @@
53#define MLOG_MASK_PREFIX ML_DLM 53#define MLOG_MASK_PREFIX ML_DLM
54#include "cluster/masklog.h" 54#include "cluster/masklog.h"
55 55
56static struct kmem_cache *dlm_lock_cache = NULL;
57
56static DEFINE_SPINLOCK(dlm_cookie_lock); 58static DEFINE_SPINLOCK(dlm_cookie_lock);
57static u64 dlm_next_cookie = 1; 59static u64 dlm_next_cookie = 1;
58 60
@@ -64,6 +66,22 @@ static void dlm_init_lock(struct dlm_lock *newlock, int type,
64static void dlm_lock_release(struct kref *kref); 66static void dlm_lock_release(struct kref *kref);
65static void dlm_lock_detach_lockres(struct dlm_lock *lock); 67static void dlm_lock_detach_lockres(struct dlm_lock *lock);
66 68
69int dlm_init_lock_cache(void)
70{
71 dlm_lock_cache = kmem_cache_create("o2dlm_lock",
72 sizeof(struct dlm_lock),
73 0, SLAB_HWCACHE_ALIGN, NULL);
74 if (dlm_lock_cache == NULL)
75 return -ENOMEM;
76 return 0;
77}
78
79void dlm_destroy_lock_cache(void)
80{
81 if (dlm_lock_cache)
82 kmem_cache_destroy(dlm_lock_cache);
83}
84
67/* Tell us whether we can grant a new lock request. 85/* Tell us whether we can grant a new lock request.
68 * locking: 86 * locking:
69 * caller needs: res->spinlock 87 * caller needs: res->spinlock
@@ -353,7 +371,7 @@ static void dlm_lock_release(struct kref *kref)
353 mlog(0, "freeing kernel-allocated lksb\n"); 371 mlog(0, "freeing kernel-allocated lksb\n");
354 kfree(lock->lksb); 372 kfree(lock->lksb);
355 } 373 }
356 kfree(lock); 374 kmem_cache_free(dlm_lock_cache, lock);
357} 375}
358 376
359/* associate a lock with it's lockres, getting a ref on the lockres */ 377/* associate a lock with it's lockres, getting a ref on the lockres */
@@ -412,7 +430,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
412 struct dlm_lock *lock; 430 struct dlm_lock *lock;
413 int kernel_allocated = 0; 431 int kernel_allocated = 0;
414 432
415 lock = kzalloc(sizeof(*lock), GFP_NOFS); 433 lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
416 if (!lock) 434 if (!lock)
417 return NULL; 435 return NULL;
418 436
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ea6b89577860..efc015c6128a 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -48,47 +48,11 @@
48#include "dlmapi.h" 48#include "dlmapi.h"
49#include "dlmcommon.h" 49#include "dlmcommon.h"
50#include "dlmdomain.h" 50#include "dlmdomain.h"
51#include "dlmdebug.h"
51 52
52#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) 53#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
53#include "cluster/masklog.h" 54#include "cluster/masklog.h"
54 55
55enum dlm_mle_type {
56 DLM_MLE_BLOCK,
57 DLM_MLE_MASTER,
58 DLM_MLE_MIGRATION
59};
60
61struct dlm_lock_name
62{
63 u8 len;
64 u8 name[DLM_LOCKID_NAME_MAX];
65};
66
67struct dlm_master_list_entry
68{
69 struct list_head list;
70 struct list_head hb_events;
71 struct dlm_ctxt *dlm;
72 spinlock_t spinlock;
73 wait_queue_head_t wq;
74 atomic_t woken;
75 struct kref mle_refs;
76 int inuse;
77 unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
78 unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
79 unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
80 unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
81 u8 master;
82 u8 new_master;
83 enum dlm_mle_type type;
84 struct o2hb_callback_func mle_hb_up;
85 struct o2hb_callback_func mle_hb_down;
86 union {
87 struct dlm_lock_resource *res;
88 struct dlm_lock_name name;
89 } u;
90};
91
92static void dlm_mle_node_down(struct dlm_ctxt *dlm, 56static void dlm_mle_node_down(struct dlm_ctxt *dlm,
93 struct dlm_master_list_entry *mle, 57 struct dlm_master_list_entry *mle,
94 struct o2nm_node *node, 58 struct o2nm_node *node,
@@ -128,98 +92,10 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
128 return 1; 92 return 1;
129} 93}
130 94
131#define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m) 95static struct kmem_cache *dlm_lockres_cache = NULL;
132static void _dlm_print_nodemap(unsigned long *map, const char *mapname) 96static struct kmem_cache *dlm_lockname_cache = NULL;
133{
134 int i;
135 printk("%s=[ ", mapname);
136 for (i=0; i<O2NM_MAX_NODES; i++)
137 if (test_bit(i, map))
138 printk("%d ", i);
139 printk("]");
140}
141
142static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
143{
144 int refs;
145 char *type;
146 char attached;
147 u8 master;
148 unsigned int namelen;
149 const char *name;
150 struct kref *k;
151 unsigned long *maybe = mle->maybe_map,
152 *vote = mle->vote_map,
153 *resp = mle->response_map,
154 *node = mle->node_map;
155
156 k = &mle->mle_refs;
157 if (mle->type == DLM_MLE_BLOCK)
158 type = "BLK";
159 else if (mle->type == DLM_MLE_MASTER)
160 type = "MAS";
161 else
162 type = "MIG";
163 refs = atomic_read(&k->refcount);
164 master = mle->master;
165 attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');
166
167 if (mle->type != DLM_MLE_MASTER) {
168 namelen = mle->u.name.len;
169 name = mle->u.name.name;
170 } else {
171 namelen = mle->u.res->lockname.len;
172 name = mle->u.res->lockname.name;
173 }
174
175 mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ",
176 namelen, name, type, refs, master, mle->new_master, attached,
177 mle->inuse);
178 dlm_print_nodemap(maybe);
179 printk(", ");
180 dlm_print_nodemap(vote);
181 printk(", ");
182 dlm_print_nodemap(resp);
183 printk(", ");
184 dlm_print_nodemap(node);
185 printk(", ");
186 printk("\n");
187}
188
189#if 0
190/* Code here is included but defined out as it aids debugging */
191
192static void dlm_dump_mles(struct dlm_ctxt *dlm)
193{
194 struct dlm_master_list_entry *mle;
195
196 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
197 spin_lock(&dlm->master_lock);
198 list_for_each_entry(mle, &dlm->master_list, list)
199 dlm_print_one_mle(mle);
200 spin_unlock(&dlm->master_lock);
201}
202
203int dlm_dump_all_mles(const char __user *data, unsigned int len)
204{
205 struct dlm_ctxt *dlm;
206
207 spin_lock(&dlm_domain_lock);
208 list_for_each_entry(dlm, &dlm_domains, list) {
209 mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
210 dlm_dump_mles(dlm);
211 }
212 spin_unlock(&dlm_domain_lock);
213 return len;
214}
215EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
216
217#endif /* 0 */
218
219
220static struct kmem_cache *dlm_mle_cache = NULL; 97static struct kmem_cache *dlm_mle_cache = NULL;
221 98
222
223static void dlm_mle_release(struct kref *kref); 99static void dlm_mle_release(struct kref *kref);
224static void dlm_init_mle(struct dlm_master_list_entry *mle, 100static void dlm_init_mle(struct dlm_master_list_entry *mle,
225 enum dlm_mle_type type, 101 enum dlm_mle_type type,
@@ -507,7 +383,7 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm,
507 383
508int dlm_init_mle_cache(void) 384int dlm_init_mle_cache(void)
509{ 385{
510 dlm_mle_cache = kmem_cache_create("dlm_mle_cache", 386 dlm_mle_cache = kmem_cache_create("o2dlm_mle",
511 sizeof(struct dlm_master_list_entry), 387 sizeof(struct dlm_master_list_entry),
512 0, SLAB_HWCACHE_ALIGN, 388 0, SLAB_HWCACHE_ALIGN,
513 NULL); 389 NULL);
@@ -560,6 +436,35 @@ static void dlm_mle_release(struct kref *kref)
560 * LOCK RESOURCE FUNCTIONS 436 * LOCK RESOURCE FUNCTIONS
561 */ 437 */
562 438
439int dlm_init_master_caches(void)
440{
441 dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
442 sizeof(struct dlm_lock_resource),
443 0, SLAB_HWCACHE_ALIGN, NULL);
444 if (!dlm_lockres_cache)
445 goto bail;
446
447 dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
448 DLM_LOCKID_NAME_MAX, 0,
449 SLAB_HWCACHE_ALIGN, NULL);
450 if (!dlm_lockname_cache)
451 goto bail;
452
453 return 0;
454bail:
455 dlm_destroy_master_caches();
456 return -ENOMEM;
457}
458
459void dlm_destroy_master_caches(void)
460{
461 if (dlm_lockname_cache)
462 kmem_cache_destroy(dlm_lockname_cache);
463
464 if (dlm_lockres_cache)
465 kmem_cache_destroy(dlm_lockres_cache);
466}
467
563static void dlm_set_lockres_owner(struct dlm_ctxt *dlm, 468static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
564 struct dlm_lock_resource *res, 469 struct dlm_lock_resource *res,
565 u8 owner) 470 u8 owner)
@@ -610,6 +515,14 @@ static void dlm_lockres_release(struct kref *kref)
610 mlog(0, "destroying lockres %.*s\n", res->lockname.len, 515 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
611 res->lockname.name); 516 res->lockname.name);
612 517
518 if (!list_empty(&res->tracking))
519 list_del_init(&res->tracking);
520 else {
521 mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
522 res->lockname.len, res->lockname.name);
523 dlm_print_one_lock_resource(res);
524 }
525
613 if (!hlist_unhashed(&res->hash_node) || 526 if (!hlist_unhashed(&res->hash_node) ||
614 !list_empty(&res->granted) || 527 !list_empty(&res->granted) ||
615 !list_empty(&res->converting) || 528 !list_empty(&res->converting) ||
@@ -642,9 +555,9 @@ static void dlm_lockres_release(struct kref *kref)
642 BUG_ON(!list_empty(&res->recovering)); 555 BUG_ON(!list_empty(&res->recovering));
643 BUG_ON(!list_empty(&res->purge)); 556 BUG_ON(!list_empty(&res->purge));
644 557
645 kfree(res->lockname.name); 558 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
646 559
647 kfree(res); 560 kmem_cache_free(dlm_lockres_cache, res);
648} 561}
649 562
650void dlm_lockres_put(struct dlm_lock_resource *res) 563void dlm_lockres_put(struct dlm_lock_resource *res)
@@ -677,6 +590,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
677 INIT_LIST_HEAD(&res->dirty); 590 INIT_LIST_HEAD(&res->dirty);
678 INIT_LIST_HEAD(&res->recovering); 591 INIT_LIST_HEAD(&res->recovering);
679 INIT_LIST_HEAD(&res->purge); 592 INIT_LIST_HEAD(&res->purge);
593 INIT_LIST_HEAD(&res->tracking);
680 atomic_set(&res->asts_reserved, 0); 594 atomic_set(&res->asts_reserved, 0);
681 res->migration_pending = 0; 595 res->migration_pending = 0;
682 res->inflight_locks = 0; 596 res->inflight_locks = 0;
@@ -692,6 +606,8 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
692 606
693 res->last_used = 0; 607 res->last_used = 0;
694 608
609 list_add_tail(&res->tracking, &dlm->tracking_list);
610
695 memset(res->lvb, 0, DLM_LVB_LEN); 611 memset(res->lvb, 0, DLM_LVB_LEN);
696 memset(res->refmap, 0, sizeof(res->refmap)); 612 memset(res->refmap, 0, sizeof(res->refmap));
697} 613}
@@ -700,20 +616,28 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
700 const char *name, 616 const char *name,
701 unsigned int namelen) 617 unsigned int namelen)
702{ 618{
703 struct dlm_lock_resource *res; 619 struct dlm_lock_resource *res = NULL;
704 620
705 res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS); 621 res = (struct dlm_lock_resource *)
622 kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
706 if (!res) 623 if (!res)
707 return NULL; 624 goto error;
708 625
709 res->lockname.name = kmalloc(namelen, GFP_NOFS); 626 res->lockname.name = (char *)
710 if (!res->lockname.name) { 627 kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
711 kfree(res); 628 if (!res->lockname.name)
712 return NULL; 629 goto error;
713 }
714 630
715 dlm_init_lockres(dlm, res, name, namelen); 631 dlm_init_lockres(dlm, res, name, namelen);
716 return res; 632 return res;
633
634error:
635 if (res && res->lockname.name)
636 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
637
638 if (res)
639 kmem_cache_free(dlm_lockres_cache, res);
640 return NULL;
717} 641}
718 642
719void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 643void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1f1873bf41fb..394d25a131a5 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -27,18 +27,11 @@
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/crc32.h>
31#include <linux/kthread.h> 30#include <linux/kthread.h>
32#include <linux/pagemap.h> 31#include <linux/pagemap.h>
33#include <linux/debugfs.h> 32#include <linux/debugfs.h>
34#include <linux/seq_file.h> 33#include <linux/seq_file.h>
35 34
36#include <cluster/heartbeat.h>
37#include <cluster/nodemanager.h>
38#include <cluster/tcp.h>
39
40#include <dlm/dlmapi.h>
41
42#define MLOG_MASK_PREFIX ML_DLM_GLUE 35#define MLOG_MASK_PREFIX ML_DLM_GLUE
43#include <cluster/masklog.h> 36#include <cluster/masklog.h>
44 37
@@ -53,6 +46,7 @@
53#include "heartbeat.h" 46#include "heartbeat.h"
54#include "inode.h" 47#include "inode.h"
55#include "journal.h" 48#include "journal.h"
49#include "stackglue.h"
56#include "slot_map.h" 50#include "slot_map.h"
57#include "super.h" 51#include "super.h"
58#include "uptodate.h" 52#include "uptodate.h"
@@ -113,7 +107,8 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
113 unsigned int line, 107 unsigned int line,
114 struct ocfs2_lock_res *lockres) 108 struct ocfs2_lock_res *lockres)
115{ 109{
116 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; 110 struct ocfs2_meta_lvb *lvb =
111 (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
117 112
118 mlog(level, "LVB information for %s (called from %s:%u):\n", 113 mlog(level, "LVB information for %s (called from %s:%u):\n",
119 lockres->l_name, function, line); 114 lockres->l_name, function, line);
@@ -259,31 +254,6 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
259 .flags = 0, 254 .flags = 0,
260}; 255};
261 256
262/*
263 * This is the filesystem locking protocol version.
264 *
265 * Whenever the filesystem does new things with locks (adds or removes a
266 * lock, orders them differently, does different things underneath a lock),
267 * the version must be changed. The protocol is negotiated when joining
268 * the dlm domain. A node may join the domain if its major version is
269 * identical to all other nodes and its minor version is greater than
270 * or equal to all other nodes. When its minor version is greater than
271 * the other nodes, it will run at the minor version specified by the
272 * other nodes.
273 *
274 * If a locking change is made that will not be compatible with older
275 * versions, the major number must be increased and the minor version set
276 * to zero. If a change merely adds a behavior that can be disabled when
277 * speaking to older versions, the minor version must be increased. If a
278 * change adds a fully backwards compatible change (eg, LVB changes that
279 * are just ignored by older versions), the version does not need to be
280 * updated.
281 */
282const struct dlm_protocol_version ocfs2_locking_protocol = {
283 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
284 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
285};
286
287static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 257static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
288{ 258{
289 return lockres->l_type == OCFS2_LOCK_TYPE_META || 259 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -316,7 +286,7 @@ static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *l
316static int ocfs2_lock_create(struct ocfs2_super *osb, 286static int ocfs2_lock_create(struct ocfs2_super *osb,
317 struct ocfs2_lock_res *lockres, 287 struct ocfs2_lock_res *lockres,
318 int level, 288 int level,
319 int dlm_flags); 289 u32 dlm_flags);
320static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, 290static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
321 int wanted); 291 int wanted);
322static void ocfs2_cluster_unlock(struct ocfs2_super *osb, 292static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
@@ -330,10 +300,9 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
330 struct ocfs2_lock_res *lockres); 300 struct ocfs2_lock_res *lockres);
331static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 301static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
332 int convert); 302 int convert);
333#define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \ 303#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \
334 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ 304 mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
335 "resource %s: %s\n", dlm_errname(_stat), _func, \ 305 _err, _func, _lockres->l_name); \
336 _lockres->l_name, dlm_errmsg(_stat)); \
337} while (0) 306} while (0)
338static int ocfs2_downconvert_thread(void *arg); 307static int ocfs2_downconvert_thread(void *arg);
339static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, 308static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
@@ -342,12 +311,13 @@ static int ocfs2_inode_lock_update(struct inode *inode,
342 struct buffer_head **bh); 311 struct buffer_head **bh);
343static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); 312static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
344static inline int ocfs2_highest_compat_lock_level(int level); 313static inline int ocfs2_highest_compat_lock_level(int level);
345static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, 314static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
346 int new_level); 315 int new_level);
347static int ocfs2_downconvert_lock(struct ocfs2_super *osb, 316static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
348 struct ocfs2_lock_res *lockres, 317 struct ocfs2_lock_res *lockres,
349 int new_level, 318 int new_level,
350 int lvb); 319 int lvb,
320 unsigned int generation);
351static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, 321static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
352 struct ocfs2_lock_res *lockres); 322 struct ocfs2_lock_res *lockres);
353static int ocfs2_cancel_convert(struct ocfs2_super *osb, 323static int ocfs2_cancel_convert(struct ocfs2_super *osb,
@@ -406,9 +376,9 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
406 res->l_ops = ops; 376 res->l_ops = ops;
407 res->l_priv = priv; 377 res->l_priv = priv;
408 378
409 res->l_level = LKM_IVMODE; 379 res->l_level = DLM_LOCK_IV;
410 res->l_requested = LKM_IVMODE; 380 res->l_requested = DLM_LOCK_IV;
411 res->l_blocking = LKM_IVMODE; 381 res->l_blocking = DLM_LOCK_IV;
412 res->l_action = OCFS2_AST_INVALID; 382 res->l_action = OCFS2_AST_INVALID;
413 res->l_unlock_action = OCFS2_UNLOCK_INVALID; 383 res->l_unlock_action = OCFS2_UNLOCK_INVALID;
414 384
@@ -604,10 +574,10 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
604 BUG_ON(!lockres); 574 BUG_ON(!lockres);
605 575
606 switch(level) { 576 switch(level) {
607 case LKM_EXMODE: 577 case DLM_LOCK_EX:
608 lockres->l_ex_holders++; 578 lockres->l_ex_holders++;
609 break; 579 break;
610 case LKM_PRMODE: 580 case DLM_LOCK_PR:
611 lockres->l_ro_holders++; 581 lockres->l_ro_holders++;
612 break; 582 break;
613 default: 583 default:
@@ -625,11 +595,11 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
625 BUG_ON(!lockres); 595 BUG_ON(!lockres);
626 596
627 switch(level) { 597 switch(level) {
628 case LKM_EXMODE: 598 case DLM_LOCK_EX:
629 BUG_ON(!lockres->l_ex_holders); 599 BUG_ON(!lockres->l_ex_holders);
630 lockres->l_ex_holders--; 600 lockres->l_ex_holders--;
631 break; 601 break;
632 case LKM_PRMODE: 602 case DLM_LOCK_PR:
633 BUG_ON(!lockres->l_ro_holders); 603 BUG_ON(!lockres->l_ro_holders);
634 lockres->l_ro_holders--; 604 lockres->l_ro_holders--;
635 break; 605 break;
@@ -644,12 +614,12 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
644 * lock types are added. */ 614 * lock types are added. */
645static inline int ocfs2_highest_compat_lock_level(int level) 615static inline int ocfs2_highest_compat_lock_level(int level)
646{ 616{
647 int new_level = LKM_EXMODE; 617 int new_level = DLM_LOCK_EX;
648 618
649 if (level == LKM_EXMODE) 619 if (level == DLM_LOCK_EX)
650 new_level = LKM_NLMODE; 620 new_level = DLM_LOCK_NL;
651 else if (level == LKM_PRMODE) 621 else if (level == DLM_LOCK_PR)
652 new_level = LKM_PRMODE; 622 new_level = DLM_LOCK_PR;
653 return new_level; 623 return new_level;
654} 624}
655 625
@@ -688,12 +658,12 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res
688 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); 658 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
689 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); 659 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
690 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); 660 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
691 BUG_ON(lockres->l_blocking <= LKM_NLMODE); 661 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
692 662
693 lockres->l_level = lockres->l_requested; 663 lockres->l_level = lockres->l_requested;
694 if (lockres->l_level <= 664 if (lockres->l_level <=
695 ocfs2_highest_compat_lock_level(lockres->l_blocking)) { 665 ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
696 lockres->l_blocking = LKM_NLMODE; 666 lockres->l_blocking = DLM_LOCK_NL;
697 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); 667 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
698 } 668 }
699 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 669 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
@@ -712,7 +682,7 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
712 * information is already up to data. Convert from NL to 682 * information is already up to data. Convert from NL to
713 * *anything* however should mark ourselves as needing an 683 * *anything* however should mark ourselves as needing an
714 * update */ 684 * update */
715 if (lockres->l_level == LKM_NLMODE && 685 if (lockres->l_level == DLM_LOCK_NL &&
716 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) 686 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
717 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 687 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
718 688
@@ -729,7 +699,7 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
729 BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); 699 BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
730 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); 700 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
731 701
732 if (lockres->l_requested > LKM_NLMODE && 702 if (lockres->l_requested > DLM_LOCK_NL &&
733 !(lockres->l_flags & OCFS2_LOCK_LOCAL) && 703 !(lockres->l_flags & OCFS2_LOCK_LOCAL) &&
734 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) 704 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
735 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 705 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
@@ -767,6 +737,113 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
767 return needs_downconvert; 737 return needs_downconvert;
768} 738}
769 739
740/*
741 * OCFS2_LOCK_PENDING and l_pending_gen.
742 *
743 * Why does OCFS2_LOCK_PENDING exist? To close a race between setting
744 * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock(). See ocfs2_unblock_lock()
745 * for more details on the race.
746 *
747 * OCFS2_LOCK_PENDING closes the race quite nicely. However, it introduces
748 * a race on itself. In o2dlm, we can get the ast before ocfs2_dlm_lock()
749 * returns. The ast clears OCFS2_LOCK_BUSY, and must therefore clear
750 * OCFS2_LOCK_PENDING at the same time. When ocfs2_dlm_lock() returns,
751 * the caller is going to try to clear PENDING again. If nothing else is
752 * happening, __lockres_clear_pending() sees PENDING is unset and does
753 * nothing.
754 *
755 * But what if another path (eg downconvert thread) has just started a
756 * new locking action? The other path has re-set PENDING. Our path
757 * cannot clear PENDING, because that will re-open the original race
758 * window.
759 *
760 * [Example]
761 *
762 * ocfs2_meta_lock()
763 * ocfs2_cluster_lock()
764 * set BUSY
765 * set PENDING
766 * drop l_lock
767 * ocfs2_dlm_lock()
768 * ocfs2_locking_ast() ocfs2_downconvert_thread()
769 * clear PENDING ocfs2_unblock_lock()
770 * take_l_lock
771 * !BUSY
772 * ocfs2_prepare_downconvert()
773 * set BUSY
774 * set PENDING
775 * drop l_lock
776 * take l_lock
777 * clear PENDING
778 * drop l_lock
779 * <window>
780 * ocfs2_dlm_lock()
781 *
782 * So as you can see, we now have a window where l_lock is not held,
783 * PENDING is not set, and ocfs2_dlm_lock() has not been called.
784 *
785 * The core problem is that ocfs2_cluster_lock() has cleared the PENDING
786 * set by ocfs2_prepare_downconvert(). That wasn't nice.
787 *
788 * To solve this we introduce l_pending_gen. A call to
789 * lockres_clear_pending() will only do so when it is passed a generation
790 * number that matches the lockres. lockres_set_pending() will return the
791 * current generation number. When ocfs2_cluster_lock() goes to clear
792 * PENDING, it passes the generation it got from set_pending(). In our
793 * example above, the generation numbers will *not* match. Thus,
794 * ocfs2_cluster_lock() will not clear the PENDING set by
795 * ocfs2_prepare_downconvert().
796 */
797
798/* Unlocked version for ocfs2_locking_ast() */
799static void __lockres_clear_pending(struct ocfs2_lock_res *lockres,
800 unsigned int generation,
801 struct ocfs2_super *osb)
802{
803 assert_spin_locked(&lockres->l_lock);
804
805 /*
806 * The ast and locking functions can race us here. The winner
807 * will clear pending, the loser will not.
808 */
809 if (!(lockres->l_flags & OCFS2_LOCK_PENDING) ||
810 (lockres->l_pending_gen != generation))
811 return;
812
813 lockres_clear_flags(lockres, OCFS2_LOCK_PENDING);
814 lockres->l_pending_gen++;
815
816 /*
817 * The downconvert thread may have skipped us because we
818 * were PENDING. Wake it up.
819 */
820 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
821 ocfs2_wake_downconvert_thread(osb);
822}
823
824/* Locked version for callers of ocfs2_dlm_lock() */
825static void lockres_clear_pending(struct ocfs2_lock_res *lockres,
826 unsigned int generation,
827 struct ocfs2_super *osb)
828{
829 unsigned long flags;
830
831 spin_lock_irqsave(&lockres->l_lock, flags);
832 __lockres_clear_pending(lockres, generation, osb);
833 spin_unlock_irqrestore(&lockres->l_lock, flags);
834}
835
836static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
837{
838 assert_spin_locked(&lockres->l_lock);
839 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
840
841 lockres_or_flags(lockres, OCFS2_LOCK_PENDING);
842
843 return lockres->l_pending_gen;
844}
845
846
770static void ocfs2_blocking_ast(void *opaque, int level) 847static void ocfs2_blocking_ast(void *opaque, int level)
771{ 848{
772 struct ocfs2_lock_res *lockres = opaque; 849 struct ocfs2_lock_res *lockres = opaque;
@@ -774,7 +851,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
774 int needs_downconvert; 851 int needs_downconvert;
775 unsigned long flags; 852 unsigned long flags;
776 853
777 BUG_ON(level <= LKM_NLMODE); 854 BUG_ON(level <= DLM_LOCK_NL);
778 855
779 mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", 856 mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n",
780 lockres->l_name, level, lockres->l_level, 857 lockres->l_name, level, lockres->l_level,
@@ -801,14 +878,22 @@ static void ocfs2_blocking_ast(void *opaque, int level)
801static void ocfs2_locking_ast(void *opaque) 878static void ocfs2_locking_ast(void *opaque)
802{ 879{
803 struct ocfs2_lock_res *lockres = opaque; 880 struct ocfs2_lock_res *lockres = opaque;
804 struct dlm_lockstatus *lksb = &lockres->l_lksb; 881 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
805 unsigned long flags; 882 unsigned long flags;
883 int status;
806 884
807 spin_lock_irqsave(&lockres->l_lock, flags); 885 spin_lock_irqsave(&lockres->l_lock, flags);
808 886
809 if (lksb->status != DLM_NORMAL) { 887 status = ocfs2_dlm_lock_status(&lockres->l_lksb);
810 mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n", 888
811 lockres->l_name, lksb->status); 889 if (status == -EAGAIN) {
890 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
891 goto out;
892 }
893
894 if (status) {
895 mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n",
896 lockres->l_name, status);
812 spin_unlock_irqrestore(&lockres->l_lock, flags); 897 spin_unlock_irqrestore(&lockres->l_lock, flags);
813 return; 898 return;
814 } 899 }
@@ -831,11 +916,23 @@ static void ocfs2_locking_ast(void *opaque)
831 lockres->l_unlock_action); 916 lockres->l_unlock_action);
832 BUG(); 917 BUG();
833 } 918 }
834 919out:
835 /* set it to something invalid so if we get called again we 920 /* set it to something invalid so if we get called again we
836 * can catch it. */ 921 * can catch it. */
837 lockres->l_action = OCFS2_AST_INVALID; 922 lockres->l_action = OCFS2_AST_INVALID;
838 923
924 /* Did we try to cancel this lock? Clear that state */
925 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT)
926 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
927
928 /*
929 * We may have beaten the locking functions here. We certainly
930 * know that dlm_lock() has been called :-)
931 * Because we can't have two lock calls in flight at once, we
932 * can use lockres->l_pending_gen.
933 */
934 __lockres_clear_pending(lockres, lockres->l_pending_gen, osb);
935
839 wake_up(&lockres->l_event); 936 wake_up(&lockres->l_event);
840 spin_unlock_irqrestore(&lockres->l_lock, flags); 937 spin_unlock_irqrestore(&lockres->l_lock, flags);
841} 938}
@@ -865,15 +962,15 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
865static int ocfs2_lock_create(struct ocfs2_super *osb, 962static int ocfs2_lock_create(struct ocfs2_super *osb,
866 struct ocfs2_lock_res *lockres, 963 struct ocfs2_lock_res *lockres,
867 int level, 964 int level,
868 int dlm_flags) 965 u32 dlm_flags)
869{ 966{
870 int ret = 0; 967 int ret = 0;
871 enum dlm_status status = DLM_NORMAL;
872 unsigned long flags; 968 unsigned long flags;
969 unsigned int gen;
873 970
874 mlog_entry_void(); 971 mlog_entry_void();
875 972
876 mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level, 973 mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
877 dlm_flags); 974 dlm_flags);
878 975
879 spin_lock_irqsave(&lockres->l_lock, flags); 976 spin_lock_irqsave(&lockres->l_lock, flags);
@@ -886,24 +983,23 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
886 lockres->l_action = OCFS2_AST_ATTACH; 983 lockres->l_action = OCFS2_AST_ATTACH;
887 lockres->l_requested = level; 984 lockres->l_requested = level;
888 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 985 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
986 gen = lockres_set_pending(lockres);
889 spin_unlock_irqrestore(&lockres->l_lock, flags); 987 spin_unlock_irqrestore(&lockres->l_lock, flags);
890 988
891 status = dlmlock(osb->dlm, 989 ret = ocfs2_dlm_lock(osb->cconn,
892 level, 990 level,
893 &lockres->l_lksb, 991 &lockres->l_lksb,
894 dlm_flags, 992 dlm_flags,
895 lockres->l_name, 993 lockres->l_name,
896 OCFS2_LOCK_ID_MAX_LEN - 1, 994 OCFS2_LOCK_ID_MAX_LEN - 1,
897 ocfs2_locking_ast, 995 lockres);
898 lockres, 996 lockres_clear_pending(lockres, gen, osb);
899 ocfs2_blocking_ast); 997 if (ret) {
900 if (status != DLM_NORMAL) { 998 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
901 ocfs2_log_dlm_error("dlmlock", status, lockres);
902 ret = -EINVAL;
903 ocfs2_recover_from_dlm_error(lockres, 1); 999 ocfs2_recover_from_dlm_error(lockres, 1);
904 } 1000 }
905 1001
906 mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name); 1002 mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
907 1003
908bail: 1004bail:
909 mlog_exit(ret); 1005 mlog_exit(ret);
@@ -1016,21 +1112,22 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
1016static int ocfs2_cluster_lock(struct ocfs2_super *osb, 1112static int ocfs2_cluster_lock(struct ocfs2_super *osb,
1017 struct ocfs2_lock_res *lockres, 1113 struct ocfs2_lock_res *lockres,
1018 int level, 1114 int level,
1019 int lkm_flags, 1115 u32 lkm_flags,
1020 int arg_flags) 1116 int arg_flags)
1021{ 1117{
1022 struct ocfs2_mask_waiter mw; 1118 struct ocfs2_mask_waiter mw;
1023 enum dlm_status status;
1024 int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); 1119 int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
1025 int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ 1120 int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
1026 unsigned long flags; 1121 unsigned long flags;
1122 unsigned int gen;
1123 int noqueue_attempted = 0;
1027 1124
1028 mlog_entry_void(); 1125 mlog_entry_void();
1029 1126
1030 ocfs2_init_mask_waiter(&mw); 1127 ocfs2_init_mask_waiter(&mw);
1031 1128
1032 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) 1129 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
1033 lkm_flags |= LKM_VALBLK; 1130 lkm_flags |= DLM_LKF_VALBLK;
1034 1131
1035again: 1132again:
1036 wait = 0; 1133 wait = 0;
@@ -1068,52 +1165,56 @@ again:
1068 } 1165 }
1069 1166
1070 if (level > lockres->l_level) { 1167 if (level > lockres->l_level) {
1168 if (noqueue_attempted > 0) {
1169 ret = -EAGAIN;
1170 goto unlock;
1171 }
1172 if (lkm_flags & DLM_LKF_NOQUEUE)
1173 noqueue_attempted = 1;
1174
1071 if (lockres->l_action != OCFS2_AST_INVALID) 1175 if (lockres->l_action != OCFS2_AST_INVALID)
1072 mlog(ML_ERROR, "lockres %s has action %u pending\n", 1176 mlog(ML_ERROR, "lockres %s has action %u pending\n",
1073 lockres->l_name, lockres->l_action); 1177 lockres->l_name, lockres->l_action);
1074 1178
1075 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { 1179 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
1076 lockres->l_action = OCFS2_AST_ATTACH; 1180 lockres->l_action = OCFS2_AST_ATTACH;
1077 lkm_flags &= ~LKM_CONVERT; 1181 lkm_flags &= ~DLM_LKF_CONVERT;
1078 } else { 1182 } else {
1079 lockres->l_action = OCFS2_AST_CONVERT; 1183 lockres->l_action = OCFS2_AST_CONVERT;
1080 lkm_flags |= LKM_CONVERT; 1184 lkm_flags |= DLM_LKF_CONVERT;
1081 } 1185 }
1082 1186
1083 lockres->l_requested = level; 1187 lockres->l_requested = level;
1084 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 1188 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1189 gen = lockres_set_pending(lockres);
1085 spin_unlock_irqrestore(&lockres->l_lock, flags); 1190 spin_unlock_irqrestore(&lockres->l_lock, flags);
1086 1191
1087 BUG_ON(level == LKM_IVMODE); 1192 BUG_ON(level == DLM_LOCK_IV);
1088 BUG_ON(level == LKM_NLMODE); 1193 BUG_ON(level == DLM_LOCK_NL);
1089 1194
1090 mlog(0, "lock %s, convert from %d to level = %d\n", 1195 mlog(0, "lock %s, convert from %d to level = %d\n",
1091 lockres->l_name, lockres->l_level, level); 1196 lockres->l_name, lockres->l_level, level);
1092 1197
1093 /* call dlm_lock to upgrade lock now */ 1198 /* call dlm_lock to upgrade lock now */
1094 status = dlmlock(osb->dlm, 1199 ret = ocfs2_dlm_lock(osb->cconn,
1095 level, 1200 level,
1096 &lockres->l_lksb, 1201 &lockres->l_lksb,
1097 lkm_flags, 1202 lkm_flags,
1098 lockres->l_name, 1203 lockres->l_name,
1099 OCFS2_LOCK_ID_MAX_LEN - 1, 1204 OCFS2_LOCK_ID_MAX_LEN - 1,
1100 ocfs2_locking_ast, 1205 lockres);
1101 lockres, 1206 lockres_clear_pending(lockres, gen, osb);
1102 ocfs2_blocking_ast); 1207 if (ret) {
1103 if (status != DLM_NORMAL) { 1208 if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
1104 if ((lkm_flags & LKM_NOQUEUE) && 1209 (ret != -EAGAIN)) {
1105 (status == DLM_NOTQUEUED)) 1210 ocfs2_log_dlm_error("ocfs2_dlm_lock",
1106 ret = -EAGAIN; 1211 ret, lockres);
1107 else {
1108 ocfs2_log_dlm_error("dlmlock", status,
1109 lockres);
1110 ret = -EINVAL;
1111 } 1212 }
1112 ocfs2_recover_from_dlm_error(lockres, 1); 1213 ocfs2_recover_from_dlm_error(lockres, 1);
1113 goto out; 1214 goto out;
1114 } 1215 }
1115 1216
1116 mlog(0, "lock %s, successfull return from dlmlock\n", 1217 mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n",
1117 lockres->l_name); 1218 lockres->l_name);
1118 1219
1119 /* At this point we've gone inside the dlm and need to 1220 /* At this point we've gone inside the dlm and need to
@@ -1177,9 +1278,9 @@ static int ocfs2_create_new_lock(struct ocfs2_super *osb,
1177 int ex, 1278 int ex,
1178 int local) 1279 int local)
1179{ 1280{
1180 int level = ex ? LKM_EXMODE : LKM_PRMODE; 1281 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
1181 unsigned long flags; 1282 unsigned long flags;
1182 int lkm_flags = local ? LKM_LOCAL : 0; 1283 u32 lkm_flags = local ? DLM_LKF_LOCAL : 0;
1183 1284
1184 spin_lock_irqsave(&lockres->l_lock, flags); 1285 spin_lock_irqsave(&lockres->l_lock, flags);
1185 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); 1286 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
@@ -1222,7 +1323,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1222 } 1323 }
1223 1324
1224 /* 1325 /*
1225 * We don't want to use LKM_LOCAL on a meta data lock as they 1326 * We don't want to use DLM_LKF_LOCAL on a meta data lock as they
1226 * don't use a generation in their lock names. 1327 * don't use a generation in their lock names.
1227 */ 1328 */
1228 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0); 1329 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
@@ -1261,7 +1362,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
1261 1362
1262 lockres = &OCFS2_I(inode)->ip_rw_lockres; 1363 lockres = &OCFS2_I(inode)->ip_rw_lockres;
1263 1364
1264 level = write ? LKM_EXMODE : LKM_PRMODE; 1365 level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
1265 1366
1266 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, 1367 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
1267 0); 1368 0);
@@ -1274,7 +1375,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
1274 1375
1275void ocfs2_rw_unlock(struct inode *inode, int write) 1376void ocfs2_rw_unlock(struct inode *inode, int write)
1276{ 1377{
1277 int level = write ? LKM_EXMODE : LKM_PRMODE; 1378 int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
1278 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; 1379 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
1279 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1380 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1280 1381
@@ -1312,7 +1413,7 @@ int ocfs2_open_lock(struct inode *inode)
1312 lockres = &OCFS2_I(inode)->ip_open_lockres; 1413 lockres = &OCFS2_I(inode)->ip_open_lockres;
1313 1414
1314 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, 1415 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1315 LKM_PRMODE, 0, 0); 1416 DLM_LOCK_PR, 0, 0);
1316 if (status < 0) 1417 if (status < 0)
1317 mlog_errno(status); 1418 mlog_errno(status);
1318 1419
@@ -1340,16 +1441,16 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
1340 1441
1341 lockres = &OCFS2_I(inode)->ip_open_lockres; 1442 lockres = &OCFS2_I(inode)->ip_open_lockres;
1342 1443
1343 level = write ? LKM_EXMODE : LKM_PRMODE; 1444 level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
1344 1445
1345 /* 1446 /*
1346 * The file system may already holding a PRMODE/EXMODE open lock. 1447 * The file system may already holding a PRMODE/EXMODE open lock.
1347 * Since we pass LKM_NOQUEUE, the request won't block waiting on 1448 * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
1348 * other nodes and the -EAGAIN will indicate to the caller that 1449 * other nodes and the -EAGAIN will indicate to the caller that
1349 * this inode is still in use. 1450 * this inode is still in use.
1350 */ 1451 */
1351 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, 1452 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1352 level, LKM_NOQUEUE, 0); 1453 level, DLM_LKF_NOQUEUE, 0);
1353 1454
1354out: 1455out:
1355 mlog_exit(status); 1456 mlog_exit(status);
@@ -1374,10 +1475,10 @@ void ocfs2_open_unlock(struct inode *inode)
1374 1475
1375 if(lockres->l_ro_holders) 1476 if(lockres->l_ro_holders)
1376 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, 1477 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1377 LKM_PRMODE); 1478 DLM_LOCK_PR);
1378 if(lockres->l_ex_holders) 1479 if(lockres->l_ex_holders)
1379 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, 1480 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1380 LKM_EXMODE); 1481 DLM_LOCK_EX);
1381 1482
1382out: 1483out:
1383 mlog_exit_void(); 1484 mlog_exit_void();
@@ -1464,7 +1565,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1464 ocfs2_init_mask_waiter(&mw); 1565 ocfs2_init_mask_waiter(&mw);
1465 1566
1466 if ((lockres->l_flags & OCFS2_LOCK_BUSY) || 1567 if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
1467 (lockres->l_level > LKM_NLMODE)) { 1568 (lockres->l_level > DLM_LOCK_NL)) {
1468 mlog(ML_ERROR, 1569 mlog(ML_ERROR,
1469 "File lock \"%s\" has busy or locked state: flags: 0x%lx, " 1570 "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
1470 "level: %u\n", lockres->l_name, lockres->l_flags, 1571 "level: %u\n", lockres->l_name, lockres->l_flags,
@@ -1503,14 +1604,12 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1503 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); 1604 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1504 spin_unlock_irqrestore(&lockres->l_lock, flags); 1605 spin_unlock_irqrestore(&lockres->l_lock, flags);
1505 1606
1506 ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags, 1607 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
1507 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, 1608 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
1508 ocfs2_locking_ast, lockres, ocfs2_blocking_ast); 1609 lockres);
1509 if (ret != DLM_NORMAL) { 1610 if (ret) {
1510 if (trylock && ret == DLM_NOTQUEUED) 1611 if (!trylock || (ret != -EAGAIN)) {
1511 ret = -EAGAIN; 1612 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
1512 else {
1513 ocfs2_log_dlm_error("dlmlock", ret, lockres);
1514 ret = -EINVAL; 1613 ret = -EINVAL;
1515 } 1614 }
1516 1615
@@ -1537,6 +1636,10 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1537 * to just bubble sucess back up to the user. 1636 * to just bubble sucess back up to the user.
1538 */ 1637 */
1539 ret = ocfs2_flock_handle_signal(lockres, level); 1638 ret = ocfs2_flock_handle_signal(lockres, level);
1639 } else if (!ret && (level > lockres->l_level)) {
1640 /* Trylock failed asynchronously */
1641 BUG_ON(!trylock);
1642 ret = -EAGAIN;
1540 } 1643 }
1541 1644
1542out: 1645out:
@@ -1549,6 +1652,7 @@ out:
1549void ocfs2_file_unlock(struct file *file) 1652void ocfs2_file_unlock(struct file *file)
1550{ 1653{
1551 int ret; 1654 int ret;
1655 unsigned int gen;
1552 unsigned long flags; 1656 unsigned long flags;
1553 struct ocfs2_file_private *fp = file->private_data; 1657 struct ocfs2_file_private *fp = file->private_data;
1554 struct ocfs2_lock_res *lockres = &fp->fp_flock; 1658 struct ocfs2_lock_res *lockres = &fp->fp_flock;
@@ -1572,13 +1676,13 @@ void ocfs2_file_unlock(struct file *file)
1572 * Fake a blocking ast for the downconvert code. 1676 * Fake a blocking ast for the downconvert code.
1573 */ 1677 */
1574 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); 1678 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
1575 lockres->l_blocking = LKM_EXMODE; 1679 lockres->l_blocking = DLM_LOCK_EX;
1576 1680
1577 ocfs2_prepare_downconvert(lockres, LKM_NLMODE); 1681 gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
1578 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); 1682 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1579 spin_unlock_irqrestore(&lockres->l_lock, flags); 1683 spin_unlock_irqrestore(&lockres->l_lock, flags);
1580 1684
1581 ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0); 1685 ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen);
1582 if (ret) { 1686 if (ret) {
1583 mlog_errno(ret); 1687 mlog_errno(ret);
1584 return; 1688 return;
@@ -1601,11 +1705,11 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
1601 * condition. */ 1705 * condition. */
1602 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { 1706 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1603 switch(lockres->l_blocking) { 1707 switch(lockres->l_blocking) {
1604 case LKM_EXMODE: 1708 case DLM_LOCK_EX:
1605 if (!lockres->l_ex_holders && !lockres->l_ro_holders) 1709 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
1606 kick = 1; 1710 kick = 1;
1607 break; 1711 break;
1608 case LKM_PRMODE: 1712 case DLM_LOCK_PR:
1609 if (!lockres->l_ex_holders) 1713 if (!lockres->l_ex_holders)
1610 kick = 1; 1714 kick = 1;
1611 break; 1715 break;
@@ -1648,7 +1752,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1648 1752
1649 mlog_entry_void(); 1753 mlog_entry_void();
1650 1754
1651 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; 1755 lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
1652 1756
1653 /* 1757 /*
1654 * Invalidate the LVB of a deleted inode - this way other 1758 * Invalidate the LVB of a deleted inode - this way other
@@ -1700,7 +1804,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1700 1804
1701 mlog_meta_lvb(0, lockres); 1805 mlog_meta_lvb(0, lockres);
1702 1806
1703 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; 1807 lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
1704 1808
1705 /* We're safe here without the lockres lock... */ 1809 /* We're safe here without the lockres lock... */
1706 spin_lock(&oi->ip_lock); 1810 spin_lock(&oi->ip_lock);
@@ -1735,7 +1839,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1735static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, 1839static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
1736 struct ocfs2_lock_res *lockres) 1840 struct ocfs2_lock_res *lockres)
1737{ 1841{
1738 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; 1842 struct ocfs2_meta_lvb *lvb =
1843 (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
1739 1844
1740 if (lvb->lvb_version == OCFS2_LVB_VERSION 1845 if (lvb->lvb_version == OCFS2_LVB_VERSION
1741 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) 1846 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -1923,7 +2028,8 @@ int ocfs2_inode_lock_full(struct inode *inode,
1923 int ex, 2028 int ex,
1924 int arg_flags) 2029 int arg_flags)
1925{ 2030{
1926 int status, level, dlm_flags, acquired; 2031 int status, level, acquired;
2032 u32 dlm_flags;
1927 struct ocfs2_lock_res *lockres = NULL; 2033 struct ocfs2_lock_res *lockres = NULL;
1928 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2034 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1929 struct buffer_head *local_bh = NULL; 2035 struct buffer_head *local_bh = NULL;
@@ -1950,14 +2056,13 @@ int ocfs2_inode_lock_full(struct inode *inode,
1950 goto local; 2056 goto local;
1951 2057
1952 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) 2058 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1953 wait_event(osb->recovery_event, 2059 ocfs2_wait_for_recovery(osb);
1954 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1955 2060
1956 lockres = &OCFS2_I(inode)->ip_inode_lockres; 2061 lockres = &OCFS2_I(inode)->ip_inode_lockres;
1957 level = ex ? LKM_EXMODE : LKM_PRMODE; 2062 level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
1958 dlm_flags = 0; 2063 dlm_flags = 0;
1959 if (arg_flags & OCFS2_META_LOCK_NOQUEUE) 2064 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
1960 dlm_flags |= LKM_NOQUEUE; 2065 dlm_flags |= DLM_LKF_NOQUEUE;
1961 2066
1962 status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags); 2067 status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
1963 if (status < 0) { 2068 if (status < 0) {
@@ -1974,8 +2079,7 @@ int ocfs2_inode_lock_full(struct inode *inode,
1974 * committed to owning this lock so we don't allow signals to 2079 * committed to owning this lock so we don't allow signals to
1975 * abort the operation. */ 2080 * abort the operation. */
1976 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) 2081 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1977 wait_event(osb->recovery_event, 2082 ocfs2_wait_for_recovery(osb);
1978 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1979 2083
1980local: 2084local:
1981 /* 2085 /*
@@ -2109,7 +2213,7 @@ int ocfs2_inode_lock_atime(struct inode *inode,
2109void ocfs2_inode_unlock(struct inode *inode, 2213void ocfs2_inode_unlock(struct inode *inode,
2110 int ex) 2214 int ex)
2111{ 2215{
2112 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2216 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2113 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres; 2217 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
2114 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2218 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2115 2219
@@ -2130,10 +2234,8 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
2130 int ex) 2234 int ex)
2131{ 2235{
2132 int status = 0; 2236 int status = 0;
2133 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2237 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2134 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; 2238 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
2135 struct buffer_head *bh;
2136 struct ocfs2_slot_info *si = osb->slot_info;
2137 2239
2138 mlog_entry_void(); 2240 mlog_entry_void();
2139 2241
@@ -2159,11 +2261,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
2159 goto bail; 2261 goto bail;
2160 } 2262 }
2161 if (status) { 2263 if (status) {
2162 bh = si->si_bh; 2264 status = ocfs2_refresh_slot_info(osb);
2163 status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
2164 si->si_inode);
2165 if (status == 0)
2166 ocfs2_update_slot_info(si);
2167 2265
2168 ocfs2_complete_lock_res_refresh(lockres, status); 2266 ocfs2_complete_lock_res_refresh(lockres, status);
2169 2267
@@ -2178,7 +2276,7 @@ bail:
2178void ocfs2_super_unlock(struct ocfs2_super *osb, 2276void ocfs2_super_unlock(struct ocfs2_super *osb,
2179 int ex) 2277 int ex)
2180{ 2278{
2181 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2279 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2182 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; 2280 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
2183 2281
2184 if (!ocfs2_mount_local(osb)) 2282 if (!ocfs2_mount_local(osb))
@@ -2196,7 +2294,7 @@ int ocfs2_rename_lock(struct ocfs2_super *osb)
2196 if (ocfs2_mount_local(osb)) 2294 if (ocfs2_mount_local(osb))
2197 return 0; 2295 return 0;
2198 2296
2199 status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0); 2297 status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
2200 if (status < 0) 2298 if (status < 0)
2201 mlog_errno(status); 2299 mlog_errno(status);
2202 2300
@@ -2208,13 +2306,13 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
2208 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; 2306 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
2209 2307
2210 if (!ocfs2_mount_local(osb)) 2308 if (!ocfs2_mount_local(osb))
2211 ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); 2309 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
2212} 2310}
2213 2311
2214int ocfs2_dentry_lock(struct dentry *dentry, int ex) 2312int ocfs2_dentry_lock(struct dentry *dentry, int ex)
2215{ 2313{
2216 int ret; 2314 int ret;
2217 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2315 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2218 struct ocfs2_dentry_lock *dl = dentry->d_fsdata; 2316 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
2219 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); 2317 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
2220 2318
@@ -2235,7 +2333,7 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
2235 2333
2236void ocfs2_dentry_unlock(struct dentry *dentry, int ex) 2334void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
2237{ 2335{
2238 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2336 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2239 struct ocfs2_dentry_lock *dl = dentry->d_fsdata; 2337 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
2240 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); 2338 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
2241 2339
@@ -2400,7 +2498,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
2400 lockres->l_blocking); 2498 lockres->l_blocking);
2401 2499
2402 /* Dump the raw LVB */ 2500 /* Dump the raw LVB */
2403 lvb = lockres->l_lksb.lvb; 2501 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
2404 for(i = 0; i < DLM_LVB_LEN; i++) 2502 for(i = 0; i < DLM_LVB_LEN; i++)
2405 seq_printf(m, "0x%x\t", lvb[i]); 2503 seq_printf(m, "0x%x\t", lvb[i]);
2406 2504
@@ -2504,13 +2602,14 @@ static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
2504int ocfs2_dlm_init(struct ocfs2_super *osb) 2602int ocfs2_dlm_init(struct ocfs2_super *osb)
2505{ 2603{
2506 int status = 0; 2604 int status = 0;
2507 u32 dlm_key; 2605 struct ocfs2_cluster_connection *conn = NULL;
2508 struct dlm_ctxt *dlm = NULL;
2509 2606
2510 mlog_entry_void(); 2607 mlog_entry_void();
2511 2608
2512 if (ocfs2_mount_local(osb)) 2609 if (ocfs2_mount_local(osb)) {
2610 osb->node_num = 0;
2513 goto local; 2611 goto local;
2612 }
2514 2613
2515 status = ocfs2_dlm_init_debug(osb); 2614 status = ocfs2_dlm_init_debug(osb);
2516 if (status < 0) { 2615 if (status < 0) {
@@ -2527,26 +2626,31 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2527 goto bail; 2626 goto bail;
2528 } 2627 }
2529 2628
2530 /* used by the dlm code to make message headers unique, each
2531 * node in this domain must agree on this. */
2532 dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
2533
2534 /* for now, uuid == domain */ 2629 /* for now, uuid == domain */
2535 dlm = dlm_register_domain(osb->uuid_str, dlm_key, 2630 status = ocfs2_cluster_connect(osb->osb_cluster_stack,
2536 &osb->osb_locking_proto); 2631 osb->uuid_str,
2537 if (IS_ERR(dlm)) { 2632 strlen(osb->uuid_str),
2538 status = PTR_ERR(dlm); 2633 ocfs2_do_node_down, osb,
2634 &conn);
2635 if (status) {
2539 mlog_errno(status); 2636 mlog_errno(status);
2540 goto bail; 2637 goto bail;
2541 } 2638 }
2542 2639
2543 dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb); 2640 status = ocfs2_cluster_this_node(&osb->node_num);
2641 if (status < 0) {
2642 mlog_errno(status);
2643 mlog(ML_ERROR,
2644 "could not find this host's node number\n");
2645 ocfs2_cluster_disconnect(conn, 0);
2646 goto bail;
2647 }
2544 2648
2545local: 2649local:
2546 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); 2650 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2547 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); 2651 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2548 2652
2549 osb->dlm = dlm; 2653 osb->cconn = conn;
2550 2654
2551 status = 0; 2655 status = 0;
2552bail: 2656bail:
@@ -2560,14 +2664,19 @@ bail:
2560 return status; 2664 return status;
2561} 2665}
2562 2666
2563void ocfs2_dlm_shutdown(struct ocfs2_super *osb) 2667void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
2668 int hangup_pending)
2564{ 2669{
2565 mlog_entry_void(); 2670 mlog_entry_void();
2566 2671
2567 dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
2568
2569 ocfs2_drop_osb_locks(osb); 2672 ocfs2_drop_osb_locks(osb);
2570 2673
2674 /*
2675 * Now that we have dropped all locks and ocfs2_dismount_volume()
2676 * has disabled recovery, the DLM won't be talking to us. It's
2677 * safe to tear things down before disconnecting the cluster.
2678 */
2679
2571 if (osb->dc_task) { 2680 if (osb->dc_task) {
2572 kthread_stop(osb->dc_task); 2681 kthread_stop(osb->dc_task);
2573 osb->dc_task = NULL; 2682 osb->dc_task = NULL;
@@ -2576,15 +2685,15 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2576 ocfs2_lock_res_free(&osb->osb_super_lockres); 2685 ocfs2_lock_res_free(&osb->osb_super_lockres);
2577 ocfs2_lock_res_free(&osb->osb_rename_lockres); 2686 ocfs2_lock_res_free(&osb->osb_rename_lockres);
2578 2687
2579 dlm_unregister_domain(osb->dlm); 2688 ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
2580 osb->dlm = NULL; 2689 osb->cconn = NULL;
2581 2690
2582 ocfs2_dlm_shutdown_debug(osb); 2691 ocfs2_dlm_shutdown_debug(osb);
2583 2692
2584 mlog_exit_void(); 2693 mlog_exit_void();
2585} 2694}
2586 2695
2587static void ocfs2_unlock_ast(void *opaque, enum dlm_status status) 2696static void ocfs2_unlock_ast(void *opaque, int error)
2588{ 2697{
2589 struct ocfs2_lock_res *lockres = opaque; 2698 struct ocfs2_lock_res *lockres = opaque;
2590 unsigned long flags; 2699 unsigned long flags;
@@ -2595,24 +2704,9 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
2595 lockres->l_unlock_action); 2704 lockres->l_unlock_action);
2596 2705
2597 spin_lock_irqsave(&lockres->l_lock, flags); 2706 spin_lock_irqsave(&lockres->l_lock, flags);
2598 /* We tried to cancel a convert request, but it was already 2707 if (error) {
2599 * granted. All we want to do here is clear our unlock 2708 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
2600 * state. The wake_up call done at the bottom is redundant 2709 "unlock_action %d\n", error, lockres->l_name,
2601 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
2602 * hurt anything anyway */
2603 if (status == DLM_CANCELGRANT &&
2604 lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2605 mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
2606
2607 /* We don't clear the busy flag in this case as it
2608 * should have been cleared by the ast which the dlm
2609 * has called. */
2610 goto complete_unlock;
2611 }
2612
2613 if (status != DLM_NORMAL) {
2614 mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
2615 "unlock_action %d\n", status, lockres->l_name,
2616 lockres->l_unlock_action); 2710 lockres->l_unlock_action);
2617 spin_unlock_irqrestore(&lockres->l_lock, flags); 2711 spin_unlock_irqrestore(&lockres->l_lock, flags);
2618 return; 2712 return;
@@ -2624,14 +2718,13 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
2624 lockres->l_action = OCFS2_AST_INVALID; 2718 lockres->l_action = OCFS2_AST_INVALID;
2625 break; 2719 break;
2626 case OCFS2_UNLOCK_DROP_LOCK: 2720 case OCFS2_UNLOCK_DROP_LOCK:
2627 lockres->l_level = LKM_IVMODE; 2721 lockres->l_level = DLM_LOCK_IV;
2628 break; 2722 break;
2629 default: 2723 default:
2630 BUG(); 2724 BUG();
2631 } 2725 }
2632 2726
2633 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 2727 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
2634complete_unlock:
2635 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; 2728 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
2636 spin_unlock_irqrestore(&lockres->l_lock, flags); 2729 spin_unlock_irqrestore(&lockres->l_lock, flags);
2637 2730
@@ -2643,16 +2736,16 @@ complete_unlock:
2643static int ocfs2_drop_lock(struct ocfs2_super *osb, 2736static int ocfs2_drop_lock(struct ocfs2_super *osb,
2644 struct ocfs2_lock_res *lockres) 2737 struct ocfs2_lock_res *lockres)
2645{ 2738{
2646 enum dlm_status status; 2739 int ret;
2647 unsigned long flags; 2740 unsigned long flags;
2648 int lkm_flags = 0; 2741 u32 lkm_flags = 0;
2649 2742
2650 /* We didn't get anywhere near actually using this lockres. */ 2743 /* We didn't get anywhere near actually using this lockres. */
2651 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) 2744 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
2652 goto out; 2745 goto out;
2653 2746
2654 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) 2747 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
2655 lkm_flags |= LKM_VALBLK; 2748 lkm_flags |= DLM_LKF_VALBLK;
2656 2749
2657 spin_lock_irqsave(&lockres->l_lock, flags); 2750 spin_lock_irqsave(&lockres->l_lock, flags);
2658 2751
@@ -2678,7 +2771,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
2678 2771
2679 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { 2772 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
2680 if (lockres->l_flags & OCFS2_LOCK_ATTACHED && 2773 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
2681 lockres->l_level == LKM_EXMODE && 2774 lockres->l_level == DLM_LOCK_EX &&
2682 !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) 2775 !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2683 lockres->l_ops->set_lvb(lockres); 2776 lockres->l_ops->set_lvb(lockres);
2684 } 2777 }
@@ -2707,15 +2800,15 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
2707 2800
2708 mlog(0, "lock %s\n", lockres->l_name); 2801 mlog(0, "lock %s\n", lockres->l_name);
2709 2802
2710 status = dlmunlock(osb->dlm, &lockres->l_lksb, lkm_flags, 2803 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags,
2711 ocfs2_unlock_ast, lockres); 2804 lockres);
2712 if (status != DLM_NORMAL) { 2805 if (ret) {
2713 ocfs2_log_dlm_error("dlmunlock", status, lockres); 2806 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
2714 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); 2807 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
2715 dlm_print_one_lock(lockres->l_lksb.lockid); 2808 ocfs2_dlm_dump_lksb(&lockres->l_lksb);
2716 BUG(); 2809 BUG();
2717 } 2810 }
2718 mlog(0, "lock %s, successfull return from dlmunlock\n", 2811 mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n",
2719 lockres->l_name); 2812 lockres->l_name);
2720 2813
2721 ocfs2_wait_on_busy_lock(lockres); 2814 ocfs2_wait_on_busy_lock(lockres);
@@ -2806,15 +2899,15 @@ int ocfs2_drop_inode_locks(struct inode *inode)
2806 return status; 2899 return status;
2807} 2900}
2808 2901
2809static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, 2902static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
2810 int new_level) 2903 int new_level)
2811{ 2904{
2812 assert_spin_locked(&lockres->l_lock); 2905 assert_spin_locked(&lockres->l_lock);
2813 2906
2814 BUG_ON(lockres->l_blocking <= LKM_NLMODE); 2907 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
2815 2908
2816 if (lockres->l_level <= new_level) { 2909 if (lockres->l_level <= new_level) {
2817 mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n", 2910 mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n",
2818 lockres->l_level, new_level); 2911 lockres->l_level, new_level);
2819 BUG(); 2912 BUG();
2820 } 2913 }
@@ -2825,33 +2918,33 @@ static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
2825 lockres->l_action = OCFS2_AST_DOWNCONVERT; 2918 lockres->l_action = OCFS2_AST_DOWNCONVERT;
2826 lockres->l_requested = new_level; 2919 lockres->l_requested = new_level;
2827 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 2920 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2921 return lockres_set_pending(lockres);
2828} 2922}
2829 2923
2830static int ocfs2_downconvert_lock(struct ocfs2_super *osb, 2924static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
2831 struct ocfs2_lock_res *lockres, 2925 struct ocfs2_lock_res *lockres,
2832 int new_level, 2926 int new_level,
2833 int lvb) 2927 int lvb,
2928 unsigned int generation)
2834{ 2929{
2835 int ret, dlm_flags = LKM_CONVERT; 2930 int ret;
2836 enum dlm_status status; 2931 u32 dlm_flags = DLM_LKF_CONVERT;
2837 2932
2838 mlog_entry_void(); 2933 mlog_entry_void();
2839 2934
2840 if (lvb) 2935 if (lvb)
2841 dlm_flags |= LKM_VALBLK; 2936 dlm_flags |= DLM_LKF_VALBLK;
2842 2937
2843 status = dlmlock(osb->dlm, 2938 ret = ocfs2_dlm_lock(osb->cconn,
2844 new_level, 2939 new_level,
2845 &lockres->l_lksb, 2940 &lockres->l_lksb,
2846 dlm_flags, 2941 dlm_flags,
2847 lockres->l_name, 2942 lockres->l_name,
2848 OCFS2_LOCK_ID_MAX_LEN - 1, 2943 OCFS2_LOCK_ID_MAX_LEN - 1,
2849 ocfs2_locking_ast, 2944 lockres);
2850 lockres, 2945 lockres_clear_pending(lockres, generation, osb);
2851 ocfs2_blocking_ast); 2946 if (ret) {
2852 if (status != DLM_NORMAL) { 2947 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
2853 ocfs2_log_dlm_error("dlmlock", status, lockres);
2854 ret = -EINVAL;
2855 ocfs2_recover_from_dlm_error(lockres, 1); 2948 ocfs2_recover_from_dlm_error(lockres, 1);
2856 goto bail; 2949 goto bail;
2857 } 2950 }
@@ -2862,7 +2955,7 @@ bail:
2862 return ret; 2955 return ret;
2863} 2956}
2864 2957
2865/* returns 1 when the caller should unlock and call dlmunlock */ 2958/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */
2866static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, 2959static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
2867 struct ocfs2_lock_res *lockres) 2960 struct ocfs2_lock_res *lockres)
2868{ 2961{
@@ -2898,24 +2991,18 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
2898 struct ocfs2_lock_res *lockres) 2991 struct ocfs2_lock_res *lockres)
2899{ 2992{
2900 int ret; 2993 int ret;
2901 enum dlm_status status;
2902 2994
2903 mlog_entry_void(); 2995 mlog_entry_void();
2904 mlog(0, "lock %s\n", lockres->l_name); 2996 mlog(0, "lock %s\n", lockres->l_name);
2905 2997
2906 ret = 0; 2998 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
2907 status = dlmunlock(osb->dlm, 2999 DLM_LKF_CANCEL, lockres);
2908 &lockres->l_lksb, 3000 if (ret) {
2909 LKM_CANCEL, 3001 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
2910 ocfs2_unlock_ast,
2911 lockres);
2912 if (status != DLM_NORMAL) {
2913 ocfs2_log_dlm_error("dlmunlock", status, lockres);
2914 ret = -EINVAL;
2915 ocfs2_recover_from_dlm_error(lockres, 0); 3002 ocfs2_recover_from_dlm_error(lockres, 0);
2916 } 3003 }
2917 3004
2918 mlog(0, "lock %s return from dlmunlock\n", lockres->l_name); 3005 mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name);
2919 3006
2920 mlog_exit(ret); 3007 mlog_exit(ret);
2921 return ret; 3008 return ret;
@@ -2930,6 +3017,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
2930 int new_level; 3017 int new_level;
2931 int ret = 0; 3018 int ret = 0;
2932 int set_lvb = 0; 3019 int set_lvb = 0;
3020 unsigned int gen;
2933 3021
2934 mlog_entry_void(); 3022 mlog_entry_void();
2935 3023
@@ -2939,6 +3027,32 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
2939 3027
2940recheck: 3028recheck:
2941 if (lockres->l_flags & OCFS2_LOCK_BUSY) { 3029 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
3030 /* XXX
3031 * This is a *big* race. The OCFS2_LOCK_PENDING flag
3032 * exists entirely for one reason - another thread has set
3033 * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock().
3034 *
3035 * If we do ocfs2_cancel_convert() before the other thread
3036 * calls dlm_lock(), our cancel will do nothing. We will
3037 * get no ast, and we will have no way of knowing the
3038 * cancel failed. Meanwhile, the other thread will call
3039 * into dlm_lock() and wait...forever.
3040 *
3041 * Why forever? Because another node has asked for the
3042 * lock first; that's why we're here in unblock_lock().
3043 *
3044 * The solution is OCFS2_LOCK_PENDING. When PENDING is
3045 * set, we just requeue the unblock. Only when the other
3046 * thread has called dlm_lock() and cleared PENDING will
3047 * we then cancel their request.
3048 *
3049 * All callers of dlm_lock() must set OCFS2_LOCK_PENDING
3050 * at the same time they set OCFS2_LOCK_BUSY. They must
3051 * clear OCFS2_LOCK_PENDING after dlm_lock() returns.
3052 */
3053 if (lockres->l_flags & OCFS2_LOCK_PENDING)
3054 goto leave_requeue;
3055
2942 ctl->requeue = 1; 3056 ctl->requeue = 1;
2943 ret = ocfs2_prepare_cancel_convert(osb, lockres); 3057 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2944 spin_unlock_irqrestore(&lockres->l_lock, flags); 3058 spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -2952,13 +3066,13 @@ recheck:
2952 3066
2953 /* if we're blocking an exclusive and we have *any* holders, 3067 /* if we're blocking an exclusive and we have *any* holders,
2954 * then requeue. */ 3068 * then requeue. */
2955 if ((lockres->l_blocking == LKM_EXMODE) 3069 if ((lockres->l_blocking == DLM_LOCK_EX)
2956 && (lockres->l_ex_holders || lockres->l_ro_holders)) 3070 && (lockres->l_ex_holders || lockres->l_ro_holders))
2957 goto leave_requeue; 3071 goto leave_requeue;
2958 3072
2959 /* If it's a PR we're blocking, then only 3073 /* If it's a PR we're blocking, then only
2960 * requeue if we've got any EX holders */ 3074 * requeue if we've got any EX holders */
2961 if (lockres->l_blocking == LKM_PRMODE && 3075 if (lockres->l_blocking == DLM_LOCK_PR &&
2962 lockres->l_ex_holders) 3076 lockres->l_ex_holders)
2963 goto leave_requeue; 3077 goto leave_requeue;
2964 3078
@@ -3005,7 +3119,7 @@ downconvert:
3005 ctl->requeue = 0; 3119 ctl->requeue = 0;
3006 3120
3007 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { 3121 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
3008 if (lockres->l_level == LKM_EXMODE) 3122 if (lockres->l_level == DLM_LOCK_EX)
3009 set_lvb = 1; 3123 set_lvb = 1;
3010 3124
3011 /* 3125 /*
@@ -3018,9 +3132,11 @@ downconvert:
3018 lockres->l_ops->set_lvb(lockres); 3132 lockres->l_ops->set_lvb(lockres);
3019 } 3133 }
3020 3134
3021 ocfs2_prepare_downconvert(lockres, new_level); 3135 gen = ocfs2_prepare_downconvert(lockres, new_level);
3022 spin_unlock_irqrestore(&lockres->l_lock, flags); 3136 spin_unlock_irqrestore(&lockres->l_lock, flags);
3023 ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb); 3137 ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
3138 gen);
3139
3024leave: 3140leave:
3025 mlog_exit(ret); 3141 mlog_exit(ret);
3026 return ret; 3142 return ret;
@@ -3059,7 +3175,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
3059 (unsigned long long)OCFS2_I(inode)->ip_blkno); 3175 (unsigned long long)OCFS2_I(inode)->ip_blkno);
3060 } 3176 }
3061 sync_mapping_buffers(mapping); 3177 sync_mapping_buffers(mapping);
3062 if (blocking == LKM_EXMODE) { 3178 if (blocking == DLM_LOCK_EX) {
3063 truncate_inode_pages(mapping, 0); 3179 truncate_inode_pages(mapping, 0);
3064 } else { 3180 } else {
3065 /* We only need to wait on the I/O if we're not also 3181 /* We only need to wait on the I/O if we're not also
@@ -3080,8 +3196,8 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
3080 struct inode *inode = ocfs2_lock_res_inode(lockres); 3196 struct inode *inode = ocfs2_lock_res_inode(lockres);
3081 int checkpointed = ocfs2_inode_fully_checkpointed(inode); 3197 int checkpointed = ocfs2_inode_fully_checkpointed(inode);
3082 3198
3083 BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE); 3199 BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
3084 BUG_ON(lockres->l_level != LKM_EXMODE && !checkpointed); 3200 BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
3085 3201
3086 if (checkpointed) 3202 if (checkpointed)
3087 return 1; 3203 return 1;
@@ -3145,7 +3261,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
3145 * valid. The downconvert code will retain a PR for this node, 3261 * valid. The downconvert code will retain a PR for this node,
3146 * so there's no further work to do. 3262 * so there's no further work to do.
3147 */ 3263 */
3148 if (blocking == LKM_PRMODE) 3264 if (blocking == DLM_LOCK_PR)
3149 return UNBLOCK_CONTINUE; 3265 return UNBLOCK_CONTINUE;
3150 3266
3151 /* 3267 /*
@@ -3219,6 +3335,45 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
3219 return UNBLOCK_CONTINUE_POST; 3335 return UNBLOCK_CONTINUE_POST;
3220} 3336}
3221 3337
3338/*
3339 * This is the filesystem locking protocol. It provides the lock handling
3340 * hooks for the underlying DLM. It has a maximum version number.
3341 * The version number allows interoperability with systems running at
3342 * the same major number and an equal or smaller minor number.
3343 *
3344 * Whenever the filesystem does new things with locks (adds or removes a
3345 * lock, orders them differently, does different things underneath a lock),
3346 * the version must be changed. The protocol is negotiated when joining
3347 * the dlm domain. A node may join the domain if its major version is
3348 * identical to all other nodes and its minor version is greater than
3349 * or equal to all other nodes. When its minor version is greater than
3350 * the other nodes, it will run at the minor version specified by the
3351 * other nodes.
3352 *
3353 * If a locking change is made that will not be compatible with older
3354 * versions, the major number must be increased and the minor version set
3355 * to zero. If a change merely adds a behavior that can be disabled when
3356 * speaking to older versions, the minor version must be increased. If a
3357 * change adds a fully backwards compatible change (eg, LVB changes that
3358 * are just ignored by older versions), the version does not need to be
3359 * updated.
3360 */
3361static struct ocfs2_locking_protocol lproto = {
3362 .lp_max_version = {
3363 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
3364 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
3365 },
3366 .lp_lock_ast = ocfs2_locking_ast,
3367 .lp_blocking_ast = ocfs2_blocking_ast,
3368 .lp_unlock_ast = ocfs2_unlock_ast,
3369};
3370
3371void ocfs2_set_locking_protocol(void)
3372{
3373 ocfs2_stack_glue_set_locking_protocol(&lproto);
3374}
3375
3376
3222static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 3377static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3223 struct ocfs2_lock_res *lockres) 3378 struct ocfs2_lock_res *lockres)
3224{ 3379{
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index e3cf902404b4..2bb01f09c1b1 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -58,7 +58,7 @@ struct ocfs2_meta_lvb {
58#define OCFS2_LOCK_NONBLOCK (0x04) 58#define OCFS2_LOCK_NONBLOCK (0x04)
59 59
60int ocfs2_dlm_init(struct ocfs2_super *osb); 60int ocfs2_dlm_init(struct ocfs2_super *osb);
61void ocfs2_dlm_shutdown(struct ocfs2_super *osb); 61void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);
62void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); 62void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
63void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, 63void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
64 enum ocfs2_lock_type type, 64 enum ocfs2_lock_type type,
@@ -114,5 +114,6 @@ void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
114struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); 114struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
115void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); 115void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
116 116
117extern const struct dlm_protocol_version ocfs2_locking_protocol; 117/* To set the locking protocol on module initialization */
118void ocfs2_set_locking_protocol(void);
118#endif /* DLMGLUE_H */ 119#endif /* DLMGLUE_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ed5d5232e85d..9154c82d3258 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2242,7 +2242,7 @@ const struct file_operations ocfs2_fops = {
2242 .open = ocfs2_file_open, 2242 .open = ocfs2_file_open,
2243 .aio_read = ocfs2_file_aio_read, 2243 .aio_read = ocfs2_file_aio_read,
2244 .aio_write = ocfs2_file_aio_write, 2244 .aio_write = ocfs2_file_aio_write,
2245 .ioctl = ocfs2_ioctl, 2245 .unlocked_ioctl = ocfs2_ioctl,
2246#ifdef CONFIG_COMPAT 2246#ifdef CONFIG_COMPAT
2247 .compat_ioctl = ocfs2_compat_ioctl, 2247 .compat_ioctl = ocfs2_compat_ioctl,
2248#endif 2248#endif
@@ -2258,7 +2258,7 @@ const struct file_operations ocfs2_dops = {
2258 .fsync = ocfs2_sync_file, 2258 .fsync = ocfs2_sync_file,
2259 .release = ocfs2_dir_release, 2259 .release = ocfs2_dir_release,
2260 .open = ocfs2_dir_open, 2260 .open = ocfs2_dir_open,
2261 .ioctl = ocfs2_ioctl, 2261 .unlocked_ioctl = ocfs2_ioctl,
2262#ifdef CONFIG_COMPAT 2262#ifdef CONFIG_COMPAT
2263 .compat_ioctl = ocfs2_compat_ioctl, 2263 .compat_ioctl = ocfs2_compat_ioctl,
2264#endif 2264#endif
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 0758daf64da0..c6e7213db868 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -28,9 +28,6 @@
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/kmod.h>
32
33#include <dlm/dlmapi.h>
34 31
35#define MLOG_MASK_PREFIX ML_SUPER 32#define MLOG_MASK_PREFIX ML_SUPER
36#include <cluster/masklog.h> 33#include <cluster/masklog.h>
@@ -48,7 +45,6 @@ static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
48 int bit); 45 int bit);
49static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, 46static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
50 int bit); 47 int bit);
51static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
52 48
53/* special case -1 for now 49/* special case -1 for now
54 * TODO: should *really* make sure the calling func never passes -1!! */ 50 * TODO: should *really* make sure the calling func never passes -1!! */
@@ -62,23 +58,23 @@ static void ocfs2_node_map_init(struct ocfs2_node_map *map)
62void ocfs2_init_node_maps(struct ocfs2_super *osb) 58void ocfs2_init_node_maps(struct ocfs2_super *osb)
63{ 59{
64 spin_lock_init(&osb->node_map_lock); 60 spin_lock_init(&osb->node_map_lock);
65 ocfs2_node_map_init(&osb->recovery_map);
66 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); 61 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
67} 62}
68 63
69static void ocfs2_do_node_down(int node_num, 64void ocfs2_do_node_down(int node_num, void *data)
70 struct ocfs2_super *osb)
71{ 65{
66 struct ocfs2_super *osb = data;
67
72 BUG_ON(osb->node_num == node_num); 68 BUG_ON(osb->node_num == node_num);
73 69
74 mlog(0, "ocfs2: node down event for %d\n", node_num); 70 mlog(0, "ocfs2: node down event for %d\n", node_num);
75 71
76 if (!osb->dlm) { 72 if (!osb->cconn) {
77 /* 73 /*
78 * No DLM means we're not even ready to participate yet. 74 * No cluster connection means we're not even ready to
79 * We check the slots after the DLM comes up, so we will 75 * participate yet. We check the slots after the cluster
80 * notice the node death then. We can safely ignore it 76 * comes up, so we will notice the node death then. We
81 * here. 77 * can safely ignore it here.
82 */ 78 */
83 return; 79 return;
84 } 80 }
@@ -86,61 +82,6 @@ static void ocfs2_do_node_down(int node_num,
86 ocfs2_recovery_thread(osb, node_num); 82 ocfs2_recovery_thread(osb, node_num);
87} 83}
88 84
89/* Called from the dlm when it's about to evict a node. We may also
90 * get a heartbeat callback later. */
91static void ocfs2_dlm_eviction_cb(int node_num,
92 void *data)
93{
94 struct ocfs2_super *osb = (struct ocfs2_super *) data;
95 struct super_block *sb = osb->sb;
96
97 mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
98 MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
99
100 ocfs2_do_node_down(node_num, osb);
101}
102
103void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
104{
105 /* Not exactly a heartbeat callback, but leads to essentially
106 * the same path so we set it up here. */
107 dlm_setup_eviction_cb(&osb->osb_eviction_cb,
108 ocfs2_dlm_eviction_cb,
109 osb);
110}
111
112void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
113{
114 int ret;
115 char *argv[5], *envp[3];
116
117 if (ocfs2_mount_local(osb))
118 return;
119
120 if (!osb->uuid_str) {
121 /* This can happen if we don't get far enough in mount... */
122 mlog(0, "No UUID with which to stop heartbeat!\n\n");
123 return;
124 }
125
126 argv[0] = (char *)o2nm_get_hb_ctl_path();
127 argv[1] = "-K";
128 argv[2] = "-u";
129 argv[3] = osb->uuid_str;
130 argv[4] = NULL;
131
132 mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
133
134 /* minimal command environment taken from cpu_run_sbin_hotplug */
135 envp[0] = "HOME=/";
136 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
137 envp[2] = NULL;
138
139 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
140 if (ret < 0)
141 mlog_errno(ret);
142}
143
144static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, 85static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
145 int bit) 86 int bit)
146{ 87{
@@ -192,112 +133,3 @@ int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
192 return ret; 133 return ret;
193} 134}
194 135
195static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
196{
197 int bit;
198 bit = find_next_bit(map->map, map->num_nodes, 0);
199 if (bit < map->num_nodes)
200 return 0;
201 return 1;
202}
203
204int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
205 struct ocfs2_node_map *map)
206{
207 int ret;
208 BUG_ON(map->num_nodes == 0);
209 spin_lock(&osb->node_map_lock);
210 ret = __ocfs2_node_map_is_empty(map);
211 spin_unlock(&osb->node_map_lock);
212 return ret;
213}
214
215#if 0
216
217static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
218 struct ocfs2_node_map *from)
219{
220 BUG_ON(from->num_nodes == 0);
221 ocfs2_node_map_init(target);
222 __ocfs2_node_map_set(target, from);
223}
224
225/* returns 1 if bit is the only bit set in target, 0 otherwise */
226int ocfs2_node_map_is_only(struct ocfs2_super *osb,
227 struct ocfs2_node_map *target,
228 int bit)
229{
230 struct ocfs2_node_map temp;
231 int ret;
232
233 spin_lock(&osb->node_map_lock);
234 __ocfs2_node_map_dup(&temp, target);
235 __ocfs2_node_map_clear_bit(&temp, bit);
236 ret = __ocfs2_node_map_is_empty(&temp);
237 spin_unlock(&osb->node_map_lock);
238
239 return ret;
240}
241
242static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
243 struct ocfs2_node_map *from)
244{
245 int num_longs, i;
246
247 BUG_ON(target->num_nodes != from->num_nodes);
248 BUG_ON(target->num_nodes == 0);
249
250 num_longs = BITS_TO_LONGS(target->num_nodes);
251 for (i = 0; i < num_longs; i++)
252 target->map[i] = from->map[i];
253}
254
255#endif /* 0 */
256
257/* Returns whether the recovery bit was actually set - it may not be
258 * if a node is still marked as needing recovery */
259int ocfs2_recovery_map_set(struct ocfs2_super *osb,
260 int num)
261{
262 int set = 0;
263
264 spin_lock(&osb->node_map_lock);
265
266 if (!test_bit(num, osb->recovery_map.map)) {
267 __ocfs2_node_map_set_bit(&osb->recovery_map, num);
268 set = 1;
269 }
270
271 spin_unlock(&osb->node_map_lock);
272
273 return set;
274}
275
276void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
277 int num)
278{
279 ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
280}
281
282int ocfs2_node_map_iterate(struct ocfs2_super *osb,
283 struct ocfs2_node_map *map,
284 int idx)
285{
286 int i = idx;
287
288 idx = O2NM_INVALID_NODE_NUM;
289 spin_lock(&osb->node_map_lock);
290 if ((i != O2NM_INVALID_NODE_NUM) &&
291 (i >= 0) &&
292 (i < map->num_nodes)) {
293 while(i < map->num_nodes) {
294 if (test_bit(i, map->map)) {
295 idx = i;
296 break;
297 }
298 i++;
299 }
300 }
301 spin_unlock(&osb->node_map_lock);
302 return idx;
303}
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index eac63aed7611..74b9c5dda28d 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -28,13 +28,10 @@
28 28
29void ocfs2_init_node_maps(struct ocfs2_super *osb); 29void ocfs2_init_node_maps(struct ocfs2_super *osb);
30 30
31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); 31void ocfs2_do_node_down(int node_num, void *data);
32void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
33 32
34/* node map functions - used to keep track of mounted and in-recovery 33/* node map functions - used to keep track of mounted and in-recovery
35 * nodes. */ 34 * nodes. */
36int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
37 struct ocfs2_node_map *map);
38void ocfs2_node_map_set_bit(struct ocfs2_super *osb, 35void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
39 struct ocfs2_node_map *map, 36 struct ocfs2_node_map *map,
40 int bit); 37 int bit);
@@ -44,17 +41,5 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
44int ocfs2_node_map_test_bit(struct ocfs2_super *osb, 41int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
45 struct ocfs2_node_map *map, 42 struct ocfs2_node_map *map,
46 int bit); 43 int bit);
47int ocfs2_node_map_iterate(struct ocfs2_super *osb,
48 struct ocfs2_node_map *map,
49 int idx);
50static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
51 struct ocfs2_node_map *map)
52{
53 return ocfs2_node_map_iterate(osb, map, 0);
54}
55int ocfs2_recovery_map_set(struct ocfs2_super *osb,
56 int num);
57void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
58 int num);
59 44
60#endif /* OCFS2_HEARTBEAT_H */ 45#endif /* OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 5177fba5162b..b413166dd163 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/smp_lock.h>
10 11
11#define MLOG_MASK_PREFIX ML_INODE 12#define MLOG_MASK_PREFIX ML_INODE
12#include <cluster/masklog.h> 13#include <cluster/masklog.h>
@@ -112,9 +113,9 @@ bail:
112 return status; 113 return status;
113} 114}
114 115
115int ocfs2_ioctl(struct inode * inode, struct file * filp, 116long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
116 unsigned int cmd, unsigned long arg)
117{ 117{
118 struct inode *inode = filp->f_path.dentry->d_inode;
118 unsigned int flags; 119 unsigned int flags;
119 int new_clusters; 120 int new_clusters;
120 int status; 121 int status;
@@ -168,9 +169,6 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
168#ifdef CONFIG_COMPAT 169#ifdef CONFIG_COMPAT
169long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) 170long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
170{ 171{
171 struct inode *inode = file->f_path.dentry->d_inode;
172 int ret;
173
174 switch (cmd) { 172 switch (cmd) {
175 case OCFS2_IOC32_GETFLAGS: 173 case OCFS2_IOC32_GETFLAGS:
176 cmd = OCFS2_IOC_GETFLAGS; 174 cmd = OCFS2_IOC_GETFLAGS;
@@ -190,9 +188,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
190 return -ENOIOCTLCMD; 188 return -ENOIOCTLCMD;
191 } 189 }
192 190
193 lock_kernel(); 191 return ocfs2_ioctl(file, cmd, arg);
194 ret = ocfs2_ioctl(inode, file, cmd, arg);
195 unlock_kernel();
196 return ret;
197} 192}
198#endif 193#endif
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 4d6c4f430d0d..cf9a5ee30fef 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -10,8 +10,7 @@
10#ifndef OCFS2_IOCTL_H 10#ifndef OCFS2_IOCTL_H
11#define OCFS2_IOCTL_H 11#define OCFS2_IOCTL_H
12 12
13int ocfs2_ioctl(struct inode * inode, struct file * filp, 13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
14 unsigned int cmd, unsigned long arg);
15long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); 14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
16 15
17#endif /* OCFS2_IOCTL_H */ 16#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f31c7e8c19c3..9698338adc39 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -64,6 +64,137 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
64 int slot); 64 int slot);
65static int ocfs2_commit_thread(void *arg); 65static int ocfs2_commit_thread(void *arg);
66 66
67
68/*
69 * The recovery map is a simple array of node numbers to recover.
70 * It is protected by osb->osb_lock.
71 */
72
73struct ocfs2_recovery_map {
74 unsigned int rm_used;
75 unsigned int *rm_entries;
76};
77
78int ocfs2_recovery_init(struct ocfs2_super *osb)
79{
80 struct ocfs2_recovery_map *rm;
81
82 mutex_init(&osb->recovery_lock);
83 osb->disable_recovery = 0;
84 osb->recovery_thread_task = NULL;
85 init_waitqueue_head(&osb->recovery_event);
86
87 rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
88 osb->max_slots * sizeof(unsigned int),
89 GFP_KERNEL);
90 if (!rm) {
91 mlog_errno(-ENOMEM);
92 return -ENOMEM;
93 }
94
95 rm->rm_entries = (unsigned int *)((char *)rm +
96 sizeof(struct ocfs2_recovery_map));
97 osb->recovery_map = rm;
98
99 return 0;
100}
101
102/* we can't grab the goofy sem lock from inside wait_event, so we use
103 * memory barriers to make sure that we'll see the null task before
104 * being woken up */
105static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
106{
107 mb();
108 return osb->recovery_thread_task != NULL;
109}
110
111void ocfs2_recovery_exit(struct ocfs2_super *osb)
112{
113 struct ocfs2_recovery_map *rm;
114
115 /* disable any new recovery threads and wait for any currently
116 * running ones to exit. Do this before setting the vol_state. */
117 mutex_lock(&osb->recovery_lock);
118 osb->disable_recovery = 1;
119 mutex_unlock(&osb->recovery_lock);
120 wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
121
122 /* At this point, we know that no more recovery threads can be
123 * launched, so wait for any recovery completion work to
124 * complete. */
125 flush_workqueue(ocfs2_wq);
126
127 /*
128 * Now that recovery is shut down, and the osb is about to be
129 * freed, the osb_lock is not taken here.
130 */
131 rm = osb->recovery_map;
132 /* XXX: Should we bug if there are dirty entries? */
133
134 kfree(rm);
135}
136
137static int __ocfs2_recovery_map_test(struct ocfs2_super *osb,
138 unsigned int node_num)
139{
140 int i;
141 struct ocfs2_recovery_map *rm = osb->recovery_map;
142
143 assert_spin_locked(&osb->osb_lock);
144
145 for (i = 0; i < rm->rm_used; i++) {
146 if (rm->rm_entries[i] == node_num)
147 return 1;
148 }
149
150 return 0;
151}
152
153/* Behaves like test-and-set. Returns the previous value */
154static int ocfs2_recovery_map_set(struct ocfs2_super *osb,
155 unsigned int node_num)
156{
157 struct ocfs2_recovery_map *rm = osb->recovery_map;
158
159 spin_lock(&osb->osb_lock);
160 if (__ocfs2_recovery_map_test(osb, node_num)) {
161 spin_unlock(&osb->osb_lock);
162 return 1;
163 }
164
165 /* XXX: Can this be exploited? Not from o2dlm... */
166 BUG_ON(rm->rm_used >= osb->max_slots);
167
168 rm->rm_entries[rm->rm_used] = node_num;
169 rm->rm_used++;
170 spin_unlock(&osb->osb_lock);
171
172 return 0;
173}
174
175static void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
176 unsigned int node_num)
177{
178 int i;
179 struct ocfs2_recovery_map *rm = osb->recovery_map;
180
181 spin_lock(&osb->osb_lock);
182
183 for (i = 0; i < rm->rm_used; i++) {
184 if (rm->rm_entries[i] == node_num)
185 break;
186 }
187
188 if (i < rm->rm_used) {
189 /* XXX: be careful with the pointer math */
190 memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]),
191 (rm->rm_used - i - 1) * sizeof(unsigned int));
192 rm->rm_used--;
193 }
194
195 spin_unlock(&osb->osb_lock);
196}
197
67static int ocfs2_commit_cache(struct ocfs2_super *osb) 198static int ocfs2_commit_cache(struct ocfs2_super *osb)
68{ 199{
69 int status = 0; 200 int status = 0;
@@ -586,8 +717,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
586 717
587 mlog_entry_void(); 718 mlog_entry_void();
588 719
589 if (!journal) 720 BUG_ON(!journal);
590 BUG();
591 721
592 osb = journal->j_osb; 722 osb = journal->j_osb;
593 723
@@ -650,6 +780,23 @@ bail:
650 return status; 780 return status;
651} 781}
652 782
783static int ocfs2_recovery_completed(struct ocfs2_super *osb)
784{
785 int empty;
786 struct ocfs2_recovery_map *rm = osb->recovery_map;
787
788 spin_lock(&osb->osb_lock);
789 empty = (rm->rm_used == 0);
790 spin_unlock(&osb->osb_lock);
791
792 return empty;
793}
794
795void ocfs2_wait_for_recovery(struct ocfs2_super *osb)
796{
797 wait_event(osb->recovery_event, ocfs2_recovery_completed(osb));
798}
799
653/* 800/*
654 * JBD Might read a cached version of another nodes journal file. We 801 * JBD Might read a cached version of another nodes journal file. We
655 * don't want this as this file changes often and we get no 802 * don't want this as this file changes often and we get no
@@ -848,6 +995,7 @@ static int __ocfs2_recovery_thread(void *arg)
848{ 995{
849 int status, node_num; 996 int status, node_num;
850 struct ocfs2_super *osb = arg; 997 struct ocfs2_super *osb = arg;
998 struct ocfs2_recovery_map *rm = osb->recovery_map;
851 999
852 mlog_entry_void(); 1000 mlog_entry_void();
853 1001
@@ -863,26 +1011,29 @@ restart:
863 goto bail; 1011 goto bail;
864 } 1012 }
865 1013
866 while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { 1014 spin_lock(&osb->osb_lock);
867 node_num = ocfs2_node_map_first_set_bit(osb, 1015 while (rm->rm_used) {
868 &osb->recovery_map); 1016 /* It's always safe to remove entry zero, as we won't
869 if (node_num == O2NM_INVALID_NODE_NUM) { 1017 * clear it until ocfs2_recover_node() has succeeded. */
870 mlog(0, "Out of nodes to recover.\n"); 1018 node_num = rm->rm_entries[0];
871 break; 1019 spin_unlock(&osb->osb_lock);
872 }
873 1020
874 status = ocfs2_recover_node(osb, node_num); 1021 status = ocfs2_recover_node(osb, node_num);
875 if (status < 0) { 1022 if (!status) {
1023 ocfs2_recovery_map_clear(osb, node_num);
1024 } else {
876 mlog(ML_ERROR, 1025 mlog(ML_ERROR,
877 "Error %d recovering node %d on device (%u,%u)!\n", 1026 "Error %d recovering node %d on device (%u,%u)!\n",
878 status, node_num, 1027 status, node_num,
879 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1028 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
880 mlog(ML_ERROR, "Volume requires unmount.\n"); 1029 mlog(ML_ERROR, "Volume requires unmount.\n");
881 continue;
882 } 1030 }
883 1031
884 ocfs2_recovery_map_clear(osb, node_num); 1032 spin_lock(&osb->osb_lock);
885 } 1033 }
1034 spin_unlock(&osb->osb_lock);
1035 mlog(0, "All nodes recovered\n");
1036
886 ocfs2_super_unlock(osb, 1); 1037 ocfs2_super_unlock(osb, 1);
887 1038
888 /* We always run recovery on our own orphan dir - the dead 1039 /* We always run recovery on our own orphan dir - the dead
@@ -893,8 +1044,7 @@ restart:
893 1044
894bail: 1045bail:
895 mutex_lock(&osb->recovery_lock); 1046 mutex_lock(&osb->recovery_lock);
896 if (!status && 1047 if (!status && !ocfs2_recovery_completed(osb)) {
897 !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
898 mutex_unlock(&osb->recovery_lock); 1048 mutex_unlock(&osb->recovery_lock);
899 goto restart; 1049 goto restart;
900 } 1050 }
@@ -924,8 +1074,8 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
924 1074
925 /* People waiting on recovery will wait on 1075 /* People waiting on recovery will wait on
926 * the recovery map to empty. */ 1076 * the recovery map to empty. */
927 if (!ocfs2_recovery_map_set(osb, node_num)) 1077 if (ocfs2_recovery_map_set(osb, node_num))
928 mlog(0, "node %d already be in recovery.\n", node_num); 1078 mlog(0, "node %d already in recovery map.\n", node_num);
929 1079
930 mlog(0, "starting recovery thread...\n"); 1080 mlog(0, "starting recovery thread...\n");
931 1081
@@ -1079,7 +1229,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1079{ 1229{
1080 int status = 0; 1230 int status = 0;
1081 int slot_num; 1231 int slot_num;
1082 struct ocfs2_slot_info *si = osb->slot_info;
1083 struct ocfs2_dinode *la_copy = NULL; 1232 struct ocfs2_dinode *la_copy = NULL;
1084 struct ocfs2_dinode *tl_copy = NULL; 1233 struct ocfs2_dinode *tl_copy = NULL;
1085 1234
@@ -1092,8 +1241,8 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1092 * case we should've called ocfs2_journal_load instead. */ 1241 * case we should've called ocfs2_journal_load instead. */
1093 BUG_ON(osb->node_num == node_num); 1242 BUG_ON(osb->node_num == node_num);
1094 1243
1095 slot_num = ocfs2_node_num_to_slot(si, node_num); 1244 slot_num = ocfs2_node_num_to_slot(osb, node_num);
1096 if (slot_num == OCFS2_INVALID_SLOT) { 1245 if (slot_num == -ENOENT) {
1097 status = 0; 1246 status = 0;
1098 mlog(0, "no slot for this node, so no recovery required.\n"); 1247 mlog(0, "no slot for this node, so no recovery required.\n");
1099 goto done; 1248 goto done;
@@ -1123,8 +1272,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1123 1272
1124 /* Likewise, this would be a strange but ultimately not so 1273 /* Likewise, this would be a strange but ultimately not so
1125 * harmful place to get an error... */ 1274 * harmful place to get an error... */
1126 ocfs2_clear_slot(si, slot_num); 1275 status = ocfs2_clear_slot(osb, slot_num);
1127 status = ocfs2_update_disk_slots(osb, si);
1128 if (status < 0) 1276 if (status < 0)
1129 mlog_errno(status); 1277 mlog_errno(status);
1130 1278
@@ -1184,23 +1332,24 @@ bail:
1184 * slot info struct has been updated from disk. */ 1332 * slot info struct has been updated from disk. */
1185int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) 1333int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1186{ 1334{
1187 int status, i, node_num; 1335 unsigned int node_num;
1188 struct ocfs2_slot_info *si = osb->slot_info; 1336 int status, i;
1189 1337
1190 /* This is called with the super block cluster lock, so we 1338 /* This is called with the super block cluster lock, so we
1191 * know that the slot map can't change underneath us. */ 1339 * know that the slot map can't change underneath us. */
1192 1340
1193 spin_lock(&si->si_lock); 1341 spin_lock(&osb->osb_lock);
1194 for(i = 0; i < si->si_num_slots; i++) { 1342 for (i = 0; i < osb->max_slots; i++) {
1195 if (i == osb->slot_num) 1343 if (i == osb->slot_num)
1196 continue; 1344 continue;
1197 if (ocfs2_is_empty_slot(si, i)) 1345
1346 status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
1347 if (status == -ENOENT)
1198 continue; 1348 continue;
1199 1349
1200 node_num = si->si_global_node_nums[i]; 1350 if (__ocfs2_recovery_map_test(osb, node_num))
1201 if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
1202 continue; 1351 continue;
1203 spin_unlock(&si->si_lock); 1352 spin_unlock(&osb->osb_lock);
1204 1353
1205 /* Ok, we have a slot occupied by another node which 1354 /* Ok, we have a slot occupied by another node which
1206 * is not in the recovery map. We trylock his journal 1355 * is not in the recovery map. We trylock his journal
@@ -1216,9 +1365,9 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1216 goto bail; 1365 goto bail;
1217 } 1366 }
1218 1367
1219 spin_lock(&si->si_lock); 1368 spin_lock(&osb->osb_lock);
1220 } 1369 }
1221 spin_unlock(&si->si_lock); 1370 spin_unlock(&osb->osb_lock);
1222 1371
1223 status = 0; 1372 status = 0;
1224bail: 1373bail:
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 220f3e818e78..db82be2532ed 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -134,6 +134,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
134 134
135/* Exported only for the journal struct init code in super.c. Do not call. */ 135/* Exported only for the journal struct init code in super.c. Do not call. */
136void ocfs2_complete_recovery(struct work_struct *work); 136void ocfs2_complete_recovery(struct work_struct *work);
137void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
138
139int ocfs2_recovery_init(struct ocfs2_super *osb);
140void ocfs2_recovery_exit(struct ocfs2_super *osb);
137 141
138/* 142/*
139 * Journal Control: 143 * Journal Control:
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ab83fd562429..ce0dc147602a 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -447,6 +447,8 @@ out_mutex:
447 iput(main_bm_inode); 447 iput(main_bm_inode);
448 448
449out: 449out:
450 if (!status)
451 ocfs2_init_inode_steal_slot(osb);
450 mlog_exit(status); 452 mlog_exit(status);
451 return status; 453 return status;
452} 454}
@@ -523,6 +525,8 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
523 } 525 }
524 526
525 ac->ac_inode = local_alloc_inode; 527 ac->ac_inode = local_alloc_inode;
528 /* We should never use localalloc from another slot */
529 ac->ac_alloc_slot = osb->slot_num;
526 ac->ac_which = OCFS2_AC_USE_LOCAL; 530 ac->ac_which = OCFS2_AC_USE_LOCAL;
527 get_bh(osb->local_alloc_bh); 531 get_bh(osb->local_alloc_bh);
528 ac->ac_bh = osb->local_alloc_bh; 532 ac->ac_bh = osb->local_alloc_bh;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index ae9ad9587516..d5d808fe0140 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -424,7 +424,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
424 fe->i_fs_generation = cpu_to_le32(osb->fs_generation); 424 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
425 fe->i_blkno = cpu_to_le64(fe_blkno); 425 fe->i_blkno = cpu_to_le64(fe_blkno);
426 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 426 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
427 fe->i_suballoc_slot = cpu_to_le16(osb->slot_num); 427 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
428 fe->i_uid = cpu_to_le32(current->fsuid); 428 fe->i_uid = cpu_to_le32(current->fsuid);
429 if (dir->i_mode & S_ISGID) { 429 if (dir->i_mode & S_ISGID) {
430 fe->i_gid = cpu_to_le32(dir->i_gid); 430 fe->i_gid = cpu_to_le32(dir->i_gid);
@@ -997,7 +997,7 @@ static int ocfs2_rename(struct inode *old_dir,
997 * 997 *
998 * And that's why, just like the VFS, we need a file system 998 * And that's why, just like the VFS, we need a file system
999 * rename lock. */ 999 * rename lock. */
1000 if (old_dentry != new_dentry) { 1000 if (old_dir != new_dir && S_ISDIR(old_inode->i_mode)) {
1001 status = ocfs2_rename_lock(osb); 1001 status = ocfs2_rename_lock(osb);
1002 if (status < 0) { 1002 if (status < 0) {
1003 mlog_errno(status); 1003 mlog_errno(status);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6546cef212e3..31692379c170 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -36,11 +36,8 @@
36#include <linux/mutex.h> 36#include <linux/mutex.h>
37#include <linux/jbd.h> 37#include <linux/jbd.h>
38 38
39#include "cluster/nodemanager.h" 39/* For union ocfs2_dlm_lksb */
40#include "cluster/heartbeat.h" 40#include "stackglue.h"
41#include "cluster/tcp.h"
42
43#include "dlm/dlmapi.h"
44 41
45#include "ocfs2_fs.h" 42#include "ocfs2_fs.h"
46#include "ocfs2_lockid.h" 43#include "ocfs2_lockid.h"
@@ -101,6 +98,9 @@ enum ocfs2_unlock_action {
101 * dropped. */ 98 * dropped. */
102#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ 99#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
103#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ 100#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */
101#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a
102 call to dlm_lock. Only
103 exists with BUSY set. */
104 104
105struct ocfs2_lock_res_ops; 105struct ocfs2_lock_res_ops;
106 106
@@ -120,13 +120,14 @@ struct ocfs2_lock_res {
120 int l_level; 120 int l_level;
121 unsigned int l_ro_holders; 121 unsigned int l_ro_holders;
122 unsigned int l_ex_holders; 122 unsigned int l_ex_holders;
123 struct dlm_lockstatus l_lksb; 123 union ocfs2_dlm_lksb l_lksb;
124 124
125 /* used from AST/BAST funcs. */ 125 /* used from AST/BAST funcs. */
126 enum ocfs2_ast_action l_action; 126 enum ocfs2_ast_action l_action;
127 enum ocfs2_unlock_action l_unlock_action; 127 enum ocfs2_unlock_action l_unlock_action;
128 int l_requested; 128 int l_requested;
129 int l_blocking; 129 int l_blocking;
130 unsigned int l_pending_gen;
130 131
131 wait_queue_head_t l_event; 132 wait_queue_head_t l_event;
132 133
@@ -179,6 +180,8 @@ enum ocfs2_mount_options
179#define OCFS2_DEFAULT_ATIME_QUANTUM 60 180#define OCFS2_DEFAULT_ATIME_QUANTUM 60
180 181
181struct ocfs2_journal; 182struct ocfs2_journal;
183struct ocfs2_slot_info;
184struct ocfs2_recovery_map;
182struct ocfs2_super 185struct ocfs2_super
183{ 186{
184 struct task_struct *commit_task; 187 struct task_struct *commit_task;
@@ -190,7 +193,6 @@ struct ocfs2_super
190 struct ocfs2_slot_info *slot_info; 193 struct ocfs2_slot_info *slot_info;
191 194
192 spinlock_t node_map_lock; 195 spinlock_t node_map_lock;
193 struct ocfs2_node_map recovery_map;
194 196
195 u64 root_blkno; 197 u64 root_blkno;
196 u64 system_dir_blkno; 198 u64 system_dir_blkno;
@@ -206,25 +208,29 @@ struct ocfs2_super
206 u32 s_feature_incompat; 208 u32 s_feature_incompat;
207 u32 s_feature_ro_compat; 209 u32 s_feature_ro_compat;
208 210
209 /* Protects s_next_generaion, osb_flags. Could protect more on 211 /* Protects s_next_generation, osb_flags and s_inode_steal_slot.
210 * osb as it's very short lived. */ 212 * Could protect more on osb as it's very short lived.
213 */
211 spinlock_t osb_lock; 214 spinlock_t osb_lock;
212 u32 s_next_generation; 215 u32 s_next_generation;
213 unsigned long osb_flags; 216 unsigned long osb_flags;
217 s16 s_inode_steal_slot;
218 atomic_t s_num_inodes_stolen;
214 219
215 unsigned long s_mount_opt; 220 unsigned long s_mount_opt;
216 unsigned int s_atime_quantum; 221 unsigned int s_atime_quantum;
217 222
218 u16 max_slots; 223 unsigned int max_slots;
219 s16 node_num; 224 unsigned int node_num;
220 s16 slot_num; 225 int slot_num;
221 s16 preferred_slot; 226 int preferred_slot;
222 int s_sectsize_bits; 227 int s_sectsize_bits;
223 int s_clustersize; 228 int s_clustersize;
224 int s_clustersize_bits; 229 int s_clustersize_bits;
225 230
226 atomic_t vol_state; 231 atomic_t vol_state;
227 struct mutex recovery_lock; 232 struct mutex recovery_lock;
233 struct ocfs2_recovery_map *recovery_map;
228 struct task_struct *recovery_thread_task; 234 struct task_struct *recovery_thread_task;
229 int disable_recovery; 235 int disable_recovery;
230 wait_queue_head_t checkpoint_event; 236 wait_queue_head_t checkpoint_event;
@@ -245,12 +251,11 @@ struct ocfs2_super
245 struct ocfs2_alloc_stats alloc_stats; 251 struct ocfs2_alloc_stats alloc_stats;
246 char dev_str[20]; /* "major,minor" of the device */ 252 char dev_str[20]; /* "major,minor" of the device */
247 253
248 struct dlm_ctxt *dlm; 254 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
255 struct ocfs2_cluster_connection *cconn;
249 struct ocfs2_lock_res osb_super_lockres; 256 struct ocfs2_lock_res osb_super_lockres;
250 struct ocfs2_lock_res osb_rename_lockres; 257 struct ocfs2_lock_res osb_rename_lockres;
251 struct dlm_eviction_cb osb_eviction_cb;
252 struct ocfs2_dlm_debug *osb_dlm_debug; 258 struct ocfs2_dlm_debug *osb_dlm_debug;
253 struct dlm_protocol_version osb_locking_proto;
254 259
255 struct dentry *osb_debug_root; 260 struct dentry *osb_debug_root;
256 261
@@ -367,11 +372,24 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
367 return ret; 372 return ret;
368} 373}
369 374
375static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
376{
377 return (osb->s_feature_incompat &
378 OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK);
379}
380
370static inline int ocfs2_mount_local(struct ocfs2_super *osb) 381static inline int ocfs2_mount_local(struct ocfs2_super *osb)
371{ 382{
372 return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); 383 return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
373} 384}
374 385
386static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
387{
388 return (osb->s_feature_incompat &
389 OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP);
390}
391
392
375#define OCFS2_IS_VALID_DINODE(ptr) \ 393#define OCFS2_IS_VALID_DINODE(ptr) \
376 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) 394 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
377 395
@@ -522,6 +540,33 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
522 return pages_per_cluster; 540 return pages_per_cluster;
523} 541}
524 542
543static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
544{
545 spin_lock(&osb->osb_lock);
546 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
547 spin_unlock(&osb->osb_lock);
548 atomic_set(&osb->s_num_inodes_stolen, 0);
549}
550
551static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb,
552 s16 slot)
553{
554 spin_lock(&osb->osb_lock);
555 osb->s_inode_steal_slot = slot;
556 spin_unlock(&osb->osb_lock);
557}
558
559static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
560{
561 s16 slot;
562
563 spin_lock(&osb->osb_lock);
564 slot = osb->s_inode_steal_slot;
565 spin_unlock(&osb->osb_lock);
566
567 return slot;
568}
569
525#define ocfs2_set_bit ext2_set_bit 570#define ocfs2_set_bit ext2_set_bit
526#define ocfs2_clear_bit ext2_clear_bit 571#define ocfs2_clear_bit ext2_clear_bit
527#define ocfs2_test_bit ext2_test_bit 572#define ocfs2_test_bit ext2_test_bit
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 3633edd3982f..52c426665154 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,9 @@
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ 90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
91 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA) 91 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
92 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
93 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK)
92#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 94#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
93 95
94/* 96/*
@@ -125,6 +127,21 @@
125/* Support for data packed into inode blocks */ 127/* Support for data packed into inode blocks */
126#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 128#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040
127 129
130/* Support for the extended slot map */
131#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
132
133
134/*
135 * Support for alternate, userspace cluster stacks. If set, the superblock
136 * field s_cluster_info contains a tag for the alternate stack in use as
137 * well as the name of the cluster being joined.
138 * mount.ocfs2 must pass in a matching stack name.
139 *
140 * If not set, the classic stack will be used. This is compatible with
141 * all older versions.
142 */
143#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080
144
128/* 145/*
129 * backup superblock flag is used to indicate that this volume 146 * backup superblock flag is used to indicate that this volume
130 * has backup superblocks. 147 * has backup superblocks.
@@ -267,6 +284,10 @@ struct ocfs2_new_group_input {
267#define OCFS2_VOL_UUID_LEN 16 284#define OCFS2_VOL_UUID_LEN 16
268#define OCFS2_MAX_VOL_LABEL_LEN 64 285#define OCFS2_MAX_VOL_LABEL_LEN 64
269 286
287/* The alternate, userspace stack fields */
288#define OCFS2_STACK_LABEL_LEN 4
289#define OCFS2_CLUSTER_NAME_LEN 16
290
270/* Journal limits (in bytes) */ 291/* Journal limits (in bytes) */
271#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 292#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
272 293
@@ -475,6 +496,47 @@ struct ocfs2_extent_block
475}; 496};
476 497
477/* 498/*
499 * On disk slot map for OCFS2. This defines the contents of the "slot_map"
500 * system file. A slot is valid if it contains a node number >= 0. The
501 * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty.
502 */
503struct ocfs2_slot_map {
504/*00*/ __le16 sm_slots[0];
505/*
506 * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255,
507 * 255 * sizeof(__le16) == 510B, within the 512B block minimum blocksize.
508 */
509};
510
511struct ocfs2_extended_slot {
512/*00*/ __u8 es_valid;
513 __u8 es_reserved1[3];
514 __le32 es_node_num;
515/*10*/
516};
517
518/*
519 * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP
520 * is set. It separates out the valid marker from the node number, and
521 * has room to grow. Unlike the old slot map, this format is defined by
522 * i_size.
523 */
524struct ocfs2_slot_map_extended {
525/*00*/ struct ocfs2_extended_slot se_slots[0];
526/*
527 * Actual size is i_size of the slot_map system file. It should
528 * match s_max_slots * sizeof(struct ocfs2_extended_slot)
529 */
530};
531
532struct ocfs2_cluster_info {
533/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN];
534 __le32 ci_reserved;
535/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN];
536/*18*/
537};
538
539/*
478 * On disk superblock for OCFS2 540 * On disk superblock for OCFS2
479 * Note that it is contained inside an ocfs2_dinode, so all offsets 541 * Note that it is contained inside an ocfs2_dinode, so all offsets
480 * are relative to the start of ocfs2_dinode.id2. 542 * are relative to the start of ocfs2_dinode.id2.
@@ -506,7 +568,20 @@ struct ocfs2_super_block {
506 * group header */ 568 * group header */
507/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ 569/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
508/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ 570/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
509/*A0*/ 571/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
572 stack. Only valid
573 with INCOMPAT flag. */
574/*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */
575/*140*/
576
577 /*
578 * NOTE: As stated above, all offsets are relative to
579 * ocfs2_dinode.id2, which is at 0xC0 in the inode.
580 * 0xC0 + 0x140 = 0x200 or 512 bytes. A superblock must fit within
581 * our smallest blocksize, which is 512 bytes. To ensure this,
582 * we reserve the space in s_reserved2. Anything past s_reserved2
583 * will not be available on the smallest blocksize.
584 */
510}; 585};
511 586
512/* 587/*
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 86f3e3799c2b..82c200f7a8f1 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -100,7 +100,7 @@ static char *ocfs2_lock_type_strings[] = {
100static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 100static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
101{ 101{
102#ifdef __KERNEL__ 102#ifdef __KERNEL__
103 mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type); 103 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
104#endif 104#endif
105 return ocfs2_lock_type_strings[type]; 105 return ocfs2_lock_type_strings[type];
106} 106}
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 3a50ce555e64..bb5ff8939bf1 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -42,81 +42,244 @@
42 42
43#include "buffer_head_io.h" 43#include "buffer_head_io.h"
44 44
45static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, 45
46 s16 global); 46struct ocfs2_slot {
47static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, 47 int sl_valid;
48 s16 slot_num, 48 unsigned int sl_node_num;
49 s16 node_num); 49};
50 50
51/* post the slot information on disk into our slot_info struct. */ 51struct ocfs2_slot_info {
52void ocfs2_update_slot_info(struct ocfs2_slot_info *si) 52 int si_extended;
53 int si_slots_per_block;
54 struct inode *si_inode;
55 unsigned int si_blocks;
56 struct buffer_head **si_bh;
57 unsigned int si_num_slots;
58 struct ocfs2_slot *si_slots;
59};
60
61
62static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
63 unsigned int node_num);
64
65static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si,
66 int slot_num)
67{
68 BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
69 si->si_slots[slot_num].sl_valid = 0;
70}
71
72static void ocfs2_set_slot(struct ocfs2_slot_info *si,
73 int slot_num, unsigned int node_num)
74{
75 BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
76
77 si->si_slots[slot_num].sl_valid = 1;
78 si->si_slots[slot_num].sl_node_num = node_num;
79}
80
81/* This version is for the extended slot map */
82static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si)
83{
84 int b, i, slotno;
85 struct ocfs2_slot_map_extended *se;
86
87 slotno = 0;
88 for (b = 0; b < si->si_blocks; b++) {
89 se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data;
90 for (i = 0;
91 (i < si->si_slots_per_block) &&
92 (slotno < si->si_num_slots);
93 i++, slotno++) {
94 if (se->se_slots[i].es_valid)
95 ocfs2_set_slot(si, slotno,
96 le32_to_cpu(se->se_slots[i].es_node_num));
97 else
98 ocfs2_invalidate_slot(si, slotno);
99 }
100 }
101}
102
103/*
104 * Post the slot information on disk into our slot_info struct.
105 * Must be protected by osb_lock.
106 */
107static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si)
53{ 108{
54 int i; 109 int i;
55 __le16 *disk_info; 110 struct ocfs2_slot_map *sm;
56 111
57 /* we don't read the slot block here as ocfs2_super_lock 112 sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
58 * should've made sure we have the most recent copy. */
59 spin_lock(&si->si_lock);
60 disk_info = (__le16 *) si->si_bh->b_data;
61 113
62 for (i = 0; i < si->si_size; i++) 114 for (i = 0; i < si->si_num_slots; i++) {
63 si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]); 115 if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT)
116 ocfs2_invalidate_slot(si, i);
117 else
118 ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i]));
119 }
120}
64 121
65 spin_unlock(&si->si_lock); 122static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
123{
124 /*
125 * The slot data will have been refreshed when ocfs2_super_lock
126 * was taken.
127 */
128 if (si->si_extended)
129 ocfs2_update_slot_info_extended(si);
130 else
131 ocfs2_update_slot_info_old(si);
132}
133
134int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
135{
136 int ret;
137 struct ocfs2_slot_info *si = osb->slot_info;
138
139 if (si == NULL)
140 return 0;
141
142 BUG_ON(si->si_blocks == 0);
143 BUG_ON(si->si_bh == NULL);
144
145 mlog(0, "Refreshing slot map, reading %u block(s)\n",
146 si->si_blocks);
147
148 /*
149 * We pass -1 as blocknr because we expect all of si->si_bh to
150 * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
151 * this is not true, the read of -1 (UINT64_MAX) will fail.
152 */
153 ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0,
154 si->si_inode);
155 if (ret == 0) {
156 spin_lock(&osb->osb_lock);
157 ocfs2_update_slot_info(si);
158 spin_unlock(&osb->osb_lock);
159 }
160
161 return ret;
66} 162}
67 163
68/* post our slot info stuff into its destination bh and write it 164/* post our slot info stuff into its destination bh and write it
69 * out. */ 165 * out. */
70int ocfs2_update_disk_slots(struct ocfs2_super *osb, 166static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si,
71 struct ocfs2_slot_info *si) 167 int slot_num,
168 struct buffer_head **bh)
72{ 169{
73 int status, i; 170 int blkind = slot_num / si->si_slots_per_block;
74 __le16 *disk_info = (__le16 *) si->si_bh->b_data; 171 int slotno = slot_num % si->si_slots_per_block;
172 struct ocfs2_slot_map_extended *se;
173
174 BUG_ON(blkind >= si->si_blocks);
175
176 se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data;
177 se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid;
178 if (si->si_slots[slot_num].sl_valid)
179 se->se_slots[slotno].es_node_num =
180 cpu_to_le32(si->si_slots[slot_num].sl_node_num);
181 *bh = si->si_bh[blkind];
182}
75 183
76 spin_lock(&si->si_lock); 184static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si,
77 for (i = 0; i < si->si_size; i++) 185 int slot_num,
78 disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]); 186 struct buffer_head **bh)
79 spin_unlock(&si->si_lock); 187{
188 int i;
189 struct ocfs2_slot_map *sm;
190
191 sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
192 for (i = 0; i < si->si_num_slots; i++) {
193 if (si->si_slots[i].sl_valid)
194 sm->sm_slots[i] =
195 cpu_to_le16(si->si_slots[i].sl_node_num);
196 else
197 sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
198 }
199 *bh = si->si_bh[0];
200}
201
202static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
203 struct ocfs2_slot_info *si,
204 int slot_num)
205{
206 int status;
207 struct buffer_head *bh;
208
209 spin_lock(&osb->osb_lock);
210 if (si->si_extended)
211 ocfs2_update_disk_slot_extended(si, slot_num, &bh);
212 else
213 ocfs2_update_disk_slot_old(si, slot_num, &bh);
214 spin_unlock(&osb->osb_lock);
80 215
81 status = ocfs2_write_block(osb, si->si_bh, si->si_inode); 216 status = ocfs2_write_block(osb, bh, si->si_inode);
82 if (status < 0) 217 if (status < 0)
83 mlog_errno(status); 218 mlog_errno(status);
84 219
85 return status; 220 return status;
86} 221}
87 222
88/* try to find global node in the slot info. Returns 223/*
89 * OCFS2_INVALID_SLOT if nothing is found. */ 224 * Calculate how many bytes are needed by the slot map. Returns
90static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, 225 * an error if the slot map file is too small.
91 s16 global) 226 */
227static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
228 struct inode *inode,
229 unsigned long long *bytes)
92{ 230{
93 int i; 231 unsigned long long bytes_needed;
94 s16 ret = OCFS2_INVALID_SLOT; 232
233 if (ocfs2_uses_extended_slot_map(osb)) {
234 bytes_needed = osb->max_slots *
235 sizeof(struct ocfs2_extended_slot);
236 } else {
237 bytes_needed = osb->max_slots * sizeof(__le16);
238 }
239 if (bytes_needed > i_size_read(inode)) {
240 mlog(ML_ERROR,
241 "Slot map file is too small! (size %llu, needed %llu)\n",
242 i_size_read(inode), bytes_needed);
243 return -ENOSPC;
244 }
245
246 *bytes = bytes_needed;
247 return 0;
248}
249
250/* try to find global node in the slot info. Returns -ENOENT
251 * if nothing is found. */
252static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
253 unsigned int node_num)
254{
255 int i, ret = -ENOENT;
95 256
96 for(i = 0; i < si->si_num_slots; i++) { 257 for(i = 0; i < si->si_num_slots; i++) {
97 if (global == si->si_global_node_nums[i]) { 258 if (si->si_slots[i].sl_valid &&
98 ret = (s16) i; 259 (node_num == si->si_slots[i].sl_node_num)) {
260 ret = i;
99 break; 261 break;
100 } 262 }
101 } 263 }
264
102 return ret; 265 return ret;
103} 266}
104 267
105static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred) 268static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
269 int preferred)
106{ 270{
107 int i; 271 int i, ret = -ENOSPC;
108 s16 ret = OCFS2_INVALID_SLOT;
109 272
110 if (preferred >= 0 && preferred < si->si_num_slots) { 273 if ((preferred >= 0) && (preferred < si->si_num_slots)) {
111 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) { 274 if (!si->si_slots[preferred].sl_valid) {
112 ret = preferred; 275 ret = preferred;
113 goto out; 276 goto out;
114 } 277 }
115 } 278 }
116 279
117 for(i = 0; i < si->si_num_slots; i++) { 280 for(i = 0; i < si->si_num_slots; i++) {
118 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { 281 if (!si->si_slots[i].sl_valid) {
119 ret = (s16) i; 282 ret = i;
120 break; 283 break;
121 } 284 }
122 } 285 }
@@ -124,58 +287,155 @@ out:
124 return ret; 287 return ret;
125} 288}
126 289
127s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, 290int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num)
128 s16 global)
129{ 291{
130 s16 ret; 292 int slot;
293 struct ocfs2_slot_info *si = osb->slot_info;
131 294
132 spin_lock(&si->si_lock); 295 spin_lock(&osb->osb_lock);
133 ret = __ocfs2_node_num_to_slot(si, global); 296 slot = __ocfs2_node_num_to_slot(si, node_num);
134 spin_unlock(&si->si_lock); 297 spin_unlock(&osb->osb_lock);
135 return ret; 298
299 return slot;
300}
301
302int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
303 unsigned int *node_num)
304{
305 struct ocfs2_slot_info *si = osb->slot_info;
306
307 assert_spin_locked(&osb->osb_lock);
308
309 BUG_ON(slot_num < 0);
310 BUG_ON(slot_num > osb->max_slots);
311
312 if (!si->si_slots[slot_num].sl_valid)
313 return -ENOENT;
314
315 *node_num = si->si_slots[slot_num].sl_node_num;
316 return 0;
136} 317}
137 318
138static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, 319static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
139 s16 slot_num,
140 s16 node_num)
141{ 320{
142 BUG_ON(slot_num == OCFS2_INVALID_SLOT); 321 unsigned int i;
143 BUG_ON(slot_num >= si->si_num_slots); 322
144 BUG_ON((node_num != O2NM_INVALID_NODE_NUM) && 323 if (si == NULL)
145 (node_num >= O2NM_MAX_NODES)); 324 return;
325
326 if (si->si_inode)
327 iput(si->si_inode);
328 if (si->si_bh) {
329 for (i = 0; i < si->si_blocks; i++) {
330 if (si->si_bh[i]) {
331 brelse(si->si_bh[i]);
332 si->si_bh[i] = NULL;
333 }
334 }
335 kfree(si->si_bh);
336 }
146 337
147 si->si_global_node_nums[slot_num] = node_num; 338 kfree(si);
148} 339}
149 340
150void ocfs2_clear_slot(struct ocfs2_slot_info *si, 341int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num)
151 s16 slot_num)
152{ 342{
153 spin_lock(&si->si_lock); 343 struct ocfs2_slot_info *si = osb->slot_info;
154 __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT); 344
155 spin_unlock(&si->si_lock); 345 if (si == NULL)
346 return 0;
347
348 spin_lock(&osb->osb_lock);
349 ocfs2_invalidate_slot(si, slot_num);
350 spin_unlock(&osb->osb_lock);
351
352 return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num);
156} 353}
157 354
158int ocfs2_init_slot_info(struct ocfs2_super *osb) 355static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
356 struct ocfs2_slot_info *si)
159{ 357{
160 int status, i; 358 int status = 0;
161 u64 blkno; 359 u64 blkno;
360 unsigned long long blocks, bytes;
361 unsigned int i;
362 struct buffer_head *bh;
363
364 status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes);
365 if (status)
366 goto bail;
367
368 blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes);
369 BUG_ON(blocks > UINT_MAX);
370 si->si_blocks = blocks;
371 if (!si->si_blocks)
372 goto bail;
373
374 if (si->si_extended)
375 si->si_slots_per_block =
376 (osb->sb->s_blocksize /
377 sizeof(struct ocfs2_extended_slot));
378 else
379 si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16);
380
381 /* The size checks above should ensure this */
382 BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
383
384 mlog(0, "Slot map needs %u buffers for %llu bytes\n",
385 si->si_blocks, bytes);
386
387 si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
388 GFP_KERNEL);
389 if (!si->si_bh) {
390 status = -ENOMEM;
391 mlog_errno(status);
392 goto bail;
393 }
394
395 for (i = 0; i < si->si_blocks; i++) {
396 status = ocfs2_extent_map_get_blocks(si->si_inode, i,
397 &blkno, NULL, NULL);
398 if (status < 0) {
399 mlog_errno(status);
400 goto bail;
401 }
402
403 mlog(0, "Reading slot map block %u at %llu\n", i,
404 (unsigned long long)blkno);
405
406 bh = NULL; /* Acquire a fresh bh */
407 status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode);
408 if (status < 0) {
409 mlog_errno(status);
410 goto bail;
411 }
412
413 si->si_bh[i] = bh;
414 }
415
416bail:
417 return status;
418}
419
420int ocfs2_init_slot_info(struct ocfs2_super *osb)
421{
422 int status;
162 struct inode *inode = NULL; 423 struct inode *inode = NULL;
163 struct buffer_head *bh = NULL;
164 struct ocfs2_slot_info *si; 424 struct ocfs2_slot_info *si;
165 425
166 si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL); 426 si = kzalloc(sizeof(struct ocfs2_slot_info) +
427 (sizeof(struct ocfs2_slot) * osb->max_slots),
428 GFP_KERNEL);
167 if (!si) { 429 if (!si) {
168 status = -ENOMEM; 430 status = -ENOMEM;
169 mlog_errno(status); 431 mlog_errno(status);
170 goto bail; 432 goto bail;
171 } 433 }
172 434
173 spin_lock_init(&si->si_lock); 435 si->si_extended = ocfs2_uses_extended_slot_map(osb);
174 si->si_num_slots = osb->max_slots; 436 si->si_num_slots = osb->max_slots;
175 si->si_size = OCFS2_MAX_SLOTS; 437 si->si_slots = (struct ocfs2_slot *)((char *)si +
176 438 sizeof(struct ocfs2_slot_info));
177 for(i = 0; i < si->si_num_slots; i++)
178 si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
179 439
180 inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, 440 inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
181 OCFS2_INVALID_SLOT); 441 OCFS2_INVALID_SLOT);
@@ -185,61 +445,53 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
185 goto bail; 445 goto bail;
186 } 446 }
187 447
188 status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL); 448 si->si_inode = inode;
189 if (status < 0) { 449 status = ocfs2_map_slot_buffers(osb, si);
190 mlog_errno(status);
191 goto bail;
192 }
193
194 status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
195 if (status < 0) { 450 if (status < 0) {
196 mlog_errno(status); 451 mlog_errno(status);
197 goto bail; 452 goto bail;
198 } 453 }
199 454
200 si->si_inode = inode; 455 osb->slot_info = (struct ocfs2_slot_info *)si;
201 si->si_bh = bh;
202 osb->slot_info = si;
203bail: 456bail:
204 if (status < 0 && si) 457 if (status < 0 && si)
205 ocfs2_free_slot_info(si); 458 __ocfs2_free_slot_info(si);
206 459
207 return status; 460 return status;
208} 461}
209 462
210void ocfs2_free_slot_info(struct ocfs2_slot_info *si) 463void ocfs2_free_slot_info(struct ocfs2_super *osb)
211{ 464{
212 if (si->si_inode) 465 struct ocfs2_slot_info *si = osb->slot_info;
213 iput(si->si_inode); 466
214 if (si->si_bh) 467 osb->slot_info = NULL;
215 brelse(si->si_bh); 468 __ocfs2_free_slot_info(si);
216 kfree(si);
217} 469}
218 470
219int ocfs2_find_slot(struct ocfs2_super *osb) 471int ocfs2_find_slot(struct ocfs2_super *osb)
220{ 472{
221 int status; 473 int status;
222 s16 slot; 474 int slot;
223 struct ocfs2_slot_info *si; 475 struct ocfs2_slot_info *si;
224 476
225 mlog_entry_void(); 477 mlog_entry_void();
226 478
227 si = osb->slot_info; 479 si = osb->slot_info;
228 480
481 spin_lock(&osb->osb_lock);
229 ocfs2_update_slot_info(si); 482 ocfs2_update_slot_info(si);
230 483
231 spin_lock(&si->si_lock);
232 /* search for ourselves first and take the slot if it already 484 /* search for ourselves first and take the slot if it already
233 * exists. Perhaps we need to mark this in a variable for our 485 * exists. Perhaps we need to mark this in a variable for our
234 * own journal recovery? Possibly not, though we certainly 486 * own journal recovery? Possibly not, though we certainly
235 * need to warn to the user */ 487 * need to warn to the user */
236 slot = __ocfs2_node_num_to_slot(si, osb->node_num); 488 slot = __ocfs2_node_num_to_slot(si, osb->node_num);
237 if (slot == OCFS2_INVALID_SLOT) { 489 if (slot < 0) {
238 /* if no slot yet, then just take 1st available 490 /* if no slot yet, then just take 1st available
239 * one. */ 491 * one. */
240 slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); 492 slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
241 if (slot == OCFS2_INVALID_SLOT) { 493 if (slot < 0) {
242 spin_unlock(&si->si_lock); 494 spin_unlock(&osb->osb_lock);
243 mlog(ML_ERROR, "no free slots available!\n"); 495 mlog(ML_ERROR, "no free slots available!\n");
244 status = -EINVAL; 496 status = -EINVAL;
245 goto bail; 497 goto bail;
@@ -248,13 +500,13 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
248 mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", 500 mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
249 slot); 501 slot);
250 502
251 __ocfs2_fill_slot(si, slot, osb->node_num); 503 ocfs2_set_slot(si, slot, osb->node_num);
252 osb->slot_num = slot; 504 osb->slot_num = slot;
253 spin_unlock(&si->si_lock); 505 spin_unlock(&osb->osb_lock);
254 506
255 mlog(0, "taking node slot %d\n", osb->slot_num); 507 mlog(0, "taking node slot %d\n", osb->slot_num);
256 508
257 status = ocfs2_update_disk_slots(osb, si); 509 status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
258 if (status < 0) 510 if (status < 0)
259 mlog_errno(status); 511 mlog_errno(status);
260 512
@@ -265,27 +517,27 @@ bail:
265 517
266void ocfs2_put_slot(struct ocfs2_super *osb) 518void ocfs2_put_slot(struct ocfs2_super *osb)
267{ 519{
268 int status; 520 int status, slot_num;
269 struct ocfs2_slot_info *si = osb->slot_info; 521 struct ocfs2_slot_info *si = osb->slot_info;
270 522
271 if (!si) 523 if (!si)
272 return; 524 return;
273 525
526 spin_lock(&osb->osb_lock);
274 ocfs2_update_slot_info(si); 527 ocfs2_update_slot_info(si);
275 528
276 spin_lock(&si->si_lock); 529 slot_num = osb->slot_num;
277 __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT); 530 ocfs2_invalidate_slot(si, osb->slot_num);
278 osb->slot_num = OCFS2_INVALID_SLOT; 531 osb->slot_num = OCFS2_INVALID_SLOT;
279 spin_unlock(&si->si_lock); 532 spin_unlock(&osb->osb_lock);
280 533
281 status = ocfs2_update_disk_slots(osb, si); 534 status = ocfs2_update_disk_slot(osb, si, slot_num);
282 if (status < 0) { 535 if (status < 0) {
283 mlog_errno(status); 536 mlog_errno(status);
284 goto bail; 537 goto bail;
285 } 538 }
286 539
287bail: 540bail:
288 osb->slot_info = NULL; 541 ocfs2_free_slot_info(osb);
289 ocfs2_free_slot_info(si);
290} 542}
291 543
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index 1025872aaade..601c95fd7003 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -27,38 +27,18 @@
27#ifndef SLOTMAP_H 27#ifndef SLOTMAP_H
28#define SLOTMAP_H 28#define SLOTMAP_H
29 29
30struct ocfs2_slot_info {
31 spinlock_t si_lock;
32
33 struct inode *si_inode;
34 struct buffer_head *si_bh;
35 unsigned int si_num_slots;
36 unsigned int si_size;
37 s16 si_global_node_nums[OCFS2_MAX_SLOTS];
38};
39
40int ocfs2_init_slot_info(struct ocfs2_super *osb); 30int ocfs2_init_slot_info(struct ocfs2_super *osb);
41void ocfs2_free_slot_info(struct ocfs2_slot_info *si); 31void ocfs2_free_slot_info(struct ocfs2_super *osb);
42 32
43int ocfs2_find_slot(struct ocfs2_super *osb); 33int ocfs2_find_slot(struct ocfs2_super *osb);
44void ocfs2_put_slot(struct ocfs2_super *osb); 34void ocfs2_put_slot(struct ocfs2_super *osb);
45 35
46void ocfs2_update_slot_info(struct ocfs2_slot_info *si); 36int ocfs2_refresh_slot_info(struct ocfs2_super *osb);
47int ocfs2_update_disk_slots(struct ocfs2_super *osb,
48 struct ocfs2_slot_info *si);
49
50s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
51 s16 global);
52void ocfs2_clear_slot(struct ocfs2_slot_info *si,
53 s16 slot_num);
54 37
55static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, 38int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num);
56 int slot_num) 39int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
57{ 40 unsigned int *node_num);
58 BUG_ON(slot_num == OCFS2_INVALID_SLOT);
59 assert_spin_locked(&si->si_lock);
60 41
61 return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT; 42int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num);
62}
63 43
64#endif 44#endif
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
new file mode 100644
index 000000000000..ac1d74c63bf5
--- /dev/null
+++ b/fs/ocfs2/stack_o2cb.c
@@ -0,0 +1,420 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * stack_o2cb.c
5 *
6 * Code which interfaces ocfs2 with the o2cb stack.
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation, version 2.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#include <linux/crc32.h>
21#include <linux/module.h>
22
23/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
24#include <linux/fs.h>
25
26#include "cluster/masklog.h"
27#include "cluster/nodemanager.h"
28#include "cluster/heartbeat.h"
29
30#include "stackglue.h"
31
32struct o2dlm_private {
33 struct dlm_eviction_cb op_eviction_cb;
34};
35
36static struct ocfs2_stack_plugin o2cb_stack;
37
38/* These should be identical */
39#if (DLM_LOCK_IV != LKM_IVMODE)
40# error Lock modes do not match
41#endif
42#if (DLM_LOCK_NL != LKM_NLMODE)
43# error Lock modes do not match
44#endif
45#if (DLM_LOCK_CR != LKM_CRMODE)
46# error Lock modes do not match
47#endif
48#if (DLM_LOCK_CW != LKM_CWMODE)
49# error Lock modes do not match
50#endif
51#if (DLM_LOCK_PR != LKM_PRMODE)
52# error Lock modes do not match
53#endif
54#if (DLM_LOCK_PW != LKM_PWMODE)
55# error Lock modes do not match
56#endif
57#if (DLM_LOCK_EX != LKM_EXMODE)
58# error Lock modes do not match
59#endif
60static inline int mode_to_o2dlm(int mode)
61{
62 BUG_ON(mode > LKM_MAXMODE);
63
64 return mode;
65}
66
67#define map_flag(_generic, _o2dlm) \
68 if (flags & (_generic)) { \
69 flags &= ~(_generic); \
70 o2dlm_flags |= (_o2dlm); \
71 }
72static int flags_to_o2dlm(u32 flags)
73{
74 int o2dlm_flags = 0;
75
76 map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE);
77 map_flag(DLM_LKF_CANCEL, LKM_CANCEL);
78 map_flag(DLM_LKF_CONVERT, LKM_CONVERT);
79 map_flag(DLM_LKF_VALBLK, LKM_VALBLK);
80 map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK);
81 map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN);
82 map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE);
83 map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT);
84 map_flag(DLM_LKF_LOCAL, LKM_LOCAL);
85
86 /* map_flag() should have cleared every flag passed in */
87 BUG_ON(flags != 0);
88
89 return o2dlm_flags;
90}
91#undef map_flag
92
93/*
94 * Map an o2dlm status to standard errno values.
95 *
96 * o2dlm only uses a handful of these, and returns even fewer to the
97 * caller. Still, we try to assign sane values to each error.
98 *
99 * The following value pairs have special meanings to dlmglue, thus
100 * the right hand side needs to stay unique - never duplicate the
101 * mapping elsewhere in the table!
102 *
103 * DLM_NORMAL: 0
104 * DLM_NOTQUEUED: -EAGAIN
105 * DLM_CANCELGRANT: -EBUSY
106 * DLM_CANCEL: -DLM_ECANCEL
107 */
108/* Keep in sync with dlmapi.h */
109static int status_map[] = {
110 [DLM_NORMAL] = 0, /* Success */
111 [DLM_GRANTED] = -EINVAL,
112 [DLM_DENIED] = -EACCES,
113 [DLM_DENIED_NOLOCKS] = -EACCES,
114 [DLM_WORKING] = -EACCES,
115 [DLM_BLOCKED] = -EINVAL,
116 [DLM_BLOCKED_ORPHAN] = -EINVAL,
117 [DLM_DENIED_GRACE_PERIOD] = -EACCES,
118 [DLM_SYSERR] = -ENOMEM, /* It is what it is */
119 [DLM_NOSUPPORT] = -EPROTO,
120 [DLM_CANCELGRANT] = -EBUSY, /* Cancel after grant */
121 [DLM_IVLOCKID] = -EINVAL,
122 [DLM_SYNC] = -EINVAL,
123 [DLM_BADTYPE] = -EINVAL,
124 [DLM_BADRESOURCE] = -EINVAL,
125 [DLM_MAXHANDLES] = -ENOMEM,
126 [DLM_NOCLINFO] = -EINVAL,
127 [DLM_NOLOCKMGR] = -EINVAL,
128 [DLM_NOPURGED] = -EINVAL,
129 [DLM_BADARGS] = -EINVAL,
130 [DLM_VOID] = -EINVAL,
131 [DLM_NOTQUEUED] = -EAGAIN, /* Trylock failed */
132 [DLM_IVBUFLEN] = -EINVAL,
133 [DLM_CVTUNGRANT] = -EPERM,
134 [DLM_BADPARAM] = -EINVAL,
135 [DLM_VALNOTVALID] = -EINVAL,
136 [DLM_REJECTED] = -EPERM,
137 [DLM_ABORT] = -EINVAL,
138 [DLM_CANCEL] = -DLM_ECANCEL, /* Successful cancel */
139 [DLM_IVRESHANDLE] = -EINVAL,
140 [DLM_DEADLOCK] = -EDEADLK,
141 [DLM_DENIED_NOASTS] = -EINVAL,
142 [DLM_FORWARD] = -EINVAL,
143 [DLM_TIMEOUT] = -ETIMEDOUT,
144 [DLM_IVGROUPID] = -EINVAL,
145 [DLM_VERS_CONFLICT] = -EOPNOTSUPP,
146 [DLM_BAD_DEVICE_PATH] = -ENOENT,
147 [DLM_NO_DEVICE_PERMISSION] = -EPERM,
148 [DLM_NO_CONTROL_DEVICE] = -ENOENT,
149 [DLM_RECOVERING] = -ENOTCONN,
150 [DLM_MIGRATING] = -ERESTART,
151 [DLM_MAXSTATS] = -EINVAL,
152};
153
154static int dlm_status_to_errno(enum dlm_status status)
155{
156 BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0])));
157
158 return status_map[status];
159}
160
161static void o2dlm_lock_ast_wrapper(void *astarg)
162{
163 BUG_ON(o2cb_stack.sp_proto == NULL);
164
165 o2cb_stack.sp_proto->lp_lock_ast(astarg);
166}
167
168static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
169{
170 BUG_ON(o2cb_stack.sp_proto == NULL);
171
172 o2cb_stack.sp_proto->lp_blocking_ast(astarg, level);
173}
174
175static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
176{
177 int error = dlm_status_to_errno(status);
178
179 BUG_ON(o2cb_stack.sp_proto == NULL);
180
181 /*
182 * In o2dlm, you can get both the lock_ast() for the lock being
183 * granted and the unlock_ast() for the CANCEL failing. A
184 * successful cancel sends DLM_NORMAL here. If the
185 * lock grant happened before the cancel arrived, you get
186 * DLM_CANCELGRANT.
187 *
188 * There's no need for the double-ast. If we see DLM_CANCELGRANT,
189 * we just ignore it. We expect the lock_ast() to handle the
190 * granted lock.
191 */
192 if (status == DLM_CANCELGRANT)
193 return;
194
195 o2cb_stack.sp_proto->lp_unlock_ast(astarg, error);
196}
197
198static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
199 int mode,
200 union ocfs2_dlm_lksb *lksb,
201 u32 flags,
202 void *name,
203 unsigned int namelen,
204 void *astarg)
205{
206 enum dlm_status status;
207 int o2dlm_mode = mode_to_o2dlm(mode);
208 int o2dlm_flags = flags_to_o2dlm(flags);
209 int ret;
210
211 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
212 o2dlm_flags, name, namelen,
213 o2dlm_lock_ast_wrapper, astarg,
214 o2dlm_blocking_ast_wrapper);
215 ret = dlm_status_to_errno(status);
216 return ret;
217}
218
219static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
220 union ocfs2_dlm_lksb *lksb,
221 u32 flags,
222 void *astarg)
223{
224 enum dlm_status status;
225 int o2dlm_flags = flags_to_o2dlm(flags);
226 int ret;
227
228 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
229 o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg);
230 ret = dlm_status_to_errno(status);
231 return ret;
232}
233
234static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
235{
236 return dlm_status_to_errno(lksb->lksb_o2dlm.status);
237}
238
239static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb)
240{
241 return (void *)(lksb->lksb_o2dlm.lvb);
242}
243
244static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb)
245{
246 dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
247}
248
249/*
250 * Called from the dlm when it's about to evict a node. This is how the
251 * classic stack signals node death.
252 */
253static void o2dlm_eviction_cb(int node_num, void *data)
254{
255 struct ocfs2_cluster_connection *conn = data;
256
257 mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
258 node_num, conn->cc_namelen, conn->cc_name);
259
260 conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
261}
262
263static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
264{
265 int rc = 0;
266 u32 dlm_key;
267 struct dlm_ctxt *dlm;
268 struct o2dlm_private *priv;
269 struct dlm_protocol_version dlm_version;
270
271 BUG_ON(conn == NULL);
272 BUG_ON(o2cb_stack.sp_proto == NULL);
273
274 /* for now we only have one cluster/node, make sure we see it
275 * in the heartbeat universe */
276 if (!o2hb_check_local_node_heartbeating()) {
277 rc = -EINVAL;
278 goto out;
279 }
280
281 priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL);
282 if (!priv) {
283 rc = -ENOMEM;
284 goto out_free;
285 }
286
287 /* This just fills the structure in. It is safe to pass conn. */
288 dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb,
289 conn);
290
291 conn->cc_private = priv;
292
293 /* used by the dlm code to make message headers unique, each
294 * node in this domain must agree on this. */
295 dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
296 dlm_version.pv_major = conn->cc_version.pv_major;
297 dlm_version.pv_minor = conn->cc_version.pv_minor;
298
299 dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version);
300 if (IS_ERR(dlm)) {
301 rc = PTR_ERR(dlm);
302 mlog_errno(rc);
303 goto out_free;
304 }
305
306 conn->cc_version.pv_major = dlm_version.pv_major;
307 conn->cc_version.pv_minor = dlm_version.pv_minor;
308 conn->cc_lockspace = dlm;
309
310 dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
311
312out_free:
313 if (rc && conn->cc_private)
314 kfree(conn->cc_private);
315
316out:
317 return rc;
318}
319
320static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn,
321 int hangup_pending)
322{
323 struct dlm_ctxt *dlm = conn->cc_lockspace;
324 struct o2dlm_private *priv = conn->cc_private;
325
326 dlm_unregister_eviction_cb(&priv->op_eviction_cb);
327 conn->cc_private = NULL;
328 kfree(priv);
329
330 dlm_unregister_domain(dlm);
331 conn->cc_lockspace = NULL;
332
333 return 0;
334}
335
336static void o2hb_stop(const char *group)
337{
338 int ret;
339 char *argv[5], *envp[3];
340
341 argv[0] = (char *)o2nm_get_hb_ctl_path();
342 argv[1] = "-K";
343 argv[2] = "-u";
344 argv[3] = (char *)group;
345 argv[4] = NULL;
346
347 mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
348
349 /* minimal command environment taken from cpu_run_sbin_hotplug */
350 envp[0] = "HOME=/";
351 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
352 envp[2] = NULL;
353
354 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
355 if (ret < 0)
356 mlog_errno(ret);
357}
358
/*
 * Hangup is a hack for tools compatibility.  Older ocfs2-tools software
 * expects the filesystem to call "ocfs2_hb_ctl" during unmount.  This
 * happens regardless of whether the DLM got started, so it can't be done
 * in ocfs2_cluster_disconnect().  The o2hb_stop() function is brought into
 * the glue and exposed as a "hangup" API for super.c to call.
 *
 * Other stacks will eventually provide a NULL ->hangup() pointer.
 *
 * grouplen is accepted for the ->hangup() signature but not used here.
 */
static void o2cb_cluster_hangup(const char *group, int grouplen)
{
	o2hb_stop(group);
}
372
373static int o2cb_cluster_this_node(unsigned int *node)
374{
375 int node_num;
376
377 node_num = o2nm_this_node();
378 if (node_num == O2NM_INVALID_NODE_NUM)
379 return -ENOENT;
380
381 if (node_num >= O2NM_MAX_NODES)
382 return -EOVERFLOW;
383
384 *node = node_num;
385 return 0;
386}
387
/*
 * Operations table for the classic o2cb stack.  Every entry points at one
 * of the o2cb_* functions defined in this file.
 */
struct ocfs2_stack_operations o2cb_stack_ops = {
	.connect	= o2cb_cluster_connect,
	.disconnect	= o2cb_cluster_disconnect,
	.hangup		= o2cb_cluster_hangup,
	.this_node	= o2cb_cluster_this_node,
	.dlm_lock	= o2cb_dlm_lock,
	.dlm_unlock	= o2cb_dlm_unlock,
	.lock_status	= o2cb_dlm_lock_status,
	.lock_lvb	= o2cb_dlm_lvb,
	.dump_lksb	= o2cb_dump_lksb,
};

/*
 * Plugin descriptor passed to ocfs2_stack_glue_register() and
 * ocfs2_stack_glue_unregister() by the module init/exit functions.
 */
static struct ocfs2_stack_plugin o2cb_stack = {
	.sp_name	= "o2cb",
	.sp_ops		= &o2cb_stack_ops,
	.sp_owner	= THIS_MODULE,
};
405
406static int __init o2cb_stack_init(void)
407{
408 return ocfs2_stack_glue_register(&o2cb_stack);
409}
410
411static void __exit o2cb_stack_exit(void)
412{
413 ocfs2_stack_glue_unregister(&o2cb_stack);
414}
415
416MODULE_AUTHOR("Oracle");
417MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack");
418MODULE_LICENSE("GPL");
419module_init(o2cb_stack_init);
420module_exit(o2cb_stack_exit);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
new file mode 100644
index 000000000000..7428663f9cbb
--- /dev/null
+++ b/fs/ocfs2/stack_user.c
@@ -0,0 +1,883 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * stack_user.c
5 *
6 * Code which interfaces ocfs2 with fs/dlm and a userspace stack.
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation, version 2.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#include <linux/module.h>
21#include <linux/fs.h>
22#include <linux/miscdevice.h>
23#include <linux/mutex.h>
24#include <linux/reboot.h>
25#include <asm/uaccess.h>
26
27#include "ocfs2.h" /* For struct ocfs2_lock_res */
28#include "stackglue.h"
29
30
31/*
32 * The control protocol starts with a handshake. Until the handshake
33 * is complete, the control device will fail all write(2)s.
34 *
35 * The handshake is simple. First, the client reads until EOF. Each line
36 * of output is a supported protocol tag. All protocol tags are a single
37 * character followed by a two hex digit version number. Currently the
38 * only thing supported is T01, for "Text-based version 0x01". Next, the
39 * client writes the version they would like to use, including the newline.
40 * Thus, the protocol tag is 'T01\n'. If the version tag written is
41 * unknown, -EINVAL is returned. Once the negotiation is complete, the
42 * client can start sending messages.
43 *
44 * The T01 protocol has three messages. First is the "SETN" message.
45 * It has the following syntax:
46 *
47 * SETN<space><8-char-hex-nodenum><newline>
48 *
49 * This is 14 characters.
50 *
51 * The "SETN" message must be the first message following the protocol.
52 * It tells ocfs2_control the local node number.
53 *
54 * Next comes the "SETV" message. It has the following syntax:
55 *
56 * SETV<space><2-char-hex-major><space><2-char-hex-minor><newline>
57 *
58 * This is 11 characters.
59 *
60 * The "SETV" message sets the filesystem locking protocol version as
61 * negotiated by the client. The client negotiates based on the maximum
62 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major
63 * number from the "SETV" message must match
64 * user_stack.sp_proto->lp_max_version.pv_major, and the minor number
65 * must be less than or equal to ...->lp_max_version.pv_minor.
66 *
67 * Once this information has been set, mounts will be allowed. From this
68 * point on, the "DOWN" message can be sent for node down notification.
69 * It has the following syntax:
70 *
71 * DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline>
72 *
73 * eg:
74 *
75 * DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n
76 *
77 * This is 47 characters.
78 */
79
/*
 * Whether or not the client has done the handshake.
 * For now, we have just one protocol version.
 */
#define OCFS2_CONTROL_PROTO "T01\n"
#define OCFS2_CONTROL_PROTO_LEN 4	/* 'T', '0', '1', '\n' */

/* Handshake states */
#define OCFS2_CONTROL_HANDSHAKE_INVALID (0)	/* nothing done yet (zeroed private data) */
#define OCFS2_CONTROL_HANDSHAKE_READ (1)	/* client has read the whole protocol list */
#define OCFS2_CONTROL_HANDSHAKE_PROTOCOL (2)	/* client has written a known protocol tag */
#define OCFS2_CONTROL_HANDSHAKE_VALID (3)	/* node number and version are installed */

/* Messages */
#define OCFS2_CONTROL_MESSAGE_OP_LEN 4		/* length of every message tag */
#define OCFS2_CONTROL_MESSAGE_SETNODE_OP "SETN"
#define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN 14	/* 4 + 1 + 8 + 1 */
#define OCFS2_CONTROL_MESSAGE_SETVERSION_OP "SETV"
#define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN 11	/* 4 + 1 + 2 + 1 + 2 + 1 */
#define OCFS2_CONTROL_MESSAGE_DOWN_OP "DOWN"
#define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN 47	/* 4 + 1 + 32 + 1 + 8 + 1 */
#define OCFS2_TEXT_UUID_LEN 32			/* hex characters in a text uuid */
#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2	/* hex characters in a version field */
#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8	/* hex characters in a node number */
104
/*
 * ocfs2_live_connection is refcounted because the filesystem and
 * miscdevice sides can detach in different order.  Let's just be safe.
 *
 * NOTE(review): no reference count field is present in this structure as
 * shown; the statement above may describe intent rather than the current
 * implementation - confirm.
 */
struct ocfs2_live_connection {
	struct list_head		oc_list;	/* entry on ocfs2_live_connection_list */
	struct ocfs2_cluster_connection	*oc_conn;	/* the cluster connection this tracks */
};
113
/*
 * Per-open state of the ocfs2_control device, stored in
 * file->private_data.
 */
struct ocfs2_control_private {
	struct list_head op_list;	/* entry on ocfs2_control_private_list */
	int op_state;			/* one of OCFS2_CONTROL_HANDSHAKE_* */
	int op_this_node;		/* node number from SETN; -1 until set */
	struct ocfs2_protocol_version op_proto;	/* version from SETV; major 0 until set */
};
120
/*
 * The structures below overlay the raw bytes of a message written to the
 * control device, so field order and sizes must match the text format
 * exactly (all members are char, so no padding is expected).
 */

/* SETN<space><8-char-hex-nodenum><newline> */
struct ocfs2_control_message_setn {
	char	tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
	char	space;
	char	nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
	char	newline;	/* overwritten with '\0' to terminate nodestr */
};

/* SETV<space><2-char-hex-major><space><2-char-hex-minor><newline> */
struct ocfs2_control_message_setv {
	char	tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
	char	space1;
	char	major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
	char	space2;		/* overwritten with '\0' to terminate major */
	char	minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
	char	newline;	/* overwritten with '\0' to terminate minor */
};

/* DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> */
struct ocfs2_control_message_down {
	char	tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
	char	space1;
	char	uuid[OCFS2_TEXT_UUID_LEN];
	char	space2;		/* overwritten with '\0' to terminate uuid */
	char	nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
	char	newline;	/* overwritten with '\0' to terminate nodestr */
};

/* Buffer large enough for any message; tag aliases the first four bytes. */
union ocfs2_control_message {
	char					tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
	struct ocfs2_control_message_setn	u_setn;
	struct ocfs2_control_message_setv	u_setv;
	struct ocfs2_control_message_down	u_down;
};
155
/* Forward declaration; the plugin descriptor is defined later in this file. */
static struct ocfs2_stack_plugin user_stack;

/* Number of open control files that have reached HANDSHAKE_VALID. */
static atomic_t ocfs2_control_opened;
/* Local node number installed through the control device; -1 when unset. */
static int ocfs2_control_this_node = -1;
/* Locking protocol version installed through the control device. */
static struct ocfs2_protocol_version running_proto;

/* Both lists and the two globals above are modified under ocfs2_control_lock. */
static LIST_HEAD(ocfs2_live_connection_list);
static LIST_HEAD(ocfs2_control_private_list);
static DEFINE_MUTEX(ocfs2_control_lock);
165
166static inline void ocfs2_control_set_handshake_state(struct file *file,
167 int state)
168{
169 struct ocfs2_control_private *p = file->private_data;
170 p->op_state = state;
171}
172
173static inline int ocfs2_control_get_handshake_state(struct file *file)
174{
175 struct ocfs2_control_private *p = file->private_data;
176 return p->op_state;
177}
178
179static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
180{
181 size_t len = strlen(name);
182 struct ocfs2_live_connection *c;
183
184 BUG_ON(!mutex_is_locked(&ocfs2_control_lock));
185
186 list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) {
187 if ((c->oc_conn->cc_namelen == len) &&
188 !strncmp(c->oc_conn->cc_name, name, len))
189 return c;
190 }
191
192 return c;
193}
194
195/*
196 * ocfs2_live_connection structures are created underneath the ocfs2
197 * mount path. Since the VFS prevents multiple calls to
198 * fill_super(), we can't get dupes here.
199 */
200static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
201 struct ocfs2_live_connection **c_ret)
202{
203 int rc = 0;
204 struct ocfs2_live_connection *c;
205
206 c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
207 if (!c)
208 return -ENOMEM;
209
210 mutex_lock(&ocfs2_control_lock);
211 c->oc_conn = conn;
212
213 if (atomic_read(&ocfs2_control_opened))
214 list_add(&c->oc_list, &ocfs2_live_connection_list);
215 else {
216 printk(KERN_ERR
217 "ocfs2: Userspace control daemon is not present\n");
218 rc = -ESRCH;
219 }
220
221 mutex_unlock(&ocfs2_control_lock);
222
223 if (!rc)
224 *c_ret = c;
225 else
226 kfree(c);
227
228 return rc;
229}
230
231/*
232 * This function disconnects the cluster connection from ocfs2_control.
233 * Afterwards, userspace can't affect the cluster connection.
234 */
235static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c)
236{
237 mutex_lock(&ocfs2_control_lock);
238 list_del_init(&c->oc_list);
239 c->oc_conn = NULL;
240 mutex_unlock(&ocfs2_control_lock);
241
242 kfree(c);
243}
244
245static int ocfs2_control_cfu(void *target, size_t target_len,
246 const char __user *buf, size_t count)
247{
248 /* The T01 expects write(2) calls to have exactly one command */
249 if ((count != target_len) ||
250 (count > sizeof(union ocfs2_control_message)))
251 return -EINVAL;
252
253 if (copy_from_user(target, buf, target_len))
254 return -EFAULT;
255
256 return 0;
257}
258
259static ssize_t ocfs2_control_validate_protocol(struct file *file,
260 const char __user *buf,
261 size_t count)
262{
263 ssize_t ret;
264 char kbuf[OCFS2_CONTROL_PROTO_LEN];
265
266 ret = ocfs2_control_cfu(kbuf, OCFS2_CONTROL_PROTO_LEN,
267 buf, count);
268 if (ret)
269 return ret;
270
271 if (strncmp(kbuf, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN))
272 return -EINVAL;
273
274 ocfs2_control_set_handshake_state(file,
275 OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
276
277 return count;
278}
279
280static void ocfs2_control_send_down(const char *uuid,
281 int nodenum)
282{
283 struct ocfs2_live_connection *c;
284
285 mutex_lock(&ocfs2_control_lock);
286
287 c = ocfs2_connection_find(uuid);
288 if (c) {
289 BUG_ON(c->oc_conn == NULL);
290 c->oc_conn->cc_recovery_handler(nodenum,
291 c->oc_conn->cc_recovery_data);
292 }
293
294 mutex_unlock(&ocfs2_control_lock);
295}
296
297/*
298 * Called whenever configuration elements are sent to /dev/ocfs2_control.
299 * If all configuration elements are present, try to set the global
300 * values. If there is a problem, return an error. Skip any missing
301 * elements, and only bump ocfs2_control_opened when we have all elements
302 * and are successful.
303 */
304static int ocfs2_control_install_private(struct file *file)
305{
306 int rc = 0;
307 int set_p = 1;
308 struct ocfs2_control_private *p = file->private_data;
309
310 BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
311
312 mutex_lock(&ocfs2_control_lock);
313
314 if (p->op_this_node < 0) {
315 set_p = 0;
316 } else if ((ocfs2_control_this_node >= 0) &&
317 (ocfs2_control_this_node != p->op_this_node)) {
318 rc = -EINVAL;
319 goto out_unlock;
320 }
321
322 if (!p->op_proto.pv_major) {
323 set_p = 0;
324 } else if (!list_empty(&ocfs2_live_connection_list) &&
325 ((running_proto.pv_major != p->op_proto.pv_major) ||
326 (running_proto.pv_minor != p->op_proto.pv_minor))) {
327 rc = -EINVAL;
328 goto out_unlock;
329 }
330
331 if (set_p) {
332 ocfs2_control_this_node = p->op_this_node;
333 running_proto.pv_major = p->op_proto.pv_major;
334 running_proto.pv_minor = p->op_proto.pv_minor;
335 }
336
337out_unlock:
338 mutex_unlock(&ocfs2_control_lock);
339
340 if (!rc && set_p) {
341 /* We set the global values successfully */
342 atomic_inc(&ocfs2_control_opened);
343 ocfs2_control_set_handshake_state(file,
344 OCFS2_CONTROL_HANDSHAKE_VALID);
345 }
346
347 return rc;
348}
349
350static int ocfs2_control_get_this_node(void)
351{
352 int rc;
353
354 mutex_lock(&ocfs2_control_lock);
355 if (ocfs2_control_this_node < 0)
356 rc = -EINVAL;
357 else
358 rc = ocfs2_control_this_node;
359 mutex_unlock(&ocfs2_control_lock);
360
361 return rc;
362}
363
364static int ocfs2_control_do_setnode_msg(struct file *file,
365 struct ocfs2_control_message_setn *msg)
366{
367 long nodenum;
368 char *ptr = NULL;
369 struct ocfs2_control_private *p = file->private_data;
370
371 if (ocfs2_control_get_handshake_state(file) !=
372 OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
373 return -EINVAL;
374
375 if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
376 OCFS2_CONTROL_MESSAGE_OP_LEN))
377 return -EINVAL;
378
379 if ((msg->space != ' ') || (msg->newline != '\n'))
380 return -EINVAL;
381 msg->space = msg->newline = '\0';
382
383 nodenum = simple_strtol(msg->nodestr, &ptr, 16);
384 if (!ptr || *ptr)
385 return -EINVAL;
386
387 if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
388 (nodenum > INT_MAX) || (nodenum < 0))
389 return -ERANGE;
390 p->op_this_node = nodenum;
391
392 return ocfs2_control_install_private(file);
393}
394
395static int ocfs2_control_do_setversion_msg(struct file *file,
396 struct ocfs2_control_message_setv *msg)
397 {
398 long major, minor;
399 char *ptr = NULL;
400 struct ocfs2_control_private *p = file->private_data;
401 struct ocfs2_protocol_version *max =
402 &user_stack.sp_proto->lp_max_version;
403
404 if (ocfs2_control_get_handshake_state(file) !=
405 OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
406 return -EINVAL;
407
408 if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
409 OCFS2_CONTROL_MESSAGE_OP_LEN))
410 return -EINVAL;
411
412 if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
413 (msg->newline != '\n'))
414 return -EINVAL;
415 msg->space1 = msg->space2 = msg->newline = '\0';
416
417 major = simple_strtol(msg->major, &ptr, 16);
418 if (!ptr || *ptr)
419 return -EINVAL;
420 minor = simple_strtol(msg->minor, &ptr, 16);
421 if (!ptr || *ptr)
422 return -EINVAL;
423
424 /*
425 * The major must be between 1 and 255, inclusive. The minor
426 * must be between 0 and 255, inclusive. The version passed in
427 * must be within the maximum version supported by the filesystem.
428 */
429 if ((major == LONG_MIN) || (major == LONG_MAX) ||
430 (major > (u8)-1) || (major < 1))
431 return -ERANGE;
432 if ((minor == LONG_MIN) || (minor == LONG_MAX) ||
433 (minor > (u8)-1) || (minor < 0))
434 return -ERANGE;
435 if ((major != max->pv_major) ||
436 (minor > max->pv_minor))
437 return -EINVAL;
438
439 p->op_proto.pv_major = major;
440 p->op_proto.pv_minor = minor;
441
442 return ocfs2_control_install_private(file);
443}
444
445static int ocfs2_control_do_down_msg(struct file *file,
446 struct ocfs2_control_message_down *msg)
447{
448 long nodenum;
449 char *p = NULL;
450
451 if (ocfs2_control_get_handshake_state(file) !=
452 OCFS2_CONTROL_HANDSHAKE_VALID)
453 return -EINVAL;
454
455 if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
456 OCFS2_CONTROL_MESSAGE_OP_LEN))
457 return -EINVAL;
458
459 if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
460 (msg->newline != '\n'))
461 return -EINVAL;
462 msg->space1 = msg->space2 = msg->newline = '\0';
463
464 nodenum = simple_strtol(msg->nodestr, &p, 16);
465 if (!p || *p)
466 return -EINVAL;
467
468 if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
469 (nodenum > INT_MAX) || (nodenum < 0))
470 return -ERANGE;
471
472 ocfs2_control_send_down(msg->uuid, nodenum);
473
474 return 0;
475}
476
477static ssize_t ocfs2_control_message(struct file *file,
478 const char __user *buf,
479 size_t count)
480{
481 ssize_t ret;
482 union ocfs2_control_message msg;
483
484 /* Try to catch padding issues */
485 WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) !=
486 (sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1)));
487
488 memset(&msg, 0, sizeof(union ocfs2_control_message));
489 ret = ocfs2_control_cfu(&msg, count, buf, count);
490 if (ret)
491 goto out;
492
493 if ((count == OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN) &&
494 !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
495 OCFS2_CONTROL_MESSAGE_OP_LEN))
496 ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn);
497 else if ((count == OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN) &&
498 !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
499 OCFS2_CONTROL_MESSAGE_OP_LEN))
500 ret = ocfs2_control_do_setversion_msg(file, &msg.u_setv);
501 else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) &&
502 !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
503 OCFS2_CONTROL_MESSAGE_OP_LEN))
504 ret = ocfs2_control_do_down_msg(file, &msg.u_down);
505 else
506 ret = -EINVAL;
507
508out:
509 return ret ? ret : count;
510}
511
512static ssize_t ocfs2_control_write(struct file *file,
513 const char __user *buf,
514 size_t count,
515 loff_t *ppos)
516{
517 ssize_t ret;
518
519 switch (ocfs2_control_get_handshake_state(file)) {
520 case OCFS2_CONTROL_HANDSHAKE_INVALID:
521 ret = -EINVAL;
522 break;
523
524 case OCFS2_CONTROL_HANDSHAKE_READ:
525 ret = ocfs2_control_validate_protocol(file, buf,
526 count);
527 break;
528
529 case OCFS2_CONTROL_HANDSHAKE_PROTOCOL:
530 case OCFS2_CONTROL_HANDSHAKE_VALID:
531 ret = ocfs2_control_message(file, buf, count);
532 break;
533
534 default:
535 BUG();
536 ret = -EIO;
537 break;
538 }
539
540 return ret;
541}
542
543/*
544 * This is a naive version. If we ever have a new protocol, we'll expand
545 * it. Probably using seq_file.
546 */
547static ssize_t ocfs2_control_read(struct file *file,
548 char __user *buf,
549 size_t count,
550 loff_t *ppos)
551{
552 char *proto_string = OCFS2_CONTROL_PROTO;
553 size_t to_write = 0;
554
555 if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
556 return 0;
557
558 to_write = OCFS2_CONTROL_PROTO_LEN - *ppos;
559 if (to_write > count)
560 to_write = count;
561 if (copy_to_user(buf, proto_string + *ppos, to_write))
562 return -EFAULT;
563
564 *ppos += to_write;
565
566 /* Have we read the whole protocol list? */
567 if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
568 ocfs2_control_set_handshake_state(file,
569 OCFS2_CONTROL_HANDSHAKE_READ);
570
571 return to_write;
572}
573
574static int ocfs2_control_release(struct inode *inode, struct file *file)
575{
576 struct ocfs2_control_private *p = file->private_data;
577
578 mutex_lock(&ocfs2_control_lock);
579
580 if (ocfs2_control_get_handshake_state(file) !=
581 OCFS2_CONTROL_HANDSHAKE_VALID)
582 goto out;
583
584 if (atomic_dec_and_test(&ocfs2_control_opened)) {
585 if (!list_empty(&ocfs2_live_connection_list)) {
586 /* XXX: Do bad things! */
587 printk(KERN_ERR
588 "ocfs2: Unexpected release of ocfs2_control!\n"
589 " Loss of cluster connection requires "
590 "an emergency restart!\n");
591 emergency_restart();
592 }
593 /*
594 * Last valid close clears the node number and resets
595 * the locking protocol version
596 */
597 ocfs2_control_this_node = -1;
598 running_proto.pv_major = 0;
599 running_proto.pv_major = 0;
600 }
601
602out:
603 list_del_init(&p->op_list);
604 file->private_data = NULL;
605
606 mutex_unlock(&ocfs2_control_lock);
607
608 kfree(p);
609
610 return 0;
611}
612
613static int ocfs2_control_open(struct inode *inode, struct file *file)
614{
615 struct ocfs2_control_private *p;
616
617 p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL);
618 if (!p)
619 return -ENOMEM;
620 p->op_this_node = -1;
621
622 mutex_lock(&ocfs2_control_lock);
623 file->private_data = p;
624 list_add(&p->op_list, &ocfs2_control_private_list);
625 mutex_unlock(&ocfs2_control_lock);
626
627 return 0;
628}
629
/* File operations for the control device; all handlers are in this file. */
static const struct file_operations ocfs2_control_fops = {
	.open    = ocfs2_control_open,
	.release = ocfs2_control_release,
	.read    = ocfs2_control_read,
	.write   = ocfs2_control_write,
	.owner   = THIS_MODULE,
};

/*
 * Misc device named "ocfs2_control" with a dynamically assigned minor,
 * registered by ocfs2_control_init() and removed by ocfs2_control_exit().
 */
struct miscdevice ocfs2_control_device = {
	.minor		= MISC_DYNAMIC_MINOR,
	.name		= "ocfs2_control",
	.fops		= &ocfs2_control_fops,
};
643
644static int ocfs2_control_init(void)
645{
646 int rc;
647
648 atomic_set(&ocfs2_control_opened, 0);
649
650 rc = misc_register(&ocfs2_control_device);
651 if (rc)
652 printk(KERN_ERR
653 "ocfs2: Unable to register ocfs2_control device "
654 "(errno %d)\n",
655 -rc);
656
657 return rc;
658}
659
660static void ocfs2_control_exit(void)
661{
662 int rc;
663
664 rc = misc_deregister(&ocfs2_control_device);
665 if (rc)
666 printk(KERN_ERR
667 "ocfs2: Unable to deregister ocfs2_control device "
668 "(errno %d)\n",
669 -rc);
670}
671
672static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
673{
674 struct ocfs2_lock_res *res = astarg;
675 return &res->l_lksb.lksb_fsdlm;
676}
677
678static void fsdlm_lock_ast_wrapper(void *astarg)
679{
680 struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg);
681 int status = lksb->sb_status;
682
683 BUG_ON(user_stack.sp_proto == NULL);
684
685 /*
686 * For now we're punting on the issue of other non-standard errors
687 * where we can't tell if the unlock_ast or lock_ast should be called.
688 * The main "other error" that's possible is EINVAL which means the
689 * function was called with invalid args, which shouldn't be possible
690 * since the caller here is under our control. Other non-standard
691 * errors probably fall into the same category, or otherwise are fatal
692 * which means we can't carry on anyway.
693 */
694
695 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
696 user_stack.sp_proto->lp_unlock_ast(astarg, 0);
697 else
698 user_stack.sp_proto->lp_lock_ast(astarg);
699}
700
701static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
702{
703 BUG_ON(user_stack.sp_proto == NULL);
704
705 user_stack.sp_proto->lp_blocking_ast(astarg, level);
706}
707
708static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
709 int mode,
710 union ocfs2_dlm_lksb *lksb,
711 u32 flags,
712 void *name,
713 unsigned int namelen,
714 void *astarg)
715{
716 int ret;
717
718 if (!lksb->lksb_fsdlm.sb_lvbptr)
719 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
720 sizeof(struct dlm_lksb);
721
722 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
723 flags|DLM_LKF_NODLCKWT, name, namelen, 0,
724 fsdlm_lock_ast_wrapper, astarg,
725 fsdlm_blocking_ast_wrapper);
726 return ret;
727}
728
729static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
730 union ocfs2_dlm_lksb *lksb,
731 u32 flags,
732 void *astarg)
733{
734 int ret;
735
736 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
737 flags, &lksb->lksb_fsdlm, astarg);
738 return ret;
739}
740
741static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
742{
743 return lksb->lksb_fsdlm.sb_status;
744}
745
746static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
747{
748 return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
749}
750
/* ->dump_lksb(): intentionally does nothing for the user stack. */
static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
{
	return;
}
754
755/*
756 * Compare a requested locking protocol version against the current one.
757 *
758 * If the major numbers are different, they are incompatible.
759 * If the current minor is greater than the request, they are incompatible.
760 * If the current minor is less than or equal to the request, they are
761 * compatible, and the requester should run at the current minor version.
762 */
763static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
764 struct ocfs2_protocol_version *request)
765{
766 if (existing->pv_major != request->pv_major)
767 return 1;
768
769 if (existing->pv_minor > request->pv_minor)
770 return 1;
771
772 if (existing->pv_minor < request->pv_minor)
773 request->pv_minor = existing->pv_minor;
774
775 return 0;
776}
777
778static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
779{
780 dlm_lockspace_t *fsdlm;
781 struct ocfs2_live_connection *control;
782 int rc = 0;
783
784 BUG_ON(conn == NULL);
785
786 rc = ocfs2_live_connection_new(conn, &control);
787 if (rc)
788 goto out;
789
790 /*
791 * running_proto must have been set before we allowed any mounts
792 * to proceed.
793 */
794 if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
795 printk(KERN_ERR
796 "Unable to mount with fs locking protocol version "
797 "%u.%u because the userspace control daemon has "
798 "negotiated %u.%u\n",
799 conn->cc_version.pv_major, conn->cc_version.pv_minor,
800 running_proto.pv_major, running_proto.pv_minor);
801 rc = -EPROTO;
802 ocfs2_live_connection_drop(control);
803 goto out;
804 }
805
806 rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name),
807 &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN);
808 if (rc) {
809 ocfs2_live_connection_drop(control);
810 goto out;
811 }
812
813 conn->cc_private = control;
814 conn->cc_lockspace = fsdlm;
815out:
816 return rc;
817}
818
819static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn,
820 int hangup_pending)
821{
822 dlm_release_lockspace(conn->cc_lockspace, 2);
823 conn->cc_lockspace = NULL;
824 ocfs2_live_connection_drop(conn->cc_private);
825 conn->cc_private = NULL;
826 return 0;
827}
828
/*
 * ->this_node() for the user stack: store the node number installed via
 * the control device in *this_node.  Returns 0, or the negative value from
 * ocfs2_control_get_this_node() (in which case *this_node is untouched).
 */
static int user_cluster_this_node(unsigned int *this_node)
{
	int node = ocfs2_control_get_this_node();

	if (node < 0)
		return node;

	*this_node = node;
	return 0;
}
840
/*
 * Operations table for the userspace stack.  No .hangup entry is set
 * here, unlike the o2cb table.
 */
static struct ocfs2_stack_operations user_stack_ops = {
	.connect	= user_cluster_connect,
	.disconnect	= user_cluster_disconnect,
	.this_node	= user_cluster_this_node,
	.dlm_lock	= user_dlm_lock,
	.dlm_unlock	= user_dlm_unlock,
	.lock_status	= user_dlm_lock_status,
	.lock_lvb	= user_dlm_lvb,
	.dump_lksb	= user_dlm_dump_lksb,
};

/*
 * Plugin descriptor passed to ocfs2_stack_glue_register() and
 * ocfs2_stack_glue_unregister() by the module init/exit functions.
 */
static struct ocfs2_stack_plugin user_stack = {
	.sp_name	= "user",
	.sp_ops		= &user_stack_ops,
	.sp_owner	= THIS_MODULE,
};
857
858
859static int __init user_stack_init(void)
860{
861 int rc;
862
863 rc = ocfs2_control_init();
864 if (!rc) {
865 rc = ocfs2_stack_glue_register(&user_stack);
866 if (rc)
867 ocfs2_control_exit();
868 }
869
870 return rc;
871}
872
873static void __exit user_stack_exit(void)
874{
875 ocfs2_stack_glue_unregister(&user_stack);
876 ocfs2_control_exit();
877}
878
879MODULE_AUTHOR("Oracle");
880MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks");
881MODULE_LICENSE("GPL");
882module_init(user_stack_init);
883module_exit(user_stack_exit);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
new file mode 100644
index 000000000000..119f60cea9cc
--- /dev/null
+++ b/fs/ocfs2/stackglue.c
@@ -0,0 +1,568 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * stackglue.c
5 *
6 * Code which implements an OCFS2 specific interface to underlying
7 * cluster stacks.
8 *
9 * Copyright (C) 2007 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation, version 2.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 */
20
21#include <linux/list.h>
22#include <linux/spinlock.h>
23#include <linux/module.h>
24#include <linux/slab.h>
25#include <linux/kmod.h>
26#include <linux/fs.h>
27#include <linux/kobject.h>
28#include <linux/sysfs.h>
29
30#include "ocfs2_fs.h"
31
32#include "stackglue.h"
33
34#define OCFS2_STACK_PLUGIN_O2CB "o2cb"
35#define OCFS2_STACK_PLUGIN_USER "user"
36
37static struct ocfs2_locking_protocol *lproto;
38static DEFINE_SPINLOCK(ocfs2_stack_lock);
39static LIST_HEAD(ocfs2_stack_list);
40static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
41
42/*
43 * The stack currently in use. If not null, active_stack->sp_count > 0,
44 * the module is pinned, and the locking protocol cannot be changed.
45 */
46static struct ocfs2_stack_plugin *active_stack;
47
48static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
49{
50 struct ocfs2_stack_plugin *p;
51
52 assert_spin_locked(&ocfs2_stack_lock);
53
54 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
55 if (!strcmp(p->sp_name, name))
56 return p;
57 }
58
59 return NULL;
60}
61
62static int ocfs2_stack_driver_request(const char *stack_name,
63 const char *plugin_name)
64{
65 int rc;
66 struct ocfs2_stack_plugin *p;
67
68 spin_lock(&ocfs2_stack_lock);
69
70 /*
71 * If the stack passed by the filesystem isn't the selected one,
72 * we can't continue.
73 */
74 if (strcmp(stack_name, cluster_stack_name)) {
75 rc = -EBUSY;
76 goto out;
77 }
78
79 if (active_stack) {
80 /*
81 * If the active stack isn't the one we want, it cannot
82 * be selected right now.
83 */
84 if (!strcmp(active_stack->sp_name, plugin_name))
85 rc = 0;
86 else
87 rc = -EBUSY;
88 goto out;
89 }
90
91 p = ocfs2_stack_lookup(plugin_name);
92 if (!p || !try_module_get(p->sp_owner)) {
93 rc = -ENOENT;
94 goto out;
95 }
96
97 /* Ok, the stack is pinned */
98 p->sp_count++;
99 active_stack = p;
100
101 rc = 0;
102
103out:
104 spin_unlock(&ocfs2_stack_lock);
105 return rc;
106}
107
108/*
109 * This function looks up the appropriate stack and makes it active. If
110 * there is no stack, it tries to load it. It will fail if the stack still
111 * cannot be found. It will also fail if a different stack is in use.
112 */
113static int ocfs2_stack_driver_get(const char *stack_name)
114{
115 int rc;
116 char *plugin_name = OCFS2_STACK_PLUGIN_O2CB;
117
118 /*
119 * Classic stack does not pass in a stack name. This is
120 * compatible with older tools as well.
121 */
122 if (!stack_name || !*stack_name)
123 stack_name = OCFS2_STACK_PLUGIN_O2CB;
124
125 if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) {
126 printk(KERN_ERR
127 "ocfs2 passed an invalid cluster stack label: \"%s\"\n",
128 stack_name);
129 return -EINVAL;
130 }
131
132 /* Anything that isn't the classic stack is a user stack */
133 if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB))
134 plugin_name = OCFS2_STACK_PLUGIN_USER;
135
136 rc = ocfs2_stack_driver_request(stack_name, plugin_name);
137 if (rc == -ENOENT) {
138 request_module("ocfs2_stack_%s", plugin_name);
139 rc = ocfs2_stack_driver_request(stack_name, plugin_name);
140 }
141
142 if (rc == -ENOENT) {
143 printk(KERN_ERR
144 "ocfs2: Cluster stack driver \"%s\" cannot be found\n",
145 plugin_name);
146 } else if (rc == -EBUSY) {
147 printk(KERN_ERR
148 "ocfs2: A different cluster stack is in use\n");
149 }
150
151 return rc;
152}
153
154static void ocfs2_stack_driver_put(void)
155{
156 spin_lock(&ocfs2_stack_lock);
157 BUG_ON(active_stack == NULL);
158 BUG_ON(active_stack->sp_count == 0);
159
160 active_stack->sp_count--;
161 if (!active_stack->sp_count) {
162 module_put(active_stack->sp_owner);
163 active_stack = NULL;
164 }
165 spin_unlock(&ocfs2_stack_lock);
166}
167
168int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
169{
170 int rc;
171
172 spin_lock(&ocfs2_stack_lock);
173 if (!ocfs2_stack_lookup(plugin->sp_name)) {
174 plugin->sp_count = 0;
175 plugin->sp_proto = lproto;
176 list_add(&plugin->sp_list, &ocfs2_stack_list);
177 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
178 plugin->sp_name);
179 rc = 0;
180 } else {
181 printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n",
182 plugin->sp_name);
183 rc = -EEXIST;
184 }
185 spin_unlock(&ocfs2_stack_lock);
186
187 return rc;
188}
189EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register);
190
191void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
192{
193 struct ocfs2_stack_plugin *p;
194
195 spin_lock(&ocfs2_stack_lock);
196 p = ocfs2_stack_lookup(plugin->sp_name);
197 if (p) {
198 BUG_ON(p != plugin);
199 BUG_ON(plugin == active_stack);
200 BUG_ON(plugin->sp_count != 0);
201 list_del_init(&plugin->sp_list);
202 printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n",
203 plugin->sp_name);
204 } else {
205 printk(KERN_ERR "Stack \"%s\" is not registered\n",
206 plugin->sp_name);
207 }
208 spin_unlock(&ocfs2_stack_lock);
209}
210EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
211
212void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
213{
214 struct ocfs2_stack_plugin *p;
215
216 BUG_ON(proto == NULL);
217
218 spin_lock(&ocfs2_stack_lock);
219 BUG_ON(active_stack != NULL);
220
221 lproto = proto;
222 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
223 p->sp_proto = lproto;
224 }
225
226 spin_unlock(&ocfs2_stack_lock);
227}
228EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol);
229
230
231/*
232 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take
233 * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the
234 * underlying stack plugins need to pilfer the lksb off of the lock_res.
235 * If some other structure needs to be passed as an astarg, the plugins
236 * will need to be given a different avenue to the lksb.
237 */
238int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
239 int mode,
240 union ocfs2_dlm_lksb *lksb,
241 u32 flags,
242 void *name,
243 unsigned int namelen,
244 struct ocfs2_lock_res *astarg)
245{
246 BUG_ON(lproto == NULL);
247
248 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
249 name, namelen, astarg);
250}
251EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
252
253int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
254 union ocfs2_dlm_lksb *lksb,
255 u32 flags,
256 struct ocfs2_lock_res *astarg)
257{
258 BUG_ON(lproto == NULL);
259
260 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg);
261}
262EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
263
264int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
265{
266 return active_stack->sp_ops->lock_status(lksb);
267}
268EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
269
270/*
271 * Why don't we cast to ocfs2_meta_lvb? The "clean" answer is that we
272 * don't cast at the glue level. The real answer is that the header
273 * ordering is nigh impossible.
274 */
275void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
276{
277 return active_stack->sp_ops->lock_lvb(lksb);
278}
279EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
280
281void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
282{
283 active_stack->sp_ops->dump_lksb(lksb);
284}
285EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
286
287int ocfs2_cluster_connect(const char *stack_name,
288 const char *group,
289 int grouplen,
290 void (*recovery_handler)(int node_num,
291 void *recovery_data),
292 void *recovery_data,
293 struct ocfs2_cluster_connection **conn)
294{
295 int rc = 0;
296 struct ocfs2_cluster_connection *new_conn;
297
298 BUG_ON(group == NULL);
299 BUG_ON(conn == NULL);
300 BUG_ON(recovery_handler == NULL);
301
302 if (grouplen > GROUP_NAME_MAX) {
303 rc = -EINVAL;
304 goto out;
305 }
306
307 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
308 GFP_KERNEL);
309 if (!new_conn) {
310 rc = -ENOMEM;
311 goto out;
312 }
313
314 memcpy(new_conn->cc_name, group, grouplen);
315 new_conn->cc_namelen = grouplen;
316 new_conn->cc_recovery_handler = recovery_handler;
317 new_conn->cc_recovery_data = recovery_data;
318
319 /* Start the new connection at our maximum compatibility level */
320 new_conn->cc_version = lproto->lp_max_version;
321
322 /* This will pin the stack driver if successful */
323 rc = ocfs2_stack_driver_get(stack_name);
324 if (rc)
325 goto out_free;
326
327 rc = active_stack->sp_ops->connect(new_conn);
328 if (rc) {
329 ocfs2_stack_driver_put();
330 goto out_free;
331 }
332
333 *conn = new_conn;
334
335out_free:
336 if (rc)
337 kfree(new_conn);
338
339out:
340 return rc;
341}
342EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
343
344/* If hangup_pending is 0, the stack driver will be dropped */
345int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
346 int hangup_pending)
347{
348 int ret;
349
350 BUG_ON(conn == NULL);
351
352 ret = active_stack->sp_ops->disconnect(conn, hangup_pending);
353
354 /* XXX Should we free it anyway? */
355 if (!ret) {
356 kfree(conn);
357 if (!hangup_pending)
358 ocfs2_stack_driver_put();
359 }
360
361 return ret;
362}
363EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect);
364
365void ocfs2_cluster_hangup(const char *group, int grouplen)
366{
367 BUG_ON(group == NULL);
368 BUG_ON(group[grouplen] != '\0');
369
370 if (active_stack->sp_ops->hangup)
371 active_stack->sp_ops->hangup(group, grouplen);
372
373 /* cluster_disconnect() was called with hangup_pending==1 */
374 ocfs2_stack_driver_put();
375}
376EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
377
378int ocfs2_cluster_this_node(unsigned int *node)
379{
380 return active_stack->sp_ops->this_node(node);
381}
382EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
383
384
385/*
386 * Sysfs bits
387 */
388
389static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
390 struct kobj_attribute *attr,
391 char *buf)
392{
393 ssize_t ret = 0;
394
395 spin_lock(&ocfs2_stack_lock);
396 if (lproto)
397 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
398 lproto->lp_max_version.pv_major,
399 lproto->lp_max_version.pv_minor);
400 spin_unlock(&ocfs2_stack_lock);
401
402 return ret;
403}
404
405static struct kobj_attribute ocfs2_attr_max_locking_protocol =
406 __ATTR(max_locking_protocol, S_IFREG | S_IRUGO,
407 ocfs2_max_locking_protocol_show, NULL);
408
409static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
410 struct kobj_attribute *attr,
411 char *buf)
412{
413 ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
414 struct ocfs2_stack_plugin *p;
415
416 spin_lock(&ocfs2_stack_lock);
417 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
418 ret = snprintf(buf, remain, "%s\n",
419 p->sp_name);
420 if (ret < 0) {
421 total = ret;
422 break;
423 }
424 if (ret == remain) {
425 /* snprintf() didn't fit */
426 total = -E2BIG;
427 break;
428 }
429 total += ret;
430 remain -= ret;
431 }
432 spin_unlock(&ocfs2_stack_lock);
433
434 return total;
435}
436
437static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
438 __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO,
439 ocfs2_loaded_cluster_plugins_show, NULL);
440
441static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
442 struct kobj_attribute *attr,
443 char *buf)
444{
445 ssize_t ret = 0;
446
447 spin_lock(&ocfs2_stack_lock);
448 if (active_stack) {
449 ret = snprintf(buf, PAGE_SIZE, "%s\n",
450 active_stack->sp_name);
451 if (ret == PAGE_SIZE)
452 ret = -E2BIG;
453 }
454 spin_unlock(&ocfs2_stack_lock);
455
456 return ret;
457}
458
459static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
460 __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO,
461 ocfs2_active_cluster_plugin_show, NULL);
462
463static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
464 struct kobj_attribute *attr,
465 char *buf)
466{
467 ssize_t ret;
468 spin_lock(&ocfs2_stack_lock);
469 ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name);
470 spin_unlock(&ocfs2_stack_lock);
471
472 return ret;
473}
474
475static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
476 struct kobj_attribute *attr,
477 const char *buf, size_t count)
478{
479 size_t len = count;
480 ssize_t ret;
481
482 if (len == 0)
483 return len;
484
485 if (buf[len - 1] == '\n')
486 len--;
487
488 if ((len != OCFS2_STACK_LABEL_LEN) ||
489 (strnlen(buf, len) != len))
490 return -EINVAL;
491
492 spin_lock(&ocfs2_stack_lock);
493 if (active_stack) {
494 if (!strncmp(buf, cluster_stack_name, len))
495 ret = count;
496 else
497 ret = -EBUSY;
498 } else {
499 memcpy(cluster_stack_name, buf, len);
500 ret = count;
501 }
502 spin_unlock(&ocfs2_stack_lock);
503
504 return ret;
505}
506
507
508static struct kobj_attribute ocfs2_attr_cluster_stack =
509 __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR,
510 ocfs2_cluster_stack_show,
511 ocfs2_cluster_stack_store);
512
513static struct attribute *ocfs2_attrs[] = {
514 &ocfs2_attr_max_locking_protocol.attr,
515 &ocfs2_attr_loaded_cluster_plugins.attr,
516 &ocfs2_attr_active_cluster_plugin.attr,
517 &ocfs2_attr_cluster_stack.attr,
518 NULL,
519};
520
521static struct attribute_group ocfs2_attr_group = {
522 .attrs = ocfs2_attrs,
523};
524
525static struct kset *ocfs2_kset;
526
527static void ocfs2_sysfs_exit(void)
528{
529 kset_unregister(ocfs2_kset);
530}
531
532static int ocfs2_sysfs_init(void)
533{
534 int ret;
535
536 ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj);
537 if (!ocfs2_kset)
538 return -ENOMEM;
539
540 ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group);
541 if (ret)
542 goto error;
543
544 return 0;
545
546error:
547 kset_unregister(ocfs2_kset);
548 return ret;
549}
550
551static int __init ocfs2_stack_glue_init(void)
552{
553 strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
554
555 return ocfs2_sysfs_init();
556}
557
558static void __exit ocfs2_stack_glue_exit(void)
559{
560 lproto = NULL;
561 ocfs2_sysfs_exit();
562}
563
564MODULE_AUTHOR("Oracle");
565MODULE_DESCRIPTION("ocfs2 cluter stack glue layer");
566MODULE_LICENSE("GPL");
567module_init(ocfs2_stack_glue_init);
568module_exit(ocfs2_stack_glue_exit);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
new file mode 100644
index 000000000000..005e4f170e0f
--- /dev/null
+++ b/fs/ocfs2/stackglue.h
@@ -0,0 +1,261 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * stackglue.h
5 *
6 * Glue to the underlying cluster stack.
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation, version 2.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20
21#ifndef STACKGLUE_H
22#define STACKGLUE_H
23
24#include <linux/types.h>
25#include <linux/list.h>
26#include <linux/dlmconstants.h>
27
28#include "dlm/dlmapi.h"
29#include <linux/dlm.h>
30
31/*
32 * dlmconstants.h does not have a LOCAL flag. We hope to remove it
33 * some day, but right now we need it. Let's fake it. This value is larger
34 * than any flag in dlmconstants.h.
35 */
36#define DLM_LKF_LOCAL 0x00100000
37
38/*
39 * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h. That probably
40 * wants to be in a public header.
41 */
42#define GROUP_NAME_MAX 64
43
44
45/*
46 * ocfs2_protocol_version changes when ocfs2 does something different in
47 * its inter-node behavior. See dlmglue.c for more information.
48 */
49struct ocfs2_protocol_version {
50 u8 pv_major;
51 u8 pv_minor;
52};
53
54/*
55 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
56 */
57struct ocfs2_locking_protocol {
58 struct ocfs2_protocol_version lp_max_version;
59 void (*lp_lock_ast)(void *astarg);
60 void (*lp_blocking_ast)(void *astarg, int level);
61 void (*lp_unlock_ast)(void *astarg, int error);
62};
63
64
65/*
66 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
67 * has a pointer to separately allocated lvb space. This struct exists only to
68 * include in the lksb union to make space for a combined dlm_lksb and lvb.
69 */
70struct fsdlm_lksb_plus_lvb {
71 struct dlm_lksb lksb;
72 char lvb[DLM_LVB_LEN];
73};
74
75/*
76 * A union of all lock status structures. We define it here so that the
77 * size of the union is known. Lock status structures are embedded in
78 * ocfs2 inodes.
79 */
80union ocfs2_dlm_lksb {
81 struct dlm_lockstatus lksb_o2dlm;
82 struct dlm_lksb lksb_fsdlm;
83 struct fsdlm_lksb_plus_lvb padding;
84};
85
86/*
87 * A cluster connection. Mostly opaque to ocfs2, the connection holds
88 * state for the underlying stack. ocfs2 does use cc_version to determine
89 * locking compatibility.
90 */
91struct ocfs2_cluster_connection {
92 char cc_name[GROUP_NAME_MAX];
93 int cc_namelen;
94 struct ocfs2_protocol_version cc_version;
95 void (*cc_recovery_handler)(int node_num, void *recovery_data);
96 void *cc_recovery_data;
97 void *cc_lockspace;
98 void *cc_private;
99};
100
101/*
102 * Each cluster stack implements the stack operations structure. Not used
103 * in the ocfs2 code, the stackglue code translates generic cluster calls
104 * into stack operations.
105 */
106struct ocfs2_stack_operations {
107 /*
108 * The fs code calls ocfs2_cluster_connect() to attach a new
109 * filesystem to the cluster stack. The ->connect() op is passed
110 * an ocfs2_cluster_connection with the name and recovery field
111 * filled in.
112 *
113 * The stack must set up any notification mechanisms and create
114 * the filesystem lockspace in the DLM. The lockspace should be
115 * stored on cc_lockspace. Any other information can be stored on
116 * cc_private.
117 *
118 * ->connect() must not return until it is guaranteed that
119 *
120 * - Node down notifications for the filesystem will be received
121 * and passed to conn->cc_recovery_handler().
122 * - Locking requests for the filesystem will be processed.
123 */
124 int (*connect)(struct ocfs2_cluster_connection *conn);
125
126 /*
127 * The fs code calls ocfs2_cluster_disconnect() when a filesystem
128 * no longer needs cluster services. All DLM locks have been
129 * dropped, and recovery notification is being ignored by the
130 * fs code. The stack must disengage from the DLM and discontinue
131 * recovery notification.
132 *
133 * Once ->disconnect() has returned, the connection structure will
134 * be freed. Thus, a stack must not return from ->disconnect()
135 * until it will no longer reference the conn pointer.
136 *
137 * If hangup_pending is zero, ocfs2_cluster_disconnect() will also
138 * be dropping the reference on the module.
139 */
140 int (*disconnect)(struct ocfs2_cluster_connection *conn,
141 int hangup_pending);
142
143 /*
144 * ocfs2_cluster_hangup() exists for compatibility with older
145 * ocfs2 tools. Only the classic stack really needs it. As such
146 * ->hangup() is not required of all stacks. See the comment by
147 * ocfs2_cluster_hangup() for more details.
148 *
149 * Note that ocfs2_cluster_hangup() can only be called if
150 * hangup_pending was passed to ocfs2_cluster_disconnect().
151 */
152 void (*hangup)(const char *group, int grouplen);
153
154 /*
155 * ->this_node() returns the cluster's unique identifier for the
156 * local node.
157 */
158 int (*this_node)(unsigned int *node);
159
160 /*
161 * Call the underlying dlm lock function. The ->dlm_lock()
162 * callback should convert the flags and mode as appropriate.
163 *
164 * ast and bast functions are not part of the call because the
165 * stack will likely want to wrap ast and bast calls before passing
166 * them to stack->sp_proto.
167 */
168 int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
169 int mode,
170 union ocfs2_dlm_lksb *lksb,
171 u32 flags,
172 void *name,
173 unsigned int namelen,
174 void *astarg);
175
176 /*
177 * Call the underlying dlm unlock function. The ->dlm_unlock()
178 * function should convert the flags as appropriate.
179 *
180 * The unlock ast is not passed, as the stack will want to wrap
181 * it before calling stack->sp_proto->lp_unlock_ast().
182 */
183 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
184 union ocfs2_dlm_lksb *lksb,
185 u32 flags,
186 void *astarg);
187
188 /*
189 * Return the status of the current lock status block. The fs
190 * code should never dereference the union. The ->lock_status()
191 * callback pulls out the stack-specific lksb, converts the status
192 * to a proper errno, and returns it.
193 */
194 int (*lock_status)(union ocfs2_dlm_lksb *lksb);
195
196 /*
197 * Pull the lvb pointer off of the stack-specific lksb.
198 */
199 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
200
201 /*
202 * This is an optional debugging hook. If provided, the
203 * stack can dump debugging information about this lock.
204 */
205 void (*dump_lksb)(union ocfs2_dlm_lksb *lksb);
206};
207
208/*
209 * Each stack plugin must describe itself by registering a
210 * ocfs2_stack_plugin structure. This is only seen by stackglue and the
211 * stack driver.
212 */
213struct ocfs2_stack_plugin {
214 char *sp_name;
215 struct ocfs2_stack_operations *sp_ops;
216 struct module *sp_owner;
217
218 /* These are managed by the stackglue code. */
219 struct list_head sp_list;
220 unsigned int sp_count;
221 struct ocfs2_locking_protocol *sp_proto;
222};
223
224
225/* Used by the filesystem */
226int ocfs2_cluster_connect(const char *stack_name,
227 const char *group,
228 int grouplen,
229 void (*recovery_handler)(int node_num,
230 void *recovery_data),
231 void *recovery_data,
232 struct ocfs2_cluster_connection **conn);
233int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
234 int hangup_pending);
235void ocfs2_cluster_hangup(const char *group, int grouplen);
236int ocfs2_cluster_this_node(unsigned int *node);
237
238struct ocfs2_lock_res;
239int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
240 int mode,
241 union ocfs2_dlm_lksb *lksb,
242 u32 flags,
243 void *name,
244 unsigned int namelen,
245 struct ocfs2_lock_res *astarg);
246int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
247 union ocfs2_dlm_lksb *lksb,
248 u32 flags,
249 struct ocfs2_lock_res *astarg);
250
251int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
252void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
253void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
254
255void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
256
257
258/* Used by stack plugins */
259int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
260void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
261#endif /* STACKGLUE_H */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 72c198a004df..d2d278fb9819 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -46,6 +46,11 @@
46 46
47#include "buffer_head_io.h" 47#include "buffer_head_io.h"
48 48
49#define NOT_ALLOC_NEW_GROUP 0
50#define ALLOC_NEW_GROUP 1
51
52#define OCFS2_MAX_INODES_TO_STEAL 1024
53
49static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 54static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
50static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 55static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
51static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 56static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -106,7 +111,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
106 u64 *bg_blkno, 111 u64 *bg_blkno,
107 u16 *bg_bit_off); 112 u16 *bg_bit_off);
108 113
109void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 114static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
110{ 115{
111 struct inode *inode = ac->ac_inode; 116 struct inode *inode = ac->ac_inode;
112 117
@@ -117,9 +122,17 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
117 mutex_unlock(&inode->i_mutex); 122 mutex_unlock(&inode->i_mutex);
118 123
119 iput(inode); 124 iput(inode);
125 ac->ac_inode = NULL;
120 } 126 }
121 if (ac->ac_bh) 127 if (ac->ac_bh) {
122 brelse(ac->ac_bh); 128 brelse(ac->ac_bh);
129 ac->ac_bh = NULL;
130 }
131}
132
133void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
134{
135 ocfs2_free_ac_resource(ac);
123 kfree(ac); 136 kfree(ac);
124} 137}
125 138
@@ -391,7 +404,8 @@ bail:
391static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, 404static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
392 struct ocfs2_alloc_context *ac, 405 struct ocfs2_alloc_context *ac,
393 int type, 406 int type,
394 u32 slot) 407 u32 slot,
408 int alloc_new_group)
395{ 409{
396 int status; 410 int status;
397 u32 bits_wanted = ac->ac_bits_wanted; 411 u32 bits_wanted = ac->ac_bits_wanted;
@@ -420,6 +434,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
420 } 434 }
421 435
422 ac->ac_inode = alloc_inode; 436 ac->ac_inode = alloc_inode;
437 ac->ac_alloc_slot = slot;
423 438
424 fe = (struct ocfs2_dinode *) bh->b_data; 439 fe = (struct ocfs2_dinode *) bh->b_data;
425 if (!OCFS2_IS_VALID_DINODE(fe)) { 440 if (!OCFS2_IS_VALID_DINODE(fe)) {
@@ -446,6 +461,14 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
446 goto bail; 461 goto bail;
447 } 462 }
448 463
464 if (alloc_new_group != ALLOC_NEW_GROUP) {
465 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
466 "and we don't alloc a new group for it.\n",
467 slot, bits_wanted, free_bits);
468 status = -ENOSPC;
469 goto bail;
470 }
471
449 status = ocfs2_block_group_alloc(osb, alloc_inode, bh); 472 status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
450 if (status < 0) { 473 if (status < 0) {
451 if (status != -ENOSPC) 474 if (status != -ENOSPC)
@@ -490,7 +513,8 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
490 (*ac)->ac_group_search = ocfs2_block_group_search; 513 (*ac)->ac_group_search = ocfs2_block_group_search;
491 514
492 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 515 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
493 EXTENT_ALLOC_SYSTEM_INODE, slot); 516 EXTENT_ALLOC_SYSTEM_INODE,
517 slot, ALLOC_NEW_GROUP);
494 if (status < 0) { 518 if (status < 0) {
495 if (status != -ENOSPC) 519 if (status != -ENOSPC)
496 mlog_errno(status); 520 mlog_errno(status);
@@ -508,10 +532,42 @@ bail:
508 return status; 532 return status;
509} 533}
510 534
535static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
536 struct ocfs2_alloc_context *ac)
537{
538 int i, status = -ENOSPC;
539 s16 slot = ocfs2_get_inode_steal_slot(osb);
540
541 /* Start to steal inodes from the first slot after ours. */
542 if (slot == OCFS2_INVALID_SLOT)
543 slot = osb->slot_num + 1;
544
545 for (i = 0; i < osb->max_slots; i++, slot++) {
546 if (slot == osb->max_slots)
547 slot = 0;
548
549 if (slot == osb->slot_num)
550 continue;
551
552 status = ocfs2_reserve_suballoc_bits(osb, ac,
553 INODE_ALLOC_SYSTEM_INODE,
554 slot, NOT_ALLOC_NEW_GROUP);
555 if (status >= 0) {
556 ocfs2_set_inode_steal_slot(osb, slot);
557 break;
558 }
559
560 ocfs2_free_ac_resource(ac);
561 }
562
563 return status;
564}
565
511int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 566int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
512 struct ocfs2_alloc_context **ac) 567 struct ocfs2_alloc_context **ac)
513{ 568{
514 int status; 569 int status;
570 s16 slot = ocfs2_get_inode_steal_slot(osb);
515 571
516 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 572 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
517 if (!(*ac)) { 573 if (!(*ac)) {
@@ -525,9 +581,43 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
525 581
526 (*ac)->ac_group_search = ocfs2_block_group_search; 582 (*ac)->ac_group_search = ocfs2_block_group_search;
527 583
584 /*
585 * slot is set when we successfully steal inode from other nodes.
586 * It is reset in 3 places:
587 * 1. when we flush the truncate log
588 * 2. when we complete local alloc recovery.
589 * 3. when we successfully allocate from our own slot.
590 * After it is set, we will go on stealing inodes until we find the
591 * need to check our slots to see whether there is some space for us.
592 */
593 if (slot != OCFS2_INVALID_SLOT &&
594 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL)
595 goto inode_steal;
596
597 atomic_set(&osb->s_num_inodes_stolen, 0);
528 status = ocfs2_reserve_suballoc_bits(osb, *ac, 598 status = ocfs2_reserve_suballoc_bits(osb, *ac,
529 INODE_ALLOC_SYSTEM_INODE, 599 INODE_ALLOC_SYSTEM_INODE,
530 osb->slot_num); 600 osb->slot_num, ALLOC_NEW_GROUP);
601 if (status >= 0) {
602 status = 0;
603
604 /*
605 * Some inodes must be freed by us, so try to allocate
606 * from our own next time.
607 */
608 if (slot != OCFS2_INVALID_SLOT)
609 ocfs2_init_inode_steal_slot(osb);
610 goto bail;
611 } else if (status < 0 && status != -ENOSPC) {
612 mlog_errno(status);
613 goto bail;
614 }
615
616 ocfs2_free_ac_resource(*ac);
617
618inode_steal:
619 status = ocfs2_steal_inode_from_other_nodes(osb, *ac);
620 atomic_inc(&osb->s_num_inodes_stolen);
531 if (status < 0) { 621 if (status < 0) {
532 if (status != -ENOSPC) 622 if (status != -ENOSPC)
533 mlog_errno(status); 623 mlog_errno(status);
@@ -557,7 +647,8 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
557 647
558 status = ocfs2_reserve_suballoc_bits(osb, ac, 648 status = ocfs2_reserve_suballoc_bits(osb, ac,
559 GLOBAL_BITMAP_SYSTEM_INODE, 649 GLOBAL_BITMAP_SYSTEM_INODE,
560 OCFS2_INVALID_SLOT); 650 OCFS2_INVALID_SLOT,
651 ALLOC_NEW_GROUP);
561 if (status < 0 && status != -ENOSPC) { 652 if (status < 0 && status != -ENOSPC) {
562 mlog_errno(status); 653 mlog_errno(status);
563 goto bail; 654 goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8799033bb459..544c600662bd 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -36,6 +36,7 @@ typedef int (group_search_t)(struct inode *,
36struct ocfs2_alloc_context { 36struct ocfs2_alloc_context {
37 struct inode *ac_inode; /* which bitmap are we allocating from? */ 37 struct inode *ac_inode; /* which bitmap are we allocating from? */
38 struct buffer_head *ac_bh; /* file entry bh */ 38 struct buffer_head *ac_bh; /* file entry bh */
39 u32 ac_alloc_slot; /* which slot are we allocating from? */
39 u32 ac_bits_wanted; 40 u32 ac_bits_wanted;
40 u32 ac_bits_given; 41 u32 ac_bits_given;
41#define OCFS2_AC_USE_LOCAL 1 42#define OCFS2_AC_USE_LOCAL 1
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index bec75aff3d9f..df63ba20ae90 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -40,8 +40,7 @@
40#include <linux/crc32.h> 40#include <linux/crc32.h>
41#include <linux/debugfs.h> 41#include <linux/debugfs.h>
42#include <linux/mount.h> 42#include <linux/mount.h>
43 43#include <linux/seq_file.h>
44#include <cluster/nodemanager.h>
45 44
46#define MLOG_MASK_PREFIX ML_SUPER 45#define MLOG_MASK_PREFIX ML_SUPER
47#include <cluster/masklog.h> 46#include <cluster/masklog.h>
@@ -88,6 +87,7 @@ struct mount_options
88 unsigned int atime_quantum; 87 unsigned int atime_quantum;
89 signed short slot; 88 signed short slot;
90 unsigned int localalloc_opt; 89 unsigned int localalloc_opt;
90 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
91}; 91};
92 92
93static int ocfs2_parse_options(struct super_block *sb, char *options, 93static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -109,7 +109,6 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait);
109static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); 109static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
110static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); 110static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
111static void ocfs2_release_system_inodes(struct ocfs2_super *osb); 111static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
112static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
113static int ocfs2_check_volume(struct ocfs2_super *osb); 112static int ocfs2_check_volume(struct ocfs2_super *osb);
114static int ocfs2_verify_volume(struct ocfs2_dinode *di, 113static int ocfs2_verify_volume(struct ocfs2_dinode *di,
115 struct buffer_head *bh, 114 struct buffer_head *bh,
@@ -154,6 +153,7 @@ enum {
154 Opt_commit, 153 Opt_commit,
155 Opt_localalloc, 154 Opt_localalloc,
156 Opt_localflocks, 155 Opt_localflocks,
156 Opt_stack,
157 Opt_err, 157 Opt_err,
158}; 158};
159 159
@@ -172,6 +172,7 @@ static match_table_t tokens = {
172 {Opt_commit, "commit=%u"}, 172 {Opt_commit, "commit=%u"},
173 {Opt_localalloc, "localalloc=%d"}, 173 {Opt_localalloc, "localalloc=%d"},
174 {Opt_localflocks, "localflocks"}, 174 {Opt_localflocks, "localflocks"},
175 {Opt_stack, "cluster_stack=%s"},
175 {Opt_err, NULL} 176 {Opt_err, NULL}
176}; 177};
177 178
@@ -551,8 +552,17 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
551 } 552 }
552 } 553 }
553 554
555 if (ocfs2_userspace_stack(osb)) {
556 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
557 mlog(ML_ERROR, "Userspace stack expected, but "
558 "o2cb heartbeat arguments passed to mount\n");
559 return -EINVAL;
560 }
561 }
562
554 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 563 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
555 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb)) { 564 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
565 !ocfs2_userspace_stack(osb)) {
556 mlog(ML_ERROR, "Heartbeat has to be started to mount " 566 mlog(ML_ERROR, "Heartbeat has to be started to mount "
557 "a read-write clustered device.\n"); 567 "a read-write clustered device.\n");
558 return -EINVAL; 568 return -EINVAL;
@@ -562,6 +572,35 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
562 return 0; 572 return 0;
563} 573}
564 574
575/*
576 * If we're using a userspace stack, mount should have passed
577 * a name that matches the disk. If not, mount should not
578 * have passed a stack.
579 */
580static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
581 struct mount_options *mopt)
582{
583 if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) {
584 mlog(ML_ERROR,
585 "cluster stack passed to mount, but this filesystem "
586 "does not support it\n");
587 return -EINVAL;
588 }
589
590 if (ocfs2_userspace_stack(osb) &&
591 strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
592 OCFS2_STACK_LABEL_LEN)) {
593 mlog(ML_ERROR,
594 "cluster stack passed to mount (\"%s\") does not "
595 "match the filesystem (\"%s\")\n",
596 mopt->cluster_stack,
597 osb->osb_cluster_stack);
598 return -EINVAL;
599 }
600
601 return 0;
602}
603
565static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 604static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
566{ 605{
567 struct dentry *root; 606 struct dentry *root;
@@ -579,15 +618,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
579 goto read_super_error; 618 goto read_super_error;
580 } 619 }
581 620
582 /* for now we only have one cluster/node, make sure we see it
583 * in the heartbeat universe */
584 if (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL) {
585 if (!o2hb_check_local_node_heartbeating()) {
586 status = -EINVAL;
587 goto read_super_error;
588 }
589 }
590
591 /* probe for superblock */ 621 /* probe for superblock */
592 status = ocfs2_sb_probe(sb, &bh, &sector_size); 622 status = ocfs2_sb_probe(sb, &bh, &sector_size);
593 if (status < 0) { 623 if (status < 0) {
@@ -609,6 +639,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
609 osb->osb_commit_interval = parsed_options.commit_interval; 639 osb->osb_commit_interval = parsed_options.commit_interval;
610 osb->local_alloc_size = parsed_options.localalloc_opt; 640 osb->local_alloc_size = parsed_options.localalloc_opt;
611 641
642 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
643 if (status)
644 goto read_super_error;
645
612 sb->s_magic = OCFS2_SUPER_MAGIC; 646 sb->s_magic = OCFS2_SUPER_MAGIC;
613 647
614 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, 648 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
@@ -694,7 +728,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
694 if (ocfs2_mount_local(osb)) 728 if (ocfs2_mount_local(osb))
695 snprintf(nodestr, sizeof(nodestr), "local"); 729 snprintf(nodestr, sizeof(nodestr), "local");
696 else 730 else
697 snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); 731 snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
698 732
699 printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " 733 printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) "
700 "with %s data mode.\n", 734 "with %s data mode.\n",
@@ -763,6 +797,7 @@ static int ocfs2_parse_options(struct super_block *sb,
763 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 797 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
764 mopt->slot = OCFS2_INVALID_SLOT; 798 mopt->slot = OCFS2_INVALID_SLOT;
765 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; 799 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
800 mopt->cluster_stack[0] = '\0';
766 801
767 if (!options) { 802 if (!options) {
768 status = 1; 803 status = 1;
@@ -864,6 +899,25 @@ static int ocfs2_parse_options(struct super_block *sb,
864 if (!is_remount) 899 if (!is_remount)
865 mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; 900 mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
866 break; 901 break;
902 case Opt_stack:
903 /* Check both that the option we were passed
904 * is of the right length and that it is a proper
905 * string of the right length.
906 */
907 if (((args[0].to - args[0].from) !=
908 OCFS2_STACK_LABEL_LEN) ||
909 (strnlen(args[0].from,
910 OCFS2_STACK_LABEL_LEN) !=
911 OCFS2_STACK_LABEL_LEN)) {
912 mlog(ML_ERROR,
913 "Invalid cluster_stack option\n");
914 status = 0;
915 goto bail;
916 }
917 memcpy(mopt->cluster_stack, args[0].from,
918 OCFS2_STACK_LABEL_LEN);
919 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
920 break;
867 default: 921 default:
868 mlog(ML_ERROR, 922 mlog(ML_ERROR,
869 "Unrecognized mount option \"%s\" " 923 "Unrecognized mount option \"%s\" "
@@ -922,6 +976,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
922 if (opts & OCFS2_MOUNT_LOCALFLOCKS) 976 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
923 seq_printf(s, ",localflocks,"); 977 seq_printf(s, ",localflocks,");
924 978
979 if (osb->osb_cluster_stack[0])
980 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
981 osb->osb_cluster_stack);
982
925 return 0; 983 return 0;
926} 984}
927 985
@@ -957,6 +1015,8 @@ static int __init ocfs2_init(void)
957 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1015 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
958 } 1016 }
959 1017
1018 ocfs2_set_locking_protocol();
1019
960leave: 1020leave:
961 if (status < 0) { 1021 if (status < 0) {
962 ocfs2_free_mem_caches(); 1022 ocfs2_free_mem_caches();
@@ -1132,31 +1192,6 @@ static int ocfs2_get_sector(struct super_block *sb,
1132 return 0; 1192 return 0;
1133} 1193}
1134 1194
1135/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
1136static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
1137{
1138 int status;
1139
1140 /* XXX hold a ref on the node while mounte? easy enough, if
1141 * desirable. */
1142 if (ocfs2_mount_local(osb))
1143 osb->node_num = 0;
1144 else
1145 osb->node_num = o2nm_this_node();
1146
1147 if (osb->node_num == O2NM_MAX_NODES) {
1148 mlog(ML_ERROR, "could not find this host's node number\n");
1149 status = -ENOENT;
1150 goto bail;
1151 }
1152
1153 mlog(0, "I am node %d\n", osb->node_num);
1154
1155 status = 0;
1156bail:
1157 return status;
1158}
1159
1160static int ocfs2_mount_volume(struct super_block *sb) 1195static int ocfs2_mount_volume(struct super_block *sb)
1161{ 1196{
1162 int status = 0; 1197 int status = 0;
@@ -1168,12 +1203,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
1168 if (ocfs2_is_hard_readonly(osb)) 1203 if (ocfs2_is_hard_readonly(osb))
1169 goto leave; 1204 goto leave;
1170 1205
1171 status = ocfs2_fill_local_node_info(osb);
1172 if (status < 0) {
1173 mlog_errno(status);
1174 goto leave;
1175 }
1176
1177 status = ocfs2_dlm_init(osb); 1206 status = ocfs2_dlm_init(osb);
1178 if (status < 0) { 1207 if (status < 0) {
1179 mlog_errno(status); 1208 mlog_errno(status);
@@ -1224,18 +1253,9 @@ leave:
1224 return status; 1253 return status;
1225} 1254}
1226 1255
1227/* we can't grab the goofy sem lock from inside wait_event, so we use
1228 * memory barriers to make sure that we'll see the null task before
1229 * being woken up */
1230static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
1231{
1232 mb();
1233 return osb->recovery_thread_task != NULL;
1234}
1235
1236static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) 1256static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1237{ 1257{
1238 int tmp; 1258 int tmp, hangup_needed = 0;
1239 struct ocfs2_super *osb = NULL; 1259 struct ocfs2_super *osb = NULL;
1240 char nodestr[8]; 1260 char nodestr[8];
1241 1261
@@ -1249,25 +1269,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1249 1269
1250 ocfs2_truncate_log_shutdown(osb); 1270 ocfs2_truncate_log_shutdown(osb);
1251 1271
1252 /* disable any new recovery threads and wait for any currently 1272 /* This will disable recovery and flush any recovery work. */
1253 * running ones to exit. Do this before setting the vol_state. */ 1273 ocfs2_recovery_exit(osb);
1254 mutex_lock(&osb->recovery_lock);
1255 osb->disable_recovery = 1;
1256 mutex_unlock(&osb->recovery_lock);
1257 wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
1258
1259 /* At this point, we know that no more recovery threads can be
1260 * launched, so wait for any recovery completion work to
1261 * complete. */
1262 flush_workqueue(ocfs2_wq);
1263 1274
1264 ocfs2_journal_shutdown(osb); 1275 ocfs2_journal_shutdown(osb);
1265 1276
1266 ocfs2_sync_blockdev(sb); 1277 ocfs2_sync_blockdev(sb);
1267 1278
1268 /* No dlm means we've failed during mount, so skip all the 1279 /* No cluster connection means we've failed during mount, so skip
1269 * steps which depended on that to complete. */ 1280 * all the steps which depended on that to complete. */
1270 if (osb->dlm) { 1281 if (osb->cconn) {
1271 tmp = ocfs2_super_lock(osb, 1); 1282 tmp = ocfs2_super_lock(osb, 1);
1272 if (tmp < 0) { 1283 if (tmp < 0) {
1273 mlog_errno(tmp); 1284 mlog_errno(tmp);
@@ -1278,25 +1289,34 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1278 if (osb->slot_num != OCFS2_INVALID_SLOT) 1289 if (osb->slot_num != OCFS2_INVALID_SLOT)
1279 ocfs2_put_slot(osb); 1290 ocfs2_put_slot(osb);
1280 1291
1281 if (osb->dlm) 1292 if (osb->cconn)
1282 ocfs2_super_unlock(osb, 1); 1293 ocfs2_super_unlock(osb, 1);
1283 1294
1284 ocfs2_release_system_inodes(osb); 1295 ocfs2_release_system_inodes(osb);
1285 1296
1286 if (osb->dlm) 1297 /*
1287 ocfs2_dlm_shutdown(osb); 1298 * If we're dismounting due to mount error, mount.ocfs2 will clean
1299 * up heartbeat. If we're a local mount, there is no heartbeat.
1300 * If we failed before we got a uuid_str yet, we can't stop
1301 * heartbeat. Otherwise, do it.
1302 */
1303 if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
1304 hangup_needed = 1;
1305
1306 if (osb->cconn)
1307 ocfs2_dlm_shutdown(osb, hangup_needed);
1288 1308
1289 debugfs_remove(osb->osb_debug_root); 1309 debugfs_remove(osb->osb_debug_root);
1290 1310
1291 if (!mnt_err) 1311 if (hangup_needed)
1292 ocfs2_stop_heartbeat(osb); 1312 ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
1293 1313
1294 atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); 1314 atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
1295 1315
1296 if (ocfs2_mount_local(osb)) 1316 if (ocfs2_mount_local(osb))
1297 snprintf(nodestr, sizeof(nodestr), "local"); 1317 snprintf(nodestr, sizeof(nodestr), "local");
1298 else 1318 else
1299 snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); 1319 snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
1300 1320
1301 printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", 1321 printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n",
1302 osb->dev_str, nodestr); 1322 osb->dev_str, nodestr);
@@ -1355,7 +1375,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1355 sb->s_fs_info = osb; 1375 sb->s_fs_info = osb;
1356 sb->s_op = &ocfs2_sops; 1376 sb->s_op = &ocfs2_sops;
1357 sb->s_export_op = &ocfs2_export_ops; 1377 sb->s_export_op = &ocfs2_export_ops;
1358 osb->osb_locking_proto = ocfs2_locking_protocol;
1359 sb->s_time_gran = 1; 1378 sb->s_time_gran = 1;
1360 sb->s_flags |= MS_NOATIME; 1379 sb->s_flags |= MS_NOATIME;
1361 /* this is needed to support O_LARGEFILE */ 1380 /* this is needed to support O_LARGEFILE */
@@ -1368,7 +1387,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1368 osb->s_sectsize_bits = blksize_bits(sector_size); 1387 osb->s_sectsize_bits = blksize_bits(sector_size);
1369 BUG_ON(!osb->s_sectsize_bits); 1388 BUG_ON(!osb->s_sectsize_bits);
1370 1389
1371 init_waitqueue_head(&osb->recovery_event);
1372 spin_lock_init(&osb->dc_task_lock); 1390 spin_lock_init(&osb->dc_task_lock);
1373 init_waitqueue_head(&osb->dc_event); 1391 init_waitqueue_head(&osb->dc_event);
1374 osb->dc_work_sequence = 0; 1392 osb->dc_work_sequence = 0;
@@ -1376,6 +1394,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1376 INIT_LIST_HEAD(&osb->blocked_lock_list); 1394 INIT_LIST_HEAD(&osb->blocked_lock_list);
1377 osb->blocked_lock_count = 0; 1395 osb->blocked_lock_count = 0;
1378 spin_lock_init(&osb->osb_lock); 1396 spin_lock_init(&osb->osb_lock);
1397 ocfs2_init_inode_steal_slot(osb);
1379 1398
1380 atomic_set(&osb->alloc_stats.moves, 0); 1399 atomic_set(&osb->alloc_stats.moves, 0);
1381 atomic_set(&osb->alloc_stats.local_data, 0); 1400 atomic_set(&osb->alloc_stats.local_data, 0);
@@ -1388,24 +1407,23 @@ static int ocfs2_initialize_super(struct super_block *sb,
1388 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 1407 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
1389 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1408 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1390 1409
1391 mutex_init(&osb->recovery_lock); 1410 status = ocfs2_recovery_init(osb);
1392 1411 if (status) {
1393 osb->disable_recovery = 0; 1412 mlog(ML_ERROR, "Unable to initialize recovery state\n");
1394 osb->recovery_thread_task = NULL; 1413 mlog_errno(status);
1414 goto bail;
1415 }
1395 1416
1396 init_waitqueue_head(&osb->checkpoint_event); 1417 init_waitqueue_head(&osb->checkpoint_event);
1397 atomic_set(&osb->needs_checkpoint, 0); 1418 atomic_set(&osb->needs_checkpoint, 0);
1398 1419
1399 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 1420 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
1400 1421
1401 osb->node_num = O2NM_INVALID_NODE_NUM;
1402 osb->slot_num = OCFS2_INVALID_SLOT; 1422 osb->slot_num = OCFS2_INVALID_SLOT;
1403 1423
1404 osb->local_alloc_state = OCFS2_LA_UNUSED; 1424 osb->local_alloc_state = OCFS2_LA_UNUSED;
1405 osb->local_alloc_bh = NULL; 1425 osb->local_alloc_bh = NULL;
1406 1426
1407 ocfs2_setup_hb_callbacks(osb);
1408
1409 init_waitqueue_head(&osb->osb_mount_event); 1427 init_waitqueue_head(&osb->osb_mount_event);
1410 1428
1411 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); 1429 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
@@ -1455,6 +1473,25 @@ static int ocfs2_initialize_super(struct super_block *sb,
1455 goto bail; 1473 goto bail;
1456 } 1474 }
1457 1475
1476 if (ocfs2_userspace_stack(osb)) {
1477 memcpy(osb->osb_cluster_stack,
1478 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
1479 OCFS2_STACK_LABEL_LEN);
1480 osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
1481 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
1482 mlog(ML_ERROR,
1483 "couldn't mount because of an invalid "
1484 "cluster stack label (%s) \n",
1485 osb->osb_cluster_stack);
1486 status = -EINVAL;
1487 goto bail;
1488 }
1489 } else {
1490 /* The empty string is identical with classic tools that
1491 * don't know about s_cluster_info. */
1492 osb->osb_cluster_stack[0] = '\0';
1493 }
1494
1458 get_random_bytes(&osb->s_next_generation, sizeof(u32)); 1495 get_random_bytes(&osb->s_next_generation, sizeof(u32));
1459 1496
1460 /* FIXME 1497 /* FIXME
@@ -1724,8 +1761,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
1724 1761
1725 /* This function assumes that the caller has the main osb resource */ 1762 /* This function assumes that the caller has the main osb resource */
1726 1763
1727 if (osb->slot_info) 1764 ocfs2_free_slot_info(osb);
1728 ocfs2_free_slot_info(osb->slot_info);
1729 1765
1730 kfree(osb->osb_orphan_wipes); 1766 kfree(osb->osb_orphan_wipes);
1731 /* FIXME 1767 /* FIXME
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 5f66c4466151..817f5966edca 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -87,7 +87,14 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
87 87
88void sysfs_remove_link(struct kobject * kobj, const char * name) 88void sysfs_remove_link(struct kobject * kobj, const char * name)
89{ 89{
90 sysfs_hash_and_remove(kobj->sd, name); 90 struct sysfs_dirent *parent_sd = NULL;
91
92 if (!kobj)
93 parent_sd = &sysfs_root;
94 else
95 parent_sd = kobj->sd;
96
97 sysfs_hash_and_remove(parent_sd, name);
91} 98}
92 99
93static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, 100static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,